wgit 0.0.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit.rb +0 -1
- data/lib/wgit/assertable.rb +20 -23
- data/lib/wgit/core_ext.rb +6 -14
- data/lib/wgit/crawler.rb +94 -183
- data/lib/wgit/database/database.rb +209 -185
- data/lib/wgit/database/model.rb +7 -7
- data/lib/wgit/document.rb +281 -241
- data/lib/wgit/indexer.rb +99 -92
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +171 -185
- data/lib/wgit/utils.rb +57 -68
- data/lib/wgit/version.rb +1 -1
- metadata +86 -60
- data/CHANGELOG.md +0 -61
- data/LICENSE.txt +0 -21
- data/README.md +0 -361
- data/TODO.txt +0 -34
- data/lib/wgit/database/connection_details.rb +0 -41
data/lib/wgit/database/model.rb
CHANGED
@@ -3,14 +3,14 @@
|
|
3
3
|
require_relative '../utils'
|
4
4
|
|
5
5
|
module Wgit
|
6
|
-
# Module
|
6
|
+
# Module used to build the database collection objects.
|
7
7
|
module Model
|
8
8
|
# The data model for a Wgit::Url.
|
9
9
|
#
|
10
|
-
# @param url [Wgit::Url] The
|
10
|
+
# @param url [Wgit::Url] The Url DB record.
|
11
11
|
# @return [Hash] The URL model ready for DB insertion.
|
12
12
|
def self.url(url)
|
13
|
-
raise 'url must respond_to? to_h' unless url.respond_to?(:to_h)
|
13
|
+
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
14
14
|
|
15
15
|
model = url.to_h
|
16
16
|
Wgit::Utils.remove_non_bson_types(model)
|
@@ -21,13 +21,13 @@ module Wgit
|
|
21
21
|
# @param doc [Wgit::Document] The Document DB record.
|
22
22
|
# @return [Hash] The Document model ready for DB insertion.
|
23
23
|
def self.document(doc)
|
24
|
-
raise 'doc must respond_to? to_h' unless doc.respond_to?(:to_h)
|
24
|
+
raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
|
25
25
|
|
26
|
-
model = doc.to_h(false)
|
26
|
+
model = doc.to_h(include_html: false)
|
27
27
|
Wgit::Utils.remove_non_bson_types(model)
|
28
28
|
end
|
29
29
|
|
30
|
-
#
|
30
|
+
# Common fields when inserting a record into the DB.
|
31
31
|
#
|
32
32
|
# @return [Hash] Containing common insertion fields for all models.
|
33
33
|
def self.common_insert_data
|
@@ -37,7 +37,7 @@ module Wgit
|
|
37
37
|
}
|
38
38
|
end
|
39
39
|
|
40
|
-
#
|
40
|
+
# Common fields when updating a record in the DB.
|
41
41
|
#
|
42
42
|
# @return [Hash] Containing common update fields for all models.
|
43
43
|
def self.common_update_data
|
data/lib/wgit/document.rb
CHANGED
@@ -6,15 +6,18 @@ require 'json'
|
|
6
6
|
|
7
7
|
module Wgit
|
8
8
|
# Class modeling a HTML web document. Also doubles as a search result when
|
9
|
-
# loading Documents from the database.
|
9
|
+
# loading Documents from the database via Wgit::Database#search.
|
10
10
|
#
|
11
|
-
# The initialize method dynamically initializes
|
11
|
+
# The initialize method dynamically initializes instance variables from the
|
12
12
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
13
13
|
# Document class can be easily extended allowing you to pull out the bits of
|
14
14
|
# a webpage that are important to you. See Wgit::Document.define_extension.
|
15
15
|
class Document
|
16
16
|
include Assertable
|
17
17
|
|
18
|
+
# Regex for the allowed var names when defining an extension.
|
19
|
+
REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
|
20
|
+
|
18
21
|
# The HTML elements that make up the visible text on a page.
|
19
22
|
# These elements are used to initialize the @text of the Document.
|
20
23
|
# See the README.md for how to add to this Array dynamically.
|
@@ -25,7 +28,6 @@ module Wgit
|
|
25
28
|
|
26
29
|
class << self
|
27
30
|
# Class level instance reader method for @text_elements.
|
28
|
-
# Call using Wgit::Document.text_elements.
|
29
31
|
attr_reader :text_elements
|
30
32
|
end
|
31
33
|
|
@@ -35,7 +37,7 @@ module Wgit
|
|
35
37
|
# The HTML of the webpage, an instance of String.
|
36
38
|
attr_reader :html
|
37
39
|
|
38
|
-
# The Nokogiri document object initialized from @html.
|
40
|
+
# The Nokogiri::HTML document object initialized from @html.
|
39
41
|
attr_reader :doc
|
40
42
|
|
41
43
|
# The score is only used following a Database#search and records matches.
|
@@ -43,72 +45,140 @@ module Wgit
|
|
43
45
|
|
44
46
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
45
47
|
# object representing a database record (of a HTTP crawled web page). This
|
46
|
-
# allows for initialisation from both crawled web pages and
|
47
|
-
#
|
48
|
-
#
|
49
|
-
# During initialisation, the Document will call any
|
50
|
-
# 'init_*_from_html' and 'init_*_from_object' methods it can find.
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
48
|
+
# allows for initialisation from both crawled web pages and documents/web
|
49
|
+
# pages retrieved from the database.
|
50
|
+
#
|
51
|
+
# During initialisation, the Document will call any private
|
52
|
+
# 'init_*_from_html' and 'init_*_from_object' methods it can find. See the
|
53
|
+
# README.md and Wgit::Document.define_extension method for more details.
|
54
|
+
#
|
55
|
+
# @param url_or_obj [String, Wgit::Url, Object#fetch] Either a String
|
56
|
+
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
57
|
+
# MongoDB collection object. The Object's :fetch method should support
|
58
|
+
# Strings as keys.
|
59
|
+
# @param html [String, NilClass] The crawled web page's HTML. This param is
|
60
|
+
# only used if url_or_obj is a String representing the web page's URL.
|
61
|
+
# Otherwise, the HTML comes from the database object. A html of nil will
|
62
|
+
# be defaulted to an empty String.
|
60
63
|
def initialize(url_or_obj, html = '')
|
61
|
-
# Init from URL String and HTML String.
|
62
64
|
if url_or_obj.is_a?(String)
|
63
|
-
|
64
|
-
assert_type(url, Wgit::Url)
|
65
|
-
|
66
|
-
@url = url
|
67
|
-
@html = html || ''
|
68
|
-
@doc = init_nokogiri
|
69
|
-
@score = 0.0
|
70
|
-
|
71
|
-
process_url_and_html
|
72
|
-
|
73
|
-
# Dynamically run the init_*_from_html methods.
|
74
|
-
Document.private_instance_methods(false).each do |method|
|
75
|
-
if method.to_s.start_with?('init_') &&
|
76
|
-
method.to_s.end_with?('_from_html')
|
77
|
-
send(method)
|
78
|
-
end
|
79
|
-
end
|
80
|
-
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
81
|
-
# collection obj.
|
65
|
+
init_from_strings(url_or_obj, html)
|
82
66
|
else
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
67
|
+
init_from_object(url_or_obj)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
### Document Class Methods ###
|
72
|
+
|
73
|
+
# Uses Document.text_elements to build an xpath String, used to obtain
|
74
|
+
# all of the combined text on a webpage.
|
75
|
+
#
|
76
|
+
# @return [String] An xpath String to obtain a webpage's text elements.
|
77
|
+
def self.text_elements_xpath
|
78
|
+
xpath = ''
|
79
|
+
return xpath if Wgit::Document.text_elements.empty?
|
80
|
+
|
81
|
+
el_xpath = '//%s/text()'
|
82
|
+
Wgit::Document.text_elements.each_with_index do |el, i|
|
83
|
+
xpath += ' | ' unless i.zero?
|
84
|
+
xpath += format(el_xpath, el)
|
100
85
|
end
|
86
|
+
|
87
|
+
xpath
|
101
88
|
end
|
102
89
|
|
90
|
+
# Defines an extension, which is a way to extract HTML elements into
|
91
|
+
# instance variables upon Document initialization. See the default
|
92
|
+
# extensions defined in 'document_extensions.rb' as examples.
|
93
|
+
#
|
94
|
+
# Initialises a private instance variable with the xpath or database object
|
95
|
+
# result(s). When initialising from HTML, a true singleton value will only
|
96
|
+
# ever return one result otherwise all xpath results are returned in an
|
97
|
+
# Array. When initialising from a database object, the value is taken as
|
98
|
+
# is and singleton is only used to define the default empty value.
|
99
|
+
# If a value cannot be found (in either the HTML or database object), then
|
100
|
+
# a default will be used. The default value is: singleton ? nil : [].
|
101
|
+
#
|
102
|
+
# Note that defined extensions work for both documents initialized from
|
103
|
+
# the WWW (via Wgit::Crawler methods) and from database objects. This
|
104
|
+
# effectively implements ORM like behavior using this class.
|
105
|
+
#
|
106
|
+
# @param var [Symbol] The name of the variable to be initialised.
|
107
|
+
# @param xpath [String, Object#call] The xpath used to find the element(s)
|
108
|
+
# of the webpage. Pass a callable object (proc etc.) if you want the
|
109
|
+
# xpath value to be derived on Document initialisation (instead of when
|
110
|
+
# the extension is defined). The call method must return a valid xpath
|
111
|
+
# String.
|
112
|
+
# @param options [Hash] The options to define an extension with.
|
113
|
+
# @option options [Boolean] :singleton The singleton option determines
|
114
|
+
# whether or not the result(s) should be in an Array. If multiple
|
115
|
+
# results are found and singleton is true then the first result will be
|
116
|
+
# used. Defaults to true.
|
117
|
+
# @option options [Boolean] :text_content_only The text_content_only option
|
118
|
+
# if true will use the text content of the Nokogiri result object,
|
119
|
+
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
120
|
+
# @yield [value, source] Yields the value (Object) about to be assigned to
|
121
|
+
# the new var and the source (Symbol) of the value (either :html or
|
122
|
+
# :object). The return value of the block becomes the new var value,
|
123
|
+
# unless nil. Return nil if you want to inspect but not change the var
|
124
|
+
# value. The block gets executed when a Document is initialized from html
|
125
|
+
# or an object e.g. database.
|
126
|
+
# @raise [StandardError] If the var param isn't valid.
|
127
|
+
# @return [Symbol] The first half of the newly defined method names e.g.
|
128
|
+
# if var == "title" then :init_title is returned.
|
129
|
+
def self.define_extension(var, xpath, options = {}, &block)
|
130
|
+
default_options = { singleton: true, text_content_only: true }
|
131
|
+
options = default_options.merge(options)
|
132
|
+
|
133
|
+
raise "var must match #{REGEX_EXTENSION_NAME}" unless \
|
134
|
+
var =~ REGEX_EXTENSION_NAME
|
135
|
+
|
136
|
+
# Define the private init_*_from_html method for HTML.
|
137
|
+
# Gets the HTML's xpath value and creates a var for it.
|
138
|
+
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
139
|
+
result = find_in_html(xpath, options, &block)
|
140
|
+
init_var(var, result)
|
141
|
+
end
|
142
|
+
Document.send :private, func_name
|
143
|
+
|
144
|
+
# Define the private init_*_from_object method for a Database object.
|
145
|
+
# Gets the Object's 'key' value and creates a var for it.
|
146
|
+
func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
|
147
|
+
result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
|
148
|
+
init_var(var, result)
|
149
|
+
end
|
150
|
+
Document.send :private, func_name
|
151
|
+
|
152
|
+
"init_#{var}".to_sym
|
153
|
+
end
|
154
|
+
|
155
|
+
# Removes the init_* methods created when an extension is defined.
|
156
|
+
# Therefore, this is the opposing method to Document.define_extension.
|
157
|
+
# Returns true if successful or false if the method(s) cannot be found.
|
158
|
+
#
|
159
|
+
# @param var [Symbol] The extension variable already defined.
|
160
|
+
# @return [Boolean] True if the extension var was found and removed;
|
161
|
+
# otherwise false.
|
162
|
+
def self.remove_extension(var)
|
163
|
+
Document.send(:remove_method, "init_#{var}_from_html")
|
164
|
+
Document.send(:remove_method, "init_#{var}_from_object")
|
165
|
+
|
166
|
+
true
|
167
|
+
rescue NameError
|
168
|
+
false
|
169
|
+
end
|
170
|
+
|
171
|
+
### Document Instance Methods ###
|
172
|
+
|
103
173
|
# Determines if both the url and html match. Use
|
104
|
-
# doc.object_id ==
|
174
|
+
# doc.object_id == other.object_id for exact object comparison.
|
105
175
|
#
|
106
|
-
# @param
|
176
|
+
# @param other [Wgit::Document] To compare self against.
|
107
177
|
# @return [Boolean] True if @url and @html are equal, false if not.
|
108
|
-
def ==(
|
109
|
-
return false unless
|
178
|
+
def ==(other)
|
179
|
+
return false unless other.is_a?(Wgit::Document)
|
110
180
|
|
111
|
-
(@url ==
|
181
|
+
(@url == other.url) && (@html == other.html)
|
112
182
|
end
|
113
183
|
|
114
184
|
# Is a shortcut for calling Document#html[range].
|
@@ -129,33 +199,38 @@ module Wgit
|
|
129
199
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
130
200
|
# <base> element's href value or @url (if @base is nil). If @base is
|
131
201
|
# present and relative, then @url.to_base + @base is returned. This method
|
132
|
-
# should be used instead of `doc.url.to_base` etc.
|
133
|
-
# absolute links.
|
202
|
+
# should be used instead of `doc.url.to_base` etc. when manually building
|
203
|
+
# absolute links from relative links.
|
134
204
|
#
|
135
205
|
# Provide the `link:` parameter to get the correct base URL for that type
|
136
206
|
# of link. For example, a link of `#top` would always return @url because
|
137
207
|
# it applies to that page, not a different one. Query strings work in the
|
138
|
-
# same way. Use this parameter if manually concatting
|
139
|
-
# `absolute_link = doc.base_url(link: link).concat(link)` etc.
|
208
|
+
# same way. Use this parameter if manually concatting Url's e.g.
|
140
209
|
#
|
141
|
-
#
|
210
|
+
# relative_link = Wgit::Url.new '?q=hello'
|
211
|
+
# absolute_link = doc.base_url(link: relative_link).concat(relative_link)
|
212
|
+
#
|
213
|
+
# This is similar to how Wgit::Document#internal_absolute_links works.
|
214
|
+
#
|
215
|
+
# @param link [Wgit::Url, String] The link to obtain the correct base URL
|
216
|
+
# for.
|
142
217
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
143
218
|
# 'http://example.com/public'.
|
144
219
|
def base_url(link: nil)
|
145
220
|
get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
|
146
221
|
|
147
222
|
if link
|
148
|
-
|
223
|
+
link = Wgit::Url.new(link)
|
149
224
|
raise "link must be relative: #{link}" unless link.is_relative?
|
150
225
|
|
151
|
-
if link.is_anchor? || link.
|
226
|
+
if link.is_anchor? || link.is_query?
|
152
227
|
base_url = @base ? get_base.call : @url
|
153
|
-
return base_url.without_anchor.
|
228
|
+
return base_url.without_anchor.without_query
|
154
229
|
end
|
155
230
|
end
|
156
231
|
|
157
232
|
base_url = @base ? get_base.call : @url.base
|
158
|
-
base_url.without_anchor.
|
233
|
+
base_url.without_anchor.without_query
|
159
234
|
end
|
160
235
|
|
161
236
|
# Returns a Hash containing this Document's instance vars.
|
@@ -165,52 +240,51 @@ module Wgit
|
|
165
240
|
# @param include_html [Boolean] Whether or not to include @html in the
|
166
241
|
# returned Hash.
|
167
242
|
# @return [Hash] Containing self's instance vars.
|
168
|
-
def to_h(include_html
|
243
|
+
def to_h(include_html: false)
|
169
244
|
ignore = include_html ? [] : ['@html']
|
170
|
-
ignore << '@doc' # Always ignore
|
171
|
-
|
245
|
+
ignore << '@doc' # Always ignore Nokogiri @doc.
|
246
|
+
|
247
|
+
Wgit::Utils.to_h(self, ignore: ignore)
|
172
248
|
end
|
173
249
|
|
174
|
-
# Converts this Document's to_h return value to a JSON String.
|
250
|
+
# Converts this Document's #to_h return value to a JSON String.
|
175
251
|
#
|
176
252
|
# @param include_html [Boolean] Whether or not to include @html in the
|
177
253
|
# returned JSON String.
|
178
254
|
# @return [String] This Document represented as a JSON String.
|
179
|
-
def to_json(include_html
|
180
|
-
h = to_h(include_html)
|
255
|
+
def to_json(include_html: false)
|
256
|
+
h = to_h(include_html: include_html)
|
181
257
|
JSON.generate(h)
|
182
258
|
end
|
183
259
|
|
184
260
|
# Returns a Hash containing this Document's instance variables and
|
185
|
-
# their
|
261
|
+
# their #length (if they respond to it). Works dynamically so that any
|
186
262
|
# user defined extensions (and their created instance vars) will appear in
|
187
263
|
# the returned Hash as well. The number of text snippets as well as total
|
188
264
|
# number of textual bytes are always included in the returned Hash.
|
189
265
|
#
|
190
|
-
# @return [Hash] Containing self's HTML statistics.
|
266
|
+
# @return [Hash] Containing self's HTML page statistics.
|
191
267
|
def stats
|
192
268
|
hash = {}
|
193
269
|
instance_variables.each do |var|
|
194
270
|
# Add up the total bytes of text as well as the length.
|
195
271
|
if var == :@text
|
196
|
-
count = 0
|
197
|
-
@text.each { |t| count += t.length }
|
198
272
|
hash[:text_snippets] = @text.length
|
199
|
-
hash[:text_bytes]
|
273
|
+
hash[:text_bytes] = @text.sum(&:length)
|
200
274
|
# Else take the var's #length method return value.
|
201
275
|
else
|
202
276
|
next unless instance_variable_get(var).respond_to?(:length)
|
203
277
|
|
204
|
-
hash[var[1..-1].to_sym] =
|
205
|
-
instance_variable_get(var).send(:length)
|
278
|
+
hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
|
206
279
|
end
|
207
280
|
end
|
281
|
+
|
208
282
|
hash
|
209
283
|
end
|
210
284
|
|
211
285
|
# Determine the size of this Document's HTML.
|
212
286
|
#
|
213
|
-
# @return [Integer] The total number of
|
287
|
+
# @return [Integer] The total number of @html bytes.
|
214
288
|
def size
|
215
289
|
stats[:html]
|
216
290
|
end
|
@@ -242,56 +316,55 @@ module Wgit
|
|
242
316
|
@doc.css(selector)
|
243
317
|
end
|
244
318
|
|
245
|
-
#
|
246
|
-
# meaning a link to another document on the same host.
|
247
|
-
#
|
248
|
-
#
|
249
|
-
#
|
250
|
-
#
|
251
|
-
#
|
319
|
+
# Returns all internal links from this Document in relative form. Internal
|
320
|
+
# meaning a link to another document on the same host.
|
321
|
+
#
|
322
|
+
# This Document's host is used to determine if an absolute URL is actually
|
323
|
+
# a relative link e.g. For a Document representing
|
324
|
+
# http://www.server.com/about, an absolute link of
|
325
|
+
# <a href='http://www.server.com/search'> will be recognized and returned
|
326
|
+
# as an internal link because both Documents live on the same host. Also
|
327
|
+
# see Wgit::Document#internal_absolute_links.
|
252
328
|
#
|
253
|
-
# @return [Array<Wgit::Url>]
|
329
|
+
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
254
330
|
def internal_links
|
255
331
|
return [] if @links.empty?
|
256
332
|
|
257
333
|
links = @links
|
258
334
|
.select { |link| link.is_relative?(host: @url.to_base) }
|
259
335
|
.map(&:without_base)
|
260
|
-
.map do |link| #
|
336
|
+
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
261
337
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
262
338
|
end
|
263
339
|
|
264
340
|
Wgit::Utils.process_arr(links)
|
265
341
|
end
|
266
342
|
|
267
|
-
#
|
268
|
-
#
|
343
|
+
# Returns all internal links from this Document in absolute form by
|
344
|
+
# appending them to self's #base_url. Also see
|
269
345
|
# Wgit::Document#internal_links.
|
270
346
|
#
|
271
|
-
# @return [Array<Wgit::Url>]
|
272
|
-
|
273
|
-
|
274
|
-
links = internal_links
|
275
|
-
return [] if links.empty?
|
276
|
-
|
277
|
-
links.map { |link| base_url(link: link).concat(link) }
|
347
|
+
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
348
|
+
def internal_absolute_links
|
349
|
+
internal_links.map { |link| base_url(link: link).concat(link) }
|
278
350
|
end
|
279
351
|
|
280
|
-
#
|
281
|
-
# a different host.
|
352
|
+
# Returns all external links from this Document in absolute form. External
|
353
|
+
# meaning a link to a different host.
|
282
354
|
#
|
283
|
-
# @return [Array<Wgit::Url>]
|
355
|
+
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
284
356
|
def external_links
|
285
357
|
return [] if @links.empty?
|
286
358
|
|
287
359
|
links = @links
|
288
|
-
.reject { |link| link.
|
360
|
+
.reject { |link| link.is_relative?(host: @url.to_base) }
|
289
361
|
.map(&:without_trailing_slash)
|
290
362
|
|
291
363
|
Wgit::Utils.process_arr(links)
|
292
364
|
end
|
293
365
|
|
294
|
-
# Searches
|
366
|
+
# Searches the @text for the given query and returns the results.
|
367
|
+
#
|
295
368
|
# The number of search hits for each sentence are recorded internally
|
296
369
|
# and used to rank/sort the search results before being returned. Where
|
297
370
|
# the Wgit::Database#search method searches all documents for the most hits,
|
@@ -302,24 +375,33 @@ module Wgit
|
|
302
375
|
# original sentence, which ever is less. The algorithm obviously ensures
|
303
376
|
# that the search query is visible somewhere in the sentence.
|
304
377
|
#
|
305
|
-
# @param query [String] The value to search the document's
|
378
|
+
# @param query [String, Object#to_s] The value to search the document's
|
379
|
+
# @text for.
|
380
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
381
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
382
|
+
# for as one whole phrase (true) or separately as individual words (false).
|
306
383
|
# @param sentence_limit [Integer] The max length of each search result
|
307
384
|
# sentence.
|
308
|
-
# @return [Array<String>]
|
309
|
-
def search(
|
385
|
+
# @return [Array<String>] A subset of @text, matching the query.
|
386
|
+
def search(
|
387
|
+
query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
|
388
|
+
)
|
389
|
+
query = query.to_s
|
310
390
|
raise 'A search query must be provided' if query.empty?
|
311
391
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
312
392
|
|
393
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
394
|
+
regex = Regexp.new(query, !case_sensitive)
|
313
395
|
results = {}
|
314
|
-
regex = Regexp.new(query, Regexp::IGNORECASE)
|
315
396
|
|
316
397
|
@text.each do |sentence|
|
317
398
|
hits = sentence.scan(regex).count
|
318
|
-
next unless hits
|
399
|
+
next unless hits.positive?
|
319
400
|
|
320
401
|
sentence.strip!
|
321
|
-
index = sentence.index(regex)
|
402
|
+
index = sentence.index(regex) # Index of first match.
|
322
403
|
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
404
|
+
|
323
405
|
results[sentence] = hits
|
324
406
|
end
|
325
407
|
|
@@ -334,112 +416,33 @@ module Wgit
|
|
334
416
|
# functionality. The original text is returned; no other reference to it
|
335
417
|
# is kept thereafter.
|
336
418
|
#
|
337
|
-
# @param query [String] The value to search the document's
|
419
|
+
# @param query [String, Object#to_s] The value to search the document's
|
420
|
+
# @text for.
|
421
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
422
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
423
|
+
# for as one whole phrase (true) or separately as individual words (false).
|
338
424
|
# @param sentence_limit [Integer] The max length of each search result
|
339
425
|
# sentence.
|
340
426
|
# @return [Array<String>] This Document's original @text value.
|
341
|
-
def search!(
|
427
|
+
def search!(
|
428
|
+
query, case_sensitive: false, whole_sentence: false, sentence_limit: 80
|
429
|
+
)
|
342
430
|
orig_text = @text
|
343
|
-
@text = search(
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
### Document (Class) methods ###
|
431
|
+
@text = search(
|
432
|
+
query, case_sensitive: case_sensitive,
|
433
|
+
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
434
|
+
)
|
348
435
|
|
349
|
-
|
350
|
-
# all of the combined text on a webpage.
|
351
|
-
#
|
352
|
-
# @return [String] An xpath String to obtain a webpage's text elements.
|
353
|
-
def self.text_elements_xpath
|
354
|
-
xpath = ''
|
355
|
-
return xpath if Wgit::Document.text_elements.empty?
|
356
|
-
|
357
|
-
el_xpath = '//%s/text()'
|
358
|
-
Wgit::Document.text_elements.each_with_index do |el, i|
|
359
|
-
xpath += ' | ' unless i == 0
|
360
|
-
xpath += format(el_xpath, el)
|
361
|
-
end
|
362
|
-
xpath
|
363
|
-
end
|
364
|
-
|
365
|
-
# Initialises a private instance variable with the xpath or database object
|
366
|
-
# result(s). When initialising from HTML, a true singleton value will only
|
367
|
-
# ever return one result otherwise all xpath results are returned in an
|
368
|
-
# Array. When initialising from a database object, the value is taken as
|
369
|
-
# is and singleton is only used to define the default empty value.
|
370
|
-
# If a value cannot be found (in either the HTML or database object), then
|
371
|
-
# a default will be used. The default is: singleton ? nil : [].
|
372
|
-
#
|
373
|
-
# Note that defined extensions work for both documents being crawled from
|
374
|
-
# the WWW and for documents being retrieved from the database. This
|
375
|
-
# effectively implements ORM like behavior using this class.
|
376
|
-
#
|
377
|
-
# @param var [Symbol] The name of the variable to be initialised.
|
378
|
-
# @param xpath [String, Object#call] The xpath used to find the element(s)
|
379
|
-
# of the webpage. Pass a callable object (proc etc.) if you want the
|
380
|
-
# xpath value to be derived on Document initialisation (instead of when
|
381
|
-
# the extension is defined). The call method must return a valid xpath
|
382
|
-
# String.
|
383
|
-
# @param options [Hash] The options to define an extension with.
|
384
|
-
# @option options [Boolean] :singleton The singleton option determines
|
385
|
-
# whether or not the result(s) should be in an Array. If multiple
|
386
|
-
# results are found and singleton is true then the first result will be
|
387
|
-
# used. Defaults to true.
|
388
|
-
# @option options [Boolean] :text_content_only The text_content_only option
|
389
|
-
# if true will use the text content of the Nokogiri result object,
|
390
|
-
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
391
|
-
# @yield [Object, Symbol] Yields the value about to be assigned to the new
|
392
|
-
# var and the source of the value (either :html or :object aka database).
|
393
|
-
# The return value of the block becomes the new var value, unless nil.
|
394
|
-
# Return nil if you want to inspect but not change the var value. The
|
395
|
-
# block gets executed when a Document is initialized from html or an
|
396
|
-
# object.
|
397
|
-
# @return [Symbol] The first half of the newly defined method names e.g.
|
398
|
-
# if var == "title" then :init_title is returned.
|
399
|
-
def self.define_extension(var, xpath, options = {}, &block)
|
400
|
-
default_options = { singleton: true, text_content_only: true }
|
401
|
-
options = default_options.merge(options)
|
402
|
-
|
403
|
-
# Define the private init_*_from_html method for HTML.
|
404
|
-
# Gets the HTML's xpath value and creates a var for it.
|
405
|
-
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
406
|
-
result = find_in_html(xpath, options, &block)
|
407
|
-
init_var(var, result)
|
408
|
-
end
|
409
|
-
Document.send :private, func_name
|
410
|
-
|
411
|
-
# Define the private init_*_from_object method for a Database object.
|
412
|
-
# Gets the Object's "key" value and creates a var for it.
|
413
|
-
func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
|
414
|
-
result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
|
415
|
-
init_var(var, result)
|
416
|
-
end
|
417
|
-
Document.send :private, func_name
|
418
|
-
|
419
|
-
"init_#{var}".to_sym
|
420
|
-
end
|
421
|
-
|
422
|
-
# Removes the init_* methods created when an extension is defined.
|
423
|
-
# Therefore, this is the opposing method to Document.define_extension.
|
424
|
-
# Returns true if successful or false if the method(s) cannot be found.
|
425
|
-
#
|
426
|
-
# @param var [Symbol] The extension variable already defined.
|
427
|
-
# @return [Boolean] True if the extension var was found and removed;
|
428
|
-
# otherwise false.
|
429
|
-
def self.remove_extension(var)
|
430
|
-
Document.send(:remove_method, "init_#{var}_from_html")
|
431
|
-
Document.send(:remove_method, "init_#{var}_from_object")
|
432
|
-
true
|
433
|
-
rescue NameError
|
434
|
-
false
|
436
|
+
orig_text
|
435
437
|
end
|
436
438
|
|
437
439
|
protected
|
438
440
|
|
439
441
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
440
442
|
# Override this method to custom configure the Nokogiri object returned.
|
441
|
-
# Gets called from Wgit::Document.new.
|
443
|
+
# Gets called from Wgit::Document.new upon initialization.
|
442
444
|
#
|
445
|
+
# @raise [StandardError] If @html isn't set.
|
443
446
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
444
447
|
def init_nokogiri
|
445
448
|
raise '@html must be set' unless @html
|
@@ -459,31 +462,30 @@ module Wgit
|
|
459
462
|
# Object) : results (Array).
|
460
463
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
461
464
|
# (String) : result (Nokogiri Object).
|
462
|
-
# @yield [
|
463
|
-
# instance variable so that you can inspect/alter the value if
|
464
|
-
# Return nil from the block if you don't want to override the
|
465
|
-
# given the source which is always :html.
|
465
|
+
# @yield [value, source] Given the value (String/Object) before it's set as
|
466
|
+
# an instance variable so that you can inspect/alter the value if
|
467
|
+
# desired. Return nil from the block if you don't want to override the
|
468
|
+
# value. Also given the source (Symbol) which is always :html.
|
466
469
|
# @return [String, Object] The value found in the html or the default value
|
467
470
|
# (singleton ? nil : []).
|
468
471
|
def find_in_html(xpath, singleton: true, text_content_only: true)
|
469
|
-
|
472
|
+
default = singleton ? nil : []
|
473
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
470
474
|
results = @doc.xpath(xpath)
|
471
475
|
|
472
|
-
if results
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
result = singleton ? nil : []
|
480
|
-
end
|
476
|
+
return default if results.nil? || results.empty?
|
477
|
+
|
478
|
+
result = if singleton
|
479
|
+
text_content_only ? results.first.content : results.first
|
480
|
+
else
|
481
|
+
text_content_only ? results.map(&:content) : results
|
482
|
+
end
|
481
483
|
|
482
484
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
483
485
|
|
484
486
|
if block_given?
|
485
487
|
new_result = yield(result, :html)
|
486
|
-
result = new_result
|
488
|
+
result = new_result unless new_result.nil?
|
487
489
|
end
|
488
490
|
|
489
491
|
result
|
@@ -494,22 +496,23 @@ module Wgit
|
|
494
496
|
# @param obj [Object#fetch] The object containing the key/value.
|
495
497
|
# @param key [String] Used to find the value in the obj.
|
496
498
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
497
|
-
# @yield [
|
498
|
-
# instance variable so that you can inspect/alter the value if
|
499
|
-
# Return nil from the block if you don't want to override the
|
500
|
-
# given the source which is always :object.
|
499
|
+
# @yield [value, source] Given the value (String/Object) before it's set as
|
500
|
+
# an instance variable so that you can inspect/alter the value if
|
501
|
+
# desired. Return nil from the block if you don't want to override the
|
502
|
+
# value. Also given the source (Symbol) which is always :object.
|
501
503
|
# @return [String, Object] The value found in the obj or the default value
|
502
504
|
# (singleton ? nil : []).
|
503
505
|
def find_in_object(obj, key, singleton: true)
|
504
506
|
assert_respond_to(obj, :fetch)
|
505
507
|
|
506
508
|
default = singleton ? nil : []
|
507
|
-
result
|
509
|
+
result = obj.fetch(key.to_s, default)
|
510
|
+
|
508
511
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
509
512
|
|
510
513
|
if block_given?
|
511
514
|
new_result = yield(result, :object)
|
512
|
-
result = new_result
|
515
|
+
result = new_result unless new_result.nil?
|
513
516
|
end
|
514
517
|
|
515
518
|
result
|
@@ -517,6 +520,54 @@ module Wgit
|
|
517
520
|
|
518
521
|
private
|
519
522
|
|
523
|
+
# Initialise the Document from URL and HTML Strings.
|
524
|
+
def init_from_strings(url, html)
|
525
|
+
assert_types(html, [String, NilClass])
|
526
|
+
|
527
|
+
# We already know url.is_a?(String) so parse into Url unless already so.
|
528
|
+
@url = Wgit::Url.parse(url)
|
529
|
+
@html = html || ''
|
530
|
+
@doc = init_nokogiri
|
531
|
+
@score = 0.0
|
532
|
+
|
533
|
+
process_url_and_html
|
534
|
+
|
535
|
+
# Dynamically run the init_*_from_html methods.
|
536
|
+
Document.private_instance_methods(false).each do |method|
|
537
|
+
if method.to_s.start_with?('init_') &&
|
538
|
+
method.to_s.end_with?('_from_html')
|
539
|
+
send(method) unless method == __method__
|
540
|
+
end
|
541
|
+
end
|
542
|
+
end
|
543
|
+
|
544
|
+
# Initialise the Document from a Hash like Object containing Strings as
|
545
|
+
# keys e.g. database collection object or Hash.
|
546
|
+
def init_from_object(obj)
|
547
|
+
assert_respond_to(obj, :fetch)
|
548
|
+
|
549
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
550
|
+
@html = obj.fetch('html', '')
|
551
|
+
@doc = init_nokogiri
|
552
|
+
@score = obj.fetch('score', 0.0)
|
553
|
+
|
554
|
+
process_url_and_html
|
555
|
+
|
556
|
+
# Dynamically run the init_*_from_object methods.
|
557
|
+
Document.private_instance_methods(false).each do |method|
|
558
|
+
if method.to_s.start_with?('init_') &&
|
559
|
+
method.to_s.end_with?('_from_object')
|
560
|
+
send(method, obj) unless method == __method__
|
561
|
+
end
|
562
|
+
end
|
563
|
+
end
|
564
|
+
|
565
|
+
# Ensure the @url and @html Strings are correctly encoded etc.
|
566
|
+
def process_url_and_html
|
567
|
+
@url = Wgit::Utils.process_str(@url)
|
568
|
+
@html = Wgit::Utils.process_str(@html)
|
569
|
+
end
|
570
|
+
|
520
571
|
# Initialises an instance variable and defines a getter method for it.
|
521
572
|
#
|
522
573
|
# @param var [Symbol] The name of the variable to be initialized.
|
@@ -535,19 +586,8 @@ module Wgit
|
|
535
586
|
end
|
536
587
|
end
|
537
588
|
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
@html = Wgit::Utils.process_str(@html)
|
542
|
-
end
|
543
|
-
|
544
|
-
alias relative_links internal_links
|
545
|
-
alias relative_urls internal_links
|
546
|
-
alias relative_full_links internal_full_links
|
547
|
-
alias relative_full_urls internal_full_links
|
548
|
-
alias internal_absolute_links internal_full_links
|
549
|
-
alias relative_absolute_links internal_full_links
|
550
|
-
alias relative_absolute_urls internal_full_links
|
551
|
-
alias external_urls external_links
|
589
|
+
alias internal_urls internal_links
|
590
|
+
alias internal_absolute_urls internal_absolute_links
|
591
|
+
alias external_urls external_links
|
552
592
|
end
|
553
593
|
end
|