wgit 0.5.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +240 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +239 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +234 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -17
- data/lib/wgit/url.rb +170 -42
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +8 -2
- metadata +54 -32
data/lib/wgit/database/model.rb
CHANGED
@@ -14,8 +14,7 @@ module Wgit
|
|
14
14
|
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
15
15
|
|
16
16
|
model = url.to_h
|
17
|
-
|
18
|
-
Wgit::Utils.remove_non_bson_types(model)
|
17
|
+
select_bson_types(model)
|
19
18
|
end
|
20
19
|
|
21
20
|
# The data model for a Wgit::Document collection object.
|
@@ -28,7 +27,7 @@ module Wgit
|
|
28
27
|
model = doc.to_h(include_html: false, include_score: false)
|
29
28
|
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
29
|
|
31
|
-
|
30
|
+
select_bson_types(model)
|
32
31
|
end
|
33
32
|
|
34
33
|
# Common fields when inserting a record into the DB.
|
@@ -49,5 +48,13 @@ module Wgit
|
|
49
48
|
date_modified: Wgit::Utils.time_stamp
|
50
49
|
}
|
51
50
|
end
|
51
|
+
|
52
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
53
|
+
#
|
54
|
+
# @param model_hash [Hash] The model Hash to sanitize.
|
55
|
+
# @return [Hash] The model Hash with non bson types removed.
|
56
|
+
def self.select_bson_types(model_hash)
|
57
|
+
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
58
|
+
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -3,45 +3,56 @@ require_relative 'utils'
|
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'json'
|
6
|
+
require 'set'
|
6
7
|
|
7
8
|
module Wgit
|
8
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
9
10
|
# will work e.g. images etc. Also doubles as a search result when
|
10
|
-
# loading Documents from the database via Wgit::Database#search
|
11
|
+
# loading Documents from the database via `Wgit::Database#search`.
|
11
12
|
#
|
12
13
|
# The initialize method dynamically initializes instance variables from the
|
13
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
-
# Document class can be easily extended allowing you to
|
15
|
-
# a webpage that are important to you. See Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
16
17
|
class Document
|
17
18
|
include Assertable
|
18
19
|
|
19
|
-
# Regex for the allowed var names when defining an
|
20
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
21
22
|
|
22
|
-
#
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
# Set of text elements used to build Document#text.
|
24
|
+
@text_elements = Set.new(%i[
|
25
|
+
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
|
+
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
27
|
+
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
28
|
+
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
+
summary sup td textarea th time u ul var wbr
|
30
|
+
])
|
31
|
+
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
29
34
|
|
30
35
|
class << self
|
31
|
-
#
|
36
|
+
# Set of HTML elements that make up the visible text on a page. These
|
37
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
38
|
+
# README.md for how to add to this Set dynamically.
|
32
39
|
attr_reader :text_elements
|
40
|
+
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
33
44
|
end
|
34
45
|
|
35
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
36
47
|
attr_reader :url
|
37
48
|
|
38
|
-
# The HTML of the
|
49
|
+
# The content/HTML of the document, an instance of String.
|
39
50
|
attr_reader :html
|
40
51
|
|
41
52
|
# The Nokogiri::HTML document object initialized from @html.
|
42
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
43
54
|
|
44
|
-
# The score is only used following a Database#search and records matches.
|
55
|
+
# The score is only used following a `Database#search` and records matches.
|
45
56
|
attr_reader :score
|
46
57
|
|
47
58
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
@@ -50,29 +61,31 @@ module Wgit
|
|
50
61
|
# pages retrieved from the database.
|
51
62
|
#
|
52
63
|
# During initialisation, the Document will call any private
|
53
|
-
#
|
54
|
-
#
|
64
|
+
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
55
66
|
#
|
56
|
-
# @param url_or_obj [String, Wgit::Url,
|
67
|
+
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
57
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
58
69
|
# MongoDB collection object. The Object's :fetch method should support
|
59
70
|
# Strings as keys.
|
60
|
-
# @param html [String, NilClass] The crawled web page's HTML. This
|
61
|
-
# only used if url_or_obj is a String representing the web
|
62
|
-
# Otherwise, the HTML comes from the database object. A html
|
63
|
-
# be defaulted to an empty String.
|
64
|
-
|
71
|
+
# @param html [String, NilClass] The crawled web page's content/HTML. This
|
72
|
+
# param is only used if url_or_obj is a String representing the web
|
73
|
+
# page's URL. Otherwise, the HTML comes from the database object. A html
|
74
|
+
# of nil will be defaulted to an empty String.
|
75
|
+
# @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
|
76
|
+
# false if the Document content is an image etc.
|
77
|
+
def initialize(url_or_obj, html = '', encode: true)
|
65
78
|
if url_or_obj.is_a?(String)
|
66
|
-
init_from_strings(url_or_obj, html,
|
79
|
+
init_from_strings(url_or_obj, html, encode: encode)
|
67
80
|
else
|
68
|
-
init_from_object(url_or_obj,
|
81
|
+
init_from_object(url_or_obj, encode: encode)
|
69
82
|
end
|
70
83
|
end
|
71
84
|
|
72
85
|
### Document Class Methods ###
|
73
86
|
|
74
87
|
# Uses Document.text_elements to build an xpath String, used to obtain
|
75
|
-
# all of the combined text on a webpage.
|
88
|
+
# all of the combined visual text on a webpage.
|
76
89
|
#
|
77
90
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
78
91
|
def self.text_elements_xpath
|
@@ -88,86 +101,101 @@ module Wgit
|
|
88
101
|
xpath
|
89
102
|
end
|
90
103
|
|
91
|
-
# Defines
|
92
|
-
# instance variables upon Document initialization. See the default
|
93
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
94
110
|
#
|
95
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
96
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
97
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
98
114
|
# the xpath or database object result(s).
|
99
115
|
#
|
100
116
|
# When initialising from HTML, a singleton value of true will only
|
101
|
-
# ever return
|
102
|
-
# Array. When initialising from a database object, the value
|
103
|
-
# is and singleton is only used to define the default empty
|
104
|
-
# If a value cannot be found (in either the HTML or database
|
105
|
-
# a default will be used. The default value is:
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# @param
|
117
|
+
# ever return the first result found; otherwise all the results are
|
118
|
+
# returned in an Array. When initialising from a database object, the value
|
119
|
+
# is taken as is and singleton is only used to define the default empty
|
120
|
+
# value. If a value cannot be found (in either the HTML or database
|
121
|
+
# object), then a default will be used. The default value is:
|
122
|
+
# `singleton ? nil : []`.
|
123
|
+
#
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and setter method is defined
|
126
|
+
# for the initialised variable.
|
127
|
+
# @param xpath [String, #call] The xpath used to find the element(s)
|
109
128
|
# of the webpage. Only used when initializing from HTML.
|
110
129
|
#
|
111
130
|
# Pass a callable object (proc etc.) if you want the
|
112
131
|
# xpath value to be derived on Document initialisation (instead of when
|
113
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
114
133
|
# String.
|
115
|
-
# @param
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
116
135
|
# options are only used when intializing from HTML, not the database.
|
117
|
-
# @option
|
136
|
+
# @option opts [Boolean] :singleton The singleton option determines
|
118
137
|
# whether or not the result(s) should be in an Array. If multiple
|
119
138
|
# results are found and singleton is true then the first result will be
|
120
139
|
# used. Defaults to true.
|
121
|
-
# @option
|
140
|
+
# @option opts [Boolean] :text_content_only The text_content_only option
|
122
141
|
# if true will use the text content of the Nokogiri result object,
|
123
142
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
124
|
-
# @yield
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
#
|
129
|
-
#
|
130
|
-
#
|
143
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
144
|
+
# regardless of the source. Use it (optionally) to process the result
|
145
|
+
# value.
|
146
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
147
|
+
# `var`.
|
148
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
149
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
150
|
+
# `:object`.
|
151
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
152
|
+
# value. Return the block's value param unchanged if you want to inspect.
|
131
153
|
# @raise [StandardError] If the var param isn't valid.
|
132
|
-
# @return [Symbol] The given var Symbol.
|
133
|
-
def self.
|
154
|
+
# @return [Symbol] The given var Symbol if successful.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
134
156
|
var = var.to_sym
|
135
|
-
|
136
|
-
|
157
|
+
defaults = { singleton: true, text_content_only: true }
|
158
|
+
opts = defaults.merge(opts)
|
137
159
|
|
138
|
-
raise "var must match #{
|
139
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
140
162
|
|
141
163
|
# Define the private init_*_from_html method for HTML.
|
142
164
|
# Gets the HTML's xpath value and creates a var for it.
|
143
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
144
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
145
167
|
init_var(var, result)
|
146
168
|
end
|
147
|
-
Document.send
|
169
|
+
Document.send(:private, func_name)
|
148
170
|
|
149
171
|
# Define the private init_*_from_object method for a Database object.
|
150
172
|
# Gets the Object's 'key' value and creates a var for it.
|
151
|
-
func_name = Document.send(
|
152
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
153
179
|
init_var(var, result)
|
154
180
|
end
|
155
|
-
Document.send
|
181
|
+
Document.send(:private, func_name)
|
156
182
|
|
183
|
+
@extractors << var
|
157
184
|
var
|
158
185
|
end
|
159
186
|
|
160
|
-
# Removes the init_
|
161
|
-
# Therefore, this is the opposing method to Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
162
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
163
190
|
#
|
164
|
-
# @param var [Symbol] The
|
165
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
166
193
|
# otherwise false.
|
167
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
168
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
169
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
170
197
|
|
198
|
+
@extractors.delete(var.to_sym)
|
171
199
|
true
|
172
200
|
rescue NameError
|
173
201
|
false
|
@@ -186,7 +214,7 @@ module Wgit
|
|
186
214
|
(@url == other.url) && (@html == other.html)
|
187
215
|
end
|
188
216
|
|
189
|
-
#
|
217
|
+
# Shortcut for calling Document#html[range].
|
190
218
|
#
|
191
219
|
# @param range [Range] The range of @html to return.
|
192
220
|
# @return [String] The given range of @html.
|
@@ -196,9 +224,9 @@ module Wgit
|
|
196
224
|
|
197
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
198
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
199
|
-
# present and relative, then @url.
|
200
|
-
# should be used instead of `doc.url.
|
201
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
202
230
|
#
|
203
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
204
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -217,12 +245,16 @@ module Wgit
|
|
217
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
218
246
|
# 'http://example.com/public'.
|
219
247
|
def base_url(link: nil)
|
220
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
221
248
|
if @url.relative? && @base.nil?
|
222
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
223
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
224
256
|
|
225
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
226
258
|
|
227
259
|
if link
|
228
260
|
link = Wgit::Url.new(link)
|
@@ -234,7 +266,7 @@ module Wgit
|
|
234
266
|
end
|
235
267
|
end
|
236
268
|
|
237
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
238
270
|
base_url.omit_fragment.omit_query
|
239
271
|
end
|
240
272
|
|
@@ -248,7 +280,7 @@ module Wgit
|
|
248
280
|
def to_h(include_html: false, include_score: true)
|
249
281
|
ignore = include_html ? [] : ['@html']
|
250
282
|
ignore << '@score' unless include_score
|
251
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
252
284
|
|
253
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
254
286
|
end
|
@@ -265,7 +297,7 @@ module Wgit
|
|
265
297
|
|
266
298
|
# Returns a Hash containing this Document's instance variables and
|
267
299
|
# their #length (if they respond to it). Works dynamically so that any
|
268
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
269
301
|
# the returned Hash as well. The number of text snippets as well as total
|
270
302
|
# number of textual bytes are always included in the returned Hash.
|
271
303
|
#
|
@@ -275,8 +307,8 @@ module Wgit
|
|
275
307
|
instance_variables.each do |var|
|
276
308
|
# Add up the total bytes of text as well as the length.
|
277
309
|
if var == :@text
|
278
|
-
hash[:
|
279
|
-
hash[:text_bytes]
|
310
|
+
hash[:text] = @text.length
|
311
|
+
hash[:text_bytes] = @text.sum(&:length)
|
280
312
|
# Else take the var's #length method return value.
|
281
313
|
else
|
282
314
|
next unless instance_variable_get(var).respond_to?(:length)
|
@@ -305,25 +337,43 @@ module Wgit
|
|
305
337
|
end
|
306
338
|
|
307
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
308
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
309
341
|
#
|
310
342
|
# @param xpath [String] The xpath to search the @html with.
|
311
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
312
344
|
def xpath(xpath)
|
313
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
346
|
+
end
|
347
|
+
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
314
355
|
end
|
315
356
|
|
316
|
-
# Uses Nokogiri's css method to search the doc's html and return the
|
317
|
-
# results.
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
318
359
|
#
|
319
360
|
# @param selector [String] The CSS selector to search the @html with.
|
320
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
321
362
|
def css(selector)
|
322
|
-
@
|
363
|
+
@parser.css(selector)
|
323
364
|
end
|
324
365
|
|
325
|
-
#
|
326
|
-
#
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
373
|
+
end
|
374
|
+
|
375
|
+
# Returns all unique internal links from this Document in relative form.
|
376
|
+
# Internal meaning a link to another document on the same host.
|
327
377
|
#
|
328
378
|
# This Document's host is used to determine if an absolute URL is actually
|
329
379
|
# a relative link e.g. For a Document representing
|
@@ -332,41 +382,41 @@ module Wgit
|
|
332
382
|
# as an internal link because both Documents live on the same host. Also
|
333
383
|
# see Wgit::Document#internal_absolute_links.
|
334
384
|
#
|
335
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
385
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
|
336
386
|
def internal_links
|
337
387
|
return [] if @links.empty?
|
338
388
|
|
339
389
|
links = @links
|
340
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
341
391
|
.map(&:omit_base)
|
342
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
343
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
344
394
|
end
|
345
395
|
|
346
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
347
397
|
end
|
348
398
|
|
349
|
-
# Returns all internal links from this Document in absolute form by
|
399
|
+
# Returns all unique internal links from this Document in absolute form by
|
350
400
|
# appending them to self's #base_url. Also see
|
351
401
|
# Wgit::Document#internal_links.
|
352
402
|
#
|
353
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
403
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
354
404
|
def internal_absolute_links
|
355
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
356
406
|
end
|
357
407
|
|
358
|
-
# Returns all external links from this Document in absolute form.
|
359
|
-
# meaning a link to a different host.
|
408
|
+
# Returns all unique external links from this Document in absolute form.
|
409
|
+
# External meaning a link to a different host.
|
360
410
|
#
|
361
|
-
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
411
|
+
# @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
|
362
412
|
def external_links
|
363
413
|
return [] if @links.empty?
|
364
414
|
|
365
415
|
links = @links
|
366
|
-
.reject { |link| link.relative?(host: @url.
|
416
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
367
417
|
.map(&:omit_trailing_slash)
|
368
418
|
|
369
|
-
Wgit::Utils.
|
419
|
+
Wgit::Utils.sanitize(links)
|
370
420
|
end
|
371
421
|
|
372
422
|
# Searches the @text for the given query and returns the results.
|
@@ -381,8 +431,8 @@ module Wgit
|
|
381
431
|
# original sentence, which ever is less. The algorithm obviously ensures
|
382
432
|
# that the search query is visible somewhere in the sentence.
|
383
433
|
#
|
384
|
-
# @param query [
|
385
|
-
# @text for.
|
434
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
435
|
+
# document's @text for.
|
386
436
|
# @param case_sensitive [Boolean] Whether character case must match.
|
387
437
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
388
438
|
# for separately.
|
@@ -390,21 +440,27 @@ module Wgit
|
|
390
440
|
# sentence.
|
391
441
|
# @return [Array<String>] A subset of @text, matching the query.
|
392
442
|
def search(
|
393
|
-
query, case_sensitive: false, whole_sentence:
|
443
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
394
444
|
)
|
395
|
-
query = query.to_s
|
396
|
-
raise 'A search query must be provided' if query.empty?
|
397
445
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
398
446
|
|
399
|
-
|
400
|
-
|
447
|
+
if query.is_a?(Regexp)
|
448
|
+
regex = query
|
449
|
+
else # respond_to? #to_s == true
|
450
|
+
query = query.to_s
|
451
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
452
|
+
regex = Regexp.new(query, !case_sensitive)
|
453
|
+
end
|
454
|
+
|
401
455
|
results = {}
|
402
456
|
|
403
457
|
@text.each do |sentence|
|
458
|
+
sentence = sentence.strip
|
459
|
+
next if results[sentence]
|
460
|
+
|
404
461
|
hits = sentence.scan(regex).count
|
405
462
|
next unless hits.positive?
|
406
463
|
|
407
|
-
sentence.strip!
|
408
464
|
index = sentence.index(regex) # Index of first match.
|
409
465
|
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
410
466
|
|
@@ -422,8 +478,8 @@ module Wgit
|
|
422
478
|
# functionality. The original text is returned; no other reference to it
|
423
479
|
# is kept thereafter.
|
424
480
|
#
|
425
|
-
# @param query [
|
426
|
-
# @text for.
|
481
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
482
|
+
# document's @text for.
|
427
483
|
# @param case_sensitive [Boolean] Whether character case must match.
|
428
484
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
429
485
|
# for separately.
|
@@ -431,7 +487,7 @@ module Wgit
|
|
431
487
|
# sentence.
|
432
488
|
# @return [String] This Document's original @text value.
|
433
489
|
def search!(
|
434
|
-
query, case_sensitive: false, whole_sentence:
|
490
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
435
491
|
)
|
436
492
|
orig_text = @text
|
437
493
|
@text = search(
|
@@ -442,104 +498,114 @@ module Wgit
|
|
442
498
|
orig_text
|
443
499
|
end
|
444
500
|
|
501
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
502
|
+
# parameter.
|
503
|
+
#
|
504
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
505
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
506
|
+
# Object) : results (Array).
|
507
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
508
|
+
# (String) : result (Nokogiri Object).
|
509
|
+
# @return [String, Object] The value found in the html or the default value
|
510
|
+
# (singleton ? nil : []).
|
511
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
512
|
+
send(
|
513
|
+
:extract_from_html, xpath,
|
514
|
+
singleton: singleton, text_content_only: text_content_only
|
515
|
+
)
|
516
|
+
end
|
517
|
+
|
445
518
|
protected
|
446
519
|
|
447
520
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
448
521
|
# Override this method to custom configure the Nokogiri object returned.
|
449
522
|
# Gets called from Wgit::Document.new upon initialization.
|
450
523
|
#
|
524
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
525
|
+
# initialisation.
|
451
526
|
# @raise [StandardError] If @html isn't set.
|
452
527
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
453
|
-
def init_nokogiri
|
528
|
+
def init_nokogiri(&block)
|
454
529
|
raise '@html must be set' unless @html
|
455
530
|
|
456
|
-
Nokogiri::HTML(@html)
|
457
|
-
# TODO: Remove #'s below when crawling in production.
|
458
|
-
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
459
|
-
# Nokogiri::XML::ParseOptions::NONET
|
460
|
-
end
|
531
|
+
Nokogiri::HTML(@html, &block)
|
461
532
|
end
|
462
533
|
|
463
|
-
#
|
534
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
464
535
|
# parameter.
|
465
536
|
#
|
466
|
-
# @param xpath [String] Used to find the value/object in @html.
|
537
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
467
538
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
468
539
|
# Object) : results (Array).
|
469
540
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
470
541
|
# (String) : result (Nokogiri Object).
|
471
|
-
# @yield
|
472
|
-
#
|
473
|
-
#
|
474
|
-
#
|
542
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
543
|
+
# regardless of the source. Use it (optionally) to process the result
|
544
|
+
# value.
|
545
|
+
# @yieldparam value [Object] The result value to be returned.
|
546
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
547
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
548
|
+
# `:object`.
|
549
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
550
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
475
551
|
# @return [String, Object] The value found in the html or the default value
|
476
552
|
# (singleton ? nil : []).
|
477
|
-
def
|
478
|
-
|
479
|
-
|
480
|
-
results = @doc.xpath(xpath)
|
481
|
-
|
482
|
-
return default if results.nil? || results.empty?
|
483
|
-
|
484
|
-
result = if singleton
|
485
|
-
text_content_only ? results.first.content : results.first
|
486
|
-
else
|
487
|
-
text_content_only ? results.map(&:content) : results
|
488
|
-
end
|
489
|
-
|
490
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
553
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
554
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
555
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
491
556
|
|
492
|
-
if
|
493
|
-
|
494
|
-
result = new_result unless new_result.nil?
|
557
|
+
if text_content_only
|
558
|
+
result = singleton ? result&.content : result.map(&:content)
|
495
559
|
end
|
496
560
|
|
561
|
+
Wgit::Utils.sanitize(result)
|
562
|
+
result = yield(result, self, :document) if block_given?
|
497
563
|
result
|
498
564
|
end
|
499
565
|
|
500
|
-
# Returns a value from the obj using the given key via obj#fetch
|
566
|
+
# Returns a value from the obj using the given key via `obj#fetch`.
|
501
567
|
#
|
502
|
-
# @param obj [
|
568
|
+
# @param obj [#fetch] The object containing the key/value.
|
503
569
|
# @param key [String] Used to find the value in the obj.
|
504
570
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
505
|
-
# @yield
|
506
|
-
#
|
507
|
-
#
|
508
|
-
#
|
571
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
572
|
+
# regardless of the source. Use it (optionally) to process the result
|
573
|
+
# value.
|
574
|
+
# @yieldparam value [Object] The result value to be returned.
|
575
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
576
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
577
|
+
# `:object`.
|
578
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
579
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
509
580
|
# @return [String, Object] The value found in the obj or the default value
|
510
581
|
# (singleton ? nil : []).
|
511
|
-
def
|
582
|
+
def extract_from_object(obj, key, singleton: true)
|
512
583
|
assert_respond_to(obj, :fetch)
|
513
584
|
|
514
585
|
default = singleton ? nil : []
|
515
586
|
result = obj.fetch(key.to_s, default)
|
516
587
|
|
517
|
-
|
518
|
-
|
519
|
-
if block_given?
|
520
|
-
new_result = yield(result, obj, :object)
|
521
|
-
result = new_result unless new_result.nil?
|
522
|
-
end
|
523
|
-
|
588
|
+
Wgit::Utils.sanitize(result)
|
589
|
+
result = yield(result, obj, :object) if block_given?
|
524
590
|
result
|
525
591
|
end
|
526
592
|
|
527
593
|
private
|
528
594
|
|
529
595
|
# Initialise the Document from URL and HTML Strings.
|
530
|
-
def init_from_strings(url, html,
|
596
|
+
def init_from_strings(url, html, encode: true)
|
531
597
|
assert_types(html, [String, NilClass])
|
532
598
|
|
533
599
|
# We already know url.is_a?(String) so parse into Url unless already so.
|
534
600
|
url = Wgit::Url.parse(url)
|
535
601
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
536
602
|
|
537
|
-
@url
|
538
|
-
@html
|
539
|
-
@
|
540
|
-
@score
|
603
|
+
@url = url
|
604
|
+
@html = html || ''
|
605
|
+
@parser = init_nokogiri
|
606
|
+
@score = 0.0
|
541
607
|
|
542
|
-
Wgit::Utils.
|
608
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
543
609
|
|
544
610
|
# Dynamically run the init_*_from_html methods.
|
545
611
|
Document.private_instance_methods(false).each do |method|
|
@@ -552,15 +618,15 @@ module Wgit
|
|
552
618
|
|
553
619
|
# Initialise the Document from a Hash like Object containing Strings as
|
554
620
|
# keys e.g. database collection object or Hash.
|
555
|
-
def init_from_object(obj,
|
621
|
+
def init_from_object(obj, encode: true)
|
556
622
|
assert_respond_to(obj, :fetch)
|
557
623
|
|
558
|
-
@url
|
559
|
-
@html
|
560
|
-
@
|
561
|
-
@score
|
624
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
625
|
+
@html = obj.fetch('html', '')
|
626
|
+
@parser = init_nokogiri
|
627
|
+
@score = obj.fetch('score', 0.0)
|
562
628
|
|
563
|
-
Wgit::Utils.
|
629
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
564
630
|
|
565
631
|
# Dynamically run the init_*_from_object methods.
|
566
632
|
Document.private_instance_methods(false).each do |method|
|
@@ -571,11 +637,11 @@ module Wgit
|
|
571
637
|
end
|
572
638
|
end
|
573
639
|
|
574
|
-
# Initialises an instance variable and defines
|
640
|
+
# Initialises an instance variable and defines an accessor method for it.
|
575
641
|
#
|
576
642
|
# @param var [Symbol] The name of the variable to be initialized.
|
577
643
|
# @param value [Object] The newly initialized variable's value.
|
578
|
-
# @return [Symbol] The name of the
|
644
|
+
# @return [Symbol] The name of the defined getter method.
|
579
645
|
def init_var(var, value)
|
580
646
|
# instance_var_name starts with @, var_name doesn't.
|
581
647
|
var = var.to_s
|
@@ -583,10 +649,9 @@ module Wgit
|
|
583
649
|
instance_var_name = "@#{var_name}".to_sym
|
584
650
|
|
585
651
|
instance_variable_set(instance_var_name, value)
|
652
|
+
Wgit::Document.attr_accessor(var_name)
|
586
653
|
|
587
|
-
|
588
|
-
instance_variable_get(instance_var_name)
|
589
|
-
end
|
654
|
+
var_name
|
590
655
|
end
|
591
656
|
|
592
657
|
alias content html
|