wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/document.rb
CHANGED
@@ -6,29 +6,41 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising an HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
|
-
#
|
24
|
-
|
23
|
+
# Set of text elements used to build Document#text.
|
24
|
+
@text_elements = Set.new(%i[
|
25
|
+
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
|
+
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
27
|
+
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
28
|
+
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
+
summary sup td textarea th time u ul var wbr
|
30
|
+
])
|
25
31
|
|
26
|
-
# Set of Symbols representing the defined Document
|
27
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
28
34
|
|
29
35
|
class << self
|
30
|
-
#
|
31
|
-
|
36
|
+
# Set of HTML elements that make up the visible text on a page. These
|
37
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
38
|
+
# README.md for how to add to this Set dynamically.
|
39
|
+
attr_reader :text_elements
|
40
|
+
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
32
44
|
end
|
33
45
|
|
34
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -38,7 +50,7 @@ module Wgit
|
|
38
50
|
attr_reader :html
|
39
51
|
|
40
52
|
# The Nokogiri::HTML document object initialized from @html.
|
41
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
42
54
|
|
43
55
|
# The score is only used following a `Database#search` and records matches.
|
44
56
|
attr_reader :score
|
@@ -50,7 +62,7 @@ module Wgit
|
|
50
62
|
#
|
51
63
|
# During initialisation, the Document will call any private
|
52
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
53
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
54
66
|
#
|
55
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
56
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -72,31 +84,54 @@ module Wgit
|
|
72
84
|
|
73
85
|
### Document Class Methods ###
|
74
86
|
|
75
|
-
#
|
76
|
-
#
|
77
|
-
# extensions defined in 'document_extensions.rb' as examples.
|
87
|
+
# Uses Document.text_elements to build an xpath String, used to obtain
|
88
|
+
# all of the combined visual text on a webpage.
|
78
89
|
#
|
79
|
-
#
|
90
|
+
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
|
+
def self.text_elements_xpath
|
92
|
+
xpath = ''
|
93
|
+
return xpath if Wgit::Document.text_elements.empty?
|
94
|
+
|
95
|
+
el_xpath = '//%s/text()'
|
96
|
+
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
+
xpath += ' | ' unless i.zero?
|
98
|
+
xpath += format(el_xpath, el)
|
99
|
+
end
|
100
|
+
|
101
|
+
xpath
|
102
|
+
end
|
103
|
+
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
110
|
+
#
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
80
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
81
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
82
114
|
# the xpath or database object result(s).
|
83
115
|
#
|
84
116
|
# When initialising from HTML, a singleton value of true will only
|
85
|
-
# ever return
|
86
|
-
# Array. When initialising from a database object, the value
|
87
|
-
# is and singleton is only used to define the default empty
|
88
|
-
# If a value cannot be found (in either the HTML or database
|
89
|
-
# a default will be used. The default value is:
|
90
|
-
#
|
91
|
-
#
|
117
|
+
# ever return the first result found; otherwise all the results are
|
118
|
+
# returned in an Array. When initialising from a database object, the value
|
119
|
+
# is taken as is and singleton is only used to define the default empty
|
120
|
+
# value. If a value cannot be found (in either the HTML or database
|
121
|
+
# object), then a default will be used. The default value is:
|
122
|
+
# `singleton ? nil : []`.
|
123
|
+
#
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and setter method is defined
|
126
|
+
# for the initialised variable.
|
92
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
93
128
|
# of the webpage. Only used when initializing from HTML.
|
94
129
|
#
|
95
130
|
# Pass a callable object (proc etc.) if you want the
|
96
131
|
# xpath value to be derived on Document initialisation (instead of when
|
97
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
98
133
|
# String.
|
99
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
100
135
|
# options are only used when initializing from HTML, not the database.
|
101
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
102
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -105,56 +140,62 @@ module Wgit
|
|
105
140
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
106
141
|
# if true will use the text content of the Nokogiri result object,
|
107
142
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
108
|
-
# @
|
109
|
-
#
|
110
|
-
#
|
111
|
-
#
|
112
|
-
#
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
143
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
144
|
+
# regardless of the source. Use it (optionally) to process the result
|
145
|
+
# value.
|
146
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
147
|
+
# `var`.
|
148
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
149
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
150
|
+
# `:object`.
|
151
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
152
|
+
# value. Return the block's value param unchanged if you want to inspect.
|
116
153
|
# @raise [StandardError] If the var param isn't valid.
|
117
154
|
# @return [Symbol] The given var Symbol if successful.
|
118
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
119
156
|
var = var.to_sym
|
120
157
|
defaults = { singleton: true, text_content_only: true }
|
121
158
|
opts = defaults.merge(opts)
|
122
159
|
|
123
|
-
raise "var must match #{
|
124
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
125
162
|
|
126
163
|
# Define the private init_*_from_html method for HTML.
|
127
164
|
# Gets the HTML's xpath value and creates a var for it.
|
128
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
129
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
130
167
|
init_var(var, result)
|
131
168
|
end
|
132
169
|
Document.send(:private, func_name)
|
133
170
|
|
134
171
|
# Define the private init_*_from_object method for a Database object.
|
135
172
|
# Gets the Object's 'key' value and creates a var for it.
|
136
|
-
func_name = Document.send(
|
137
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
138
179
|
init_var(var, result)
|
139
180
|
end
|
140
181
|
Document.send(:private, func_name)
|
141
182
|
|
142
|
-
@
|
183
|
+
@extractors << var
|
143
184
|
var
|
144
185
|
end
|
145
186
|
|
146
|
-
# Removes the init_
|
147
|
-
# Therefore, this is the opposing method to Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
148
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
149
190
|
#
|
150
|
-
# @param var [Symbol] The
|
151
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
152
193
|
# otherwise false.
|
153
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
154
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
155
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
156
197
|
|
157
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
158
199
|
true
|
159
200
|
rescue NameError
|
160
201
|
false
|
@@ -173,7 +214,7 @@ module Wgit
|
|
173
214
|
(@url == other.url) && (@html == other.html)
|
174
215
|
end
|
175
216
|
|
176
|
-
#
|
217
|
+
# Shortcut for calling Document#html[range].
|
177
218
|
#
|
178
219
|
# @param range [Range] The range of @html to return.
|
179
220
|
# @return [String] The given range of @html.
|
@@ -183,9 +224,9 @@ module Wgit
|
|
183
224
|
|
184
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
185
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
186
|
-
# present and relative, then @url.
|
187
|
-
# should be used instead of `doc.url.
|
188
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
189
230
|
#
|
190
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
191
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -204,12 +245,16 @@ module Wgit
|
|
204
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
205
246
|
# 'http://example.com/public'.
|
206
247
|
def base_url(link: nil)
|
207
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
208
248
|
if @url.relative? && @base.nil?
|
209
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
210
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
211
256
|
|
212
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
213
258
|
|
214
259
|
if link
|
215
260
|
link = Wgit::Url.new(link)
|
@@ -221,7 +266,7 @@ module Wgit
|
|
221
266
|
end
|
222
267
|
end
|
223
268
|
|
224
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
225
270
|
base_url.omit_fragment.omit_query
|
226
271
|
end
|
227
272
|
|
@@ -235,7 +280,7 @@ module Wgit
|
|
235
280
|
def to_h(include_html: false, include_score: true)
|
236
281
|
ignore = include_html ? [] : ['@html']
|
237
282
|
ignore << '@score' unless include_score
|
238
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
239
284
|
|
240
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
241
286
|
end
|
@@ -252,7 +297,7 @@ module Wgit
|
|
252
297
|
|
253
298
|
# Returns a Hash containing this Document's instance variables and
|
254
299
|
# their #length (if they respond to it). Works dynamically so that any
|
255
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
256
301
|
# the returned Hash as well. The number of text snippets as well as total
|
257
302
|
# number of textual bytes are always included in the returned Hash.
|
258
303
|
#
|
@@ -262,8 +307,8 @@ module Wgit
|
|
262
307
|
instance_variables.each do |var|
|
263
308
|
# Add up the total bytes of text as well as the length.
|
264
309
|
if var == :@text
|
265
|
-
hash[:
|
266
|
-
hash[:text_bytes]
|
310
|
+
hash[:text] = @text.length
|
311
|
+
hash[:text_bytes] = @text.sum(&:length)
|
267
312
|
# Else take the var's #length method return value.
|
268
313
|
else
|
269
314
|
next unless instance_variable_get(var).respond_to?(:length)
|
@@ -292,25 +337,43 @@ module Wgit
|
|
292
337
|
end
|
293
338
|
|
294
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
295
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
296
341
|
#
|
297
342
|
# @param xpath [String] The xpath to search the @html with.
|
298
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
299
344
|
def xpath(xpath)
|
300
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
301
346
|
end
|
302
347
|
|
303
|
-
# Uses Nokogiri's
|
304
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
305
359
|
#
|
306
360
|
# @param selector [String] The CSS selector to search the @html with.
|
307
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
308
362
|
def css(selector)
|
309
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
310
373
|
end
|
311
374
|
|
312
|
-
# Returns all internal links from this Document in relative form.
|
313
|
-
# meaning a link to another document on the same host.
|
375
|
+
# Returns all unique internal links from this Document in relative form.
|
376
|
+
# Internal meaning a link to another document on the same host.
|
314
377
|
#
|
315
378
|
# This Document's host is used to determine if an absolute URL is actually
|
316
379
|
# a relative link e.g. For a Document representing
|
@@ -319,41 +382,48 @@ module Wgit
|
|
319
382
|
# as an internal link because both Documents live on the same host. Also
|
320
383
|
# see Wgit::Document#internal_absolute_links.
|
321
384
|
#
|
322
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
385
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
|
323
386
|
def internal_links
|
324
387
|
return [] if @links.empty?
|
325
388
|
|
326
389
|
links = @links
|
327
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
328
391
|
.map(&:omit_base)
|
329
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
330
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
331
394
|
end
|
332
395
|
|
333
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
334
397
|
end
|
335
398
|
|
336
|
-
# Returns all internal links from this Document in absolute form by
|
399
|
+
# Returns all unique internal links from this Document in absolute form by
|
337
400
|
# appending them to self's #base_url. Also see
|
338
401
|
# Wgit::Document#internal_links.
|
339
402
|
#
|
340
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
403
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
341
404
|
def internal_absolute_links
|
342
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
343
406
|
end
|
344
407
|
|
345
|
-
# Returns all external links from this Document in absolute form.
|
346
|
-
# meaning a link to a different host.
|
408
|
+
# Returns all unique external links from this Document in absolute form.
|
409
|
+
# External meaning a link to a different host.
|
347
410
|
#
|
348
|
-
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
411
|
+
# @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
|
349
412
|
def external_links
|
350
413
|
return [] if @links.empty?
|
351
414
|
|
352
415
|
links = @links
|
353
|
-
.
|
416
|
+
.map do |link|
|
417
|
+
if link.scheme_relative?
|
418
|
+
link.prefix_scheme(@url.to_scheme.to_sym)
|
419
|
+
else
|
420
|
+
link
|
421
|
+
end
|
422
|
+
end
|
423
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
354
424
|
.map(&:omit_trailing_slash)
|
355
425
|
|
356
|
-
Wgit::Utils.
|
426
|
+
Wgit::Utils.sanitize(links)
|
357
427
|
end
|
358
428
|
|
359
429
|
# Searches the @text for the given query and returns the results.
|
@@ -368,8 +438,8 @@ module Wgit
|
|
368
438
|
# original sentence, whichever is less. The algorithm obviously ensures
|
369
439
|
# that the search query is visible somewhere in the sentence.
|
370
440
|
#
|
371
|
-
# @param query [
|
372
|
-
# @text for.
|
441
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
442
|
+
# document's @text for.
|
373
443
|
# @param case_sensitive [Boolean] Whether character case must match.
|
374
444
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
375
445
|
# for separately.
|
@@ -379,12 +449,16 @@ module Wgit
|
|
379
449
|
def search(
|
380
450
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
381
451
|
)
|
382
|
-
query = query.to_s
|
383
|
-
raise 'A search query must be provided' if query.empty?
|
384
452
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
385
453
|
|
386
|
-
|
387
|
-
|
454
|
+
if query.is_a?(Regexp)
|
455
|
+
regex = query
|
456
|
+
else # respond_to? #to_s == true
|
457
|
+
query = query.to_s
|
458
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
459
|
+
regex = Regexp.new(query, !case_sensitive)
|
460
|
+
end
|
461
|
+
|
388
462
|
results = {}
|
389
463
|
|
390
464
|
@text.each do |sentence|
|
@@ -411,8 +485,8 @@ module Wgit
|
|
411
485
|
# functionality. The original text is returned; no other reference to it
|
412
486
|
# is kept thereafter.
|
413
487
|
#
|
414
|
-
# @param query [
|
415
|
-
# @text for.
|
488
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
489
|
+
# document's @text for.
|
416
490
|
# @param case_sensitive [Boolean] Whether character case must match.
|
417
491
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
418
492
|
# for separately.
|
@@ -431,85 +505,95 @@ module Wgit
|
|
431
505
|
orig_text
|
432
506
|
end
|
433
507
|
|
508
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
509
|
+
# parameter.
|
510
|
+
#
|
511
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
513
|
+
# Object) : results (Array).
|
514
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
|
+
# (String) : result (Nokogiri Object).
|
516
|
+
# @return [String, Object] The value found in the html or the default value
|
517
|
+
# (singleton ? nil : []).
|
518
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
519
|
+
send(
|
520
|
+
:extract_from_html, xpath,
|
521
|
+
singleton: singleton, text_content_only: text_content_only
|
522
|
+
)
|
523
|
+
end
|
524
|
+
|
434
525
|
protected
|
435
526
|
|
436
527
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
437
528
|
# Override this method to custom configure the Nokogiri object returned.
|
438
529
|
# Gets called from Wgit::Document.new upon initialization.
|
439
530
|
#
|
531
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
532
|
+
# initialisation.
|
440
533
|
# @raise [StandardError] If @html isn't set.
|
441
534
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
442
|
-
def init_nokogiri
|
535
|
+
def init_nokogiri(&block)
|
443
536
|
raise '@html must be set' unless @html
|
444
537
|
|
445
|
-
Nokogiri::HTML(@html)
|
446
|
-
# TODO: Remove #'s below when crawling in production.
|
447
|
-
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
448
|
-
# Nokogiri::XML::ParseOptions::NONET
|
449
|
-
end
|
538
|
+
Nokogiri::HTML(@html, &block)
|
450
539
|
end
|
451
540
|
|
452
|
-
#
|
541
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
453
542
|
# parameter.
|
454
543
|
#
|
455
|
-
# @param xpath [String] Used to find the value/object in @html.
|
544
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
456
545
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
457
546
|
# Object) : results (Array).
|
458
547
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
459
548
|
# (String) : result (Nokogiri Object).
|
460
|
-
# @yield
|
461
|
-
#
|
462
|
-
#
|
463
|
-
#
|
549
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
550
|
+
# regardless of the source. Use it (optionally) to process the result
|
551
|
+
# value.
|
552
|
+
# @yieldparam value [Object] The result value to be returned.
|
553
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
554
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
555
|
+
# `:object`.
|
556
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
464
558
|
# @return [String, Object] The value found in the html or the default value
|
465
559
|
# (singleton ? nil : []).
|
466
|
-
def
|
467
|
-
|
468
|
-
|
469
|
-
results = @doc.xpath(xpath)
|
470
|
-
|
471
|
-
return default if results.nil? || results.empty?
|
472
|
-
|
473
|
-
result = if singleton
|
474
|
-
text_content_only ? results.first.content : results.first
|
475
|
-
else
|
476
|
-
text_content_only ? results.map(&:content) : results
|
477
|
-
end
|
560
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
478
563
|
|
479
|
-
|
480
|
-
|
481
|
-
if block_given?
|
482
|
-
new_result = yield(result, self, :document)
|
483
|
-
result = new_result unless new_result.nil?
|
564
|
+
if text_content_only
|
565
|
+
result = singleton ? result&.content : result.map(&:content)
|
484
566
|
end
|
485
567
|
|
568
|
+
Wgit::Utils.sanitize(result)
|
569
|
+
result = yield(result, self, :document) if block_given?
|
486
570
|
result
|
487
571
|
end
|
488
572
|
|
489
|
-
# Returns a value from the obj using the given key via obj#fetch
|
573
|
+
# Returns a value from the obj using the given key via `obj#fetch`.
|
490
574
|
#
|
491
575
|
# @param obj [#fetch] The object containing the key/value.
|
492
576
|
# @param key [String] Used to find the value in the obj.
|
493
577
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
494
|
-
# @yield
|
495
|
-
#
|
496
|
-
#
|
497
|
-
#
|
578
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
579
|
+
# regardless of the source. Use it (optionally) to process the result
|
580
|
+
# value.
|
581
|
+
# @yieldparam value [Object] The result value to be returned.
|
582
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
583
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
584
|
+
# `:object`.
|
585
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
586
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
498
587
|
# @return [String, Object] The value found in the obj or the default value
|
499
588
|
# (singleton ? nil : []).
|
500
|
-
def
|
589
|
+
def extract_from_object(obj, key, singleton: true)
|
501
590
|
assert_respond_to(obj, :fetch)
|
502
591
|
|
503
592
|
default = singleton ? nil : []
|
504
593
|
result = obj.fetch(key.to_s, default)
|
505
594
|
|
506
|
-
|
507
|
-
|
508
|
-
if block_given?
|
509
|
-
new_result = yield(result, obj, :object)
|
510
|
-
result = new_result unless new_result.nil?
|
511
|
-
end
|
512
|
-
|
595
|
+
Wgit::Utils.sanitize(result)
|
596
|
+
result = yield(result, obj, :object) if block_given?
|
513
597
|
result
|
514
598
|
end
|
515
599
|
|
@@ -523,12 +607,12 @@ module Wgit
|
|
523
607
|
url = Wgit::Url.parse(url)
|
524
608
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
525
609
|
|
526
|
-
@url
|
527
|
-
@html
|
528
|
-
@
|
529
|
-
@score
|
610
|
+
@url = url
|
611
|
+
@html = html || ''
|
612
|
+
@parser = init_nokogiri
|
613
|
+
@score = 0.0
|
530
614
|
|
531
|
-
Wgit::Utils.
|
615
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
532
616
|
|
533
617
|
# Dynamically run the init_*_from_html methods.
|
534
618
|
Document.private_instance_methods(false).each do |method|
|
@@ -544,12 +628,12 @@ module Wgit
|
|
544
628
|
def init_from_object(obj, encode: true)
|
545
629
|
assert_respond_to(obj, :fetch)
|
546
630
|
|
547
|
-
@url
|
548
|
-
@html
|
549
|
-
@
|
550
|
-
@score
|
631
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
632
|
+
@html = obj.fetch('html', '')
|
633
|
+
@parser = init_nokogiri
|
634
|
+
@score = obj.fetch('score', 0.0)
|
551
635
|
|
552
|
-
Wgit::Utils.
|
636
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
553
637
|
|
554
638
|
# Dynamically run the init_*_from_object methods.
|
555
639
|
Document.private_instance_methods(false).each do |method|
|
@@ -560,11 +644,11 @@ module Wgit
|
|
560
644
|
end
|
561
645
|
end
|
562
646
|
|
563
|
-
# Initialises an instance variable and defines
|
647
|
+
# Initialises an instance variable and defines an accessor method for it.
|
564
648
|
#
|
565
649
|
# @param var [Symbol] The name of the variable to be initialized.
|
566
650
|
# @param value [Object] The newly initialized variable's value.
|
567
|
-
# @return [Symbol] The name of the
|
651
|
+
# @return [Symbol] The name of the defined getter method.
|
568
652
|
def init_var(var, value)
|
569
653
|
# instance_var_name starts with @, var_name doesn't.
|
570
654
|
var = var.to_s
|
@@ -572,10 +656,9 @@ module Wgit
|
|
572
656
|
instance_var_name = "@#{var_name}".to_sym
|
573
657
|
|
574
658
|
instance_variable_set(instance_var_name, value)
|
659
|
+
Wgit::Document.attr_accessor(var_name)
|
575
660
|
|
576
|
-
|
577
|
-
instance_variable_get(instance_var_name)
|
578
|
-
end
|
661
|
+
var_name
|
579
662
|
end
|
580
663
|
|
581
664
|
alias content html
|