wgit 0.7.0 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/document.rb
CHANGED
@@ -6,29 +6,41 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
|
-
#
|
24
|
-
|
23
|
+
# Set of text elements used to build Document#text.
|
24
|
+
@text_elements = Set.new(%i[
|
25
|
+
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
|
+
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
27
|
+
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
28
|
+
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
+
summary sup td textarea th time u ul var wbr
|
30
|
+
])
|
25
31
|
|
26
|
-
# Set of Symbols representing the defined Document
|
27
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
28
34
|
|
29
35
|
class << self
|
30
|
-
#
|
31
|
-
|
36
|
+
# Set of HTML elements that make up the visible text on a page. These
|
37
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
38
|
+
# README.md for how to add to this Set dynamically.
|
39
|
+
attr_reader :text_elements
|
40
|
+
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
32
44
|
end
|
33
45
|
|
34
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -38,7 +50,7 @@ module Wgit
|
|
38
50
|
attr_reader :html
|
39
51
|
|
40
52
|
# The Nokogiri::HTML document object initialized from @html.
|
41
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
42
54
|
|
43
55
|
# The score is only used following a `Database#search` and records matches.
|
44
56
|
attr_reader :score
|
@@ -50,7 +62,7 @@ module Wgit
|
|
50
62
|
#
|
51
63
|
# During initialisation, the Document will call any private
|
52
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
53
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
54
66
|
#
|
55
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
56
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -72,31 +84,54 @@ module Wgit
|
|
72
84
|
|
73
85
|
### Document Class Methods ###
|
74
86
|
|
75
|
-
#
|
76
|
-
#
|
77
|
-
# extensions defined in 'document_extensions.rb' as examples.
|
87
|
+
# Uses Document.text_elements to build an xpath String, used to obtain
|
88
|
+
# all of the combined visual text on a webpage.
|
78
89
|
#
|
79
|
-
#
|
90
|
+
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
|
+
def self.text_elements_xpath
|
92
|
+
xpath = ''
|
93
|
+
return xpath if Wgit::Document.text_elements.empty?
|
94
|
+
|
95
|
+
el_xpath = '//%s/text()'
|
96
|
+
Wgit::Document.text_elements.each_with_index do |el, i|
|
97
|
+
xpath += ' | ' unless i.zero?
|
98
|
+
xpath += format(el_xpath, el)
|
99
|
+
end
|
100
|
+
|
101
|
+
xpath
|
102
|
+
end
|
103
|
+
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
110
|
+
#
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
80
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
81
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
82
114
|
# the xpath or database object result(s).
|
83
115
|
#
|
84
116
|
# When initialising from HTML, a singleton value of true will only
|
85
|
-
# ever return
|
86
|
-
# Array. When initialising from a database object, the value
|
87
|
-
# is and singleton is only used to define the default empty
|
88
|
-
# If a value cannot be found (in either the HTML or database
|
89
|
-
# a default will be used. The default value is:
|
90
|
-
#
|
91
|
-
#
|
117
|
+
# ever return the first result found; otherwise all the results are
|
118
|
+
# returned in an Array. When initialising from a database object, the value
|
119
|
+
# is taken as is and singleton is only used to define the default empty
|
120
|
+
# value. If a value cannot be found (in either the HTML or database
|
121
|
+
# object), then a default will be used. The default value is:
|
122
|
+
# `singleton ? nil : []`.
|
123
|
+
#
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and setter method is defined
|
126
|
+
# for the initialised variable.
|
92
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
93
128
|
# of the webpage. Only used when initializing from HTML.
|
94
129
|
#
|
95
130
|
# Pass a callable object (proc etc.) if you want the
|
96
131
|
# xpath value to be derived on Document initialisation (instead of when
|
97
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
98
133
|
# String.
|
99
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
100
135
|
# options are only used when initializing from HTML, not the database.
|
101
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
102
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -105,56 +140,62 @@ module Wgit
|
|
105
140
|
# @option opts [Boolean] :text_content_only The text_content_only option
|
106
141
|
# if true will use the text content of the Nokogiri result object,
|
107
142
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
108
|
-
# @
|
109
|
-
#
|
110
|
-
#
|
111
|
-
#
|
112
|
-
#
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
143
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
144
|
+
# regardless of the source. Use it (optionally) to process the result
|
145
|
+
# value.
|
146
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
147
|
+
# `var`.
|
148
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
149
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
150
|
+
# `:object`.
|
151
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
152
|
+
# value. Return the block's value param unchanged if you want to inspect it.
|
116
153
|
# @raise [StandardError] If the var param isn't valid.
|
117
154
|
# @return [Symbol] The given var Symbol if successful.
|
118
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
119
156
|
var = var.to_sym
|
120
157
|
defaults = { singleton: true, text_content_only: true }
|
121
158
|
opts = defaults.merge(opts)
|
122
159
|
|
123
|
-
raise "var must match #{
|
124
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
125
162
|
|
126
163
|
# Define the private init_*_from_html method for HTML.
|
127
164
|
# Gets the HTML's xpath value and creates a var for it.
|
128
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
129
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
130
167
|
init_var(var, result)
|
131
168
|
end
|
132
169
|
Document.send(:private, func_name)
|
133
170
|
|
134
171
|
# Define the private init_*_from_object method for a Database object.
|
135
172
|
# Gets the Object's 'key' value and creates a var for it.
|
136
|
-
func_name = Document.send(
|
137
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
138
179
|
init_var(var, result)
|
139
180
|
end
|
140
181
|
Document.send(:private, func_name)
|
141
182
|
|
142
|
-
@
|
183
|
+
@extractors << var
|
143
184
|
var
|
144
185
|
end
|
145
186
|
|
146
|
-
# Removes the init_
|
147
|
-
# Therefore, this is the opposing method to Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
148
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
149
190
|
#
|
150
|
-
# @param var [Symbol] The
|
151
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
152
193
|
# otherwise false.
|
153
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
154
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
155
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
156
197
|
|
157
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
158
199
|
true
|
159
200
|
rescue NameError
|
160
201
|
false
|
@@ -173,7 +214,7 @@ module Wgit
|
|
173
214
|
(@url == other.url) && (@html == other.html)
|
174
215
|
end
|
175
216
|
|
176
|
-
#
|
217
|
+
# Shortcut for calling Document#html[range].
|
177
218
|
#
|
178
219
|
# @param range [Range] The range of @html to return.
|
179
220
|
# @return [String] The given range of @html.
|
@@ -183,9 +224,9 @@ module Wgit
|
|
183
224
|
|
184
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
185
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
186
|
-
# present and relative, then @url.
|
187
|
-
# should be used instead of `doc.url.
|
188
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
189
230
|
#
|
190
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
191
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -204,12 +245,16 @@ module Wgit
|
|
204
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
205
246
|
# 'http://example.com/public'.
|
206
247
|
def base_url(link: nil)
|
207
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
208
248
|
if @url.relative? && @base.nil?
|
209
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
210
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
211
256
|
|
212
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
213
258
|
|
214
259
|
if link
|
215
260
|
link = Wgit::Url.new(link)
|
@@ -221,7 +266,7 @@ module Wgit
|
|
221
266
|
end
|
222
267
|
end
|
223
268
|
|
224
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
225
270
|
base_url.omit_fragment.omit_query
|
226
271
|
end
|
227
272
|
|
@@ -235,7 +280,7 @@ module Wgit
|
|
235
280
|
def to_h(include_html: false, include_score: true)
|
236
281
|
ignore = include_html ? [] : ['@html']
|
237
282
|
ignore << '@score' unless include_score
|
238
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
239
284
|
|
240
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
241
286
|
end
|
@@ -252,7 +297,7 @@ module Wgit
|
|
252
297
|
|
253
298
|
# Returns a Hash containing this Document's instance variables and
|
254
299
|
# their #length (if they respond to it). Works dynamically so that any
|
255
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
256
301
|
# the returned Hash as well. The number of text snippets as well as total
|
257
302
|
# number of textual bytes are always included in the returned Hash.
|
258
303
|
#
|
@@ -262,8 +307,8 @@ module Wgit
|
|
262
307
|
instance_variables.each do |var|
|
263
308
|
# Add up the total bytes of text as well as the length.
|
264
309
|
if var == :@text
|
265
|
-
hash[:
|
266
|
-
hash[:text_bytes]
|
310
|
+
hash[:text] = @text.length
|
311
|
+
hash[:text_bytes] = @text.sum(&:length)
|
267
312
|
# Else take the var's #length method return value.
|
268
313
|
else
|
269
314
|
next unless instance_variable_get(var).respond_to?(:length)
|
@@ -292,25 +337,43 @@ module Wgit
|
|
292
337
|
end
|
293
338
|
|
294
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
295
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
296
341
|
#
|
297
342
|
# @param xpath [String] The xpath to search the @html with.
|
298
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
299
344
|
def xpath(xpath)
|
300
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
301
346
|
end
|
302
347
|
|
303
|
-
# Uses Nokogiri's
|
304
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
305
359
|
#
|
306
360
|
# @param selector [String] The CSS selector to search the @html with.
|
307
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
308
362
|
def css(selector)
|
309
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
310
373
|
end
|
311
374
|
|
312
|
-
# Returns all internal links from this Document in relative form.
|
313
|
-
# meaning a link to another document on the same host.
|
375
|
+
# Returns all unique internal links from this Document in relative form.
|
376
|
+
# Internal meaning a link to another document on the same host.
|
314
377
|
#
|
315
378
|
# This Document's host is used to determine if an absolute URL is actually
|
316
379
|
# a relative link e.g. For a Document representing
|
@@ -319,41 +382,48 @@ module Wgit
|
|
319
382
|
# as an internal link because both Documents live on the same host. Also
|
320
383
|
# see Wgit::Document#internal_absolute_links.
|
321
384
|
#
|
322
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
385
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
|
323
386
|
def internal_links
|
324
387
|
return [] if @links.empty?
|
325
388
|
|
326
389
|
links = @links
|
327
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
328
391
|
.map(&:omit_base)
|
329
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
330
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
331
394
|
end
|
332
395
|
|
333
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
334
397
|
end
|
335
398
|
|
336
|
-
# Returns all internal links from this Document in absolute form by
|
399
|
+
# Returns all unique internal links from this Document in absolute form by
|
337
400
|
# appending them to self's #base_url. Also see
|
338
401
|
# Wgit::Document#internal_links.
|
339
402
|
#
|
340
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
403
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
341
404
|
def internal_absolute_links
|
342
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
343
406
|
end
|
344
407
|
|
345
|
-
# Returns all external links from this Document in absolute form.
|
346
|
-
# meaning a link to a different host.
|
408
|
+
# Returns all unique external links from this Document in absolute form.
|
409
|
+
# External meaning a link to a different host.
|
347
410
|
#
|
348
|
-
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
411
|
+
# @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
|
349
412
|
def external_links
|
350
413
|
return [] if @links.empty?
|
351
414
|
|
352
415
|
links = @links
|
353
|
-
.
|
416
|
+
.map do |link|
|
417
|
+
if link.scheme_relative?
|
418
|
+
link.prefix_scheme(@url.to_scheme.to_sym)
|
419
|
+
else
|
420
|
+
link
|
421
|
+
end
|
422
|
+
end
|
423
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
354
424
|
.map(&:omit_trailing_slash)
|
355
425
|
|
356
|
-
Wgit::Utils.
|
426
|
+
Wgit::Utils.sanitize(links)
|
357
427
|
end
|
358
428
|
|
359
429
|
# Searches the @text for the given query and returns the results.
|
@@ -368,8 +438,8 @@ module Wgit
|
|
368
438
|
# original sentence, which ever is less. The algorithm obviously ensures
|
369
439
|
# that the search query is visible somewhere in the sentence.
|
370
440
|
#
|
371
|
-
# @param query [
|
372
|
-
# @text for.
|
441
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
442
|
+
# document's @text for.
|
373
443
|
# @param case_sensitive [Boolean] Whether character case must match.
|
374
444
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
375
445
|
# for separately.
|
@@ -379,12 +449,16 @@ module Wgit
|
|
379
449
|
def search(
|
380
450
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
381
451
|
)
|
382
|
-
query = query.to_s
|
383
|
-
raise 'A search query must be provided' if query.empty?
|
384
452
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
385
453
|
|
386
|
-
|
387
|
-
|
454
|
+
if query.is_a?(Regexp)
|
455
|
+
regex = query
|
456
|
+
else # respond_to? #to_s == true
|
457
|
+
query = query.to_s
|
458
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
459
|
+
regex = Regexp.new(query, !case_sensitive)
|
460
|
+
end
|
461
|
+
|
388
462
|
results = {}
|
389
463
|
|
390
464
|
@text.each do |sentence|
|
@@ -411,8 +485,8 @@ module Wgit
|
|
411
485
|
# functionality. The original text is returned; no other reference to it
|
412
486
|
# is kept thereafter.
|
413
487
|
#
|
414
|
-
# @param query [
|
415
|
-
# @text for.
|
488
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
489
|
+
# document's @text for.
|
416
490
|
# @param case_sensitive [Boolean] Whether character case must match.
|
417
491
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
418
492
|
# for separately.
|
@@ -431,85 +505,95 @@ module Wgit
|
|
431
505
|
orig_text
|
432
506
|
end
|
433
507
|
|
508
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
509
|
+
# parameter.
|
510
|
+
#
|
511
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
513
|
+
# Object) : results (Array).
|
514
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
|
+
# (String) : result (Nokogiri Object).
|
516
|
+
# @return [String, Object] The value found in the html or the default value
|
517
|
+
# (singleton ? nil : []).
|
518
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
519
|
+
send(
|
520
|
+
:extract_from_html, xpath,
|
521
|
+
singleton: singleton, text_content_only: text_content_only
|
522
|
+
)
|
523
|
+
end
|
524
|
+
|
434
525
|
protected
|
435
526
|
|
436
527
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
437
528
|
# Override this method to custom configure the Nokogiri object returned.
|
438
529
|
# Gets called from Wgit::Document.new upon initialization.
|
439
530
|
#
|
531
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
532
|
+
# initialisation.
|
440
533
|
# @raise [StandardError] If @html isn't set.
|
441
534
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
442
|
-
def init_nokogiri
|
535
|
+
def init_nokogiri(&block)
|
443
536
|
raise '@html must be set' unless @html
|
444
537
|
|
445
|
-
Nokogiri::HTML(@html)
|
446
|
-
# TODO: Remove #'s below when crawling in production.
|
447
|
-
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
448
|
-
# Nokogiri::XML::ParseOptions::NONET
|
449
|
-
end
|
538
|
+
Nokogiri::HTML(@html, &block)
|
450
539
|
end
|
451
540
|
|
452
|
-
#
|
541
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
453
542
|
# parameter.
|
454
543
|
#
|
455
|
-
# @param xpath [String] Used to find the value/object in @html.
|
544
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
456
545
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
457
546
|
# Object) : results (Array).
|
458
547
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
459
548
|
# (String) : result (Nokogiri Object).
|
460
|
-
# @yield
|
461
|
-
#
|
462
|
-
#
|
463
|
-
#
|
549
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
550
|
+
# regardless of the source. Use it (optionally) to process the result
|
551
|
+
# value.
|
552
|
+
# @yieldparam value [Object] The result value to be returned.
|
553
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
554
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
555
|
+
# `:object`.
|
556
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
464
558
|
# @return [String, Object] The value found in the html or the default value
|
465
559
|
# (singleton ? nil : []).
|
466
|
-
def
|
467
|
-
|
468
|
-
|
469
|
-
results = @doc.xpath(xpath)
|
470
|
-
|
471
|
-
return default if results.nil? || results.empty?
|
472
|
-
|
473
|
-
result = if singleton
|
474
|
-
text_content_only ? results.first.content : results.first
|
475
|
-
else
|
476
|
-
text_content_only ? results.map(&:content) : results
|
477
|
-
end
|
560
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
478
563
|
|
479
|
-
|
480
|
-
|
481
|
-
if block_given?
|
482
|
-
new_result = yield(result, self, :document)
|
483
|
-
result = new_result unless new_result.nil?
|
564
|
+
if text_content_only
|
565
|
+
result = singleton ? result&.content : result.map(&:content)
|
484
566
|
end
|
485
567
|
|
568
|
+
Wgit::Utils.sanitize(result)
|
569
|
+
result = yield(result, self, :document) if block_given?
|
486
570
|
result
|
487
571
|
end
|
488
572
|
|
489
|
-
# Returns a value from the obj using the given key via obj#fetch
|
573
|
+
# Returns a value from the obj using the given key via `obj#fetch`.
|
490
574
|
#
|
491
575
|
# @param obj [#fetch] The object containing the key/value.
|
492
576
|
# @param key [String] Used to find the value in the obj.
|
493
577
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
494
|
-
# @yield
|
495
|
-
#
|
496
|
-
#
|
497
|
-
#
|
578
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
579
|
+
# regardless of the source. Use it (optionally) to process the result
|
580
|
+
# value.
|
581
|
+
# @yieldparam value [Object] The result value to be returned.
|
582
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
583
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
584
|
+
# `:object`.
|
585
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
586
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
498
587
|
# @return [String, Object] The value found in the obj or the default value
|
499
588
|
# (singleton ? nil : []).
|
500
|
-
def
|
589
|
+
def extract_from_object(obj, key, singleton: true)
|
501
590
|
assert_respond_to(obj, :fetch)
|
502
591
|
|
503
592
|
default = singleton ? nil : []
|
504
593
|
result = obj.fetch(key.to_s, default)
|
505
594
|
|
506
|
-
|
507
|
-
|
508
|
-
if block_given?
|
509
|
-
new_result = yield(result, obj, :object)
|
510
|
-
result = new_result unless new_result.nil?
|
511
|
-
end
|
512
|
-
|
595
|
+
Wgit::Utils.sanitize(result)
|
596
|
+
result = yield(result, obj, :object) if block_given?
|
513
597
|
result
|
514
598
|
end
|
515
599
|
|
@@ -523,12 +607,12 @@ module Wgit
|
|
523
607
|
url = Wgit::Url.parse(url)
|
524
608
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
525
609
|
|
526
|
-
@url
|
527
|
-
@html
|
528
|
-
@
|
529
|
-
@score
|
610
|
+
@url = url
|
611
|
+
@html = html || ''
|
612
|
+
@parser = init_nokogiri
|
613
|
+
@score = 0.0
|
530
614
|
|
531
|
-
Wgit::Utils.
|
615
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
532
616
|
|
533
617
|
# Dynamically run the init_*_from_html methods.
|
534
618
|
Document.private_instance_methods(false).each do |method|
|
@@ -544,12 +628,12 @@ module Wgit
|
|
544
628
|
def init_from_object(obj, encode: true)
|
545
629
|
assert_respond_to(obj, :fetch)
|
546
630
|
|
547
|
-
@url
|
548
|
-
@html
|
549
|
-
@
|
550
|
-
@score
|
631
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
632
|
+
@html = obj.fetch('html', '')
|
633
|
+
@parser = init_nokogiri
|
634
|
+
@score = obj.fetch('score', 0.0)
|
551
635
|
|
552
|
-
Wgit::Utils.
|
636
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
553
637
|
|
554
638
|
# Dynamically run the init_*_from_object methods.
|
555
639
|
Document.private_instance_methods(false).each do |method|
|
@@ -560,11 +644,11 @@ module Wgit
|
|
560
644
|
end
|
561
645
|
end
|
562
646
|
|
563
|
-
# Initialises an instance variable and defines
|
647
|
+
# Initialises an instance variable and defines an accessor method for it.
|
564
648
|
#
|
565
649
|
# @param var [Symbol] The name of the variable to be initialized.
|
566
650
|
# @param value [Object] The newly initialized variable's value.
|
567
|
-
# @return [Symbol] The name of the
|
651
|
+
# @return [Symbol] The name of the defined getter method.
|
568
652
|
def init_var(var, value)
|
569
653
|
# instance_var_name starts with @, var_name doesn't.
|
570
654
|
var = var.to_s
|
@@ -572,10 +656,9 @@ module Wgit
|
|
572
656
|
instance_var_name = "@#{var_name}".to_sym
|
573
657
|
|
574
658
|
instance_variable_set(instance_var_name, value)
|
659
|
+
Wgit::Document.attr_accessor(var_name)
|
575
660
|
|
576
|
-
|
577
|
-
instance_variable_get(instance_var_name)
|
578
|
-
end
|
661
|
+
var_name
|
579
662
|
end
|
580
663
|
|
581
664
|
alias content html
|