wgit 0.5.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +249 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +232 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +241 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -14
- data/lib/wgit/url.rb +213 -73
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +3 -2
- metadata +38 -19
data/lib/wgit/database/model.rb
CHANGED
@@ -14,8 +14,7 @@ module Wgit
|
|
14
14
|
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
15
15
|
|
16
16
|
model = url.to_h
|
17
|
-
|
18
|
-
Wgit::Utils.remove_non_bson_types(model)
|
17
|
+
select_bson_types(model)
|
19
18
|
end
|
20
19
|
|
21
20
|
# The data model for a Wgit::Document collection object.
|
@@ -28,7 +27,7 @@ module Wgit
|
|
28
27
|
model = doc.to_h(include_html: false, include_score: false)
|
29
28
|
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
29
|
|
31
|
-
|
30
|
+
select_bson_types(model)
|
32
31
|
end
|
33
32
|
|
34
33
|
# Common fields when inserting a record into the DB.
|
@@ -49,5 +48,13 @@ module Wgit
|
|
49
48
|
date_modified: Wgit::Utils.time_stamp
|
50
49
|
}
|
51
50
|
end
|
51
|
+
|
52
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
53
|
+
#
|
54
|
+
# @param model_hash [Hash] The model Hash to sanitize.
|
55
|
+
# @return [Hash] The model Hash with non bson types removed.
|
56
|
+
def self.select_bson_types(model_hash)
|
57
|
+
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
58
|
+
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -3,45 +3,56 @@ require_relative 'utils'
|
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'json'
|
6
|
+
require 'set'
|
6
7
|
|
7
8
|
module Wgit
|
8
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
9
10
|
# will work e.g. images etc. Also doubles as a search result when
|
10
|
-
# loading Documents from the database via Wgit::Database#search
|
11
|
+
# loading Documents from the database via `Wgit::Database#search`.
|
11
12
|
#
|
12
13
|
# The initialize method dynamically initializes instance variables from the
|
13
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
-
# Document class can be easily extended allowing you to
|
15
|
-
# a webpage that are important to you. See Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
16
17
|
class Document
|
17
18
|
include Assertable
|
18
19
|
|
19
|
-
# Regex for the allowed var names when defining an
|
20
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
21
22
|
|
22
|
-
#
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
23
|
+
# Set of text elements used to build Document#text.
|
24
|
+
@text_elements = Set.new(%i[
|
25
|
+
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
|
+
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
27
|
+
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
28
|
+
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
+
summary sup td textarea th time u ul var wbr
|
30
|
+
])
|
31
|
+
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
29
34
|
|
30
35
|
class << self
|
31
|
-
#
|
36
|
+
# Set of HTML elements that make up the visible text on a page. These
|
37
|
+
# elements are used to initialize the Wgit::Document#text. See the
|
38
|
+
# README.md for how to add to this Set dynamically.
|
32
39
|
attr_reader :text_elements
|
40
|
+
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
33
44
|
end
|
34
45
|
|
35
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
36
47
|
attr_reader :url
|
37
48
|
|
38
|
-
# The HTML of the
|
49
|
+
# The content/HTML of the document, an instance of String.
|
39
50
|
attr_reader :html
|
40
51
|
|
41
52
|
# The Nokogiri::HTML document object initialized from @html.
|
42
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
43
54
|
|
44
|
-
# The score is only used following a Database#search and records matches.
|
55
|
+
# The score is only used following a `Database#search` and records matches.
|
45
56
|
attr_reader :score
|
46
57
|
|
47
58
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
@@ -50,29 +61,31 @@ module Wgit
|
|
50
61
|
# pages retrieved from the database.
|
51
62
|
#
|
52
63
|
# During initialisation, the Document will call any private
|
53
|
-
#
|
54
|
-
#
|
64
|
+
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
55
66
|
#
|
56
|
-
# @param url_or_obj [String, Wgit::Url,
|
67
|
+
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
57
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
58
69
|
# MongoDB collection object. The Object's :fetch method should support
|
59
70
|
# Strings as keys.
|
60
|
-
# @param html [String, NilClass] The crawled web page's HTML. This
|
61
|
-
# only used if url_or_obj is a String representing the web
|
62
|
-
# Otherwise, the HTML comes from the database object. A html
|
63
|
-
# be defaulted to an empty String.
|
64
|
-
|
71
|
+
# @param html [String, NilClass] The crawled web page's content/HTML. This
|
72
|
+
# param is only used if url_or_obj is a String representing the web
|
73
|
+
# page's URL. Otherwise, the HTML comes from the database object. A html
|
74
|
+
# of nil will be defaulted to an empty String.
|
75
|
+
# @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
|
76
|
+
# false if the Document content is an image etc.
|
77
|
+
def initialize(url_or_obj, html = '', encode: true)
|
65
78
|
if url_or_obj.is_a?(String)
|
66
|
-
init_from_strings(url_or_obj, html,
|
79
|
+
init_from_strings(url_or_obj, html, encode: encode)
|
67
80
|
else
|
68
|
-
init_from_object(url_or_obj,
|
81
|
+
init_from_object(url_or_obj, encode: encode)
|
69
82
|
end
|
70
83
|
end
|
71
84
|
|
72
85
|
### Document Class Methods ###
|
73
86
|
|
74
87
|
# Uses Document.text_elements to build an xpath String, used to obtain
|
75
|
-
# all of the combined text on a webpage.
|
88
|
+
# all of the combined visual text on a webpage.
|
76
89
|
#
|
77
90
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
78
91
|
def self.text_elements_xpath
|
@@ -88,86 +101,101 @@ module Wgit
|
|
88
101
|
xpath
|
89
102
|
end
|
90
103
|
|
91
|
-
# Defines
|
92
|
-
# instance variables upon Document initialization. See the default
|
93
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
94
110
|
#
|
95
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
96
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
97
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
98
114
|
# the xpath or database object result(s).
|
99
115
|
#
|
100
116
|
# When initialising from HTML, a singleton value of true will only
|
101
|
-
# ever return
|
102
|
-
# Array. When initialising from a database object, the value
|
103
|
-
# is and singleton is only used to define the default empty
|
104
|
-
# If a value cannot be found (in either the HTML or database
|
105
|
-
# a default will be used. The default value is:
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# @param
|
117
|
+
# ever return the first result found; otherwise all the results are
|
118
|
+
# returned in an Array. When initialising from a database object, the value
|
119
|
+
# is taken as is and singleton is only used to define the default empty
|
120
|
+
# value. If a value cannot be found (in either the HTML or database
|
121
|
+
# object), then a default will be used. The default value is:
|
122
|
+
# `singleton ? nil : []`.
|
123
|
+
#
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and a setter method are defined
|
126
|
+
# for the initialised variable.
|
127
|
+
# @param xpath [String, #call] The xpath used to find the element(s)
|
109
128
|
# of the webpage. Only used when initializing from HTML.
|
110
129
|
#
|
111
130
|
# Pass a callable object (proc etc.) if you want the
|
112
131
|
# xpath value to be derived on Document initialisation (instead of when
|
113
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
114
133
|
# String.
|
115
|
-
# @param
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
116
135
|
# options are only used when initializing from HTML, not the database.
|
117
|
-
# @option
|
136
|
+
# @option opts [Boolean] :singleton The singleton option determines
|
118
137
|
# whether or not the result(s) should be in an Array. If multiple
|
119
138
|
# results are found and singleton is true then the first result will be
|
120
139
|
# used. Defaults to true.
|
121
|
-
# @option
|
140
|
+
# @option opts [Boolean] :text_content_only The text_content_only option
|
122
141
|
# if true will use the text content of the Nokogiri result object,
|
123
142
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
124
|
-
# @yield
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
#
|
129
|
-
#
|
130
|
-
#
|
143
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
144
|
+
# regardless of the source. Use it (optionally) to process the result
|
145
|
+
# value.
|
146
|
+
# @yieldparam value [Object] The result value to be assigned to the new
|
147
|
+
# `var`.
|
148
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
149
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
150
|
+
# `:object`.
|
151
|
+
# @yieldreturn [Object] The return value of the block becomes the new var's
|
152
|
+
# value. Return the block's value param unchanged if you simply want to inspect it.
|
131
153
|
# @raise [StandardError] If the var param isn't valid.
|
132
|
-
# @return [Symbol] The given var Symbol.
|
133
|
-
def self.
|
154
|
+
# @return [Symbol] The given var Symbol if successful.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
134
156
|
var = var.to_sym
|
135
|
-
|
136
|
-
|
157
|
+
defaults = { singleton: true, text_content_only: true }
|
158
|
+
opts = defaults.merge(opts)
|
137
159
|
|
138
|
-
raise "var must match #{
|
139
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
140
162
|
|
141
163
|
# Define the private init_*_from_html method for HTML.
|
142
164
|
# Gets the HTML's xpath value and creates a var for it.
|
143
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
144
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
145
167
|
init_var(var, result)
|
146
168
|
end
|
147
|
-
Document.send
|
169
|
+
Document.send(:private, func_name)
|
148
170
|
|
149
171
|
# Define the private init_*_from_object method for a Database object.
|
150
172
|
# Gets the Object's 'key' value and creates a var for it.
|
151
|
-
func_name = Document.send(
|
152
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
153
179
|
init_var(var, result)
|
154
180
|
end
|
155
|
-
Document.send
|
181
|
+
Document.send(:private, func_name)
|
156
182
|
|
183
|
+
@extractors << var
|
157
184
|
var
|
158
185
|
end
|
159
186
|
|
160
|
-
# Removes the init_
|
161
|
-
# Therefore, this is the opposing method to Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
162
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
163
190
|
#
|
164
|
-
# @param var [Symbol] The
|
165
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
166
193
|
# otherwise false.
|
167
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
168
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
169
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
170
197
|
|
198
|
+
@extractors.delete(var.to_sym)
|
171
199
|
true
|
172
200
|
rescue NameError
|
173
201
|
false
|
@@ -186,7 +214,7 @@ module Wgit
|
|
186
214
|
(@url == other.url) && (@html == other.html)
|
187
215
|
end
|
188
216
|
|
189
|
-
#
|
217
|
+
# Shortcut for calling Document#html[range].
|
190
218
|
#
|
191
219
|
# @param range [Range] The range of @html to return.
|
192
220
|
# @return [String] The given range of @html.
|
@@ -196,9 +224,9 @@ module Wgit
|
|
196
224
|
|
197
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
198
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
199
|
-
# present and relative, then @url.
|
200
|
-
# should be used instead of `doc.url.
|
201
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
202
230
|
#
|
203
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
204
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -217,12 +245,16 @@ module Wgit
|
|
217
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
218
246
|
# 'http://example.com/public'.
|
219
247
|
def base_url(link: nil)
|
220
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
221
248
|
if @url.relative? && @base.nil?
|
222
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
223
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
224
256
|
|
225
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
226
258
|
|
227
259
|
if link
|
228
260
|
link = Wgit::Url.new(link)
|
@@ -234,7 +266,7 @@ module Wgit
|
|
234
266
|
end
|
235
267
|
end
|
236
268
|
|
237
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
238
270
|
base_url.omit_fragment.omit_query
|
239
271
|
end
|
240
272
|
|
@@ -248,7 +280,7 @@ module Wgit
|
|
248
280
|
def to_h(include_html: false, include_score: true)
|
249
281
|
ignore = include_html ? [] : ['@html']
|
250
282
|
ignore << '@score' unless include_score
|
251
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
252
284
|
|
253
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
254
286
|
end
|
@@ -265,7 +297,7 @@ module Wgit
|
|
265
297
|
|
266
298
|
# Returns a Hash containing this Document's instance variables and
|
267
299
|
# their #length (if they respond to it). Works dynamically so that any
|
268
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
269
301
|
# the returned Hash as well. The number of text snippets as well as total
|
270
302
|
# number of textual bytes are always included in the returned Hash.
|
271
303
|
#
|
@@ -275,8 +307,8 @@ module Wgit
|
|
275
307
|
instance_variables.each do |var|
|
276
308
|
# Add up the total bytes of text as well as the length.
|
277
309
|
if var == :@text
|
278
|
-
hash[:
|
279
|
-
hash[:text_bytes]
|
310
|
+
hash[:text] = @text.length
|
311
|
+
hash[:text_bytes] = @text.sum(&:length)
|
280
312
|
# Else take the var's #length method return value.
|
281
313
|
else
|
282
314
|
next unless instance_variable_get(var).respond_to?(:length)
|
@@ -305,25 +337,43 @@ module Wgit
|
|
305
337
|
end
|
306
338
|
|
307
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
308
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
309
341
|
#
|
310
342
|
# @param xpath [String] The xpath to search the @html with.
|
311
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
312
344
|
def xpath(xpath)
|
313
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
346
|
+
end
|
347
|
+
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
314
355
|
end
|
315
356
|
|
316
|
-
# Uses Nokogiri's css method to search the doc's html and return the
|
317
|
-
# results.
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
318
359
|
#
|
319
360
|
# @param selector [String] The CSS selector to search the @html with.
|
320
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
321
362
|
def css(selector)
|
322
|
-
@
|
363
|
+
@parser.css(selector)
|
323
364
|
end
|
324
365
|
|
325
|
-
#
|
326
|
-
#
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
373
|
+
end
|
374
|
+
|
375
|
+
# Returns all unique internal links from this Document in relative form.
|
376
|
+
# Internal meaning a link to another document on the same host.
|
327
377
|
#
|
328
378
|
# This Document's host is used to determine if an absolute URL is actually
|
329
379
|
# a relative link e.g. For a Document representing
|
@@ -332,41 +382,48 @@ module Wgit
|
|
332
382
|
# as an internal link because both Documents live on the same host. Also
|
333
383
|
# see Wgit::Document#internal_absolute_links.
|
334
384
|
#
|
335
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in relative form.
|
385
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in relative form.
|
336
386
|
def internal_links
|
337
387
|
return [] if @links.empty?
|
338
388
|
|
339
389
|
links = @links
|
340
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
341
391
|
.map(&:omit_base)
|
342
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
343
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
344
394
|
end
|
345
395
|
|
346
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
347
397
|
end
|
348
398
|
|
349
|
-
# Returns all internal links from this Document in absolute form by
|
399
|
+
# Returns all unique internal links from this Document in absolute form by
|
350
400
|
# appending them to self's #base_url. Also see
|
351
401
|
# Wgit::Document#internal_links.
|
352
402
|
#
|
353
|
-
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
403
|
+
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
354
404
|
def internal_absolute_links
|
355
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
356
406
|
end
|
357
407
|
|
358
|
-
# Returns all external links from this Document in absolute form.
|
359
|
-
# meaning a link to a different host.
|
408
|
+
# Returns all unique external links from this Document in absolute form.
|
409
|
+
# External meaning a link to a different host.
|
360
410
|
#
|
361
|
-
# @return [Array<Wgit::Url>] Self's external Url's in absolute form.
|
411
|
+
# @return [Array<Wgit::Url>] Self's unique external Url's in absolute form.
|
362
412
|
def external_links
|
363
413
|
return [] if @links.empty?
|
364
414
|
|
365
415
|
links = @links
|
366
|
-
.
|
416
|
+
.map do |link|
|
417
|
+
if link.scheme_relative?
|
418
|
+
link.prefix_scheme(@url.to_scheme.to_sym)
|
419
|
+
else
|
420
|
+
link
|
421
|
+
end
|
422
|
+
end
|
423
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
367
424
|
.map(&:omit_trailing_slash)
|
368
425
|
|
369
|
-
Wgit::Utils.
|
426
|
+
Wgit::Utils.sanitize(links)
|
370
427
|
end
|
371
428
|
|
372
429
|
# Searches the @text for the given query and returns the results.
|
@@ -381,8 +438,8 @@ module Wgit
|
|
381
438
|
# original sentence, whichever is less. The algorithm obviously ensures
|
382
439
|
# that the search query is visible somewhere in the sentence.
|
383
440
|
#
|
384
|
-
# @param query [
|
385
|
-
# @text for.
|
441
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
442
|
+
# document's @text for.
|
386
443
|
# @param case_sensitive [Boolean] Whether character case must match.
|
387
444
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
388
445
|
# for separately.
|
@@ -390,21 +447,27 @@ module Wgit
|
|
390
447
|
# sentence.
|
391
448
|
# @return [Array<String>] A subset of @text, matching the query.
|
392
449
|
def search(
|
393
|
-
query, case_sensitive: false, whole_sentence:
|
450
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
394
451
|
)
|
395
|
-
query = query.to_s
|
396
|
-
raise 'A search query must be provided' if query.empty?
|
397
452
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
398
453
|
|
399
|
-
|
400
|
-
|
454
|
+
if query.is_a?(Regexp)
|
455
|
+
regex = query
|
456
|
+
else # respond_to? #to_s == true
|
457
|
+
query = query.to_s
|
458
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
459
|
+
regex = Regexp.new(query, !case_sensitive)
|
460
|
+
end
|
461
|
+
|
401
462
|
results = {}
|
402
463
|
|
403
464
|
@text.each do |sentence|
|
465
|
+
sentence = sentence.strip
|
466
|
+
next if results[sentence]
|
467
|
+
|
404
468
|
hits = sentence.scan(regex).count
|
405
469
|
next unless hits.positive?
|
406
470
|
|
407
|
-
sentence.strip!
|
408
471
|
index = sentence.index(regex) # Index of first match.
|
409
472
|
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
410
473
|
|
@@ -422,8 +485,8 @@ module Wgit
|
|
422
485
|
# functionality. The original text is returned; no other reference to it
|
423
486
|
# is kept thereafter.
|
424
487
|
#
|
425
|
-
# @param query [
|
426
|
-
# @text for.
|
488
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
489
|
+
# document's @text for.
|
427
490
|
# @param case_sensitive [Boolean] Whether character case must match.
|
428
491
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
429
492
|
# for separately.
|
@@ -431,7 +494,7 @@ module Wgit
|
|
431
494
|
# sentence.
|
432
495
|
# @return [String] This Document's original @text value.
|
433
496
|
def search!(
|
434
|
-
query, case_sensitive: false, whole_sentence:
|
497
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
435
498
|
)
|
436
499
|
orig_text = @text
|
437
500
|
@text = search(
|
@@ -442,104 +505,114 @@ module Wgit
|
|
442
505
|
orig_text
|
443
506
|
end
|
444
507
|
|
508
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
509
|
+
# parameter.
|
510
|
+
#
|
511
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
513
|
+
# Object) : results (Array).
|
514
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
|
+
# (String) : result (Nokogiri Object).
|
516
|
+
# @return [String, Object] The value found in the html or the default value
|
517
|
+
# (singleton ? nil : []).
|
518
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
519
|
+
send(
|
520
|
+
:extract_from_html, xpath,
|
521
|
+
singleton: singleton, text_content_only: text_content_only
|
522
|
+
)
|
523
|
+
end
|
524
|
+
|
445
525
|
protected
|
446
526
|
|
447
527
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
448
528
|
# Override this method to custom configure the Nokogiri object returned.
|
449
529
|
# Gets called from Wgit::Document.new upon initialization.
|
450
530
|
#
|
531
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
532
|
+
# initialisation.
|
451
533
|
# @raise [StandardError] If @html isn't set.
|
452
534
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
453
|
-
def init_nokogiri
|
535
|
+
def init_nokogiri(&block)
|
454
536
|
raise '@html must be set' unless @html
|
455
537
|
|
456
|
-
Nokogiri::HTML(@html)
|
457
|
-
# TODO: Remove #'s below when crawling in production.
|
458
|
-
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
459
|
-
# Nokogiri::XML::ParseOptions::NONET
|
460
|
-
end
|
538
|
+
Nokogiri::HTML(@html, &block)
|
461
539
|
end
|
462
540
|
|
463
|
-
#
|
541
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
464
542
|
# parameter.
|
465
543
|
#
|
466
|
-
# @param xpath [String] Used to find the value/object in @html.
|
544
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
467
545
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
468
546
|
# Object) : results (Array).
|
469
547
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
470
548
|
# (String) : result (Nokogiri Object).
|
471
|
-
# @yield
|
472
|
-
#
|
473
|
-
#
|
474
|
-
#
|
549
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
550
|
+
# regardless of the source. Use it (optionally) to process the result
|
551
|
+
# value.
|
552
|
+
# @yieldparam value [Object] The result value to be returned.
|
553
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
554
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
555
|
+
# `:object`.
|
556
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
557
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
475
558
|
# @return [String, Object] The value found in the html or the default value
|
476
559
|
# (singleton ? nil : []).
|
477
|
-
def
|
478
|
-
|
479
|
-
|
480
|
-
results = @doc.xpath(xpath)
|
481
|
-
|
482
|
-
return default if results.nil? || results.empty?
|
483
|
-
|
484
|
-
result = if singleton
|
485
|
-
text_content_only ? results.first.content : results.first
|
486
|
-
else
|
487
|
-
text_content_only ? results.map(&:content) : results
|
488
|
-
end
|
489
|
-
|
490
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
560
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
491
563
|
|
492
|
-
if
|
493
|
-
|
494
|
-
result = new_result unless new_result.nil?
|
564
|
+
if text_content_only
|
565
|
+
result = singleton ? result&.content : result.map(&:content)
|
495
566
|
end
|
496
567
|
|
568
|
+
Wgit::Utils.sanitize(result)
|
569
|
+
result = yield(result, self, :document) if block_given?
|
497
570
|
result
|
498
571
|
end
|
499
572
|
|
500
|
-
# Returns a value from the obj using the given key via obj#fetch
|
573
|
+
# Returns a value from the obj using the given key via `obj#fetch`.
|
501
574
|
#
|
502
|
-
# @param obj [
|
575
|
+
# @param obj [#fetch] The object containing the key/value.
|
503
576
|
# @param key [String] Used to find the value in the obj.
|
504
577
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
505
|
-
# @yield
|
506
|
-
#
|
507
|
-
#
|
508
|
-
#
|
578
|
+
# @yield The block is executed when a Wgit::Document is initialized,
|
579
|
+
# regardless of the source. Use it (optionally) to process the result
|
580
|
+
# value.
|
581
|
+
# @yieldparam value [Object] The result value to be returned.
|
582
|
+
# @yieldparam source [Wgit::Document, Object] The source of the `value`.
|
583
|
+
# @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
|
584
|
+
# `:object`.
|
585
|
+
# @yieldreturn [Object] The return value of the block gets returned. Return
|
586
|
+
# the block's `value` param unchanged if you simply want to inspect it.
|
509
587
|
# @return [String, Object] The value found in the obj or the default value
|
510
588
|
# (singleton ? nil : []).
|
511
|
-
def
|
589
|
+
def extract_from_object(obj, key, singleton: true)
|
512
590
|
assert_respond_to(obj, :fetch)
|
513
591
|
|
514
592
|
default = singleton ? nil : []
|
515
593
|
result = obj.fetch(key.to_s, default)
|
516
594
|
|
517
|
-
|
518
|
-
|
519
|
-
if block_given?
|
520
|
-
new_result = yield(result, obj, :object)
|
521
|
-
result = new_result unless new_result.nil?
|
522
|
-
end
|
523
|
-
|
595
|
+
Wgit::Utils.sanitize(result)
|
596
|
+
result = yield(result, obj, :object) if block_given?
|
524
597
|
result
|
525
598
|
end
|
526
599
|
|
527
600
|
private
|
528
601
|
|
529
602
|
# Initialise the Document from URL and HTML Strings.
|
530
|
-
def init_from_strings(url, html,
|
603
|
+
def init_from_strings(url, html, encode: true)
|
531
604
|
assert_types(html, [String, NilClass])
|
532
605
|
|
533
606
|
# We already know url.is_a?(String) so parse into Url unless already so.
|
534
607
|
url = Wgit::Url.parse(url)
|
535
608
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
536
609
|
|
537
|
-
@url
|
538
|
-
@html
|
539
|
-
@
|
540
|
-
@score
|
610
|
+
@url = url
|
611
|
+
@html = html || ''
|
612
|
+
@parser = init_nokogiri
|
613
|
+
@score = 0.0
|
541
614
|
|
542
|
-
Wgit::Utils.
|
615
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
543
616
|
|
544
617
|
# Dynamically run the init_*_from_html methods.
|
545
618
|
Document.private_instance_methods(false).each do |method|
|
@@ -552,15 +625,15 @@ module Wgit
|
|
552
625
|
|
553
626
|
# Initialise the Document from a Hash like Object containing Strings as
|
554
627
|
# keys e.g. database collection object or Hash.
|
555
|
-
def init_from_object(obj,
|
628
|
+
def init_from_object(obj, encode: true)
|
556
629
|
assert_respond_to(obj, :fetch)
|
557
630
|
|
558
|
-
@url
|
559
|
-
@html
|
560
|
-
@
|
561
|
-
@score
|
631
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
632
|
+
@html = obj.fetch('html', '')
|
633
|
+
@parser = init_nokogiri
|
634
|
+
@score = obj.fetch('score', 0.0)
|
562
635
|
|
563
|
-
Wgit::Utils.
|
636
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
564
637
|
|
565
638
|
# Dynamically run the init_*_from_object methods.
|
566
639
|
Document.private_instance_methods(false).each do |method|
|
@@ -571,11 +644,11 @@ module Wgit
|
|
571
644
|
end
|
572
645
|
end
|
573
646
|
|
574
|
-
# Initialises an instance variable and defines
|
647
|
+
# Initialises an instance variable and defines an accessor method for it.
|
575
648
|
#
|
576
649
|
# @param var [Symbol] The name of the variable to be initialized.
|
577
650
|
# @param value [Object] The newly initialized variable's value.
|
578
|
-
# @return [Symbol] The name of the
|
651
|
+
# @return [Symbol] The name of the defined getter method.
|
579
652
|
def init_var(var, value)
|
580
653
|
# instance_var_name starts with @, var_name doesn't.
|
581
654
|
var = var.to_s
|
@@ -583,10 +656,9 @@ module Wgit
|
|
583
656
|
instance_var_name = "@#{var_name}".to_sym
|
584
657
|
|
585
658
|
instance_variable_set(instance_var_name, value)
|
659
|
+
Wgit::Document.attr_accessor(var_name)
|
586
660
|
|
587
|
-
|
588
|
-
instance_variable_get(instance_var_name)
|
589
|
-
end
|
661
|
+
var_name
|
590
662
|
end
|
591
663
|
|
592
664
|
alias content html
|