wgit 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/document.rb CHANGED
@@ -1,592 +1,594 @@
1
- require_relative 'url'
2
- require_relative 'utils'
3
- require_relative 'assertable'
4
- require 'nokogiri'
5
- require 'json'
6
-
7
- module Wgit
8
-
9
- # Class modeling a HTML web document. Also doubles as a search result when
10
- # loading Documents from the database.
11
- #
12
- # The initialize method dynamically initializes certain variables from the
13
- # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
- # Document class can be easily extended allowing you to pull out the bits of
15
- # a webpage that are important to you. See Wgit::Document.define_extension.
16
- class Document
17
- include Assertable
18
-
19
- # The HTML elements that make up the visible text on a page.
20
- # These elements are used to initialize the @text of the Document.
21
- # See the README.md for how to add to this Array dynamically.
22
- @@text_elements = [
23
- :dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
24
- :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
- ]
26
-
27
- # The URL of the webpage, an instance of Wgit::Url.
28
- attr_reader :url
29
-
30
- # The HTML of the webpage, an instance of String.
31
- attr_reader :html
32
-
33
- # The Nokogiri document object initialized from @html.
34
- attr_reader :doc
35
-
36
- # The score is only used following a Database#search and records matches.
37
- attr_reader :score
38
-
39
- # Initialize takes either two strings (representing the URL and HTML) or an
40
- # object representing a database record (of a HTTP crawled web page). This
41
- # allows for initialisation from both crawled web pages and (afterwards)
42
- # documents/web pages retrieved from the database.
43
- #
44
- # During initialisation, the Document will call any
45
- # 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
46
- # default init_* methods exist while others can be defined by the user.
47
- # See the README and Wgit::Document.define_extension for more info.
48
- #
49
- # @param url_or_obj [String, Object#fetch] Either a String representing a
50
- # URL or a Hash-like object responding to :fetch. e.g. a MongoDB
51
- # collection object. The Object's :fetch method should support Strings as
52
- # keys.
53
- # @param html [String] The crawled web page's HTML. This param is only
54
- # required if url_or_obj is a String representing the web page's URL.
55
- def initialize(url_or_obj, html = "")
56
- # Init from URL String and HTML String.
57
- if url_or_obj.is_a?(String)
58
- url = url_or_obj
59
- assert_type(url, Wgit::Url)
60
-
61
- @url = url
62
- @html = html ||= ""
63
- @doc = init_nokogiri
64
- @score = 0.0
65
-
66
- process_url_and_html
67
-
68
- # Dynamically run the init_*_from_html methods.
69
- Document.private_instance_methods(false).each do |method|
70
- if method.to_s.start_with?("init_") &&
71
- method.to_s.end_with?("_from_html")
72
- self.send(method)
73
- end
74
- end
75
- # Init from a Hash like object containing Strings as keys e.g. Mongo
76
- # collection obj.
77
- else
78
- obj = url_or_obj
79
- assert_respond_to(obj, :fetch)
80
-
81
- @url = obj.fetch("url") # Should always be present.
82
- @html = obj.fetch("html", "")
83
- @doc = init_nokogiri
84
- @score = obj.fetch("score", 0.0)
85
-
86
- process_url_and_html
87
-
88
- # Dynamically run the init_*_from_object methods.
89
- Document.private_instance_methods(false).each do |method|
90
- if method.to_s.start_with?("init_") &&
91
- method.to_s.end_with?("_from_object")
92
- self.send(method, obj)
93
- end
94
- end
95
- end
96
- end
97
-
98
- # Determines if both the url and html match. Use
99
- # doc.object_id == other_doc.object_id for exact object comparison.
100
- #
101
- # @param other_doc [Wgit::Document] To compare self against.
102
- # @return [Boolean] True if @url and @html are equal, false if not.
103
- def ==(other_doc)
104
- return false unless other_doc.is_a? Wgit::Document
105
- @url == other_doc.url and @html == other_doc.html
106
- end
107
-
108
- # Is a shortcut for calling Document#html[range].
109
- #
110
- # @param range [Range] The range of @html to return.
111
- # @return [String] The given range of @html.
112
- def [](range)
113
- @html[range]
114
- end
115
-
116
- def date_crawled
117
- @url.date_crawled
118
- end
119
-
120
- # Returns a Hash containing this Document's instance vars.
121
- # Used when storing the Document in a Database e.g. MongoDB etc.
122
- # By default the @html var is excluded from the returned Hash.
123
- #
124
- # @param include_html [Boolean] Whether or not to include @html in the
125
- # returned Hash.
126
- # @return [Hash] Containing self's instance vars.
127
- def to_h(include_html = false)
128
- ignore = include_html ? [] : ["@html"]
129
- ignore << "@doc" # Always ignore "@doc"
130
- Wgit::Utils.to_h(self, ignore)
131
- end
132
-
133
- # Converts this Document's to_h return value to a JSON String.
134
- #
135
- # @param include_html [Boolean] Whether or not to include @html in the
136
- # returned JSON String.
137
- # @return [String] This Document represented as a JSON String.
138
- def to_json(include_html = false)
139
- h = to_h(include_html)
140
- JSON.generate(h)
141
- end
142
-
143
- # Returns a Hash containing this Document's instance variables and
144
- # their :length (if they respond to it). Works dynamically so that any
145
- # user defined extensions (and their created instance vars) will appear in
146
- # the returned Hash as well. The number of text snippets as well as total
147
- # number of textual bytes are always included in the returned Hash.
148
- #
149
- # @return [Hash] Containing self's HTML statistics.
150
- def stats
151
- hash = {}
152
- instance_variables.each do |var|
153
- # Add up the total bytes of text as well as the length.
154
- if var == :@text
155
- count = 0
156
- @text.each { |t| count += t.length }
157
- hash[:text_length] = @text.length
158
- hash[:text_bytes] = count
159
- # Else take the var's #length method return value.
160
- else
161
- next unless instance_variable_get(var).respond_to?(:length)
162
- hash[var[1..-1].to_sym] =
163
- instance_variable_get(var).send(:length)
164
- end
165
- end
166
- hash
167
- end
168
-
169
- # Determine the size of this Document's HTML.
170
- #
171
- # @return [Integer] The total number of bytes in @html.
172
- def size
173
- stats[:html]
174
- end
175
-
176
- # Determine if this Document's HTML is empty or not.
177
- #
178
- # @return [Boolean] True if @html is nil/empty, false otherwise.
179
- def empty?
180
- return true if @html.nil?
181
- @html.empty?
182
- end
183
-
184
- # Uses Nokogiri's xpath method to search the doc's html and return the
185
- # results.
186
- #
187
- # @param xpath [String] The xpath to search the @html with.
188
- # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
189
- def xpath(xpath)
190
- @doc.xpath(xpath)
191
- end
192
-
193
- # Uses Nokogiri's css method to search the doc's html and return the
194
- # results.
195
- #
196
- # @param selector [String] The CSS selector to search the @html with.
197
- # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
198
- def css(selector)
199
- @doc.css(selector)
200
- end
201
-
202
- # Get all internal links of this Document in relative form. Internal
203
- # meaning a link to another page on this website. Also see
204
- # Wgit::Document#internal_full_links.
205
- #
206
- # @return [Array<Wgit::Url>] self's internal/relative URL's.
207
- def internal_links
208
- return [] if @links.empty?
209
-
210
- links = @links.
211
- reject do |link|
212
- not link.relative_link?(base: @url.to_base)
213
- rescue
214
- true
215
- end.
216
- map(&:to_path)
217
-
218
- process_arr(links)
219
- end
220
-
221
- # Get all internal links of this Document and append them to this
222
- # Document's base URL making them absolute. Also see
223
- # Wgit::Document#internal_links.
224
- #
225
- # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
226
- # form.
227
- def internal_full_links
228
- in_links = internal_links
229
- return [] if in_links.empty?
230
- in_links.map { |link| @url.to_base.concat(link) }
231
- end
232
-
233
- # Get all external links of this Document. External meaning a link to
234
- # another website.
235
- #
236
- # @return [Array<Wgit::Url>] self's external/absolute URL's.
237
- def external_links
238
- return [] if @links.empty?
239
-
240
- links = @links.
241
- reject do |link|
242
- link.relative_link?(base: @url.to_base)
243
- rescue
244
- true
245
- end.
246
- map { |link| link.end_with?('/') ? link.chop : link }
247
-
248
- process_arr(links)
249
- end
250
-
251
- # Searches against the @text for the given search query.
252
- # The number of search hits for each sentence is recorded internally
253
- # and used to rank/sort the search results before being returned. Where
254
- # the Wgit::Database#search method search all documents for the most hits,
255
- # this method searches each document's @text for the most hits.
256
- #
257
- # Each search result comprises a sentence of a given length. The length
258
- # will be based on the sentence_limit parameter or the full length of the
259
- # original sentence, whichever is less. The algorithm obviously ensures
260
- # that the search query is visible somewhere in the sentence.
261
- #
262
- # @param query [String] The value to search the document's text against.
263
- # @param sentence_limit [Integer] The max length of each search result
264
- # sentence.
265
- # @return [Array<String>] Representing the search results.
266
- def search(query, sentence_limit = 80)
267
- raise "A search query must be provided" if query.empty?
268
- raise "The sentence_limit value must be even" if sentence_limit.odd?
269
-
270
- results = {}
271
- regex = Regexp.new(query, Regexp::IGNORECASE)
272
-
273
- @text.each do |sentence|
274
- hits = sentence.scan(regex).count
275
- if hits > 0
276
- sentence.strip!
277
- index = sentence.index(regex)
278
- Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
279
- results[sentence] = hits
280
- end
281
- end
282
-
283
- return [] if results.empty?
284
- results = Hash[results.sort_by { |k, v| v }]
285
- results.keys.reverse
286
- end
287
-
288
- # Performs a text search (see Document#search for details) but assigns the
289
- # results to the @text instance variable. This can be used for sub search
290
- # functionality. The original text is returned; no other reference to it
291
- # is kept thereafter.
292
- #
293
- # @param query [String] The value to search the document's text against.
294
- # @param sentence_limit [Integer] The max length of each search result
295
- # sentence.
296
- # @return [Array<String>] This Document's original @text value.
297
- def search!(query, sentence_limit = 80)
298
- orig_text = @text
299
- @text = search(query, sentence_limit)
300
- orig_text
301
- end
302
-
303
- ### Document (Class) methods ###
304
-
305
- # Returns Document.text_elements used to obtain the text in a webpage.
306
- #
307
- # @return [Array<Symbol>] The page elements containing visual text on a
308
- # webpage.
309
- def self.text_elements
310
- @@text_elements
311
- end
312
-
313
- # Initialises a private instance variable with the xpath or database object
314
- # result(s). When initialising from HTML, a true singleton value will only
315
- # ever return one result otherwise all xpath results are returned in an
316
- # Array. When initialising from a database object, the value is taken as
317
- # is and singleton is only used to define the default empty value.
318
- # If a value cannot be found (in either the HTML or database object), then
319
- # a default will be used. The default is: singleton ? nil : [].
320
- #
321
- # Note that defined extensions work for both documents being crawled from
322
- # the WWW and for documents being retrieved from the database. This
323
- # effectively implements ORM like behavior using this class.
324
- #
325
- # @param var [Symbol] The name of the variable to be initialised.
326
- # @param xpath [String] Used to find the element(s) of the webpage.
327
- # @option options [Boolean] :singleton The singleton option determines
328
- # whether or not the result(s) should be in an Array. If multiple
329
- # results are found and singleton is true then the first result will be
330
- # used. Defaults to true.
331
- # @option options [Boolean] :text_content_only The text_content_only option
332
- # if true will use the text content of the Nokogiri result object,
333
- # otherwise the Nokogiri object itself is returned. Defaults to true.
334
- # @yield [var_value] Gives the value about to be assigned to the new var.
335
- # The return value of the block becomes the new var value, unless nil.
336
- # Return nil if you want to inspect but not change the var value.
337
- # @return [Symbol] The first half of the newly created method names e.g.
338
- # if var == "title" then :init_title is returned.
339
- def self.define_extension(var, xpath, options = {}, &block)
340
- default_options = { singleton: true, text_content_only: true }
341
- options = default_options.merge(options)
342
-
343
- # Define the private init_*_from_html method for HTML.
344
- # Gets the HTML's xpath value and creates a var for it.
345
- func_name = Document.send(:define_method, "init_#{var}_from_html") do
346
- result = find_in_html(xpath, options, &block)
347
- init_var(var, result)
348
- end
349
- Document.send :private, func_name
350
-
351
- # Define the private init_*_from_object method for a Database object.
352
- # Gets the Object's "key" value and creates a var for it.
353
- func_name = Document.send(
354
- :define_method, "init_#{var}_from_object") do |obj|
355
- result = find_in_object(
356
- obj, var.to_s, singleton: options[:singleton], &block)
357
- init_var(var, result)
358
- end
359
- Document.send :private, func_name
360
-
361
- "init_#{var}".to_sym
362
- end
363
-
364
- # Removes the init_* methods created when an extension is defined.
365
- # Therefore, this is the opposing method to Document.define_extension.
366
- # Returns true if successful or false if the method(s) cannot be found.
367
- #
368
- # @param var [Symbol] The extension variable already defined.
369
- # @return [Boolean] True if the extension var was found and removed;
370
- # otherwise false.
371
- def self.remove_extension(var)
372
- Document.send(:remove_method, "init_#{var}_from_html")
373
- Document.send(:remove_method, "init_#{var}_from_object")
374
- true
375
- rescue NameError
376
- false
377
- end
378
-
379
- private
380
-
381
- # Initializes the nokogiri object using @html, which must be already set.
382
- def init_nokogiri
383
- raise "@html must be set" unless @html
384
- Nokogiri::HTML(@html) do |config|
385
- # TODO: Remove #'s below when crawling in production.
386
- #config.options = Nokogiri::XML::ParseOptions::STRICT |
387
- # Nokogiri::XML::ParseOptions::NONET
388
- end
389
- end
390
-
391
- # Returns an object/value from this Document's @html using the provided
392
- # xpath param.
393
- # singleton ? results.first (single Object) : results (Array)
394
- # text_content_only ? result.content (String) : result (nokogiri Object)
395
- # A block can be used to set the final value before it is returned.
396
- # Return nil from the block if you don't want to override the value.
397
- def find_in_html(xpath, singleton: true, text_content_only: true)
398
- results = @doc.xpath(xpath)
399
-
400
- if results and not results.empty?
401
- result = if singleton
402
- text_content_only ? results.first.content : results.first
403
- else
404
- text_content_only ? results.map(&:content) : results
405
- end
406
- else
407
- result = singleton ? nil : []
408
- end
409
-
410
- singleton ? process_str(result) : process_arr(result)
411
-
412
- if block_given?
413
- new_result = yield(result)
414
- result = new_result if new_result
415
- end
416
-
417
- result
418
- end
419
-
420
- # Finds a value in the obj using the key.
421
- # singleton is used to set the value if not found in obj.
422
- # A block can be used to set the final value before it is returned.
423
- # Return nil from the block if you don't want to override the value.
424
- def find_in_object(obj, key, singleton: true)
425
- assert_respond_to(obj, :fetch)
426
-
427
- default = singleton ? nil : []
428
- result = obj.fetch(key.to_s, default)
429
- singleton ? process_str(result) : process_arr(result)
430
-
431
- if block_given?
432
- new_result = yield(result)
433
- result = new_result if new_result
434
- end
435
-
436
- result
437
- end
438
-
439
- # Initialises an instance variable and defines a getter method for it.
440
- # @param var [Symbol] The name of the variable to be initialized.
441
- # @param value [Object] The newly initialized variable's value.
442
- # @return [Symbol] The name of the newly created getter method.
443
- def init_var(var, value)
444
- # instance_var_name starts with @, var_name doesn't.
445
- var = var.to_s
446
- var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
447
- instance_var_name = "@#{var_name}".to_sym
448
-
449
- instance_variable_set(instance_var_name, value)
450
-
451
- Document.send(:define_method, var_name) do
452
- instance_variable_get(instance_var_name)
453
- end
454
- end
455
-
456
- # Takes Document.text_elements and returns an xpath String used to obtain
457
- # all of the combined text.
458
- def text_elements_xpath
459
- xpath = ""
460
- return xpath if @@text_elements.empty?
461
- el_xpath = "//%s/text()"
462
- @@text_elements.each_with_index do |el, i|
463
- xpath += " | " unless i == 0
464
- xpath += el_xpath % [el]
465
- end
466
- xpath
467
- end
468
-
469
- # Processes a String to make it uniform.
470
- def process_str(str)
471
- if str.is_a?(String)
472
- str.encode!('UTF-8', 'UTF-8', invalid: :replace)
473
- str.strip!
474
- end
475
- str
476
- end
477
-
478
- # Processes an Array to make it uniform.
479
- def process_arr(array)
480
- if array.is_a?(Array)
481
- array.map! { |str| process_str(str) }
482
- array.reject! { |str| str.is_a?(String) ? str.empty? : false }
483
- array.compact!
484
- array.uniq!
485
- end
486
- array
487
- end
488
-
489
- # Ensure the @url and @html Strings are correctly encoded etc.
490
- def process_url_and_html
491
- @url = process_str(@url)
492
- @html = process_str(@html)
493
- end
494
-
495
- ### Default init_* (Document extension) methods. ###
496
-
497
- # Init methods for title.
498
-
499
- def init_title_from_html
500
- xpath = "//title"
501
- result = find_in_html(xpath)
502
- init_var(:@title, result)
503
- end
504
-
505
- def init_title_from_object(obj)
506
- result = find_in_object(obj, "title")
507
- init_var(:@title, result)
508
- end
509
-
510
- # Init methods for author.
511
-
512
- def init_author_from_html
513
- xpath = "//meta[@name='author']/@content"
514
- result = find_in_html(xpath)
515
- init_var(:@author, result)
516
- end
517
-
518
- def init_author_from_object(obj)
519
- result = find_in_object(obj, "author")
520
- init_var(:@author, result)
521
- end
522
-
523
- # Init methods for keywords.
524
-
525
- def init_keywords_from_html
526
- xpath = "//meta[@name='keywords']/@content"
527
- result = find_in_html(xpath) do |keywords|
528
- if keywords
529
- keywords = keywords.split(",")
530
- process_arr(keywords)
531
- end
532
- keywords
533
- end
534
- init_var(:@keywords, result)
535
- end
536
-
537
- def init_keywords_from_object(obj)
538
- result = find_in_object(obj, "keywords", singleton: false)
539
- init_var(:@keywords, result)
540
- end
541
-
542
- # Init methods for links.
543
-
544
- def init_links_from_html
545
- xpath = "//a/@href"
546
- result = find_in_html(xpath, singleton: false) do |links|
547
- if links
548
- links.map! do |link|
549
- begin
550
- Wgit::Url.new(link)
551
- rescue
552
- nil
553
- end
554
- end
555
- links.compact!
556
- end
557
- links
558
- end
559
- init_var(:@links, result)
560
- end
561
-
562
- def init_links_from_object(obj)
563
- result = find_in_object(obj, "links", singleton: false) do |links|
564
- if links
565
- links.map! { |link| Wgit::Url.new(link) }
566
- end
567
- links
568
- end
569
- init_var(:@links, result)
570
- end
571
-
572
- # Init methods for text.
573
-
574
- def init_text_from_html
575
- xpath = text_elements_xpath
576
- result = find_in_html(xpath, singleton: false)
577
- init_var(:@text, result)
578
- end
579
-
580
- def init_text_from_object(obj)
581
- result = find_in_object(obj, "text", singleton: false)
582
- init_var(:@text, result)
583
- end
584
-
585
- alias :to_hash :to_h
586
- alias :relative_links :internal_links
587
- alias :relative_urls :internal_links
588
- alias :relative_full_links :internal_full_links
589
- alias :relative_full_urls :internal_full_links
590
- alias :external_urls :external_links
591
- end
592
- end
1
+ require_relative 'url'
2
+ require_relative 'utils'
3
+ require_relative 'assertable'
4
+ require 'nokogiri'
5
+ require 'json'
6
+
7
+ module Wgit
8
+
9
+ # Class modeling a HTML web document. Also doubles as a search result when
10
+ # loading Documents from the database.
11
+ #
12
+ # The initialize method dynamically initializes certain variables from the
13
+ # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
+ # Document class can be easily extended allowing you to pull out the bits of
15
+ # a webpage that are important to you. See Wgit::Document.define_extension.
16
+ class Document
17
+ include Assertable
18
+
19
+ # The HTML elements that make up the visible text on a page.
20
+ # These elements are used to initialize the @text of the Document.
21
+ # See the README.md for how to add to this Array dynamically.
22
+ @@text_elements = [
23
+ :dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
24
+ :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
+ ]
26
+
27
+ # The URL of the webpage, an instance of Wgit::Url.
28
+ attr_reader :url
29
+
30
+ # The HTML of the webpage, an instance of String.
31
+ attr_reader :html
32
+
33
+ # The Nokogiri document object initialized from @html.
34
+ attr_reader :doc
35
+
36
+ # The score is only used following a Database#search and records matches.
37
+ attr_reader :score
38
+
39
+ # Initialize takes either two strings (representing the URL and HTML) or an
40
+ # object representing a database record (of a HTTP crawled web page). This
41
+ # allows for initialisation from both crawled web pages and (afterwards)
42
+ # documents/web pages retrieved from the database.
43
+ #
44
+ # During initialisation, the Document will call any
45
+ # 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
46
+ # default init_* methods exist while others can be defined by the user.
47
+ # See the README and Wgit::Document.define_extension for more info.
48
+ #
49
+ # @param url_or_obj [String, Object#fetch] Either a String representing a
50
+ # URL or a Hash-like object responding to :fetch. e.g. a MongoDB
51
+ # collection object. The Object's :fetch method should support Strings as
52
+ # keys.
53
+ # @param html [String] The crawled web page's HTML. This param is only
54
+ # required if url_or_obj is a String representing the web page's URL.
55
+ def initialize(url_or_obj, html = "")
56
+ # Init from URL String and HTML String.
57
+ if url_or_obj.is_a?(String)
58
+ url = url_or_obj
59
+ assert_type(url, Wgit::Url)
60
+
61
+ @url = url
62
+ @html = html ||= ""
63
+ @doc = init_nokogiri
64
+ @score = 0.0
65
+
66
+ process_url_and_html
67
+
68
+ # Dynamically run the init_*_from_html methods.
69
+ Document.private_instance_methods(false).each do |method|
70
+ if method.to_s.start_with?("init_") &&
71
+ method.to_s.end_with?("_from_html")
72
+ self.send(method)
73
+ end
74
+ end
75
+ # Init from a Hash like object containing Strings as keys e.g. Mongo
76
+ # collection obj.
77
+ else
78
+ obj = url_or_obj
79
+ assert_respond_to(obj, :fetch)
80
+
81
+ @url = obj.fetch("url") # Should always be present.
82
+ @html = obj.fetch("html", "")
83
+ @doc = init_nokogiri
84
+ @score = obj.fetch("score", 0.0)
85
+
86
+ process_url_and_html
87
+
88
+ # Dynamically run the init_*_from_object methods.
89
+ Document.private_instance_methods(false).each do |method|
90
+ if method.to_s.start_with?("init_") &&
91
+ method.to_s.end_with?("_from_object")
92
+ self.send(method, obj)
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+ # Determines if both the url and html match. Use
99
+ # doc.object_id == other_doc.object_id for exact object comparison.
100
+ #
101
+ # @param other_doc [Wgit::Document] To compare self against.
102
+ # @return [Boolean] True if @url and @html are equal, false if not.
103
+ def ==(other_doc)
104
+ return false unless other_doc.is_a? Wgit::Document
105
+ @url == other_doc.url and @html == other_doc.html
106
+ end
107
+
108
+ # Is a shortcut for calling Document#html[range].
109
+ #
110
+ # @param range [Range] The range of @html to return.
111
+ # @return [String] The given range of @html.
112
+ def [](range)
113
+ @html[range]
114
+ end
115
+
116
+ # Returns the timestamp of when this Wgit::Document was crawled.
117
+ #
118
+ # @return [Time] Time of when this Wgit::Document was crawled.
119
+ def date_crawled
120
+ @url.date_crawled
121
+ end
122
+
123
+ # Returns a Hash containing this Document's instance vars.
124
+ # Used when storing the Document in a Database e.g. MongoDB etc.
125
+ # By default the @html var is excluded from the returned Hash.
126
+ #
127
+ # @param include_html [Boolean] Whether or not to include @html in the
128
+ # returned Hash.
129
+ # @return [Hash] Containing self's instance vars.
130
+ def to_h(include_html = false)
131
+ ignore = include_html ? [] : ["@html"]
132
+ ignore << "@doc" # Always ignore "@doc"
133
+ Wgit::Utils.to_h(self, ignore)
134
+ end
135
+
136
+ # Converts this Document's to_h return value to a JSON String.
137
+ #
138
+ # @param include_html [Boolean] Whether or not to include @html in the
139
+ # returned JSON String.
140
+ # @return [String] This Document represented as a JSON String.
141
+ def to_json(include_html = false)
142
+ h = to_h(include_html)
143
+ JSON.generate(h)
144
+ end
145
+
146
+ # Returns a Hash containing this Document's instance variables and
147
+ # their :length (if they respond to it). Works dynamically so that any
148
+ # user defined extensions (and their created instance vars) will appear in
149
+ # the returned Hash as well. The number of text snippets as well as total
150
+ # number of textual bytes are always included in the returned Hash.
151
+ #
152
+ # @return [Hash] Containing self's HTML statistics.
153
+ def stats
154
+ hash = {}
155
+ instance_variables.each do |var|
156
+ # Add up the total bytes of text as well as the length.
157
+ if var == :@text
158
+ count = 0
159
+ @text.each { |t| count += t.length }
160
+ hash[:text_length] = @text.length
161
+ hash[:text_bytes] = count
162
+ # Else take the var's #length method return value.
163
+ else
164
+ next unless instance_variable_get(var).respond_to?(:length)
165
+ hash[var[1..-1].to_sym] =
166
+ instance_variable_get(var).send(:length)
167
+ end
168
+ end
169
+ hash
170
+ end
171
+
172
+ # Determine the size of this Document's HTML.
173
+ #
174
+ # @return [Integer] The total number of bytes in @html.
175
+ def size
176
+ stats[:html]
177
+ end
178
+
179
+ # Determine if this Document's HTML is empty or not.
180
+ #
181
+ # @return [Boolean] True if @html is nil/empty, false otherwise.
182
+ def empty?
183
+ return true if @html.nil?
184
+ @html.empty?
185
+ end
186
+
187
+ # Uses Nokogiri's xpath method to search the doc's html and return the
188
+ # results.
189
+ #
190
+ # @param xpath [String] The xpath to search the @html with.
191
+ # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
192
+ def xpath(xpath)
193
+ @doc.xpath(xpath)
194
+ end
195
+
196
+ # Uses Nokogiri's css method to search the doc's html and return the
197
+ # results.
198
+ #
199
+ # @param selector [String] The CSS selector to search the @html with.
200
+ # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
201
+ def css(selector)
202
+ @doc.css(selector)
203
+ end
204
+
205
+ # Get all internal links of this Document in relative form. Internal
206
+ # meaning a link to another page on this website. Also see
207
+ # Wgit::Document#internal_full_links.
208
+ #
209
+ # @return [Array<Wgit::Url>] self's internal/relative URL's.
210
+ def internal_links
211
+ return [] if @links.empty?
212
+
213
+ links = @links.
214
+ reject do |link|
215
+ not link.relative_link?(base: @url.to_base)
216
+ rescue
217
+ true
218
+ end.
219
+ map(&:to_path_and_anchor)
220
+
221
+ process_arr(links)
222
+ end
223
+
224
+ # Get all internal links of this Document and append them to this
225
+ # Document's base URL making them absolute. Also see
226
+ # Wgit::Document#internal_links.
227
+ #
228
+ # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
229
+ # form.
230
+ def internal_full_links
231
+ in_links = internal_links
232
+ return [] if in_links.empty?
233
+ in_links.map { |link| @url.to_base.concat(link) }
234
+ end
235
+
236
+ # Get all external links of this Document. External meaning a link to
237
+ # another website.
238
+ #
239
+ # @return [Array<Wgit::Url>] self's external/absolute URL's.
240
+ def external_links
241
+ return [] if @links.empty?
242
+
243
+ links = @links.
244
+ reject do |link|
245
+ link.relative_link?(base: @url.to_base)
246
+ rescue
247
+ true
248
+ end.
249
+ map(&:without_trailing_slash)
250
+
251
+ process_arr(links)
252
+ end
253
+
254
+ # Searches against the @text for the given search query.
255
+ # The number of search hits for each sentence is recorded internally
256
+ # and used to rank/sort the search results before being returned. Where
257
+ # the Wgit::Database#search method search all documents for the most hits,
258
+ # this method searches each document's @text for the most hits.
259
+ #
260
+ # Each search result comprises a sentence of a given length. The length
261
+ # will be based on the sentence_limit parameter or the full length of the
262
+ # original sentence, whichever is less. The algorithm obviously ensures
263
+ # that the search query is visible somewhere in the sentence.
264
+ #
265
+ # @param query [String] The value to search the document's text against.
266
+ # @param sentence_limit [Integer] The max length of each search result
267
+ # sentence.
268
+ # @return [Array<String>] Representing the search results.
269
+ def search(query, sentence_limit = 80)
270
+ raise "A search query must be provided" if query.empty?
271
+ raise "The sentence_limit value must be even" if sentence_limit.odd?
272
+
273
+ results = {}
274
+ regex = Regexp.new(query, Regexp::IGNORECASE)
275
+
276
+ @text.each do |sentence|
277
+ hits = sentence.scan(regex).count
278
+ if hits > 0
279
+ sentence.strip!
280
+ index = sentence.index(regex)
281
+ Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
282
+ results[sentence] = hits
283
+ end
284
+ end
285
+
286
+ return [] if results.empty?
287
+ results = Hash[results.sort_by { |k, v| v }]
288
+ results.keys.reverse
289
+ end
290
+
291
+ # Performs a text search (see Document#search for details) but assigns the
292
+ # results to the @text instance variable. This can be used for sub search
293
+ # functionality. The original text is returned; no other reference to it
294
+ # is kept thereafter.
295
+ #
296
+ # @param query [String] The value to search the document's text against.
297
+ # @param sentence_limit [Integer] The max length of each search result
298
+ # sentence.
299
+ # @return [Array<String>] This Document's original @text value.
300
def search!(query, sentence_limit = 80)
  # tap hands back the receiver (the text as it was before the search) once
  # the block has swapped @text for the search results.
  @text.tap { @text = search(query, sentence_limit) }
end
305
+
306
+ ### Document (Class) methods ###
307
+
308
+ # Returns Document.text_elements used to obtain the text in a webpage.
309
+ #
310
+ # @return [Array<Symbol>] The page elements containing visual text on a
311
+ # webpage.
312
def self.text_elements
  # Exposes the class-level list of text element names.
  class_variable_get(:@@text_elements)
end
315
+
316
+ # Initialises a private instance variable with the xpath or database object
317
+ # result(s). When initialising from HTML, a true singleton value will only
318
+ # ever return one result otherwise all xpath results are returned in an
319
+ # Array. When initialising from a database object, the value is taken as
320
+ # is and singleton is only used to define the default empty value.
321
+ # If a value cannot be found (in either the HTML or database object), then
322
+ # a default will be used. The default is: singleton ? nil : [].
323
+ #
324
+ # Note that defined extensions work for both documents being crawled from
325
+ # the WWW and for documents being retrieved from the database. This
326
+ # effectively implements ORM like behavior using this class.
327
+ #
328
+ # @param var [Symbol] The name of the variable to be initialised.
329
+ # @param xpath [String] Used to find the element(s) of the webpage.
330
+ # @option options [Boolean] :singleton The singleton option determines
331
+ # whether or not the result(s) should be in an Array. If multiple
332
+ # results are found and singleton is true then the first result will be
333
+ # used. Defaults to true.
334
+ # @option options [Boolean] :text_content_only The text_content_only option
335
+ # if true will use the text content of the Nokogiri result object,
336
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
337
+ # @yield [var_value] Gives the value about to be assigned to the new var.
338
+ # The return value of the block becomes the new var value, unless nil.
339
+ # Return nil if you want to inspect but not change the var value.
340
+ # @return [Symbol] The first half of the newly created method names e.g.
341
+ # if var == "title" then :init_title is returned.
342
def self.define_extension(var, xpath, options = {}, &block)
  default_options = { singleton: true, text_content_only: true }
  options = default_options.merge(options)

  # Define the private init_*_from_html method for HTML.
  # Gets the HTML's xpath value and creates a var for it.
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
    # find_in_html accepts keyword arguments only, so the Hash must be
    # splatted; Ruby 3 no longer converts a trailing positional Hash into
    # keywords and would raise ArgumentError.
    result = find_in_html(xpath, **options, &block)
    init_var(var, result)
  end
  Document.send :private, func_name

  # Define the private init_*_from_object method for a Database object.
  # Gets the Object's "key" value and creates a var for it.
  func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
    # Only :singleton is relevant here; it selects the default empty value.
    result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
    init_var(var, result)
  end
  Document.send :private, func_name

  # The shared prefix of the two method names defined above.
  "init_#{var}".to_sym
end
364
+
365
+ # Removes the init_* methods created when an extension is defined.
366
+ # Therefore, this is the opposing method to Document.define_extension.
367
+ # Returns true if successful or false if the method(s) cannot be found.
368
+ #
369
+ # @param var [Symbol] The extension variable already defined.
370
+ # @return [Boolean] True if the extension var was found and removed;
371
+ # otherwise false.
372
def self.remove_extension(var)
  # Remove both generated initialisers, HTML first. remove_method raises
  # NameError as soon as one of them is missing.
  ["init_#{var}_from_html", "init_#{var}_from_object"].each do |method_name|
    Document.send(:remove_method, method_name)
  end
  true
rescue NameError
  false
end
379
+
380
+ private
381
+
382
+ # Initializes the nokogiri object using @html, which must be already set.
383
def init_nokogiri
  if @html
    # The block receives the parser configuration; it is currently left
    # untouched, so Nokogiri's defaults apply.
    Nokogiri::HTML(@html) do |config|
      # TODO: Remove #'s below when crawling in production.
      #config.options = Nokogiri::XML::ParseOptions::STRICT |
      #                 Nokogiri::XML::ParseOptions::NONET
    end
  else
    raise "@html must be set"
  end
end
391
+
392
+ # Returns an object/value from this Document's @html using the provided
393
+ # xpath param.
394
+ # singleton ? results.first (single Object) : results (Array)
395
+ # text_content_only ? result.content (String) : result (nokogiri Object)
396
+ # A block can be used to set the final value before it is returned.
397
+ # Return nil from the block if you don't want to override the value.
398
def find_in_html(xpath, singleton: true, text_content_only: true)
  matches = @doc.xpath(xpath)

  value =
    if !matches || matches.empty?
      # Nothing found: fall back to the default empty value.
      singleton ? nil : []
    elsif singleton
      text_content_only ? matches.first.content : matches.first
    else
      text_content_only ? matches.map(&:content) : matches
    end

  # Both helpers hand back the object they were given.
  value = singleton ? process_str(value) : process_arr(value)

  return value unless block_given?

  # A nil/false block result means "keep the value as it is".
  yield(value) || value
end
420
+
421
+ # Finds a value in the obj using the key.
422
+ # singleton is used to set the value if not found in obj.
423
+ # A block can be used to set the final value before it is returned.
424
+ # Return nil from the block if you don't want to override the value.
425
def find_in_object(obj, key, singleton: true)
  assert_respond_to(obj, :fetch)

  # The key is looked up in its String form; singleton only decides which
  # empty value is used when the key is absent.
  fallback = singleton ? nil : []
  value = obj.fetch(key.to_s, fallback)
  value = singleton ? process_str(value) : process_arr(value)

  return value unless block_given?

  # A nil/false block result means "keep the value as it is".
  yield(value) || value
end
439
+
440
+ # Initialises an instance variable and defines a getter method for it.
441
+ # @param var [Symbol] The name of the variable to be initialized.
442
+ # @param value [Object] The newly initialized variable's value.
443
+ # @return [Symbol] The name of the newly created getter method.
444
def init_var(var, value)
  # The getter name never carries a leading "@"; the instance variable
  # name always does.
  getter_name = var.to_s.sub(/\A@/, "").to_sym
  ivar_name = "@#{getter_name}".to_sym

  instance_variable_set(ivar_name, value)

  # The reader is defined on the Document class itself, so it is available
  # on every Document instance.
  Document.send(:define_method, getter_name) { instance_variable_get(ivar_name) }
end
456
+
457
+ # Takes Document.text_elements and returns an xpath String used to obtain
458
+ # all of the combined text.
459
def text_elements_xpath
  # One "//<element>/text()" selector per text element, unioned with " | ".
  # An empty element list produces an empty String.
  @@text_elements.map { |el| "//%s/text()" % [el] }.join(" | ")
end
469
+
470
+ # Processes a String to make it uniform.
471
def process_str(str)
  # Anything that is not a String (e.g. nil) is passed straight through.
  return str unless str.is_a?(String)

  # Both calls modify the given String in place: re-encode as UTF-8, asking
  # for invalid byte sequences to be replaced, then trim whitespace.
  str.encode!('UTF-8', 'UTF-8', invalid: :replace)
  str.strip!
  str
end
478
+
479
+ # Processes an Array to make it uniform.
480
def process_arr(array)
  # Anything that is not an Array is passed straight through.
  return array unless array.is_a?(Array)

  # Every step below modifies the given Array in place.
  array.map! { |item| process_str(item) }
  array.reject! { |item| item.is_a?(String) && item.empty? }
  array.compact!
  array.uniq!
  array
end
489
+
490
+ # Ensure the @url and @html Strings are correctly encoded etc.
491
def process_url_and_html
  # Run @url, then @html, through process_str and store the results back.
  @url, @html = process_str(@url), process_str(@html)
end
495
+
496
+ ### Default init_* (Document extension) methods. ###
497
+
498
+ # Init methods for title.
499
+
500
def init_title_from_html
  # Uses the first <title> match (find_in_html's singleton default).
  init_var(:@title, find_in_html("//title"))
end
505
+
506
def init_title_from_object(obj)
  # Reads the "title" key of the given object.
  init_var(:@title, find_in_object(obj, "title"))
end
510
+
511
+ # Init methods for author.
512
+
513
def init_author_from_html
  # The author is taken from the content attribute of the author meta tag.
  init_var(:@author, find_in_html("//meta[@name='author']/@content"))
end
518
+
519
def init_author_from_object(obj)
  # Reads the "author" key of the given object.
  init_var(:@author, find_in_object(obj, "author"))
end
523
+
524
+ # Init methods for keywords.
525
+
526
def init_keywords_from_html
  # The keywords meta tag holds one comma separated String. When present it
  # is split into an Array and tidied by process_arr; when absent the block
  # returns nil, so find_in_html keeps its own (nil) value.
  keyword_list = find_in_html("//meta[@name='keywords']/@content") do |keywords|
    keywords && process_arr(keywords.split(","))
  end
  init_var(:@keywords, keyword_list)
end
537
+
538
def init_keywords_from_object(obj)
  # Keywords are a collection, hence singleton: false (default of []).
  init_var(:@keywords, find_in_object(obj, "keywords", singleton: false))
end
542
+
543
+ # Init methods for links.
544
+
545
def init_links_from_html
  # Builds a Wgit::Url from a raw link; any StandardError raised while
  # doing so yields nil instead, so that link is dropped below.
  to_url = lambda do |link|
    begin
      Wgit::Url.new(link)
    rescue
      nil
    end
  end

  # Any element with a href or src attribute is considered a link.
  found_links = find_in_html('//*/@href | //*/@src', singleton: false) do |links|
    if links
      links.map!(&to_url)
      links.compact!
    end
    links
  end
  init_var(:@links, found_links)
end
563
+
564
def init_links_from_object(obj)
  # Stored links are converted back into Wgit::Url instances in place.
  # Unlike the HTML variant, conversion errors are not rescued here.
  stored_links = find_in_object(obj, "links", singleton: false) do |links|
    links && links.map! { |link| Wgit::Url.new(link) }
  end
  init_var(:@links, stored_links)
end
573
+
574
+ # Init methods for text.
575
+
576
def init_text_from_html
  # All text nodes matched by the combined text element xpath, as an Array.
  init_var(:@text, find_in_html(text_elements_xpath, singleton: false))
end
581
+
582
def init_text_from_object(obj)
  # Text is a collection, hence singleton: false (default of []).
  init_var(:@text, find_in_object(obj, "text", singleton: false))
end
586
+
587
# Alternative names for existing Document methods.
alias to_hash to_h
alias relative_links internal_links
alias relative_urls internal_links
alias relative_full_links internal_full_links
alias relative_full_urls internal_full_links
alias external_urls external_links
593
+ end
594
+ end