wgit 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,30 +2,46 @@ require_relative '../utils'
2
2
 
3
3
  module Wgit
4
4
 
5
- # @author Michael Telford
6
- # Module containing the DB data model structure.
5
+ # Module containing the database (DB) data model structure.
7
6
  module Model
8
- def self.url(url)
9
- raise "url must respond to to_h" unless url.respond_to?(:to_h)
10
- url.to_h
11
- end
12
-
13
- def self.document(doc)
14
- raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
15
- doc.to_h(false)
16
- end
17
-
18
- def self.common_insert_data
19
- {
20
- :date_added => Wgit::Utils.time_stamp,
21
- :date_modified => Wgit::Utils.time_stamp,
22
- }
23
- end
24
-
25
- def self.common_update_data
26
- {
27
- :date_modified => Wgit::Utils.time_stamp,
28
- }
29
- end
7
+
8
+ # The data model for a Wgit::Url.
9
+ #
10
+ # @param url [Wgit::Url] The URL DB record.
11
+ # @return [Hash] The URL model ready for DB insertion.
12
+ def self.url(url)
13
+ raise "url must respond_to? to_h" unless url.respond_to?(:to_h)
14
+ model = url.to_h
15
+ Wgit::Utils.remove_non_bson_types(model)
16
+ end
17
+
18
+ # The data model for a Wgit::Document.
19
+ #
20
+ # @param doc [Wgit::Document] The Document DB record.
21
+ # @return [Hash] The Document model ready for DB insertion.
22
+ def self.document(doc)
23
+ raise "doc must respond_to? to_h" unless doc.respond_to?(:to_h)
24
+ model = doc.to_h(false)
25
+ Wgit::Utils.remove_non_bson_types(model)
26
+ end
27
+
28
+ # Default fields when inserting a record into the DB.
29
+ #
30
+ # @return [Hash] Containing common insertion fields for all models.
31
+ def self.common_insert_data
32
+ {
33
+ date_added: Wgit::Utils.time_stamp,
34
+ date_modified: Wgit::Utils.time_stamp,
35
+ }
36
+ end
37
+
38
+ # Default fields when updating a record in the DB.
39
+ #
40
+ # @return [Hash] Containing common update fields for all models.
41
+ def self.common_update_data
42
+ {
43
+ date_modified: Wgit::Utils.time_stamp,
44
+ }
45
+ end
30
46
  end
31
47
  end
@@ -1,27 +1,48 @@
1
-
2
- # @author Michael Telford
3
1
  module Wgit
4
- DB_PROVIDER = :MongoLabs.freeze
2
+ # The connection details for the database. This must be set if you want to
3
+ # store and access webpages in a database. Don't set the constant directly,
4
+ # instead use the funcs contained within the Wgit module.
5
+ CONNECTION_DETAILS = {}
6
+
7
+ # Set the database's connection details from the given hash and freeze them.
8
+ # It is your responsibility to ensure the correct hash vars are present and
9
+ # set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
10
+ # to be called only once.
11
+ #
12
+ # @param hash [Hash] Containing the database connection details to use.
13
+ # The hash should contain the following keys (of type String):
14
+ # host, port, uname, pword, db
15
+ # @raise [KeyError, FrozenError] If any of the required connection
16
+ # details are missing or if the connection details have already been set.
17
+ # @return [Hash] Containing the database connection details from hash.
18
+ def self.set_connection_details(hash)
19
+ CONNECTION_DETAILS[:host] = hash.fetch('host')
20
+ CONNECTION_DETAILS[:port] = hash.fetch('port')
21
+ CONNECTION_DETAILS[:uname] = hash.fetch('uname')
22
+ CONNECTION_DETAILS[:pword] = hash.fetch('pword')
23
+ CONNECTION_DETAILS[:db] = hash.fetch('db')
24
+
25
+ CONNECTION_DETAILS.freeze
26
+ end
27
+
28
+ # Set the database's connection details from the ENV and freeze them. It is
29
+ # your responsibility to ensure the correct ENV vars are present and set.
30
+ # Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
31
+ # called only once.
32
+ #
33
+ # The ENV should contain the following keys (of type String):
34
+ # DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
35
+ #
36
+ # @raise [KeyError, FrozenError] If any of the required connection
37
+ # details are missing or if the connection details have already been set.
38
+ # @return [Hash] Containing the database connection details from the ENV.
39
+ def self.set_connection_details_from_env
40
+ CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
41
+ CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
42
+ CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
43
+ CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
44
+ CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
5
45
 
6
- # OpenShift (MongoDB 2.4)
7
- if DB_PROVIDER == :OpenShift
8
- CONNECTION_DETAILS = {
9
- :host => "127.0.0.1",
10
- :port => "27017",
11
- :db => "admin",
12
- :uname => "admin",
13
- :pword => "R5jUKv1fessb"
14
- }.freeze
15
- # MongoLabs (MongoDB 3.0)
16
- elsif DB_PROVIDER == :MongoLabs
17
- CONNECTION_DETAILS = {
18
- :host => "ds037205.mongolab.com",
19
- :port => "37205",
20
- :db => "crawler",
21
- :uname => "rubyapp",
22
- :pword => "R5jUKv1fessb",
23
- }.freeze
24
- else
25
- raise "Database provider '#{DB_PROVIDER}' is not recognized"
46
+ CONNECTION_DETAILS.freeze
26
47
  end
27
48
  end
@@ -2,288 +2,589 @@ require_relative 'url'
2
2
  require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
+ require 'json'
5
6
 
6
7
  module Wgit
7
8
 
8
- # @author Michael Telford
9
- # Class modeling a HTML web document. Also doubles as a search result.
9
+ # Class modeling a HTML web document. Also doubles as a search result when
10
+ # loading Documents from the database.
11
+ #
12
+ # The initialize method dynamically initializes certain variables from the
13
+ # Document HTML / Database object e.g. text. This bit is dynamic so that the
14
+ # Document class can be easily extended allowing you to pull out the bits of
15
+ # a webpage that are important to you. See Wgit::Document.define_extension.
10
16
  class Document
11
17
  include Assertable
12
-
13
- TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
14
- :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
15
18
 
16
- attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
17
-
18
- def initialize(url_or_doc, html = nil)
19
- if (url_or_doc.is_a?(String))
20
- assert_type(url_or_doc, Url)
21
- html ||= ""
19
+ # The HTML elements that make up the visible text on a page.
20
+ # These elements are used to initialize the @text of the Document.
21
+ # See the README.md for how to add to this Array dynamically.
22
+ @@text_elements = [
23
+ :dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
24
+ :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
25
+ ]
26
+
27
+ # The URL of the webpage, an instance of Wgit::Url.
28
+ attr_reader :url
29
+
30
+ # The HTML of the webpage, an instance of String.
31
+ attr_reader :html
32
+
33
+ # The Nokogiri document object initialized from @html.
34
+ attr_reader :doc
35
+
36
+ # The score is only used following a Database#search and records matches.
37
+ attr_reader :score
38
+
39
+ # Initialize takes either two strings (representing the URL and HTML) or an
40
+ # object representing a database record (of a HTTP crawled web page). This
41
+ # allows for initialisation from both crawled web pages and (afterwards)
42
+ # documents/web pages retrieved from the database.
43
+ #
44
+ # During initialisation, the Document will call any
45
+ # 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
46
+ # default init_* methods exist while others can be defined by the user.
47
+ # See the README and Wgit::Document.define_extension for more info.
48
+ #
49
+ # @param url_or_obj [String, Object#fetch] Either a String representing a
50
+ # URL or a Hash-like object responding to :fetch. e.g. a MongoDB
51
+ # collection object. The Object's :fetch method should support Strings as
52
+ # keys.
53
+ # @param html [String] The crawled web page's HTML. This param is only
54
+ # required if url_or_obj is a String representing the web page's URL.
55
+ def initialize(url_or_obj, html = "")
56
+ # Init from URL String and HTML String.
57
+ if url_or_obj.is_a?(String)
58
+ url = url_or_obj
59
+ assert_type(url, Url)
60
+
61
+ @url = url
62
+ @html = html ||= ""
63
+ @doc = init_nokogiri
64
+ @score = 0.0
22
65
 
23
- @url = url_or_doc
24
- @html = html
25
-
26
- @doc = Nokogiri::HTML(html) do |config|
27
- # TODO: Remove #'s below when crawling in production.
28
- #config.options = Nokogiri::XML::ParseOptions::STRICT |
29
- # Nokogiri::XML::ParseOptions::NONET
30
- end
31
-
32
- init_title
33
- init_author
34
- init_keywords
35
- init_links
36
- init_text
37
- @score = 0.0
38
- else
39
- # Init from a mongo collection document.
40
- @url = Wgit::Url.new(url_or_doc[:url])
41
- @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
42
- @title = url_or_doc[:title]
43
- @author = url_or_doc[:author]
44
- @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
45
- @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
46
- @links.map! { |link| Wgit::Url.new(link) }
47
- @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
48
- @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
66
+ # Dynamically run the init_*_from_html methods.
67
+ Document.private_instance_methods(false).each do |method|
68
+ if method.to_s.start_with?("init_") &&
69
+ method.to_s.end_with?("_from_html")
70
+ self.send(method)
49
71
  end
50
- end
51
-
52
- def internal_links
53
- return [] if @links.empty?
54
- @links.reject do |link|
55
- begin
56
- not link.relative_link?
57
- rescue
58
- true
59
- end
72
+ end
73
+ # Init from a Hash like object containing Strings as keys e.g. Mongo
74
+ # collection obj.
75
+ else
76
+ obj = url_or_obj
77
+ assert_respond_to(obj, :fetch)
78
+
79
+ @url = obj.fetch("url") # Should always be present.
80
+ @html = obj.fetch("html", "")
81
+ @doc = init_nokogiri
82
+ @score = obj.fetch("score", 0.0)
83
+
84
+ # Dynamically run the init_*_from_object methods.
85
+ Document.private_instance_methods(false).each do |method|
86
+ if method.to_s.start_with?("init_") &&
87
+ method.to_s.end_with?("_from_object")
88
+ self.send(method, obj)
60
89
  end
61
- end
90
+ end
91
+ end
92
+ end
62
93
 
63
- def internal_full_links
64
- return [] if internal_links.empty?
65
- internal_links.map do |link|
66
- link.replace("/" + link) unless link.start_with?("/")
67
- Wgit::Url.new(@url.to_base + link)
68
- end
94
+ # Determines if both the url and html match. Use
95
+ # doc.object_id == other_doc.object_id for exact object comparison.
96
+ #
97
+ # @param other_doc [Wgit::Document] To compare self against.
98
+ # @return [Boolean] True if @url and @html are equal, false if not.
99
+ def ==(other_doc)
100
+ return false unless other_doc.is_a? Wgit::Document
101
+ @url == other_doc.url and @html == other_doc.html
102
+ end
103
+
104
+ # Is a shortcut for calling Document#html[range].
105
+ #
106
+ # @param range [Range] The range of @html to return.
107
+ # @return [String] The given range of @html.
108
+ def [](range)
109
+ @html[range]
110
+ end
111
+
112
+ def date_crawled
113
+ @url.date_crawled
114
+ end
115
+
116
+ # Returns a Hash containing this Document's instance vars.
117
+ # Used when storing the Document in a Database e.g. MongoDB etc.
118
+ # By default the @html var is excluded from the returned Hash.
119
+ #
120
+ # @param include_html [Boolean] Whether or not to include @html in the
121
+ # returned Hash.
122
+ # @return [Hash] Containing self's instance vars.
123
+ def to_h(include_html = false)
124
+ ignore = include_html ? [] : ["@html"]
125
+ ignore << "@doc" # Always ignore "@doc"
126
+ Wgit::Utils.to_h(self, ignore)
127
+ end
128
+
129
+ # Converts this Document's to_h return value to a JSON String.
130
+ #
131
+ # @param include_html [Boolean] Whether or not to include @html in the
132
+ # returned JSON String.
133
+ # @return [String] This Document represented as a JSON String.
134
+ def to_json(include_html = false)
135
+ h = to_h(include_html)
136
+ JSON.generate(h)
137
+ end
138
+
139
+ # Returns a Hash containing this Document's instance variables and
140
+ # their :length (if they respond to it). Works dynamically so that any
141
+ # user defined extensions (and their created instance vars) will appear in
142
+ # the returned Hash as well. The number of text snippets as well as total
143
+ # number of textual bytes are always included in the returned Hash.
144
+ #
145
+ # @return [Hash] Containing self's HTML statistics.
146
+ def stats
147
+ hash = {}
148
+ instance_variables.each do |var|
149
+ # Add up the total bytes of text as well as the length.
150
+ if var == :@text
151
+ count = 0
152
+ @text.each { |t| count += t.length }
153
+ hash[:text_length] = @text.length
154
+ hash[:text_bytes] = count
155
+ # Else take the var's #length method return value.
156
+ else
157
+ next unless instance_variable_get(var).respond_to?(:length)
158
+ hash[var[1..-1].to_sym] =
159
+ instance_variable_get(var).send(:length)
160
+ end
69
161
  end
70
-
71
- def external_links
162
+ hash
163
+ end
164
+
165
+ # Determine the size of this Document's HTML.
166
+ #
167
+ # @return [Integer] The total number of bytes in @html.
168
+ def size
169
+ stats[:html]
170
+ end
171
+
172
+ # Determine if this Document's HTML is empty or not.
173
+ #
174
+ # @return [Boolean] True if @html is nil/empty, false otherwise.
175
+ def empty?
176
+ return true if @html.nil?
177
+ @html.strip.empty?
178
+ end
179
+
180
+ # Uses Nokogiri's xpath method to search the doc's html and return the
181
+ # results.
182
+ #
183
+ # @param xpath [String] The xpath to search the @html with.
184
+ # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
185
+ def xpath(xpath)
186
+ @doc.xpath(xpath)
187
+ end
188
+
189
+ # Uses Nokogiri's css method to search the doc's html and return the
190
+ # results.
191
+ #
192
+ # @param selector [String] The CSS selector to search the @html with.
193
+ # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
194
+ def css(selector)
195
+ @doc.css(selector)
196
+ end
197
+
198
+ # Get all internal links of this Document.
199
+ #
200
+ # @return [Array<Wgit::Url>] self's internal/relative URL's.
201
+ def internal_links
72
202
  return [] if @links.empty?
73
- @links.reject do |link|
203
+ @links.reject do |link|
74
204
  begin
75
- link.relative_link?
205
+ not link.relative_link?
76
206
  rescue
77
- true
207
+ true
78
208
  end
79
209
  end
80
- end
210
+ end
81
211
 
82
- def stats
83
- hash = {}
84
- instance_variables.each do |var|
85
- # Add up the total bytes of text as well as the length.
86
- if var == :@text
87
- count = 0
88
- @text.each { |t| count += t.length }
89
- hash[:text_length] = @text.length
90
- hash[:text_bytes] = count
91
- # Else take the #length method return value.
92
- else
93
- next unless instance_variable_get(var).respond_to?(:length)
94
- hash[var[1..-1].to_sym] =
95
- instance_variable_get(var).send(:length)
96
- end
97
- end
98
- hash
212
+ # Get all internal links of this Document and append them to this
213
+ # Document's base URL.
214
+ #
215
+ # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
216
+ # form.
217
+ def internal_full_links
218
+ in_links = internal_links
219
+ return [] if in_links.empty?
220
+ in_links.map do |link|
221
+ link.replace("/" + link) unless link.start_with?("/")
222
+ Wgit::Url.new(@url.to_base + link)
99
223
  end
100
-
101
- def size
102
- stats[:html]
224
+ end
225
+
226
+ # Get all external links of this Document.
227
+ #
228
+ # @return [Array<Wgit::Url>] self's external/absolute URL's.
229
+ def external_links
230
+ return [] if @links.empty?
231
+ @links.reject do |link|
232
+ begin
233
+ link.relative_link?
234
+ rescue
235
+ true
236
+ end
103
237
  end
238
+ end
239
+
240
+ # Searches against the @text for the given search query.
241
+ # The number of search hits for each sentence is recorded internally
242
+ # and used to rank/sort the search results before being returned. Where
243
+ # the Wgit::Database#search method searches all documents for the most hits,
244
+ # this method searches each document's @text for the most hits.
245
+ #
246
+ # Each search result comprises a sentence of a given length. The length
247
+ # will be based on the sentence_limit parameter or the full length of the
248
+ # original sentence, whichever is less. The algorithm obviously ensures
249
+ # that the search query is visible somewhere in the sentence.
250
+ #
251
+ # @param query [String] The value to search the document's text against.
252
+ # @param sentence_limit [Integer] The max length of each search result
253
+ # sentence.
254
+ # @return [Array<String>] Representing the search results.
255
+ def search(query, sentence_limit = 80)
256
+ raise "A search value must be provided" if query.empty?
257
+ raise "The sentence length value must be even" if sentence_limit.odd?
258
+
259
+ results = {}
260
+ regex = Regexp.new(query, Regexp::IGNORECASE)
104
261
 
105
- def to_h(include_html = false)
106
- ignore = include_html ? [] : [:@html]
107
- ignore << :@doc # Always ignore :@doc
108
- Wgit::Utils.to_h(self, ignore)
262
+ @text.each do |sentence|
263
+ hits = sentence.scan(regex).count
264
+ if hits > 0
265
+ sentence.strip!
266
+ index = sentence.index(regex)
267
+ Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
268
+ results[sentence] = hits
269
+ end
109
270
  end
271
+
272
+ return [] if results.empty?
273
+ results = Hash[results.sort_by { |k, v| v }]
274
+ results.keys.reverse
275
+ end
276
+
277
+ # Performs a text search (see Document#search for details) but assigns the
278
+ # results to the @text instance variable. This can be used for sub search
279
+ # functionality. The original text is returned; no other reference to it
280
+ # is kept thereafter.
281
+ #
282
+ # @param query [String] The value to search the document's text against.
283
+ # @return [Array<String>] This Document's original @text value.
284
+ def search!(query)
285
+ orig_text = @text
286
+ @text = search(query)
287
+ orig_text
288
+ end
289
+
290
+ ### Document (Class) methods ###
291
+
292
+ # Returns Document.text_elements used to obtain the text in a webpage.
293
+ #
294
+ # @return [Array<Symbol>] The page elements containing visual text on a
295
+ # webpage.
296
+ def self.text_elements
297
+ @@text_elements
298
+ end
299
+
300
+ # Initialises a private instance variable with the xpath or database object
301
+ # result(s). When initialising from HTML, a true singleton value will only
302
+ # ever return one result otherwise all xpath results are returned in an
303
+ # Array. When initialising from a database object, the value is taken as
304
+ # is and singleton is only used to define the default empty value.
305
+ # If a value cannot be found (in either the HTML or database object), then
306
+ # a default will be used. The default is: singleton ? nil : [].
307
+ #
308
+ # Note that defined extensions work for both documents being crawled from
309
+ # the WWW and for documents being retrieved from the database. This
310
+ # effectively implements ORM like behavior using this class.
311
+ #
312
+ # @param var [Symbol] The name of the variable to be initialised.
313
+ # @param xpath [String] Used to find the element(s) of the webpage.
314
+ # @option options [Boolean] :singleton The singleton option determines
315
+ # whether or not the result(s) should be in an Array. If multiple
316
+ # results are found and singleton is true then the first result will be
317
+ # used. Defaults to true.
318
+ # @option options [Boolean] :text_content_only The text_content_only option
319
+ # if true will use the text content of the Nokogiri result object,
320
+ # otherwise the Nokogiri object itself is returned. Defaults to true.
321
+ # @yield [var_value] Gives the value about to be assigned to the new var.
322
+ # The return value of the block becomes the new var value, unless nil.
323
+ # Return nil if you want to inspect but not change the var value.
324
+ # @return [Symbol] The first half of the newly created method names e.g.
325
+ # if var == "title" then :init_title is returned.
326
+ def self.define_extension(var, xpath, options = {}, &block)
327
+ default_options = { singleton: true, text_content_only: true }
328
+ options = default_options.merge(options)
110
329
 
111
- # Override of the default == method, is equal if url and html both match.
112
- # Use doc.object_id == other_doc.object_id for exact object comparison.
113
- def ==(other_doc)
114
- return false unless other_doc.is_a? Wgit::Document
115
- url == other_doc.url and html == other_doc.html
330
+ # Define the private init_*_from_html method for HTML.
331
+ # Gets the HTML's xpath value and creates a var for it.
332
+ func_name = Document.send(:define_method, "init_#{var}_from_html") do
333
+ result = find_in_html(xpath, options, &block)
334
+ init_var(var, result)
335
+ end
336
+ Document.send :private, func_name
337
+
338
+ # Define the private init_*_from_object method for a Database object.
339
+ # Gets the Object's "key" value and creates a var for it.
340
+ func_name = Document.send(
341
+ :define_method, "init_#{var}_from_object") do |obj|
342
+ result = find_in_object(
343
+ obj, var.to_s, singleton: options[:singleton], &block)
344
+ init_var(var, result)
345
+ end
346
+ Document.send :private, func_name
347
+
348
+ "init_#{var}".to_sym
349
+ end
350
+
351
+ # Removes the init_* methods created when an extension is defined.
352
+ # Therefore, this is the opposing method to Document.define_extension.
353
+ # Returns true if successful or false if the method(s) cannot be found.
354
+ #
355
+ # @param var [Symbol] The extension variable already defined.
356
+ # @return [Boolean] True if the extension var was found and removed;
357
+ # otherwise false.
358
+ def self.remove_extension(var)
359
+ Document.send(:remove_method, "init_#{var}_from_html")
360
+ Document.send(:remove_method, "init_#{var}_from_object")
361
+ true
362
+ rescue NameError
363
+ false
364
+ end
365
+
366
+ private
367
+
368
+ # Initializes the nokogiri object using @html, which must be already set.
369
+ def init_nokogiri
370
+ raise "@html must be set" unless @html
371
+ Nokogiri::HTML(@html) do |config|
372
+ # TODO: Remove #'s below when crawling in production.
373
+ #config.options = Nokogiri::XML::ParseOptions::STRICT |
374
+ # Nokogiri::XML::ParseOptions::NONET
116
375
  end
376
+ end
377
+
378
+ # Returns an object/value from this Document's @html using the provided
379
+ # xpath param.
380
+ # singleton ? results.first (single Object) : results (Array)
381
+ # text_content_only ? result.content (String) : result (nokogiri Object)
382
+ # A block can be used to set the final value before it is returned.
383
+ # Return nil from the block if you don't want to override the value.
384
+ def find_in_html(xpath, singleton: true, text_content_only: true)
385
+ results = @doc.xpath(xpath)
117
386
 
118
- # Shortcut for calling Document#html[range].
119
- def [](range)
120
- html[range]
387
+ if results and not results.empty?
388
+ result = if singleton
389
+ text_content_only ? results.first.content : results.first
390
+ else
391
+ text_content_only ? results.map(&:content) : results
392
+ end
393
+ else
394
+ result = singleton ? nil : []
121
395
  end
122
-
123
- def empty?
124
- html.strip.empty?
396
+
397
+ singleton ? process_str(result) : process_arr(result)
398
+
399
+ if block_given?
400
+ new_result = yield(result)
401
+ result = new_result if new_result
125
402
  end
126
-
127
- # Searches against the Document#text for the given search text.
128
- # The number of search hits for each sentenence are recorded internally
129
- # and used to rank/sort the search results before being returned. Where
130
- # the Database#search method search all documents for the most hits this
131
- # method searches each documents text for the most hits.
132
- #
133
- # Each search result comprises of a sentence of a given length. The length
134
- # will be based on the sentence_limit parameter or the full length of the
135
- # original sentence, which ever is less. The algorithm obviously ensures
136
- # that the search value is visible somewhere in the sentence.
137
- #
138
- # @param text [String] the value to search the document text against.
139
- # @param sentence_limit [Fixnum] the length of each search result
140
- # sentence.
141
- #
142
- # @return [Array] of String objects representing the search results.
143
- def search(text, sentence_limit = 80)
144
- raise "A search value must be provided" if text.empty?
145
- raise "The sentence length value must be even" if sentence_limit.odd?
146
-
147
- results = {}
148
- regex = Regexp.new(text, Regexp::IGNORECASE)
149
-
150
- @text.each do |sentence|
151
- hits = sentence.scan(regex).count
152
- if hits > 0
153
- sentence.strip!
154
- index = sentence.index(regex)
155
- Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
156
- results[sentence] = hits
157
- end
158
- end
159
-
160
- return [] if results.empty?
161
- results = Hash[results.sort_by { |k, v| v }]
162
- results.keys.reverse
403
+
404
+ result
405
+ end
406
+
407
+ # Finds a value in the obj using the key.
408
+ # singleton is used to set the value if not found in obj.
409
+ # A block can be used to set the final value before it is returned.
410
+ # Return nil from the block if you don't want to override the value.
411
+ def find_in_object(obj, key, singleton: true)
412
+ assert_respond_to(obj, :fetch)
413
+
414
+ default = singleton ? nil : []
415
+ result = obj.fetch(key.to_s, default)
416
+ singleton ? process_str(result) : process_arr(result)
417
+
418
+ if block_given?
419
+ new_result = yield(result)
420
+ result = new_result if new_result
163
421
  end
422
+
423
+ result
424
+ end
164
425
 
165
- # Performs a text search (see search for details) but assigns the results
166
- # to the @text instance variable. This can be used for sub search
167
- # functionality. Note that there is no way of getting the original text
168
- # back however.
169
- def search!(text)
170
- @text = search(text)
426
+ # Initialises an instance variable and defines a getter method for it.
427
+ # @param var [Symbol] The name of the variable to be initialized.
428
+ # @param value [Object] The newly initialized variable's value.
429
+ # @return [Symbol] The name of the newly created getter method.
430
+ def init_var(var, value)
431
+ # instance_var_name starts with @, var_name doesn't.
432
+ var = var.to_s
433
+ var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
434
+ instance_var_name = "@#{var_name}".to_sym
435
+
436
+ instance_variable_set(instance_var_name, value)
437
+
438
+ Document.send(:define_method, var_name) do
439
+ instance_variable_get(instance_var_name)
171
440
  end
441
+ end
172
442
 
173
- # Uses Nokogiri's xpath method to search the doc's html and return the
174
- # results.
175
- def xpath(xpath)
176
- @doc.xpath(xpath)
443
+ # Takes Document.text_elements and returns an xpath String used to obtain
444
+ # all of the combined text.
445
+ def text_elements_xpath
446
+ xpath = ""
447
+ return xpath if @@text_elements.empty?
448
+ el_xpath = "//%s/text()"
449
+ @@text_elements.each_with_index do |el, i|
450
+ xpath += " | " unless i == 0
451
+ xpath += el_xpath % [el]
177
452
  end
178
-
179
- private
453
+ xpath
454
+ end
180
455
 
181
- def process_str(str)
182
- str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
183
- str.strip!
184
- str # This is required to return the str, do not remove.
456
+ # Processes a String to make it uniform.
457
+ def process_str(str)
458
+ if str.is_a?(String)
459
+ str.encode!('UTF-8', 'UTF-8', invalid: :replace)
460
+ str.strip!
185
461
  end
462
+ str
463
+ end
186
464
 
187
- def process_arr(array)
188
- assert_arr_types(array, String)
189
- array.map! { |str| process_str(str) }
190
- array.reject! { |str| str.empty? }
191
- array.uniq!
465
+ # Processes an Array to make it uniform.
466
+ def process_arr(array)
467
+ if array.is_a?(Array)
468
+ array.map! { |str| process_str(str) }
469
+ array.reject! { |str| str.is_a?(String) ? str.empty? : false }
470
+ array.uniq!
192
471
  end
193
-
194
- # Modifies internal links by removing this doc's base or host url if
195
- # present. http://www.google.co.uk/about.html (with or without the
196
- # protocol prefix) will become about.html meaning it'll appear within
197
- # internal_links.
198
- def process_internal_links(links)
199
- links.map! do |link|
200
- host_or_base = if link.start_with?("http")
201
- url.base
202
- else
203
- url.host
204
- end
205
- if link.start_with?(host_or_base)
206
- link.sub!(host_or_base, "")
207
- link.replace(link[1..-1]) if link.start_with?("/")
208
- link.strip!
209
- end
210
- link
211
- end
472
+ array
473
+ end
474
+
475
+ # Modifies internal links by removing this doc's base or host URL, if
476
+ # present. http://www.google.co.uk/about.html (with or without the
477
+ # protocol prefix) will become about.html meaning it'll appear within
478
+ # Document#internal_links.
479
+ def process_internal_links(links)
480
+ links.map! do |link|
481
+ host_or_base = if link.start_with?("http")
482
+ @url.base
483
+ else
484
+ @url.host
485
+ end
486
+ if link.start_with?(host_or_base)
487
+ link.sub!(host_or_base, "")
488
+ link.replace(link[1..-1]) if link.start_with?("/")
489
+ link.strip!
490
+ end
491
+ link
212
492
  end
493
+ end
494
+
495
+ ### Default init_* (Document extension) methods. ###
213
496
 
214
- def text_elements_xpath
215
- xpath = ""
216
- return xpath if TEXT_ELEMENTS.empty?
217
- el_xpath = "//%s/text()"
218
- TEXT_ELEMENTS.each_with_index do |el, i|
219
- xpath += " | " unless i == 0
220
- xpath += el_xpath % [el]
221
- end
222
- xpath
223
- end
497
+ # Init methods for title.
224
498
 
225
- def init_var(xpath, var, first_result = true)
226
- results = @doc.xpath(xpath)
227
- unless results.nil? || results.empty?
228
- result = if first_result
229
- results.first.content
230
- else
231
- results.map { |res| res.content }
232
- end
233
- instance_variable_set(var, result)
234
- end
235
- end
236
-
237
- def init_title
238
- @title = nil
499
+ def init_title_from_html
239
500
  xpath = "//title"
240
- init_var(xpath, :@title)
241
- process_str(@title) unless @title.nil?
242
- end
243
-
244
- def init_author
245
- @author = nil
501
+ result = find_in_html(xpath)
502
+ init_var(:@title, result)
503
+ end
504
+
505
+ def init_title_from_object(obj)
506
+ result = find_in_object(obj, "title")
507
+ init_var(:@title, result)
508
+ end
509
+
510
+ # Init methods for author.
511
+
512
+ def init_author_from_html
246
513
  xpath = "//meta[@name='author']/@content"
247
- init_var(xpath, :@author)
248
- process_str(@author) unless @author.nil?
249
- end
250
-
251
- def init_keywords
252
- @keywords = nil
514
+ result = find_in_html(xpath)
515
+ init_var(:@author, result)
516
+ end
517
+
518
+ def init_author_from_object(obj)
519
+ result = find_in_object(obj, "author")
520
+ init_var(:@author, result)
521
+ end
522
+
523
+ # Init methods for keywords.
524
+
525
+ def init_keywords_from_html
253
526
  xpath = "//meta[@name='keywords']/@content"
254
- init_var(xpath, :@keywords)
255
- return @keywords = [] unless @keywords
256
- @keywords = @keywords.split(",")
257
- process_arr(@keywords)
258
- end
527
+ result = find_in_html(xpath) do |keywords|
528
+ if keywords
529
+ keywords = keywords.split(",")
530
+ process_arr(keywords)
531
+ end
532
+ keywords
533
+ end
534
+ init_var(:@keywords, result)
535
+ end
536
+
537
+ def init_keywords_from_object(obj)
538
+ result = find_in_object(obj, "keywords", singleton: false)
539
+ init_var(:@keywords, result)
540
+ end
259
541
 
260
- def init_links
261
- @links = nil
542
+ # Init methods for links.
543
+
544
+ def init_links_from_html
262
545
  xpath = "//a/@href"
263
- init_var(xpath, :@links, false)
264
- return @links = [] unless @links
265
- process_arr(@links)
266
- @links.reject! { |link| link == "/" }
267
- @links.map! do |link|
268
- begin
269
- Wgit::Url.new(link)
270
- rescue
271
- nil
546
+ result = find_in_html(xpath, singleton: false) do |links|
547
+ if links
548
+ links.reject! { |link| link == "/" }
549
+ links.map! do |link|
550
+ begin
551
+ Wgit::Url.new(link)
552
+ rescue
553
+ nil
554
+ end
555
+ end
556
+ links.reject! { |link| link.nil? }
557
+ process_internal_links(links)
272
558
  end
559
+ links
273
560
  end
274
- @links.reject! { |link| link.nil? }
275
- process_internal_links(@links)
561
+ init_var(:@links, result)
276
562
  end
277
-
278
- def init_text
279
- @text = nil
563
+
564
+ def init_links_from_object(obj)
565
+ result = find_in_object(obj, "links", singleton: false) do |links|
566
+ if links
567
+ links.map! { |link| Wgit::Url.new(link) }
568
+ end
569
+ links
570
+ end
571
+ init_var(:@links, result)
572
+ end
573
+
574
+ # Init methods for text.
575
+
576
+ def init_text_from_html
280
577
  xpath = text_elements_xpath
281
- init_var(xpath, :@text, false)
282
- return @text = [] unless @text
283
- process_arr(@text)
578
+ result = find_in_html(xpath, singleton: false)
579
+ init_var(:@text, result)
580
+ end
581
+
582
+ def init_text_from_object(obj)
583
+ result = find_in_object(obj, "text", singleton: false)
584
+ init_var(:@text, result)
284
585
  end
285
586
 
286
- alias :to_hash :to_h
587
+ alias :to_hash :to_h
287
588
  alias :relative_links :internal_links
288
589
  alias :relative_urls :internal_links
289
590
  alias :relative_full_links :internal_full_links