wgit 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/database/model.rb
CHANGED
@@ -2,30 +2,46 @@ require_relative '../utils'
|
|
2
2
|
|
3
3
|
module Wgit
|
4
4
|
|
5
|
-
#
|
6
|
-
# Module containing the DB data model structure.
|
5
|
+
# Module containing the database (DB) data model structure.
|
7
6
|
module Model
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
7
|
+
|
8
|
+
# The data model for a Wgit::Url.
|
9
|
+
#
|
10
|
+
# @param url [Wgit::Url] The URL DB record.
|
11
|
+
# @return [Hash] The URL model ready for DB insertion.
|
12
|
+
def self.url(url)
|
13
|
+
raise "url must respond_to? to_h" unless url.respond_to?(:to_h)
|
14
|
+
model = url.to_h
|
15
|
+
Wgit::Utils.remove_non_bson_types(model)
|
16
|
+
end
|
17
|
+
|
18
|
+
# The data model for a Wgit::Document.
|
19
|
+
#
|
20
|
+
# @param doc [Wgit::Document] The Document DB record.
|
21
|
+
# @return [Hash] The Document model ready for DB insertion.
|
22
|
+
def self.document(doc)
|
23
|
+
raise "doc must respond_to? to_h" unless doc.respond_to?(:to_h)
|
24
|
+
model = doc.to_h(false)
|
25
|
+
Wgit::Utils.remove_non_bson_types(model)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Default fields when inserting a record into the DB.
|
29
|
+
#
|
30
|
+
# @return [Hash] Containing common insertion fields for all models.
|
31
|
+
def self.common_insert_data
|
32
|
+
{
|
33
|
+
date_added: Wgit::Utils.time_stamp,
|
34
|
+
date_modified: Wgit::Utils.time_stamp,
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
# Default fields when updating a record in the DB.
|
39
|
+
#
|
40
|
+
# @return [Hash] Containing common update fields for all models.
|
41
|
+
def self.common_update_data
|
42
|
+
{
|
43
|
+
date_modified: Wgit::Utils.time_stamp,
|
44
|
+
}
|
45
|
+
end
|
30
46
|
end
|
31
47
|
end
|
@@ -1,27 +1,48 @@
|
|
1
|
-
|
2
|
-
# @author Michael Telford
|
3
1
|
module Wgit
|
4
|
-
|
2
|
+
# The connection details for the database. This must be set if you want to
|
3
|
+
# store and access webpages in a database. Don't set the constant directly,
|
4
|
+
# instead use the funcs contained within the Wgit module.
|
5
|
+
CONNECTION_DETAILS = {}
|
6
|
+
|
7
|
+
# Set the database's connection details from the given hash and freeze them.
|
8
|
+
# It is your responsibility to ensure the correct hash vars are present and
|
9
|
+
# set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
|
10
|
+
# to be called only once.
|
11
|
+
#
|
12
|
+
# @param hash [Hash] Containing the database connection details to use.
|
13
|
+
# The hash should contain the following keys (of type String):
|
14
|
+
# host, port, uname, pword, db
|
15
|
+
# @raise [KeyError, FrozenError] If any of the required connection
|
16
|
+
# details are missing or if the connection details have already been set.
|
17
|
+
# @return [Hash] Containing the database connection details from hash.
|
18
|
+
def self.set_connection_details(hash)
|
19
|
+
CONNECTION_DETAILS[:host] = hash.fetch('host')
|
20
|
+
CONNECTION_DETAILS[:port] = hash.fetch('port')
|
21
|
+
CONNECTION_DETAILS[:uname] = hash.fetch('uname')
|
22
|
+
CONNECTION_DETAILS[:pword] = hash.fetch('pword')
|
23
|
+
CONNECTION_DETAILS[:db] = hash.fetch('db')
|
24
|
+
|
25
|
+
CONNECTION_DETAILS.freeze
|
26
|
+
end
|
27
|
+
|
28
|
+
# Set the database's connection details from the ENV and freeze them. It is
|
29
|
+
# your responsibility to ensure the correct ENV vars are present and set.
|
30
|
+
# Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
|
31
|
+
# called only once.
|
32
|
+
#
|
33
|
+
# The ENV should contain the following keys (of type String):
|
34
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
35
|
+
#
|
36
|
+
# @raise [KeyError, FrozenError] If any of the required connection
|
37
|
+
# details are missing or if the connection details have already been set.
|
38
|
+
# @return [Hash] Containing the database connection details from the ENV.
|
39
|
+
def self.set_connection_details_from_env
|
40
|
+
CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
|
41
|
+
CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
|
42
|
+
CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
|
43
|
+
CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
|
44
|
+
CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
|
5
45
|
|
6
|
-
|
7
|
-
if DB_PROVIDER == :OpenShift
|
8
|
-
CONNECTION_DETAILS = {
|
9
|
-
:host => "127.0.0.1",
|
10
|
-
:port => "27017",
|
11
|
-
:db => "admin",
|
12
|
-
:uname => "admin",
|
13
|
-
:pword => "R5jUKv1fessb"
|
14
|
-
}.freeze
|
15
|
-
# MongoLabs (MongoDB 3.0)
|
16
|
-
elsif DB_PROVIDER == :MongoLabs
|
17
|
-
CONNECTION_DETAILS = {
|
18
|
-
:host => "ds037205.mongolab.com",
|
19
|
-
:port => "37205",
|
20
|
-
:db => "crawler",
|
21
|
-
:uname => "rubyapp",
|
22
|
-
:pword => "R5jUKv1fessb",
|
23
|
-
}.freeze
|
24
|
-
else
|
25
|
-
raise "Database provider '#{DB_PROVIDER}' is not recognized"
|
46
|
+
CONNECTION_DETAILS.freeze
|
26
47
|
end
|
27
48
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -2,288 +2,589 @@ require_relative 'url'
|
|
2
2
|
require_relative 'utils'
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
|
+
require 'json'
|
5
6
|
|
6
7
|
module Wgit
|
7
8
|
|
8
|
-
#
|
9
|
-
#
|
9
|
+
# Class modeling a HTML web document. Also doubles as a search result when
|
10
|
+
# loading Documents from the database.
|
11
|
+
#
|
12
|
+
# The initialize method dynamically initializes certain variables from the
|
13
|
+
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
+
# Document class can be easily extended allowing you to pull out the bits of
|
15
|
+
# a webpage that are important to you. See Wgit::Document.define_extension.
|
10
16
|
class Document
|
11
17
|
include Assertable
|
12
|
-
|
13
|
-
TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
14
|
-
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
|
15
18
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
# The HTML elements that make up the visible text on a page.
|
20
|
+
# These elements are used to initialize the @text of the Document.
|
21
|
+
# See the README.md for how to add to this Array dynamically.
|
22
|
+
@@text_elements = [
|
23
|
+
:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
24
|
+
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
|
+
]
|
26
|
+
|
27
|
+
# The URL of the webpage, an instance of Wgit::Url.
|
28
|
+
attr_reader :url
|
29
|
+
|
30
|
+
# The HTML of the webpage, an instance of String.
|
31
|
+
attr_reader :html
|
32
|
+
|
33
|
+
# The Nokogiri document object initialized from @html.
|
34
|
+
attr_reader :doc
|
35
|
+
|
36
|
+
# The score is only used following a Database#search and records matches.
|
37
|
+
attr_reader :score
|
38
|
+
|
39
|
+
# Initialize takes either two strings (representing the URL and HTML) or an
|
40
|
+
# object representing a database record (of a HTTP crawled web page). This
|
41
|
+
# allows for initialisation from both crawled web pages and (afterwards)
|
42
|
+
# documents/web pages retrieved from the database.
|
43
|
+
#
|
44
|
+
# During initialisation, the Document will call any
|
45
|
+
# 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
|
46
|
+
# default init_* methods exist while others can be defined by the user.
|
47
|
+
# See the README and Wgit::Document.define_extension for more info.
|
48
|
+
#
|
49
|
+
# @param url_or_obj [String, Object#fetch] Either a String representing a
|
50
|
+
# URL or a Hash-like object responding to :fetch. e.g. a MongoDB
|
51
|
+
# collection object. The Object's :fetch method should support Strings as
|
52
|
+
# keys.
|
53
|
+
# @param html [String] The crawled web page's HTML. This param is only
|
54
|
+
# required if url_or_obj is a String representing the web page's URL.
|
55
|
+
def initialize(url_or_obj, html = "")
|
56
|
+
# Init from URL String and HTML String.
|
57
|
+
if url_or_obj.is_a?(String)
|
58
|
+
url = url_or_obj
|
59
|
+
assert_type(url, Url)
|
60
|
+
|
61
|
+
@url = url
|
62
|
+
@html = html ||= ""
|
63
|
+
@doc = init_nokogiri
|
64
|
+
@score = 0.0
|
22
65
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
29
|
-
# Nokogiri::XML::ParseOptions::NONET
|
30
|
-
end
|
31
|
-
|
32
|
-
init_title
|
33
|
-
init_author
|
34
|
-
init_keywords
|
35
|
-
init_links
|
36
|
-
init_text
|
37
|
-
@score = 0.0
|
38
|
-
else
|
39
|
-
# Init from a mongo collection document.
|
40
|
-
@url = Wgit::Url.new(url_or_doc[:url])
|
41
|
-
@html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
|
42
|
-
@title = url_or_doc[:title]
|
43
|
-
@author = url_or_doc[:author]
|
44
|
-
@keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
|
45
|
-
@links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
|
46
|
-
@links.map! { |link| Wgit::Url.new(link) }
|
47
|
-
@text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
|
48
|
-
@score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
|
66
|
+
# Dynamically run the init_*_from_html methods.
|
67
|
+
Document.private_instance_methods(false).each do |method|
|
68
|
+
if method.to_s.start_with?("init_") &&
|
69
|
+
method.to_s.end_with?("_from_html")
|
70
|
+
self.send(method)
|
49
71
|
end
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
72
|
+
end
|
73
|
+
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
74
|
+
# collection obj.
|
75
|
+
else
|
76
|
+
obj = url_or_obj
|
77
|
+
assert_respond_to(obj, :fetch)
|
78
|
+
|
79
|
+
@url = obj.fetch("url") # Should always be present.
|
80
|
+
@html = obj.fetch("html", "")
|
81
|
+
@doc = init_nokogiri
|
82
|
+
@score = obj.fetch("score", 0.0)
|
83
|
+
|
84
|
+
# Dynamically run the init_*_from_object methods.
|
85
|
+
Document.private_instance_methods(false).each do |method|
|
86
|
+
if method.to_s.start_with?("init_") &&
|
87
|
+
method.to_s.end_with?("_from_object")
|
88
|
+
self.send(method, obj)
|
60
89
|
end
|
61
|
-
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
62
93
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
94
|
+
# Determines if both the url and html match. Use
|
95
|
+
# doc.object_id == other_doc.object_id for exact object comparison.
|
96
|
+
#
|
97
|
+
# @param other_doc [Wgit::Document] To compare self against.
|
98
|
+
# @return [Boolean] True if @url and @html are equal, false if not.
|
99
|
+
def ==(other_doc)
|
100
|
+
return false unless other_doc.is_a? Wgit::Document
|
101
|
+
@url == other_doc.url and @html == other_doc.html
|
102
|
+
end
|
103
|
+
|
104
|
+
# Is a shortcut for calling Document#html[range].
|
105
|
+
#
|
106
|
+
# @param range [Range] The range of @html to return.
|
107
|
+
# @return [String] The given range of @html.
|
108
|
+
def [](range)
|
109
|
+
@html[range]
|
110
|
+
end
|
111
|
+
|
112
|
+
def date_crawled
|
113
|
+
@url.date_crawled
|
114
|
+
end
|
115
|
+
|
116
|
+
# Returns a Hash containing this Document's instance vars.
|
117
|
+
# Used when storing the Document in a Database e.g. MongoDB etc.
|
118
|
+
# By default the @html var is excluded from the returned Hash.
|
119
|
+
#
|
120
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
121
|
+
# returned Hash.
|
122
|
+
# @return [Hash] Containing self's instance vars.
|
123
|
+
def to_h(include_html = false)
|
124
|
+
ignore = include_html ? [] : ["@html"]
|
125
|
+
ignore << "@doc" # Always ignore "@doc"
|
126
|
+
Wgit::Utils.to_h(self, ignore)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Converts this Document's to_h return value to a JSON String.
|
130
|
+
#
|
131
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
132
|
+
# returned JSON String.
|
133
|
+
# @return [String] This Document represented as a JSON String.
|
134
|
+
def to_json(include_html = false)
|
135
|
+
h = to_h(include_html)
|
136
|
+
JSON.generate(h)
|
137
|
+
end
|
138
|
+
|
139
|
+
# Returns a Hash containing this Document's instance variables and
|
140
|
+
# their :length (if they respond to it). Works dynamically so that any
|
141
|
+
# user defined extensions (and their created instance vars) will appear in
|
142
|
+
# the returned Hash as well. The number of text snippets as well as total
|
143
|
+
# number of textual bytes are always included in the returned Hash.
|
144
|
+
#
|
145
|
+
# @return [Hash] Containing self's HTML statistics.
|
146
|
+
def stats
|
147
|
+
hash = {}
|
148
|
+
instance_variables.each do |var|
|
149
|
+
# Add up the total bytes of text as well as the length.
|
150
|
+
if var == :@text
|
151
|
+
count = 0
|
152
|
+
@text.each { |t| count += t.length }
|
153
|
+
hash[:text_length] = @text.length
|
154
|
+
hash[:text_bytes] = count
|
155
|
+
# Else take the var's #length method return value.
|
156
|
+
else
|
157
|
+
next unless instance_variable_get(var).respond_to?(:length)
|
158
|
+
hash[var[1..-1].to_sym] =
|
159
|
+
instance_variable_get(var).send(:length)
|
160
|
+
end
|
69
161
|
end
|
70
|
-
|
71
|
-
|
162
|
+
hash
|
163
|
+
end
|
164
|
+
|
165
|
+
# Determine the size of this Document's HTML.
|
166
|
+
#
|
167
|
+
# @return [Integer] The length of @html (in characters, via String#length).
|
168
|
+
def size
|
169
|
+
stats[:html]
|
170
|
+
end
|
171
|
+
|
172
|
+
# Determine if this Document's HTML is empty or not.
|
173
|
+
#
|
174
|
+
# @return [Boolean] True if @html is nil/empty, false otherwise.
|
175
|
+
def empty?
|
176
|
+
return true if @html.nil?
|
177
|
+
@html.strip.empty?
|
178
|
+
end
|
179
|
+
|
180
|
+
# Uses Nokogiri's xpath method to search the doc's html and return the
|
181
|
+
# results.
|
182
|
+
#
|
183
|
+
# @param xpath [String] The xpath to search the @html with.
|
184
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
185
|
+
def xpath(xpath)
|
186
|
+
@doc.xpath(xpath)
|
187
|
+
end
|
188
|
+
|
189
|
+
# Uses Nokogiri's css method to search the doc's html and return the
|
190
|
+
# results.
|
191
|
+
#
|
192
|
+
# @param selector [String] The CSS selector to search the @html with.
|
193
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
194
|
+
def css(selector)
|
195
|
+
@doc.css(selector)
|
196
|
+
end
|
197
|
+
|
198
|
+
# Get all internal links of this Document.
|
199
|
+
#
|
200
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's.
|
201
|
+
def internal_links
|
72
202
|
return [] if @links.empty?
|
73
|
-
|
203
|
+
@links.reject do |link|
|
74
204
|
begin
|
75
|
-
|
205
|
+
not link.relative_link?
|
76
206
|
rescue
|
77
|
-
|
207
|
+
true
|
78
208
|
end
|
79
209
|
end
|
80
|
-
|
210
|
+
end
|
81
211
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
next unless instance_variable_get(var).respond_to?(:length)
|
94
|
-
hash[var[1..-1].to_sym] =
|
95
|
-
instance_variable_get(var).send(:length)
|
96
|
-
end
|
97
|
-
end
|
98
|
-
hash
|
212
|
+
# Get all internal links of this Document and append them to this
|
213
|
+
# Document's base URL.
|
214
|
+
#
|
215
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
|
216
|
+
# form.
|
217
|
+
def internal_full_links
|
218
|
+
in_links = internal_links
|
219
|
+
return [] if in_links.empty?
|
220
|
+
in_links.map do |link|
|
221
|
+
link.replace("/" + link) unless link.start_with?("/")
|
222
|
+
Wgit::Url.new(@url.to_base + link)
|
99
223
|
end
|
100
|
-
|
101
|
-
|
102
|
-
|
224
|
+
end
|
225
|
+
|
226
|
+
# Get all external links of this Document.
|
227
|
+
#
|
228
|
+
# @return [Array<Wgit::Url>] self's external/absolute URL's.
|
229
|
+
def external_links
|
230
|
+
return [] if @links.empty?
|
231
|
+
@links.reject do |link|
|
232
|
+
begin
|
233
|
+
link.relative_link?
|
234
|
+
rescue
|
235
|
+
true
|
236
|
+
end
|
103
237
|
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# Searches against the @text for the given search query.
|
241
|
+
# The number of search hits for each sentence is recorded internally
|
242
|
+
# and used to rank/sort the search results before being returned. Where
|
243
|
+
# the Wgit::Database#search method searches all documents for the most hits,
|
244
|
+
# this method searches each document's @text for the most hits.
|
245
|
+
#
|
246
|
+
# Each search result comprises a sentence of a given length. The length
|
247
|
+
# will be based on the sentence_limit parameter or the full length of the
|
248
|
+
# original sentence, whichever is less. The algorithm obviously ensures
|
249
|
+
# that the search query is visible somewhere in the sentence.
|
250
|
+
#
|
251
|
+
# @param query [String] The value to search the document's text against.
|
252
|
+
# @param sentence_limit [Integer] The max length of each search result
|
253
|
+
# sentence.
|
254
|
+
# @return [Array<String>] Representing the search results.
|
255
|
+
def search(query, sentence_limit = 80)
|
256
|
+
raise "A search value must be provided" if query.empty?
|
257
|
+
raise "The sentence length value must be even" if sentence_limit.odd?
|
258
|
+
|
259
|
+
results = {}
|
260
|
+
regex = Regexp.new(query, Regexp::IGNORECASE)
|
104
261
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
262
|
+
@text.each do |sentence|
|
263
|
+
hits = sentence.scan(regex).count
|
264
|
+
if hits > 0
|
265
|
+
sentence.strip!
|
266
|
+
index = sentence.index(regex)
|
267
|
+
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
268
|
+
results[sentence] = hits
|
269
|
+
end
|
109
270
|
end
|
271
|
+
|
272
|
+
return [] if results.empty?
|
273
|
+
results = Hash[results.sort_by { |k, v| v }]
|
274
|
+
results.keys.reverse
|
275
|
+
end
|
276
|
+
|
277
|
+
# Performs a text search (see Document#search for details) but assigns the
|
278
|
+
# results to the @text instance variable. This can be used for sub search
|
279
|
+
# functionality. The original text is returned; no other reference to it
|
280
|
+
# is kept thereafter.
|
281
|
+
#
|
282
|
+
# @param query [String] The value to search the document's text against.
|
283
|
+
# @return [Array<String>] This Document's original @text value.
|
284
|
+
def search!(query)
|
285
|
+
orig_text = @text
|
286
|
+
@text = search(query)
|
287
|
+
orig_text
|
288
|
+
end
|
289
|
+
|
290
|
+
### Document (Class) methods ###
|
291
|
+
|
292
|
+
# Returns Document.text_elements used to obtain the text in a webpage.
|
293
|
+
#
|
294
|
+
# @return [Array<Symbol>] The page elements containing visual text on a
|
295
|
+
# webpage.
|
296
|
+
def self.text_elements
|
297
|
+
@@text_elements
|
298
|
+
end
|
299
|
+
|
300
|
+
# Initialises a private instance variable with the xpath or database object
|
301
|
+
# result(s). When initialising from HTML, a true singleton value will only
|
302
|
+
# ever return one result otherwise all xpath results are returned in an
|
303
|
+
# Array. When initialising from a database object, the value is taken as
|
304
|
+
# is and singleton is only used to define the default empty value.
|
305
|
+
# If a value cannot be found (in either the HTML or database object), then
|
306
|
+
# a default will be used. The default is: singleton ? nil : [].
|
307
|
+
#
|
308
|
+
# Note that defined extensions work for both documents being crawled from
|
309
|
+
# the WWW and for documents being retrieved from the database. This
|
310
|
+
# effectively implements ORM like behavior using this class.
|
311
|
+
#
|
312
|
+
# @param var [Symbol] The name of the variable to be initialised.
|
313
|
+
# @param xpath [String] Used to find the element(s) of the webpage.
|
314
|
+
# @option options [Boolean] :singleton The singleton option determines
|
315
|
+
# whether or not the result(s) should be in an Array. If multiple
|
316
|
+
# results are found and singleton is true then the first result will be
|
317
|
+
# used. Defaults to true.
|
318
|
+
# @option options [Boolean] :text_content_only The text_content_only option
|
319
|
+
# if true will use the text content of the Nokogiri result object,
|
320
|
+
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
321
|
+
# @yield [var_value] Gives the value about to be assigned to the new var.
|
322
|
+
# The return value of the block becomes the new var value, unless nil.
|
323
|
+
# Return nil if you want to inspect but not change the var value.
|
324
|
+
# @return [Symbol] The first half of the newly created method names e.g.
|
325
|
+
# if var == "title" then :init_title is returned.
|
326
|
+
def self.define_extension(var, xpath, options = {}, &block)
|
327
|
+
default_options = { singleton: true, text_content_only: true }
|
328
|
+
options = default_options.merge(options)
|
110
329
|
|
111
|
-
#
|
112
|
-
#
|
113
|
-
|
114
|
-
|
115
|
-
|
330
|
+
# Define the private init_*_from_html method for HTML.
|
331
|
+
# Gets the HTML's xpath value and creates a var for it.
|
332
|
+
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
333
|
+
result = find_in_html(xpath, options, &block)
|
334
|
+
init_var(var, result)
|
335
|
+
end
|
336
|
+
Document.send :private, func_name
|
337
|
+
|
338
|
+
# Define the private init_*_from_object method for a Database object.
|
339
|
+
# Gets the Object's "key" value and creates a var for it.
|
340
|
+
func_name = Document.send(
|
341
|
+
:define_method, "init_#{var}_from_object") do |obj|
|
342
|
+
result = find_in_object(
|
343
|
+
obj, var.to_s, singleton: options[:singleton], &block)
|
344
|
+
init_var(var, result)
|
345
|
+
end
|
346
|
+
Document.send :private, func_name
|
347
|
+
|
348
|
+
"init_#{var}".to_sym
|
349
|
+
end
|
350
|
+
|
351
|
+
# Removes the init_* methods created when an extension is defined.
|
352
|
+
# Therefore, this is the opposing method to Document.define_extension.
|
353
|
+
# Returns true if successful or false if the method(s) cannot be found.
|
354
|
+
#
|
355
|
+
# @param var [Symbol] The extension variable already defined.
|
356
|
+
# @return [Boolean] True if the extension var was found and removed;
|
357
|
+
# otherwise false.
|
358
|
+
def self.remove_extension(var)
|
359
|
+
Document.send(:remove_method, "init_#{var}_from_html")
|
360
|
+
Document.send(:remove_method, "init_#{var}_from_object")
|
361
|
+
true
|
362
|
+
rescue NameError
|
363
|
+
false
|
364
|
+
end
|
365
|
+
|
366
|
+
private
|
367
|
+
|
368
|
+
# Initializes the nokogiri object using @html, which must be already set.
|
369
|
+
def init_nokogiri
|
370
|
+
raise "@html must be set" unless @html
|
371
|
+
Nokogiri::HTML(@html) do |config|
|
372
|
+
# TODO: Remove #'s below when crawling in production.
|
373
|
+
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
374
|
+
# Nokogiri::XML::ParseOptions::NONET
|
116
375
|
end
|
376
|
+
end
|
377
|
+
|
378
|
+
# Returns an object/value from this Document's @html using the provided
|
379
|
+
# xpath param.
|
380
|
+
# singleton ? results.first (single Object) : results (Array)
|
381
|
+
# text_content_only ? result.content (String) : result (nokogiri Object)
|
382
|
+
# A block can be used to set the final value before it is returned.
|
383
|
+
# Return nil from the block if you don't want to override the value.
|
384
|
+
def find_in_html(xpath, singleton: true, text_content_only: true)
|
385
|
+
results = @doc.xpath(xpath)
|
117
386
|
|
118
|
-
|
119
|
-
|
120
|
-
|
387
|
+
if results and not results.empty?
|
388
|
+
result = if singleton
|
389
|
+
text_content_only ? results.first.content : results.first
|
390
|
+
else
|
391
|
+
text_content_only ? results.map(&:content) : results
|
392
|
+
end
|
393
|
+
else
|
394
|
+
result = singleton ? nil : []
|
121
395
|
end
|
122
|
-
|
123
|
-
|
124
|
-
|
396
|
+
|
397
|
+
singleton ? process_str(result) : process_arr(result)
|
398
|
+
|
399
|
+
if block_given?
|
400
|
+
new_result = yield(result)
|
401
|
+
result = new_result if new_result
|
125
402
|
end
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
raise "A search value must be provided" if text.empty?
|
145
|
-
raise "The sentence length value must be even" if sentence_limit.odd?
|
146
|
-
|
147
|
-
results = {}
|
148
|
-
regex = Regexp.new(text, Regexp::IGNORECASE)
|
149
|
-
|
150
|
-
@text.each do |sentence|
|
151
|
-
hits = sentence.scan(regex).count
|
152
|
-
if hits > 0
|
153
|
-
sentence.strip!
|
154
|
-
index = sentence.index(regex)
|
155
|
-
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
156
|
-
results[sentence] = hits
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
return [] if results.empty?
|
161
|
-
results = Hash[results.sort_by { |k, v| v }]
|
162
|
-
results.keys.reverse
|
403
|
+
|
404
|
+
result
|
405
|
+
end
|
406
|
+
|
407
|
+
# Finds a value in the obj using the key.
|
408
|
+
# singleton is used to set the value if not found in obj.
|
409
|
+
# A block can be used to set the final value before it is returned.
|
410
|
+
# Return nil from the block if you don't want to override the value.
|
411
|
+
def find_in_object(obj, key, singleton: true)
|
412
|
+
assert_respond_to(obj, :fetch)
|
413
|
+
|
414
|
+
default = singleton ? nil : []
|
415
|
+
result = obj.fetch(key.to_s, default)
|
416
|
+
singleton ? process_str(result) : process_arr(result)
|
417
|
+
|
418
|
+
if block_given?
|
419
|
+
new_result = yield(result)
|
420
|
+
result = new_result if new_result
|
163
421
|
end
|
422
|
+
|
423
|
+
result
|
424
|
+
end
|
164
425
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
426
|
+
# Initialises an instance variable and defines a getter method for it.
|
427
|
+
# @param var [Symbol] The name of the variable to be initialized.
|
428
|
+
# @param value [Object] The newly initialized variable's value.
|
429
|
+
# @return [Symbol] The name of the newly created getter method.
|
430
|
+
def init_var(var, value)
|
431
|
+
# instance_var_name starts with @, var_name doesn't.
|
432
|
+
var = var.to_s
|
433
|
+
var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
|
434
|
+
instance_var_name = "@#{var_name}".to_sym
|
435
|
+
|
436
|
+
instance_variable_set(instance_var_name, value)
|
437
|
+
|
438
|
+
Document.send(:define_method, var_name) do
|
439
|
+
instance_variable_get(instance_var_name)
|
171
440
|
end
|
441
|
+
end
|
172
442
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
443
|
+
# Takes Document.text_elements and returns an xpath String used to obtain
|
444
|
+
# all of the combined text.
|
445
|
+
def text_elements_xpath
|
446
|
+
xpath = ""
|
447
|
+
return xpath if @@text_elements.empty?
|
448
|
+
el_xpath = "//%s/text()"
|
449
|
+
@@text_elements.each_with_index do |el, i|
|
450
|
+
xpath += " | " unless i == 0
|
451
|
+
xpath += el_xpath % [el]
|
177
452
|
end
|
178
|
-
|
179
|
-
|
453
|
+
xpath
|
454
|
+
end
|
180
455
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
456
|
+
# Processes a String to make it uniform.
|
457
|
+
def process_str(str)
|
458
|
+
if str.is_a?(String)
|
459
|
+
str.encode!('UTF-8', 'UTF-8', invalid: :replace)
|
460
|
+
str.strip!
|
185
461
|
end
|
462
|
+
str
|
463
|
+
end
|
186
464
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
465
|
+
# Processes an Array to make it uniform.
|
466
|
+
def process_arr(array)
|
467
|
+
if array.is_a?(Array)
|
468
|
+
array.map! { |str| process_str(str) }
|
469
|
+
array.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
470
|
+
array.uniq!
|
192
471
|
end
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
472
|
+
array
|
473
|
+
end
|
474
|
+
|
475
|
+
# Modifies internal links by removing this doc's base or host URL, if
|
476
|
+
# present. http://www.google.co.uk/about.html (with or without the
|
477
|
+
# protocol prefix) will become about.html meaning it'll appear within
|
478
|
+
# Document#internal_links.
|
479
|
+
def process_internal_links(links)
|
480
|
+
links.map! do |link|
|
481
|
+
host_or_base = if link.start_with?("http")
|
482
|
+
@url.base
|
483
|
+
else
|
484
|
+
@url.host
|
485
|
+
end
|
486
|
+
if link.start_with?(host_or_base)
|
487
|
+
link.sub!(host_or_base, "")
|
488
|
+
link.replace(link[1..-1]) if link.start_with?("/")
|
489
|
+
link.strip!
|
490
|
+
end
|
491
|
+
link
|
212
492
|
end
|
493
|
+
end
|
494
|
+
|
495
|
+
### Default init_* (Document extension) methods. ###
|
213
496
|
|
214
|
-
|
215
|
-
xpath = ""
|
216
|
-
return xpath if TEXT_ELEMENTS.empty?
|
217
|
-
el_xpath = "//%s/text()"
|
218
|
-
TEXT_ELEMENTS.each_with_index do |el, i|
|
219
|
-
xpath += " | " unless i == 0
|
220
|
-
xpath += el_xpath % [el]
|
221
|
-
end
|
222
|
-
xpath
|
223
|
-
end
|
497
|
+
# Init methods for title.
|
224
498
|
|
225
|
-
|
226
|
-
results = @doc.xpath(xpath)
|
227
|
-
unless results.nil? || results.empty?
|
228
|
-
result = if first_result
|
229
|
-
results.first.content
|
230
|
-
else
|
231
|
-
results.map { |res| res.content }
|
232
|
-
end
|
233
|
-
instance_variable_set(var, result)
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
def init_title
|
238
|
-
@title = nil
|
499
|
+
def init_title_from_html
|
239
500
|
xpath = "//title"
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
501
|
+
result = find_in_html(xpath)
|
502
|
+
init_var(:@title, result)
|
503
|
+
end
|
504
|
+
|
505
|
+
def init_title_from_object(obj)
|
506
|
+
result = find_in_object(obj, "title")
|
507
|
+
init_var(:@title, result)
|
508
|
+
end
|
509
|
+
|
510
|
+
# Init methods for author.
|
511
|
+
|
512
|
+
def init_author_from_html
|
246
513
|
xpath = "//meta[@name='author']/@content"
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
514
|
+
result = find_in_html(xpath)
|
515
|
+
init_var(:@author, result)
|
516
|
+
end
|
517
|
+
|
518
|
+
def init_author_from_object(obj)
|
519
|
+
result = find_in_object(obj, "author")
|
520
|
+
init_var(:@author, result)
|
521
|
+
end
|
522
|
+
|
523
|
+
# Init methods for keywords.
|
524
|
+
|
525
|
+
def init_keywords_from_html
|
253
526
|
xpath = "//meta[@name='keywords']/@content"
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
527
|
+
result = find_in_html(xpath) do |keywords|
|
528
|
+
if keywords
|
529
|
+
keywords = keywords.split(",")
|
530
|
+
process_arr(keywords)
|
531
|
+
end
|
532
|
+
keywords
|
533
|
+
end
|
534
|
+
init_var(:@keywords, result)
|
535
|
+
end
|
536
|
+
|
537
|
+
def init_keywords_from_object(obj)
|
538
|
+
result = find_in_object(obj, "keywords", singleton: false)
|
539
|
+
init_var(:@keywords, result)
|
540
|
+
end
|
259
541
|
|
260
|
-
|
261
|
-
|
542
|
+
# Init methods for links.
|
543
|
+
|
544
|
+
def init_links_from_html
|
262
545
|
xpath = "//a/@href"
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
546
|
+
result = find_in_html(xpath, singleton: false) do |links|
|
547
|
+
if links
|
548
|
+
links.reject! { |link| link == "/" }
|
549
|
+
links.map! do |link|
|
550
|
+
begin
|
551
|
+
Wgit::Url.new(link)
|
552
|
+
rescue
|
553
|
+
nil
|
554
|
+
end
|
555
|
+
end
|
556
|
+
links.reject! { |link| link.nil? }
|
557
|
+
process_internal_links(links)
|
272
558
|
end
|
559
|
+
links
|
273
560
|
end
|
274
|
-
|
275
|
-
process_internal_links(@links)
|
561
|
+
init_var(:@links, result)
|
276
562
|
end
|
277
|
-
|
278
|
-
def
|
279
|
-
|
563
|
+
|
564
|
+
def init_links_from_object(obj)
|
565
|
+
result = find_in_object(obj, "links", singleton: false) do |links|
|
566
|
+
if links
|
567
|
+
links.map! { |link| Wgit::Url.new(link) }
|
568
|
+
end
|
569
|
+
links
|
570
|
+
end
|
571
|
+
init_var(:@links, result)
|
572
|
+
end
|
573
|
+
|
574
|
+
# Init methods for text.
|
575
|
+
|
576
|
+
def init_text_from_html
|
280
577
|
xpath = text_elements_xpath
|
281
|
-
|
282
|
-
|
283
|
-
|
578
|
+
result = find_in_html(xpath, singleton: false)
|
579
|
+
init_var(:@text, result)
|
580
|
+
end
|
581
|
+
|
582
|
+
def init_text_from_object(obj)
|
583
|
+
result = find_in_object(obj, "text", singleton: false)
|
584
|
+
init_var(:@text, result)
|
284
585
|
end
|
285
586
|
|
286
|
-
|
587
|
+
alias :to_hash :to_h
|
287
588
|
alias :relative_links :internal_links
|
288
589
|
alias :relative_urls :internal_links
|
289
590
|
alias :relative_full_links :internal_full_links
|