wgit 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/wgit.rb +1 -1
- data/lib/wgit/assertable.rb +72 -61
- data/lib/wgit/core_ext.rb +11 -5
- data/lib/wgit/crawler.rb +97 -57
- data/lib/wgit/database/database.rb +247 -170
- data/lib/wgit/database/model.rb +40 -24
- data/lib/wgit/database/mongo_connection_details.rb +44 -23
- data/lib/wgit/document.rb +534 -233
- data/lib/wgit/indexer.rb +235 -0
- data/lib/wgit/url.rb +199 -121
- data/lib/wgit/utils.rb +143 -96
- data/lib/wgit/version.rb +5 -1
- metadata +10 -9
- data/lib/wgit/web_crawler.rb +0 -134
data/lib/wgit/database/model.rb
CHANGED
@@ -2,30 +2,46 @@ require_relative '../utils'
|
|
2
2
|
|
3
3
|
module Wgit
|
4
4
|
|
5
|
-
#
|
6
|
-
# Module containing the DB data model structure.
|
5
|
+
# Module containing the database (DB) data model structure.
|
7
6
|
module Model
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
7
|
+
|
8
|
+
# The data model for a Wgit::Url.
|
9
|
+
#
|
10
|
+
# @param url [Wgit::Url] The URL DB record.
|
11
|
+
# @return [Hash] The URL model ready for DB insertion.
|
12
|
+
def self.url(url)
|
13
|
+
raise "url must respond_to? to_h" unless url.respond_to?(:to_h)
|
14
|
+
model = url.to_h
|
15
|
+
Wgit::Utils.remove_non_bson_types(model)
|
16
|
+
end
|
17
|
+
|
18
|
+
# The data model for a Wgit::Document.
|
19
|
+
#
|
20
|
+
# @param doc [Wgit::Document] The Document DB record.
|
21
|
+
# @return [Hash] The Document model ready for DB insertion.
|
22
|
+
def self.document(doc)
|
23
|
+
raise "doc must respond_to? to_h" unless doc.respond_to?(:to_h)
|
24
|
+
model = doc.to_h(false)
|
25
|
+
Wgit::Utils.remove_non_bson_types(model)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Default fields when inserting a record into the DB.
|
29
|
+
#
|
30
|
+
# @return [Hash] Containing common insertion fields for all models.
|
31
|
+
def self.common_insert_data
|
32
|
+
{
|
33
|
+
date_added: Wgit::Utils.time_stamp,
|
34
|
+
date_modified: Wgit::Utils.time_stamp,
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
# Default fields when updating a record in the DB.
|
39
|
+
#
|
40
|
+
# @return [Hash] Containing common update fields for all models.
|
41
|
+
def self.common_update_data
|
42
|
+
{
|
43
|
+
date_modified: Wgit::Utils.time_stamp,
|
44
|
+
}
|
45
|
+
end
|
30
46
|
end
|
31
47
|
end
|
@@ -1,27 +1,48 @@
|
|
1
|
-
|
2
|
-
# @author Michael Telford
|
3
1
|
module Wgit
|
4
|
-
|
2
|
+
# The connection details for the database. This must be set if you want to
|
3
|
+
# store and access webpages in a database. Don't set the constant directly,
|
4
|
+
# instead use the funcs contained within the Wgit module.
|
5
|
+
CONNECTION_DETAILS = {}
|
6
|
+
|
7
|
+
# Set the database's connection details from the given hash and freeze them.
|
8
|
+
# It is your responsibility to ensure the correct hash vars are present and
|
9
|
+
# set. Due to the freezing of the CONNECTION_DETAILS, this func is designed
|
10
|
+
# to be called only once.
|
11
|
+
#
|
12
|
+
# @param hash [Hash] Containing the database connection details to use.
|
13
|
+
# The hash should contain the following keys (of type String):
|
14
|
+
# host, port, uname, pword, db
|
15
|
+
# @raise [KeyError, FrozenError] If any of the required connection
|
16
|
+
# details are missing or if the connection details have already been set.
|
17
|
+
# @return [Hash] Containing the database connection details from hash.
|
18
|
+
def self.set_connection_details(hash)
|
19
|
+
CONNECTION_DETAILS[:host] = hash.fetch('host')
|
20
|
+
CONNECTION_DETAILS[:port] = hash.fetch('port')
|
21
|
+
CONNECTION_DETAILS[:uname] = hash.fetch('uname')
|
22
|
+
CONNECTION_DETAILS[:pword] = hash.fetch('pword')
|
23
|
+
CONNECTION_DETAILS[:db] = hash.fetch('db')
|
24
|
+
|
25
|
+
CONNECTION_DETAILS.freeze
|
26
|
+
end
|
27
|
+
|
28
|
+
# Set the database's connection details from the ENV and freeze them. It is
|
29
|
+
# your responsibility to ensure the correct ENV vars are present and set.
|
30
|
+
# Due to the freezing of the CONNECTION_DETAILS, this func is designed to be
|
31
|
+
# called only once.
|
32
|
+
#
|
33
|
+
# The ENV should contain the following keys (of type String):
|
34
|
+
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD, DB_DATABASE
|
35
|
+
#
|
36
|
+
# @raise [KeyError, FrozenError] If any of the required connection
|
37
|
+
# details are missing or if the connection details have already been set.
|
38
|
+
# @return [Hash] Containing the database connection details from the ENV.
|
39
|
+
def self.set_connection_details_from_env
|
40
|
+
CONNECTION_DETAILS[:host] = ENV.fetch('DB_HOST')
|
41
|
+
CONNECTION_DETAILS[:port] = ENV.fetch('DB_PORT')
|
42
|
+
CONNECTION_DETAILS[:uname] = ENV.fetch('DB_USERNAME')
|
43
|
+
CONNECTION_DETAILS[:pword] = ENV.fetch('DB_PASSWORD')
|
44
|
+
CONNECTION_DETAILS[:db] = ENV.fetch('DB_DATABASE')
|
5
45
|
|
6
|
-
|
7
|
-
if DB_PROVIDER == :OpenShift
|
8
|
-
CONNECTION_DETAILS = {
|
9
|
-
:host => "127.0.0.1",
|
10
|
-
:port => "27017",
|
11
|
-
:db => "admin",
|
12
|
-
:uname => "admin",
|
13
|
-
:pword => "R5jUKv1fessb"
|
14
|
-
}.freeze
|
15
|
-
# MongoLabs (MongoDB 3.0)
|
16
|
-
elsif DB_PROVIDER == :MongoLabs
|
17
|
-
CONNECTION_DETAILS = {
|
18
|
-
:host => "ds037205.mongolab.com",
|
19
|
-
:port => "37205",
|
20
|
-
:db => "crawler",
|
21
|
-
:uname => "rubyapp",
|
22
|
-
:pword => "R5jUKv1fessb",
|
23
|
-
}.freeze
|
24
|
-
else
|
25
|
-
raise "Database provider '#{DB_PROVIDER}' is not recognized"
|
46
|
+
CONNECTION_DETAILS.freeze
|
26
47
|
end
|
27
48
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -2,288 +2,589 @@ require_relative 'url'
|
|
2
2
|
require_relative 'utils'
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
|
+
require 'json'
|
5
6
|
|
6
7
|
module Wgit
|
7
8
|
|
8
|
-
#
|
9
|
-
#
|
9
|
+
# Class modeling a HTML web document. Also doubles as a search result when
|
10
|
+
# loading Documents from the database.
|
11
|
+
#
|
12
|
+
# The initialize method dynamically initializes certain variables from the
|
13
|
+
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
+
# Document class can be easily extended allowing you to pull out the bits of
|
15
|
+
# a webpage that are important to you. See Wgit::Document.define_extension.
|
10
16
|
class Document
|
11
17
|
include Assertable
|
12
|
-
|
13
|
-
TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
14
|
-
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
|
15
18
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
# The HTML elements that make up the visible text on a page.
|
20
|
+
# These elements are used to initialize the @text of the Document.
|
21
|
+
# See the README.md for how to add to this Array dynamically.
|
22
|
+
@@text_elements = [
|
23
|
+
:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
24
|
+
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
|
+
]
|
26
|
+
|
27
|
+
# The URL of the webpage, an instance of Wgit::Url.
|
28
|
+
attr_reader :url
|
29
|
+
|
30
|
+
# The HTML of the webpage, an instance of String.
|
31
|
+
attr_reader :html
|
32
|
+
|
33
|
+
# The Nokogiri document object initialized from @html.
|
34
|
+
attr_reader :doc
|
35
|
+
|
36
|
+
# The score is only used following a Database#search and records matches.
|
37
|
+
attr_reader :score
|
38
|
+
|
39
|
+
# Initialize takes either two strings (representing the URL and HTML) or an
|
40
|
+
# object representing a database record (of a HTTP crawled web page). This
|
41
|
+
# allows for initialisation from both crawled web pages and (afterwards)
|
42
|
+
# documents/web pages retrieved from the database.
|
43
|
+
#
|
44
|
+
# During initialisation, the Document will call any
|
45
|
+
# 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
|
46
|
+
# default init_* methods exist while others can be defined by the user.
|
47
|
+
# See the README and Wgit::Document.define_extension for more info.
|
48
|
+
#
|
49
|
+
# @param url_or_obj [String, Object#fetch] Either a String representing a
|
50
|
+
# URL or a Hash-like object responding to :fetch. e.g. a MongoDB
|
51
|
+
# collection object. The Object's :fetch method should support Strings as
|
52
|
+
# keys.
|
53
|
+
# @param html [String] The crawled web page's HTML. This param is only
|
54
|
+
# required if url_or_obj is a String representing the web page's URL.
|
55
|
+
def initialize(url_or_obj, html = "")
|
56
|
+
# Init from URL String and HTML String.
|
57
|
+
if url_or_obj.is_a?(String)
|
58
|
+
url = url_or_obj
|
59
|
+
assert_type(url, Url)
|
60
|
+
|
61
|
+
@url = url
|
62
|
+
@html = html ||= ""
|
63
|
+
@doc = init_nokogiri
|
64
|
+
@score = 0.0
|
22
65
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
29
|
-
# Nokogiri::XML::ParseOptions::NONET
|
30
|
-
end
|
31
|
-
|
32
|
-
init_title
|
33
|
-
init_author
|
34
|
-
init_keywords
|
35
|
-
init_links
|
36
|
-
init_text
|
37
|
-
@score = 0.0
|
38
|
-
else
|
39
|
-
# Init from a mongo collection document.
|
40
|
-
@url = Wgit::Url.new(url_or_doc[:url])
|
41
|
-
@html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
|
42
|
-
@title = url_or_doc[:title]
|
43
|
-
@author = url_or_doc[:author]
|
44
|
-
@keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
|
45
|
-
@links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
|
46
|
-
@links.map! { |link| Wgit::Url.new(link) }
|
47
|
-
@text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
|
48
|
-
@score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
|
66
|
+
# Dynamically run the init_*_from_html methods.
|
67
|
+
Document.private_instance_methods(false).each do |method|
|
68
|
+
if method.to_s.start_with?("init_") &&
|
69
|
+
method.to_s.end_with?("_from_html")
|
70
|
+
self.send(method)
|
49
71
|
end
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
72
|
+
end
|
73
|
+
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
74
|
+
# collection obj.
|
75
|
+
else
|
76
|
+
obj = url_or_obj
|
77
|
+
assert_respond_to(obj, :fetch)
|
78
|
+
|
79
|
+
@url = obj.fetch("url") # Should always be present.
|
80
|
+
@html = obj.fetch("html", "")
|
81
|
+
@doc = init_nokogiri
|
82
|
+
@score = obj.fetch("score", 0.0)
|
83
|
+
|
84
|
+
# Dynamically run the init_*_from_object methods.
|
85
|
+
Document.private_instance_methods(false).each do |method|
|
86
|
+
if method.to_s.start_with?("init_") &&
|
87
|
+
method.to_s.end_with?("_from_object")
|
88
|
+
self.send(method, obj)
|
60
89
|
end
|
61
|
-
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
62
93
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
94
|
+
# Determines if both the url and html match. Use
|
95
|
+
# doc.object_id == other_doc.object_id for exact object comparison.
|
96
|
+
#
|
97
|
+
# @param other_doc [Wgit::Document] To compare self against.
|
98
|
+
# @return [Boolean] True if @url and @html are equal, false if not.
|
99
|
+
def ==(other_doc)
|
100
|
+
return false unless other_doc.is_a? Wgit::Document
|
101
|
+
@url == other_doc.url and @html == other_doc.html
|
102
|
+
end
|
103
|
+
|
104
|
+
# Is a shortcut for calling Document#html[range].
|
105
|
+
#
|
106
|
+
# @param range [Range] The range of @html to return.
|
107
|
+
# @return [String] The given range of @html.
|
108
|
+
def [](range)
|
109
|
+
@html[range]
|
110
|
+
end
|
111
|
+
|
112
|
+
def date_crawled
|
113
|
+
@url.date_crawled
|
114
|
+
end
|
115
|
+
|
116
|
+
# Returns a Hash containing this Document's instance vars.
|
117
|
+
# Used when storing the Document in a Database e.g. MongoDB etc.
|
118
|
+
# By default the @html var is excluded from the returned Hash.
|
119
|
+
#
|
120
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
121
|
+
# returned Hash.
|
122
|
+
# @return [Hash] Containing self's instance vars.
|
123
|
+
def to_h(include_html = false)
|
124
|
+
ignore = include_html ? [] : ["@html"]
|
125
|
+
ignore << "@doc" # Always ignore "@doc"
|
126
|
+
Wgit::Utils.to_h(self, ignore)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Converts this Document's to_h return value to a JSON String.
|
130
|
+
#
|
131
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
132
|
+
# returned JSON String.
|
133
|
+
# @return [String] This Document represented as a JSON String.
|
134
|
+
def to_json(include_html = false)
|
135
|
+
h = to_h(include_html)
|
136
|
+
JSON.generate(h)
|
137
|
+
end
|
138
|
+
|
139
|
+
# Returns a Hash containing this Document's instance variables and
|
140
|
+
# their :length (if they respond to it). Works dynamically so that any
|
141
|
+
# user defined extensions (and their created instance vars) will appear in
|
142
|
+
# the returned Hash as well. The number of text snippets as well as total
|
143
|
+
# number of textual bytes are always included in the returned Hash.
|
144
|
+
#
|
145
|
+
# @return [Hash] Containing self's HTML statistics.
|
146
|
+
def stats
|
147
|
+
hash = {}
|
148
|
+
instance_variables.each do |var|
|
149
|
+
# Add up the total bytes of text as well as the length.
|
150
|
+
if var == :@text
|
151
|
+
count = 0
|
152
|
+
@text.each { |t| count += t.length }
|
153
|
+
hash[:text_length] = @text.length
|
154
|
+
hash[:text_bytes] = count
|
155
|
+
# Else take the var's #length method return value.
|
156
|
+
else
|
157
|
+
next unless instance_variable_get(var).respond_to?(:length)
|
158
|
+
hash[var[1..-1].to_sym] =
|
159
|
+
instance_variable_get(var).send(:length)
|
160
|
+
end
|
69
161
|
end
|
70
|
-
|
71
|
-
|
162
|
+
hash
|
163
|
+
end
|
164
|
+
|
165
|
+
# Determine the size of this Document's HTML.
|
166
|
+
#
|
167
|
+
# @return [Integer] The total number of bytes in @html.
|
168
|
+
def size
|
169
|
+
stats[:html]
|
170
|
+
end
|
171
|
+
|
172
|
+
# Determine if this Document's HTML is empty or not.
|
173
|
+
#
|
174
|
+
# @return [Boolean] True if @html is nil/empty, false otherwise.
|
175
|
+
def empty?
|
176
|
+
return true if @html.nil?
|
177
|
+
@html.strip.empty?
|
178
|
+
end
|
179
|
+
|
180
|
+
# Uses Nokogiri's xpath method to search the doc's html and return the
|
181
|
+
# results.
|
182
|
+
#
|
183
|
+
# @param xpath [String] The xpath to search the @html with.
|
184
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
185
|
+
def xpath(xpath)
|
186
|
+
@doc.xpath(xpath)
|
187
|
+
end
|
188
|
+
|
189
|
+
# Uses Nokogiri's css method to search the doc's html and return the
|
190
|
+
# results.
|
191
|
+
#
|
192
|
+
# @param selector [String] The CSS selector to search the @html with.
|
193
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
194
|
+
def css(selector)
|
195
|
+
@doc.css(selector)
|
196
|
+
end
|
197
|
+
|
198
|
+
# Get all internal links of this Document.
|
199
|
+
#
|
200
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's.
|
201
|
+
def internal_links
|
72
202
|
return [] if @links.empty?
|
73
|
-
|
203
|
+
@links.reject do |link|
|
74
204
|
begin
|
75
|
-
|
205
|
+
not link.relative_link?
|
76
206
|
rescue
|
77
|
-
|
207
|
+
true
|
78
208
|
end
|
79
209
|
end
|
80
|
-
|
210
|
+
end
|
81
211
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
next unless instance_variable_get(var).respond_to?(:length)
|
94
|
-
hash[var[1..-1].to_sym] =
|
95
|
-
instance_variable_get(var).send(:length)
|
96
|
-
end
|
97
|
-
end
|
98
|
-
hash
|
212
|
+
# Get all internal links of this Document and append them to this
|
213
|
+
# Document's base URL.
|
214
|
+
#
|
215
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
|
216
|
+
# form.
|
217
|
+
def internal_full_links
|
218
|
+
in_links = internal_links
|
219
|
+
return [] if in_links.empty?
|
220
|
+
in_links.map do |link|
|
221
|
+
link.replace("/" + link) unless link.start_with?("/")
|
222
|
+
Wgit::Url.new(@url.to_base + link)
|
99
223
|
end
|
100
|
-
|
101
|
-
|
102
|
-
|
224
|
+
end
|
225
|
+
|
226
|
+
# Get all external links of this Document.
|
227
|
+
#
|
228
|
+
# @return [Array<Wgit::Url>] self's external/absolute URL's.
|
229
|
+
def external_links
|
230
|
+
return [] if @links.empty?
|
231
|
+
@links.reject do |link|
|
232
|
+
begin
|
233
|
+
link.relative_link?
|
234
|
+
rescue
|
235
|
+
true
|
236
|
+
end
|
103
237
|
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# Searches against the @text for the given search query.
|
241
|
+
# The number of search hits for each sentence is recorded internally
|
242
|
+
# and used to rank/sort the search results before being returned. Where
|
243
|
+
# the Wgit::Database#search method searches all documents for the most hits,
|
244
|
+
# this method searches each document's @text for the most hits.
|
245
|
+
#
|
246
|
+
# Each search result comprises of a sentence of a given length. The length
|
247
|
+
# will be based on the sentence_limit parameter or the full length of the
|
248
|
+
# original sentence, whichever is less. The algorithm obviously ensures
|
249
|
+
# that the search query is visible somewhere in the sentence.
|
250
|
+
#
|
251
|
+
# @param query [String] The value to search the document's text against.
|
252
|
+
# @param sentence_limit [Integer] The max length of each search result
|
253
|
+
# sentence.
|
254
|
+
# @return [Array<String>] Representing the search results.
|
255
|
+
def search(query, sentence_limit = 80)
|
256
|
+
raise "A search value must be provided" if query.empty?
|
257
|
+
raise "The sentence length value must be even" if sentence_limit.odd?
|
258
|
+
|
259
|
+
results = {}
|
260
|
+
regex = Regexp.new(query, Regexp::IGNORECASE)
|
104
261
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
262
|
+
@text.each do |sentence|
|
263
|
+
hits = sentence.scan(regex).count
|
264
|
+
if hits > 0
|
265
|
+
sentence.strip!
|
266
|
+
index = sentence.index(regex)
|
267
|
+
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
268
|
+
results[sentence] = hits
|
269
|
+
end
|
109
270
|
end
|
271
|
+
|
272
|
+
return [] if results.empty?
|
273
|
+
results = Hash[results.sort_by { |k, v| v }]
|
274
|
+
results.keys.reverse
|
275
|
+
end
|
276
|
+
|
277
|
+
# Performs a text search (see Document#search for details) but assigns the
|
278
|
+
# results to the @text instance variable. This can be used for sub search
|
279
|
+
# functionality. The original text is returned; no other reference to it
|
280
|
+
# is kept thereafter.
|
281
|
+
#
|
282
|
+
# @param query [String] The value to search the document's text against.
|
283
|
+
# @return [Array<String>] This Document's original @text value.
|
284
|
+
def search!(query)
|
285
|
+
orig_text = @text
|
286
|
+
@text = search(query)
|
287
|
+
orig_text
|
288
|
+
end
|
289
|
+
|
290
|
+
### Document (Class) methods ###
|
291
|
+
|
292
|
+
# Returns Document.text_elements used to obtain the text in a webpage.
|
293
|
+
#
|
294
|
+
# @return [Array<Symbol>] The page elements containing visual text on a
|
295
|
+
# webpage.
|
296
|
+
def self.text_elements
|
297
|
+
@@text_elements
|
298
|
+
end
|
299
|
+
|
300
|
+
# Initialises a private instance variable with the xpath or database object
|
301
|
+
# result(s). When initialising from HTML, a true singleton value will only
|
302
|
+
# ever return one result otherwise all xpath results are returned in an
|
303
|
+
# Array. When initialising from a database object, the value is taken as
|
304
|
+
# is and singleton is only used to define the default empty value.
|
305
|
+
# If a value cannot be found (in either the HTML or database object), then
|
306
|
+
# a default will be used. The default is: singleton ? nil : [].
|
307
|
+
#
|
308
|
+
# Note that defined extensions work for both documents being crawled from
|
309
|
+
# the WWW and for documents being retrieved from the database. This
|
310
|
+
# effectively implements ORM like behavior using this class.
|
311
|
+
#
|
312
|
+
# @param var [Symbol] The name of the variable to be initialised.
|
313
|
+
# @param xpath [String] Used to find the element(s) of the webpage.
|
314
|
+
# @option options [Boolean] :singleton The singleton option determines
|
315
|
+
# whether or not the result(s) should be in an Array. If multiple
|
316
|
+
# results are found and singleton is true then the first result will be
|
317
|
+
# used. Defaults to true.
|
318
|
+
# @option options [Boolean] :text_content_only The text_content_only option
|
319
|
+
# if true will use the text content of the Nokogiri result object,
|
320
|
+
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
321
|
+
# @yield [var_value] Gives the value about to be assigned to the new var.
|
322
|
+
# The return value of the block becomes the new var value, unless nil.
|
323
|
+
# Return nil if you want to inspect but not change the var value.
|
324
|
+
# @return [Symbol] The first half of the newly created method names e.g.
|
325
|
+
# if var == "title" then :init_title is returned.
|
326
|
+
def self.define_extension(var, xpath, options = {}, &block)
|
327
|
+
default_options = { singleton: true, text_content_only: true }
|
328
|
+
options = default_options.merge(options)
|
110
329
|
|
111
|
-
#
|
112
|
-
#
|
113
|
-
|
114
|
-
|
115
|
-
|
330
|
+
# Define the private init_*_from_html method for HTML.
|
331
|
+
# Gets the HTML's xpath value and creates a var for it.
|
332
|
+
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
333
|
+
result = find_in_html(xpath, options, &block)
|
334
|
+
init_var(var, result)
|
335
|
+
end
|
336
|
+
Document.send :private, func_name
|
337
|
+
|
338
|
+
# Define the private init_*_from_object method for a Database object.
|
339
|
+
# Gets the Object's "key" value and creates a var for it.
|
340
|
+
func_name = Document.send(
|
341
|
+
:define_method, "init_#{var}_from_object") do |obj|
|
342
|
+
result = find_in_object(
|
343
|
+
obj, var.to_s, singleton: options[:singleton], &block)
|
344
|
+
init_var(var, result)
|
345
|
+
end
|
346
|
+
Document.send :private, func_name
|
347
|
+
|
348
|
+
"init_#{var}".to_sym
|
349
|
+
end
|
350
|
+
|
351
|
+
# Removes the init_* methods created when an extension is defined.
|
352
|
+
# Therefore, this is the opposing method to Document.define_extension.
|
353
|
+
# Returns true if successful or false if the method(s) cannot be found.
|
354
|
+
#
|
355
|
+
# @param var [Symbol] The extension variable already defined.
|
356
|
+
# @return [Boolean] True if the extension var was found and removed;
|
357
|
+
# otherwise false.
|
358
|
+
def self.remove_extension(var)
|
359
|
+
Document.send(:remove_method, "init_#{var}_from_html")
|
360
|
+
Document.send(:remove_method, "init_#{var}_from_object")
|
361
|
+
true
|
362
|
+
rescue NameError
|
363
|
+
false
|
364
|
+
end
|
365
|
+
|
366
|
+
private
|
367
|
+
|
368
|
+
# Initializes the nokogiri object using @html, which must be already set.
|
369
|
+
def init_nokogiri
|
370
|
+
raise "@html must be set" unless @html
|
371
|
+
Nokogiri::HTML(@html) do |config|
|
372
|
+
# TODO: Remove #'s below when crawling in production.
|
373
|
+
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
374
|
+
# Nokogiri::XML::ParseOptions::NONET
|
116
375
|
end
|
376
|
+
end
|
377
|
+
|
378
|
+
# Returns an object/value from this Document's @html using the provided
|
379
|
+
# xpath param.
|
380
|
+
# singleton ? results.first (single Object) : results (Array)
|
381
|
+
# text_content_only ? result.content (String) : result (nokogiri Object)
|
382
|
+
# A block can be used to set the final value before it is returned.
|
383
|
+
# Return nil from the block if you don't want to override the value.
|
384
|
+
def find_in_html(xpath, singleton: true, text_content_only: true)
|
385
|
+
results = @doc.xpath(xpath)
|
117
386
|
|
118
|
-
|
119
|
-
|
120
|
-
|
387
|
+
if results and not results.empty?
|
388
|
+
result = if singleton
|
389
|
+
text_content_only ? results.first.content : results.first
|
390
|
+
else
|
391
|
+
text_content_only ? results.map(&:content) : results
|
392
|
+
end
|
393
|
+
else
|
394
|
+
result = singleton ? nil : []
|
121
395
|
end
|
122
|
-
|
123
|
-
|
124
|
-
|
396
|
+
|
397
|
+
singleton ? process_str(result) : process_arr(result)
|
398
|
+
|
399
|
+
if block_given?
|
400
|
+
new_result = yield(result)
|
401
|
+
result = new_result if new_result
|
125
402
|
end
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
raise "A search value must be provided" if text.empty?
|
145
|
-
raise "The sentence length value must be even" if sentence_limit.odd?
|
146
|
-
|
147
|
-
results = {}
|
148
|
-
regex = Regexp.new(text, Regexp::IGNORECASE)
|
149
|
-
|
150
|
-
@text.each do |sentence|
|
151
|
-
hits = sentence.scan(regex).count
|
152
|
-
if hits > 0
|
153
|
-
sentence.strip!
|
154
|
-
index = sentence.index(regex)
|
155
|
-
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
156
|
-
results[sentence] = hits
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
return [] if results.empty?
|
161
|
-
results = Hash[results.sort_by { |k, v| v }]
|
162
|
-
results.keys.reverse
|
403
|
+
|
404
|
+
result
|
405
|
+
end
|
406
|
+
|
407
|
+
# Finds a value in the obj using the key.
|
408
|
+
# singleton is used to set the value if not found in obj.
|
409
|
+
# A block can be used to set the final value before it is returned.
|
410
|
+
# Return nil from the block if you don't want to override the value.
|
411
|
+
def find_in_object(obj, key, singleton: true)
|
412
|
+
assert_respond_to(obj, :fetch)
|
413
|
+
|
414
|
+
default = singleton ? nil : []
|
415
|
+
result = obj.fetch(key.to_s, default)
|
416
|
+
singleton ? process_str(result) : process_arr(result)
|
417
|
+
|
418
|
+
if block_given?
|
419
|
+
new_result = yield(result)
|
420
|
+
result = new_result if new_result
|
163
421
|
end
|
422
|
+
|
423
|
+
result
|
424
|
+
end
|
164
425
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
426
|
+
# Initialises an instance variable and defines a getter method for it.
|
427
|
+
# @param var [Symbol] The name of the variable to be initialized.
|
428
|
+
# @param value [Object] The newly initialized variable's value.
|
429
|
+
# @return [Symbol] The name of the newly created getter method.
|
430
|
+
def init_var(var, value)
|
431
|
+
# instance_var_name starts with @, var_name doesn't.
|
432
|
+
var = var.to_s
|
433
|
+
var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
|
434
|
+
instance_var_name = "@#{var_name}".to_sym
|
435
|
+
|
436
|
+
instance_variable_set(instance_var_name, value)
|
437
|
+
|
438
|
+
Document.send(:define_method, var_name) do
|
439
|
+
instance_variable_get(instance_var_name)
|
171
440
|
end
|
441
|
+
end
|
172
442
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
443
|
+
# Takes Document.text_elements and returns an xpath String used to obtain
|
444
|
+
# all of the combined text.
|
445
|
+
def text_elements_xpath
|
446
|
+
xpath = ""
|
447
|
+
return xpath if @@text_elements.empty?
|
448
|
+
el_xpath = "//%s/text()"
|
449
|
+
@@text_elements.each_with_index do |el, i|
|
450
|
+
xpath += " | " unless i == 0
|
451
|
+
xpath += el_xpath % [el]
|
177
452
|
end
|
178
|
-
|
179
|
-
|
453
|
+
xpath
|
454
|
+
end
|
180
455
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
456
|
+
# Processes a String to make it uniform.
|
457
|
+
def process_str(str)
|
458
|
+
if str.is_a?(String)
|
459
|
+
str.encode!('UTF-8', 'UTF-8', invalid: :replace)
|
460
|
+
str.strip!
|
185
461
|
end
|
462
|
+
str
|
463
|
+
end
|
186
464
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
465
|
+
# Processes an Array to make it uniform.
|
466
|
+
def process_arr(array)
|
467
|
+
if array.is_a?(Array)
|
468
|
+
array.map! { |str| process_str(str) }
|
469
|
+
array.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
470
|
+
array.uniq!
|
192
471
|
end
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
472
|
+
array
|
473
|
+
end
|
474
|
+
|
475
|
+
# Modifies internal links by removing this doc's base or host URL, if
|
476
|
+
# present. http://www.google.co.uk/about.html (with or without the
|
477
|
+
# protocol prefix) will become about.html meaning it'll appear within
|
478
|
+
# Document#internal_links.
|
479
|
+
def process_internal_links(links)
|
480
|
+
links.map! do |link|
|
481
|
+
host_or_base = if link.start_with?("http")
|
482
|
+
@url.base
|
483
|
+
else
|
484
|
+
@url.host
|
485
|
+
end
|
486
|
+
if link.start_with?(host_or_base)
|
487
|
+
link.sub!(host_or_base, "")
|
488
|
+
link.replace(link[1..-1]) if link.start_with?("/")
|
489
|
+
link.strip!
|
490
|
+
end
|
491
|
+
link
|
212
492
|
end
|
493
|
+
end
|
494
|
+
|
495
|
+
### Default init_* (Document extension) methods. ###
|
213
496
|
|
214
|
-
|
215
|
-
xpath = ""
|
216
|
-
return xpath if TEXT_ELEMENTS.empty?
|
217
|
-
el_xpath = "//%s/text()"
|
218
|
-
TEXT_ELEMENTS.each_with_index do |el, i|
|
219
|
-
xpath += " | " unless i == 0
|
220
|
-
xpath += el_xpath % [el]
|
221
|
-
end
|
222
|
-
xpath
|
223
|
-
end
|
497
|
+
# Init methods for title.
|
224
498
|
|
225
|
-
|
226
|
-
results = @doc.xpath(xpath)
|
227
|
-
unless results.nil? || results.empty?
|
228
|
-
result = if first_result
|
229
|
-
results.first.content
|
230
|
-
else
|
231
|
-
results.map { |res| res.content }
|
232
|
-
end
|
233
|
-
instance_variable_set(var, result)
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
def init_title
|
238
|
-
@title = nil
|
499
|
+
def init_title_from_html
|
239
500
|
xpath = "//title"
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
501
|
+
result = find_in_html(xpath)
|
502
|
+
init_var(:@title, result)
|
503
|
+
end
|
504
|
+
|
505
|
+
def init_title_from_object(obj)
|
506
|
+
result = find_in_object(obj, "title")
|
507
|
+
init_var(:@title, result)
|
508
|
+
end
|
509
|
+
|
510
|
+
# Init methods for author.
|
511
|
+
|
512
|
+
def init_author_from_html
|
246
513
|
xpath = "//meta[@name='author']/@content"
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
514
|
+
result = find_in_html(xpath)
|
515
|
+
init_var(:@author, result)
|
516
|
+
end
|
517
|
+
|
518
|
+
def init_author_from_object(obj)
|
519
|
+
result = find_in_object(obj, "author")
|
520
|
+
init_var(:@author, result)
|
521
|
+
end
|
522
|
+
|
523
|
+
# Init methods for keywords.
|
524
|
+
|
525
|
+
def init_keywords_from_html
|
253
526
|
xpath = "//meta[@name='keywords']/@content"
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
527
|
+
result = find_in_html(xpath) do |keywords|
|
528
|
+
if keywords
|
529
|
+
keywords = keywords.split(",")
|
530
|
+
process_arr(keywords)
|
531
|
+
end
|
532
|
+
keywords
|
533
|
+
end
|
534
|
+
init_var(:@keywords, result)
|
535
|
+
end
|
536
|
+
|
537
|
+
def init_keywords_from_object(obj)
|
538
|
+
result = find_in_object(obj, "keywords", singleton: false)
|
539
|
+
init_var(:@keywords, result)
|
540
|
+
end
|
259
541
|
|
260
|
-
|
261
|
-
|
542
|
+
# Init methods for links.
|
543
|
+
|
544
|
+
def init_links_from_html
|
262
545
|
xpath = "//a/@href"
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
546
|
+
result = find_in_html(xpath, singleton: false) do |links|
|
547
|
+
if links
|
548
|
+
links.reject! { |link| link == "/" }
|
549
|
+
links.map! do |link|
|
550
|
+
begin
|
551
|
+
Wgit::Url.new(link)
|
552
|
+
rescue
|
553
|
+
nil
|
554
|
+
end
|
555
|
+
end
|
556
|
+
links.reject! { |link| link.nil? }
|
557
|
+
process_internal_links(links)
|
272
558
|
end
|
559
|
+
links
|
273
560
|
end
|
274
|
-
|
275
|
-
process_internal_links(@links)
|
561
|
+
init_var(:@links, result)
|
276
562
|
end
|
277
|
-
|
278
|
-
def
|
279
|
-
|
563
|
+
|
564
|
+
def init_links_from_object(obj)
|
565
|
+
result = find_in_object(obj, "links", singleton: false) do |links|
|
566
|
+
if links
|
567
|
+
links.map! { |link| Wgit::Url.new(link) }
|
568
|
+
end
|
569
|
+
links
|
570
|
+
end
|
571
|
+
init_var(:@links, result)
|
572
|
+
end
|
573
|
+
|
574
|
+
# Init methods for text.
|
575
|
+
|
576
|
+
def init_text_from_html
|
280
577
|
xpath = text_elements_xpath
|
281
|
-
|
282
|
-
|
283
|
-
|
578
|
+
result = find_in_html(xpath, singleton: false)
|
579
|
+
init_var(:@text, result)
|
580
|
+
end
|
581
|
+
|
582
|
+
def init_text_from_object(obj)
|
583
|
+
result = find_in_object(obj, "text", singleton: false)
|
584
|
+
init_var(:@text, result)
|
284
585
|
end
|
285
586
|
|
286
|
-
|
587
|
+
alias :to_hash :to_h
|
287
588
|
alias :relative_links :internal_links
|
288
589
|
alias :relative_urls :internal_links
|
289
590
|
alias :relative_full_links :internal_full_links
|