wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/dsl.rb
CHANGED

@@ -44,14 +44,14 @@ the 'start' function".freeze
       Wgit::Document.define_extractor(var, xpath, opts, &block)
     end
 
-    #
-    #
-    # more details.
+    # Sets and returns the Wgit::Crawler used in subsequent crawls including
+    # indexing. Defaults to `Wgit::Crawler.new` if not given a param. See the
+    # Wgit::Crawler documentation for more details.
     #
-    # @yield [crawler]
-    # @return [Wgit::Crawler] The
-    def crawler
-      @dsl_crawler
+    # @yield [crawler] Given the DSL crawler; use the block to configure.
+    # @return [Wgit::Crawler] The crawler instance used by the DSL.
+    def use_crawler(crawler = nil)
+      @dsl_crawler = crawler || @dsl_crawler || Wgit::Crawler.new
       yield @dsl_crawler if block_given?
       @dsl_crawler
     end

@@ -66,7 +66,7 @@ the 'start' function".freeze
     # @yield [crawler] The crawler that'll be used in the subsequent
     # crawl/index; use the block to configure.
     def start(*urls, &block)
-
+      use_crawler(&block) if block_given?
      @dsl_start = urls
    end
 

@@ -101,7 +101,7 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       urls.map! { |url| Wgit::Url.parse(url) }
-
+      get_crawler.crawl_urls(*urls, follow_redirects:, &block)
     end
 
     # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no

@@ -138,42 +138,41 @@ the 'start' function".freeze
       opts = { follow: xpath, allow_paths:, disallow_paths: }
 
       urls.reduce([]) do |externals, url|
-        externals +
+        externals + get_crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
       end
     end
 
-    # Returns the DSL's `
+    # Returns the DSL's `Wgit::Crawler#last_response`.
     #
     # @return [Wgit::Response] The response from the last URL crawled.
     def last_response
-
+      get_crawler.last_response
     end
 
     # Nilifies the DSL instance variables.
     def reset
-      @dsl_crawler
-      @dsl_start
-      @dsl_follow
-      @
+      @dsl_crawler = nil
+      @dsl_start = nil
+      @dsl_follow = nil
+      @dsl_db = nil
     end
 
     ### INDEXER METHODS ###
 
-    # Defines the
-    # method calls. This method is optional as
-    #
+    # Defines the connected database instance used in subsequent index and DB
+    # method calls. This method is optional however, as a new instance of the
+    # Wgit::Database.adapter_class will be initialised otherwise. Therefore
+    # if not calling this method, you should ensure
+    # ENV['WGIT_CONNECTION_STRING'] is set or the connection will fail.
     #
-    # @param
-    #
-    def
-      @
+    # @param db [Wgit::Database::DatabaseAdapter] The connected database
+    #   instance used in subsequent `index*` method calls.
+    def use_database(db)
+      @dsl_db = db
     end
 
     # Indexes the World Wide Web using `Wgit::Indexer#index_www` underneath.
     #
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param max_sites [Integer] The number of separate and whole
     #   websites to be crawled before the method exits. Defaults to -1 which
     #   means the crawl will occur until manually stopped (Ctrl+C etc).

@@ -181,11 +180,8 @@ the 'start' function".freeze
     #   scraped from the web (default is 1GB). Note, that this value is used to
     #   determine when to stop crawling; it's not a guarantee of the max data
     #   that will be obtained.
-    def index_www(
-
-    )
-      db = Wgit::Database.new(connection_string)
-      indexer = Wgit::Indexer.new(db, crawler)
+    def index_www(max_sites: -1, max_data: 1_048_576_000)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
       indexer.index_www(max_sites:, max_data:)
     end

@@ -194,9 +190,6 @@ the 'start' function".freeze
     #
     # @param urls [*String, *Wgit::Url] The base URL(s) of the website(s) to
     #   crawl. Can be set using `start`.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @param follow [String] The xpath extracting links to be followed during

@@ -213,15 +206,13 @@ the 'start' function".freeze
     #   set.
     # @return [Integer] The total number of pages crawled within the website.
     def index_site(
-      *urls,
-      insert_externals: false, follow: @dsl_follow,
+      *urls, insert_externals: false, follow: @dsl_follow,
       allow_paths: nil, disallow_paths: nil, &block
     )
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
       xpath = follow || :default
       crawl_opts = {
         insert_externals:, follow: xpath, allow_paths:, disallow_paths:

@@ -236,9 +227,6 @@ the 'start' function".freeze
     #
     # @param urls [*Wgit::Url] The webpage URL's to crawl. Defaults to the
     #   `start` URL(s).
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external URL's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,

@@ -247,15 +235,11 @@ the 'start' function".freeze
     #   document from being saved into the database.
     # @raise [StandardError] If no urls are provided and no `start` URL has
     #   been set.
-    def index(
-      *urls, connection_string: @dsl_conn_str,
-      insert_externals: false, &block
-    )
+    def index(*urls, insert_externals: false, &block)
       urls = (@dsl_start || []) if urls.empty?
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
-
-      indexer = Wgit::Indexer.new(db, crawler)
+      indexer = Wgit::Indexer.new(get_db, get_crawler)
 
       urls.map! { |url| Wgit::Url.parse(url) }
       indexer.index_urls(*urls, insert_externals:, &block)

@@ -264,13 +248,11 @@ the 'start' function".freeze
     ### DATABASE METHODS ###
 
     # Performs a search of the database's indexed documents and pretty prints
-    # the results in a search engine-esque format. See
-    # and `Wgit::Document#search!`
+    # the results in a search engine-esque format. See
+    # `Wgit::Database::DatabaseAdapter#search` and `Wgit::Document#search!`
+    # for details of how the search methods work.
     #
     # @param query [String] The text query to search with.
-    # @param connection_string [String] The database connection string. Set as
-    #   nil to use ENV['WGIT_CONNECTION_STRING'] or set using
-    #   `connection_string`.
     # @param stream [nil, #puts] Any object that respond_to?(:puts). It is used
     #   to output text somewhere e.g. a file or STDERR. Use nil for no output.
     # @param case_sensitive [Boolean] Whether character case must match.

@@ -284,38 +266,53 @@ the 'start' function".freeze
     #   database containing only its matching `#text`.
     # @return [Array<Wgit::Document>] The search results with matching text.
     def search(
-      query,
+      query, stream: $stdout,
+      top_result_only: true, include_score: false,
       case_sensitive: false, whole_sentence: true,
-      limit: 10, skip: 0, sentence_limit: 80
+      limit: 10, skip: 0, sentence_limit: 80
     )
       stream ||= File.open(File::NULL, 'w')
-      db = Wgit::Database.new(connection_string)
 
-      results =
-        query,
-        case_sensitive:, whole_sentence:,
-        limit:, skip:,
-        sentence_limit:, &block
-      )
+      results = get_db.search(
+        query, case_sensitive:, whole_sentence:, limit:, skip:)
 
-
+      results.each do |doc|
+        doc.search_text!(
+          query, case_sensitive:, whole_sentence:, sentence_limit:)
+        yield(doc) if block_given?
+      end
+
+      if top_result_only
+        Wgit::Utils.pprint_top_search_results(results, include_score:, stream:)
+      else
+        Wgit::Utils.pprint_all_search_results(results, include_score:, stream:)
+      end
 
       results
     end
 
     # Deletes everything in the urls and documents collections by calling
-    # `Wgit::Database#
-    # so yeah... be careful.
+    # `Wgit::Database::DatabaseAdapter#empty` underneath.
     #
     # @return [Integer] The number of deleted records.
-    def
-
-
+    def empty_db!
+      get_db.empty
+    end
+
+    private
+
+    def get_crawler
+      @dsl_crawler ||= Wgit::Crawler.new
+    end
+
+    def get_db
+      @dsl_db ||= Wgit::Database.new
     end
 
     alias_method :crawl_url, :crawl
     alias_method :crawl_r, :crawl_site
     alias_method :index_r, :index_site
+    alias_method :index_url, :index
     alias_method :start_urls, :start
   end
 end
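Taken together, the DSL changes above replace the old `crawler`/`connection_string` plumbing with `use_crawler`/`use_database` plus lazy `get_crawler`/`get_db` defaults. A minimal sketch of how a 0.12.x DSL script might be wired up, assuming `ENV['WGIT_CONNECTION_STRING']` points at a reachable database; the example.com URL and the `time_out` attribute are illustrative assumptions, not taken from this diff:

```ruby
require "wgit"

include Wgit::DSL

# Configure the crawler once; subsequent crawl/index calls reuse it.
use_crawler do |crawler|
  crawler.time_out = 10 # Assumed Wgit::Crawler attribute; see its docs.
end

start "https://example.com"

# Crawl the start URL(s) and print each page's title.
crawl { |doc| puts doc.title }

# Index the whole site, then search and pretty print all matches.
index_site
search "example", top_result_only: false
```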
data/lib/wgit/html_to_text.rb
ADDED

@@ -0,0 +1,277 @@
+require_relative "utils"
+require_relative "assertable"
+require "nokogiri"
+
+module Wgit
+  # Class used to extract the visible page text from a HTML string.
+  # This is in turn used to set the output of a Wgit::Document#text method.
+  class HTMLToText
+    include Assertable
+
+    # Set of text elements used to extract the visible text.
+    # The element's display (:inline or :block) is used to delimit sentences e.g.
+    # <div>foo</div><div>bar</div> will be extracted as ['foo', 'bar'] whereas
+    # <span>foo</span><span>bar</span> will be extracted as ['foobar'].
+    @text_elements = {
+      a: :inline,
+      abbr: :inline,
+      address: :block,
+      article: :block,
+      aside: :block,
+      b: :inline,
+      bdi: :inline,
+      bdo: :inline,
+      blockquote: :block,
+      br: :block,
+      button: :block, # Normally inline but Wgit treats as block.
+      caption: :block,
+      cite: :inline,
+      code: :inline,
+      data: :inline,
+      dd: :block,
+      del: :inline,
+      details: :block,
+      dfn: :inline,
+      div: :block,
+      dl: :block,
+      dt: :block,
+      em: :inline,
+      figcaption: :block,
+      figure: :block,
+      footer: :block,
+      h1: :block,
+      h2: :block,
+      h3: :block,
+      h4: :block,
+      h5: :block,
+      h6: :block,
+      header: :block,
+      hr: :block,
+      i: :inline,
+      input: :inline,
+      ins: :block,
+      kbd: :inline,
+      label: :inline,
+      legend: :block,
+      li: :block,
+      main: :block,
+      mark: :inline,
+      meter: :block,
+      ol: :block,
+      option: :block,
+      output: :block,
+      p: :block,
+      pre: :block,
+      q: :inline,
+      rb: :inline,
+      rt: :inline,
+      ruby: :inline,
+      s: :inline,
+      samp: :inline,
+      section: :block,
+      small: :inline,
+      span: :inline,
+      strong: :inline,
+      sub: :inline,
+      summary: :block,
+      sup: :inline,
+      td: :block,
+      textarea: :block,
+      th: :block,
+      time: :inline,
+      u: :inline,
+      ul: :block,
+      var: :inline,
+      wbr: :inline
+    }
+
+    class << self
+      # Set of HTML elements that make up the visible text on a page. These
+      # elements are used to initialize the Wgit::Document#text. See the
+      # README.md for how to add to this Hash dynamically.
+      attr_reader :text_elements
+    end
+
+    # The Nokogiri::HTML document object initialized from a HTML string.
+    attr_reader :parser
+
+    # Creates a new HTML to text extractor instance.
+    #
+    # @param parser [Nokogiri::HTML4::Document] The nokogiri parser object.
+    # @raise [StandardError] If the given parser is of an invalid type.
+    def initialize(parser)
+      assert_type(parser, Nokogiri::HTML4::Document)
+
+      @parser = parser
+    end
+
+    # Extracts and returns the text sentences from the @parser HTML.
+    #
+    # @return [Array<String>] An array of unique text sentences.
+    def extract_arr
+      return [] if @parser.to_s.empty?
+
+      text_str = extract_str
+
+      # Split the text_str into an Array of text sentences.
+      text_str
+        .split("\n")
+        .map(&:strip)
+        .reject(&:empty?)
+    end
+
+    # Extracts and returns a text string from the @parser HTML.
+    #
+    # @return [String] A string of text with \n delimiting sentences.
+    def extract_str
+      text_str = ""
+
+      iterate_child_nodes(@parser) do |node, display|
+        # Handle any special cases e.g. skip nodes we don't care about...
+        # <pre> nodes should have their contents displayed exactly as is.
+        if node_name(node) == :pre
+          text_str << "\n"
+          text_str << node.text
+          next
+        end
+
+        # Skip any child node of <pre> since they're handled as a special case above.
+        next if child_of?(:pre, node)
+
+        if node.text?
+          # Skip any text element that is purely whitespace.
+          next unless valid_text_content?(node.text)
+        else
+          # Skip a concrete node if it has other concrete child nodes as these
+          # will be iterated onto later.
+          #
+          # Process if node has no children or one child which is a valid text node.
+          next unless node.children.empty? || parent_of_text_node_only?(node)
+        end
+
+        # Apply display rules deciding if a new line is needed before node.text.
+        add_new_line = false
+        prev = prev_sibling_or_parent(node)
+
+        if node.text?
+          add_new_line = true unless prev && inline?(prev)
+        else
+          add_new_line = true if display == :block
+          add_new_line = true if prev && block?(prev)
+        end
+
+        text_str << "\n" if add_new_line
+        text_str << format_text(node.text)
+      end
+
+      text_str
+        .strip
+        .squeeze("\n")
+        .squeeze(" ")
+    end
+
+    private
+
+    def node_name(node)
+      node.name&.downcase&.to_sym
+    end
+
+    def display(node)
+      name = node_name(node)
+      Wgit::HTMLToText.text_elements[name]
+    end
+
+    def inline?(node)
+      display(node) == :inline
+    end
+
+    def block?(node)
+      display(node) == :block
+    end
+
+    # Returns the previous sibling of node or nil. Only valid text elements are
+    # returned i.e. non duplicates with valid text content.
+    def prev_sibling(node)
+      prev = node.previous
+
+      return nil unless prev
+      return prev unless prev.text?
+      return prev if valid_text_node?(prev) && !contains_new_line?(prev.text)
+      return prev if valid_text_node?(prev) && !format_text(prev.text).strip.empty?
+
+      prev.previous
+    end
+
+    # Returns node's previous sibling, parent or nil; in that order. Only valid
+    # text elements are returned i.e. non duplicates with valid text content.
+    def prev_sibling_or_parent(node)
+      prev = prev_sibling(node)
+      return prev if prev
+
+      node.parent
+    end
+
+    def child_of?(ancestor_name, node)
+      node.ancestors.any? { |ancestor| node_name(ancestor) == ancestor_name }
+    end
+
+    # Returns true if any of the child nodes contain a non empty :text node.
+    def parent_of_text_node?(node)
+      node.children.any? { |child| child.text? && valid_text_content?(child.text) }
+    end
+
+    def parent_of_text_node_only?(node)
+      node.children.size == 1 && parent_of_text_node?(node)
+    end
+
+    # Returns true if text is not empty having removed all new lines.
+    def valid_text_content?(text)
+      !format_text(text).empty?
+    end
+
+    # Returns true if node is a text node.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def valid_text_node?(node)
+      node.text? && node.text != node.parent.text
+    end
+
+    def contains_new_line?(text)
+      ["\n", '\\n'].any? { |new_line| text.include?(new_line) }
+    end
+
+    # Remove special characters including any new lines; as semantic HTML will
+    # typically use <br> and/or block elements to denote a line break.
+    def format_text(text)
+      text
+        .encode("UTF-8", undef: :replace, invalid: :replace)
+        .gsub("\n", "")
+        .gsub('\\n', "")
+        .gsub("\r", "")
+        .gsub('\\r', "")
+        .gsub("\f", "")
+        .gsub('\\f', "")
+        .gsub("\t", "")
+        .gsub('\\t', "")
+        .gsub("‌", "")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub(" ", " ")
+        .gsub('\u00a0', " ")
+    end
+
+    # Iterate over node and it's child nodes, yielding each to &block.
+    # Only HTMLToText.text_elements or valid :text nodes will be yielded.
+    # Duplicate text nodes (that follow a concrete node) are omitted.
+    def iterate_child_nodes(node, &block)
+      display = display(node)
+      text_node = valid_text_node?(node)
+
+      yield(node, display) if display || text_node
+      node.children.each { |child| iterate_child_nodes(child, &block) }
+    end
+
+    alias_method :extract, :extract_arr
+  end
+end
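Since `HTMLToText` drives `Wgit::Document#text`, the block/inline display rules above are easiest to see in isolation. A small sketch; the HTML snippet is made up and the exact output is hand-derived from the rules above, so treat it as an assumption:

```ruby
require "nokogiri"
require "wgit"

html = "<html><body>" \
       "<div>foo</div><div>bar</div>" \
       "<p><span>baz</span><span>qux</span></p>" \
       "</body></html>"

# With a recent Nokogiri, Nokogiri::HTML returns the
# Nokogiri::HTML4::Document that the constructor asserts on.
parser = Nokogiri::HTML(html)
text   = Wgit::HTMLToText.new(parser)

text.extract_str # => "foo\nbar\nbazqux" - blocks delimit, inlines join.
text.extract_arr # => ["foo", "bar", "bazqux"]
```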
data/lib/wgit/indexer.rb
CHANGED

@@ -1,12 +1,23 @@
 # frozen_string_literal: true
 
+require_relative 'assertable'
 require_relative 'crawler'
-require_relative 'database/
+require_relative 'database/database_adapter'
 
 module Wgit
   # Class which crawls and saves the Documents to a database. Can be thought of
-  # as a combination of Wgit::Crawler and Wgit::Database.
+  # as a combination of Wgit::Crawler and Wgit::Database::DatabaseAdapter.
   class Indexer
+    include Assertable
+
+    # The ENV var used to omit and ignore robots.txt parsing during an index.
+    # Applies to all index_* methods if set in the ENV.
+    WGIT_IGNORE_ROBOTS_TXT = "WGIT_IGNORE_ROBOTS_TXT".freeze
+
+    # The block return value used to skip saving a crawled document to the
+    # database. Applies to all index_* methods that take a block.
+    SKIP_UPSERT = :skip.freeze
+
     # The crawler used to index the WWW.
     attr_reader :crawler
 

@@ -15,10 +26,13 @@ module Wgit
 
     # Initialize the Indexer.
     #
-    # @param database [Wgit::Database] The database instance
-    #   initialized and connected) used
-    # @param crawler [Wgit::Crawler] The crawler instance used
+    # @param database [Wgit::Database::DatabaseAdapter] The database instance
+    #   (already initialized and connected) used for indexing.
+    # @param crawler [Wgit::Crawler] The crawler instance used for indexing.
     def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
+      assert_type(database, Wgit::Database::DatabaseAdapter)
+      assert_type(crawler, Wgit::Crawler)
+
       @db = database
       @crawler = crawler
     end

@@ -143,11 +157,10 @@ future iterations")
         next if no_index?(@crawler.last_response, doc)
 
         result = block_given? ? yield(doc) : true
+        next if doc.empty? || result == SKIP_UPSERT
 
-
-
-          total_pages_indexed += 1
-        end
+        upsert_doc(doc)
+        total_pages_indexed += 1
       end
 
       upsert_url_and_redirects(url)

@@ -207,7 +220,9 @@ for the site: #{url}")
        break if no_index?(@crawler.last_response, doc)
 
        result = block_given? ? yield(doc) : true
-
+       break if doc.empty? || result == SKIP_UPSERT
+
+       upsert_doc(doc)
      end
 
      upsert_url_and_redirects(url)

@@ -285,6 +300,8 @@ for the site: #{url}")
 
     # Crawls and parses robots.txt file (if found). Returns the parser or nil.
     def parse_robots_txt(url)
+      return nil if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
       robots_url = url.to_origin.join('/robots.txt')
 
       Wgit.logger.info("Crawling for robots.txt: #{robots_url}")

@@ -328,6 +345,8 @@ for the site: #{url}")
 
     # Returns if the last_response or doc #no_index? is true or not.
     def no_index?(last_response, doc)
+      return false if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
       url = last_response.url.to_s
       if last_response.no_index?
         Wgit.logger.info("Skipping page due to no-index response header: #{url}")
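Two new `Indexer` hooks are worth calling out: setting `ENV['WGIT_IGNORE_ROBOTS_TXT']` disables both the robots.txt crawl and the no-index checks, while returning `SKIP_UPSERT` (`:skip`) from an `index_*` block prevents the yielded document from being upserted. A hedged sketch; the `Wgit::Database::InMemory` class name is inferred from the adapter file listed above and the example.com URL is illustrative:

```ruby
require "wgit"

# Any non-nil value switches off robots.txt parsing and no-index checks.
ENV["WGIT_IGNORE_ROBOTS_TXT"] = "1"

db      = Wgit::Database::InMemory.new # Assumed in-memory adapter class.
indexer = Wgit::Indexer.new(db, Wgit::Crawler.new)

indexer.index_site(Wgit::Url.new("https://example.com")) do |doc|
  # Returning :skip (SKIP_UPSERT) stops this document being saved.
  next Wgit::Indexer::SKIP_UPSERT if doc.text.empty?

  doc
end
```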
data/lib/wgit/logger.rb
CHANGED

@@ -2,7 +2,7 @@
 
 # FYI: The default logger is set at the bottom of this file.
 
-require
+require "logger"
 
 module Wgit
   # The Logger instance used by Wgit. Set your own custom logger after

@@ -28,7 +28,7 @@ module Wgit
   #
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    logger = Logger.new($stdout, progname:
+    logger = Logger.new($stdout, progname: "wgit", level: :info)
     logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
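Because the default is a standard Ruby `Logger`, the `default_logger` shown above can be swapped out wholesale, per the "set your own custom logger" comment in the source. A small sketch, assuming `Wgit.logger=` remains the setter; the `crawl.log` path is illustrative:

```ruby
require "logger"
require "wgit"

# Route Wgit's log output to a file instead of $stdout.
Wgit.logger = Logger.new("crawl.log", progname: "wgit", level: :debug)

Wgit.logger.info("indexing started") # Written to ./crawl.log.
```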