wgit 0.7.0 → 0.10.1
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
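
The headline addition is data/lib/wgit/dsl.rb (+324 lines), a DSL wrapping Wgit::Crawler, the Document extractors and Wgit::Indexer. As a sketch of how it reads, adapted from the gem's README of this era (the URL and xpaths are illustrative, not part of the gem):

    require 'wgit'

    include Wgit::DSL

    start  'http://quotes.toscrape.com/tag/humor/'
    follow "//li[@class='next']/a/@href"

    extract :quotes,  "//div[@class='quote']/span[@class='text']", singleton: false
    extract :authors, "//div[@class='quote']/span/small",          singleton: false

    quotes = []

    # Crawl the whole site, following the `follow` xpath between pages.
    crawl_site do |doc|
      doc.quotes.zip(doc.authors).each do |arr|
        quotes << { quote: arr.first, author: arr.last }
      end
    end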
data/lib/wgit/indexer.rb
CHANGED
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  # Convience method to index the World Wide Web using
-  # Wgit::Indexer#index_www.
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
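
The four module-level helpers (Wgit.index_www, Wgit.index_site, Wgit.index_page and Wgit.indexed_search) are removed outright; their behaviour now lives on Wgit::Indexer instances (and the new DSL). A minimal migration sketch, assuming ENV['WGIT_CONNECTION_STRING'] points at a running MongoDB:

    require 'wgit'

    indexer = Wgit::Indexer.new # Builds its own Wgit::Database, see the next hunk.

    indexer.index_www(max_sites: 1)           # was Wgit.index_www
    indexer.index_site('https://example.com') # was Wgit.index_site
    indexer.index_url('https://example.com')  # was Wgit.index_page (renamed below)

The removed indexed_search body remains a useful recipe: Database#search for the results, Document#search! on each for the text snippets, then Wgit::Utils.printf_search_results to print them.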
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
      @db = database
      @crawler = crawler
    end
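
With the new default argument, a bare Wgit::Indexer.new now suffices; the two lines below are equivalent when the connection string env var is set (per the removed docs above, Database.new falls back to ENV['WGIT_CONNECTION_STRING']):

    indexer = Wgit::Indexer.new
    indexer = Wgit::Indexer.new(Wgit::Database.new, Wgit::Crawler.new)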
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
         site_docs_count = 0
         ext_links = @crawler.crawl_site(url) do |doc|
-          if write_doc_to_db(doc)
+          unless doc.empty?
+            write_doc_to_db(doc)
             docs_count += 1
             site_docs_count += 1
           end
@@ -198,12 +78,9 @@ database capacity, exiting.")
        raise 'Error updating url' unless @db.update(url) == 1
 
        urls_count += write_urls_to_db(ext_links)
-
-        Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
      end
 
-      Wgit.logger.info("Crawled and saved #{docs_count} docs \
+      Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
overall for this iteration.")
      Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty? && write_doc_to_db(doc)
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end
 
-      Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the website's
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
     #   document from being saved into the database.
-    def index_page(url, insert_externals: true)
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-        result = yield(doc) if block_given?
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
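
The reworked index_site is best seen through its new keywords, and index_urls simply fans each argument out to index_url. A hedged usage sketch (the xpath and glob below are illustrative, not defaults):

    indexer = Wgit::Indexer.new

    # Follow only pagination links instead of the :default (every <a> href).
    indexer.index_site(
      'http://quotes.toscrape.com/',
      follow: "//li[@class='next']/a/@href",
      disallow_paths: 'login*'
    ) do |doc|
      !doc.title.nil? # Return nil/false to veto saving this page.
    end

    indexer.index_urls('https://example.com', 'https://example.org')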
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.insert(doc)
-      Wgit.logger.info("Saved document for url: #{doc.url}")
-
-      true
-    rescue Mongo::Error::OperationFailure
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [Integer] The number of urls written to the DB.
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
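
The new aliases give the class a terser surface, matching the DSL's verbs:

    indexer.index('https://example.com')   # alias for index_urls
    indexer.index_r('https://example.com') # alias for index_site (recursive)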
data/lib/wgit/response.rb
CHANGED
@@ -1,5 +1,5 @@
 module Wgit
-  # Response class
+  # Response class modeling a generic HTTP GET response.
   class Response
     # The underlying HTTP adapter/library response object.
     attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
     # @param headers [Hash] The new response headers.
     # @return [Hash] @headers's new value.
     def headers=(headers)
-      return @headers = {} unless headers
+      unless headers
+        @headers = {}
+        return
+      end
 
       @headers = headers.map do |k, v|
         k = k.downcase.gsub('-', '_').to_sym
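
The guard makes assigning nil safe, and the unchanged map below it normalises header keys (downcased, dashes to underscores, symbolised), so the observable behaviour is:

    response = Wgit::Response.new

    response.headers = nil
    response.headers # => {}

    response.headers = { 'Content-Type' => 'text/html' }
    response.headers # => { content_type: 'text/html' }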
@@ -131,11 +134,11 @@ module Wgit
       @status.positive?
     end
 
-    alias code       status
-    alias content    body
-    alias crawl_time total_time
-    alias to_s       body
-    alias redirects  redirections
-    alias length     size
+    alias code status
+    alias content body
+    alias crawl_duration total_time
+    alias to_s body
+    alias redirects redirections
+    alias length size
   end
 end
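
The reworked aliases map one-to-one onto the underlying readers:

    response.code           # == response.status
    response.content        # == response.body
    response.crawl_duration # == response.total_time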