wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
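
The headline change in the `indexer.rb` diff below is that the module-level convenience methods (`Wgit.index_www`, `Wgit.index_site`, `Wgit.index_page`, `Wgit.indexed_search`) are removed and `Wgit::Indexer.new` now defaults both its database and crawler arguments. A minimal migration sketch based only on the signatures shown in this diff; it assumes a reachable MongoDB instance configured via `ENV['WGIT_CONNECTION_STRING']` (the default documented for `Wgit::Database.new` in 0.7.0), and `example.com` is an illustrative URL:

```ruby
require 'wgit'

# 0.7.0 (removed in the indexer.rb diff below):
#   Wgit.index_site('https://example.com', insert_externals: true)

# 0.10.x equivalent, driven through an Indexer instance:
indexer = Wgit::Indexer.new # database defaults to Wgit::Database.new
indexer.index_site('https://example.com', insert_externals: true) do |doc|
  doc # return nil/false instead to skip saving this Wgit::Document
end
```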
data/lib/wgit/indexer.rb
CHANGED

```diff
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  #
-  # Wgit::
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
       @crawler = crawler
     end
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
           site_docs_count = 0
           ext_links = @crawler.crawl_site(url) do |doc|
-
+            unless doc.empty?
+              write_doc_to_db(doc)
              docs_count += 1
              site_docs_count += 1
            end
@@ -198,12 +78,9 @@ database capacity, exiting.")
           raise 'Error updating url' unless @db.update(url) == 1
 
           urls_count += write_urls_to_db(ext_links)
-
-          Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
         end
 
-        Wgit.logger.info("Crawled and
+        Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals:
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = {
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty?
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.
+      @db.upsert(url)
 
       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end
 
-      Wgit.logger.info("Crawled and
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
    #   document from being saved into the database.
-    def
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.
-
-
-
-
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
```
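
The additions above introduce `index_urls` (aliased as `index`) alongside the reworked `index_site` (aliased as `index_r`), whose `follow:` keyword takes an xpath selecting the links to crawl. A rough usage sketch based on those signatures; the xpath, URLs and path pattern are illustrative only, and a database reachable via `ENV['WGIT_CONNECTION_STRING']` is assumed:

```ruby
require 'wgit'

indexer = Wgit::Indexer.new # defaults: Wgit::Database.new, Wgit::Crawler.new

# Index individual pages; external links are no longer inserted by default.
indexer.index_urls('https://example.com/about', 'https://example.com/contact')

# Crawl a whole site, but only follow blog links and skip the admin area.
indexer.index_site(
  'https://example.com',
  follow: "//a[starts-with(@href, '/blog')]/@href",
  disallow_paths: 'admin/*'
) do |doc|
  doc # return nil/false instead to skip saving this Wgit::Document
end
```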
data/lib/wgit/response.rb
CHANGED

```diff
@@ -1,5 +1,5 @@
 module Wgit
-  # Response class
+  # Response class modeling a generic HTTP GET response.
   class Response
     # The underlying HTTP adapter/library response object.
     attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
     # @param headers [Hash] The new response headers.
     # @return [Hash] @headers's new value.
     def headers=(headers)
-
+      unless headers
+        @headers = {}
+        return
+      end
 
       @headers = headers.map do |k, v|
         k = k.downcase.gsub('-', '_').to_sym
@@ -131,11 +134,11 @@ module Wgit
       @status.positive?
     end
 
-    alias code
-    alias content
-    alias
-    alias to_s
-    alias redirects
-    alias length
+    alias code status
+    alias content body
+    alias crawl_duration total_time
+    alias to_s body
+    alias redirects redirections
+    alias length size
   end
 end
```
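
A small sketch of the `Response` changes above: assigning `nil` headers now yields an empty Hash, header keys are normalised to underscored symbols by `headers=`, and the aliases map readers such as `code` and `content` onto `status` and `body`. It assumes `Wgit::Response.new` takes no arguments, which this diff doesn't show:

```ruby
require 'wgit'

res = Wgit::Response.new

res.headers = nil
res.headers      #=> {} (per the new guard clause in headers=)

res.headers = { 'Content-Type' => 'text/html' }
res.headers.keys #=> [:content_type] (downcased, '-' replaced with '_', symbolised)

res.code         # alias for res.status
res.content      # alias for res.body
```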