wgit 0.7.0 → 0.10.1

data/lib/wgit/indexer.rb CHANGED
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  # Convience method to index the World Wide Web using
-  # Wgit::Indexer#index_www.
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
       @crawler = crawler
     end
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
           site_docs_count = 0
           ext_links = @crawler.crawl_site(url) do |doc|
-            if !doc.empty? && write_doc_to_db(doc)
+            unless doc.empty?
+              write_doc_to_db(doc)
               docs_count += 1
               site_docs_count += 1
             end
@@ -198,12 +78,9 @@ database capacity, exiting.")
           raise 'Error updating url' unless @db.update(url) == 1
 
           urls_count += write_urls_to_db(ext_links)
-
-          Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
         end
 
-        Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+        Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty? && write_doc_to_db(doc)
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end
 
-      Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the webpage's
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
     #   document from being saved into the database.
-    def index_page(url, insert_externals: true)
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-        result = yield(doc) if block_given?
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.insert(doc)
-      Wgit.logger.info("Saved document for url: #{doc.url}")
-
-      true
-    rescue Mongo::Error::OperationFailure
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
+    # @return [Integer] The number of inserted urls.
    def write_urls_to_db(urls)
      count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
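
Taken together, the indexer changes above rework the public API: `Wgit::Indexer.new` now defaults both its database and crawler, `insert_externals` defaults to `false`, `index_page` becomes `index_url`, a variadic `index_urls` (aliased as `index`) is added, and `index_site` gains a `follow:` xpath. The snippet below is a minimal usage sketch based only on the signatures visible in this diff; the URL, the `allow_paths` pattern and the block filter are hypothetical, and a reachable database (e.g. via `ENV['WGIT_CONNECTION_STRING']`) is assumed.

```ruby
require 'wgit'

# Assumes ENV['WGIT_CONNECTION_STRING'] points at a running database.
indexer = Wgit::Indexer.new # database and crawler now default to new instances

# Index a whole site. insert_externals now defaults to false, and the crawl
# can be steered with follow:, allow_paths: and disallow_paths:.
indexer.index_site(
  Wgit::Url.new('https://example.com'), # hypothetical URL
  allow_paths: 'articles/*'             # hypothetical fnmatch pattern
) do |doc|
  !doc.title.nil? # return nil/false to skip saving a crawled document
end

# Index individual pages (index_page has been renamed to index_url);
# `index` is an alias of the new variadic index_urls.
indexer.index_url(Wgit::Url.new('https://example.com/about'))
indexer.index(
  Wgit::Url.new('https://example.com/a'),
  Wgit::Url.new('https://example.com/b')
)
```

Note that because `insert_externals` now defaults to `false`, external links are no longer written to the database unless you opt in explicitly.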
data/lib/wgit/response.rb CHANGED
@@ -1,5 +1,5 @@
 module Wgit
-  # Response class representing a generic HTTP crawl response.
+  # Response class modeling a generic HTTP GET response.
   class Response
     # The underlying HTTP adapter/library response object.
     attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
     # @param headers [Hash] The new response headers.
     # @return [Hash] @headers's new value.
     def headers=(headers)
-      return @headers = {} unless headers
+      unless headers
+        @headers = {}
+        return
+      end
 
       @headers = headers.map do |k, v|
         k = k.downcase.gsub('-', '_').to_sym
@@ -131,11 +134,11 @@ module Wgit
       @status.positive?
     end
 
-    alias code status
-    alias content body
-    alias crawl_time total_time
-    alias to_s body
-    alias redirects redirections
-    alias length size
+    alias code           status
+    alias content        body
+    alias crawl_duration total_time
+    alias to_s           body
+    alias redirects      redirections
+    alias length         size
   end
 end
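
For `Wgit::Response`, the visible behavioural change is the alias rename: `crawl_time` (0.7.0) becomes `crawl_duration` (0.10.1), still pointing at `#total_time`. A small sketch of the effect, assuming `Wgit::Crawler` exposes its most recent `Wgit::Response` via a `#last_response` reader (an assumption, not shown in this diff):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
crawler.crawl_url(Wgit::Url.new('https://example.com')) # hypothetical URL
response = crawler.last_response # assumed accessor returning a Wgit::Response

response.crawl_duration == response.total_time # => true (was crawl_time in 0.7.0)
response.code                                  # alias of #status, unchanged
response.respond_to?(:crawl_time)              # => false in 0.10.1
```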