wgit 0.7.0 → 0.10.1

data/lib/wgit/indexer.rb CHANGED
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  # Convience method to index the World Wide Web using
-  # Wgit::Indexer#index_www.
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
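
The hunk above removes the module-level convenience wrappers (Wgit.index_www, Wgit.index_site, Wgit.index_page and Wgit.indexed_search), which only built a Wgit::Database plus a Wgit::Indexer and delegated to them. Going by the removed code, the same WWW crawl can still be driven through the class API directly; a minimal sketch, where the limits are illustrative and the ENV['WGIT_CONNECTION_STRING'] fallback is an assumption carried over from the removed docs:

  require 'wgit'

  # Assumption from the removed wrapper docs: with no connection string given,
  # the Database falls back to ENV['WGIT_CONNECTION_STRING'].
  db      = Wgit::Database.new
  indexer = Wgit::Indexer.new(db)

  # Mirrors the removed Wgit.index_www wrapper: crawl until 10 whole sites have
  # been indexed or ~1GB of data has been scraped, whichever comes first.
  indexer.index_www(max_sites: 10, max_data: 1_048_576_000)
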
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
      @db = database
      @crawler = crawler
    end
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
           site_docs_count = 0
           ext_links = @crawler.crawl_site(url) do |doc|
-            if !doc.empty? && write_doc_to_db(doc)
+            unless doc.empty?
+              write_doc_to_db(doc)
               docs_count += 1
               site_docs_count += 1
             end
@@ -198,12 +78,9 @@ database capacity, exiting.")
           raise 'Error updating url' unless @db.update(url) == 1
 
           urls_count += write_urls_to_db(ext_links)
-
-          Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
         end
 
-        Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
+        Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param allow_paths [String, Array<String>] Filters links by selecting
-    #   them if their path `File.fnmatch?` one of allow_paths.
-    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    #   them if their path `File.fnmatch?` one of disallow_paths.
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals: true, allow_paths: nil, disallow_paths: nil
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty? && write_doc_to_db(doc)
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end
 
-      Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the webpage's
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
     #   document from being saved into the database.
-    def index_page(url, insert_externals: true)
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-        result = yield(doc) if block_given?
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.url?(url) ? @db.update(url) : @db.insert(url)
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.insert(doc)
-      Wgit.logger.info("Saved document for url: #{doc.url}")
-
-      true
-    rescue Mongo::Error::OperationFailure
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
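
Taken together, this file's hunks reshape the Indexer's public API: insert_externals now defaults to false, index_page becomes index_url, a variadic index_urls (aliased as index) is added, index_site (aliased as index_r) gains a follow: xpath controlling which links are crawled, and document/url writes go through Database#upsert. A rough usage sketch of that surface as it appears in this diff; the URLs, xpath and block logic are illustrative only:

  require 'wgit'

  # Both arguments now default, per the initialize change above.
  indexer = Wgit::Indexer.new

  # Index a single page, then several at once (index_urls is aliased as #index).
  indexer.index_url('https://example.com/about')
  indexer.index_urls('https://example.com/', 'https://example.org/')

  # Index a whole site, following only the links matched by the given xpath and
  # skipping /admin pages. Returning nil/false from the block stops that
  # document from being written to the database.
  indexer.index_site(
    'https://example.com/',
    follow: '//nav//a/@href',  # illustrative; :default follows any <a> href returning HTML
    disallow_paths: 'admin/*'
  ) do |doc|
    !doc.title.to_s.empty?     # only persist pages that have a <title>
  end
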
data/lib/wgit/response.rb CHANGED
@@ -1,5 +1,5 @@
 module Wgit
-  # Response class representing a generic HTTP crawl response.
+  # Response class modeling a generic HTTP GET response.
   class Response
     # The underlying HTTP adapter/library response object.
     attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
     # @param headers [Hash] The new response headers.
     # @return [Hash] @headers's new value.
     def headers=(headers)
-      return @headers = {} unless headers
+      unless headers
+        @headers = {}
+        return
+      end
 
       @headers = headers.map do |k, v|
         k = k.downcase.gsub('-', '_').to_sym
@@ -131,11 +134,11 @@ module Wgit
       @status.positive?
     end
 
-    alias code status
-    alias content body
-    alias crawl_time total_time
-    alias to_s body
-    alias redirects redirections
-    alias length size
+    alias code           status
+    alias content        body
+    alias crawl_duration total_time
+    alias to_s           body
+    alias redirects      redirections
+    alias length         size
   end
 end
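
The Response changes are smaller: the headers= guard is unrolled into an explicit early return, and the crawl_time alias for total_time is renamed to crawl_duration. A brief sketch of the behaviour implied by the hunks above; the header values are made up and the 0.0 starting value for total_time is an assumption:

  require 'wgit'

  response = Wgit::Response.new

  # The custom writer normalises keys: downcased, '-' swapped for '_', symbolised.
  response.headers = { 'Content-Type' => 'text/html', 'Content-Length' => '1234' }
  response.headers        # => { content_type: "text/html", content_length: "1234" }

  # A nil assignment now takes the explicit guard clause and resets to an empty Hash.
  response.headers = nil
  response.headers        # => {}

  # Renamed alias: #crawl_duration (formerly #crawl_time) delegates to #total_time.
  response.crawl_duration # => 0.0 (assumed starting value, before any timings are added)
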