wgit 0.10.8 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/indexer.rb CHANGED
@@ -1,12 +1,23 @@
  # frozen_string_literal: true

+ require_relative 'assertable'
  require_relative 'crawler'
- require_relative 'database/database'
+ require_relative 'database/database_adapter'

  module Wgit
  # Class which crawls and saves the Documents to a database. Can be thought of
- # as a combination of Wgit::Crawler and Wgit::Database.
+ # as a combination of Wgit::Crawler and Wgit::Database::DatabaseAdapter.
  class Indexer
+ include Assertable
+
+ # The ENV var used to omit and ignore robots.txt parsing during an index.
+ # Applies to all index_* methods if set in the ENV.
+ WGIT_IGNORE_ROBOTS_TXT = "WGIT_IGNORE_ROBOTS_TXT".freeze
+
+ # The block return value used to skip saving a crawled document to the
+ # database. Applies to all index_* methods that take a block.
+ SKIP_UPSERT = :skip.freeze
+
  # The crawler used to index the WWW.
  attr_reader :crawler

@@ -15,10 +26,13 @@ module Wgit

  # Initialize the Indexer.
  #
- # @param database [Wgit::Database] The database instance (already
- # initialized and connected) used to index.
- # @param crawler [Wgit::Crawler] The crawler instance used to index.
+ # @param database [Wgit::Database::DatabaseAdapter] The database instance
+ # (already initialized and connected) used for indexing.
+ # @param crawler [Wgit::Crawler] The crawler instance used for indexing.
  def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
+ assert_type(database, Wgit::Database::DatabaseAdapter)
+ assert_type(crawler, Wgit::Crawler)
+
  @db = database
  @crawler = crawler
  end
@@ -26,33 +40,38 @@ module Wgit
  # Retrieves uncrawled url's from the database and recursively crawls each
  # site storing their internal pages into the database and adding their
  # external url's to be crawled later on. Logs info on the crawl using
- # Wgit.logger as it goes along.
+ # Wgit.logger as it goes along. This method will honour all site's
+ # robots.txt and 'noindex' requests.
  #
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # means the crawl will occur until manually stopped (Ctrl+C), the
+ # max_data has been reached, or it runs out of external urls to index.
  # @param max_data [Integer] The maximum amount of bytes that will be
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def index_www(max_sites: -1, max_data: 1_048_576_000)
+ # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+ # urls to index for each iteration, before checking max_sites and
+ # max_data, possibly ending the crawl.
+ def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
  if max_sites.negative?
  Wgit.logger.info("Indexing until the database has been filled or it \
- runs out of urls to crawl (which might be never).")
+ runs out of urls to crawl (which might be never)")
  end
  site_count = 0

  while keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Current database size: #{@db.size}")

- uncrawled_urls = @db.uncrawled_urls(limit: 100)
+ uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)

  if uncrawled_urls.empty?
- Wgit.logger.info('No urls to crawl, exiting.')
+ Wgit.logger.info('No urls to crawl, exiting')

  return
  end
- Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
+ Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")

  docs_count = 0
  urls_count = 0
@@ -60,38 +79,48 @@ runs out of urls to crawl (which might be never.")
  uncrawled_urls.each do |url|
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
- database capacity, exiting.")
+ database capacity, exiting")

  return
  end
  site_count += 1

+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ next
+ end
+
  site_docs_count = 0
- ext_links = @crawler.crawl_site(url) do |doc|
- unless doc.empty?
- write_doc_to_db(doc)
- docs_count += 1
- site_docs_count += 1
- end
+ ext_links = @crawler.crawl_site(
+ url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+ ) do |doc|
+ next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+ upsert_doc(doc)
+ docs_count += 1
+ site_docs_count += 1
  end

- raise 'Error updating url' unless @db.update(url) == 1
+ upsert_url_and_redirects(url)

- urls_count += write_urls_to_db(ext_links)
+ urls_count += upsert_external_urls(ext_links)
  end

  Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
- url(s) overall for this iteration.")
+ url(s) during this iteration")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
- the next iteration.")
-
- nil
+ future iterations")
  end
+
+ nil
  end

  # Crawls a single website's pages and stores them into the database.
  # There is no max download limit so be careful which sites you index.
- # Logs info on the crawl using Wgit.logger as it goes along.
+ # Logs info on the crawl using Wgit.logger as it goes along. This method
+ # will honour the site's robots.txt and 'noindex' requests.
  #
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +142,29 @@ the next iteration.")
  url, insert_externals: false, follow: :default,
  allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = {
- follow: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ return 0
+ end
+
+ allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+ crawl_opts = { follow:, allow_paths:, disallow_paths: }
  total_pages_indexed = 0

  ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ next if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
+ next if doc.empty? || result == SKIP_UPSERT

- if result && !doc.empty?
- write_doc_to_db(doc)
- total_pages_indexed += 1
- end
+ upsert_doc(doc)
+ total_pages_indexed += 1
  end

- @db.upsert(url)
-
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_url_and_redirects(url)
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
  for the site: #{url}")
@@ -145,6 +175,8 @@ for the site: #{url}")
  # Crawls one or more webpages and stores them into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given urls.
  #
  # @param urls [*Wgit::Url] The webpage Url's to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +189,7 @@ for the site: #{url}")
  def index_urls(*urls, insert_externals: false, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = { insert_externals: insert_externals }
+ opts = { insert_externals: }
  Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }

  nil
@@ -166,6 +198,8 @@ for the site: #{url}")
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given url.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +209,26 @@ for the site: #{url}")
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
  def index_url(url, insert_externals: false)
+ parser = parse_robots_txt(url)
+ if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+ upsert_url_and_redirects(url)
+
+ return
+ end
+
  document = @crawler.crawl_url(url) do |doc|
+ break if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
- write_doc_to_db(doc) if result && !doc.empty?
+ break if doc.empty? || result == SKIP_UPSERT
+
+ upsert_doc(doc)
  end

- @db.upsert(url)
+ upsert_url_and_redirects(url)

  ext_urls = document&.external_links
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  nil
  end
@@ -210,10 +252,11 @@ for the site: #{url}")
  end

  # Write the doc to the DB. Note that the unique url index on the documents
- # collection deliberately prevents duplicate inserts.
+ # collection deliberately prevents duplicate inserts. If the document
+ # already exists, then it will be updated in the DB.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- def write_doc_to_db(doc)
+ def upsert_doc(doc)
  if @db.upsert(doc)
  Wgit.logger.info("Saved document for url: #{doc.url}")
  else
@@ -221,35 +264,105 @@ for the site: #{url}")
  end
  end

- # Write the urls to the DB. Note that the unique url index on the urls
- # collection deliberately prevents duplicate inserts.
+ # Upsert the url and its redirects, setting all to crawled = true.
  #
- # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Integer] The number of inserted urls.
- def write_urls_to_db(urls)
- count = 0
+ # @param url [Wgit::Url] The url to write to the DB.
+ # @return [Integer] The number of upserted urls (url + redirect urls).
+ def upsert_url_and_redirects(url)
+ url.crawled = true unless url.crawled?

- return count unless urls.respond_to?(:each)
+ # Upsert the url and any url redirects, setting them as crawled also.
+ @db.bulk_upsert(url.redirects_journey)
+ end

- urls.each do |url|
- if url.invalid?
- Wgit.logger.info("Ignoring invalid external url: #{url}")
- next
- end
+ # Write the external urls to the DB. For any external url, its origin will
+ # be inserted e.g. if the external url is http://example.com/contact then
+ # http://example.com will be inserted into the database. Note that the
+ # unique url index on the urls collection deliberately prevents duplicate
+ # inserts.
+ #
+ # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+ # @return [Integer] The number of upserted urls.
+ def upsert_external_urls(urls)
+ urls = urls
+ .reject(&:invalid?)
+ .map(&:to_origin)
+ .uniq
+ return 0 if urls.empty?
+
+ count = @db.bulk_upsert(urls)
+ Wgit.logger.info("Saved #{count} external urls")
+
+ count
+ end
+
+ private
+
+ # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+ def parse_robots_txt(url)
+ return nil if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
+ robots_url = url.to_origin.join('/robots.txt')

- @db.insert(url)
- count += 1
+ Wgit.logger.info("Crawling for robots.txt: #{robots_url}")

- Wgit.logger.info("Inserted external url: #{url}")
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("External url already exists: #{url}")
+ doc = @crawler.crawl_url(robots_url)
+ return nil if !@crawler.last_response.ok? || doc.empty?
+
+ parser = Wgit::RobotsParser.new(doc.content)
+
+ Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+ Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+ if parser.no_index?
+ Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
  end

- count
+ parser
+ end
+
+ # Takes the user defined allow/disallow_paths and merges robots paths
+ # into them. The allow/disallow_paths vars each can be of type nil, String,
+ # Enumerable<String>.
+ def merge_paths(parser, allow_paths, disallow_paths)
+ return allow_paths, disallow_paths unless parser&.rules?
+
+ allow = allow_paths || []
+ allow = [allow] unless allow.is_a?(Enumerable)
+
+ disallow = disallow_paths || []
+ disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+ allow.concat(parser.allow_paths)
+ disallow.concat(parser.disallow_paths)
+
+ [allow, disallow]
+ end
+
+ # Returns true if url is included in the given paths.
+ def contains_path?(paths, url)
+ paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+ end
+
+ # Returns if the last_response or doc #no_index? is true or not.
+ def no_index?(last_response, doc)
+ return false if ENV[WGIT_IGNORE_ROBOTS_TXT]
+
+ url = last_response.url.to_s
+ if last_response.no_index?
+ Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+ return true
+ end
+
+ if doc&.no_index?
+ Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+ return true
+ end
+
+ false
  end

- alias database db
- alias index index_urls
- alias index_r index_site
+ alias_method :database, :db
+ alias_method :index, :index_urls
+ alias_method :index_r, :index_site
  end
  end
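
The Indexer changes above add robots.txt and "noindex" handling, batched uncrawled-url fetching via max_urls_per_iteration, and a dedicated SKIP_UPSERT block return value for skipping saves. A minimal usage sketch follows; the example site, the skip condition and the connected database are illustrative assumptions, not part of the diff:

  require "wgit"

  # Setting this ENV var (to any value) bypasses robots.txt and "noindex"
  # handling for all index_* methods:
  # ENV["WGIT_IGNORE_ROBOTS_TXT"] = "true"

  db      = Wgit::Database.new # Assumed to return a connected DatabaseAdapter subclass.
  indexer = Wgit::Indexer.new(db)

  # index_site yields each crawled document; returning Wgit::Indexer::SKIP_UPSERT
  # (i.e. :skip) prevents that document from being upserted into the database.
  indexer.index_site(Wgit::Url.new("https://example.com")) do |doc|
    next Wgit::Indexer::SKIP_UPSERT if doc.url.to_s.include?("/drafts/")

    doc
  end

  # index_www pulls uncrawled urls from the database in batches of
  # max_urls_per_iteration, honouring each site's robots.txt, until a limit is hit.
  indexer.index_www(max_sites: 5, max_urls_per_iteration: 10)
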
data/lib/wgit/logger.rb CHANGED
@@ -2,7 +2,7 @@

  # FYI: The default logger is set at the bottom of this file.

- require 'logger'
+ require "logger"

  module Wgit
  # The Logger instance used by Wgit. Set your own custom logger after
@@ -28,7 +28,7 @@ module Wgit
  #
  # @return [Logger] The default Logger instance.
  def self.default_logger
- logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
+ logger = Logger.new($stdout, progname: "wgit", level: :info)
  logger.formatter = proc do |_severity, _datetime, progname, msg|
  "[#{progname}] #{msg}\n"
  end
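
For context on the restyled default logger above: it writes to $stdout at :info level using the "[wgit] <message>" format defined by the formatter, for example:

  Wgit.logger.info("No urls to crawl, exiting")
  # Prints: [wgit] No urls to crawl, exiting
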
data/lib/wgit/model.rb ADDED
@@ -0,0 +1,164 @@
+ # frozen_string_literal: true
+
+ require_relative "./utils"
+
+ module Wgit
+ # Module used to build the Database collection objects, forming a data model.
+ # The models produced are Hash like and therefore DB agnostic. Each model
+ # will contain a unique field used for searching and avoiding duplicates,
+ # this is typically a `url` field. Also contained in the model are the
+ # search fields used in Database and Document #search calls.
+ module Model
+ # The default search fields used in Database and Document #search calls.
+ # The number of matches for each field is multiplied by the field weight,
+ # the total is the search score, used to sort the search results.
+ # Call Wgit::Model.set_default_search_fields` to revert to default.
+ DEFAULT_SEARCH_FIELDS = {
+ title: 2,
+ description: 2,
+ keywords: 2,
+ text: 1
+ }.freeze
+
+ # The search fields used in Database and Document #search calls.
+ # The number of matches for each field is multiplied by the field weight,
+ # the total is the search score, used to sort the search results.
+ # Call Wgit::Model.set_default_search_fields` to revert to default.
+ @search_fields = DEFAULT_SEARCH_FIELDS
+
+ # Whether or not to include the Document#html in the #document model.
+ @include_doc_html = false
+
+ # Whether or not to include the Document#score in the #document model.
+ @include_doc_score = false
+
+ class << self
+ # The search fields used in Database and Document #search calls.
+ # A custom setter method is also provided for changing these fields.
+ attr_reader :search_fields
+
+ # Whether or not to include the Document#html in the #document model.
+ attr_accessor :include_doc_html
+
+ # Whether or not to include the Document#score in the #document model.
+ attr_accessor :include_doc_score
+ end
+
+ # Sets the search fields used in Database and Document #search calls.
+ #
+ # You can pass the fields as an Array of Symbols which gives each field a
+ # weight of 1 meaning all fields are considered of equal value. Or you can
+ # pass a Hash of Symbol => Int and specify the weights yourself, allowing
+ # you to customise the search rankings.
+ #
+ # Use like:
+ # ```
+ # Wgit::Model.set_search_fields [:title, :text], db
+ # => { title: 1, text: 1 }
+ # Wgit::Model.set_search_fields {title: 2, text: 1}, db
+ # => { title: 2, text: 1 }
+ # ```
+ #
+ # If the given db (database) param responds to #search_fields= then it will
+ # be called and given the fields to set. This should perform whatever the
+ # database adapter needs in order to search using the given fields e.g.
+ # creating a search index. Calling the DB enables the search_fields to be
+ # set globally within Wgit by one method call, this one.
+ #
+ # @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
+ # the field names with their coresponding search weights.
+ # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
+ # db responds to #search_fields=, it will be called and given the fields.
+ # @raise [StandardError] If fields is of an incorrect type.
+ # @return [Hash<Symbol, Integer>] The fields and their weights.
+ def self.set_search_fields(fields, db = nil)
+ # We need a Hash of fields => weights (Symbols => Integers).
+ case fields
+ when Array # of Strings/Symbols.
+ fields = fields.map { |field| [field.to_sym, 1] }
+ when Hash # of Strings/Symbols and Integers.
+ fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
+ else
+ raise "fields must be an Array or Hash, not a #{fields.class}"
+ end
+
+ @search_fields = fields.to_h
+ db.search_fields = @search_fields if db.respond_to?(:search_fields=)
+
+ @search_fields
+ end
+
+ # Sets the search fields used in Database and Document #search calls.
+ #
+ # If the given db (database) param responds to #search_fields= then it will
+ # be called and given the fields to set. This should perform whatever the
+ # database adapter needs in order to search using the given fields e.g.
+ # creating a search index. Calling the DB enables the search_fields to be
+ # set globally within Wgit by one method call, this one.
+ #
+ # @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
+ # db responds to #search_fields=, it will be called and given the fields.
+ # @return [Hash<Symbol, Integer>] The fields and their weights.
+ def self.set_default_search_fields(db = nil)
+ set_search_fields(DEFAULT_SEARCH_FIELDS, db)
+ end
+
+ # The data model for a Wgit::Url collection object and for an embedded
+ # 'url' inside a Wgit::Document collection object.
+ #
+ # The unique field for this model is `model['url']`.
+ #
+ # @param url [Wgit::Url] The Url data object.
+ # @return [Hash] The URL model ready for DB insertion.
+ def self.url(url)
+ raise "url must respond_to? :to_h" unless url.respond_to?(:to_h)
+
+ model = url.to_h
+ select_bson_types(model)
+ end
+
+ # The data model for a Wgit::Document collection object.
+ #
+ # The unique field for this model is `model['url']['url']`.
+ #
+ # @param doc [Wgit::Document] The Document data object.
+ # @return [Hash] The Document model ready for DB insertion.
+ def self.document(doc)
+ raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h)
+
+ model = doc.to_h(
+ include_html: @include_doc_html, include_score: @include_doc_score
+ )
+ model["url"] = url(doc.url) # Expand Url String into full object.
+
+ select_bson_types(model)
+ end
+
+ # Common fields when inserting a record into the DB.
+ #
+ # @return [Hash] Insertion fields common to all models.
+ def self.common_insert_data
+ {
+ date_added: Wgit::Utils.time_stamp,
+ date_modified: Wgit::Utils.time_stamp
+ }
+ end
+
+ # Common fields when updating a record in the DB.
+ #
+ # @return [Hash] Update fields common to all models.
+ def self.common_update_data
+ {
+ date_modified: Wgit::Utils.time_stamp
+ }
+ end
+
+ # Returns the model having removed non bson types (for use with MongoDB).
+ #
+ # @param model_hash [Hash] The model Hash to sanitize.
+ # @return [Hash] The model Hash with non bson types removed.
+ def self.select_bson_types(model_hash)
+ model_hash.select { |_k, v| v.respond_to?(:bson_type) }
+ end
+ end
+ end
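
The new Wgit::Model module centralises how Url and Document records are modelled and which fields are searched. A short sketch of its configuration API, following the docstrings above (the optional db argument, omitted here, would be a connected database adapter responding to #search_fields=):

  require "wgit"

  # An Array of Symbols gives every field an equal weight of 1:
  Wgit::Model.set_search_fields(%i[title text])
  # => { title: 1, text: 1 }

  # A Hash sets custom weights, used to rank search results:
  Wgit::Model.set_search_fields({ title: 3, text: 1 })
  # => { title: 3, text: 1 }

  # Optionally include Document#html and Document#score in built document models:
  Wgit::Model.include_doc_html  = true
  Wgit::Model.include_doc_score = true

  # Revert to DEFAULT_SEARCH_FIELDS (title/description/keywords: 2, text: 1):
  Wgit::Model.set_default_search_fields
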
data/lib/wgit/response.rb CHANGED
@@ -27,18 +27,25 @@ module Wgit

  # Defaults some values and returns a "blank" Wgit::Response object.
  def initialize
- @body = ''
+ @body = ""
  @headers = {}
  @redirections = {}
  @total_time = 0.0
  end

+ # Overrides String#inspect to shorten the printed output of a Response.
+ #
+ # @return [String] A short textual representation of this Response.
+ def inspect
+ "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+ end
+
  # Adds time to @total_time (incrementally).
  #
  # @param time [Float] The time to add to @total_time.
  # @return [Float] @total_time's new value.
  def add_total_time(time)
- @total_time += (time || 0.0)
+ @total_time += time || 0.0
  end

  # Sets the HTML response body.
@@ -46,7 +53,7 @@ module Wgit
  # @param str [String] The new HTML body.
  # @return [String] @body's new value.
  def body=(str)
- @body = (str || '')
+ @body = str || ""
  end

  # Returns the HTML response body or nil (if it's empty).
@@ -74,10 +81,7 @@ module Wgit
  return
  end

- @headers = headers.map do |k, v|
- k = k.downcase.gsub('-', '_').to_sym
- [k, v]
- end.to_h
+ @headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym }
  end

  # Returns whether or not the response is 404 Not Found.
@@ -134,11 +138,19 @@ module Wgit
  @status.positive?
  end

- alias code status
- alias content body
- alias crawl_duration total_time
- alias to_s body
- alias redirects redirections
- alias length size
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ headers.fetch(:x_robots_tag, "").downcase.strip == "noindex"
+ end
+
+ alias_method :code, :status
+ alias_method :content, :body
+ alias_method :crawl_duration, :total_time
+ alias_method :to_s, :body
+ alias_method :redirects, :redirections
+ alias_method :length, :size
  end
  end
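
A small illustration of the new Response#no_index? check, assuming #headers= (whose body appears in the hunk above) is the public setter: header keys are normalised to snake_case Symbols, so an "X-Robots-Tag: noindex" response header marks the page as non-indexable.

  response = Wgit::Response.new
  response.headers = { "X-Robots-Tag" => "noindex", "Content-Type" => "text/html" }

  response.headers   # => { x_robots_tag: "noindex", content_type: "text/html" }
  response.no_index? # => true
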