wgit 0.10.7 → 0.11.0

This diff shows the changes between the publicly released package versions named above, as they appear in their public registry, and is provided for informational purposes only.
data/lib/wgit/indexer.rb CHANGED
@@ -26,33 +26,38 @@ module Wgit
  # Retrieves uncrawled url's from the database and recursively crawls each
  # site storing their internal pages into the database and adding their
  # external url's to be crawled later on. Logs info on the crawl using
- # Wgit.logger as it goes along.
+ # Wgit.logger as it goes along. This method will honour all site's
+ # robots.txt and 'noindex' requests.
  #
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # means the crawl will occur until manually stopped (Ctrl+C), the
+ # max_data has been reached, or it runs out of external urls to index.
  # @param max_data [Integer] The maximum amount of bytes that will be
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def index_www(max_sites: -1, max_data: 1_048_576_000)
+ # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+ # urls to index for each iteration, before checking max_sites and
+ # max_data, possibly ending the crawl.
+ def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
  if max_sites.negative?
  Wgit.logger.info("Indexing until the database has been filled or it \
- runs out of urls to crawl (which might be never).")
+ runs out of urls to crawl (which might be never)")
  end
  site_count = 0
 
  while keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Current database size: #{@db.size}")
 
- uncrawled_urls = @db.uncrawled_urls(limit: 100)
+ uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)
 
  if uncrawled_urls.empty?
- Wgit.logger.info('No urls to crawl, exiting.')
+ Wgit.logger.info('No urls to crawl, exiting')
 
  return
  end
- Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
+ Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")
 
  docs_count = 0
  urls_count = 0
@@ -60,38 +65,48 @@ runs out of urls to crawl (which might be never).")
  uncrawled_urls.each do |url|
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
- database capacity, exiting.")
+ database capacity, exiting")
 
  return
  end
  site_count += 1
 
+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ next
+ end
+
  site_docs_count = 0
- ext_links = @crawler.crawl_site(url) do |doc|
- unless doc.empty?
- write_doc_to_db(doc)
- docs_count += 1
- site_docs_count += 1
- end
+ ext_links = @crawler.crawl_site(
+ url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+ ) do |doc|
+ next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+ upsert_doc(doc)
+ docs_count += 1
+ site_docs_count += 1
  end
 
- raise 'Error updating url' unless @db.update(url) == 1
+ upsert_url_and_redirects(url)
 
- urls_count += write_urls_to_db(ext_links)
+ urls_count += upsert_external_urls(ext_links)
  end
 
  Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
- url(s) overall for this iteration.")
+ url(s) during this iteration")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
- the next iteration.")
-
- nil
+ future iterations")
  end
+
+ nil
  end
 
  # Crawls a single website's pages and stores them into the database.
  # There is no max download limit so be careful which sites you index.
- # Logs info on the crawl using Wgit.logger as it goes along.
+ # Logs info on the crawl using Wgit.logger as it goes along. This method
+ # will honour the site's robots.txt and 'noindex' requests.
  #
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +128,30 @@ the next iteration.")
  url, insert_externals: false, follow: :default,
  allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = {
- follow: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ return 0
+ end
+
+ allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+ crawl_opts = { follow:, allow_paths:, disallow_paths: }
  total_pages_indexed = 0
 
  ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ next if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
 
  if result && !doc.empty?
- write_doc_to_db(doc)
+ upsert_doc(doc)
  total_pages_indexed += 1
  end
  end
 
- @db.upsert(url)
-
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_url_and_redirects(url)
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls
 
  Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
  for the site: #{url}")
@@ -145,6 +162,8 @@ for the site: #{url}")
  # Crawls one or more webpages and stores them into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given urls.
  #
  # @param urls [*Wgit::Url] The webpage Url's to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +176,7 @@ for the site: #{url}")
  def index_urls(*urls, insert_externals: false, &block)
  raise 'You must provide at least one Url' if urls.empty?
 
- opts = { insert_externals: insert_externals }
+ opts = { insert_externals: }
  Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
 
  nil
@@ -166,6 +185,8 @@ for the site: #{url}")
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given url.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +196,24 @@ for the site: #{url}")
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
  def index_url(url, insert_externals: false)
+ parser = parse_robots_txt(url)
+ if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+ upsert_url_and_redirects(url)
+
+ return
+ end
+
  document = @crawler.crawl_url(url) do |doc|
+ break if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
- write_doc_to_db(doc) if result && !doc.empty?
+ upsert_doc(doc) if result && !doc.empty?
  end
 
- @db.upsert(url)
+ upsert_url_and_redirects(url)
 
  ext_urls = document&.external_links
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls
 
  nil
  end
@@ -210,10 +237,11 @@ for the site: #{url}")
  end
 
  # Write the doc to the DB. Note that the unique url index on the documents
- # collection deliberately prevents duplicate inserts.
+ # collection deliberately prevents duplicate inserts. If the document
+ # already exists, then it will be updated in the DB.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- def write_doc_to_db(doc)
+ def upsert_doc(doc)
  if @db.upsert(doc)
  Wgit.logger.info("Saved document for url: #{doc.url}")
  else
@@ -221,35 +249,101 @@ for the site: #{url}")
  end
  end
 
- # Write the urls to the DB. Note that the unique url index on the urls
- # collection deliberately prevents duplicate inserts.
+ # Upsert the url and its redirects, setting all to crawled = true.
  #
- # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Integer] The number of inserted urls.
- def write_urls_to_db(urls)
- count = 0
+ # @param url [Wgit::Url] The url to write to the DB.
+ # @return [Integer] The number of upserted urls (url + redirect urls).
+ def upsert_url_and_redirects(url)
+ url.crawled = true unless url.crawled?
 
- return count unless urls.respond_to?(:each)
+ # Upsert the url and any url redirects, setting them as crawled also.
+ @db.bulk_upsert(url.redirects_journey)
+ end
 
- urls.each do |url|
- if url.invalid?
- Wgit.logger.info("Ignoring invalid external url: #{url}")
- next
- end
+ # Write the external urls to the DB. For any external url, its origin will
+ # be inserted e.g. if the external url is http://example.com/contact then
+ # http://example.com will be inserted into the database. Note that the
+ # unique url index on the urls collection deliberately prevents duplicate
+ # inserts.
+ #
+ # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+ # @return [Integer] The number of upserted urls.
+ def upsert_external_urls(urls)
+ urls = urls
+ .reject(&:invalid?)
+ .map(&:to_origin)
+ .uniq
+ return 0 if urls.empty?
+
+ count = @db.bulk_upsert(urls)
+ Wgit.logger.info("Saved #{count} external urls")
 
- @db.insert(url)
- count += 1
+ count
+ end
+
+ private
+
+ # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+ def parse_robots_txt(url)
+ robots_url = url.to_origin.join('/robots.txt')
+
+ Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
+
+ doc = @crawler.crawl_url(robots_url)
+ return nil if !@crawler.last_response.ok? || doc.empty?
+
+ parser = Wgit::RobotsParser.new(doc.content)
 
- Wgit.logger.info("Inserted external url: #{url}")
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("External url already exists: #{url}")
+ Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+ Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+ if parser.no_index?
+ Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
  end
 
- count
+ parser
+ end
+
+ # Takes the user defined allow/disallow_paths and merges robots paths
+ # into them. The allow/disallow_paths vars each can be of type nil, String,
+ # Enumerable<String>.
+ def merge_paths(parser, allow_paths, disallow_paths)
+ return allow_paths, disallow_paths unless parser&.rules?
+
+ allow = allow_paths || []
+ allow = [allow] unless allow.is_a?(Enumerable)
+
+ disallow = disallow_paths || []
+ disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+ allow.concat(parser.allow_paths)
+ disallow.concat(parser.disallow_paths)
+
+ [allow, disallow]
+ end
+
+ # Returns true if url is included in the given paths.
+ def contains_path?(paths, url)
+ paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+ end
+
+ # Returns if the last_response or doc #no_index? is true or not.
+ def no_index?(last_response, doc)
+ url = last_response.url.to_s
+ if last_response.no_index?
+ Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+ return true
+ end
+
+ if doc&.no_index?
+ Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+ return true
+ end
+
+ false
  end
 
- alias database db
- alias index index_urls
- alias index_r index_site
+ alias_method :database, :db
+ alias_method :index, :index_urls
+ alias_method :index_r, :index_site
  end
  end
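
Taken together, the indexer changes above add robots.txt awareness and a new max_urls_per_iteration keyword to index_www. Below is a minimal usage sketch of the updated API; the database/indexer construction shown is an assumption carried over from earlier wgit versions and is not part of this diff.

require 'wgit'

# Assumed setup: in prior versions Wgit::Database reads its connection
# string from the WGIT_CONNECTION_STRING environment variable, and the
# Indexer takes a database instance (plus an optional crawler).
db      = Wgit::Database.new
indexer = Wgit::Indexer.new(db)

# Crawl at most 3 whole sites, pulling up to 5 uncrawled urls from the
# database on each loop iteration (the keyword introduced in this release).
indexer.index_www(max_sites: 3, max_urls_per_iteration: 5)

# index_site and index_url now check the site's robots.txt and any
# 'noindex' response header/meta tag before persisting documents.
indexer.index_site(Wgit::Url.new('https://example.com'))
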
data/lib/wgit/logger.rb CHANGED
@@ -28,7 +28,7 @@ module Wgit
  #
  # @return [Logger] The default Logger instance.
  def self.default_logger
- logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
+ logger = Logger.new($stdout, progname: 'wgit', level: :info)
  logger.formatter = proc do |_severity, _datetime, progname, msg|
  "[#{progname}] #{msg}\n"
  end
data/lib/wgit/response.rb CHANGED
@@ -33,6 +33,13 @@ module Wgit
  @total_time = 0.0
  end
 
+ # Overrides String#inspect to shorten the printed output of a Response.
+ #
+ # @return [String] A short textual representation of this Response.
+ def inspect
+ "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+ end
+
  # Adds time to @total_time (incrementally).
  #
  # @param time [Float] The time to add to @total_time.
@@ -134,11 +141,19 @@ module Wgit
  @status.positive?
  end
 
- alias code status
- alias content body
- alias crawl_duration total_time
- alias to_s body
- alias redirects redirections
- alias length size
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
+ end
+
+ alias_method :code, :status
+ alias_method :content, :body
+ alias_method :crawl_duration, :total_time
+ alias_method :to_s, :body
+ alias_method :redirects, :redirections
+ alias_method :length, :size
  end
  end
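
The new Wgit::Response#no_index? predicate above is what the indexer's header check relies on. As a standalone illustration of the same logic (plain Ruby, mirroring the fetch against the symbolised x_robots_tag header key shown in the diff, not a separate gem API):

# A response whose X-Robots-Tag header forbids indexing.
headers = { x_robots_tag: 'NOINDEX ' }
headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex' # => true
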
data/lib/wgit/robots_parser.rb ADDED
@@ -0,0 +1,193 @@
+ # frozen_string_literal: true
+
+ module Wgit
+ # The RobotsParser class handles parsing and processing of a web servers
+ # robots.txt file.
+ class RobotsParser
+ include Wgit::Assertable
+
+ # Key representing the start of a comment.
+ KEY_COMMENT = '#'
+ # Key value separator used in robots.txt files.
+ KEY_SEPARATOR = ':'
+ # Key representing a user agent.
+ KEY_USER_AGENT = 'User-agent'
+ # Key representing an allow URL rule.
+ KEY_ALLOW = 'Allow'
+ # Key representing a disallow URL rule.
+ KEY_DISALLOW = 'Disallow'
+
+ # Value representing the Wgit user agent.
+ USER_AGENT_WGIT = :wgit
+ # Value representing any user agent including Wgit.
+ USER_AGENT_ANY = :*
+
+ # Value representing any and all paths.
+ PATHS_ALL = %w[/ *].freeze
+
+ # Hash containing the user-agent allow/disallow URL rules. Looks like:
+ # allow_paths: ["/"]
+ # disallow_paths: ["/accounts", ...]
+ attr_reader :rules
+
+ # Initializes and returns a Wgit::RobotsParser instance having parsed the
+ # robot.txt contents.
+ #
+ # @param contents [String, #to_s] The contents of the robots.txt file to be
+ # parsed.
+ def initialize(contents)
+ @rules = {
+ allow_paths: Set.new,
+ disallow_paths: Set.new
+ }
+
+ assert_respond_to(contents, :to_s)
+ parse(contents.to_s)
+ end
+
+ # Overrides String#inspect to shorten the printed output of a Parser.
+ #
+ # @return [String] A short textual representation of this Parser.
+ def inspect
+ "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
+ end
+
+ # Returns the allow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The allow paths/rules to follow.
+ def allow_paths
+ @rules[:allow_paths].to_a
+ end
+
+ # Returns the disallow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The disallow paths/rules to follow.
+ def disallow_paths
+ @rules[:disallow_paths].to_a
+ end
+
+ # Returns whether or not there are rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are rules for Wgit to follow, false
+ # otherwise.
+ def rules?
+ allow_rules? || disallow_rules?
+ end
+
+ # Returns whether or not there are allow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are allow rules for Wgit to follow,
+ # false otherwise.
+ def allow_rules?
+ @rules[:allow_paths].any?
+ end
+
+ # Returns whether or not there are disallow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are disallow rules for Wgit to follow,
+ # false otherwise.
+ def disallow_rules?
+ @rules[:disallow_paths].any?
+ end
+
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
+ end
+
+ private
+
+ # Parses the file contents and sets @rules.
+ def parse(contents)
+ user_agents = []
+ new_block = false
+
+ contents.split("\n").each do |line|
+ line.strip!
+ next if line.empty? || line.start_with?(KEY_COMMENT)
+
+ # A user agent block is denoted by N User-agent's followed by N
+ # Allow/Disallow's. After which a new block is formed from scratch.
+ if start_with_any_case?(line, KEY_USER_AGENT)
+ if new_block
+ user_agents = []
+ new_block = false
+ end
+ user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
+ else
+ new_block = true
+ end
+
+ if start_with_any_case?(line, KEY_ALLOW)
+ append_allow_rule(user_agents, line)
+ elsif start_with_any_case?(line, KEY_DISALLOW)
+ append_disallow_rule(user_agents, line)
+ elsif !start_with_any_case?(line, KEY_USER_AGENT)
+ Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
+ end
+ end
+ end
+
+ # Implements start_with? but case insensitive.
+ def start_with_any_case?(str, prefix)
+ str.downcase.start_with?(prefix.downcase)
+ end
+
+ # Returns line with key removed (if present). Otherwise line is returned
+ # as given.
+ def remove_key(line, key)
+ return line unless start_with_any_case?(line, key)
+ return line unless line.count(KEY_SEPARATOR) == 1
+
+ segs = line.split(KEY_SEPARATOR)
+ return '' if segs.size == 1
+
+ segs.last.strip
+ end
+
+ # Don't append * or /, as this means all paths, which is the same as no
+ # allow_paths when passed to Wgit::Crawler.
+ def append_allow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_ALLOW)
+ path = parse_special_syntax(path)
+ return if PATHS_ALL.include?(path)
+
+ @rules[:allow_paths] << path
+ end
+
+ def append_disallow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_DISALLOW)
+ path = parse_special_syntax(path)
+ @rules[:disallow_paths] << path
+ end
+
+ def wgit_user_agent?(user_agents)
+ user_agents.any? do |agent|
+ [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
+ end
+ end
+
+ def parse_special_syntax(path)
+ # Remove $ e.g. "/blah$" becomes "/blah"
+ path = path.gsub('$', '')
+
+ # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
+ path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
+
+ # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
+ path = '*' if path.empty?
+
+ path
+ end
+
+ alias_method :paths, :rules
+ alias_method :banned?, :no_index?
+ end
+ end
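
To round off the new Wgit::RobotsParser file, here is a short usage sketch of its public API as defined above (assumes require 'wgit'; the robots.txt content itself is made up for illustration):

robots_txt = <<~TXT
  User-agent: *
  Allow: /blog
  Disallow: /accounts
  Disallow: /admin # internal only
TXT

parser = Wgit::RobotsParser.new(robots_txt)
parser.rules?         # => true
parser.allow_paths    # => ["/blog"]
parser.disallow_paths # => ["/accounts", "/admin"] (inline comment stripped)
parser.no_index?      # => false, since there is no "Disallow: /" or "Disallow: *" rule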