wgit 0.10.7 → 0.11.0

data/lib/wgit/indexer.rb CHANGED
@@ -26,33 +26,38 @@ module Wgit
  # Retrieves uncrawled url's from the database and recursively crawls each
  # site storing their internal pages into the database and adding their
  # external url's to be crawled later on. Logs info on the crawl using
- # Wgit.logger as it goes along.
+ # Wgit.logger as it goes along. This method will honour all site's
+ # robots.txt and 'noindex' requests.
  #
  # @param max_sites [Integer] The number of separate and whole
  # websites to be crawled before the method exits. Defaults to -1 which
- # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # means the crawl will occur until manually stopped (Ctrl+C), the
+ # max_data has been reached, or it runs out of external urls to index.
  # @param max_data [Integer] The maximum amount of bytes that will be
  # scraped from the web (default is 1GB). Note, that this value is used to
  # determine when to stop crawling; it's not a guarantee of the max data
  # that will be obtained.
- def index_www(max_sites: -1, max_data: 1_048_576_000)
+ # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+ # urls to index for each iteration, before checking max_sites and
+ # max_data, possibly ending the crawl.
+ def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
  if max_sites.negative?
  Wgit.logger.info("Indexing until the database has been filled or it \
- runs out of urls to crawl (which might be never).")
+ runs out of urls to crawl (which might be never)")
  end
  site_count = 0

  while keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Current database size: #{@db.size}")

- uncrawled_urls = @db.uncrawled_urls(limit: 100)
+ uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)

  if uncrawled_urls.empty?
- Wgit.logger.info('No urls to crawl, exiting.')
+ Wgit.logger.info('No urls to crawl, exiting')

  return
  end
- Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")
+ Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")

  docs_count = 0
  urls_count = 0
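
For reference, a minimal usage sketch of the new max_urls_per_iteration keyword (the database/indexer wiring and the limit values below are illustrative assumptions, not part of this diff):

    require 'wgit'

    # Assumes a reachable MongoDB instance configured via the usual Wgit
    # connection string setup.
    db      = Wgit::Database.new
    indexer = Wgit::Indexer.new(db)

    # Pull 5 uncrawled urls from the DB per loop iteration (the new keyword),
    # stopping after 5 whole sites or roughly 100MB of crawled data.
    indexer.index_www(max_sites: 5, max_data: 104_857_600, max_urls_per_iteration: 5)
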
@@ -60,38 +65,48 @@ runs out of urls to crawl (which might be never).")
  uncrawled_urls.each do |url|
  unless keep_crawling?(site_count, max_sites, max_data)
  Wgit.logger.info("Reached max number of sites to crawl or \
- database capacity, exiting.")
+ database capacity, exiting")

  return
  end
  site_count += 1

+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ next
+ end
+
  site_docs_count = 0
- ext_links = @crawler.crawl_site(url) do |doc|
- unless doc.empty?
- write_doc_to_db(doc)
- docs_count += 1
- site_docs_count += 1
- end
+ ext_links = @crawler.crawl_site(
+ url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+ ) do |doc|
+ next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+ upsert_doc(doc)
+ docs_count += 1
+ site_docs_count += 1
  end

- raise 'Error updating url' unless @db.update(url) == 1
+ upsert_url_and_redirects(url)

- urls_count += write_urls_to_db(ext_links)
+ urls_count += upsert_external_urls(ext_links)
  end

  Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
- url(s) overall for this iteration.")
+ url(s) during this iteration")
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
- the next iteration.")
-
- nil
+ future iterations")
  end
+
+ nil
  end

  # Crawls a single website's pages and stores them into the database.
  # There is no max download limit so be careful which sites you index.
- # Logs info on the crawl using Wgit.logger as it goes along.
+ # Logs info on the crawl using Wgit.logger as it goes along. This method
+ # will honour the site's robots.txt and 'noindex' requests.
  #
  # @param url [Wgit::Url] The base Url of the website to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +128,30 @@ the next iteration.")
  url, insert_externals: false, follow: :default,
  allow_paths: nil, disallow_paths: nil
  )
- crawl_opts = {
- follow: follow,
- allow_paths: allow_paths,
- disallow_paths: disallow_paths
- }
+ parser = parse_robots_txt(url)
+ if parser&.no_index?
+ upsert_url_and_redirects(url)
+
+ return 0
+ end
+
+ allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+ crawl_opts = { follow:, allow_paths:, disallow_paths: }
  total_pages_indexed = 0

  ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+ next if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true

  if result && !doc.empty?
- write_doc_to_db(doc)
+ upsert_doc(doc)
  total_pages_indexed += 1
  end
  end

- @db.upsert(url)
-
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_url_and_redirects(url)
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
  for the site: #{url}")
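
A usage sketch of index_site after this change, reusing the indexer from the earlier sketch — the url, the extra disallow path and the block logic are illustrative; the site's robots.txt allow/disallow paths are now merged into whatever paths the caller supplies:

    url = Wgit::Url.new('https://example.com')

    # Only pages whose title mentions 'Ruby' get saved; '/search' is blocked
    # on top of any robots.txt disallow rules, which are honoured
    # automatically.
    indexer.index_site(url, disallow_paths: '/search') do |doc|
      doc.title&.include?('Ruby')
    end
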
@@ -145,6 +162,8 @@ for the site: #{url}")
  # Crawls one or more webpages and stores them into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given urls.
  #
  # @param urls [*Wgit::Url] The webpage Url's to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +176,7 @@ for the site: #{url}")
  def index_urls(*urls, insert_externals: false, &block)
  raise 'You must provide at least one Url' if urls.empty?

- opts = { insert_externals: insert_externals }
+ opts = { insert_externals: }
  Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }

  nil
@@ -166,6 +185,8 @@ for the site: #{url}")
  # Crawls a single webpage and stores it into the database.
  # There is no max download limit so be careful of large pages.
  # Logs info on the crawl using Wgit.logger as it goes along.
+ # This method will honour the site's robots.txt and 'noindex' requests
+ # in relation to the given url.
  #
  # @param url [Wgit::Url] The webpage Url to crawl.
  # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +196,24 @@ for the site: #{url}")
  # manipulation. Return nil or false from the block to prevent the
  # document from being saved into the database.
  def index_url(url, insert_externals: false)
+ parser = parse_robots_txt(url)
+ if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+ upsert_url_and_redirects(url)
+
+ return
+ end
+
  document = @crawler.crawl_url(url) do |doc|
+ break if no_index?(@crawler.last_response, doc)
+
  result = block_given? ? yield(doc) : true
- write_doc_to_db(doc) if result && !doc.empty?
+ upsert_doc(doc) if result && !doc.empty?
  end

- @db.upsert(url)
+ upsert_url_and_redirects(url)

  ext_urls = document&.external_links
- if insert_externals && ext_urls
- num_inserted_urls = write_urls_to_db(ext_urls)
- Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
- end
+ upsert_external_urls(ext_urls) if insert_externals && ext_urls

  nil
  end
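
A sketch of index_url with the skip behaviour described above (the url and the block's filter are made up): the url and any redirects are still upserted as crawled, but no document is stored if the block returns nil/false, if robots.txt disallows the url, or if a 'noindex' directive is found.

    indexer.index_url(Wgit::Url.new('https://example.com/about')) do |doc|
      doc.links.any? # illustrative filter: only save pages containing links
    end
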
@@ -210,10 +237,11 @@ for the site: #{url}")
  end

  # Write the doc to the DB. Note that the unique url index on the documents
- # collection deliberately prevents duplicate inserts.
+ # collection deliberately prevents duplicate inserts. If the document
+ # already exists, then it will be updated in the DB.
  #
  # @param doc [Wgit::Document] The document to write to the DB.
- def write_doc_to_db(doc)
+ def upsert_doc(doc)
  if @db.upsert(doc)
  Wgit.logger.info("Saved document for url: #{doc.url}")
  else
@@ -221,35 +249,101 @@ for the site: #{url}")
  end
  end

- # Write the urls to the DB. Note that the unique url index on the urls
- # collection deliberately prevents duplicate inserts.
+ # Upsert the url and its redirects, setting all to crawled = true.
  #
- # @param urls [Array<Wgit::Url>] The urls to write to the DB.
- # @return [Integer] The number of inserted urls.
- def write_urls_to_db(urls)
- count = 0
+ # @param url [Wgit::Url] The url to write to the DB.
+ # @return [Integer] The number of upserted urls (url + redirect urls).
+ def upsert_url_and_redirects(url)
+ url.crawled = true unless url.crawled?

- return count unless urls.respond_to?(:each)
+ # Upsert the url and any url redirects, setting them as crawled also.
+ @db.bulk_upsert(url.redirects_journey)
+ end

- urls.each do |url|
- if url.invalid?
- Wgit.logger.info("Ignoring invalid external url: #{url}")
- next
- end
+ # Write the external urls to the DB. For any external url, its origin will
+ # be inserted e.g. if the external url is http://example.com/contact then
+ # http://example.com will be inserted into the database. Note that the
+ # unique url index on the urls collection deliberately prevents duplicate
+ # inserts.
+ #
+ # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+ # @return [Integer] The number of upserted urls.
+ def upsert_external_urls(urls)
+ urls = urls
+ .reject(&:invalid?)
+ .map(&:to_origin)
+ .uniq
+ return 0 if urls.empty?
+
+ count = @db.bulk_upsert(urls)
+ Wgit.logger.info("Saved #{count} external urls")

- @db.insert(url)
- count += 1
+ count
+ end
+
+ private
+
+ # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+ def parse_robots_txt(url)
+ robots_url = url.to_origin.join('/robots.txt')
+
+ Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
+
+ doc = @crawler.crawl_url(robots_url)
+ return nil if !@crawler.last_response.ok? || doc.empty?
+
+ parser = Wgit::RobotsParser.new(doc.content)

- Wgit.logger.info("Inserted external url: #{url}")
- rescue Mongo::Error::OperationFailure
- Wgit.logger.info("External url already exists: #{url}")
+ Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+ Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+ if parser.no_index?
+ Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
  end

- count
+ parser
+ end
+
+ # Takes the user defined allow/disallow_paths and merges robots paths
+ # into them. The allow/disallow_paths vars each can be of type nil, String,
+ # Enumerable<String>.
+ def merge_paths(parser, allow_paths, disallow_paths)
+ return allow_paths, disallow_paths unless parser&.rules?
+
+ allow = allow_paths || []
+ allow = [allow] unless allow.is_a?(Enumerable)
+
+ disallow = disallow_paths || []
+ disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+ allow.concat(parser.allow_paths)
+ disallow.concat(parser.disallow_paths)
+
+ [allow, disallow]
+ end
+
+ # Returns true if url is included in the given paths.
+ def contains_path?(paths, url)
+ paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+ end
+
+ # Returns if the last_response or doc #no_index? is true or not.
+ def no_index?(last_response, doc)
+ url = last_response.url.to_s
+ if last_response.no_index?
+ Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+ return true
+ end
+
+ if doc&.no_index?
+ Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+ return true
+ end
+
+ false
  end

- alias database db
- alias index index_urls
- alias index_r index_site
+ alias_method :database, :db
+ alias_method :index, :index_urls
+ alias_method :index_r, :index_site
  end
  end
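
The origin-collapsing step in upsert_external_urls can be illustrated with plain Wgit::Url calls (a sketch of the reduction only, not of the bulk_upsert DB write; the urls are made up):

    urls = [
      Wgit::Url.new('http://example.com/contact'),
      Wgit::Url.new('http://example.com/about')
    ]

    # Both pages collapse to the single origin http://example.com, which is
    # what ends up being bulk upserted into the urls collection.
    urls.reject(&:invalid?).map(&:to_origin).uniq
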
data/lib/wgit/logger.rb CHANGED
@@ -28,7 +28,7 @@ module Wgit
  #
  # @return [Logger] The default Logger instance.
  def self.default_logger
- logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
+ logger = Logger.new($stdout, progname: 'wgit', level: :info)
  logger.formatter = proc do |_severity, _datetime, progname, msg|
  "[#{progname}] #{msg}\n"
  end
data/lib/wgit/response.rb CHANGED
@@ -33,6 +33,13 @@ module Wgit
  @total_time = 0.0
  end

+ # Overrides String#inspect to shorten the printed output of a Response.
+ #
+ # @return [String] A short textual representation of this Response.
+ def inspect
+ "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+ end
+
  # Adds time to @total_time (incrementally).
  #
  # @param time [Float] The time to add to @total_time.
@@ -134,11 +141,19 @@ module Wgit
  @status.positive?
  end

- alias code status
- alias content body
- alias crawl_duration total_time
- alias to_s body
- alias redirects redirections
- alias length size
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
+ end
+
+ alias_method :code, :status
+ alias_method :content, :body
+ alias_method :crawl_duration, :total_time
+ alias_method :to_s, :body
+ alias_method :redirects, :redirections
+ alias_method :length, :size
  end
  end
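
The new Response#no_index? does an exact match on the normalised X-Robots-Tag value, so a combined value such as 'noindex, nofollow' is not treated as noindex by this check. A direct illustration of the predicate (plain Ruby mirroring the method body, not a separate API):

    headers = { x_robots_tag: ' NOINDEX ' }
    headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex' # => true

    headers = { x_robots_tag: 'noindex, nofollow' }
    headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex' # => false
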
@@ -0,0 +1,193 @@
+ # frozen_string_literal: true
+
+ module Wgit
+ # The RobotsParser class handles parsing and processing of a web servers
+ # robots.txt file.
+ class RobotsParser
+ include Wgit::Assertable
+
+ # Key representing the start of a comment.
+ KEY_COMMENT = '#'
+ # Key value separator used in robots.txt files.
+ KEY_SEPARATOR = ':'
+ # Key representing a user agent.
+ KEY_USER_AGENT = 'User-agent'
+ # Key representing an allow URL rule.
+ KEY_ALLOW = 'Allow'
+ # Key representing a disallow URL rule.
+ KEY_DISALLOW = 'Disallow'
+
+ # Value representing the Wgit user agent.
+ USER_AGENT_WGIT = :wgit
+ # Value representing any user agent including Wgit.
+ USER_AGENT_ANY = :*
+
+ # Value representing any and all paths.
+ PATHS_ALL = %w[/ *].freeze
+
+ # Hash containing the user-agent allow/disallow URL rules. Looks like:
+ # allow_paths: ["/"]
+ # disallow_paths: ["/accounts", ...]
+ attr_reader :rules
+
+ # Initializes and returns a Wgit::RobotsParser instance having parsed the
+ # robot.txt contents.
+ #
+ # @param contents [String, #to_s] The contents of the robots.txt file to be
+ # parsed.
+ def initialize(contents)
+ @rules = {
+ allow_paths: Set.new,
+ disallow_paths: Set.new
+ }
+
+ assert_respond_to(contents, :to_s)
+ parse(contents.to_s)
+ end
+
+ # Overrides String#inspect to shorten the printed output of a Parser.
+ #
+ # @return [String] A short textual representation of this Parser.
+ def inspect
+ "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
+ end
+
+ # Returns the allow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The allow paths/rules to follow.
+ def allow_paths
+ @rules[:allow_paths].to_a
+ end
+
+ # Returns the disallow paths/rules for this parser's robots.txt contents.
+ #
+ # @return [Array<String>] The disallow paths/rules to follow.
+ def disallow_paths
+ @rules[:disallow_paths].to_a
+ end
+
+ # Returns whether or not there are rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are rules for Wgit to follow, false
+ # otherwise.
+ def rules?
+ allow_rules? || disallow_rules?
+ end
+
+ # Returns whether or not there are allow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are allow rules for Wgit to follow,
+ # false otherwise.
+ def allow_rules?
+ @rules[:allow_paths].any?
+ end
+
+ # Returns whether or not there are disallow rules applying to Wgit.
+ #
+ # @return [Boolean] True if there are disallow rules for Wgit to follow,
+ # false otherwise.
+ def disallow_rules?
+ @rules[:disallow_paths].any?
+ end
+
+ # Returns whether or not Wgit is banned from indexing this site.
+ #
+ # @return [Boolean] True if Wgit should not index this site, false
+ # otherwise.
+ def no_index?
+ @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
+ end
+
+ private
+
+ # Parses the file contents and sets @rules.
+ def parse(contents)
+ user_agents = []
+ new_block = false
+
+ contents.split("\n").each do |line|
+ line.strip!
+ next if line.empty? || line.start_with?(KEY_COMMENT)
+
+ # A user agent block is denoted by N User-agent's followed by N
+ # Allow/Disallow's. After which a new block is formed from scratch.
+ if start_with_any_case?(line, KEY_USER_AGENT)
+ if new_block
+ user_agents = []
+ new_block = false
+ end
+ user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
+ else
+ new_block = true
+ end
+
+ if start_with_any_case?(line, KEY_ALLOW)
+ append_allow_rule(user_agents, line)
+ elsif start_with_any_case?(line, KEY_DISALLOW)
+ append_disallow_rule(user_agents, line)
+ elsif !start_with_any_case?(line, KEY_USER_AGENT)
+ Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
+ end
+ end
+ end
+
+ # Implements start_with? but case insensitive.
+ def start_with_any_case?(str, prefix)
+ str.downcase.start_with?(prefix.downcase)
+ end
+
+ # Returns line with key removed (if present). Otherwise line is returned
+ # as given.
+ def remove_key(line, key)
+ return line unless start_with_any_case?(line, key)
+ return line unless line.count(KEY_SEPARATOR) == 1
+
+ segs = line.split(KEY_SEPARATOR)
+ return '' if segs.size == 1
+
+ segs.last.strip
+ end
+
+ # Don't append * or /, as this means all paths, which is the same as no
+ # allow_paths when passed to Wgit::Crawler.
+ def append_allow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_ALLOW)
+ path = parse_special_syntax(path)
+ return if PATHS_ALL.include?(path)
+
+ @rules[:allow_paths] << path
+ end
+
+ def append_disallow_rule(user_agents, line)
+ return unless wgit_user_agent?(user_agents)
+
+ path = remove_key(line, KEY_DISALLOW)
+ path = parse_special_syntax(path)
+ @rules[:disallow_paths] << path
+ end
+
+ def wgit_user_agent?(user_agents)
+ user_agents.any? do |agent|
+ [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
+ end
+ end
+
+ def parse_special_syntax(path)
+ # Remove $ e.g. "/blah$" becomes "/blah"
+ path = path.gsub('$', '')
+
+ # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
+ path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
+
+ # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
+ path = '*' if path.empty?
+
+ path
+ end
+
+ alias_method :paths, :rules
+ alias_method :banned?, :no_index?
+ end
+ end
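
A short usage sketch of the new Wgit::RobotsParser (the robots.txt contents below are made up):

    require 'wgit'

    parser = Wgit::RobotsParser.new(<<~ROBOTS)
      User-agent: *
      Allow: /public
      Disallow: /accounts
      Disallow: /admin$
    ROBOTS

    parser.rules?          # => true
    parser.allow_paths     # => ["/public"]
    parser.disallow_paths  # => ["/accounts", "/admin"] ($ suffix stripped)
    parser.no_index?       # => false (no "/" or "*" disallow rule)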