wgit 0.10.7 → 0.11.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +22 -2
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +67 -39
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +118 -51
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/indexer.rb
CHANGED
@@ -26,33 +26,38 @@ module Wgit
     # Retrieves uncrawled url's from the database and recursively crawls each
     # site storing their internal pages into the database and adding their
     # external url's to be crawled later on. Logs info on the crawl using
-    # Wgit.logger as it goes along.
+    # Wgit.logger as it goes along. This method will honour all site's
+    # robots.txt and 'noindex' requests.
     #
     # @param max_sites [Integer] The number of separate and whole
     # websites to be crawled before the method exits. Defaults to -1 which
-    # means the crawl will occur until manually stopped (Ctrl+C
+    # means the crawl will occur until manually stopped (Ctrl+C), the
+    # max_data has been reached, or it runs out of external urls to index.
     # @param max_data [Integer] The maximum amount of bytes that will be
     # scraped from the web (default is 1GB). Note, that this value is used to
     # determine when to stop crawling; it's not a guarantee of the max data
     # that will be obtained.
-
+    # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+    # urls to index for each iteration, before checking max_sites and
+    # max_data, possibly ending the crawl.
+    def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
       if max_sites.negative?
         Wgit.logger.info("Indexing until the database has been filled or it \
-runs out of urls to crawl (which might be never)
+runs out of urls to crawl (which might be never)")
       end
       site_count = 0

       while keep_crawling?(site_count, max_sites, max_data)
         Wgit.logger.info("Current database size: #{@db.size}")

-        uncrawled_urls = @db.uncrawled_urls(limit:
+        uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)

         if uncrawled_urls.empty?
-          Wgit.logger.info('No urls to crawl, exiting
+          Wgit.logger.info('No urls to crawl, exiting')

           return
         end
-        Wgit.logger.info("Starting
+        Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")

         docs_count = 0
         urls_count = 0
@@ -60,38 +65,48 @@ runs out of urls to crawl (which might be never).")
         uncrawled_urls.each do |url|
           unless keep_crawling?(site_count, max_sites, max_data)
             Wgit.logger.info("Reached max number of sites to crawl or \
-database capacity, exiting
+database capacity, exiting")

             return
           end
           site_count += 1

+          parser = parse_robots_txt(url)
+          if parser&.no_index?
+            upsert_url_and_redirects(url)
+
+            next
+          end
+
           site_docs_count = 0
-          ext_links = @crawler.crawl_site(
-
-
-
-
-
+          ext_links = @crawler.crawl_site(
+            url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+          ) do |doc|
+            next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+            upsert_doc(doc)
+            docs_count += 1
+            site_docs_count += 1
           end

-
+          upsert_url_and_redirects(url)

-          urls_count +=
+          urls_count += upsert_external_urls(ext_links)
         end

         Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
-url(s)
+url(s) during this iteration")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
-
-
-      nil
+future iterations")
       end
+
+      nil
     end

     # Crawls a single website's pages and stores them into the database.
     # There is no max download limit so be careful which sites you index.
-    # Logs info on the crawl using Wgit.logger as it goes along.
+    # Logs info on the crawl using Wgit.logger as it goes along. This method
+    # will honour the site's robots.txt and 'noindex' requests.
     #
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +128,30 @@ the next iteration.")
       url, insert_externals: false, follow: :default,
       allow_paths: nil, disallow_paths: nil
     )
-
-
-
-
-
+      parser = parse_robots_txt(url)
+      if parser&.no_index?
+        upsert_url_and_redirects(url)
+
+        return 0
+      end
+
+      allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+      crawl_opts = { follow:, allow_paths:, disallow_paths: }
       total_pages_indexed = 0

       ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        next if no_index?(@crawler.last_response, doc)
+
         result = block_given? ? yield(doc) : true

         if result && !doc.empty?
-
+          upsert_doc(doc)
           total_pages_indexed += 1
         end
       end

-
-
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_url_and_redirects(url)
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls

       Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
for the site: #{url}")
@@ -145,6 +162,8 @@ for the site: #{url}")
     # Crawls one or more webpages and stores them into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given urls.
     #
     # @param urls [*Wgit::Url] The webpage Url's to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +176,7 @@ for the site: #{url}")
     def index_urls(*urls, insert_externals: false, &block)
       raise 'You must provide at least one Url' if urls.empty?

-      opts = { insert_externals:
+      opts = { insert_externals: }
       Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }

       nil
@@ -166,6 +185,8 @@ for the site: #{url}")
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given url.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +196,24 @@ for the site: #{url}")
     # manipulation. Return nil or false from the block to prevent the
     # document from being saved into the database.
     def index_url(url, insert_externals: false)
+      parser = parse_robots_txt(url)
+      if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+        upsert_url_and_redirects(url)
+
+        return
+      end
+
       document = @crawler.crawl_url(url) do |doc|
+        break if no_index?(@crawler.last_response, doc)
+
         result = block_given? ? yield(doc) : true
-
+        upsert_doc(doc) if result && !doc.empty?
       end

-
+      upsert_url_and_redirects(url)

       ext_urls = document&.external_links
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls

       nil
     end
@@ -210,10 +237,11 @@ for the site: #{url}")
     end

     # Write the doc to the DB. Note that the unique url index on the documents
-    # collection deliberately prevents duplicate inserts.
+    # collection deliberately prevents duplicate inserts. If the document
+    # already exists, then it will be updated in the DB.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    def
+    def upsert_doc(doc)
       if @db.upsert(doc)
         Wgit.logger.info("Saved document for url: #{doc.url}")
       else
@@ -221,35 +249,101 @@ for the site: #{url}")
       end
     end

-    #
-    # collection deliberately prevents duplicate inserts.
+    # Upsert the url and its redirects, setting all to crawled = true.
     #
-    # @param
-    # @return [Integer] The number of
-    def
-
+    # @param url [Wgit::Url] The url to write to the DB.
+    # @return [Integer] The number of upserted urls (url + redirect urls).
+    def upsert_url_and_redirects(url)
+      url.crawled = true unless url.crawled?

-
+      # Upsert the url and any url redirects, setting them as crawled also.
+      @db.bulk_upsert(url.redirects_journey)
+    end

-
-
-
-
-
+    # Write the external urls to the DB. For any external url, its origin will
+    # be inserted e.g. if the external url is http://example.com/contact then
+    # http://example.com will be inserted into the database. Note that the
+    # unique url index on the urls collection deliberately prevents duplicate
+    # inserts.
+    #
+    # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+    # @return [Integer] The number of upserted urls.
+    def upsert_external_urls(urls)
+      urls = urls
+             .reject(&:invalid?)
+             .map(&:to_origin)
+             .uniq
+      return 0 if urls.empty?
+
+      count = @db.bulk_upsert(urls)
+      Wgit.logger.info("Saved #{count} external urls")

-
-
+      count
+    end
+
+    private
+
+    # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+    def parse_robots_txt(url)
+      robots_url = url.to_origin.join('/robots.txt')
+
+      Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
+
+      doc = @crawler.crawl_url(robots_url)
+      return nil if !@crawler.last_response.ok? || doc.empty?
+
+      parser = Wgit::RobotsParser.new(doc.content)

-
-
-
+      Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+      Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+      if parser.no_index?
+        Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
       end

-
+      parser
+    end
+
+    # Takes the user defined allow/disallow_paths and merges robots paths
+    # into them. The allow/disallow_paths vars each can be of type nil, String,
+    # Enumerable<String>.
+    def merge_paths(parser, allow_paths, disallow_paths)
+      return allow_paths, disallow_paths unless parser&.rules?
+
+      allow = allow_paths || []
+      allow = [allow] unless allow.is_a?(Enumerable)
+
+      disallow = disallow_paths || []
+      disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+      allow.concat(parser.allow_paths)
+      disallow.concat(parser.disallow_paths)
+
+      [allow, disallow]
+    end
+
+    # Returns true if url is included in the given paths.
+    def contains_path?(paths, url)
+      paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+    end
+
+    # Returns if the last_response or doc #no_index? is true or not.
+    def no_index?(last_response, doc)
+      url = last_response.url.to_s
+      if last_response.no_index?
+        Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+        return true
+      end
+
+      if doc&.no_index?
+        Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+        return true
+      end
+
+      false
     end

-
-
-
+    alias_method :database, :db
+    alias_method :index, :index_urls
+    alias_method :index_r, :index_site
   end
 end
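The new index_www signature above can be exercised roughly as follows. This is a minimal sketch: the Wgit::Database connection string and the Wgit::Indexer constructor arguments are assumptions for illustration and aren't part of this diff; only the max_sites, max_data and max_urls_per_iteration keywords come from the code above.

    require 'wgit'

    # Assumed setup: a database connection and an indexer built from it.
    db      = Wgit::Database.new('mongodb://localhost:27017/test')
    indexer = Wgit::Indexer.new(db)

    # Crawl until 5 whole sites have been indexed, pulling at most 10
    # uncrawled urls from the database per loop iteration (the new
    # max_urls_per_iteration keyword). robots.txt and 'noindex' requests
    # are honoured automatically.
    indexer.index_www(max_sites: 5, max_urls_per_iteration: 10)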
data/lib/wgit/logger.rb
CHANGED
@@ -28,7 +28,7 @@ module Wgit
   #
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    logger = Logger.new(
+    logger = Logger.new($stdout, progname: 'wgit', level: :info)
     logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
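Given the defaults above, the default logger writes to $stdout at :info level with the 'wgit' progname, so calls throughout the indexer come out in the "[wgit] ..." format produced by the formatter shown here, e.g.:

    Wgit.logger.info('Current database size: 250')
    # prints: [wgit] Current database size: 250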
data/lib/wgit/response.rb
CHANGED
@@ -33,6 +33,13 @@ module Wgit
       @total_time = 0.0
     end

+    # Overrides String#inspect to shorten the printed output of a Response.
+    #
+    # @return [String] A short textual representation of this Response.
+    def inspect
+      "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+    end
+
     # Adds time to @total_time (incrementally).
     #
     # @param time [Float] The time to add to @total_time.
@@ -134,11 +141,19 @@ module Wgit
       @status.positive?
     end

-
-
-
-
-
-
+    # Returns whether or not Wgit is banned from indexing this site.
+    #
+    # @return [Boolean] True if Wgit should not index this site, false
+    # otherwise.
+    def no_index?
+      headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
+    end
+
+    alias_method :code, :status
+    alias_method :content, :body
+    alias_method :crawl_duration, :total_time
+    alias_method :to_s, :body
+    alias_method :redirects, :redirections
+    alias_method :length, :size
   end
 end
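Response#no_index? and #inspect are what the indexer's private no_index? helper (see indexer.rb above) checks via @crawler.last_response. A rough usage sketch; the Wgit::Crawler construction here is an assumption, since the crawler's own diff isn't shown in this section:

    require 'wgit'

    crawler = Wgit::Crawler.new
    crawler.crawl_url(Wgit::Url.new('http://example.com/'))
    response = crawler.last_response

    response.inspect    # => e.g. #<Wgit::Response url="http://example.com/" status=200>
    response.no_index?  # => true only when the X-Robots-Tag header is 'noindex'
    response.redirects  # new alias for #redirections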
data/lib/wgit/robots_parser.rb
ADDED
@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+
+module Wgit
+  # The RobotsParser class handles parsing and processing of a web servers
+  # robots.txt file.
+  class RobotsParser
+    include Wgit::Assertable
+
+    # Key representing the start of a comment.
+    KEY_COMMENT = '#'
+    # Key value separator used in robots.txt files.
+    KEY_SEPARATOR = ':'
+    # Key representing a user agent.
+    KEY_USER_AGENT = 'User-agent'
+    # Key representing an allow URL rule.
+    KEY_ALLOW = 'Allow'
+    # Key representing a disallow URL rule.
+    KEY_DISALLOW = 'Disallow'
+
+    # Value representing the Wgit user agent.
+    USER_AGENT_WGIT = :wgit
+    # Value representing any user agent including Wgit.
+    USER_AGENT_ANY = :*
+
+    # Value representing any and all paths.
+    PATHS_ALL = %w[/ *].freeze
+
+    # Hash containing the user-agent allow/disallow URL rules. Looks like:
+    # allow_paths: ["/"]
+    # disallow_paths: ["/accounts", ...]
+    attr_reader :rules
+
+    # Initializes and returns a Wgit::RobotsParser instance having parsed the
+    # robot.txt contents.
+    #
+    # @param contents [String, #to_s] The contents of the robots.txt file to be
+    # parsed.
+    def initialize(contents)
+      @rules = {
+        allow_paths: Set.new,
+        disallow_paths: Set.new
+      }
+
+      assert_respond_to(contents, :to_s)
+      parse(contents.to_s)
+    end
+
+    # Overrides String#inspect to shorten the printed output of a Parser.
+    #
+    # @return [String] A short textual representation of this Parser.
+    def inspect
+      "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
+    end
+
+    # Returns the allow paths/rules for this parser's robots.txt contents.
+    #
+    # @return [Array<String>] The allow paths/rules to follow.
+    def allow_paths
+      @rules[:allow_paths].to_a
+    end
+
+    # Returns the disallow paths/rules for this parser's robots.txt contents.
+    #
+    # @return [Array<String>] The disallow paths/rules to follow.
+    def disallow_paths
+      @rules[:disallow_paths].to_a
+    end
+
+    # Returns whether or not there are rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are rules for Wgit to follow, false
+    # otherwise.
+    def rules?
+      allow_rules? || disallow_rules?
+    end
+
+    # Returns whether or not there are allow rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are allow rules for Wgit to follow,
+    # false otherwise.
+    def allow_rules?
+      @rules[:allow_paths].any?
+    end
+
+    # Returns whether or not there are disallow rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are disallow rules for Wgit to follow,
+    # false otherwise.
+    def disallow_rules?
+      @rules[:disallow_paths].any?
+    end
+
+    # Returns whether or not Wgit is banned from indexing this site.
+    #
+    # @return [Boolean] True if Wgit should not index this site, false
+    # otherwise.
+    def no_index?
+      @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
+    end
+
+    private
+
+    # Parses the file contents and sets @rules.
+    def parse(contents)
+      user_agents = []
+      new_block = false
+
+      contents.split("\n").each do |line|
+        line.strip!
+        next if line.empty? || line.start_with?(KEY_COMMENT)
+
+        # A user agent block is denoted by N User-agent's followed by N
+        # Allow/Disallow's. After which a new block is formed from scratch.
+        if start_with_any_case?(line, KEY_USER_AGENT)
+          if new_block
+            user_agents = []
+            new_block = false
+          end
+          user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
+        else
+          new_block = true
+        end
+
+        if start_with_any_case?(line, KEY_ALLOW)
+          append_allow_rule(user_agents, line)
+        elsif start_with_any_case?(line, KEY_DISALLOW)
+          append_disallow_rule(user_agents, line)
+        elsif !start_with_any_case?(line, KEY_USER_AGENT)
+          Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
+        end
+      end
+    end
+
+    # Implements start_with? but case insensitive.
+    def start_with_any_case?(str, prefix)
+      str.downcase.start_with?(prefix.downcase)
+    end
+
+    # Returns line with key removed (if present). Otherwise line is returned
+    # as given.
+    def remove_key(line, key)
+      return line unless start_with_any_case?(line, key)
+      return line unless line.count(KEY_SEPARATOR) == 1
+
+      segs = line.split(KEY_SEPARATOR)
+      return '' if segs.size == 1
+
+      segs.last.strip
+    end
+
+    # Don't append * or /, as this means all paths, which is the same as no
+    # allow_paths when passed to Wgit::Crawler.
+    def append_allow_rule(user_agents, line)
+      return unless wgit_user_agent?(user_agents)
+
+      path = remove_key(line, KEY_ALLOW)
+      path = parse_special_syntax(path)
+      return if PATHS_ALL.include?(path)
+
+      @rules[:allow_paths] << path
+    end
+
+    def append_disallow_rule(user_agents, line)
+      return unless wgit_user_agent?(user_agents)
+
+      path = remove_key(line, KEY_DISALLOW)
+      path = parse_special_syntax(path)
+      @rules[:disallow_paths] << path
+    end
+
+    def wgit_user_agent?(user_agents)
+      user_agents.any? do |agent|
+        [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
+      end
+    end
+
+    def parse_special_syntax(path)
+      # Remove $ e.g. "/blah$" becomes "/blah"
+      path = path.gsub('$', '')
+
+      # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
+      path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
+
+      # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
+      path = '*' if path.empty?
+
+      path
+    end
+
+    alias_method :paths, :rules
+    alias_method :banned?, :no_index?
+  end
+end
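Wgit::RobotsParser can also be used standalone. A small example using only the methods defined above; the robots.txt contents are made up:

    require 'wgit'

    contents = <<~ROBOTS
      User-agent: *
      Disallow: /admin
      Allow: /admin/public$
    ROBOTS

    parser = Wgit::RobotsParser.new(contents)

    parser.rules?          # => true
    parser.no_index?       # => false, since '/' or '*' isn't disallowed
    parser.disallow_paths  # => ["/admin"]
    parser.allow_paths     # => ["/admin/public"], the trailing $ is stripped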