wgit 0.10.7 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +22 -2
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +67 -39
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +118 -51
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/indexer.rb
CHANGED
@@ -26,33 +26,38 @@ module Wgit
     # Retrieves uncrawled url's from the database and recursively crawls each
     # site storing their internal pages into the database and adding their
     # external url's to be crawled later on. Logs info on the crawl using
-    # Wgit.logger as it goes along.
+    # Wgit.logger as it goes along. This method will honour all site's
+    # robots.txt and 'noindex' requests.
     #
     # @param max_sites [Integer] The number of separate and whole
     # websites to be crawled before the method exits. Defaults to -1 which
-    # means the crawl will occur until manually stopped (Ctrl+C
+    # means the crawl will occur until manually stopped (Ctrl+C), the
+    # max_data has been reached, or it runs out of external urls to index.
     # @param max_data [Integer] The maximum amount of bytes that will be
     # scraped from the web (default is 1GB). Note, that this value is used to
     # determine when to stop crawling; it's not a guarantee of the max data
     # that will be obtained.
-
+    # @param max_urls_per_iteration [Integer] The maximum number of uncrawled
+    # urls to index for each iteration, before checking max_sites and
+    # max_data, possibly ending the crawl.
+    def index_www(max_sites: -1, max_data: 1_048_576_000, max_urls_per_iteration: 10)
       if max_sites.negative?
         Wgit.logger.info("Indexing until the database has been filled or it \
-runs out of urls to crawl (which might be never)
+runs out of urls to crawl (which might be never)")
       end
       site_count = 0

       while keep_crawling?(site_count, max_sites, max_data)
         Wgit.logger.info("Current database size: #{@db.size}")

-        uncrawled_urls = @db.uncrawled_urls(limit:
+        uncrawled_urls = @db.uncrawled_urls(limit: max_urls_per_iteration)

         if uncrawled_urls.empty?
-          Wgit.logger.info('No urls to crawl, exiting
+          Wgit.logger.info('No urls to crawl, exiting')

           return
         end
-        Wgit.logger.info("Starting
+        Wgit.logger.info("Starting indexing loop for: #{uncrawled_urls.map(&:to_s)}")

         docs_count = 0
         urls_count = 0
@@ -60,38 +65,48 @@ runs out of urls to crawl (which might be never).")
         uncrawled_urls.each do |url|
           unless keep_crawling?(site_count, max_sites, max_data)
             Wgit.logger.info("Reached max number of sites to crawl or \
-database capacity, exiting
+database capacity, exiting")

             return
           end
           site_count += 1

+          parser = parse_robots_txt(url)
+          if parser&.no_index?
+            upsert_url_and_redirects(url)
+
+            next
+          end
+
           site_docs_count = 0
-          ext_links = @crawler.crawl_site(
-
-
-
-
-
+          ext_links = @crawler.crawl_site(
+            url, allow_paths: parser&.allow_paths, disallow_paths: parser&.disallow_paths
+          ) do |doc|
+            next if doc.empty? || no_index?(@crawler.last_response, doc)
+
+            upsert_doc(doc)
+            docs_count += 1
+            site_docs_count += 1
           end

-
+          upsert_url_and_redirects(url)

-          urls_count +=
+          urls_count += upsert_external_urls(ext_links)
         end

         Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
-url(s)
+url(s) during this iteration")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
-
-
-        nil
+future iterations")
       end
+
+      nil
     end

     # Crawls a single website's pages and stores them into the database.
     # There is no max download limit so be careful which sites you index.
-    # Logs info on the crawl using Wgit.logger as it goes along.
+    # Logs info on the crawl using Wgit.logger as it goes along. This method
+    # will honour the site's robots.txt and 'noindex' requests.
     #
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
@@ -113,28 +128,30 @@ the next iteration.")
       url, insert_externals: false, follow: :default,
      allow_paths: nil, disallow_paths: nil
     )
-
-
-
-
-
+      parser = parse_robots_txt(url)
+      if parser&.no_index?
+        upsert_url_and_redirects(url)
+
+        return 0
+      end
+
+      allow_paths, disallow_paths = merge_paths(parser, allow_paths, disallow_paths)
+      crawl_opts = { follow:, allow_paths:, disallow_paths: }
       total_pages_indexed = 0

       ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        next if no_index?(@crawler.last_response, doc)
+
         result = block_given? ? yield(doc) : true

         if result && !doc.empty?
-
+          upsert_doc(doc)
           total_pages_indexed += 1
         end
       end

-
-
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_url_and_redirects(url)
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls

       Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
 for the site: #{url}")
@@ -145,6 +162,8 @@ for the site: #{url}")
     # Crawls one or more webpages and stores them into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given urls.
     #
     # @param urls [*Wgit::Url] The webpage Url's to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -157,7 +176,7 @@ for the site: #{url}")
     def index_urls(*urls, insert_externals: false, &block)
       raise 'You must provide at least one Url' if urls.empty?

-      opts = { insert_externals:
+      opts = { insert_externals: }
       Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }

       nil
@@ -166,6 +185,8 @@ for the site: #{url}")
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
+    # This method will honour the site's robots.txt and 'noindex' requests
+    # in relation to the given url.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the webpages
@@ -175,18 +196,24 @@ for the site: #{url}")
     # manipulation. Return nil or false from the block to prevent the
     # document from being saved into the database.
     def index_url(url, insert_externals: false)
+      parser = parse_robots_txt(url)
+      if parser && (parser.no_index? || contains_path?(parser.disallow_paths, url))
+        upsert_url_and_redirects(url)
+
+        return
+      end
+
       document = @crawler.crawl_url(url) do |doc|
+        break if no_index?(@crawler.last_response, doc)
+
         result = block_given? ? yield(doc) : true
-
+        upsert_doc(doc) if result && !doc.empty?
       end

-
+      upsert_url_and_redirects(url)

       ext_urls = document&.external_links
-      if insert_externals && ext_urls
-        num_inserted_urls = write_urls_to_db(ext_urls)
-        Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
-      end
+      upsert_external_urls(ext_urls) if insert_externals && ext_urls

       nil
     end
@@ -210,10 +237,11 @@ for the site: #{url}")
     end

     # Write the doc to the DB. Note that the unique url index on the documents
-    # collection deliberately prevents duplicate inserts.
+    # collection deliberately prevents duplicate inserts. If the document
+    # already exists, then it will be updated in the DB.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    def
+    def upsert_doc(doc)
       if @db.upsert(doc)
         Wgit.logger.info("Saved document for url: #{doc.url}")
       else
@@ -221,35 +249,101 @@ for the site: #{url}")
       end
     end

-    #
-    # collection deliberately prevents duplicate inserts.
+    # Upsert the url and its redirects, setting all to crawled = true.
     #
-    # @param
-    # @return [Integer] The number of
-    def
-
+    # @param url [Wgit::Url] The url to write to the DB.
+    # @return [Integer] The number of upserted urls (url + redirect urls).
+    def upsert_url_and_redirects(url)
+      url.crawled = true unless url.crawled?

-
+      # Upsert the url and any url redirects, setting them as crawled also.
+      @db.bulk_upsert(url.redirects_journey)
+    end

-
-
-
-
-
+    # Write the external urls to the DB. For any external url, its origin will
+    # be inserted e.g. if the external url is http://example.com/contact then
+    # http://example.com will be inserted into the database. Note that the
+    # unique url index on the urls collection deliberately prevents duplicate
+    # inserts.
+    #
+    # @param urls [Array<Wgit::Url>] The external urls to write to the DB.
+    # @return [Integer] The number of upserted urls.
+    def upsert_external_urls(urls)
+      urls = urls
+             .reject(&:invalid?)
+             .map(&:to_origin)
+             .uniq
+      return 0 if urls.empty?
+
+      count = @db.bulk_upsert(urls)
+      Wgit.logger.info("Saved #{count} external urls")

-
-
+      count
+    end
+
+    private
+
+    # Crawls and parses robots.txt file (if found). Returns the parser or nil.
+    def parse_robots_txt(url)
+      robots_url = url.to_origin.join('/robots.txt')
+
+      Wgit.logger.info("Crawling for robots.txt: #{robots_url}")
+
+      doc = @crawler.crawl_url(robots_url)
+      return nil if !@crawler.last_response.ok? || doc.empty?
+
+      parser = Wgit::RobotsParser.new(doc.content)

-
-
-
+      Wgit.logger.info("robots.txt allow paths: #{parser.allow_paths}")
+      Wgit.logger.info("robots.txt disallow paths: #{parser.disallow_paths}")
+      if parser.no_index?
+        Wgit.logger.info('robots.txt has banned wgit indexing, skipping')
       end

-
+      parser
+    end
+
+    # Takes the user defined allow/disallow_paths and merges robots paths
+    # into them. The allow/disallow_paths vars each can be of type nil, String,
+    # Enumerable<String>.
+    def merge_paths(parser, allow_paths, disallow_paths)
+      return allow_paths, disallow_paths unless parser&.rules?
+
+      allow = allow_paths || []
+      allow = [allow] unless allow.is_a?(Enumerable)
+
+      disallow = disallow_paths || []
+      disallow = [disallow] unless disallow.is_a?(Enumerable)
+
+      allow.concat(parser.allow_paths)
+      disallow.concat(parser.disallow_paths)
+
+      [allow, disallow]
+    end
+
+    # Returns true if url is included in the given paths.
+    def contains_path?(paths, url)
+      paths.any? { |path| Wgit::Url.new(path).to_path == url.to_path }
+    end
+
+    # Returns if the last_response or doc #no_index? is true or not.
+    def no_index?(last_response, doc)
+      url = last_response.url.to_s
+      if last_response.no_index?
+        Wgit.logger.info("Skipping page due to no-index response header: #{url}")
+        return true
+      end
+
+      if doc&.no_index?
+        Wgit.logger.info("Skipping page due to no-index HTML meta tag: #{url}")
+        return true
+      end
+
+      false
     end

-
-
-
+    alias_method :database, :db
+    alias_method :index, :index_urls
+    alias_method :index_r, :index_site
   end
 end
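For orientation, here is a minimal usage sketch of the reworked Indexer API above. It assumes a reachable MongoDB instance and that Wgit::Database.new accepts a connection string and Wgit::Indexer.new a Database, as per wgit's README; the connection string and example.org url are illustrative, not part of this diff.

require 'wgit'

# Illustrative connection string for a local MongoDB instance.
db      = Wgit::Database.new('mongodb://localhost:27017/wgit_dev')
indexer = Wgit::Indexer.new(db)

# index_www now takes max_urls_per_iteration and honours robots.txt and
# 'noindex' requests as it crawls.
indexer.index_www(max_sites: 3, max_data: 100_000_000, max_urls_per_iteration: 10)

# index_site returns 0 straight away if robots.txt bans wgit from indexing.
indexer.index_site(Wgit::Url.new('https://example.org')) do |doc|
  !doc.empty? # return false/nil from the block to skip saving a given document
end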
data/lib/wgit/logger.rb
CHANGED
@@ -28,7 +28,7 @@ module Wgit
   #
   # @return [Logger] The default Logger instance.
   def self.default_logger
-    logger = Logger.new(
+    logger = Logger.new($stdout, progname: 'wgit', level: :info)
     logger.formatter = proc do |_severity, _datetime, progname, msg|
       "[#{progname}] #{msg}\n"
     end
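In practice the new default logger prints "[wgit] <message>" to $stdout at info level. A small sketch of swapping it out, assuming the Wgit.logger= writer from the same logger.rb file; the log file name is illustrative.

require 'wgit'

Wgit.logger.info('starting crawl') # prints "[wgit] starting crawl" to $stdout

# Redirect wgit's logging elsewhere, or quieten it by raising the level.
Wgit.logger = Logger.new('wgit.log', progname: 'wgit', level: :warn)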
data/lib/wgit/response.rb
CHANGED
@@ -33,6 +33,13 @@ module Wgit
       @total_time = 0.0
     end

+    # Overrides String#inspect to shorten the printed output of a Response.
+    #
+    # @return [String] A short textual representation of this Response.
+    def inspect
+      "#<Wgit::Response url=\"#{@url}\" status=#{status}>"
+    end
+
     # Adds time to @total_time (incrementally).
     #
     # @param time [Float] The time to add to @total_time.
@@ -134,11 +141,19 @@ module Wgit
       @status.positive?
     end

-
-
-
-
-
-
+    # Returns whether or not Wgit is banned from indexing this site.
+    #
+    # @return [Boolean] True if Wgit should not index this site, false
+    # otherwise.
+    def no_index?
+      headers.fetch(:x_robots_tag, '').downcase.strip == 'noindex'
+    end
+
+    alias_method :code, :status
+    alias_method :content, :body
+    alias_method :crawl_duration, :total_time
+    alias_method :to_s, :body
+    alias_method :redirects, :redirections
+    alias_method :length, :size
   end
 end
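A short sketch of how the new Response helpers read after a crawl. The url is illustrative; Wgit::Crawler#crawl_url and #last_response are the existing crawler methods referenced in the indexer diff above.

require 'wgit'

crawler = Wgit::Crawler.new
crawler.crawl_url(Wgit::Url.new('https://example.org'))

response = crawler.last_response
response.no_index?      # true only when the X-Robots-Tag header is 'noindex'
response.code           # alias for #status
response.crawl_duration # alias for #total_time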
data/lib/wgit/robots_parser.rb
ADDED
@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+
+module Wgit
+  # The RobotsParser class handles parsing and processing of a web servers
+  # robots.txt file.
+  class RobotsParser
+    include Wgit::Assertable
+
+    # Key representing the start of a comment.
+    KEY_COMMENT = '#'
+    # Key value separator used in robots.txt files.
+    KEY_SEPARATOR = ':'
+    # Key representing a user agent.
+    KEY_USER_AGENT = 'User-agent'
+    # Key representing an allow URL rule.
+    KEY_ALLOW = 'Allow'
+    # Key representing a disallow URL rule.
+    KEY_DISALLOW = 'Disallow'
+
+    # Value representing the Wgit user agent.
+    USER_AGENT_WGIT = :wgit
+    # Value representing any user agent including Wgit.
+    USER_AGENT_ANY = :*
+
+    # Value representing any and all paths.
+    PATHS_ALL = %w[/ *].freeze
+
+    # Hash containing the user-agent allow/disallow URL rules. Looks like:
+    # allow_paths: ["/"]
+    # disallow_paths: ["/accounts", ...]
+    attr_reader :rules
+
+    # Initializes and returns a Wgit::RobotsParser instance having parsed the
+    # robot.txt contents.
+    #
+    # @param contents [String, #to_s] The contents of the robots.txt file to be
+    # parsed.
+    def initialize(contents)
+      @rules = {
+        allow_paths: Set.new,
+        disallow_paths: Set.new
+      }
+
+      assert_respond_to(contents, :to_s)
+      parse(contents.to_s)
+    end
+
+    # Overrides String#inspect to shorten the printed output of a Parser.
+    #
+    # @return [String] A short textual representation of this Parser.
+    def inspect
+      "#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
+    end
+
+    # Returns the allow paths/rules for this parser's robots.txt contents.
+    #
+    # @return [Array<String>] The allow paths/rules to follow.
+    def allow_paths
+      @rules[:allow_paths].to_a
+    end
+
+    # Returns the disallow paths/rules for this parser's robots.txt contents.
+    #
+    # @return [Array<String>] The disallow paths/rules to follow.
+    def disallow_paths
+      @rules[:disallow_paths].to_a
+    end
+
+    # Returns whether or not there are rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are rules for Wgit to follow, false
+    # otherwise.
+    def rules?
+      allow_rules? || disallow_rules?
+    end
+
+    # Returns whether or not there are allow rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are allow rules for Wgit to follow,
+    # false otherwise.
+    def allow_rules?
+      @rules[:allow_paths].any?
+    end
+
+    # Returns whether or not there are disallow rules applying to Wgit.
+    #
+    # @return [Boolean] True if there are disallow rules for Wgit to follow,
+    # false otherwise.
+    def disallow_rules?
+      @rules[:disallow_paths].any?
+    end
+
+    # Returns whether or not Wgit is banned from indexing this site.
+    #
+    # @return [Boolean] True if Wgit should not index this site, false
+    # otherwise.
+    def no_index?
+      @rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
+    end
+
+    private
+
+    # Parses the file contents and sets @rules.
+    def parse(contents)
+      user_agents = []
+      new_block = false
+
+      contents.split("\n").each do |line|
+        line.strip!
+        next if line.empty? || line.start_with?(KEY_COMMENT)
+
+        # A user agent block is denoted by N User-agent's followed by N
+        # Allow/Disallow's. After which a new block is formed from scratch.
+        if start_with_any_case?(line, KEY_USER_AGENT)
+          if new_block
+            user_agents = []
+            new_block = false
+          end
+          user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
+        else
+          new_block = true
+        end
+
+        if start_with_any_case?(line, KEY_ALLOW)
+          append_allow_rule(user_agents, line)
+        elsif start_with_any_case?(line, KEY_DISALLOW)
+          append_disallow_rule(user_agents, line)
+        elsif !start_with_any_case?(line, KEY_USER_AGENT)
+          Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
+        end
+      end
+    end
+
+    # Implements start_with? but case insensitive.
+    def start_with_any_case?(str, prefix)
+      str.downcase.start_with?(prefix.downcase)
+    end
+
+    # Returns line with key removed (if present). Otherwise line is returned
+    # as given.
+    def remove_key(line, key)
+      return line unless start_with_any_case?(line, key)
+      return line unless line.count(KEY_SEPARATOR) == 1
+
+      segs = line.split(KEY_SEPARATOR)
+      return '' if segs.size == 1
+
+      segs.last.strip
+    end
+
+    # Don't append * or /, as this means all paths, which is the same as no
+    # allow_paths when passed to Wgit::Crawler.
+    def append_allow_rule(user_agents, line)
+      return unless wgit_user_agent?(user_agents)
+
+      path = remove_key(line, KEY_ALLOW)
+      path = parse_special_syntax(path)
+      return if PATHS_ALL.include?(path)
+
+      @rules[:allow_paths] << path
+    end
+
+    def append_disallow_rule(user_agents, line)
+      return unless wgit_user_agent?(user_agents)
+
+      path = remove_key(line, KEY_DISALLOW)
+      path = parse_special_syntax(path)
+      @rules[:disallow_paths] << path
+    end
+
+    def wgit_user_agent?(user_agents)
+      user_agents.any? do |agent|
+        [USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
+      end
+    end
+
+    def parse_special_syntax(path)
+      # Remove $ e.g. "/blah$" becomes "/blah"
+      path = path.gsub('$', '')
+
+      # Remove any inline comments e.g. "/blah # comment" becomes "/blah"
+      path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
+
+      # Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
+      path = '*' if path.empty?
+
+      path
+    end
+
+    alias_method :paths, :rules
+    alias_method :banned?, :no_index?
+  end
+end
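Finally, a brief usage sketch of the new Wgit::RobotsParser shown above, fed with hypothetical robots.txt contents.

require 'wgit'

contents = <<~ROBOTS
  User-agent: *
  Allow: /public
  Disallow: /accounts
ROBOTS

parser = Wgit::RobotsParser.new(contents)

parser.rules?          # => true
parser.allow_paths     # => ["/public"]
parser.disallow_paths  # => ["/accounts"]
parser.no_index?       # => false ("Disallow: /" or "Disallow: *" would make it true)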