kudzu 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +1 -1
- data/lib/kudzu/adapter/memory/model/link.rb +2 -6
- data/lib/kudzu/adapter/memory/model/page.rb +3 -8
- data/lib/kudzu/adapter/memory/repository.rb +0 -2
- data/lib/kudzu/adapter/memory.rb +3 -4
- data/lib/kudzu/agent/all.rb +1 -1
- data/lib/kudzu/agent/fetcher.rb +46 -49
- data/lib/kudzu/agent/http/connection.rb +9 -0
- data/lib/kudzu/agent/http/connection_pool.rb +50 -0
- data/lib/kudzu/agent/page_filterer.rb +58 -0
- data/lib/kudzu/agent/reference.rb +9 -0
- data/lib/kudzu/agent/response.rb +14 -0
- data/lib/kudzu/agent/robots/parser.rb +91 -0
- data/lib/kudzu/agent/robots/txt.rb +34 -0
- data/lib/kudzu/agent/robots.rb +12 -123
- data/lib/kudzu/agent/sleeper.rb +2 -2
- data/lib/kudzu/agent/url_extractor.rb +60 -46
- data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
- data/lib/kudzu/agent/util/charset_detector.rb +84 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
- data/lib/kudzu/agent/util/matcher.rb +25 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
- data/lib/kudzu/agent/util/title_parser.rb +30 -0
- data/lib/kudzu/agent.rb +42 -0
- data/lib/kudzu/callback.rb +4 -2
- data/lib/kudzu/config/filter.rb +11 -11
- data/lib/kudzu/config.rb +20 -25
- data/lib/kudzu/crawler.rb +65 -146
- data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
- data/lib/kudzu/model/base.rb +9 -0
- data/lib/kudzu/model/link.rb +9 -0
- data/lib/kudzu/model/page.rb +112 -0
- data/lib/kudzu/thread_pool.rb +36 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +21 -3
- metadata +21 -19
- data/lib/kudzu/adapter/base/link.rb +0 -8
- data/lib/kudzu/adapter/base/page.rb +0 -106
- data/lib/kudzu/adapter/memory/all.rb +0 -3
- data/lib/kudzu/agent/charset_detector.rb +0 -84
- data/lib/kudzu/agent/filter.rb +0 -40
- data/lib/kudzu/agent/mime_type_detector.rb +0 -34
- data/lib/kudzu/agent/title_parser.rb +0 -16
- data/lib/kudzu/logger.rb +0 -20
- data/lib/kudzu/revisit/all.rb +0 -3
- data/lib/kudzu/revisit/scheduler.rb +0 -28
- data/lib/kudzu/util/all.rb +0 -3
- data/lib/kudzu/util/connection_pool.rb +0 -56
- data/lib/kudzu/util/content_type_parser.rb +0 -24
- data/lib/kudzu/util/matcher.rb +0 -21
- data/lib/kudzu/util/thread_pool.rb +0 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ba76e95628d94560421358aa7982bdc429971e4
|
4
|
+
data.tar.gz: e1875f5760573a021fcf129018aaba5f6213ad23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f17f799c2ad67722860bbad00e9e220db8265221d598aebbbd181fea41546454e954fc67db18abba2b8b68797fe44da3887ec4ea3a5d2486fd1afd61584c152
|
7
|
+
data.tar.gz: 65d3bf42fafbcf835740ebe5534a52163f09ef313795152fafadc87aeaa335d540b037180a4f03208471361a9284635ff15a85471f3def22ba43e341ee1eb724
|
@@ -1,14 +1,10 @@
|
|
1
1
|
module Kudzu
|
2
2
|
module Adapter
|
3
3
|
module Memory
|
4
|
-
class Link
|
5
|
-
include Kudzu::
|
4
|
+
class Link < Kudzu::Model::Base
|
5
|
+
include Kudzu::Model::Link
|
6
6
|
|
7
7
|
attr_accessor :uuid, :url, :title, :state, :depth
|
8
|
-
|
9
|
-
def initialize(attr = {})
|
10
|
-
attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
|
11
|
-
end
|
12
8
|
end
|
13
9
|
end
|
14
10
|
end
|
@@ -1,16 +1,11 @@
|
|
1
1
|
module Kudzu
|
2
2
|
module Adapter
|
3
3
|
module Memory
|
4
|
-
class Page
|
5
|
-
include Kudzu::
|
4
|
+
class Page < Kudzu::Model::Base
|
5
|
+
include Kudzu::Model::Page
|
6
6
|
|
7
7
|
attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest,
|
8
|
-
:response_header, :response_time, :redirect_from, :fetched_at, :revised_at
|
9
|
-
:revisit_interval, :revisit_at
|
10
|
-
|
11
|
-
def initialize(attr = {})
|
12
|
-
attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
|
13
|
-
end
|
8
|
+
:response_header, :response_time, :redirect_from, :fetched_at, :revised_at
|
14
9
|
end
|
15
10
|
end
|
16
11
|
end
|
data/lib/kudzu/adapter/memory.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
|
-
require_relative
|
3
|
-
|
4
|
-
Kudzu.adapter = Kudzu::Adapter::Memory
|
1
|
+
Dir[File.join(__dir__, 'memory/**/*.rb')].each do |file|
|
2
|
+
require_relative file
|
3
|
+
end
|
data/lib/kudzu/agent/all.rb
CHANGED
data/lib/kudzu/agent/fetcher.rb
CHANGED
@@ -1,67 +1,65 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'http-cookie'
|
3
|
-
|
4
1
|
module Kudzu
|
5
2
|
class Agent
|
6
3
|
class Fetcher
|
7
|
-
class Response
|
8
|
-
attr_accessor :url, :status, :header, :body, :time, :redirected
|
9
|
-
|
10
|
-
def initialize(attr = {})
|
11
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def redirected?
|
15
|
-
redirected
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
4
|
attr_reader :pool
|
20
5
|
|
21
6
|
def initialize(config, robots = nil)
|
22
7
|
@config = config
|
23
|
-
@pool =
|
24
|
-
@sleeper =
|
8
|
+
@pool = Http::ConnectionPool.new(@config.max_connection || 100)
|
9
|
+
@sleeper = Sleeper.new(@config, robots)
|
10
|
+
@filterer = PageFilterer.new(@config)
|
25
11
|
@jar = HTTP::CookieJar.new
|
26
12
|
end
|
27
13
|
|
28
|
-
def fetch(url, request_header: {}, redirect: max_redirect,
|
14
|
+
def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
|
29
15
|
uri = Addressable::URI.parse(url)
|
30
|
-
http = @pool.checkout(pool_name(uri)) { build_http(uri) }
|
31
16
|
request = build_request(uri, request_header: request_header, method: method)
|
32
|
-
|
33
|
-
append_cookie(url, request) if @config.handle_cookie
|
34
|
-
|
35
|
-
@sleeper.politeness_delay(url)
|
36
|
-
|
37
|
-
response = nil
|
38
|
-
response_time = Benchmark.realtime { response = http.request(request) }
|
39
|
-
|
40
|
-
parse_cookie(url, response) if @config.handle_cookie
|
17
|
+
response, response_time = send_request(uri, request)
|
41
18
|
|
42
19
|
if redirection?(response.code) && response['location'] && redirect > 0
|
43
|
-
fetch(uri.join(response['location']).to_s, request_header: request_header,
|
20
|
+
fetch(uri.join(response['location']).to_s, request_header: request_header,
|
21
|
+
redirect: redirect - 1,
|
22
|
+
redirect_from: redirect_from || url)
|
44
23
|
else
|
45
|
-
|
46
|
-
res.redirected = (redirect != max_redirect)
|
47
|
-
res
|
24
|
+
build_response(url, response, response_time, redirect_from)
|
48
25
|
end
|
49
26
|
end
|
50
27
|
|
51
28
|
private
|
52
29
|
|
53
|
-
def max_redirect
|
54
|
-
@config.max_redirect || 5
|
55
|
-
end
|
56
|
-
|
57
30
|
def pool_name(uri)
|
58
31
|
"#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
|
59
32
|
end
|
60
33
|
|
34
|
+
# Sends +request+ over the connection supplied by start_http and returns
# whatever start_http returns (the response together with the elapsed time).
#
# The response is inspected as soon as its headers are available. When the
# page filterer rejects those headers, the connection is finished and the
# response is handed back without letting Net::HTTP continue with it.
def send_request(uri, request)
  start_http(uri, request) do |http|
    http.request(request) do |response|
      # Headers are acceptable: fall through and let the request complete normally.
      next if @filterer.allowed_response_header?(uri.to_s, response)

      http.finish
      break response
    end
  end
end
|
44
|
+
|
45
|
+
# Checks out a pooled connection for +uri+, yields it to the caller's block
# and measures how long the block takes.
#
# Around the block it also:
# * adds stored cookies to +request+ and records cookies from the response,
#   both only when @config.handle_cookie is set;
# * calls the sleeper's politeness delay before the timer starts, so the
#   delay is not counted in the response time.
#
# @param uri [Addressable::URI] target of the request
# @param request [Net::HTTPRequest] request about to be sent
# @yieldparam http the connection checked out from the pool
# @yieldreturn the response object produced by the block
# @return [Array] two elements: the block's return value and the elapsed
#   time in seconds (Float)
def start_http(uri, request)
  http = @pool.checkout(pool_name(uri)) { build_http(uri) }
  append_cookie(uri, request) if @config.handle_cookie
  @sleeper.politeness_delay(uri)

  # A monotonic clock is used for the duration: wall-clock time can jump
  # (NTP, manual changes) and produce negative or inflated response times.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  response = yield http
  response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  parse_cookie(uri, response) if @config.handle_cookie
  return response, response_time
end
|
57
|
+
|
61
58
|
def build_http(uri)
|
62
59
|
http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
|
63
60
|
http.open_timeout = @config.open_timeout if @config.open_timeout
|
64
61
|
http.read_timeout = @config.read_timeout if @config.read_timeout
|
62
|
+
http.keep_alive_timeout = @config.keep_alive if @config.keep_alive
|
65
63
|
if uri.scheme == 'https'
|
66
64
|
http.use_ssl = true
|
67
65
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -70,7 +68,7 @@ module Kudzu
|
|
70
68
|
end
|
71
69
|
|
72
70
|
def build_request(uri, request_header:, method:)
|
73
|
-
request =
|
71
|
+
request = Object.const_get("Net::HTTP::#{method.capitalize}").new(uri.request_uri)
|
74
72
|
request.basic_auth uri.user, uri.password if uri.user && uri.password
|
75
73
|
|
76
74
|
request['User-Agent'] = @config.user_agent
|
@@ -80,16 +78,15 @@ module Kudzu
|
|
80
78
|
request
|
81
79
|
end
|
82
80
|
|
83
|
-
def
|
84
|
-
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_response(url, response, response_time)
|
81
|
+
def build_response(url, response, response_time, redirect_from)
|
82
|
+
fetched = response.instance_variable_get("@read")
|
88
83
|
Response.new(url: url,
|
89
84
|
status: response.code.to_i,
|
90
|
-
|
91
|
-
|
92
|
-
|
85
|
+
body: fetched ? response.body.to_s : nil,
|
86
|
+
response_header: Hash[response.each.to_a],
|
87
|
+
response_time: response_time,
|
88
|
+
redirect_from: redirect_from,
|
89
|
+
fetched: fetched)
|
93
90
|
end
|
94
91
|
|
95
92
|
def redirection?(code)
|
@@ -97,12 +94,12 @@ module Kudzu
|
|
97
94
|
300 <= code && code <= 399
|
98
95
|
end
|
99
96
|
|
100
|
-
def parse_cookie(
|
101
|
-
@jar.parse(response['set-cookie'],
|
97
|
+
def parse_cookie(uri, response)
|
98
|
+
@jar.parse(response['set-cookie'], uri.to_s) if response['set-cookie']
|
102
99
|
end
|
103
100
|
|
104
|
-
def append_cookie(
|
105
|
-
cookies = @jar.cookies(
|
101
|
+
def append_cookie(uri, request)
|
102
|
+
cookies = @jar.cookies(uri.to_s)
|
106
103
|
unless cookies.empty?
|
107
104
|
if request['Cookie']
|
108
105
|
request['Cookie'] += '; ' + cookies.join('; ')
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Http
      # Pool of HTTP connections keyed by a caller-supplied name.
      #
      # The connections are stored in Thread.current[:kudzu_connection], so each
      # thread has its own set, and every ConnectionPool used from the same
      # thread shares that one hash. Only the size limit is per instance.
      class ConnectionPool
        # @param max_size [Integer] number of connections kept per thread before
        #   the least recently used ones are closed
        def initialize(max_size = 10)
          @max_size = max_size
        end

        # Returns the connection registered under +name+ for the current thread,
        # building it from the block on first use.
        #
        # @param name [String] pool key
        # @yieldreturn the HTTP object to store when +name+ is not pooled yet
        # @return the pooled HTTP object
        def checkout(name)
          conn = (pool[name] ||= Connection.new(name: name, http: yield))
          conn.last_used = Time.now

          reduce if pool.size > @max_size

          conn.http
        end

        # Finishes every connection pooled for the current thread and drops the
        # thread's pool. Connections held by other threads are not touched.
        def close
          pool.each_value { |conn| finish_http(conn.http) }
          Thread.current[:kudzu_connection] = nil
        end

        private

        # Thread-local name => Connection hash, created on first access.
        def pool
          Thread.current[:kudzu_connection] ||= {}
        end

        # Closes and removes the least recently used connections until the pool
        # is back at @max_size entries.
        def reduce
          excess = pool.size - @max_size
          pool.values.sort_by(&:last_used).first(excess).each do |conn|
            finish_http(conn.http)
            pool.delete(conn.name)
          end
        end

        # Finishes +http+ only when it is present and currently started.
        def finish_http(http)
          http.finish if http && http.started?
        end
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Kudzu
  class Agent
    # Applies the configured per-URL filter (allowed/denied MIME types, maximum
    # size, optional robots "noindex" handling) to responses.
    class PageFilterer
      # @param config configuration object; this class uses its +find_filter+
      #   and +respect_noindex+ methods
      def initialize(config)
        @config = config
      end

      # Decides whether a response should be kept, and logs the decision.
      #
      # A response passes when no filter is configured for its URL, or when its
      # MIME type, size and robots meta tags are all acceptable.
      #
      # @param response object responding to url, mime_type, size, body, html?
      #   and parsed_doc
      # @return [Boolean]
      def allowed?(response)
        filter = @config.find_filter(response.url)

        if filter.nil? || (allowed_mime_type?(response.mime_type, filter) &&
                           allowed_size?(response.size, filter) &&
                           allowed_index?(response))
          Kudzu.log :info, "passed page: #{response.url}"
          true
        else
          Kudzu.log :info, "dropped page: #{response.url}"
          false
        end
      end

      # Pre-check based only on response headers (Content-Type and
      # Content-Length). Headers that are absent are treated as acceptable.
      #
      # @param url [String] URL used to look up the filter
      # @param response_header object indexable by lower-case header name
      # @return truthy when the headers are acceptable or no filter applies
      def allowed_response_header?(url, response_header)
        filter = @config.find_filter(url)
        # Without a filter nothing can be rejected, so do not parse the headers
        # at all; a malformed header must not break an unfiltered URL.
        return true if filter.nil?

        content_type = response_header['content-type']
        content_length = response_header['content-length']
        mime_type = Util::ContentTypeParser.parse(content_type).first if content_type
        size = content_length.to_i if content_length

        allowed_mime_type?(mime_type, filter) && allowed_size?(size, filter)
      end

      private

      # An unknown MIME type is never rejected.
      def allowed_mime_type?(mime_type, filter)
        return true if mime_type.nil?

        Util::Matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
      end

      # An unknown size, or a filter without max_size, is never rejected.
      # The size must be strictly below max_size.
      def allowed_size?(size, filter)
        return true if filter.max_size.nil? || size.nil?

        size.to_i < filter.max_size.to_i
      end

      # Honors <meta name="robots" content="...noindex..."> under html/head, but
      # only for HTML responses with a body and only when respect_noindex is on.
      def allowed_index?(response)
        return true if response.body.nil? || !response.html?
        return true unless @config.respect_noindex

        response.parsed_doc.xpath('html/head/meta[@name]').none? do |meta|
          meta[:name] =~ /^robots$/i && meta[:content] =~ /noindex/i
        end
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Kudzu
  class Agent
    # Value object describing the outcome of a fetch. It mixes in
    # Kudzu::Model::Page and takes its constructor from Kudzu::Model::Base.
    class Response < Kudzu::Model::Base
      include Kudzu::Model::Page

      # Fields describing the HTTP exchange itself.
      attr_accessor :url, :status, :body, :response_header, :response_time,
                    :redirect_from, :fetched

      # Content metadata.
      # NOTE(review): presumably filled in after the fetch - confirm against callers.
      attr_accessor :size, :digest, :mime_type, :charset, :title

      # Predicate form of the +fetched+ attribute; returns its value unchanged.
      def fetched?
        fetched
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Robots
      # Turns the text of a robots.txt file into a Txt object holding one
      # RuleSet per User-agent line plus the list of Sitemap URLs.
      class Parser
        # Fallback pattern used when a value cannot be compiled into a Regexp.
        UNMATCH_REGEXP = /^$/

        class << self
          # @param body [String, nil] raw robots.txt content (nil is treated as empty)
          # @return [Txt] parsed rule sets, most specific user agent first, each
          #   with its rules ordered longest path pattern first
          def parse(body)
            txt = Txt.new
            sets = []
            prev_key = nil

            parse_body(body).each do |key, value|
              case key
              when 'user-agent'
                new_set = RuleSet.new(user_agent: ua_regexp(value))
                txt.sets << new_set
                # Consecutive User-agent lines form one group sharing the rules
                # that follow; any other line in between starts a new group.
                if prev_key == 'user-agent'
                  sets << new_set
                else
                  sets = [new_set]
                end
              when 'allow', 'disallow'
                pattern = path_regexp(value)
                sets.each { |set| set.rules << Rule.new(path: pattern, allow: key == 'allow') }
              when 'crawl-delay'
                # NOTE(review): to_i truncates fractional delays such as "0.5" to 0.
                sets.each { |set| set.crawl_delay = value.to_i }
              when 'sitemap'
                txt.sitemaps << value
              end

              prev_key = key
            end

            sort(txt)
          end

          private

          # Splits the body into lines and returns the [key, value] pairs of the
          # lines that carry a directive.
          def parse_body(body)
            # "\r\n" must come first in the alternation, otherwise it is never
            # matched as a single line break.
            lines = body.to_s.split(/\r\n|\r|\n/)
            lines.map { |line| parse_line(line) }.compact
          end

          # Returns nil for blank and comment-only lines, else the parsed pair.
          def parse_line(line)
            line = line.strip # non-destructive: the caller's string is left alone
            return nil if line.empty? || line.start_with?('#')

            split_line(line)
          end

          # Splits "Key: value # comment" into a lower-cased key and a value with
          # the trailing comment removed; nil when either part is empty.
          def split_line(line)
            key, value = line.split(':', 2)
            key = key.to_s.strip.downcase
            value = value.to_s.sub(/#.*$/, '').strip
            return nil if key.empty? || value.empty?

            [key, value]
          end

          # Builds the pattern for a User-agent value; "*" acts as a wildcard.
          # Matching is case-insensitive, as RFC 9309 requires for product tokens.
          def ua_regexp(value)
            Regexp.new(Regexp.escape(value).gsub('\*', '.*'), Regexp::IGNORECASE)
          rescue RegexpError
            UNMATCH_REGEXP
          end

          # Builds the prefix pattern for an Allow/Disallow path; "*" is a
          # wildcard and "$" anchors the end of the path.
          def path_regexp(value)
            Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
          rescue RegexpError
            UNMATCH_REGEXP
          end

          # Orders rule sets so that patterns with fewer wildcards (then longer
          # patterns) come first, and each set's rules longest pattern first.
          def sort(txt)
            txt.sets.sort_by! { |set| [-set.user_agent.to_s.count('*'), set.user_agent.to_s.length] }.reverse!
            txt.sets.each do |set|
              set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
            end
            txt
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Robots
      # Parsed robots.txt: the rule sets plus the sitemap entries.
      class Txt < Kudzu::Model::Base
        attr_accessor :sets, :sitemaps

        # Starts with no rule sets and no sitemaps; takes no attributes.
        def initialize
          self.sitemaps = []
          self.sets = []
        end
      end

      # Rules that apply to one user-agent pattern.
      class RuleSet < Kudzu::Model::Base
        attr_accessor :user_agent, :rules, :crawl_delay

        # +rules+ defaults to an empty list before the attributes are handed to
        # the superclass constructor.
        def initialize(attr = {})
          self.rules = []
          super
        end

        # Returns the +allow+ flag of the first rule whose pattern matches the
        # path of +uri+, or true when no rule matches.
        def allowed_path?(uri)
          matched = rules.find { |rule| uri.path =~ rule.path }
          matched ? matched.allow : true
        end
      end

      # A single Allow/Disallow entry: a path pattern and its allow flag.
      class Rule < Kudzu::Model::Base
        attr_accessor :path, :allow
      end
    end
  end
end
|
data/lib/kudzu/agent/robots.rb
CHANGED
@@ -3,7 +3,6 @@ module Kudzu
|
|
3
3
|
class Robots
|
4
4
|
def initialize(config)
|
5
5
|
@user_agent = config.user_agent
|
6
|
-
@page_fetcher = Kudzu::Agent::Fetcher.new(config)
|
7
6
|
@monitor = Monitor.new
|
8
7
|
@txt = {}
|
9
8
|
end
|
@@ -49,11 +48,11 @@ module Kudzu
|
|
49
48
|
|
50
49
|
def fetch_and_parse(uri)
|
51
50
|
response = fetch(uri)
|
52
|
-
if response && response.
|
51
|
+
if response && response.code.to_i == 200
|
53
52
|
body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
|
54
|
-
Parser.
|
53
|
+
Parser.parse(body)
|
55
54
|
else
|
56
|
-
Parser.
|
55
|
+
Parser.parse('')
|
57
56
|
end
|
58
57
|
end
|
59
58
|
|
@@ -62,127 +61,17 @@ module Kudzu
|
|
62
61
|
uri.path = 'robots.txt'
|
63
62
|
uri.fragment = uri.query = nil
|
64
63
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
class Txt
|
73
|
-
attr_accessor :sets, :sitemaps
|
74
|
-
|
75
|
-
def initialize
|
76
|
-
self.sets = []
|
77
|
-
self.sitemaps = []
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
class RuleSet
|
82
|
-
attr_accessor :user_agent, :rules, :crawl_delay
|
83
|
-
|
84
|
-
def initialize(attr = {})
|
85
|
-
self.rules = []
|
86
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
87
|
-
end
|
88
|
-
|
89
|
-
def allowed_path?(uri)
|
90
|
-
rules.each do |rule|
|
91
|
-
return rule.allow if uri.path =~ rule.path
|
92
|
-
end
|
93
|
-
return true
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
class Rule
|
98
|
-
attr_accessor :path, :allow
|
99
|
-
|
100
|
-
def initialize(attr = {})
|
101
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
class Parser
|
106
|
-
UNMATCH_REGEXP = /^$/
|
107
|
-
|
108
|
-
def parse(body)
|
109
|
-
txt = Txt.new
|
110
|
-
sets = []
|
111
|
-
prev_key = nil
|
112
|
-
|
113
|
-
parse_body(body).each do |key, value|
|
114
|
-
case key
|
115
|
-
when 'user-agent'
|
116
|
-
new_set = RuleSet.new(user_agent: ua_regexp(value))
|
117
|
-
txt.sets << new_set
|
118
|
-
if prev_key == 'user-agent'
|
119
|
-
sets << new_set
|
120
|
-
else
|
121
|
-
sets = [new_set]
|
122
|
-
end
|
123
|
-
when 'allow'
|
124
|
-
re = path_regexp(value)
|
125
|
-
sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
|
126
|
-
when 'disallow'
|
127
|
-
re = path_regexp(value)
|
128
|
-
sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
|
129
|
-
when 'crawl-delay'
|
130
|
-
sets.each { |set| set.crawl_delay = value.to_i }
|
131
|
-
when 'sitemap'
|
132
|
-
txt.sitemaps << value
|
133
|
-
end
|
134
|
-
|
135
|
-
prev_key = key
|
136
|
-
end
|
137
|
-
|
138
|
-
sort(txt)
|
139
|
-
end
|
140
|
-
|
141
|
-
private
|
142
|
-
|
143
|
-
def parse_body(body)
|
144
|
-
lines = body.to_s.split(/\r|\n|\r\n/)
|
145
|
-
lines.map { |line| parse_line(line) }.compact
|
146
|
-
end
|
147
|
-
|
148
|
-
def parse_line(line)
|
149
|
-
line.strip!
|
150
|
-
if line.empty? || line.start_with?('#')
|
151
|
-
nil
|
152
|
-
else
|
153
|
-
split_line(line)
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def split_line(line)
|
158
|
-
key, value = line.split(':', 2)
|
159
|
-
key = key.to_s.strip.downcase
|
160
|
-
value = value.to_s.sub(/#.*$/, '').strip
|
161
|
-
if key.empty? || value.empty?
|
162
|
-
nil
|
163
|
-
else
|
164
|
-
[key, value]
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
def ua_regexp(value)
|
169
|
-
Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
|
170
|
-
rescue RegexpError
|
171
|
-
UNMATCH_REGEXP
|
64
|
+
http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
|
65
|
+
if uri.scheme == 'https'
|
66
|
+
http.use_ssl = true
|
67
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
172
68
|
end
|
173
69
|
|
174
|
-
|
175
|
-
|
176
|
-
rescue
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
def sort(txt)
|
181
|
-
txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
|
182
|
-
txt.sets.each do |set|
|
183
|
-
set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
|
184
|
-
end
|
185
|
-
txt
|
70
|
+
begin
|
71
|
+
http.get(uri.to_s)
|
72
|
+
rescue => e
|
73
|
+
Kudzu.log :error, "failed to fetch robots.txt: #{uri}", error: e
|
74
|
+
nil
|
186
75
|
end
|
187
76
|
end
|
188
77
|
end
|
data/lib/kudzu/agent/sleeper.rb
CHANGED