kudzu 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +1 -1
- data/lib/kudzu/adapter/memory/model/link.rb +2 -6
- data/lib/kudzu/adapter/memory/model/page.rb +3 -8
- data/lib/kudzu/adapter/memory/repository.rb +0 -2
- data/lib/kudzu/adapter/memory.rb +3 -4
- data/lib/kudzu/agent/all.rb +1 -1
- data/lib/kudzu/agent/fetcher.rb +46 -49
- data/lib/kudzu/agent/http/connection.rb +9 -0
- data/lib/kudzu/agent/http/connection_pool.rb +50 -0
- data/lib/kudzu/agent/page_filterer.rb +58 -0
- data/lib/kudzu/agent/reference.rb +9 -0
- data/lib/kudzu/agent/response.rb +14 -0
- data/lib/kudzu/agent/robots/parser.rb +91 -0
- data/lib/kudzu/agent/robots/txt.rb +34 -0
- data/lib/kudzu/agent/robots.rb +12 -123
- data/lib/kudzu/agent/sleeper.rb +2 -2
- data/lib/kudzu/agent/url_extractor.rb +60 -46
- data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
- data/lib/kudzu/agent/util/charset_detector.rb +84 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
- data/lib/kudzu/agent/util/matcher.rb +25 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
- data/lib/kudzu/agent/util/title_parser.rb +30 -0
- data/lib/kudzu/agent.rb +42 -0
- data/lib/kudzu/callback.rb +4 -2
- data/lib/kudzu/config/filter.rb +11 -11
- data/lib/kudzu/config.rb +20 -25
- data/lib/kudzu/crawler.rb +65 -146
- data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
- data/lib/kudzu/model/base.rb +9 -0
- data/lib/kudzu/model/link.rb +9 -0
- data/lib/kudzu/model/page.rb +112 -0
- data/lib/kudzu/thread_pool.rb +36 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +21 -3
- metadata +21 -19
- data/lib/kudzu/adapter/base/link.rb +0 -8
- data/lib/kudzu/adapter/base/page.rb +0 -106
- data/lib/kudzu/adapter/memory/all.rb +0 -3
- data/lib/kudzu/agent/charset_detector.rb +0 -84
- data/lib/kudzu/agent/filter.rb +0 -40
- data/lib/kudzu/agent/mime_type_detector.rb +0 -34
- data/lib/kudzu/agent/title_parser.rb +0 -16
- data/lib/kudzu/logger.rb +0 -20
- data/lib/kudzu/revisit/all.rb +0 -3
- data/lib/kudzu/revisit/scheduler.rb +0 -28
- data/lib/kudzu/util/all.rb +0 -3
- data/lib/kudzu/util/connection_pool.rb +0 -56
- data/lib/kudzu/util/content_type_parser.rb +0 -24
- data/lib/kudzu/util/matcher.rb +0 -21
- data/lib/kudzu/util/thread_pool.rb +0 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ba76e95628d94560421358aa7982bdc429971e4
|
4
|
+
data.tar.gz: e1875f5760573a021fcf129018aaba5f6213ad23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f17f799c2ad67722860bbad00e9e220db8265221d598aebbbd181fea41546454e954fc67db18abba2b8b68797fe44da3887ec4ea3a5d2486fd1afd61584c152
|
7
|
+
data.tar.gz: 65d3bf42fafbcf835740ebe5534a52163f09ef313795152fafadc87aeaa335d540b037180a4f03208471361a9284635ff15a85471f3def22ba43e341ee1eb724
|
@@ -1,14 +1,10 @@
|
|
1
1
|
module Kudzu
|
2
2
|
module Adapter
|
3
3
|
module Memory
|
4
|
-
class Link
|
5
|
-
include Kudzu::
|
4
|
+
class Link < Kudzu::Model::Base
|
5
|
+
include Kudzu::Model::Link
|
6
6
|
|
7
7
|
attr_accessor :uuid, :url, :title, :state, :depth
|
8
|
-
|
9
|
-
def initialize(attr = {})
|
10
|
-
attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
|
11
|
-
end
|
12
8
|
end
|
13
9
|
end
|
14
10
|
end
|
@@ -1,16 +1,11 @@
|
|
1
1
|
module Kudzu
|
2
2
|
module Adapter
|
3
3
|
module Memory
|
4
|
-
class Page
|
5
|
-
include Kudzu::
|
4
|
+
class Page < Kudzu::Model::Base
|
5
|
+
include Kudzu::Model::Page
|
6
6
|
|
7
7
|
attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest,
|
8
|
-
:response_header, :response_time, :redirect_from, :fetched_at, :revised_at
|
9
|
-
:revisit_interval, :revisit_at
|
10
|
-
|
11
|
-
def initialize(attr = {})
|
12
|
-
attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
|
13
|
-
end
|
8
|
+
:response_header, :response_time, :redirect_from, :fetched_at, :revised_at
|
14
9
|
end
|
15
10
|
end
|
16
11
|
end
|
data/lib/kudzu/adapter/memory.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
|
-
require_relative
|
3
|
-
|
4
|
-
Kudzu.adapter = Kudzu::Adapter::Memory
|
1
|
+
Dir[File.join(__dir__, 'memory/**/*.rb')].each do |file|
|
2
|
+
require_relative file
|
3
|
+
end
|
data/lib/kudzu/agent/all.rb
CHANGED
data/lib/kudzu/agent/fetcher.rb
CHANGED
@@ -1,67 +1,65 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
require 'http-cookie'
|
3
|
-
|
4
1
|
module Kudzu
|
5
2
|
class Agent
|
6
3
|
class Fetcher
|
7
|
-
class Response
|
8
|
-
attr_accessor :url, :status, :header, :body, :time, :redirected
|
9
|
-
|
10
|
-
def initialize(attr = {})
|
11
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
12
|
-
end
|
13
|
-
|
14
|
-
def redirected?
|
15
|
-
redirected
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
4
|
attr_reader :pool
|
20
5
|
|
21
6
|
def initialize(config, robots = nil)
|
22
7
|
@config = config
|
23
|
-
@pool =
|
24
|
-
@sleeper =
|
8
|
+
@pool = Http::ConnectionPool.new(@config.max_connection || 100)
|
9
|
+
@sleeper = Sleeper.new(@config, robots)
|
10
|
+
@filterer = PageFilterer.new(@config)
|
25
11
|
@jar = HTTP::CookieJar.new
|
26
12
|
end
|
27
13
|
|
28
|
-
def fetch(url, request_header: {}, redirect: max_redirect,
|
14
|
+
def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
|
29
15
|
uri = Addressable::URI.parse(url)
|
30
|
-
http = @pool.checkout(pool_name(uri)) { build_http(uri) }
|
31
16
|
request = build_request(uri, request_header: request_header, method: method)
|
32
|
-
|
33
|
-
append_cookie(url, request) if @config.handle_cookie
|
34
|
-
|
35
|
-
@sleeper.politeness_delay(url)
|
36
|
-
|
37
|
-
response = nil
|
38
|
-
response_time = Benchmark.realtime { response = http.request(request) }
|
39
|
-
|
40
|
-
parse_cookie(url, response) if @config.handle_cookie
|
17
|
+
response, response_time = send_request(uri, request)
|
41
18
|
|
42
19
|
if redirection?(response.code) && response['location'] && redirect > 0
|
43
|
-
fetch(uri.join(response['location']).to_s, request_header: request_header,
|
20
|
+
fetch(uri.join(response['location']).to_s, request_header: request_header,
|
21
|
+
redirect: redirect - 1,
|
22
|
+
redirect_from: redirect_from || url)
|
44
23
|
else
|
45
|
-
|
46
|
-
res.redirected = (redirect != max_redirect)
|
47
|
-
res
|
24
|
+
build_response(url, response, response_time, redirect_from)
|
48
25
|
end
|
49
26
|
end
|
50
27
|
|
51
28
|
private
|
52
29
|
|
53
|
-
def max_redirect
|
54
|
-
@config.max_redirect || 5
|
55
|
-
end
|
56
|
-
|
57
30
|
def pool_name(uri)
|
58
31
|
"#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
|
59
32
|
end
|
60
33
|
|
34
|
+
def send_request(uri, request)
|
35
|
+
start_http(uri, request) do |http|
|
36
|
+
http.request(request) do |response|
|
37
|
+
unless @filterer.allowed_response_header?(uri.to_s, response)
|
38
|
+
http.finish
|
39
|
+
break response
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def start_http(uri, request)
|
46
|
+
http = @pool.checkout(pool_name(uri)) { build_http(uri) }
|
47
|
+
append_cookie(uri, request) if @config.handle_cookie
|
48
|
+
@sleeper.politeness_delay(uri)
|
49
|
+
|
50
|
+
start = Time.now.to_f
|
51
|
+
response = yield http
|
52
|
+
response_time = Time.now.to_f - start
|
53
|
+
|
54
|
+
parse_cookie(uri, response) if @config.handle_cookie
|
55
|
+
return response, response_time
|
56
|
+
end
|
57
|
+
|
61
58
|
def build_http(uri)
|
62
59
|
http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
|
63
60
|
http.open_timeout = @config.open_timeout if @config.open_timeout
|
64
61
|
http.read_timeout = @config.read_timeout if @config.read_timeout
|
62
|
+
http.keep_alive_timeout = @config.keep_alive if @config.keep_alive
|
65
63
|
if uri.scheme == 'https'
|
66
64
|
http.use_ssl = true
|
67
65
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -70,7 +68,7 @@ module Kudzu
|
|
70
68
|
end
|
71
69
|
|
72
70
|
def build_request(uri, request_header:, method:)
|
73
|
-
request =
|
71
|
+
request = Object.const_get("Net::HTTP::#{method.capitalize}").new(uri.request_uri)
|
74
72
|
request.basic_auth uri.user, uri.password if uri.user && uri.password
|
75
73
|
|
76
74
|
request['User-Agent'] = @config.user_agent
|
@@ -80,16 +78,15 @@ module Kudzu
|
|
80
78
|
request
|
81
79
|
end
|
82
80
|
|
83
|
-
def
|
84
|
-
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_response(url, response, response_time)
|
81
|
+
def build_response(url, response, response_time, redirect_from)
|
82
|
+
fetched = response.instance_variable_get("@read")
|
88
83
|
Response.new(url: url,
|
89
84
|
status: response.code.to_i,
|
90
|
-
|
91
|
-
|
92
|
-
|
85
|
+
body: fetched ? response.body.to_s : nil,
|
86
|
+
response_header: Hash[response.each.to_a],
|
87
|
+
response_time: response_time,
|
88
|
+
redirect_from: redirect_from,
|
89
|
+
fetched: fetched)
|
93
90
|
end
|
94
91
|
|
95
92
|
def redirection?(code)
|
@@ -97,12 +94,12 @@ module Kudzu
|
|
97
94
|
300 <= code && code <= 399
|
98
95
|
end
|
99
96
|
|
100
|
-
def parse_cookie(
|
101
|
-
@jar.parse(response['set-cookie'],
|
97
|
+
def parse_cookie(uri, response)
|
98
|
+
@jar.parse(response['set-cookie'], uri.to_s) if response['set-cookie']
|
102
99
|
end
|
103
100
|
|
104
|
-
def append_cookie(
|
105
|
-
cookies = @jar.cookies(
|
101
|
+
def append_cookie(uri, request)
|
102
|
+
cookies = @jar.cookies(uri.to_s)
|
106
103
|
unless cookies.empty?
|
107
104
|
if request['Cookie']
|
108
105
|
request['Cookie'] += '; ' + cookies.join('; ')
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Kudzu
|
2
|
+
class Agent
|
3
|
+
class Http
|
4
|
+
class ConnectionPool
|
5
|
+
def initialize(max_size = 10)
|
6
|
+
@max_size = max_size
|
7
|
+
end
|
8
|
+
|
9
|
+
def checkout(name)
|
10
|
+
pool[name] ||= Connection.new(name: name, http: yield)
|
11
|
+
|
12
|
+
conn = pool[name]
|
13
|
+
conn.last_used = Time.now
|
14
|
+
|
15
|
+
if pool.size > @max_size
|
16
|
+
reduce
|
17
|
+
end
|
18
|
+
|
19
|
+
conn.http
|
20
|
+
end
|
21
|
+
|
22
|
+
def close
|
23
|
+
pool.values.each do |conn|
|
24
|
+
finish_http(conn.http)
|
25
|
+
end
|
26
|
+
Thread.current[:kudzu_connection] = nil
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def pool
|
32
|
+
Thread.current[:kudzu_connection] ||= {}
|
33
|
+
Thread.current[:kudzu_connection]
|
34
|
+
end
|
35
|
+
|
36
|
+
def reduce
|
37
|
+
conns = pool.values.sort_by { |conn| conn.last_used }
|
38
|
+
conns.first(pool.size - @max_size).each do |conn|
|
39
|
+
finish_http(conn.http)
|
40
|
+
pool.delete(conn.name)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def finish_http(http)
|
45
|
+
http.finish if http && http.started?
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Kudzu
|
2
|
+
class Agent
|
3
|
+
class PageFilterer
|
4
|
+
def initialize(config)
|
5
|
+
@config = config
|
6
|
+
end
|
7
|
+
|
8
|
+
def allowed?(response)
|
9
|
+
filter = @config.find_filter(response.url)
|
10
|
+
|
11
|
+
if filter.nil? || (allowed_mime_type?(response.mime_type, filter) &&
|
12
|
+
allowed_size?(response.size, filter) &&
|
13
|
+
allowed_index?(response))
|
14
|
+
Kudzu.log :info, "passed page: #{response.url}"
|
15
|
+
true
|
16
|
+
else
|
17
|
+
Kudzu.log :info, "dropped page: #{response.url}"
|
18
|
+
false
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def allowed_response_header?(url, response_header)
|
23
|
+
filter = @config.find_filter(url)
|
24
|
+
|
25
|
+
if response_header['content-type']
|
26
|
+
mime_type = Util::ContentTypeParser.parse(response_header['content-type']).first
|
27
|
+
end
|
28
|
+
if response_header['content-length']
|
29
|
+
size = response_header['content-length'].to_i
|
30
|
+
end
|
31
|
+
|
32
|
+
filter.nil? || (allowed_mime_type?(mime_type, filter) &&
|
33
|
+
allowed_size?(size, filter))
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def allowed_mime_type?(mime_type, filter)
|
39
|
+
return true if mime_type.nil?
|
40
|
+
Util::Matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
|
41
|
+
end
|
42
|
+
|
43
|
+
def allowed_size?(size, filter)
|
44
|
+
return true if filter.max_size.nil? || size.nil?
|
45
|
+
size.to_i < filter.max_size.to_i
|
46
|
+
end
|
47
|
+
|
48
|
+
def allowed_index?(response)
|
49
|
+
return true if response.body.nil? || !response.html?
|
50
|
+
return true unless @config.respect_noindex
|
51
|
+
|
52
|
+
doc = response.parsed_doc
|
53
|
+
doc.xpath('html/head/meta[@name]')
|
54
|
+
.all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Kudzu
|
2
|
+
class Agent
|
3
|
+
class Response < Kudzu::Model::Base
|
4
|
+
include Kudzu::Model::Page
|
5
|
+
|
6
|
+
attr_accessor :url, :status, :body, :response_header, :response_time, :redirect_from, :fetched,
|
7
|
+
:size, :digest, :mime_type, :charset, :title
|
8
|
+
|
9
|
+
def fetched?
|
10
|
+
fetched
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Kudzu
|
2
|
+
class Agent
|
3
|
+
class Robots
|
4
|
+
class Parser
|
5
|
+
UNMATCH_REGEXP = /^$/
|
6
|
+
|
7
|
+
class << self
|
8
|
+
def parse(body)
|
9
|
+
txt = Txt.new
|
10
|
+
sets = []
|
11
|
+
prev_key = nil
|
12
|
+
|
13
|
+
parse_body(body).each do |key, value|
|
14
|
+
case key
|
15
|
+
when 'user-agent'
|
16
|
+
new_set = RuleSet.new(user_agent: ua_regexp(value))
|
17
|
+
txt.sets << new_set
|
18
|
+
if prev_key == 'user-agent'
|
19
|
+
sets << new_set
|
20
|
+
else
|
21
|
+
sets = [new_set]
|
22
|
+
end
|
23
|
+
when 'allow'
|
24
|
+
re = path_regexp(value)
|
25
|
+
sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
|
26
|
+
when 'disallow'
|
27
|
+
re = path_regexp(value)
|
28
|
+
sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
|
29
|
+
when 'crawl-delay'
|
30
|
+
sets.each { |set| set.crawl_delay = value.to_i }
|
31
|
+
when 'sitemap'
|
32
|
+
txt.sitemaps << value
|
33
|
+
end
|
34
|
+
|
35
|
+
prev_key = key
|
36
|
+
end
|
37
|
+
|
38
|
+
sort(txt)
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def parse_body(body)
|
44
|
+
lines = body.to_s.split(/\r|\n|\r\n/)
|
45
|
+
lines.map { |line| parse_line(line) }.compact
|
46
|
+
end
|
47
|
+
|
48
|
+
def parse_line(line)
|
49
|
+
line.strip!
|
50
|
+
if line.empty? || line.start_with?('#')
|
51
|
+
nil
|
52
|
+
else
|
53
|
+
split_line(line)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def split_line(line)
|
58
|
+
key, value = line.split(':', 2)
|
59
|
+
key = key.to_s.strip.downcase
|
60
|
+
value = value.to_s.sub(/#.*$/, '').strip
|
61
|
+
if key.empty? || value.empty?
|
62
|
+
nil
|
63
|
+
else
|
64
|
+
[key, value]
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def ua_regexp(value)
|
69
|
+
Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
|
70
|
+
rescue RegexpError
|
71
|
+
UNMATCH_REGEXP
|
72
|
+
end
|
73
|
+
|
74
|
+
def path_regexp(value)
|
75
|
+
Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
|
76
|
+
rescue RegexpError
|
77
|
+
UNMATCH_REGEXP
|
78
|
+
end
|
79
|
+
|
80
|
+
def sort(txt)
|
81
|
+
txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
|
82
|
+
txt.sets.each do |set|
|
83
|
+
set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
|
84
|
+
end
|
85
|
+
txt
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Kudzu
|
2
|
+
class Agent
|
3
|
+
class Robots
|
4
|
+
class Txt < Kudzu::Model::Base
|
5
|
+
attr_accessor :sets, :sitemaps
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
self.sets = []
|
9
|
+
self.sitemaps = []
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class RuleSet < Kudzu::Model::Base
|
14
|
+
attr_accessor :user_agent, :rules, :crawl_delay
|
15
|
+
|
16
|
+
def initialize(attr = {})
|
17
|
+
self.rules = []
|
18
|
+
super
|
19
|
+
end
|
20
|
+
|
21
|
+
def allowed_path?(uri)
|
22
|
+
rules.each do |rule|
|
23
|
+
return rule.allow if uri.path =~ rule.path
|
24
|
+
end
|
25
|
+
return true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
class Rule < Kudzu::Model::Base
|
30
|
+
attr_accessor :path, :allow
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/kudzu/agent/robots.rb
CHANGED
@@ -3,7 +3,6 @@ module Kudzu
|
|
3
3
|
class Robots
|
4
4
|
def initialize(config)
|
5
5
|
@user_agent = config.user_agent
|
6
|
-
@page_fetcher = Kudzu::Agent::Fetcher.new(config)
|
7
6
|
@monitor = Monitor.new
|
8
7
|
@txt = {}
|
9
8
|
end
|
@@ -49,11 +48,11 @@ module Kudzu
|
|
49
48
|
|
50
49
|
def fetch_and_parse(uri)
|
51
50
|
response = fetch(uri)
|
52
|
-
if response && response.
|
51
|
+
if response && response.code.to_i == 200
|
53
52
|
body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
|
54
|
-
Parser.
|
53
|
+
Parser.parse(body)
|
55
54
|
else
|
56
|
-
Parser.
|
55
|
+
Parser.parse('')
|
57
56
|
end
|
58
57
|
end
|
59
58
|
|
@@ -62,127 +61,17 @@ module Kudzu
|
|
62
61
|
uri.path = 'robots.txt'
|
63
62
|
uri.fragment = uri.query = nil
|
64
63
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
class Txt
|
73
|
-
attr_accessor :sets, :sitemaps
|
74
|
-
|
75
|
-
def initialize
|
76
|
-
self.sets = []
|
77
|
-
self.sitemaps = []
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
class RuleSet
|
82
|
-
attr_accessor :user_agent, :rules, :crawl_delay
|
83
|
-
|
84
|
-
def initialize(attr = {})
|
85
|
-
self.rules = []
|
86
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
87
|
-
end
|
88
|
-
|
89
|
-
def allowed_path?(uri)
|
90
|
-
rules.each do |rule|
|
91
|
-
return rule.allow if uri.path =~ rule.path
|
92
|
-
end
|
93
|
-
return true
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
class Rule
|
98
|
-
attr_accessor :path, :allow
|
99
|
-
|
100
|
-
def initialize(attr = {})
|
101
|
-
attr.each { |k, v| public_send("#{k}=", v) }
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
class Parser
|
106
|
-
UNMATCH_REGEXP = /^$/
|
107
|
-
|
108
|
-
def parse(body)
|
109
|
-
txt = Txt.new
|
110
|
-
sets = []
|
111
|
-
prev_key = nil
|
112
|
-
|
113
|
-
parse_body(body).each do |key, value|
|
114
|
-
case key
|
115
|
-
when 'user-agent'
|
116
|
-
new_set = RuleSet.new(user_agent: ua_regexp(value))
|
117
|
-
txt.sets << new_set
|
118
|
-
if prev_key == 'user-agent'
|
119
|
-
sets << new_set
|
120
|
-
else
|
121
|
-
sets = [new_set]
|
122
|
-
end
|
123
|
-
when 'allow'
|
124
|
-
re = path_regexp(value)
|
125
|
-
sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
|
126
|
-
when 'disallow'
|
127
|
-
re = path_regexp(value)
|
128
|
-
sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
|
129
|
-
when 'crawl-delay'
|
130
|
-
sets.each { |set| set.crawl_delay = value.to_i }
|
131
|
-
when 'sitemap'
|
132
|
-
txt.sitemaps << value
|
133
|
-
end
|
134
|
-
|
135
|
-
prev_key = key
|
136
|
-
end
|
137
|
-
|
138
|
-
sort(txt)
|
139
|
-
end
|
140
|
-
|
141
|
-
private
|
142
|
-
|
143
|
-
def parse_body(body)
|
144
|
-
lines = body.to_s.split(/\r|\n|\r\n/)
|
145
|
-
lines.map { |line| parse_line(line) }.compact
|
146
|
-
end
|
147
|
-
|
148
|
-
def parse_line(line)
|
149
|
-
line.strip!
|
150
|
-
if line.empty? || line.start_with?('#')
|
151
|
-
nil
|
152
|
-
else
|
153
|
-
split_line(line)
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def split_line(line)
|
158
|
-
key, value = line.split(':', 2)
|
159
|
-
key = key.to_s.strip.downcase
|
160
|
-
value = value.to_s.sub(/#.*$/, '').strip
|
161
|
-
if key.empty? || value.empty?
|
162
|
-
nil
|
163
|
-
else
|
164
|
-
[key, value]
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
def ua_regexp(value)
|
169
|
-
Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
|
170
|
-
rescue RegexpError
|
171
|
-
UNMATCH_REGEXP
|
64
|
+
http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
|
65
|
+
if uri.scheme == 'https'
|
66
|
+
http.use_ssl = true
|
67
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
172
68
|
end
|
173
69
|
|
174
|
-
|
175
|
-
|
176
|
-
rescue
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
def sort(txt)
|
181
|
-
txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
|
182
|
-
txt.sets.each do |set|
|
183
|
-
set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
|
184
|
-
end
|
185
|
-
txt
|
70
|
+
begin
|
71
|
+
http.get(uri.to_s)
|
72
|
+
rescue => e
|
73
|
+
Kudzu.log :error, "failed to fetch robots.txt: #{uri}", error: e
|
74
|
+
nil
|
186
75
|
end
|
187
76
|
end
|
188
77
|
end
|
data/lib/kudzu/agent/sleeper.rb
CHANGED