kudzu 1.0.0
- checksums.yaml +7 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/lib/kudzu.rb +8 -0
- data/lib/kudzu/adapter/base/all.rb +3 -0
- data/lib/kudzu/adapter/base/link.rb +8 -0
- data/lib/kudzu/adapter/base/page.rb +106 -0
- data/lib/kudzu/adapter/memory.rb +4 -0
- data/lib/kudzu/adapter/memory/all.rb +3 -0
- data/lib/kudzu/adapter/memory/frontier.rb +38 -0
- data/lib/kudzu/adapter/memory/model/link.rb +15 -0
- data/lib/kudzu/adapter/memory/model/page.rb +17 -0
- data/lib/kudzu/adapter/memory/repository.rb +27 -0
- data/lib/kudzu/agent/all.rb +3 -0
- data/lib/kudzu/agent/charset_detector.rb +84 -0
- data/lib/kudzu/agent/fetcher.rb +116 -0
- data/lib/kudzu/agent/filter.rb +40 -0
- data/lib/kudzu/agent/mime_type_detector.rb +34 -0
- data/lib/kudzu/agent/robots.rb +190 -0
- data/lib/kudzu/agent/sleeper.rb +44 -0
- data/lib/kudzu/agent/title_parser.rb +16 -0
- data/lib/kudzu/agent/url_extractor.rb +123 -0
- data/lib/kudzu/agent/url_filter.rb +65 -0
- data/lib/kudzu/callback.rb +41 -0
- data/lib/kudzu/common.rb +23 -0
- data/lib/kudzu/config.rb +79 -0
- data/lib/kudzu/config/filter.rb +39 -0
- data/lib/kudzu/crawler.rb +258 -0
- data/lib/kudzu/logger.rb +20 -0
- data/lib/kudzu/revisit/all.rb +3 -0
- data/lib/kudzu/revisit/scheduler.rb +28 -0
- data/lib/kudzu/util/all.rb +3 -0
- data/lib/kudzu/util/connection_pool.rb +56 -0
- data/lib/kudzu/util/content_type_parser.rb +24 -0
- data/lib/kudzu/util/matcher.rb +21 -0
- data/lib/kudzu/util/thread_pool.rb +38 -0
- data/lib/kudzu/version.rb +3 -0
- metadata +234 -0
data/lib/kudzu/agent/filter.rb
@@ -0,0 +1,40 @@
+require 'nokogiri'
+
+module Kudzu
+  class Agent
+    class Filter
+      def initialize(config)
+        @config = config
+        @matcher = Kudzu::Util::Matcher.new
+      end
+
+      def allowed?(page)
+        filter = @config.find_filter(page.url)
+        return true unless filter
+
+        allowed_mime_type?(page.mime_type, filter) && allowed_size?(page.size, filter) && allowed_index?(page)
+      end
+
+      private
+
+      def allowed_mime_type?(mime_type, filter)
+        return true if mime_type.nil?
+        @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
+      end
+
+      def allowed_size?(size, filter)
+        return true if filter.max_size.nil? || size.nil?
+        size.to_i < filter.max_size.to_i
+      end
+
+      def allowed_index?(page)
+        return true unless page.html?
+        return true unless @config.respect_noindex
+
+        doc = Nokogiri::HTML(page.body.encode('ascii', undef: :replace, invalid: :replace))
+        doc.xpath('html/head/meta[@name]')
+           .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
+      end
+    end
+  end
+end
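For orientation, a minimal usage sketch (an editor's illustration, not code from the gem). FakeFilter, FakeConfig and FakePage are hypothetical Struct stand-ins exposing only the attributes Filter reads above; the real objects come from kudzu's config and adapter classes.

require 'kudzu'

# Hypothetical stand-ins (not part of the gem).
FakeFilter = Struct.new(:allow_mime_type, :deny_mime_type, :max_size)
FakeConfig = Struct.new(:filter, :respect_noindex) do
  def find_filter(_url); filter; end
end
FakePage = Struct.new(:url, :mime_type, :size, :body) do
  def html?; false; end # skip the noindex check in this sketch
end

config = FakeConfig.new(FakeFilter.new(nil, nil, 1_000_000), true)
filter = Kudzu::Agent::Filter.new(config)

small = FakePage.new('http://example.com/a', nil, 500, '')
large = FakePage.new('http://example.com/b', nil, 2_000_000, '')

filter.allowed?(small) # => true  (nil mime type passes; size under max_size)
filter.allowed?(large) # => false (size over max_size)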
data/lib/kudzu/agent/mime_type_detector.rb
@@ -0,0 +1,34 @@
+require 'shared-mime-info'
+
+module Kudzu
+  class Agent
+    class MimeTypeDetector
+      DEFAULT_MIME_TYPE = 'application/octet-stream'
+
+      def initialize
+        @content_type_parser = Kudzu::Util::ContentTypeParser.new
+      end
+
+      def detect(page)
+        from_header(page.response_header) || from_body(page.body) || from_url(page.url) || DEFAULT_MIME_TYPE
+      end
+
+      private
+
+      def from_header(header)
+        @content_type_parser.parse(header['content-type']).first
+      end
+
+      def from_body(body)
+        mime = MIME.check_magics(StringIO.new(body))
+        mime.to_s if mime
+      end
+
+      def from_url(url)
+        uri = Addressable::URI.parse(url)
+        mime = MIME.check_globs(uri.basename)
+        mime.to_s if mime
+      end
+    end
+  end
+end
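A sketch of the fallback chain (Content-Type header, then magic bytes, then extension glob, then the default). It assumes kudzu's ContentTypeParser tolerates a missing Content-Type header; FakePage is a hypothetical stand-in.

require 'kudzu'

FakePage = Struct.new(:response_header, :body, :url)

detector = Kudzu::Agent::MimeTypeDetector.new

# 1. An explicit Content-Type header wins.
page = FakePage.new({ 'content-type' => 'text/html; charset=utf-8' }, '', 'http://example.com/')
detector.detect(page) # => "text/html"

# 2. With no usable header or magic bytes, the URL's extension is
#    checked via MIME.check_globs; 'application/octet-stream' is last.
page = FakePage.new({}, '', 'http://example.com/logo.png')
detector.detect(page) # => "image/png"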
data/lib/kudzu/agent/robots.rb
@@ -0,0 +1,190 @@
+module Kudzu
+  class Agent
+    class Robots
+      def initialize(config)
+        @user_agent = config.user_agent
+        @page_fetcher = Kudzu::Agent::Fetcher.new(config)
+        @monitor = Monitor.new
+        @txt = {}
+      end
+
+      def allowed?(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        set = find_set(uri)
+        return true unless set
+        set.allowed_path?(uri)
+      end
+
+      def crawl_delay(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        set = find_set(uri)
+        return nil unless set
+        set.crawl_delay
+      end
+
+      def sitemaps(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        txt = find_txt(uri)
+        return [] unless txt
+        txt.sitemaps
+      end
+
+      private
+
+      def find_txt(uri)
+        @monitor.synchronize do
+          @txt[uri.host] ||= fetch_and_parse(uri)
+        end
+      end
+
+      def find_set(uri)
+        txt = find_txt(uri)
+        return unless txt
+
+        txt.sets.each do |set|
+          return set if @user_agent =~ set.user_agent
+        end
+        nil
+      end
+
+      def fetch_and_parse(uri)
+        response = fetch(uri)
+        if response && response.status == 200
+          body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
+          Parser.new.parse(body)
+        else
+          Parser.new.parse('')
+        end
+      end
+
+      def fetch(base_uri)
+        uri = base_uri.dup
+        uri.path = '/robots.txt'
+        uri.fragment = uri.query = nil
+
+        begin
+          @page_fetcher.fetch(uri.to_s)
+        rescue
+          nil
+        end
+      end
+
+      class Txt
+        attr_accessor :sets, :sitemaps
+
+        def initialize
+          self.sets = []
+          self.sitemaps = []
+        end
+      end
+
+      class RuleSet
+        attr_accessor :user_agent, :rules, :crawl_delay
+
+        def initialize(attr = {})
+          self.rules = []
+          attr.each { |k, v| public_send("#{k}=", v) }
+        end
+
+        def allowed_path?(uri)
+          rules.each do |rule|
+            return rule.allow if uri.path =~ rule.path
+          end
+          true
+        end
+      end
+
+      class Rule
+        attr_accessor :path, :allow
+
+        def initialize(attr = {})
+          attr.each { |k, v| public_send("#{k}=", v) }
+        end
+      end
+
+      class Parser
+        UNMATCH_REGEXP = /^$/
+
+        def parse(body)
+          txt = Txt.new
+          sets = []
+          prev_key = nil
+
+          parse_body(body).each do |key, value|
+            case key
+            when 'user-agent'
+              new_set = RuleSet.new(user_agent: ua_regexp(value))
+              txt.sets << new_set
+              if prev_key == 'user-agent'
+                sets << new_set
+              else
+                sets = [new_set]
+              end
+            when 'allow'
+              re = path_regexp(value)
+              sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
+            when 'disallow'
+              re = path_regexp(value)
+              sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
+            when 'crawl-delay'
+              sets.each { |set| set.crawl_delay = value.to_i }
+            when 'sitemap'
+              txt.sitemaps << value
+            end
+
+            prev_key = key
+          end
+
+          sort(txt)
+        end
+
+        private
+
+        def parse_body(body)
+          lines = body.to_s.split(/\r\n|\r|\n/)
+          lines.map { |line| parse_line(line) }.compact
+        end
+
+        def parse_line(line)
+          line = line.strip
+          if line.empty? || line.start_with?('#')
+            nil
+          else
+            split_line(line)
+          end
+        end
+
+        def split_line(line)
+          key, value = line.split(':', 2)
+          key = key.to_s.strip.downcase
+          value = value.to_s.sub(/#.*$/, '').strip
+          if key.empty? || value.empty?
+            nil
+          else
+            [key, value]
+          end
+        end
+
+        def ua_regexp(value)
+          Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
+        rescue RegexpError
+          UNMATCH_REGEXP
+        end
+
+        def path_regexp(value)
+          Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
+        rescue RegexpError
+          UNMATCH_REGEXP
+        end
+
+        def sort(txt)
+          txt.sets.sort_by! { |set| [-set.user_agent.to_s.count('*'), set.user_agent.to_s.length] }.reverse!
+          txt.sets.each do |set|
+            set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
+          end
+          txt
+        end
+      end
+    end
+  end
+end
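The Parser above is self-contained, so a short sketch can exercise rule grouping, wildcard translation, and the longest-path-first sorting directly (an editor's illustration; the robots.txt content is made up):

require 'kudzu'
require 'addressable/uri'

body = <<~ROBOTS
  User-agent: *
  Disallow: /private/
  Allow: /private/public/
  Crawl-delay: 2
  Sitemap: http://example.com/sitemap.xml
ROBOTS

txt = Kudzu::Agent::Robots::Parser.new.parse(body)
set = txt.sets.first

set.crawl_delay # => 2
txt.sitemaps    # => ["http://example.com/sitemap.xml"]

# Rules are sorted longest path first, so the more specific Allow
# is consulted before the broader Disallow.
set.allowed_path?(Addressable::URI.parse('http://example.com/private/a'))        # => false
set.allowed_path?(Addressable::URI.parse('http://example.com/private/public/a')) # => true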
data/lib/kudzu/agent/sleeper.rb
@@ -0,0 +1,44 @@
+module Kudzu
+  class Agent
+    class Sleeper
+      def initialize(config, robots = nil)
+        @config = config
+        @robots = robots
+        @monitor = Monitor.new
+        @last_accessed = {}
+      end
+
+      def politeness_delay(url)
+        uri = Addressable::URI.parse(url)
+        delay_sec = delay_second(uri)
+        return unless delay_sec
+
+        sleep_sec = sleep_second(uri, delay_sec)
+        sleep sleep_sec if sleep_sec > 0
+      end
+
+      private
+
+      def delay_second(uri)
+        if @config.respect_robots_txt && @robots && (crawl_delay = @robots.crawl_delay(uri))
+          crawl_delay.to_f
+        elsif @config.politeness_delay
+          @config.politeness_delay.to_f
+        end
+      end
+
+      def sleep_second(uri, delay_sec)
+        @monitor.synchronize do
+          now = Time.now.to_f
+          value = if @last_accessed[uri.host]
+                    (@last_accessed[uri.host] + delay_sec) - now
+                  else
+                    0
+                  end
+          @last_accessed[uri.host] = now + value
+          value
+        end
+      end
+    end
+  end
+end
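A sketch of the per-host throttling this implements: the first request to a host goes through immediately, a second request within the delay window sleeps for the remainder, and other hosts are tracked independently. FakeConfig is a hypothetical stand-in exposing only the two settings Sleeper reads.

require 'kudzu'

FakeConfig = Struct.new(:respect_robots_txt, :politeness_delay)

sleeper = Kudzu::Agent::Sleeper.new(FakeConfig.new(false, 1.0))

sleeper.politeness_delay('http://example.com/a') # first hit: no sleep
sleeper.politeness_delay('http://example.com/b') # same host again: sleeps ~1s
sleeper.politeness_delay('http://other.org/')    # different host: no sleep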
data/lib/kudzu/agent/url_extractor.rb
@@ -0,0 +1,123 @@
+require 'nokogiri'
+
+module Kudzu
+  class Agent
+    class UrlExtractor
+      def initialize(config)
+        @config = config
+      end
+
+      def extract(page, base_url)
+        hrefs = if page.html?
+                  FromHTML.new(@config).extract(page)
+                elsif page.xml?
+                  FromXML.new(@config).extract(page)
+                else
+                  []
+                end
+
+        hrefs.select do |href|
+          href[:url] = normalize(href[:url], base_url)
+        end
+      end
+
+      private
+
+      def normalize(url, base_url)
+        uri = Addressable::URI.parse(base_url.to_s).join(url.to_s).normalize
+        uri.path = '/' unless uri.path
+        uri.fragment = nil
+
+        if %w(http https).include?(uri.scheme)
+          uri.to_s
+        else
+          nil
+        end
+      end
+
+      class FromHTML < UrlExtractor
+        def initialize(config)
+          super
+          @content_type_parser = Kudzu::Util::ContentTypeParser.new
+        end
+
+        def extract(page)
+          doc = Nokogiri::HTML(page.decoded_body)
+          return [] if nofollow?(doc)
+
+          if (filter = @config.find_filter(page.url))
+            if filter.allow_element
+              doc = doc.search(*Array(filter.allow_element))
+            end
+            if filter.deny_element
+              doc.search(*Array(filter.deny_element)).remove
+            end
+          end
+
+          hrefs = from_html(doc) + from_html_in_meta(doc)
+          hrefs.reject { |href| href[:url].empty? }.uniq
+        end
+
+        private
+
+        def nofollow?(doc)
+          return false unless @config.respect_nofollow
+          nodes = doc.xpath('//meta[@name]')
+          nodes.any? { |node| node[:name] =~ /^robots$/i && node[:content] =~ /nofollow/i }
+        end
+
+        def from_html(doc)
+          nodes = doc.xpath('.//*[@href or @src]').to_a
+
+          if @config.respect_nofollow
+            nodes.reject! { |node| node[:rel] =~ /nofollow/i }
+          end
+
+          nodes.map { |node|
+            { url: (node[:href] || node[:src]).to_s.strip,
+              title: node_to_title(node) }
+          }
+        end
+
+        def node_to_title(node)
+          if node.inner_text.empty?
+            (node[:title] || node[:alt]).to_s
+          else
+            node.inner_text
+          end
+        end
+
+        def from_html_in_meta(doc)
+          nodes = doc.xpath('.//meta[@http-equiv]').select { |node| node[:'http-equiv'] =~ /^refresh$/i }
+          urls = nodes.map { |node| @content_type_parser.parse(node[:content]).last[:url] }.compact
+          urls.map { |url| { url: url.to_s.strip } }
+        end
+      end
+
+      class FromXML < UrlExtractor
+        def extract(page)
+          doc = Nokogiri::XML(page.decoded_body)
+          doc.remove_namespaces!
+          hrefs = from_xml_rss(doc) + from_xml_atom(doc)
+          hrefs.reject { |href| href[:url].empty? }.uniq
+        end
+
+        private
+
+        def from_xml_rss(doc)
+          doc.xpath('rss/channel').map { |node|
+            { url: node.xpath('./item/link').inner_text.strip,
+              title: node.xpath('./item/title').inner_text }
+          }
+        end
+
+        def from_xml_atom(doc)
+          doc.xpath('feed/entry').map { |node|
+            { url: node.xpath('./link[@href]/@href').to_s.strip,
+              title: node.xpath('./title').inner_text }
+          }
+        end
+      end
+    end
+  end
+end
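An end-to-end sketch of HTML extraction (an editor's illustration, assuming Nokogiri and Addressable are loaded via the gem). FakeConfig and FakePage are hypothetical stand-ins; no per-URL filter is configured. Note that normalize rewrites each url in place and returns nil for unsupported schemes, so the select in extract drops non-http(s) links.

require 'kudzu'

FakeConfig = Struct.new(:respect_nofollow) do
  def find_filter(_url); nil; end
end
FakePage = Struct.new(:url, :decoded_body) do
  def html?; true; end
  def xml?;  false; end
end

html = <<~HTML
  <html><body>
    <a href="/about">About</a>
    <a href="spam" rel="nofollow">Spam</a>
    <a href="mailto:x@example.com">Mail</a>
  </body></html>
HTML

page = FakePage.new('http://example.com/', html)
extractor = Kudzu::Agent::UrlExtractor.new(FakeConfig.new(true))

extractor.extract(page, page.url)
# => [{ url: "http://example.com/about", title: "About" }]
# The rel="nofollow" link is rejected in from_html, and the mailto:
# link is dropped by normalize.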