kudzu 1.0.0
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- checksums.yaml +7 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/lib/kudzu.rb +8 -0
- data/lib/kudzu/adapter/base/all.rb +3 -0
- data/lib/kudzu/adapter/base/link.rb +8 -0
- data/lib/kudzu/adapter/base/page.rb +106 -0
- data/lib/kudzu/adapter/memory.rb +4 -0
- data/lib/kudzu/adapter/memory/all.rb +3 -0
- data/lib/kudzu/adapter/memory/frontier.rb +38 -0
- data/lib/kudzu/adapter/memory/model/link.rb +15 -0
- data/lib/kudzu/adapter/memory/model/page.rb +17 -0
- data/lib/kudzu/adapter/memory/repository.rb +27 -0
- data/lib/kudzu/agent/all.rb +3 -0
- data/lib/kudzu/agent/charset_detector.rb +84 -0
- data/lib/kudzu/agent/fetcher.rb +116 -0
- data/lib/kudzu/agent/filter.rb +40 -0
- data/lib/kudzu/agent/mime_type_detector.rb +34 -0
- data/lib/kudzu/agent/robots.rb +190 -0
- data/lib/kudzu/agent/sleeper.rb +44 -0
- data/lib/kudzu/agent/title_parser.rb +16 -0
- data/lib/kudzu/agent/url_extractor.rb +123 -0
- data/lib/kudzu/agent/url_filter.rb +65 -0
- data/lib/kudzu/callback.rb +41 -0
- data/lib/kudzu/common.rb +23 -0
- data/lib/kudzu/config.rb +79 -0
- data/lib/kudzu/config/filter.rb +39 -0
- data/lib/kudzu/crawler.rb +258 -0
- data/lib/kudzu/logger.rb +20 -0
- data/lib/kudzu/revisit/all.rb +3 -0
- data/lib/kudzu/revisit/scheduler.rb +28 -0
- data/lib/kudzu/util/all.rb +3 -0
- data/lib/kudzu/util/connection_pool.rb +56 -0
- data/lib/kudzu/util/content_type_parser.rb +24 -0
- data/lib/kudzu/util/matcher.rb +21 -0
- data/lib/kudzu/util/thread_pool.rb +38 -0
- data/lib/kudzu/version.rb +3 -0
- metadata +234 -0
data/lib/kudzu/agent/filter.rb
@@ -0,0 +1,40 @@
+require 'nokogiri'
+
+module Kudzu
+  class Agent
+    class Filter
+      def initialize(config)
+        @config = config
+        @matcher = Kudzu::Util::Matcher.new
+      end
+
+      def allowed?(page)
+        filter = @config.find_filter(page.url)
+        return true unless filter
+
+        allowed_mime_type?(page.mime_type, filter) && allowed_size?(page.size, filter) && allowed_index?(page)
+      end
+
+      private
+
+      def allowed_mime_type?(mime_type, filter)
+        return true if mime_type.nil?
+        @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
+      end
+
+      def allowed_size?(size, filter)
+        return true if filter.max_size.nil? || size.nil?
+        size.to_i < filter.max_size.to_i
+      end
+
+      def allowed_index?(page)
+        return true unless page.html?
+        return true unless @config.respect_noindex
+
+        doc = Nokogiri::HTML(page.body.encode('ascii', undef: :replace, invalid: :replace))
+        doc.xpath('html/head/meta[@name]')
+           .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
+      end
+    end
+  end
+end
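The noindex check above is the only part of Filter that inspects page content. For reference, the same predicate can be exercised in isolation with Nokogiri; the HTML sample below is illustrative, not part of the gem:

    require 'nokogiri'

    html = <<~HTML
      <html><head><meta name="robots" content="noindex, follow"></head>
      <body>excluded from the index</body></html>
    HTML

    doc = Nokogiri::HTML(html)
    # Same rule as Filter#allowed_index?: every meta tag must either not be
    # a robots directive or not contain "noindex".
    doc.xpath('html/head/meta[@name]')
       .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
    # => false, so this page would be dropped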
data/lib/kudzu/agent/mime_type_detector.rb
@@ -0,0 +1,34 @@
+require 'shared-mime-info'
+
+module Kudzu
+  class Agent
+    class MimeTypeDetector
+      DEFAULT_MIME_TYPE = 'application/octet-stream'
+
+      def initialize
+        @content_type_parser = Kudzu::Util::ContentTypeParser.new
+      end
+
+      def detect(page)
+        from_header(page.response_header) || from_body(page.body) || from_url(page.url) || DEFAULT_MIME_TYPE
+      end
+
+      private
+
+      def from_header(header)
+        @content_type_parser.parse(header['content-type']).first
+      end
+
+      def from_body(body)
+        mime = MIME.check_magics(StringIO.new(body))
+        mime.to_s if mime
+      end
+
+      def from_url(url)
+        uri = Addressable::URI.parse(url)
+        mime = MIME.check_globs(uri.basename)
+        mime.to_s if mime
+      end
+    end
+  end
+end
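Detection falls back from the Content-Type header, to magic-byte sniffing, to the URL's file extension, and finally to application/octet-stream. A minimal sketch of the middle two stages using the same shared-mime-info calls (the sample values are made up, and the results assume the freedesktop.org MIME database is installed):

    require 'shared-mime-info'
    require 'stringio'
    require 'addressable/uri'

    # Glob (extension) lookup, as in MimeTypeDetector#from_url:
    uri = Addressable::URI.parse('http://example.com/assets/logo.png')
    mime = MIME.check_globs(uri.basename)
    mime.to_s if mime   # => "image/png"

    # Magic-byte lookup, as in MimeTypeDetector#from_body:
    mime = MIME.check_magics(StringIO.new("%PDF-1.4\n"))
    mime.to_s if mime   # => "application/pdf"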
data/lib/kudzu/agent/robots.rb
@@ -0,0 +1,190 @@
+module Kudzu
+  class Agent
+    class Robots
+      def initialize(config)
+        @user_agent = config.user_agent
+        @page_fetcher = Kudzu::Agent::Fetcher.new(config)
+        @monitor = Monitor.new
+        @txt = {}
+      end
+
+      def allowed?(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        set = find_set(uri)
+        return true unless set
+        set.allowed_path?(uri)
+      end
+
+      def crawl_delay(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        set = find_set(uri)
+        return nil unless set
+        set.crawl_delay
+      end
+
+      def sitemaps(uri)
+        uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+        txt = find_txt(uri)
+        return [] unless txt
+        txt.sitemaps
+      end
+
+      private
+
+      def find_txt(uri)
+        @monitor.synchronize do
+          @txt[uri.host] ||= fetch_and_parse(uri)
+        end
+      end
+
+      def find_set(uri)
+        txt = find_txt(uri)
+        return unless txt
+
+        txt.sets.each do |set|
+          return set if @user_agent =~ set.user_agent
+        end
+        return nil
+      end
+
+      def fetch_and_parse(uri)
+        response = fetch(uri)
+        if response && response.status == 200
+          body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
+          Parser.new.parse(body)
+        else
+          Parser.new.parse('')
+        end
+      end
+
+      def fetch(base_uri)
+        uri = base_uri.dup
+        uri.path = '/robots.txt'
+        uri.fragment = uri.query = nil
+
+        begin
+          @page_fetcher.fetch(uri.to_s)
+        rescue
+          nil
+        end
+      end
+
+      class Txt
+        attr_accessor :sets, :sitemaps
+
+        def initialize
+          self.sets = []
+          self.sitemaps = []
+        end
+      end
+
+      class RuleSet
+        attr_accessor :user_agent, :rules, :crawl_delay
+
+        def initialize(attr = {})
+          self.rules = []
+          attr.each { |k, v| public_send("#{k}=", v) }
+        end
+
+        def allowed_path?(uri)
+          rules.each do |rule|
+            return rule.allow if uri.path =~ rule.path
+          end
+          return true
+        end
+      end
+
+      class Rule
+        attr_accessor :path, :allow
+
+        def initialize(attr = {})
+          attr.each { |k, v| public_send("#{k}=", v) }
+        end
+      end
+
+      class Parser
+        UNMATCH_REGEXP = /^$/
+
+        def parse(body)
+          txt = Txt.new
+          sets = []
+          prev_key = nil
+
+          parse_body(body).each do |key, value|
+            case key
+            when 'user-agent'
+              new_set = RuleSet.new(user_agent: ua_regexp(value))
+              txt.sets << new_set
+              if prev_key == 'user-agent'
+                sets << new_set
+              else
+                sets = [new_set]
+              end
+            when 'allow'
+              re = path_regexp(value)
+              sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
+            when 'disallow'
+              re = path_regexp(value)
+              sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
+            when 'crawl-delay'
+              sets.each { |set| set.crawl_delay = value.to_i }
+            when 'sitemap'
+              txt.sitemaps << value
+            end
+
+            prev_key = key
+          end
+
+          sort(txt)
+        end
+
+        private
+
+        def parse_body(body)
+          lines = body.to_s.split(/\r|\n|\r\n/)
+          lines.map { |line| parse_line(line) }.compact
+        end
+
+        def parse_line(line)
+          line.strip!
+          if line.empty? || line.start_with?('#')
+            nil
+          else
+            split_line(line)
+          end
+        end
+
+        def split_line(line)
+          key, value = line.split(':', 2)
+          key = key.to_s.strip.downcase
+          value = value.to_s.sub(/#.*$/, '').strip
+          if key.empty? || value.empty?
+            nil
+          else
+            [key, value]
+          end
+        end
+
+        def ua_regexp(value)
+          Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
+        rescue RegexpError
+          UNMATCH_REGEXP
+        end
+
+        def path_regexp(value)
+          Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
+        rescue RegexpError
+          UNMATCH_REGEXP
+        end
+
+        def sort(txt)
+          txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
+          txt.sets.each do |set|
+            set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
+          end
+          txt
+        end
+      end
+    end
+  end
+end
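The parser groups consecutive User-agent lines into shared rule sets, compiles robots.txt globs into regexps, and sorts rules longest-first so the most specific rule wins. A quick illustration using the parser directly (the robots.txt content is invented, and require 'kudzu' is assumed to load the gem):

    require 'kudzu'
    require 'addressable/uri'

    txt = Kudzu::Agent::Robots::Parser.new.parse(<<~ROBOTS)
      User-agent: *
      Disallow: /private/
      Allow: /private/public/
      Crawl-delay: 2
      Sitemap: http://example.com/sitemap.xml
    ROBOTS

    set = txt.sets.first
    set.allowed_path?(Addressable::URI.parse('http://example.com/private/a'))         # => false
    set.allowed_path?(Addressable::URI.parse('http://example.com/private/public/b'))  # => true
    set.crawl_delay   # => 2
    txt.sitemaps      # => ["http://example.com/sitemap.xml"]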
data/lib/kudzu/agent/sleeper.rb
@@ -0,0 +1,44 @@
+module Kudzu
+  class Agent
+    class Sleeper
+      def initialize(config, robots = nil)
+        @config = config
+        @robots = robots
+        @monitor = Monitor.new
+        @last_accessed = {}
+      end
+
+      def politeness_delay(url)
+        uri = Addressable::URI.parse(url)
+        delay_sec = delay_second(uri)
+        return unless delay_sec
+
+        sleep_sec = sleep_second(uri, delay_sec)
+        sleep sleep_sec if sleep_sec > 0
+      end
+
+      private
+
+      def delay_second(uri)
+        if @config.respect_robots_txt && @robots && (crawl_delay = @robots.crawl_delay(uri))
+          crawl_delay.to_f
+        elsif @config.politeness_delay
+          @config.politeness_delay.to_f
+        end
+      end
+
+      def sleep_second(uri, delay_sec)
+        @monitor.synchronize do
+          now = Time.now.to_f
+          value = if @last_accessed[uri.host]
+                    (@last_accessed[uri.host] + delay_sec) - now
+                  else
+                    0
+                  end
+          @last_accessed[uri.host] = now + value
+          value
+        end
+      end
+    end
+  end
+end
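Sleeper keys its bookkeeping on the host, so workers sharing one instance space their requests per site: each call reserves the next slot (last access plus delay) and sleeps for whatever part of that interval has not yet elapsed. A minimal sketch; the Struct config is a hypothetical stand-in exposing only the two attributes Sleeper reads:

    require 'kudzu'

    # Hypothetical config stub: robots.txt crawl-delay off, 1-second politeness delay.
    config = Struct.new(:respect_robots_txt, :politeness_delay).new(false, 1.0)
    sleeper = Kudzu::Agent::Sleeper.new(config)

    sleeper.politeness_delay('http://example.com/a')  # first request: returns at once
    sleeper.politeness_delay('http://example.com/b')  # same host: sleeps ~1 second
    sleeper.politeness_delay('http://other.org/c')    # different host: no sleep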
data/lib/kudzu/agent/url_extractor.rb
@@ -0,0 +1,123 @@
+require 'nokogiri'
+
+module Kudzu
+  class Agent
+    class UrlExtractor
+      def initialize(config)
+        @config = config
+      end
+
+      def extract(page, base_url)
+        hrefs = if page.html?
+                  FromHTML.new(@config).extract(page)
+                elsif page.xml?
+                  FromXML.new(@config).extract(page)
+                else
+                  []
+                end
+
+        hrefs.select do |href|
+          href[:url] = normalize(href[:url], base_url)
+        end
+      end
+
+      private
+
+      def normalize(url, base_url)
+        uri = Addressable::URI.parse(base_url.to_s).join(url.to_s).normalize
+        uri.path = '/' unless uri.path
+        uri.fragment = nil
+
+        if uri.scheme.in?(%w(http https))
+          uri.to_s
+        else
+          nil
+        end
+      end
+
+      class FromHTML < UrlExtractor
+        def initialize(config)
+          super
+          @content_type_parser = Kudzu::Util::ContentTypeParser.new
+        end
+
+        def extract(page)
+          doc = Nokogiri::HTML(page.decoded_body)
+          return [] if nofollow?(doc)
+
+          if (filter = @config.find_filter(page.url))
+            if filter.allow_element
+              doc = doc.search(*Array(filter.allow_element))
+            end
+            if filter.deny_element
+              doc.search(*Array(filter.deny_element)).remove
+            end
+          end
+
+          hrefs = from_html(doc) + from_html_in_meta(doc)
+          hrefs.reject { |href| href[:url].empty? }.uniq
+        end
+
+        private
+
+        def nofollow?(doc)
+          return false unless @config.respect_nofollow
+          nodes = doc.xpath('//meta[@name]')
+          nodes.any? { |node| node[:name] =~ /^robots$/i && node[:content] =~ /nofollow/i }
+        end
+
+        def from_html(doc)
+          nodes = doc.xpath('.//*[@href or @src]').to_a
+
+          if @config.respect_nofollow
+            nodes.reject! { |node| node[:rel] =~ /nofollow/i }
+          end
+
+          nodes.map { |node|
+            { url: (node[:href] || node[:src]).to_s.strip,
+              title: node_to_title(node) }
+          }
+        end
+
+        def node_to_title(node)
+          unless node.inner_text.empty?
+            node.inner_text
+          else
+            (node[:title] || node[:alt]).to_s
+          end
+        end
+
+        def from_html_in_meta(doc)
+          nodes = doc.xpath('.//meta[@http-equiv]').select { |node| node[:'http-equiv'] =~ /^refresh$/i }
+          urls = nodes.map { |node| @content_type_parser.parse(node[:content]).last[:url] }.compact
+          urls.map { |url| { url: url.to_s.strip } }
+        end
+      end
+
+      class FromXML < UrlExtractor
+        def extract(page)
+          doc = Nokogiri::XML(page.decoded_body)
+          doc.remove_namespaces!
+          hrefs = from_xml_rss(doc) + from_xml_atom(doc)
+          hrefs.reject { |href| href[:url].empty? }.uniq
+        end
+
+        private
+
+        def from_xml_rss(doc)
+          doc.xpath('rss/channel').map { |node|
+            { url: node.xpath('./item/link').inner_text.strip,
+              title: node.xpath('./item/title').inner_text }
+          }
+        end
+
+        def from_xml_atom(doc)
+          doc.xpath('feed/entry').map { |node|
+            { url: node.xpath('./link[@href]/@href').to_s.strip,
+              title: node.xpath('./title').inner_text }
+          }
+        end
+      end
+    end
+  end
+end
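Normalization keeps the frontier free of duplicates and of schemes the fetcher cannot handle: relative links are resolved against the page URL, fragments are dropped, and anything that is not http(s) is discarded. The resolution step on its own, using Addressable directly (URLs are illustrative):

    require 'addressable/uri'

    uri = Addressable::URI.parse('http://example.com/blog/post.html').join('../about#team').normalize
    uri.fragment = nil
    uri.to_s  # => "http://example.com/about"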