kudzu 1.0.0

@@ -0,0 +1,40 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class Filter
+       def initialize(config)
+         @config = config
+         @matcher = Kudzu::Util::Matcher.new
+       end
+
+       def allowed?(page)
+         filter = @config.find_filter(page.url)
+         return true unless filter
+
+         allowed_mime_type?(page.mime_type, filter) && allowed_size?(page.size, filter) && allowed_index?(page)
+       end
+
+       private
+
+       def allowed_mime_type?(mime_type, filter)
+         return true if mime_type.nil?
+         @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
+       end
+
+       def allowed_size?(size, filter)
+         return true if filter.max_size.nil? || size.nil?
+         size.to_i < filter.max_size.to_i
+       end
+
+       def allowed_index?(page)
+         return true unless page.html?
+         return true unless @config.respect_noindex
+
+         doc = Nokogiri::HTML(page.body.encode('ascii', undef: :replace, invalid: :replace))
+         doc.xpath('html/head/meta[@name]')
+            .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
+       end
+     end
+   end
+ end
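
A usage sketch for the Filter class above. The shapes of config (responding to find_filter and respect_noindex) and page (url, mime_type, size, html?, body) are inferred from how Filter calls them, not from documented API:

    filter = Kudzu::Agent::Filter.new(config)
    filter.allowed?(page)  # false when the mime type, size, or a <meta name="robots" content="noindex"> tag rejects the page
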
@@ -0,0 +1,34 @@
+ require 'shared-mime-info'
+
+ module Kudzu
+   class Agent
+     class MimeTypeDetector
+       DEFAULT_MIME_TYPE = 'application/octet-stream'
+
+       def initialize
+         @content_type_parser = Kudzu::Util::ContentTypeParser.new
+       end
+
+       def detect(page)
+         from_header(page.response_header) || from_body(page.body) || from_url(page.url) || DEFAULT_MIME_TYPE
+       end
+
+       private
+
+       def from_header(header)
+         @content_type_parser.parse(header['content-type']).first
+       end
+
+       def from_body(body)
+         mime = MIME.check_magics(StringIO.new(body))
+         mime.to_s if mime
+       end
+
+       def from_url(url)
+         uri = Addressable::URI.parse(url)
+         mime = MIME.check_globs(uri.basename)
+         mime.to_s if mime
+       end
+     end
+   end
+ end
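
Detection falls back through three sources in order: the Content-Type response header, magic bytes in the body, then filename globs from the URL, defaulting to application/octet-stream. A minimal sketch, where page is a stand-in for Kudzu's fetched-page object:

    detector = Kudzu::Agent::MimeTypeDetector.new
    detector.detect(page)  # e.g. 'text/html' from the header, or 'application/octet-stream' when nothing matches
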
@@ -0,0 +1,190 @@
+ module Kudzu
+   class Agent
+     class Robots
+       def initialize(config)
+         @user_agent = config.user_agent
+         @page_fetcher = Kudzu::Agent::Fetcher.new(config)
+         @monitor = Monitor.new
+         @txt = {}
+       end
+
+       def allowed?(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         set = find_set(uri)
+         return true unless set
+         set.allowed_path?(uri)
+       end
+
+       def crawl_delay(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         set = find_set(uri)
+         return nil unless set
+         set.crawl_delay
+       end
+
+       def sitemaps(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         txt = find_txt(uri)
+         return [] unless txt
+         txt.sitemaps
+       end
+
+       private
+
+       def find_txt(uri)
+         @monitor.synchronize do
+           @txt[uri.host] ||= fetch_and_parse(uri)
+         end
+       end
+
+       def find_set(uri)
+         txt = find_txt(uri)
+         return unless txt
+
+         txt.sets.each do |set|
+           return set if @user_agent =~ set.user_agent
+         end
+         return nil
+       end
+
+       def fetch_and_parse(uri)
+         response = fetch(uri)
+         if response && response.status == 200
+           body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
+           Parser.new.parse(body)
+         else
+           Parser.new.parse('')
+         end
+       end
+
+       def fetch(base_uri)
+         uri = base_uri.dup
+         uri.path = '/robots.txt' # must be absolute: Addressable rejects a relative path on a URI with an authority
+         uri.fragment = uri.query = nil
+
+         begin
+           @page_fetcher.fetch(uri.to_s)
+         rescue
+           nil
+         end
+       end
+
+       class Txt
+         attr_accessor :sets, :sitemaps
+
+         def initialize
+           self.sets = []
+           self.sitemaps = []
+         end
+       end
+
+       class RuleSet
+         attr_accessor :user_agent, :rules, :crawl_delay
+
+         def initialize(attr = {})
+           self.rules = []
+           attr.each { |k, v| public_send("#{k}=", v) }
+         end
+
+         def allowed_path?(uri)
+           rules.each do |rule|
+             return rule.allow if uri.path =~ rule.path
+           end
+           return true
+         end
+       end
+
+       class Rule
+         attr_accessor :path, :allow
+
+         def initialize(attr = {})
+           attr.each { |k, v| public_send("#{k}=", v) }
+         end
+       end
+
+       class Parser
+         UNMATCH_REGEXP = /^$/
+
+         def parse(body)
+           txt = Txt.new
+           sets = []
+           prev_key = nil
+
+           parse_body(body).each do |key, value|
+             case key
+             when 'user-agent'
+               new_set = RuleSet.new(user_agent: ua_regexp(value))
+               txt.sets << new_set
+               if prev_key == 'user-agent'
+                 sets << new_set
+               else
+                 sets = [new_set]
+               end
+             when 'allow'
+               re = path_regexp(value)
+               sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
+             when 'disallow'
+               re = path_regexp(value)
+               sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
+             when 'crawl-delay'
+               sets.each { |set| set.crawl_delay = value.to_i }
+             when 'sitemap'
+               txt.sitemaps << value
+             end
+
+             prev_key = key
+           end
+
+           sort(txt)
+         end
+
+         private
+
+         def parse_body(body)
+           lines = body.to_s.split(/\r\n|\r|\n/)
+           lines.map { |line| parse_line(line) }.compact
+         end
+
+         def parse_line(line)
+           line.strip!
+           if line.empty? || line.start_with?('#')
+             nil
+           else
+             split_line(line)
+           end
+         end
+
+         def split_line(line)
+           key, value = line.split(':', 2)
+           key = key.to_s.strip.downcase
+           value = value.to_s.sub(/#.*$/, '').strip
+           if key.empty? || value.empty?
+             nil
+           else
+             [key, value]
+           end
+         end
+
+         def ua_regexp(value)
+           Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
+         rescue RegexpError
+           UNMATCH_REGEXP
+         end
+
+         def path_regexp(value)
+           Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
+         rescue RegexpError
+           UNMATCH_REGEXP
+         end
+
+         def sort(txt)
+           txt.sets.sort_by! { |set| [-set.user_agent.to_s.count('*'), set.user_agent.to_s.length] }.reverse!
+           txt.sets.each do |set|
+             set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
+           end
+           txt
+         end
+       end
+     end
+   end
+ end
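
A usage sketch for the robots.txt handling above (config supplies the user agent and fetcher settings; robots.txt is fetched once per host and cached behind a monitor):

    robots = Kudzu::Agent::Robots.new(config)
    robots.allowed?('http://example.com/private/page')  # false when a matching Disallow rule applies
    robots.crawl_delay('http://example.com/')           # Crawl-delay seconds for this user agent, or nil
    robots.sitemaps('http://example.com/')              # Sitemap URLs listed in robots.txt
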
@@ -0,0 +1,44 @@
+ module Kudzu
+   class Agent
+     class Sleeper
+       def initialize(config, robots = nil)
+         @config = config
+         @robots = robots
+         @monitor = Monitor.new
+         @last_accessed = {}
+       end
+
+       def politeness_delay(url)
+         uri = Addressable::URI.parse(url)
+         delay_sec = delay_second(uri)
+         return unless delay_sec
+
+         sleep_sec = sleep_second(uri, delay_sec)
+         sleep sleep_sec if sleep_sec > 0
+       end
+
+       private
+
+       def delay_second(uri)
+         if @config.respect_robots_txt && @robots && (crawl_delay = @robots.crawl_delay(uri))
+           crawl_delay.to_f
+         elsif @config.politeness_delay
+           @config.politeness_delay.to_f
+         end
+       end
+
+       def sleep_second(uri, delay_sec)
+         @monitor.synchronize do
+           now = Time.now.to_f
+           value = if @last_accessed[uri.host]
+                     [(@last_accessed[uri.host] + delay_sec) - now, 0].max # clamp so idle time doesn't accumulate into a burst allowance
+                   else
+                     0
+                   end
+           @last_accessed[uri.host] = now + value
+           value
+         end
+       end
+     end
+   end
+ end
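
A sketch of the per-host throttling (assuming config exposes respect_robots_txt and politeness_delay, as the code above reads them):

    sleeper = Kudzu::Agent::Sleeper.new(config, robots)
    sleeper.politeness_delay('http://example.com/a')  # first request to the host returns immediately
    sleeper.politeness_delay('http://example.com/b')  # same host: blocks until the configured delay has elapsed
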
@@ -0,0 +1,16 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class TitleParser
+       def parse(page)
+         doc = Nokogiri::HTML(page.decoded_body)
+         if (node = doc.xpath('//head/title').first)
+           node.inner_text.to_s
+         else
+           ''
+         end
+       end
+     end
+   end
+ end
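
Usage is a single call; page#decoded_body is assumed to return the HTML as an already-decoded string:

    Kudzu::Agent::TitleParser.new.parse(page)  # => the <title> text, or '' when the page has none
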
@@ -0,0 +1,123 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class UrlExtractor
+       def initialize(config)
+         @config = config
+       end
+
+       def extract(page, base_url)
+         hrefs = if page.html?
+                   FromHTML.new(@config).extract(page)
+                 elsif page.xml?
+                   FromXML.new(@config).extract(page)
+                 else
+                   []
+                 end
+
+         hrefs.select do |href|
+           href[:url] = normalize(href[:url], base_url) # nil for non-http(s) URLs, so select drops them
+         end
+       end
+
+       private
+
+       def normalize(url, base_url)
+         uri = Addressable::URI.parse(base_url.to_s).join(url.to_s).normalize
+         uri.path = '/' if uri.path.to_s.empty? # Addressable returns '' (not nil) for a missing path
+         uri.fragment = nil
+
+         if %w(http https).include?(uri.scheme)
+           uri.to_s
+         else
+           nil
+         end
+       end
+
+       class FromHTML < UrlExtractor
+         def initialize(config)
+           super
+           @content_type_parser = Kudzu::Util::ContentTypeParser.new
+         end
+
+         def extract(page)
+           doc = Nokogiri::HTML(page.decoded_body)
+           return [] if nofollow?(doc)
+
+           if (filter = @config.find_filter(page.url))
+             if filter.allow_element
+               doc = doc.search(*Array(filter.allow_element))
+             end
+             if filter.deny_element
+               doc.search(*Array(filter.deny_element)).remove
+             end
+           end
+
+           hrefs = from_html(doc) + from_html_in_meta(doc)
+           hrefs.reject { |href| href[:url].empty? }.uniq
+         end
+
+         private
+
+         def nofollow?(doc)
+           return false unless @config.respect_nofollow
+           nodes = doc.xpath('//meta[@name]')
+           nodes.any? { |node| node[:name] =~ /^robots$/i && node[:content] =~ /nofollow/i }
+         end
+
+         def from_html(doc)
+           nodes = doc.xpath('.//*[@href or @src]').to_a
+
+           if @config.respect_nofollow
+             nodes.reject! { |node| node[:rel] =~ /nofollow/i }
+           end
+
+           nodes.map { |node|
+             { url: (node[:href] || node[:src]).to_s.strip,
+               title: node_to_title(node) }
+           }
+         end
+
+         def node_to_title(node)
+           unless node.inner_text.empty?
+             node.inner_text
+           else
+             (node[:title] || node[:alt]).to_s
+           end
+         end
+
+         def from_html_in_meta(doc)
+           nodes = doc.xpath('.//meta[@http-equiv]').select { |node| node[:'http-equiv'] =~ /^refresh$/i }
+           urls = nodes.map { |node| @content_type_parser.parse(node[:content]).last[:url] }.compact
+           urls.map { |url| { url: url.to_s.strip } }
+         end
+       end
+
+       class FromXML < UrlExtractor
+         def extract(page)
+           doc = Nokogiri::XML(page.decoded_body)
+           doc.remove_namespaces!
+           hrefs = from_xml_rss(doc) + from_xml_atom(doc)
+           hrefs.reject { |href| href[:url].empty? }.uniq
+         end
+
+         private
+
+         def from_xml_rss(doc)
+           doc.xpath('rss/channel/item').map { |node|
+             { url: node.xpath('./link').inner_text.strip,
+               title: node.xpath('./title').inner_text }
+           }
+         end
+
+         def from_xml_atom(doc)
+           doc.xpath('feed/entry').map { |node|
+             { url: node.xpath('./link[@href]/@href').to_s.strip,
+               title: node.xpath('./title').inner_text }
+           }
+         end
+       end
+     end
+   end
+ end
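
A closing sketch of link extraction, with the same assumed config and page shapes as above (HTML pages go through FromHTML, RSS/Atom feeds through FromXML, and every href is joined against the base URL and normalized):

    extractor = Kudzu::Agent::UrlExtractor.new(config)
    extractor.extract(page, page.url)
    # => [{ url: 'http://example.com/next', title: 'Next' }, ...]
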