kudzu 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class Filter
+       def initialize(config)
+         @config = config
+         @matcher = Kudzu::Util::Matcher.new
+       end
+
+       def allowed?(page)
+         filter = @config.find_filter(page.url)
+         return true unless filter
+
+         allowed_mime_type?(page.mime_type, filter) && allowed_size?(page.size, filter) && allowed_index?(page)
+       end
+
+       private
+
+       def allowed_mime_type?(mime_type, filter)
+         return true if mime_type.nil?
+         @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
+       end
+
+       def allowed_size?(size, filter)
+         return true if filter.max_size.nil? || size.nil?
+         size.to_i < filter.max_size.to_i
+       end
+
+       def allowed_index?(page)
+         return true unless page.html?
+         return true unless @config.respect_noindex
+
+         doc = Nokogiri::HTML(page.body.encode('ascii', undef: :replace, invalid: :replace))
+         doc.xpath('html/head/meta[@name]')
+            .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
+       end
+     end
+   end
+ end
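
The only non-obvious part of Filter is the noindex check: a page stays indexable unless a meta robots tag carries "noindex". A minimal standalone sketch of that logic (indexable? is an illustrative name, not part of the gem):

require 'nokogiri'

# Mirror of allowed_index? above: true unless <meta name="robots"> contains noindex.
def indexable?(html)
  doc = Nokogiri::HTML(html)
  doc.xpath('html/head/meta[@name]')
     .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
end

indexable?('<html><head><title>t</title></head></html>')                       # => true
indexable?('<html><head><meta name="robots" content="noindex"></head></html>') # => false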
@@ -0,0 +1,34 @@
+ require 'shared-mime-info'
+
+ module Kudzu
+   class Agent
+     class MimeTypeDetector
+       DEFAULT_MIME_TYPE = 'application/octet-stream'
+
+       def initialize
+         @content_type_parser = Kudzu::Util::ContentTypeParser.new
+       end
+
+       def detect(page)
+         from_header(page.response_header) || from_body(page.body) || from_url(page.url) || DEFAULT_MIME_TYPE
+       end
+
+       private
+
+       def from_header(header)
+         @content_type_parser.parse(header['content-type']).first
+       end
+
+       def from_body(body)
+         mime = MIME.check_magics(StringIO.new(body))
+         mime.to_s if mime
+       end
+
+       def from_url(url)
+         uri = Addressable::URI.parse(url)
+         mime = MIME.check_globs(uri.basename)
+         mime.to_s if mime
+       end
+     end
+   end
+ end
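
The detector tries the Content-Type header first, then magic bytes, then the filename glob, falling back to application/octet-stream. A sketch of the magic/glob fallback, assuming the freedesktop shared-mime-info database is installed on the system (guess_mime is an illustrative helper, not part of the gem):

require 'shared-mime-info'
require 'stringio'

# Magic bytes win over the filename glob; octet-stream is the last resort.
def guess_mime(body, filename)
  from_magic = MIME.check_magics(StringIO.new(body))
  from_glob  = MIME.check_globs(filename)
  (from_magic || from_glob || 'application/octet-stream').to_s
end

guess_mime("\x89PNG\r\n\x1a\n", 'unknown.bin') # => likely "image/png" (magic match)
guess_mime('plain text', 'page.html')          # => likely "text/html" (glob fallback)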
@@ -0,0 +1,190 @@
+ module Kudzu
+   class Agent
+     class Robots
+       def initialize(config)
+         @user_agent = config.user_agent
+         @page_fetcher = Kudzu::Agent::Fetcher.new(config)
+         @monitor = Monitor.new
+         @txt = {}
+       end
+
+       def allowed?(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         set = find_set(uri)
+         return true unless set
+         set.allowed_path?(uri)
+       end
+
+       def crawl_delay(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         set = find_set(uri)
+         return nil unless set
+         set.crawl_delay
+       end
+
+       def sitemaps(uri)
+         uri = Addressable::URI.parse(uri) if uri.is_a?(String)
+         txt = find_txt(uri)
+         return [] unless txt
+         txt.sitemaps
+       end
+
+       private
+
+       def find_txt(uri)
+         @monitor.synchronize do
+           @txt[uri.host] ||= fetch_and_parse(uri)
+         end
+       end
+
+       def find_set(uri)
+         txt = find_txt(uri)
+         return unless txt
+
+         txt.sets.each do |set|
+           return set if @user_agent =~ set.user_agent
+         end
+         return nil
+       end
+
+       def fetch_and_parse(uri)
+         response = fetch(uri)
+         if response && response.status == 200
+           body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
+           Parser.new.parse(body)
+         else
+           Parser.new.parse('')
+         end
+       end
+
+       def fetch(base_uri)
+         uri = base_uri.dup
+         uri.path = '/robots.txt'
+         uri.fragment = uri.query = nil
+
+         begin
+           @page_fetcher.fetch(uri.to_s)
+         rescue
+           nil
+         end
+       end
+
+       class Txt
+         attr_accessor :sets, :sitemaps
+
+         def initialize
+           self.sets = []
+           self.sitemaps = []
+         end
+       end
+
+       class RuleSet
+         attr_accessor :user_agent, :rules, :crawl_delay
+
+         def initialize(attr = {})
+           self.rules = []
+           attr.each { |k, v| public_send("#{k}=", v) }
+         end
+
+         def allowed_path?(uri)
+           rules.each do |rule|
+             return rule.allow if uri.path =~ rule.path
+           end
+           return true
+         end
+       end
+
+       class Rule
+         attr_accessor :path, :allow
+
+         def initialize(attr = {})
+           attr.each { |k, v| public_send("#{k}=", v) }
+         end
+       end
+
+       class Parser
+         UNMATCH_REGEXP = /^$/
+
+         def parse(body)
+           txt = Txt.new
+           sets = []
+           prev_key = nil
+
+           parse_body(body).each do |key, value|
+             case key
+             when 'user-agent'
+               new_set = RuleSet.new(user_agent: ua_regexp(value))
+               txt.sets << new_set
+               if prev_key == 'user-agent'
+                 sets << new_set
+               else
+                 sets = [new_set]
+               end
+             when 'allow'
+               re = path_regexp(value)
+               sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
+             when 'disallow'
+               re = path_regexp(value)
+               sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
+             when 'crawl-delay'
+               sets.each { |set| set.crawl_delay = value.to_i }
+             when 'sitemap'
+               txt.sitemaps << value
+             end
+
+             prev_key = key
+           end
+
+           sort(txt)
+         end
+
+         private
+
+         def parse_body(body)
+           lines = body.to_s.split(/\r|\n|\r\n/)
+           lines.map { |line| parse_line(line) }.compact
+         end
+
+         def parse_line(line)
+           line.strip!
+           if line.empty? || line.start_with?('#')
+             nil
+           else
+             split_line(line)
+           end
+         end
+
+         def split_line(line)
+           key, value = line.split(':', 2)
+           key = key.to_s.strip.downcase
+           value = value.to_s.sub(/#.*$/, '').strip
+           if key.empty? || value.empty?
+             nil
+           else
+             [key, value]
+           end
+         end
+
+         def ua_regexp(value)
+           Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
+         rescue RegexpError
+           UNMATCH_REGEXP
+         end
+
+         def path_regexp(value)
+           Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
+         rescue RegexpError
+           UNMATCH_REGEXP
+         end
+
+         def sort(txt)
+           txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
+           txt.sets.each do |set|
+             set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
+           end
+           txt
+         end
+       end
+     end
+   end
+ end
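
To see how the parser's longest-rule-first ordering plays out, a quick sketch assuming the classes above are loaded (MyCrawler/1.0 is a hypothetical user agent):

require 'addressable'

parser = Kudzu::Agent::Robots::Parser.new
txt = parser.parse(<<~ROBOTS)
  User-agent: *
  Disallow: /private/
  Allow: /private/public/
  Crawl-delay: 2
ROBOTS

# The wildcard group matches any agent; longer paths are checked first,
# so the Allow rule for /private/public/ overrides the broader Disallow.
set = txt.sets.find { |s| 'MyCrawler/1.0' =~ s.user_agent }
set.allowed_path?(Addressable::URI.parse('http://example.com/private/x'))        # => false
set.allowed_path?(Addressable::URI.parse('http://example.com/private/public/x')) # => true
set.crawl_delay                                                                  # => 2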
@@ -0,0 +1,44 @@
+ module Kudzu
+   class Agent
+     class Sleeper
+       def initialize(config, robots = nil)
+         @config = config
+         @robots = robots
+         @monitor = Monitor.new
+         @last_accessed = {}
+       end
+
+       def politeness_delay(url)
+         uri = Addressable::URI.parse(url)
+         delay_sec = delay_second(uri)
+         return unless delay_sec
+
+         sleep_sec = sleep_second(uri, delay_sec)
+         sleep sleep_sec if sleep_sec > 0
+       end
+
+       private
+
+       def delay_second(uri)
+         if @config.respect_robots_txt && @robots && (crawl_delay = @robots.crawl_delay(uri))
+           crawl_delay.to_f
+         elsif @config.politeness_delay
+           @config.politeness_delay.to_f
+         end
+       end
+
+       def sleep_second(uri, delay_sec)
+         @monitor.synchronize do
+           now = Time.now.to_f
+           value = if @last_accessed[uri.host]
+                     (@last_accessed[uri.host] + delay_sec) - now
+                   else
+                     0
+                   end
+           @last_accessed[uri.host] = now + value
+           value
+         end
+       end
+     end
+   end
+ end
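
Note that sleep_second reserves the next access slot per host rather than measuring elapsed time, so concurrent callers queue up delay_sec apart instead of all sleeping the same amount. The same arithmetic in isolation (book is an illustrative lambda):

last_accessed = {}
delay_sec = 1.5

# Each call books the host's next access time and returns how long to wait.
book = lambda do |host|
  now = Time.now.to_f
  wait = last_accessed[host] ? (last_accessed[host] + delay_sec) - now : 0
  last_accessed[host] = now + wait
  wait
end

book.call('example.com') # => 0 (first hit, no wait)
book.call('example.com') # => ~1.5 (slot booked 1.5s after the first)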
@@ -0,0 +1,16 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class TitleParser
+       def parse(page)
+         doc = Nokogiri::HTML(page.decoded_body)
+         if (node = doc.xpath('//head/title').first)
+           node.inner_text.to_s
+         else
+           ''
+         end
+       end
+     end
+   end
+ end
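
The same lookup outside the class, in two lines of Nokogiri:

require 'nokogiri'

# First //head/title node if present, '' otherwise.
doc = Nokogiri::HTML('<html><head><title>Kudzu</title></head></html>')
doc.xpath('//head/title').first&.inner_text.to_s # => "Kudzu"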
@@ -0,0 +1,123 @@
+ require 'nokogiri'
+
+ module Kudzu
+   class Agent
+     class UrlExtractor
+       def initialize(config)
+         @config = config
+       end
+
+       def extract(page, base_url)
+         hrefs = if page.html?
+                   FromHTML.new(@config).extract(page)
+                 elsif page.xml?
+                   FromXML.new(@config).extract(page)
+                 else
+                   []
+                 end
+
+         hrefs.select do |href|
+           href[:url] = normalize(href[:url], base_url)
+         end
+       end
+
+       private
+
+       def normalize(url, base_url)
+         uri = Addressable::URI.parse(base_url.to_s).join(url.to_s).normalize
+         uri.path = '/' unless uri.path
+         uri.fragment = nil
+
+         if uri.scheme.in?(%w(http https))
+           uri.to_s
+         else
+           nil
+         end
+       end
+
+       class FromHTML < UrlExtractor
+         def initialize(config)
+           super
+           @content_type_parser = Kudzu::Util::ContentTypeParser.new
+         end
+
+         def extract(page)
+           doc = Nokogiri::HTML(page.decoded_body)
+           return [] if nofollow?(doc)
+
+           if (filter = @config.find_filter(page.url))
+             if filter.allow_element
+               doc = doc.search(*Array(filter.allow_element))
+             end
+             if filter.deny_element
+               doc.search(*Array(filter.deny_element)).remove
+             end
+           end
+
+           hrefs = from_html(doc) + from_html_in_meta(doc)
+           hrefs.reject { |href| href[:url].empty? }.uniq
+         end
+
+         private
+
+         def nofollow?(doc)
+           return false unless @config.respect_nofollow
+           nodes = doc.xpath('//meta[@name]')
+           nodes.any? { |node| node[:name] =~ /^robots$/i && node[:content] =~ /nofollow/i }
+         end
+
+         def from_html(doc)
+           nodes = doc.xpath('.//*[@href or @src]').to_a
+
+           if @config.respect_nofollow
+             nodes.reject! { |node| node[:rel] =~ /nofollow/i }
+           end
+
+           nodes.map { |node|
+             { url: (node[:href] || node[:src]).to_s.strip,
+               title: node_to_title(node) }
+           }
+         end
+
+         def node_to_title(node)
+           unless node.inner_text.empty?
+             node.inner_text
+           else
+             (node[:title] || node[:alt]).to_s
+           end
+         end
+
+         def from_html_in_meta(doc)
+           nodes = doc.xpath('.//meta[@http-equiv]').select { |node| node[:'http-equiv'] =~ /^refresh$/i }
+           urls = nodes.map { |node| @content_type_parser.parse(node[:content]).last[:url] }.compact
+           urls.map { |url| { url: url.to_s.strip } }
+         end
+       end
+
+       class FromXML < UrlExtractor
+         def extract(page)
+           doc = Nokogiri::XML(page.decoded_body)
+           doc.remove_namespaces!
+           hrefs = from_xml_rss(doc) + from_xml_atom(doc)
+           hrefs.reject { |href| href[:url].empty? }.uniq
+         end
+
+         private
+
+         def from_xml_rss(doc)
+           doc.xpath('rss/channel').map { |node|
+             { url: node.xpath('./item/link').inner_text.strip,
+               title: node.xpath('./item/title').inner_text }
+           }
+         end
+
+         def from_xml_atom(doc)
+           doc.xpath('feed/entry').map { |node|
+             { url: node.xpath('./link[@href]/@href').to_s.strip,
+               title: node.xpath('./title').inner_text }
+           }
+         end
+       end
+     end
+   end
+ end
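
The FromHTML scan boils down to one XPath over href/src attributes plus the rel=nofollow filter. A standalone sketch of that core:

require 'nokogiri'

html = <<~HTML
  <html><body>
    <a href="/a" rel="nofollow">skip me</a>
    <a href="/b">keep me</a>
    <img src="/logo.png" alt="logo">
  </body></html>
HTML

# One XPath matches every element with an href or src; nofollow links are dropped.
doc = Nokogiri::HTML(html)
nodes = doc.xpath('.//*[@href or @src]').to_a
nodes = nodes.reject { |node| node[:rel] =~ /nofollow/i }
nodes.map { |node| (node[:href] || node[:src]).to_s } # => ["/b", "/logo.png"]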