kudzu 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
module Kudzu
  class Agent
    # Decides whether URLs found on a page may be followed, using the filter
    # that the configuration returns for the page's own URL.
    class UrlFilter
      # @param config [#find_filter] configuration used to look up the filter
      #   that applies to a base URI
      def initialize(config)
        @config = config
        @matcher = Kudzu::Util::Matcher.new
      end

      # Splits hrefs into those that pass the filter and those that do not.
      #
      # @param hrefs [Array<Hash>] link hashes; only the +:url+ key is read
      # @param base_url [String] URL of the page the links were found on
      # @return [Array(Array<Hash>, Array<Hash>)] +[passed, dropped]+
      def filter(hrefs, base_url)
        base_uri = Addressable::URI.parse(base_url)
        filter = @config.find_filter(base_uri)

        hrefs.partition do |href|
          allowed?(href[:url], base_uri, filter: filter)
        end
      end

      # Checks a single URL against the filter for +base_uri+.
      #
      # @param url [String] URL to test
      # @param base_uri [String, Addressable::URI] URL the link was found on
      # @param filter [Object, nil] filter to apply; looked up from the
      #   configuration when omitted
      # @return [Boolean] truthy when the URL may be followed; always +true+
      #   when no filter applies to +base_uri+
      def allowed?(url, base_uri, filter: nil)
        uri = Addressable::URI.parse(url)
        base_uri = Addressable::URI.parse(base_uri) if base_uri.is_a?(String)
        filter ||= @config.find_filter(base_uri)
        return true unless filter

        focused_host?(uri, base_uri, filter) &&
          focused_descendants?(uri, base_uri, filter) &&
          allowed_url?(uri, filter) &&
          allowed_host?(uri, filter) &&
          allowed_path?(uri, filter) &&
          allowed_ext?(uri, filter)
      end

      private

      # With +focus_host+ set, only URLs on the base URI's host pass.
      def focused_host?(uri, base_uri, filter)
        return true unless filter.focus_host
        uri.host == base_uri.host
      end

      # With +focus_descendants+ set, only URLs on the same host whose
      # directory starts with the base URI's directory pass.
      def focused_descendants?(uri, base_uri, filter)
        return true unless filter.focus_descendants
        return false unless uri.host == base_uri.host

        dir = Kudzu::Common.path_to_dir(uri.path)
        base_dir = Kudzu::Common.path_to_dir(base_uri.path)
        # \A anchors at the start of the string. ^ would also match after any
        # newline inside the path and let non-descendant URLs through.
        !(dir =~ /\A#{Regexp.escape(base_dir)}/i).nil?
      end

      def allowed_url?(uri, filter)
        @matcher.match?(uri.to_s, allows: filter.allow_url, denies: filter.deny_url)
      end

      def allowed_host?(uri, filter)
        @matcher.match?(uri.host, allows: filter.allow_host, denies: filter.deny_host)
      end

      def allowed_path?(uri, filter)
        @matcher.match?(uri.path, allows: filter.allow_path, denies: filter.deny_path)
      end

      # URLs without an extension always pass; otherwise the extension,
      # without its leading dot, is matched against allow_ext / deny_ext.
      def allowed_ext?(uri, filter)
        ext = uri.extname.to_s.sub(/\A\./, '')
        return true if ext.empty?
        @matcher.match?(ext, allows: filter.allow_ext, denies: filter.deny_ext)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module Kudzu
  # Registry of user-supplied hooks. The block given to +new+ is evaluated
  # against the instance, so hooks are registered by calling the methods named
  # in CALLBACKS with a block, e.g. <tt>on_success { |page, link| ... }</tt>.
  class Callback
    CALLBACKS = [:on_success,      # 2xx
                 :on_redirection,  # 3xx
                 :on_client_error, # 4xx
                 :on_server_error, # 5xx
                 :on_filter,       # 2xx, filtered
                 :on_failure,      # Exception
                 :before_register,
                 :after_register,
                 :before_delete,
                 :after_delete,
                 :before_enqueue,
                 :after_enqueue].freeze

    # @yield optional registration block, evaluated with +self+ set to this
    #   instance
    def initialize(&block)
      @callback = {}
      instance_eval(&block) if block
    end

    # One registration method per hook name; each stores the given block.
    CALLBACKS.each do |key|
      define_method(key) do |&block|
        @callback[key] = block
      end
    end

    # Invokes the +on_<name>+ hook, if one was registered.
    #
    # @param name [Symbol, String] hook name without the +on_+ prefix
    # @param args arguments forwarded to the hook
    # @return the hook's return value, or +nil+ when no hook is registered
    def on(name, *args)
      # Test the stored handler rather than the key: a registration call made
      # without a block stores nil, which must be treated as "no hook".
      handler = @callback[:"on_#{name}"]
      handler.call(*args) if handler
    end

    # Runs the +before_<name>+ hook, yields, then runs the +after_<name>+ hook.
    # An exception raised by the block skips the after hook.
    #
    # @param name [Symbol, String] hook name without the before_/after_ prefix
    # @param args arguments forwarded to both hooks
    # @return the after hook's return value, or +nil+ when none is registered
    def around(name, *args)
      before = @callback[:"before_#{name}"]
      before.call(*args) if before
      yield
      after = @callback[:"after_#{name}"]
      after.call(*args) if after
    end
  end
end
@@ -0,0 +1,23 @@
1
module Kudzu
  # Small stateless helpers shared by the configuration and the URL filter.
  class Common
    class << self
      # Tests +text+ against a glob string or a regular expression.
      #
      # @param text [String, nil] text to test; nil is treated as ""
      # @param pattern [String, Regexp, Object] a String is matched with
      #   File.fnmatch, a Regexp with =~; any other type never matches
      # @return [Boolean]
      def match?(text, pattern)
        subject = text.to_s
        case pattern
        when String then File.fnmatch(pattern, subject)
        when Regexp then !(subject =~ pattern).nil?
        else false
        end
      end

      # Returns the directory part of a URL path, always ending in "/".
      #
      # @param path [String, nil] URL path such as "/a/b.html"
      # @return [String] e.g. "/a/" for "/a/b.html", "/" for "/b.html" or ""
      def path_to_dir(path)
        path = path.to_s
        # An empty path refers to the root of the host.
        return '/' if path.empty?
        return path if path.end_with?('/')

        dir = File.dirname(path)
        # File.dirname("/b.html") is already "/"; appending another slash
        # would yield "//", which is not a prefix of any real directory.
        dir.end_with?('/') ? dir : "#{dir}/"
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'ostruct'
2
+ require_relative 'config/filter'
3
+
4
module Kudzu
  # Crawler settings plus a registry of URL/page filters keyed by host
  # pattern. Settings can come from an options hash, a Ruby config file
  # and/or a configuration block.
  class Config
    SIMPLE_CONFIGS = [:config_file,
                      :user_agent, :thread_num, :open_timeout, :read_timeout,
                      :max_connection, :max_redirect, :max_depth, :default_request_header,
                      :politeness_delay, :handle_cookie,
                      :respect_robots_txt, :respect_nofollow, :respect_noindex,
                      :log_file, :log_level,
                      :revisit_mode, :revisit_min_interval, :revisit_max_interval, :revisit_default_interval,
                      :filters].freeze
    DEFAULT_CONFIG = { user_agent: "Kudzu/#{Kudzu::VERSION}",
                       open_timeout: 10,
                       read_timeout: 10,
                       thread_num: 1,
                       max_connection: 10,
                       max_redirect: 3,
                       politeness_delay: 0.5,
                       handle_cookie: true,
                       respect_robots_txt: true,
                       respect_nofollow: true,
                       respect_noindex: true,
                       revisit_mode: false,
                       revisit_min_interval: 1,
                       revisit_max_interval: 10,
                       revisit_default_interval: 5 }.freeze

    attr_accessor(*SIMPLE_CONFIGS)

    # Receiver for the configuration DSL: every setting becomes a one-argument
    # method that assigns the value on the wrapped Config.
    class Delegator
      def initialize(config)
        @config = config
      end

      Kudzu::Config::SIMPLE_CONFIGS.each do |key|
        define_method(key) do |value|
          @config.send("#{key}=", value)
        end
      end

      def add_filter(base_url = nil, config = {}, &block)
        @config.add_filter(base_url, config, &block)
      end
    end

    # @param config [Hash] setting name => value, merged over DEFAULT_CONFIG;
    #   every key must have a matching setter
    # @yield optional DSL block evaluated against a Delegator, after the
    #   config file (if any)
    def initialize(config = {}, &block)
      self.filters = {}
      DEFAULT_CONFIG.merge(config).each do |key, value|
        send("#{key}=", value)
      end
      return unless config_file || block

      delegator = Kudzu::Config::Delegator.new(self)
      # NOTE: the config file is executed as Ruby code, so it must come from a
      # trusted source. The path is passed along so that errors raised inside
      # the file are reported with its name instead of "(eval)".
      delegator.instance_eval(File.read(config_file), config_file.to_s) if config_file
      delegator.instance_eval(&block) if block
    end

    # Registers a filter for the host and path of +base_url+. A missing host
    # or path is stored as the wildcard "*".
    #
    # @param base_url [String, nil] URL pattern the filter applies to
    # @param config [Hash] filter settings passed to Filter.new
    # @yield optional filter DSL block passed to Filter.new
    def add_filter(base_url = nil, config = {}, &block)
      base_uri = Addressable::URI.parse(base_url || '*')
      host = base_uri.host.presence || '*'
      path = base_uri.path.presence || '*'
      filters[host] ||= []
      filters[host] << Filter.new(path, config, &block)
    end

    # Returns the first registered filter whose host pattern and path pattern
    # both match +uri+, in registration order.
    #
    # @param uri [String, Addressable::URI]
    # @return [Filter, nil] nil when no filter matches
    def find_filter(uri)
      uri = Addressable::URI.parse(uri) if uri.is_a?(String)
      filters.each do |host_pattern, host_filters|
        next unless Kudzu::Common.match?(uri.host, host_pattern)

        matched = host_filters.find { |filter| Kudzu::Common.match?(uri.path, filter.path) }
        return matched if matched
      end
      nil
    end
  end
end
@@ -0,0 +1,39 @@
1
module Kudzu
  class Config
    # A set of URL and page filter settings bound to a path pattern.
    class Filter
      SIMPLE_CONFIGS = [# url filter
                        :focus_host, :focus_descendants, :allow_element, :deny_element,
                        :allow_url, :deny_url, :allow_host, :deny_host, :allow_path, :deny_path,
                        :allow_ext, :deny_ext,
                        # page filter
                        :allow_mime_type, :deny_mime_type, :max_size]
      DEFAULT_CONFIG = { focus_host: false,
                         focus_descendants: false }

      attr_accessor :path
      attr_accessor(*SIMPLE_CONFIGS)

      # Receiver for the filter DSL: each setting is exposed as a one-argument
      # method that assigns the value on the wrapped filter.
      class Delegator
        def initialize(filter)
          @filter = filter
        end

        Kudzu::Config::Filter::SIMPLE_CONFIGS.each do |name|
          define_method(name) { |value| @filter.send("#{name}=", value) }
        end
      end

      # @param path [String] path pattern this filter applies to
      # @param config [Hash] setting name => value, merged over DEFAULT_CONFIG
      # @yield optional DSL block evaluated against a Delegator for this filter
      def initialize(path, config = {}, &block)
        @path = path
        settings = DEFAULT_CONFIG.merge(config)
        settings.each_pair { |name, value| send("#{name}=", value) }
        return unless block

        dsl = Kudzu::Config::Filter::Delegator.new(self)
        dsl.instance_eval(&block)
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
require 'digest'
require 'securerandom'
require 'time'
require 'addressable'
require 'nokogiri'
require_relative 'common'
require_relative 'config'
require_relative 'callback'
require_relative 'logger'
require_relative 'adapter/memory'
require_relative 'util/all'
require_relative 'agent/all'
require_relative 'revisit/all'
11
+
12
module Kudzu
  # Drives a crawl: takes links from the frontier, fetches them, records the
  # resulting pages in the repository, enqueues the links found on them and
  # fires the user callbacks along the way.
  class Crawler
    attr_reader :uuid, :config
    attr_reader :frontier, :repository

    # @param options [Hash] crawler settings; +:uuid+ (optional) names the
    #   crawl, every other key is handed to Kudzu::Config
    # @yield optional configuration block handed to Kudzu::Config
    def initialize(options = {}, &block)
      @uuid = options[:uuid] || SecureRandom.uuid
      # :uuid belongs to the crawler. Kudzu::Config calls a setter for every
      # key it receives and has none for uuid, so it is filtered out here,
      # without mutating the caller's hash.
      config_options = options.reject { |key, _value| key == :uuid }
      @config = Kudzu::Config.new(config_options, &block)
    end

    # Builds the logger, callbacks, storage adapters and agents for a crawl.
    #
    # @yield optional callback registration block handed to Kudzu::Callback
    def prepare(&block)
      @logger = Kudzu::Logger.new(@config.log_file, @config.log_level)
      @callback = Kudzu::Callback.new(&block)

      @frontier = Kudzu.adapter::Frontier.new(@uuid)
      @repository = Kudzu.adapter::Repository.new

      @robots = Kudzu::Agent::Robots.new(@config)
      @page_fetcher = Kudzu::Agent::Fetcher.new(@config, @robots)
      @page_filter = Kudzu::Agent::Filter.new(@config)
      @charset_detector = Kudzu::Agent::CharsetDetector.new
      @mime_type_detector = Kudzu::Agent::MimeTypeDetector.new
      @title_parser = Kudzu::Agent::TitleParser.new

      @url_extractor = Kudzu::Agent::UrlExtractor.new(@config)
      @url_filter = Kudzu::Agent::UrlFilter.new(@config)

      @revisit_scheduler = Kudzu::Revisit::Scheduler.new(@config)
    end

    # Crawls starting from the given seed URL(s) at depth 1, then closes the
    # fetcher's connection pool and clears the frontier.
    #
    # @param seed_url [String, Array<String>] one or more start URLs
    # @yield optional callback registration block (see #prepare)
    def run(seed_url, &block)
      prepare(&block)

      seeds = Array(seed_url).map { |url| { url: url } }
      enqueue_hrefs(seeds, 1)

      if @config.thread_num.to_i <= 1
        single_thread
      else
        multi_thread(@config.thread_num)
      end

      @page_fetcher.pool.close
      @frontier.clear
    end

    private

    # Visits links one at a time until the frontier yields none.
    def single_thread
      loop do
        link = @frontier.dequeue.first
        break unless link
        visit_link(link)
      end
    end

    # Visits links on a thread pool; each pass tops the pool's queue up from
    # the frontier before taking one link to visit.
    def multi_thread(thread_num)
      @thread_pool = Kudzu::Util::ThreadPool.new(thread_num)

      @thread_pool.start do |queue|
        limit_num = [thread_num - queue.size, 0].max
        @frontier.dequeue(limit: limit_num).each do |link|
          queue.push(link)
        end
        link = queue.pop
        visit_link(link)
      end

      @thread_pool.wait
      @thread_pool.shutdown
    end

    # Fetches one link, updates the matching page record according to the
    # response status and fires the status callback. Does nothing further
    # when the fetch failed.
    def visit_link(link)
      page = @repository.find_by_url(link.url)
      response = fetch_link(link, build_request_header(page))
      return unless response

      # After a redirect the record for the final URL is the one updated.
      page = @repository.find_by_url(response.url) if response.redirected?
      page.url = response.url
      page.status = response.status
      page.response_time = response.time
      page.fetched_at = Time.now

      if page.status_success?
        handle_success(page, link, response)
      elsif page.status_not_modified?
        @revisit_scheduler.schedule(page, modified: false)
        register_page(page)
      elsif page.status_not_found? || page.status_gone?
        delete_page(page)
      end

      run_callback(page, link)
    end

    # Fires the on_* callback that corresponds to the page's status.
    def run_callback(page, link)
      if page.status_success?
        if page.filtered
          @callback.on(:filter, page, link)
        else
          @callback.on(:success, page, link)
        end
      elsif page.status_redirection?
        @callback.on(:redirection, page, link)
      elsif page.status_client_error?
        @callback.on(:client_error, page, link)
      elsif page.status_server_error?
        @callback.on(:server_error, page, link)
      end
    end

    # Builds the request headers for a page: the configured defaults plus, in
    # revisit mode, conditional-request headers from the stored page.
    #
    # @return [Hash] a fresh hash on every call
    def build_request_header(page)
      # Hash#to_h returns the receiver itself, so copy before adding per-page
      # headers; otherwise they would accumulate in the shared configuration
      # and leak into requests for other pages.
      header = @config.default_request_header.to_h.dup
      if @config.revisit_mode
        header['If-Modified-Since'] = page.last_modified.httpdate if page.last_modified
        header['If-None-Match'] = page.etag if page.etag
      end
      header
    end

    # Fetches a link. Failures are logged and reported through the
    # on_failure callback.
    #
    # @return the fetcher's response, or nil when fetching raised an error
    def fetch_link(link, request_header)
      response = @page_fetcher.fetch(link.url, request_header: request_header)
      @logger.log :info, "page fetched: #{response.status} #{response.url}"
      response
    rescue StandardError => e
      # Only StandardError is treated as a fetch failure: Interrupt,
      # SystemExit and similar must propagate so the crawl can be stopped.
      @logger.log :warn, "couldn't fetch page: #{link.url}", error: e
      @callback.on(:failure, link, e)
      nil
    end

    # Processes a successful response: schedules the next revisit, fills in
    # the page attributes, enqueues outgoing links and then registers or
    # drops the page depending on the page filter.
    def handle_success(page, link, response)
      digest = Digest::MD5.hexdigest(response.body)
      @revisit_scheduler.schedule(page, modified: page.digest != digest)

      page.response_header = response.header
      page.body = response.body
      page.size = response.body.size
      page.mime_type = detect_mime_type(page)
      page.charset = detect_charset(page)
      page.title = parse_title(page)
      page.redirect_from = link.url if response.redirected?
      # Compare against the stored digest before overwriting it.
      page.revised_at = Time.now if page.digest != digest
      page.digest = digest

      if follow_hrefs_from?(page, link)
        hrefs = extract_hrefs(page, page.url)
        enqueue_hrefs(hrefs, link.depth + 1) unless hrefs.empty?
      end

      if allowed_page?(page)
        register_page(page)
      else
        page.filtered = true
        delete_page(page)
      end
    end

    # @return the detected MIME type, or nil when detection raised
    def detect_mime_type(page)
      @mime_type_detector.detect(page)
    rescue => e
      @logger.log :warn, "couldn't detect mime type for #{page.url}", error: e
      nil
    end

    # @return the detected charset for text pages; nil for other pages or
    #   when detection raised
    def detect_charset(page)
      if page.text?
        @charset_detector.detect(page)
      else
        nil
      end
    rescue => e
      @logger.log :warn, "couldn't detect charset for #{page.url}", error: e
      nil
    end

    # @return the parsed title for HTML pages, the URL's basename for other
    #   pages, or nil when parsing raised
    def parse_title(page)
      if page.html?
        @title_parser.parse(page)
      else
        Addressable::URI.parse(page.url).basename
      end
    rescue => e
      @logger.log :warn, "couldn't parse title for #{page.url}", error: e
      nil
    end

    # Links are followed only from HTML/XML pages that are above max_depth
    # (no limit when max_depth is nil).
    def follow_hrefs_from?(page, link)
      (page.html? || page.xml?) && (@config.max_depth.nil? || link.depth < @config.max_depth.to_i)
    end

    # Extracts the links of a page and keeps those allowed by the URL filter
    # and, when respect_robots_txt is set, by robots.txt.
    #
    # @return [Array<Hash>] the links that passed; [] when extraction raised
    def extract_hrefs(page, base_url)
      hrefs = @url_extractor.extract(page, base_url)
      passed, dropped = @url_filter.filter(hrefs, base_url)

      if @config.respect_robots_txt
        passed, dropped_by_robots = passed.partition { |href| @robots.allowed?(href[:url]) }
        dropped += dropped_by_robots
      end

      if @config.log_level == :debug
        passed.each { |href| @logger.log :debug, "url passed: #{href[:url]}" }
        dropped.each { |href| @logger.log :debug, "url dropped: #{href[:url]}" }
      end

      passed
    rescue => e
      @logger.log :warn, "couldn't extract links from #{page.url}", error: e
      []
    end

    # A page is kept when the page filter allows it and, if it was reached
    # through a redirect, the URL filter allows its final URL relative to the
    # URL it was redirected from.
    def allowed_page?(page)
      if @page_filter.allowed?(page) &&
         (!page.redirect_from || @url_filter.allowed?(page.url, page.redirect_from))
        @logger.log :info, "page passed: #{page.url}"
        true
      else
        @logger.log :info, "page dropped: #{page.url}"
        false
      end
    end

    # Stores the page, wrapped in the before/after_register callbacks.
    def register_page(page)
      @callback.around(:register, page) do
        @repository.register(page)
      end
    end

    # Removes the page, wrapped in the before/after_delete callbacks.
    def delete_page(page)
      @callback.around(:delete, page) do
        @repository.delete(page)
      end
    end

    # Turns link hashes into adapter Link records at the given depth and
    # enqueues them, wrapped in the before/after_enqueue callbacks.
    def enqueue_hrefs(hrefs, depth)
      links = hrefs.map do |href|
        Kudzu.adapter::Link.new(uuid: @uuid,
                                url: href[:url],
                                title: href[:title],
                                state: 0,
                                depth: depth)
      end
      @callback.around(:enqueue, links) do
        @frontier.enqueue(links, depth: depth)
      end
    end
  end
end