kudzu 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,65 @@
1
module Kudzu
  class Agent
    # Decides which extracted links may be followed, using the filter that the
    # configuration associates with the page the links were found on.
    class UrlFilter
      # @param config [#find_filter] configuration queried for the filter of a base URI
      def initialize(config)
        @config = config
        @matcher = Kudzu::Util::Matcher.new
      end

      # Splits +hrefs+ into the entries that pass the filter and those that do not.
      #
      # @param hrefs [Enumerable<Hash>] entries whose +:url+ value is checked
      # @param base_url [Object] URL of the originating page, in any form
      #   accepted by +Addressable::URI.parse+
      # @return [Array(Array, Array)] +[passed, dropped]+
      def filter(hrefs, base_url)
        base_uri = Addressable::URI.parse(base_url)
        url_filter = @config.find_filter(base_uri)

        hrefs.partition do |href|
          allowed?(href[:url], base_uri, filter: url_filter)
        end
      end

      # Tells whether +url+ may be followed from +base_uri+.
      #
      # @param url [Object] candidate URL, parsed with +Addressable::URI.parse+
      # @param base_uri [String, Object] originating URI; a String is parsed first
      # @param filter [Object, nil] filter to apply; looked up from the
      #   configuration when nil
      # @return [Object] +true+ when no filter applies, otherwise the (truthy or
      #   falsy) outcome of the chained checks
      def allowed?(url, base_uri, filter: nil)
        uri = Addressable::URI.parse(url)
        base_uri = Addressable::URI.parse(base_uri) if base_uri.is_a?(String)
        filter ||= @config.find_filter(base_uri)
        return true unless filter

        focused_host?(uri, base_uri, filter) &&
          focused_descendants?(uri, base_uri, filter) &&
          allowed_url?(uri, filter) &&
          allowed_host?(uri, filter) &&
          allowed_path?(uri, filter) &&
          allowed_ext?(uri, filter)
      end

      private

      # With +focus_host+ set, only links on the base page's host pass.
      def focused_host?(uri, base_uri, filter)
        return true unless filter.focus_host

        uri.host == base_uri.host
      end

      # With +focus_descendants+ set, only links on the same host whose
      # directory starts with the base page's directory pass (case-insensitive).
      def focused_descendants?(uri, base_uri, filter)
        return true unless filter.focus_descendants
        return false unless uri.host == base_uri.host

        dir = Kudzu::Common.path_to_dir(uri.path)
        base_dir = Kudzu::Common.path_to_dir(base_uri.path)
        # \A anchors at the start of the string; ^ would also match right after
        # any newline embedded in the path.
        !(dir =~ /\A#{Regexp.escape(base_dir)}/i).nil?
      end

      def allowed_url?(uri, filter)
        @matcher.match?(uri.to_s, allows: filter.allow_url, denies: filter.deny_url)
      end

      def allowed_host?(uri, filter)
        @matcher.match?(uri.host, allows: filter.allow_host, denies: filter.deny_host)
      end

      def allowed_path?(uri, filter)
        @matcher.match?(uri.path, allows: filter.allow_path, denies: filter.deny_path)
      end

      # URLs without an extension always pass; otherwise the extension (without
      # its leading dot) is matched against the allow/deny lists.
      def allowed_ext?(uri, filter)
        ext = uri.extname.to_s.sub(/\A\./, '')
        return true if ext.empty?

        @matcher.match?(ext, allows: filter.allow_ext, denies: filter.deny_ext)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module Kudzu
  # Registry of user-supplied hooks. The block given to +new+ is evaluated
  # against the instance, so hooks are declared by calling the methods named in
  # CALLBACKS with a block, e.g. <tt>on_success { |page, link| ... }</tt>.
  class Callback
    CALLBACKS = [:on_success,      # 2xx
                 :on_redirection,  # 3xx
                 :on_client_error, # 4xx
                 :on_server_error, # 5xx
                 :on_filter,       # 2xx, filtered
                 :on_failure,      # Exception
                 :before_register,
                 :after_register,
                 :before_delete,
                 :after_delete,
                 :before_enqueue,
                 :after_enqueue].freeze

    # @yield optional declaration block, evaluated with +self+ set to this instance
    def initialize(&block)
      @callback = {}
      instance_eval(&block) if block
    end

    # One registration method per callback name; each stores the given block
    # under that name, replacing any earlier registration.
    CALLBACKS.each do |key|
      define_method(key) do |&block|
        @callback[key] = block
      end
    end

    # Invokes the +on_<name>+ hook with +args+ when one is registered.
    #
    # @param name [Symbol, String] hook name without the +on_+ prefix
    # @return [Object, nil] the hook's return value, or nil when nothing ran
    def on(name, *args)
      handler = @callback["on_#{name}".to_sym]
      # A hook declared without a block is stored as nil; treat it as absent
      # instead of failing with NoMethodError at dispatch time.
      handler.call(*args) if handler
    end

    # Runs the +before_<name>+ hook, yields, then runs the +after_<name>+ hook.
    # The after hook is skipped when the yielded block raises.
    #
    # @param name [Symbol, String] hook name without the before_/after_ prefix
    # @return [Object, nil] the after hook's return value, or nil when none ran
    def around(name, *args)
      before_handler = @callback["before_#{name}".to_sym]
      before_handler.call(*args) if before_handler
      yield
      after_handler = @callback["after_#{name}".to_sym]
      after_handler.call(*args) if after_handler
    end
  end
end
@@ -0,0 +1,23 @@
1
module Kudzu
  # Small stateless helpers shared across the crawler.
  class Common
    class << self
      # Tests +text+ against a glob or a regular expression.
      #
      # @param text [String] value to test
      # @param pattern [String, Regexp, Object] a String is treated as a
      #   File.fnmatch glob, a Regexp is matched directly, anything else never matches
      # @return [Boolean]
      def match?(text, pattern)
        case pattern
        when String then File.fnmatch(pattern, text)
        # Normalize the match index / nil to a real boolean for a predicate.
        when Regexp then !(pattern =~ text).nil?
        else false
        end
      end

      # Returns the directory part of a URL path, always ending in '/'.
      #
      #   path_to_dir('/a/b/c.html') # => "/a/b/"
      #   path_to_dir('/a/b/')       # => "/a/b/"
      #   path_to_dir('/index.html') # => "/"
      #   path_to_dir('')            # => "/"
      #
      # @param path [String, nil] URL path
      # @return [String]
      def path_to_dir(path)
        path = path.to_s
        # An empty path denotes the site root; File.dirname('') would give ".".
        return '/' if path.empty?
        return path if path.end_with?('/')

        dir = File.dirname(path)
        # File.dirname('/index.html') is already "/"; appending another slash
        # produced "//", which no real directory starts with.
        dir.end_with?('/') ? dir : "#{dir}/"
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'ostruct'
2
+ require_relative 'config/filter'
3
+
4
module Kudzu
  # Crawler-wide settings plus the filters registered per host pattern.
  # Settings come from, in order: DEFAULT_CONFIG, the hash passed to +new+,
  # the optional config file, and the optional block.
  class Config
    SIMPLE_CONFIGS = [:config_file,
                      :user_agent, :thread_num, :open_timeout, :read_timeout,
                      :max_connection, :max_redirect, :max_depth, :default_request_header,
                      :politeness_delay, :handle_cookie,
                      :respect_robots_txt, :respect_nofollow, :respect_noindex,
                      :log_file, :log_level,
                      :revisit_mode, :revisit_min_interval, :revisit_max_interval, :revisit_default_interval,
                      :filters].freeze
    DEFAULT_CONFIG = { user_agent: "Kudzu/#{Kudzu::VERSION}",
                       open_timeout: 10,
                       read_timeout: 10,
                       thread_num: 1,
                       max_connection: 10,
                       max_redirect: 3,
                       politeness_delay: 0.5,
                       handle_cookie: true,
                       respect_robots_txt: true,
                       respect_nofollow: true,
                       respect_noindex: true,
                       revisit_mode: false,
                       revisit_min_interval: 1,
                       revisit_max_interval: 10,
                       revisit_default_interval: 5 }

    attr_accessor(*SIMPLE_CONFIGS)

    # Receiver for the configuration DSL: every setting name becomes a method
    # taking the value (<tt>user_agent 'bot'</tt>) that is forwarded to the
    # wrapped Config's setter.
    class Delegator
      def initialize(config)
        @config = config
      end

      Kudzu::Config::SIMPLE_CONFIGS.each do |key|
        define_method(key) do |value|
          @config.send("#{key}=", value)
        end
      end

      def add_filter(base_url = nil, config = {}, &block)
        @config.add_filter(base_url, config, &block)
      end
    end

    # @param config [Hash] setting overrides; every key must have a setter on
    #   this class (an unknown key raises NoMethodError)
    # @yield optional DSL block, evaluated against a Delegator
    def initialize(config = {}, &block)
      self.filters = {}
      DEFAULT_CONFIG.merge(config).each do |key, value|
        send("#{key}=", value)
      end
      return unless config_file || block

      delegator = Kudzu::Config::Delegator.new(self)
      # SECURITY: the config file is executed as Ruby code; only load trusted
      # files. Passing the path makes errors point at the file, not "(eval)".
      delegator.instance_eval(File.read(config_file), config_file.to_s) if config_file
      delegator.instance_eval(&block) if block
    end

    # Registers a filter for the host and path taken from +base_url+; a missing
    # or blank host/path becomes the wildcard '*'.
    #
    # @param base_url [String, nil] URL pattern; nil registers under '*'
    # @param config [Hash] settings passed to Filter.new
    # @return [Array<Filter>] the filters now registered for that host
    def add_filter(base_url = nil, config = {}, &block)
      base_uri = Addressable::URI.parse(base_url || '*')
      host = wildcard_if_blank(base_uri.host)
      path = wildcard_if_blank(base_uri.path)
      (filters[host] ||= []) << Filter.new(path, config, &block)
    end

    # Returns the first registered filter whose host pattern and path pattern
    # both match +uri+, in registration order.
    #
    # @param uri [String, Object] URI to look up; a String is parsed first
    # @return [Filter, nil]
    def find_filter(uri)
      uri = Addressable::URI.parse(uri) if uri.is_a?(String)
      filters.each do |host, host_filters|
        next unless Kudzu::Common.match?(uri.host, host)

        found = host_filters.find { |filter| Kudzu::Common.match?(uri.path, filter.path) }
        return found if found
      end
      nil
    end

    private

    # Plain-Ruby stand-in for ActiveSupport's +presence+, so this class does not
    # depend on ActiveSupport having been loaded elsewhere.
    def wildcard_if_blank(value)
      value.nil? || value.to_s.strip.empty? ? '*' : value
    end
  end
end
@@ -0,0 +1,39 @@
1
module Kudzu
  class Config
    # One set of filter rules attached to a path pattern. Rules can be given as
    # a hash, through a DSL block, or both (the block is applied last).
    class Filter
      SIMPLE_CONFIGS = [# url filter
                        :focus_host, :focus_descendants, :allow_element, :deny_element,
                        :allow_url, :deny_url, :allow_host, :deny_host, :allow_path, :deny_path,
                        :allow_ext, :deny_ext,
                        # page filter
                        :allow_mime_type, :deny_mime_type, :max_size].freeze
      DEFAULT_CONFIG = { focus_host: false,
                         focus_descendants: false }

      attr_accessor :path
      attr_accessor(*SIMPLE_CONFIGS)

      # Receiver for the filter DSL: every rule name becomes a method taking
      # the value (<tt>deny_ext %w[pdf]</tt>) that is forwarded to the wrapped
      # Filter's setter.
      class Delegator
        def initialize(filter)
          @filter = filter
        end

        Kudzu::Config::Filter::SIMPLE_CONFIGS.each do |key|
          define_method(key) do |value|
            @filter.send("#{key}=", value)
          end
        end
      end

      # @param path [String] path pattern this filter applies to
      # @param config [Hash] rule overrides; every key must have a setter on
      #   this class (an unknown key raises NoMethodError)
      # @yield optional DSL block, evaluated against a Delegator
      def initialize(path, config = {}, &block)
        @path = path
        DEFAULT_CONFIG.merge(config).each do |key, value|
          send("#{key}=", value)
        end
        Kudzu::Config::Filter::Delegator.new(self).instance_eval(&block) if block
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
require 'digest'
require 'securerandom'
require 'time'

require 'addressable'
require 'nokogiri'

require_relative 'common'
require_relative 'config'
require_relative 'callback'
require_relative 'logger'
require_relative 'adapter/memory'
require_relative 'util/all'
require_relative 'agent/all'
require_relative 'revisit/all'
11
+
12
module Kudzu
  # Drives a crawl: takes links from the frontier, fetches them, records the
  # resulting pages in the repository, enqueues newly discovered links and
  # fires the user callbacks along the way.
  class Crawler
    attr_reader :uuid, :config
    attr_reader :frontier, :repository

    # @param options [Hash] configuration overrides; +:uuid+ identifies this
    #   crawl (generated when absent), every other key goes to Kudzu::Config
    # @yield optional configuration DSL block, forwarded to Kudzu::Config
    def initialize(options = {}, &block)
      @uuid = options[:uuid] || SecureRandom.uuid
      # :uuid belongs to the crawler, not to Config; forwarding it would make
      # Config call a non-existent `uuid=` setter.
      config_options = options.reject { |key, _value| key == :uuid }
      @config = Kudzu::Config.new(config_options, &block)
    end

    # Builds the logger, callbacks, storage adapters and agents for one crawl.
    #
    # @yield optional callback declaration block, forwarded to Kudzu::Callback
    def prepare(&block)
      @logger = Kudzu::Logger.new(@config.log_file, @config.log_level)
      @callback = Kudzu::Callback.new(&block)

      @frontier = Kudzu.adapter::Frontier.new(@uuid)
      @repository = Kudzu.adapter::Repository.new

      @robots = Kudzu::Agent::Robots.new(@config)
      @page_fetcher = Kudzu::Agent::Fetcher.new(@config, @robots)
      @page_filter = Kudzu::Agent::Filter.new(@config)
      @charset_detector = Kudzu::Agent::CharsetDetector.new
      @mime_type_detector = Kudzu::Agent::MimeTypeDetector.new
      @title_parser = Kudzu::Agent::TitleParser.new

      @url_extractor = Kudzu::Agent::UrlExtractor.new(@config)
      @url_filter = Kudzu::Agent::UrlFilter.new(@config)

      @revisit_scheduler = Kudzu::Revisit::Scheduler.new(@config)
    end

    # Crawls from the given seed URL(s) until the frontier is exhausted.
    #
    # @param seed_url [String, Array<String>] one or more start URLs
    # @yield optional callback declaration block (see #prepare)
    # @return [Object] whatever the frontier's +clear+ returns
    def run(seed_url, &block)
      prepare(&block)

      begin
        seeds = Array(seed_url).map { |url| { url: url } }
        enqueue_hrefs(seeds, 1)

        if @config.thread_num.to_i <= 1
          single_thread
        else
          multi_thread(@config.thread_num)
        end
      ensure
        # Release the fetcher's pool even when the crawl aborts with an error.
        @page_fetcher.pool.close
      end

      @frontier.clear
    end

    private

    # Visits links one at a time until the frontier yields nothing.
    def single_thread
      loop do
        link = @frontier.dequeue.first
        break unless link

        visit_link(link)
      end
    end

    # Visits links through a thread pool. Each run of the worker block tops the
    # shared queue up to +thread_num+ entries from the frontier, then takes one
    # link off the queue and visits it.
    def multi_thread(thread_num)
      @thread_pool = Kudzu::Util::ThreadPool.new(thread_num)

      @thread_pool.start do |queue|
        limit_num = [thread_num - queue.size, 0].max
        @frontier.dequeue(limit: limit_num).each do |link|
          queue.push(link)
        end
        link = queue.pop
        visit_link(link)
      end

      @thread_pool.wait
      @thread_pool.shutdown
    end

    # Fetches one link and updates, registers or deletes its page depending on
    # the response status, then fires the status callback.
    # NOTE(review): assumes the repository's find_by_url always returns a page
    # object, also for URLs it has not seen before — confirm against the adapter.
    def visit_link(link)
      page = @repository.find_by_url(link.url)
      response = fetch_link(link, build_request_header(page))
      return unless response

      # After a redirect the page record is the one for the final URL.
      page = @repository.find_by_url(response.url) if response.redirected?
      page.url = response.url
      page.status = response.status
      page.response_time = response.time
      page.fetched_at = Time.now

      if page.status_success?
        handle_success(page, link, response)
      elsif page.status_not_modified?
        @revisit_scheduler.schedule(page, modified: false)
        register_page(page)
      elsif page.status_not_found? || page.status_gone?
        delete_page(page)
      end

      run_callback(page, link)
    end

    # Fires the +on_*+ callback matching the page's status class; successful
    # pages rejected by the page filter fire +on_filter+ instead of +on_success+.
    def run_callback(page, link)
      if page.status_success?
        @callback.on(page.filtered ? :filter : :success, page, link)
      elsif page.status_redirection?
        @callback.on(:redirection, page, link)
      elsif page.status_client_error?
        @callback.on(:client_error, page, link)
      elsif page.status_server_error?
        @callback.on(:server_error, page, link)
      end
    end

    # Builds the request header for +page+: the configured defaults plus, in
    # revisit mode, the conditional headers derived from the stored page.
    #
    # @return [Hash] a fresh hash, safe for the caller to modify
    def build_request_header(page)
      # Hash#to_h returns the receiver itself, so copy before adding per-page
      # headers; otherwise they would leak into every later request.
      header = @config.default_request_header.to_h.dup
      if @config.revisit_mode
        header['If-Modified-Since'] = page.last_modified.httpdate if page.last_modified
        header['If-None-Match'] = page.etag if page.etag
      end
      header
    end

    # Fetches +link+; on failure logs a warning, fires +on_failure+ and
    # returns nil.
    def fetch_link(link, request_header)
      response = @page_fetcher.fetch(link.url, request_header: request_header)
      @logger.log :info, "page fetched: #{response.status} #{response.url}"
      response
    rescue StandardError => e
      # Only StandardError: Interrupt, SystemExit and the like must still be
      # able to stop the crawl.
      @logger.log :warn, "couldn't fetch page: #{link.url}", error: e
      @callback.on(:failure, link, e)
      nil
    end

    # Stores the response on +page+, schedules the next revisit, enqueues the
    # links found in the body and registers the page (or deletes it when the
    # page filter rejects it).
    def handle_success(page, link, response)
      digest = Digest::MD5.hexdigest(response.body)
      modified = page.digest != digest
      @revisit_scheduler.schedule(page, modified: modified)

      page.response_header = response.header
      page.body = response.body
      page.size = response.body.size
      page.mime_type = detect_mime_type(page)
      page.charset = detect_charset(page)
      page.title = parse_title(page)
      page.redirect_from = link.url if response.redirected?
      page.revised_at = Time.now if modified
      page.digest = digest

      if follow_hrefs_from?(page, link)
        hrefs = extract_hrefs(page, page.url)
        enqueue_hrefs(hrefs, link.depth + 1) unless hrefs.empty?
      end

      if allowed_page?(page)
        register_page(page)
      else
        page.filtered = true
        delete_page(page)
      end
    end

    # @return [Object, nil] the detected MIME type, or nil when detection raises
    def detect_mime_type(page)
      @mime_type_detector.detect(page)
    rescue => e
      @logger.log :warn, "couldn't detect mime type for #{page.url}", error: e
      nil
    end

    # @return [Object, nil] the detected charset for text pages; nil for
    #   non-text pages or when detection raises
    def detect_charset(page)
      page.text? ? @charset_detector.detect(page) : nil
    rescue => e
      @logger.log :warn, "couldn't detect charset for #{page.url}", error: e
      nil
    end

    # @return [Object, nil] the parsed title for HTML pages, the URL's basename
    #   for anything else, or nil when parsing raises
    def parse_title(page)
      if page.html?
        @title_parser.parse(page)
      else
        Addressable::URI.parse(page.url).basename
      end
    rescue => e
      @logger.log :warn, "couldn't parse title for #{page.url}", error: e
      nil
    end

    # Links are followed only from HTML/XML pages and only while the link's
    # depth is below +max_depth+ (no limit when +max_depth+ is nil).
    def follow_hrefs_from?(page, link)
      (page.html? || page.xml?) && (@config.max_depth.nil? || link.depth < @config.max_depth.to_i)
    end

    # Extracts the links of +page+ and keeps those allowed by the URL filter
    # and, when configured, by robots.txt.
    #
    # @return [Array] the links to follow; empty when extraction raises
    def extract_hrefs(page, base_url)
      hrefs = @url_extractor.extract(page, base_url)
      passed, dropped = @url_filter.filter(hrefs, base_url)

      if @config.respect_robots_txt
        passed, dropped_by_robots = passed.partition { |href| @robots.allowed?(href[:url]) }
        dropped += dropped_by_robots
      end

      if @config.log_level == :debug
        passed.each { |href| @logger.log :debug, "url passed: #{href[:url]}" }
        dropped.each { |href| @logger.log :debug, "url dropped: #{href[:url]}" }
      end

      passed
    rescue => e
      @logger.log :warn, "couldn't extract links from #{page.url}", error: e
      []
    end

    # A page is kept when the page filter accepts it and, if it was reached
    # through a redirect, the URL filter allows its final URL relative to the
    # URL it was redirected from. The decision is logged either way.
    def allowed_page?(page)
      if @page_filter.allowed?(page) &&
         (!page.redirect_from || @url_filter.allowed?(page.url, page.redirect_from))
        @logger.log :info, "page passed: #{page.url}"
        true
      else
        @logger.log :info, "page dropped: #{page.url}"
        false
      end
    end

    # Registers +page+ in the repository between the before/after_register hooks.
    def register_page(page)
      @callback.around(:register, page) do
        @repository.register(page)
      end
    end

    # Deletes +page+ from the repository between the before/after_delete hooks.
    def delete_page(page)
      @callback.around(:delete, page) do
        @repository.delete(page)
      end
    end

    # Wraps +hrefs+ in adapter Link records at +depth+ and puts them on the
    # frontier between the before/after_enqueue hooks.
    def enqueue_hrefs(hrefs, depth)
      links = hrefs.map do |href|
        Kudzu.adapter::Link.new(uuid: @uuid,
                                url: href[:url],
                                title: href[:title],
                                state: 0,
                                depth: depth)
      end
      @callback.around(:enqueue, links) do
        @frontier.enqueue(links, depth: depth)
      end
    end
  end
end