kudzu 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +1 -1
- data/lib/kudzu/adapter/memory/model/link.rb +2 -6
- data/lib/kudzu/adapter/memory/model/page.rb +3 -8
- data/lib/kudzu/adapter/memory/repository.rb +0 -2
- data/lib/kudzu/adapter/memory.rb +3 -4
- data/lib/kudzu/agent/all.rb +1 -1
- data/lib/kudzu/agent/fetcher.rb +46 -49
- data/lib/kudzu/agent/http/connection.rb +9 -0
- data/lib/kudzu/agent/http/connection_pool.rb +50 -0
- data/lib/kudzu/agent/page_filterer.rb +58 -0
- data/lib/kudzu/agent/reference.rb +9 -0
- data/lib/kudzu/agent/response.rb +14 -0
- data/lib/kudzu/agent/robots/parser.rb +91 -0
- data/lib/kudzu/agent/robots/txt.rb +34 -0
- data/lib/kudzu/agent/robots.rb +12 -123
- data/lib/kudzu/agent/sleeper.rb +2 -2
- data/lib/kudzu/agent/url_extractor.rb +60 -46
- data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
- data/lib/kudzu/agent/util/charset_detector.rb +84 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
- data/lib/kudzu/agent/util/matcher.rb +25 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
- data/lib/kudzu/agent/util/title_parser.rb +30 -0
- data/lib/kudzu/agent.rb +42 -0
- data/lib/kudzu/callback.rb +4 -2
- data/lib/kudzu/config/filter.rb +11 -11
- data/lib/kudzu/config.rb +20 -25
- data/lib/kudzu/crawler.rb +65 -146
- data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
- data/lib/kudzu/model/base.rb +9 -0
- data/lib/kudzu/model/link.rb +9 -0
- data/lib/kudzu/model/page.rb +112 -0
- data/lib/kudzu/thread_pool.rb +36 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +21 -3
- metadata +21 -19
- data/lib/kudzu/adapter/base/link.rb +0 -8
- data/lib/kudzu/adapter/base/page.rb +0 -106
- data/lib/kudzu/adapter/memory/all.rb +0 -3
- data/lib/kudzu/agent/charset_detector.rb +0 -84
- data/lib/kudzu/agent/filter.rb +0 -40
- data/lib/kudzu/agent/mime_type_detector.rb +0 -34
- data/lib/kudzu/agent/title_parser.rb +0 -16
- data/lib/kudzu/logger.rb +0 -20
- data/lib/kudzu/revisit/all.rb +0 -3
- data/lib/kudzu/revisit/scheduler.rb +0 -28
- data/lib/kudzu/util/all.rb +0 -3
- data/lib/kudzu/util/connection_pool.rb +0 -56
- data/lib/kudzu/util/content_type_parser.rb +0 -24
- data/lib/kudzu/util/matcher.rb +0 -21
- data/lib/kudzu/util/thread_pool.rb +0 -38
@@ -1,84 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'charlock_holmes'
|
3
|
-
|
4
|
-
module Kudzu
  class Agent
    # Detects the character encoding of a fetched page, first from the
    # declarations inside the document (HTML <meta> tags or the XML
    # declaration) and then by inspecting the raw text.
    class CharsetDetector
      # Non-standard charset spellings mapped to the names that are
      # subsequently validated with Encoding.find.
      CORRECTION = {
        'utf_8' => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis' => 'shift_jis',
        'euc_jp' => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#html?, #xml?, #text?, #body] the fetched page
      # @return [String, nil] lower-case charset name, or nil when the page is
      #   neither HTML, XML nor text, or when nothing could be detected
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Looks for <meta charset="..."> first, then for
      # <meta http-equiv="content-type" content="...; charset=...">.
      def from_html(body)
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        if (node = doc.xpath('//meta/@charset').first)
          charset = correct(node.to_s)
          return charset if charset
        end

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'] =~ /content-type/i

          # Validate the charset parsed from this tag's content attribute.
          # (Previously the unrelated +node+ from the lookup above was passed
          # here, so the http-equiv declaration was never honoured.)
          declared = @parser.parse(meta[:content].to_s)[1][:charset]
          charset = correct(declared)
          return charset if charset
        end

        nil
      end

      # Uses the encoding reported by the parsed XML document, if any.
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        correct(doc.encoding)
      end

      # Falls back to byte-level detection for text that is not pure ASCII.
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        return nil unless detection && detection.key?(:encoding)

        detection[:encoding].downcase
      end

      # Normalises a charset name and checks that Ruby knows it.
      #
      # @param charset [String, nil]
      # @return [String, nil] the normalised name, or nil when it is missing
      #   or not recognised by Encoding.find
      def correct(charset)
        return nil if charset.nil?

        name = charset.to_s.downcase
        name = CORRECTION[name] if CORRECTION.key?(name)

        begin
          Encoding.find(name)
        rescue ArgumentError
          # Encoding.find raises ArgumentError for unknown encoding names.
          name = nil
        end
        name
      end
    end
  end
end
|
data/lib/kudzu/agent/filter.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
|
-
module Kudzu
  class Agent
    # Decides whether a fetched page passes the filter configured for its URL
    # (MIME type, maximum size and the robots "noindex" meta tag).
    class Filter
      def initialize(config)
        @config = config
        @matcher = Kudzu::Util::Matcher.new
      end

      # @param page [#url, #mime_type, #size, #html?, #body]
      # @return [Boolean] true when no filter is configured for the page's URL
      #   or when every check passes
      def allowed?(page)
        filter = @config.find_filter(page.url)
        return true unless filter

        allowed_mime_type?(page.mime_type, filter) &&
          allowed_size?(page.size, filter) &&
          allowed_index?(page)
      end

      private

      # A page without a known MIME type is not rejected.
      def allowed_mime_type?(mime_type, filter)
        return true if mime_type.nil?

        @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
      end

      # The size must be strictly below the limit; a missing size or limit
      # disables the check.
      def allowed_size?(size, filter)
        return true if filter.max_size.nil? || size.nil?

        size.to_i < filter.max_size.to_i
      end

      # Rejects HTML pages carrying <meta name="robots" content="...noindex...">
      # when the configuration asks to respect it.
      def allowed_index?(page)
        return true unless page.html?
        return true unless @config.respect_noindex

        doc = Nokogiri::HTML(page.body.encode('ascii', undef: :replace, invalid: :replace))
        doc.xpath('html/head/meta[@name]').none? do |meta|
          # \A...\z anchors the whole attribute value; ^...$ would also match a
          # single line of a multi-line value.
          meta[:name].to_s =~ /\Arobots\z/i && meta[:content].to_s =~ /noindex/i
        end
      end
    end
  end
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'shared-mime-info'
|
2
|
-
|
3
|
-
module Kudzu
  class Agent
    # Determines a page's MIME type from, in order: the Content-Type response
    # header, the body's magic bytes, and the file name in the URL.
    class MimeTypeDetector
      # Returned when none of the detection strategies yields a type.
      DEFAULT_MIME_TYPE = 'application/octet-stream'
      # Original, misspelled constant name; kept so existing references
      # continue to resolve.
      DEFALUT_MIME_TYPE = DEFAULT_MIME_TYPE

      def initialize
        @content_type_parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#response_header, #body, #url]
      # @return [String] the detected MIME type, or DEFAULT_MIME_TYPE
      def detect(page)
        from_header(page.response_header) ||
          from_body(page.body) ||
          from_url(page.url) ||
          DEFAULT_MIME_TYPE
      end

      private

      # Reads the type from the 'content-type' header entry.
      def from_header(header)
        return nil unless header

        mime = @content_type_parser.parse(header['content-type']).first
        # An empty type (e.g. a header of "; charset=x") is truthy in Ruby and
        # would otherwise stop detect from trying the body and the URL.
        mime unless mime.nil? || mime.empty?
      end

      # Matches the body against the shared-mime-info magic database.
      def from_body(body)
        return nil unless body

        mime = MIME.check_magics(StringIO.new(body))
        mime.to_s if mime
      end

      # Matches the URL's base name against the shared-mime-info glob database.
      def from_url(url)
        uri = Addressable::URI.parse(url)
        mime = MIME.check_globs(uri.basename)
        mime.to_s if mime
      end
    end
  end
end
|
data/lib/kudzu/logger.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
module Kudzu
  # Thin wrapper around the standard ::Logger that becomes a no-op when no
  # log destination is configured.
  class Logger
    # @param file [::Logger, String, IO, nil] an existing logger (used as is,
    #   its level untouched), a destination handed to ::Logger.new, or
    #   nil/false to disable logging
    # @param level [Integer] severity threshold applied only to a logger that
    #   is created here
    def initialize(file, level)
      if file.is_a?(::Logger)
        @logger = file
      elsif file
        @logger = ::Logger.new(file)
        @logger.level = level
      else
        @logger = nil
      end
    end

    # Writes +message+ at the given severity, appending the error's class,
    # message and backtrace when an error is supplied.
    #
    # @param level [Symbol, String] name of the ::Logger method to call
    #   (e.g. :info, :error)
    # @param message [String]
    # @param error [Exception, nil]
    def log(level, message, error: nil)
      return unless @logger

      if error
        # Exception#backtrace is nil for an exception that was never raised.
        backtrace = Array(error.backtrace).join("\n")
        message = "#{message} #{error.class} #{error.message} #{backtrace}"
      end
      @logger.public_send(level, message)
    end
  end
end
|
data/lib/kudzu/revisit/all.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
module Kudzu
  class Revisit
    # Computes when a page should be fetched again: the interval (in days)
    # shrinks by one when the page was modified and grows by one when it was
    # not, bounded by the configured minimum and maximum.
    class Scheduler
      # Intervals are expressed in days; revisit_at is fetched_at plus seconds.
      SECONDS_PER_DAY = 86_400

      def initialize(config)
        @config = config
      end

      # Updates +page.revisit_interval+ and +page.revisit_at+ in place.
      #
      # @param page [#revisit_interval, #revisit_interval=, #revisit_at=, #fetched_at]
      # @param modified [Boolean] whether the page changed since the last fetch
      def schedule(page, modified: true)
        page.revisit_interval = next_interval(page.revisit_interval, modified)
        page.revisit_at = page.fetched_at + page.revisit_interval * SECONDS_PER_DAY
      end

      private

      # A page with no interval yet receives the configured default.
      def next_interval(curr_interval, modified)
        return @config.revisit_default_interval unless curr_interval

        if modified
          [curr_interval - 1, @config.revisit_min_interval].max
        else
          [curr_interval + 1, @config.revisit_max_interval].min
        end
      end
    end
  end
end
|
data/lib/kudzu/util/all.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
module Kudzu
  class Util
    # Keeps a bounded set of named HTTP connections. The set is stored under
    # Thread.current[:kudzu_connection]; when it grows beyond +max_size+ the
    # least recently used connections are finished and dropped.
    class ConnectionPool
      # One pooled connection and the time it was last checked out.
      class Connection
        attr_accessor :name, :http, :last_used

        # @param attr [Hash] attribute name => value, assigned via the writers
        def initialize(attr = {})
          attr.each { |key, value| public_send("#{key}=", value) }
        end
      end

      # @param max_size [Integer] number of connections kept before eviction
      def initialize(max_size = 10)
        @max_size = max_size
      end

      # Returns the connection registered under +name+, creating it from the
      # block's return value when there is none yet.
      #
      # @param name [Object] pool key
      # @yieldreturn [#started?, #finish] a new HTTP object; the block is only
      #   called when +name+ is not pooled
      # @return the pooled HTTP object
      def checkout(name)
        conn = (pool[name] ||= Connection.new(name: name, http: yield))
        conn.last_used = Time.now

        reduce if pool.size > @max_size

        conn.http
      end

      # Finishes every pooled connection and discards the pool.
      def close
        pool.each_value { |conn| finish_http(conn.http) }
        Thread.current[:kudzu_connection] = nil
      end

      private

      def pool
        Thread.current[:kudzu_connection] ||= {}
      end

      # Evicts the least recently used connections until the pool fits.
      def reduce
        overflow = pool.size - @max_size
        pool.values.sort_by(&:last_used).first(overflow).each do |conn|
          finish_http(conn.http)
          pool.delete(conn.name)
        end
      end

      # Only a started connection is finished.
      def finish_http(http)
        http.finish if http && http.started?
      end
    end
  end
end
|
@@ -1,24 +0,0 @@
|
|
1
|
-
module Kudzu
  class Util
    # Splits a Content-Type style value into its media type and parameters.
    class ContentTypeParser
      # Everything is stripped and lower-cased before being returned.
      #
      # @param content_type [String, nil] e.g. 'text/html; charset="UTF-8"'
      # @return [Array(String, Hash)] the media type (nil for blank input) and
      #   a hash of parameters keyed by symbol
      def parse(content_type)
        mime, *pairs = content_type.to_s.split(';').map { |part| part.strip.downcase }
        params = pairs.each_with_object({}) do |pair, hash|
          # Split on the first '=' only, so values that themselves contain '='
          # (e.g. boundary="a=b") are kept whole.
          key, value = pair.split('=', 2).map(&:strip)
          hash[key.to_sym] = unquote(value) if key && value
        end
        [mime, params]
      end

      private

      # Removes surrounding double quotes and resolves backslash escapes.
      def unquote(str)
        quoted = str.match(/\A"(.*?)"\z/)
        return str unless quoted

        quoted[1].gsub(/\\(.)/, '\1')
      end
    end
  end
end
|
data/lib/kudzu/util/matcher.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
module Kudzu
  class Util
    # Tests a text against allow and deny pattern lists, delegating each
    # single comparison to Kudzu::Common.match?.
    class Matcher
      # @param text [String]
      # @param allows [Object, Array, nil] pattern(s) of which at least one
      #   must match; an empty or nil list allows everything
      # @param denies [Object, Array, nil] pattern(s) of which none may match
      # @return [Boolean]
      def match?(text, allows: nil, denies: nil)
        match_to_allows?(text, allows) && !match_to_denies?(text, denies)
      end

      private

      def match_to_allows?(text, allows)
        patterns = Array(allows)
        patterns.empty? || matches_any?(text, patterns)
      end

      def match_to_denies?(text, denies)
        # any? is already false for an empty list, so no emptiness check is
        # needed here.
        matches_any?(text, Array(denies))
      end

      def matches_any?(text, patterns)
        patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end
    end
  end
end
|
@@ -1,38 +0,0 @@
|
|
1
|
-
module Kudzu
  class Util
    # Runs a block repeatedly on a fixed number of threads that share one
    # Queue.
    class ThreadPool
      # @param size [Integer] number of threads created by #start
      def initialize(size)
        @size = size
        @queue = Queue.new
        @threads = []
      end

      # Starts the threads. Each one calls the block with the shared queue over
      # and over until the block returns :end.
      #
      # @yieldparam queue [Queue] the pool's shared queue
      # @return [Array<Thread>] the created threads
      def start(&block)
        @threads = (1..@size).map { create_thread(&block) }
      end

      # Blocks until every thread that is still alive is waiting on the queue.
      #
      # @param interval [Numeric] seconds to sleep between checks
      def wait(interval: 1)
        until @queue.num_waiting == @threads.count(&:alive?)
          Thread.pass
          sleep interval
        end
      end

      # Kills all threads and forgets them.
      def shutdown
        @threads.each(&:kill)
        @threads = []
      end

      private

      def create_thread(&block)
        Thread.start do
          loop do
            break if block.call(@queue) == :end
          end
        end
      end
    end
  end
end
|