kudzu 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/lib/kudzu/adapter/memory/frontier.rb +1 -1
  3. data/lib/kudzu/adapter/memory/model/link.rb +2 -6
  4. data/lib/kudzu/adapter/memory/model/page.rb +3 -8
  5. data/lib/kudzu/adapter/memory/repository.rb +0 -2
  6. data/lib/kudzu/adapter/memory.rb +3 -4
  7. data/lib/kudzu/agent/all.rb +1 -1
  8. data/lib/kudzu/agent/fetcher.rb +46 -49
  9. data/lib/kudzu/agent/http/connection.rb +9 -0
  10. data/lib/kudzu/agent/http/connection_pool.rb +50 -0
  11. data/lib/kudzu/agent/page_filterer.rb +58 -0
  12. data/lib/kudzu/agent/reference.rb +9 -0
  13. data/lib/kudzu/agent/response.rb +14 -0
  14. data/lib/kudzu/agent/robots/parser.rb +91 -0
  15. data/lib/kudzu/agent/robots/txt.rb +34 -0
  16. data/lib/kudzu/agent/robots.rb +12 -123
  17. data/lib/kudzu/agent/sleeper.rb +2 -2
  18. data/lib/kudzu/agent/url_extractor.rb +60 -46
  19. data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
  20. data/lib/kudzu/agent/util/charset_detector.rb +84 -0
  21. data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
  22. data/lib/kudzu/agent/util/matcher.rb +25 -0
  23. data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
  24. data/lib/kudzu/agent/util/title_parser.rb +30 -0
  25. data/lib/kudzu/agent.rb +42 -0
  26. data/lib/kudzu/callback.rb +4 -2
  27. data/lib/kudzu/config/filter.rb +11 -11
  28. data/lib/kudzu/config.rb +20 -25
  29. data/lib/kudzu/crawler.rb +65 -146
  30. data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
  31. data/lib/kudzu/model/base.rb +9 -0
  32. data/lib/kudzu/model/link.rb +9 -0
  33. data/lib/kudzu/model/page.rb +112 -0
  34. data/lib/kudzu/thread_pool.rb +36 -0
  35. data/lib/kudzu/version.rb +1 -1
  36. data/lib/kudzu.rb +21 -3
  37. metadata +21 -19
  38. data/lib/kudzu/adapter/base/link.rb +0 -8
  39. data/lib/kudzu/adapter/base/page.rb +0 -106
  40. data/lib/kudzu/adapter/memory/all.rb +0 -3
  41. data/lib/kudzu/agent/charset_detector.rb +0 -84
  42. data/lib/kudzu/agent/filter.rb +0 -40
  43. data/lib/kudzu/agent/mime_type_detector.rb +0 -34
  44. data/lib/kudzu/agent/title_parser.rb +0 -16
  45. data/lib/kudzu/logger.rb +0 -20
  46. data/lib/kudzu/revisit/all.rb +0 -3
  47. data/lib/kudzu/revisit/scheduler.rb +0 -28
  48. data/lib/kudzu/util/all.rb +0 -3
  49. data/lib/kudzu/util/connection_pool.rb +0 -56
  50. data/lib/kudzu/util/content_type_parser.rb +0 -24
  51. data/lib/kudzu/util/matcher.rb +0 -21
  52. data/lib/kudzu/util/thread_pool.rb +0 -38
@@ -1,84 +0,0 @@
1
- require 'nokogiri'
2
- require 'charlock_holmes'
3
-
4
module Kudzu
  class Agent
    # Determines the character encoding name of a fetched page.
    #
    # HTML and XML pages are first checked for an in-document declaration;
    # when that yields nothing usable (and for plain text pages) the body is
    # handed to CharlockHolmes for detection.
    class CharsetDetector
      # Alternative spellings mapped to the name that is then validated with
      # Encoding.find.
      CORRECTION = {
        'utf_8' => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis' => 'shift_jis',
        'euc_jp' => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#html?, #xml?, #text?, #body] page to inspect
      # @return [String, nil] lower-case charset name, or nil when the page is
      #   neither HTML, XML nor text, or when nothing could be detected
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Looks for <meta charset="..."> first, then for
      # <meta http-equiv="content-type" content="...; charset=...">.
      def from_html(body)
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        if (node = doc.xpath('//meta/@charset').first)
          charset = correct(node.to_s)
          return charset if charset
        end

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'].to_s =~ /content-type/i

          # Validate the charset parsed out of the content attribute. The
          # previous code re-used `node` from the lookup above here, so the
          # http-equiv declaration was never actually honoured.
          declared = @parser.parse(meta[:content].to_s)[1][:charset]
          charset = correct(declared)
          return charset if charset
        end

        nil
      end

      # Uses the encoding Nokogiri reports for the XML document, if any.
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        doc.encoding ? correct(doc.encoding) : nil
      end

      # Returns 'ascii' for pure-ASCII text, otherwise whatever CharlockHolmes
      # detects (lower-cased), or nil when detection yields no encoding.
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        detection[:encoding].downcase if detection && detection.key?(:encoding)
      end

      # Normalises a charset name and checks that Ruby knows it.
      #
      # @param charset [String, nil]
      # @return [String, nil] the normalised name, or nil when the name is
      #   missing or Encoding.find rejects it
      def correct(charset)
        name = charset.to_s.downcase
        name = CORRECTION.fetch(name, name)
        Encoding.find(name) # raises for unknown (including empty) names
        name
      rescue StandardError
        nil
      end
    end
  end
end
@@ -1,40 +0,0 @@
1
- require 'nokogiri'
2
-
3
module Kudzu
  class Agent
    # Decides whether a fetched page passes the filter configured for its URL.
    class Filter
      def initialize(config)
        @config = config
        @matcher = Kudzu::Util::Matcher.new
      end

      # A page with no matching filter is always allowed; otherwise its MIME
      # type, size and (for HTML) robots noindex meta tag are all checked.
      def allowed?(page)
        filter = @config.find_filter(page.url)
        return true unless filter

        allowed_mime_type?(page.mime_type, filter) &&
          allowed_size?(page.size, filter) &&
          allowed_index?(page)
      end

      private

      # An unknown MIME type is let through; a known one is matched against
      # the filter's allow/deny lists.
      def allowed_mime_type?(mime_type, filter)
        if mime_type.nil?
          true
        else
          @matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
        end
      end

      # No limit or unknown size means allowed. The comparison is strict, so a
      # page of exactly max_size is rejected.
      def allowed_size?(size, filter)
        unlimited = filter.max_size.nil? || size.nil?
        return true if unlimited

        size.to_i < filter.max_size.to_i
      end

      # Only HTML pages are inspected, and only when respect_noindex is set:
      # the page is rejected if any head meta tag named "robots" has a content
      # attribute mentioning "noindex".
      def allowed_index?(page)
        return true unless page.html? && @config.respect_noindex

        ascii_body = page.body.encode('ascii', undef: :replace, invalid: :replace)
        metas = Nokogiri::HTML(ascii_body).xpath('html/head/meta[@name]')
        metas.none? { |meta| meta[:name] =~ /^robots$/i && meta[:content] =~ /noindex/i }
      end
    end
  end
end
@@ -1,34 +0,0 @@
1
- require 'shared-mime-info'
2
-
3
module Kudzu
  class Agent
    # Determines a page's MIME type from, in order of preference, the
    # Content-Type response header, the body's magic bytes and the URL's file
    # name, falling back to DEFAULT_MIME_TYPE.
    class MimeTypeDetector
      DEFAULT_MIME_TYPE = 'application/octet-stream'
      # Original, misspelled constant name; kept so existing references to it
      # keep resolving.
      DEFALUT_MIME_TYPE = DEFAULT_MIME_TYPE

      def initialize
        @content_type_parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#response_header, #body, #url]
      # @return [String] the detected MIME type, never nil
      def detect(page)
        from_header(page.response_header) ||
          from_body(page.body) ||
          from_url(page.url) ||
          DEFAULT_MIME_TYPE
      end

      private

      # MIME part of the Content-Type header; nil when there is no header
      # object or the parser finds no MIME type in it.
      def from_header(header)
        return nil if header.nil?

        @content_type_parser.parse(header['content-type']).first
      end

      # MIME type recognised from the body's magic bytes; nil when there is no
      # body to inspect or nothing matches.
      def from_body(body)
        return nil if body.nil?

        mime = MIME.check_magics(StringIO.new(body))
        mime.to_s if mime
      end

      # MIME type recognised from the URL's base name; nil when there is no
      # URL or no glob pattern matches.
      def from_url(url)
        return nil if url.nil?

        uri = Addressable::URI.parse(url)
        return nil unless uri

        mime = MIME.check_globs(uri.basename)
        mime.to_s if mime
      end
    end
  end
end
@@ -1,16 +0,0 @@
1
- require 'nokogiri'
2
-
3
module Kudzu
  class Agent
    # Extracts the <title> text from a page's decoded HTML body.
    class TitleParser
      # @param page [#decoded_body]
      # @return [String] text of the first //head/title node, or '' when the
      #   document has none
      def parse(page)
        title_node = Nokogiri::HTML(page.decoded_body).xpath('//head/title').first
        return '' unless title_node

        title_node.inner_text.to_s
      end
    end
  end
end
data/lib/kudzu/logger.rb DELETED
@@ -1,20 +0,0 @@
1
module Kudzu
  # Thin wrapper around ::Logger. When built without a log target every call
  # to #log is a no-op.
  class Logger
    # @param file [::Logger, String, IO, nil] an existing logger (used as is,
    #   its level is left untouched), a target handed to ::Logger.new, or nil
    #   to disable logging
    # @param level [Integer] level applied only to a logger created here
    def initialize(file, level)
      if file.is_a?(::Logger)
        @logger = file
      elsif file
        @logger = ::Logger.new(file)
        @logger.level = level
      else
        @logger = nil
      end
    end

    # Writes +message+ at +level+, appending the error's class, message and
    # backtrace when +error+ is given.
    #
    # @param level [Symbol] name of the ::Logger method to call (:info, ...)
    # @param message [String]
    # @param error [Exception, nil]
    def log(level, message, error: nil)
      return unless @logger

      if error
        # backtrace is nil for an exception that was never raised; Array()
        # keeps the logging call itself from failing in that case.
        trace = Array(error.backtrace).join("\n")
        message = "#{message} #{error.class} #{error.message} #{trace}"
      end
      @logger.public_send(level, message)
    end
  end
end
@@ -1,3 +0,0 @@
1
# Require every Ruby file that sits in this directory. The list is sorted so
# the load order does not depend on the order the filesystem returns entries
# in (Dir[] only sorts by itself on newer Ruby versions).
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -1,28 +0,0 @@
1
module Kudzu
  class Revisit
    # Adapts how often a page is revisited: a page that changed is visited
    # sooner next time, an unchanged one later, within the configured bounds.
    class Scheduler
      def initialize(config)
        @config = config
      end

      # Updates page.revisit_interval (in days) and sets page.revisit_at to
      # page.fetched_at plus that many days.
      #
      # @param page [#revisit_interval, #revisit_interval=, #revisit_at=, #fetched_at]
      # @param modified [Boolean] whether the page changed since the last fetch
      def schedule(page, modified: true)
        page.revisit_interval = next_interval(page.revisit_interval, modified)
        page.revisit_at = page.fetched_at + (page.revisit_interval * 86400)
      end

      private

      # First visit: the configured default. Afterwards one day less when the
      # page was modified (not below the minimum), one day more otherwise (not
      # above the maximum).
      def next_interval(curr_interval, modified)
        return @config.revisit_default_interval unless curr_interval

        if modified
          shorter = curr_interval - 1
          [shorter, @config.revisit_min_interval].max
        else
          longer = curr_interval + 1
          [longer, @config.revisit_max_interval].min
        end
      end
    end
  end
end
@@ -1,3 +0,0 @@
1
# Require every Ruby file that sits in this directory. The list is sorted so
# the load order does not depend on the order the filesystem returns entries
# in (Dir[] only sorts by itself on newer Ruby versions).
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -1,56 +0,0 @@
1
module Kudzu
  class Util
    # Keeps named HTTP connections in a per-thread table
    # (Thread.current[:kudzu_connection]) and closes the least recently used
    # ones once the table grows past max_size.
    class ConnectionPool
      # One pooled entry: the key it is stored under, the HTTP object and the
      # time it was last checked out.
      class Connection
        attr_accessor :name, :http, :last_used

        def initialize(attr = {})
          attr.each_pair { |key, value| public_send("#{key}=", value) }
        end
      end

      def initialize(max_size = 10)
        @max_size = max_size
      end

      # Returns the HTTP object stored under +name+, calling the block to
      # build it only when the current thread has none yet. Marks the entry as
      # just used and trims the pool if it is over capacity.
      def checkout(name)
        pool[name] ||= Connection.new(name: name, http: yield)
        entry = pool[name]
        entry.last_used = Time.now
        reduce if pool.size > @max_size
        entry.http
      end

      # Finishes every connection of the current thread and drops its table.
      def close
        pool.each_value { |entry| finish_http(entry.http) }
        Thread.current[:kudzu_connection] = nil
      end

      private

      # The current thread's name => Connection table, created on first use.
      def pool
        Thread.current[:kudzu_connection] ||= {}
      end

      # Closes and removes the oldest entries until max_size is respected.
      def reduce
        excess = pool.size - @max_size
        oldest_first = pool.values.sort_by(&:last_used)
        oldest_first.take(excess).each do |entry|
          finish_http(entry.http)
          pool.delete(entry.name)
        end
      end

      # Finishes an HTTP session only if there is one and it is running.
      def finish_http(http)
        return unless http && http.started?

        http.finish
      end
    end
  end
end
@@ -1,24 +0,0 @@
1
module Kudzu
  class Util
    # Splits a Content-Type style value such as
    # 'text/html; charset="UTF-8"' into its MIME type and parameters.
    class ContentTypeParser
      # @param content_type [String, nil]
      # @return [Array(String, Hash)] the lower-cased MIME type (nil for an
      #   empty value) and a hash of lower-cased parameters keyed by symbol
      def parse(content_type)
        mime, *kvs = content_type.to_s.split(';').map { |part| part.strip.downcase }
        params = kvs.each_with_object({}) do |kv, hash|
          # Split on the first '=' only, so values that themselves contain
          # '=' (e.g. boundary="a=b") are kept whole.
          key, value = kv.split('=', 2).map(&:strip)
          next if key.nil? || value.nil? || value.empty?

          hash[key.to_sym] = unquote(value)
        end
        [mime, params]
      end

      private

      # Strips surrounding double quotes and resolves backslash escapes;
      # unquoted values are returned unchanged.
      def unquote(str)
        quoted = str.match(/\A"(.*)"\z/)
        return str unless quoted

        quoted[1].gsub(/\\(.)/, '\1')
      end
    end
  end
end
@@ -1,21 +0,0 @@
1
module Kudzu
  class Util
    # Tests a text against allow and deny pattern lists using
    # Kudzu::Common.match?.
    class Matcher
      # @param text [String]
      # @param allows [Object, Array, nil] pattern(s) of which one must match;
      #   no patterns means everything is allowed
      # @param denies [Object, Array, nil] pattern(s) of which none may match
      # @return [Boolean]
      def match?(text, allows: nil, denies: nil)
        return false unless match_to_allows?(text, allows)

        !match_to_denies?(text, denies)
      end

      private

      # True when the allow list is empty or any entry matches.
      def match_to_allows?(text, allows)
        patterns = Array(allows)
        return true if patterns.empty?

        patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end

      # True only when some entry of a non-empty deny list matches.
      def match_to_denies?(text, denies)
        Array(denies).any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end
    end
  end
end
@@ -1,38 +0,0 @@
1
module Kudzu
  class Util
    # A fixed number of worker threads that all receive the same Queue.
    class ThreadPool
      def initialize(size)
        @size = size
        @queue = Queue.new
        @threads = []
      end

      # Spawns the workers. Each one calls the block with the shared queue
      # over and over until the block returns :end.
      #
      # @return [Array<Thread>] the started threads
      def start(&block)
        @threads = 1.upto(@size).map { create_thread(&block) }
      end

      # Blocks, polling once per second, until every live worker is waiting
      # on the queue.
      def wait
        until idle?
          Thread.pass
          sleep 1
        end
      end

      # Kills all workers and forgets them.
      def shutdown
        @threads.each(&:kill)
        @threads = []
      end

      private

      # True when the number of threads blocked on the queue equals the number
      # of workers still alive.
      def idle?
        @queue.num_waiting == @threads.count(&:alive?)
      end

      def create_thread(&block)
        Thread.start do
          loop { break if block.call(@queue) == :end }
        end
      end
    end
  end
end