kudzu 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +1 -1
- data/lib/kudzu/adapter/memory/model/link.rb +2 -6
- data/lib/kudzu/adapter/memory/model/page.rb +3 -8
- data/lib/kudzu/adapter/memory/repository.rb +0 -2
- data/lib/kudzu/adapter/memory.rb +3 -4
- data/lib/kudzu/agent/all.rb +1 -1
- data/lib/kudzu/agent/fetcher.rb +46 -49
- data/lib/kudzu/agent/http/connection.rb +9 -0
- data/lib/kudzu/agent/http/connection_pool.rb +50 -0
- data/lib/kudzu/agent/page_filterer.rb +58 -0
- data/lib/kudzu/agent/reference.rb +9 -0
- data/lib/kudzu/agent/response.rb +14 -0
- data/lib/kudzu/agent/robots/parser.rb +91 -0
- data/lib/kudzu/agent/robots/txt.rb +34 -0
- data/lib/kudzu/agent/robots.rb +12 -123
- data/lib/kudzu/agent/sleeper.rb +2 -2
- data/lib/kudzu/agent/url_extractor.rb +60 -46
- data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
- data/lib/kudzu/agent/util/charset_detector.rb +84 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
- data/lib/kudzu/agent/util/matcher.rb +25 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
- data/lib/kudzu/agent/util/title_parser.rb +30 -0
- data/lib/kudzu/agent.rb +42 -0
- data/lib/kudzu/callback.rb +4 -2
- data/lib/kudzu/config/filter.rb +11 -11
- data/lib/kudzu/config.rb +20 -25
- data/lib/kudzu/crawler.rb +65 -146
- data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
- data/lib/kudzu/model/base.rb +9 -0
- data/lib/kudzu/model/link.rb +9 -0
- data/lib/kudzu/model/page.rb +112 -0
- data/lib/kudzu/thread_pool.rb +36 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +21 -3
- metadata +21 -19
- data/lib/kudzu/adapter/base/link.rb +0 -8
- data/lib/kudzu/adapter/base/page.rb +0 -106
- data/lib/kudzu/adapter/memory/all.rb +0 -3
- data/lib/kudzu/agent/charset_detector.rb +0 -84
- data/lib/kudzu/agent/filter.rb +0 -40
- data/lib/kudzu/agent/mime_type_detector.rb +0 -34
- data/lib/kudzu/agent/title_parser.rb +0 -16
- data/lib/kudzu/logger.rb +0 -20
- data/lib/kudzu/revisit/all.rb +0 -3
- data/lib/kudzu/revisit/scheduler.rb +0 -28
- data/lib/kudzu/util/all.rb +0 -3
- data/lib/kudzu/util/connection_pool.rb +0 -56
- data/lib/kudzu/util/content_type_parser.rb +0 -24
- data/lib/kudzu/util/matcher.rb +0 -21
- data/lib/kudzu/util/thread_pool.rb +0 -38
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
|
3
1
|
module Kudzu
|
4
2
|
class Agent
|
5
3
|
class UrlExtractor
|
@@ -7,25 +5,32 @@ module Kudzu
|
|
7
5
|
@config = config
|
8
6
|
end
|
9
7
|
|
10
|
-
def extract(
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
8
|
+
def extract(response)
|
9
|
+
refs = if response.html?
|
10
|
+
ForHTML.new(@config).extract(response)
|
11
|
+
elsif response.xml?
|
12
|
+
ForXML.new(@config).extract(response)
|
13
|
+
else
|
14
|
+
[]
|
15
|
+
end
|
16
|
+
|
17
|
+
refs.each do |ref|
|
18
|
+
ref.url = sanitize(ref.url)
|
19
|
+
ref.url = normalize(ref.url, response.url)
|
21
20
|
end
|
21
|
+
refs.reject { |ref| ref.url.nil? }.uniq
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
25
25
|
|
26
|
+
def sanitize(url)
|
27
|
+
url.gsub(/^( | |%20)+/, '')
|
28
|
+
end
|
29
|
+
|
26
30
|
def normalize(url, base_url)
|
27
|
-
uri = Addressable::URI.parse(base_url
|
31
|
+
uri = Addressable::URI.parse(base_url).join(url).normalize
|
28
32
|
uri.path = '/' unless uri.path
|
33
|
+
uri.path = uri.path.gsub(%r|/{2,}|, '/')
|
29
34
|
uri.fragment = nil
|
30
35
|
|
31
36
|
if uri.scheme.in?(%w(http https))
|
@@ -33,35 +38,37 @@ module Kudzu
|
|
33
38
|
else
|
34
39
|
nil
|
35
40
|
end
|
41
|
+
rescue => e
|
42
|
+
Kudzu.log :warn, "failed to normalize url: #{url}", error: e
|
43
|
+
nil
|
36
44
|
end
|
37
45
|
|
38
|
-
class
|
46
|
+
class ForHTML
|
39
47
|
def initialize(config)
|
40
|
-
|
41
|
-
@content_type_parser = Kudzu::Util::ContentTypeParser.new
|
48
|
+
@config = config
|
42
49
|
end
|
43
50
|
|
44
|
-
def extract(
|
45
|
-
doc =
|
46
|
-
return [] if nofollow?(doc)
|
51
|
+
def extract(response)
|
52
|
+
doc = response.parsed_doc
|
53
|
+
return [] if @config.respect_nofollow && nofollow?(doc)
|
47
54
|
|
48
|
-
if (filter = @config.find_filter(
|
55
|
+
if (filter = @config.find_filter(response.url))
|
49
56
|
if filter.allow_element
|
50
57
|
doc = doc.search(*Array(filter.allow_element))
|
51
58
|
end
|
52
59
|
if filter.deny_element
|
60
|
+
doc = doc.dup
|
53
61
|
doc.search(*Array(filter.deny_element)).remove
|
54
62
|
end
|
55
63
|
end
|
56
64
|
|
57
|
-
|
58
|
-
|
65
|
+
refs = from_html(doc) + from_meta(doc)
|
66
|
+
refs.reject { |ref| ref.url.nil? || ref.url.empty? }
|
59
67
|
end
|
60
68
|
|
61
69
|
private
|
62
70
|
|
63
71
|
def nofollow?(doc)
|
64
|
-
return false unless @config.respect_nofollow
|
65
72
|
nodes = doc.xpath('//meta[@name]')
|
66
73
|
nodes.any? { |node| node[:name] =~ /^robots$/i && node[:content] =~ /nofollow/i }
|
67
74
|
end
|
@@ -73,10 +80,10 @@ module Kudzu
|
|
73
80
|
nodes.reject! { |url| url[:rel] =~ /nofollow/i }
|
74
81
|
end
|
75
82
|
|
76
|
-
nodes.map
|
77
|
-
|
78
|
-
|
79
|
-
|
83
|
+
nodes.map do |node|
|
84
|
+
Reference.new(url: (node[:href] || node[:src]).to_s,
|
85
|
+
title: node_to_title(node))
|
86
|
+
end
|
80
87
|
end
|
81
88
|
|
82
89
|
def node_to_title(node)
|
@@ -87,35 +94,42 @@ module Kudzu
|
|
87
94
|
end
|
88
95
|
end
|
89
96
|
|
90
|
-
def
|
97
|
+
def from_meta(doc)
|
91
98
|
nodes = doc.xpath('.//meta[@http-equiv]').select { |node| node[:'http-equiv'] =~ /^refresh$/i }
|
92
|
-
urls = nodes.map { |node|
|
93
|
-
urls.map
|
99
|
+
urls = nodes.map { |node| Util::ContentTypeParser.parse(node[:content]).last[:url] }.compact
|
100
|
+
urls.map do |url|
|
101
|
+
Reference.new(url: url.to_s)
|
102
|
+
end
|
94
103
|
end
|
95
104
|
end
|
96
105
|
|
97
|
-
class
|
98
|
-
def
|
99
|
-
|
106
|
+
class ForXML
|
107
|
+
def initialize(config)
|
108
|
+
@config = config
|
109
|
+
end
|
110
|
+
|
111
|
+
def extract(response)
|
112
|
+
doc = response.parsed_doc.dup
|
100
113
|
doc.remove_namespaces!
|
101
|
-
|
102
|
-
|
114
|
+
|
115
|
+
refs = from_rss(doc) + from_atom(doc)
|
116
|
+
refs.reject { |ref| ref.url.nil? || ref.url.empty? }
|
103
117
|
end
|
104
118
|
|
105
119
|
private
|
106
120
|
|
107
|
-
def
|
108
|
-
doc.xpath('rss/channel').map
|
109
|
-
|
110
|
-
|
111
|
-
|
121
|
+
def from_rss(doc)
|
122
|
+
doc.xpath('rss/channel').map do |node|
|
123
|
+
Reference.new(url: node.xpath('./item/link').inner_text,
|
124
|
+
title: node.xpath('./item/title').inner_text)
|
125
|
+
end
|
112
126
|
end
|
113
127
|
|
114
|
-
def
|
115
|
-
doc.xpath('feed/entry').map
|
116
|
-
|
117
|
-
|
118
|
-
|
128
|
+
def from_atom(doc)
|
129
|
+
doc.xpath('feed/entry').map do |node|
|
130
|
+
Reference.new(url: node.xpath('./link[@href]/@href').to_s,
|
131
|
+
title: node.xpath('./title').inner_text)
|
132
|
+
end
|
119
133
|
end
|
120
134
|
end
|
121
135
|
end
|
@@ -1,22 +1,28 @@
|
|
1
1
|
module Kudzu
|
2
2
|
class Agent
|
3
|
-
class
|
4
|
-
def initialize(config)
|
3
|
+
class UrlFilterer
|
4
|
+
def initialize(config, robots = nil)
|
5
5
|
@config = config
|
6
|
-
@
|
6
|
+
@robots = robots
|
7
7
|
end
|
8
8
|
|
9
|
-
def filter(
|
9
|
+
def filter(refs, base_url)
|
10
10
|
base_uri = Addressable::URI.parse(base_url)
|
11
11
|
filter = @config.find_filter(base_uri)
|
12
12
|
|
13
|
-
|
14
|
-
allowed?(
|
13
|
+
refs.select do |ref|
|
14
|
+
if allowed?(ref.uri, base_uri, filter: filter)
|
15
|
+
Kudzu.log :debug, "passed url: #{ref.url}"
|
16
|
+
true
|
17
|
+
else
|
18
|
+
Kudzu.log :debug, "dropped url: #{ref.url}"
|
19
|
+
false
|
20
|
+
end
|
15
21
|
end
|
16
22
|
end
|
17
23
|
|
18
|
-
def allowed?(
|
19
|
-
uri = Addressable::URI.parse(
|
24
|
+
def allowed?(uri, base_uri, filter: nil)
|
25
|
+
uri = Addressable::URI.parse(uri) if uri.is_a?(String)
|
20
26
|
base_uri = Addressable::URI.parse(base_uri) if base_uri.is_a?(String)
|
21
27
|
filter ||= @config.find_filter(base_uri)
|
22
28
|
return true unless filter
|
@@ -26,7 +32,8 @@ module Kudzu
|
|
26
32
|
allowed_url?(uri, filter) &&
|
27
33
|
allowed_host?(uri, filter) &&
|
28
34
|
allowed_path?(uri, filter) &&
|
29
|
-
allowed_ext?(uri, filter)
|
35
|
+
allowed_ext?(uri, filter) &&
|
36
|
+
allowed_by_robots?(uri)
|
30
37
|
end
|
31
38
|
|
32
39
|
private
|
@@ -44,21 +51,27 @@ module Kudzu
|
|
44
51
|
end
|
45
52
|
|
46
53
|
def allowed_url?(uri, filter)
|
47
|
-
|
54
|
+
Util::Matcher.match?(uri.to_s, allows: filter.allow_url, denies: filter.deny_url)
|
48
55
|
end
|
49
56
|
|
50
57
|
def allowed_host?(uri, filter)
|
51
|
-
|
58
|
+
Util::Matcher.match?(uri.host, allows: filter.allow_host, denies: filter.deny_host)
|
52
59
|
end
|
53
60
|
|
54
61
|
def allowed_path?(uri, filter)
|
55
|
-
|
62
|
+
Util::Matcher.match?(uri.path, allows: filter.allow_path, denies: filter.deny_path)
|
56
63
|
end
|
57
64
|
|
58
65
|
def allowed_ext?(uri, filter)
|
59
66
|
ext = uri.extname.to_s.sub(/^\./, '')
|
60
67
|
return true if ext.empty?
|
61
|
-
|
68
|
+
Util::Matcher.match?(ext, allows: filter.allow_ext, denies: filter.deny_ext)
|
69
|
+
end
|
70
|
+
|
71
|
+
def allowed_by_robots?(uri)
|
72
|
+
return true unless @robots
|
73
|
+
return true unless @config.respect_robots_txt
|
74
|
+
@robots.allowed?(uri)
|
62
75
|
end
|
63
76
|
end
|
64
77
|
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Util
      # Detects the character encoding of a fetched response.
      #
      # HTML responses are inspected for <meta charset> and
      # <meta http-equiv="content-type"> declarations, XML responses for the
      # encoding reported by the parsed document; both fall back to sniffing
      # the body. Plain-text responses are sniffed directly.
      class CharsetDetector
        # Maps commonly seen misspellings to names accepted by Encoding.find.
        CORRECTION = {
          'utf_8' => 'utf-8',
          'shift-jis' => 'shift_jis',
          'x-sjis' => 'shift_jis',
          'euc_jp' => 'euc-jp'
        }.freeze

        class << self
          # Returns the detected charset name (lower-case String), or nil when
          # the response is neither HTML, XML nor text, when nothing could be
          # detected, or when detection raised (the error is logged).
          #
          # response must respond to html?, xml?, text?, body and url.
          def detect(response)
            if response.html?
              from_html(response.body) || from_text(response.body)
            elsif response.xml?
              from_xml(response.body) || from_text(response.body)
            elsif response.text?
              from_text(response.body)
            end
          rescue => e
            Kudzu.log :warn, "failed to detect charset: #{response.url}", error: e
            nil
          end

          private

          # Looks for a charset declared inside the HTML document itself.
          def from_html(body)
            doc = Nokogiri::HTML(body.encode('utf-8', undef: :replace, invalid: :replace))

            if (node = doc.xpath('//meta/@charset').first)
              charset = correct(node.to_s)
              return charset if charset
            end

            doc.xpath('//meta[@http-equiv]').each do |meta|
              next unless meta['http-equiv'] =~ /content-type/i

              # Validate the charset parsed from the content attribute. The
              # previous code re-validated the (usually nil) <meta charset>
              # node here, so this declaration was never honoured.
              declared = ContentTypeParser.parse(meta[:content].to_s)[1][:charset]
              charset = correct(declared)
              return charset if charset
            end

            nil
          end

          # Uses the encoding reported by the parsed XML document, if any.
          def from_xml(body)
            doc = Nokogiri::XML(body.encode('utf-8', undef: :replace, invalid: :replace))
            doc.encoding ? correct(doc.encoding) : nil
          end

          # Sniffs the encoding from the raw text.
          def from_text(text)
            return 'ascii' if text.ascii_only?

            detection = CharlockHolmes::EncodingDetector.detect(text)
            detection[:encoding].downcase if detection && detection.key?(:encoding)
          end

          # Normalizes a declared charset name and returns it only when Ruby
          # knows an encoding by that name; returns nil otherwise (including
          # for nil or empty input).
          def correct(charset)
            return nil if charset.nil? || charset.empty?

            charset = charset.downcase
            charset = CORRECTION.fetch(charset, charset)
            Encoding.find(charset) # raises ArgumentError for unknown names
            charset
          rescue ArgumentError
            nil
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Util
      # Parses "type/subtype; key=value; ..." style strings, as found in
      # Content-Type headers and <meta http-equiv> content attributes.
      class ContentTypeParser
        class << self
          # Splits content_type on ';' into the leading token and a Hash of
          # parameters keyed by Symbol.
          #
          # The whole input is lower-cased before splitting, so parameter
          # values are returned in lower case as well. Parameters without a
          # key or without a value are ignored. Surrounding double quotes and
          # backslash escapes are removed from values.
          #
          # Returns [mime, params]; mime is nil for nil or empty input.
          def parse(content_type)
            mime, *kvs = content_type.to_s.split(';').map { |str| str.strip.downcase }
            params = kvs.each_with_object({}) do |kv, hash|
              # Split on the first '=' only: values such as a refresh URL with
              # a query string ("url=http://host/?a=b") contain '=' themselves.
              k, v = kv.split('=', 2).map(&:strip)
              next if k.nil? || k.empty? || v.nil? || v.empty?

              hash[k.to_sym] = unquote(v)
            end
            [mime, params]
          end

          private

          # Strips one pair of enclosing double quotes and unescapes \x to x.
          def unquote(str)
            if (quoted = str.match(/\A"(.*?)"\z/))
              quoted[1].gsub(/\\(.)/, '\1')
            else
              str
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Util
      # Allow/deny list matching built on Kudzu::Common.match?.
      class Matcher
        class << self
          # True when text matches at least one allow pattern (or no allow
          # patterns are given) and matches none of the deny patterns.
          # allows and denies may each be nil, a single pattern or an Array.
          def match?(text, allows: nil, denies: nil)
            return false unless match_to_allows?(text, allows)

            !match_to_denies?(text, denies)
          end

          private

          # An empty allow list permits everything.
          def match_to_allows?(text, allows)
            patterns = Array(allows)
            return true if patterns.empty?

            patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
          end

          # An empty deny list rejects nothing.
          def match_to_denies?(text, denies)
            patterns = Array(denies)
            return false if patterns.empty?

            patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Util
      # Determines the MIME type of a fetched response.
      class MimeTypeDetector
        # Used when neither the header, the body nor the URL reveal a type.
        DEFAULT_MIME_TYPE = 'application/octet-stream'
        # Original, misspelled constant name; kept so existing references
        # keep resolving.
        DEFALUT_MIME_TYPE = DEFAULT_MIME_TYPE

        class << self
          # Tries, in order: the content-type response header, magic bytes of
          # the body, the file name in the URL, and finally DEFAULT_MIME_TYPE.
          #
          # response must respond to response_header, body and url.
          # Returns a String, or nil when detection raised (the error is logged).
          def detect(response)
            from_header(response.response_header) ||
              from_body(response.body) ||
              from_url(response.url) ||
              DEFAULT_MIME_TYPE
          rescue => e
            Kudzu.log :warn, "failed to detect mime: #{response.url}", error: e
            nil
          end

          private

          # Returns the type named by the content-type header, or nil when the
          # header is absent or names no type (e.g. "; charset=utf-8"). An
          # empty string would be truthy and stop the fallbacks in detect.
          def from_header(header)
            return nil unless header

            mime = ContentTypeParser.parse(header['content-type']).first
            mime unless mime.nil? || mime.empty?
          end

          # Returns the type recognised from the body's leading bytes, or nil.
          # A missing body is skipped instead of raising, so that the URL
          # fallback and the default still apply.
          def from_body(body)
            return nil if body.nil? || body.empty?

            mime = MIME.check_magics(StringIO.new(body))
            mime.to_s if mime
          end

          # Returns the type implied by the URL's file name, or nil.
          def from_url(url)
            uri = Addressable::URI.parse(url)
            mime = MIME.check_globs(uri.basename)
            mime.to_s if mime
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Kudzu
  class Agent
    class Util
      # Derives a display title for a fetched response.
      class TitleParser
        class << self
          # For HTML responses returns the text of the first //head/title
          # node ('' when there is none); for anything else returns the
          # basename of the response URL. Returns nil when parsing raised
          # (the error is logged).
          def parse(response)
            return from_html(response.parsed_doc) if response.html?

            Addressable::URI.parse(response.url).basename
          rescue => e
            Kudzu.log :warn, "failed to parse title: #{response.url}", error: e
            nil
          end

          private

          # Text of the document's <title>, or an empty String.
          def from_html(doc)
            title_node = doc.xpath('//head/title').first
            return '' unless title_node

            title_node.inner_text.to_s
          end
        end
      end
    end
  end
end
|
data/lib/kudzu/agent.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require_relative 'agent/all'
|
2
|
+
|
3
|
+
module Kudzu
  # Bundles the collaborators needed to fetch a URL, enrich the response
  # with metadata, extract outgoing references and apply the configured
  # filters.
  class Agent
    # config is handed to every collaborator. The block parameter is
    # accepted for interface compatibility and is not used here.
    def initialize(config, &block)
      @config = config

      @robots = Robots.new(@config)
      @fetcher = Fetcher.new(@config, @robots)
      @url_extractor = UrlExtractor.new(@config)
      @url_filterer = UrlFilterer.new(@config, @robots)
      @page_filterer = PageFilterer.new(@config)
    end

    # Yields to the given block and then closes the fetcher's pool.
    # The pool is closed in an ensure clause so that it is also released
    # when the block raises. Returns the block's value.
    def start
      yield
    ensure
      @fetcher.pool.close
    end

    # Fetches url and, when the response was actually fetched, fills in
    # size, MD5 digest, mime type, charset (text responses only) and title.
    # Returns the response in either case.
    def fetch(url, request_header = {})
      response = @fetcher.fetch(url, request_header: request_header)
      return response unless response.fetched?

      response.size = response.body.size
      response.digest = Digest::MD5.hexdigest(response.body)
      response.mime_type = Util::MimeTypeDetector.detect(response)
      response.charset = Util::CharsetDetector.detect(response) if response.text?
      response.title = Util::TitleParser.parse(response)
      response
    end

    # Extracts references from the response and keeps those that pass the
    # URL filter relative to the response URL.
    def extract_refs(response)
      refs = @url_extractor.extract(response)
      @url_filterer.filter(refs, response.url)
    end

    # True when the response should be filtered out: either it was reached
    # through a redirect whose target the URL filter rejects, or the page
    # filter rejects it.
    #
    # NOTE(review): the previous code returned false (keep) for a rejected
    # redirect target, which contradicts the polarity of the page-filter
    # branch; confirm against the crawler's on_filter handling.
    def filter_response?(response)
      return true if response.redirect_from && !@url_filterer.allowed?(response.url, response.redirect_from)

      !@page_filterer.allowed?(response)
    end
  end
end
|
data/lib/kudzu/callback.rb
CHANGED
@@ -6,12 +6,14 @@ module Kudzu
|
|
6
6
|
:on_server_error, # 5xx
|
7
7
|
:on_filter, # 2xx, filtered
|
8
8
|
:on_failure, # Exception
|
9
|
+
:before_enqueue,
|
10
|
+
:after_enqueue,
|
11
|
+
:before_fetch,
|
12
|
+
:after_fetch,
|
9
13
|
:before_register,
|
10
14
|
:after_register,
|
11
15
|
:before_delete,
|
12
16
|
:after_delete,
|
13
|
-
:before_enqueue,
|
14
|
-
:after_enqueue,
|
15
17
|
]
|
16
18
|
|
17
19
|
def initialize(&block)
|
data/lib/kudzu/config/filter.rb
CHANGED
@@ -13,27 +13,27 @@ module Kudzu
|
|
13
13
|
attr_accessor :path
|
14
14
|
attr_accessor *SIMPLE_CONFIGS
|
15
15
|
|
16
|
+
def initialize(path, config = {}, &block)
|
17
|
+
@path = path
|
18
|
+
DEFAULT_CONFIG.merge(config).each do |key, value|
|
19
|
+
send("#{key}=", value)
|
20
|
+
end
|
21
|
+
if block
|
22
|
+
Delegator.new(self).instance_eval(&block)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
16
26
|
class Delegator
|
17
27
|
def initialize(filter)
|
18
28
|
@filter = filter
|
19
29
|
end
|
20
30
|
|
21
|
-
|
31
|
+
SIMPLE_CONFIGS.each do |key|
|
22
32
|
define_method(key) do |value|
|
23
33
|
@filter.send("#{key}=", value)
|
24
34
|
end
|
25
35
|
end
|
26
36
|
end
|
27
|
-
|
28
|
-
def initialize(path, config = {}, &block)
|
29
|
-
@path = path
|
30
|
-
DEFAULT_CONFIG.merge(config).each do |key, value|
|
31
|
-
send("#{key}=", value)
|
32
|
-
end
|
33
|
-
if block
|
34
|
-
Kudzu::Config::Filter::Delegator.new(self).instance_eval(&block)
|
35
|
-
end
|
36
|
-
end
|
37
37
|
end
|
38
38
|
end
|
39
39
|
end
|
data/lib/kudzu/config.rb
CHANGED
@@ -4,16 +4,15 @@ require_relative 'config/filter'
|
|
4
4
|
module Kudzu
|
5
5
|
class Config
|
6
6
|
SIMPLE_CONFIGS = [:config_file,
|
7
|
-
:user_agent, :thread_num, :open_timeout, :read_timeout,
|
7
|
+
:user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
|
8
8
|
:max_connection, :max_redirect, :max_depth, :default_request_header,
|
9
9
|
:politeness_delay, :handle_cookie,
|
10
10
|
:respect_robots_txt, :respect_nofollow, :respect_noindex,
|
11
|
-
:log_file, :log_level,
|
12
|
-
:revisit_mode, :revisit_min_interval, :revisit_max_interval, :revisit_default_interval,
|
13
11
|
:filters]
|
14
12
|
DEFAULT_CONFIG = { user_agent: "Kudzu/#{Kudzu::VERSION}",
|
15
13
|
open_timeout: 10,
|
16
14
|
read_timeout: 10,
|
15
|
+
keep_alive: 5,
|
17
16
|
thread_num: 1,
|
18
17
|
max_connection: 10,
|
19
18
|
max_redirect: 3,
|
@@ -21,37 +20,17 @@ module Kudzu
|
|
21
20
|
handle_cookie: true,
|
22
21
|
respect_robots_txt: true,
|
23
22
|
respect_nofollow: true,
|
24
|
-
respect_noindex: true
|
25
|
-
revisit_mode: false,
|
26
|
-
revisit_min_interval: 1,
|
27
|
-
revisit_max_interval: 10,
|
28
|
-
revisit_default_interval: 5 }
|
23
|
+
respect_noindex: true }
|
29
24
|
|
30
25
|
attr_accessor *SIMPLE_CONFIGS
|
31
26
|
|
32
|
-
class Delegator
|
33
|
-
def initialize(config)
|
34
|
-
@config = config
|
35
|
-
end
|
36
|
-
|
37
|
-
Kudzu::Config::SIMPLE_CONFIGS.each do |key|
|
38
|
-
define_method(key) do |value|
|
39
|
-
@config.send("#{key}=", value)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def add_filter(base_url = nil, config = {}, &block)
|
44
|
-
@config.add_filter(base_url, config, &block)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
27
|
def initialize(config = {}, &block)
|
49
28
|
self.filters = {}
|
50
29
|
DEFAULT_CONFIG.merge(config).each do |key, value|
|
51
30
|
send("#{key}=", value)
|
52
31
|
end
|
53
32
|
if config_file || block
|
54
|
-
delegator =
|
33
|
+
delegator = Delegator.new(self)
|
55
34
|
delegator.instance_eval(File.read(config_file)) if config_file
|
56
35
|
delegator.instance_eval(&block) if block
|
57
36
|
end
|
@@ -75,5 +54,21 @@ module Kudzu
|
|
75
54
|
end
|
76
55
|
nil
|
77
56
|
end
|
57
|
+
|
58
|
+
class Delegator
|
59
|
+
def initialize(config)
|
60
|
+
@config = config
|
61
|
+
end
|
62
|
+
|
63
|
+
SIMPLE_CONFIGS.each do |key|
|
64
|
+
define_method(key) do |value|
|
65
|
+
@config.send("#{key}=", value)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def add_filter(base_url = nil, config = {}, &block)
|
70
|
+
@config.add_filter(base_url, config, &block)
|
71
|
+
end
|
72
|
+
end
|
78
73
|
end
|
79
74
|
end
|