kudzu 1.1.9 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kudzu/agent/url_extractor.rb +1 -1
- data/lib/kudzu/agent/util/mime_type_detector.rb +2 -2
- data/lib/kudzu/config.rb +3 -2
- data/lib/kudzu/crawler.rb +13 -10
- data/lib/kudzu/model/link.rb +8 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +1 -2
- metadata +18 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 16db7469597ff96bff30c59c9b6da73e8c4d33b535228424675c7e7a539e4ec2
|
4
|
+
data.tar.gz: ee0eb79b8e5c3e4e3d3832ed1f8e9d0dc291c60f3d626495eb21055e5e23341e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d3a060a674cbd7ec9de94db1d94dcb9c0e6eac1f0b6bb20a798547b8eb3f29a69c0d404ca45dd7efc7f11d91de2618ac214f0d316cbe58fd69276878e37a9bf
|
7
|
+
data.tar.gz: b423624bc1e7ea849a98cbea9d34b8a1dcfd22f5fd7bee8ce0791e900ac805d2d448776d0a4b7cb1e0be9a9ef033f008700cf16aed62ffa287bccacbb1db0db4
|
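data/README.md
CHANGED
data/lib/kudzu/agent/url_extractor.rb
CHANGED
data/lib/kudzu/agent/util/mime_type_detector.rb
CHANGED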
@@ -22,13 +22,13 @@ module Kudzu
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def from_body(body)
|
25
|
-
mime =
|
25
|
+
mime = Marcel::Magic.by_magic(StringIO.new(body))
|
26
26
|
mime.to_s if mime
|
27
27
|
end
|
28
28
|
|
29
29
|
def from_url(url)
|
30
30
|
uri = Addressable::URI.parse(url)
|
31
|
-
mime =
|
31
|
+
mime = Marcel::Magic.by_path(uri.basename)
|
32
32
|
mime.to_s if mime
|
33
33
|
end
|
34
34
|
end
|
data/lib/kudzu/config.rb
CHANGED
@@ -5,7 +5,7 @@ module Kudzu
|
|
5
5
|
class Config
|
6
6
|
SIMPLE_CONFIGS = [:config_file,
|
7
7
|
:user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
|
8
|
-
:max_connection, :max_redirect, :max_depth, :default_request_header,
|
8
|
+
:max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
|
9
9
|
:politeness_delay, :handle_cookie,
|
10
10
|
:respect_robots_txt, :respect_nofollow, :respect_noindex,
|
11
11
|
:filters]
|
@@ -16,6 +16,7 @@ module Kudzu
|
|
16
16
|
thread_num: 1,
|
17
17
|
max_connection: 10,
|
18
18
|
max_redirect: 3,
|
19
|
+
max_retry: 0,
|
19
20
|
politeness_delay: 0.5,
|
20
21
|
handle_cookie: true,
|
21
22
|
respect_robots_txt: true,
|
@@ -27,7 +28,7 @@ module Kudzu
|
|
27
28
|
def initialize(config = {}, &block)
|
28
29
|
self.filters = {}
|
29
30
|
DEFAULT_CONFIG.merge(config).each do |key, value|
|
30
|
-
send("#{key}=", value)
|
31
|
+
send("#{key}=", value) if respond_to?("#{key}=")
|
31
32
|
end
|
32
33
|
if config_file || block
|
33
34
|
delegator = Delegator.new(self)
|
data/lib/kudzu/crawler.rb
CHANGED
@@ -107,13 +107,16 @@ module Kudzu
|
|
107
107
|
|
108
108
|
def fetch(link, request_header)
|
109
109
|
response = nil
|
110
|
-
@
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
110
|
+
(@config.max_retry.to_i + 1).times do
|
111
|
+
@callback.around(:fetch, link, request_header, response) do
|
112
|
+
response = @agent.fetch(link.url, request_header)
|
113
|
+
end
|
114
|
+
if response.fetched?
|
115
|
+
Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
|
116
|
+
else
|
117
|
+
Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
|
118
|
+
end
|
119
|
+
break if !response.fetched? || response.status_success? || response.status_redirection?
|
117
120
|
end
|
118
121
|
response
|
119
122
|
rescue Exception => e
|
@@ -135,7 +138,7 @@ module Kudzu
|
|
135
138
|
|
136
139
|
if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
|
137
140
|
refs = @agent.extract_refs(response)
|
138
|
-
enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
|
141
|
+
enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
|
139
142
|
end
|
140
143
|
|
141
144
|
if @agent.filter_response?(response)
|
@@ -168,8 +171,8 @@ module Kudzu
|
|
168
171
|
end
|
169
172
|
end
|
170
173
|
|
171
|
-
def enqueue_links(links)
|
172
|
-
@callback.around(:enqueue, links) do
|
174
|
+
def enqueue_links(links, response = nil)
|
175
|
+
@callback.around(:enqueue, links, response) do
|
173
176
|
@frontier.enqueue(links)
|
174
177
|
end
|
175
178
|
end
|
data/lib/kudzu/model/link.rb
CHANGED
data/lib/kudzu/version.rb
CHANGED
data/lib/kudzu.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.9
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: marcel
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: webrick
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rspec-rails
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -214,8 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
214
228
|
- !ruby/object:Gem::Version
|
215
229
|
version: '0'
|
216
230
|
requirements: []
|
217
|
-
|
218
|
-
rubygems_version: 2.7.6
|
231
|
+
rubygems_version: 3.3.3
|
219
232
|
signing_key:
|
220
233
|
specification_version: 4
|
221
234
|
summary: A simple web crawler for ruby
|