kudzu 1.1.9 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/kudzu/agent/url_extractor.rb +1 -1
- data/lib/kudzu/agent/util/mime_type_detector.rb +2 -2
- data/lib/kudzu/config.rb +3 -2
- data/lib/kudzu/crawler.rb +13 -10
- data/lib/kudzu/model/link.rb +8 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +1 -2
- metadata +18 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 16db7469597ff96bff30c59c9b6da73e8c4d33b535228424675c7e7a539e4ec2
|
4
|
+
data.tar.gz: ee0eb79b8e5c3e4e3d3832ed1f8e9d0dc291c60f3d626495eb21055e5e23341e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d3a060a674cbd7ec9de94db1d94dcb9c0e6eac1f0b6bb20a798547b8eb3f29a69c0d404ca45dd7efc7f11d91de2618ac214f0d316cbe58fd69276878e37a9bf
|
7
|
+
data.tar.gz: b423624bc1e7ea849a98cbea9d34b8a1dcfd22f5fd7bee8ce0791e900ac805d2d448776d0a4b7cb1e0be9a9ef033f008700cf16aed62ffa287bccacbb1db0db4
|
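data/README.md
CHANGED
data/lib/kudzu/agent/url_extractor.rb
CHANGED
data/lib/kudzu/agent/util/mime_type_detector.rb
CHANGED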
@@ -22,13 +22,13 @@ module Kudzu
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def from_body(body)
|
25
|
-
mime =
|
25
|
+
mime = Marcel::Magic.by_magic(StringIO.new(body))
|
26
26
|
mime.to_s if mime
|
27
27
|
end
|
28
28
|
|
29
29
|
def from_url(url)
|
30
30
|
uri = Addressable::URI.parse(url)
|
31
|
-
mime =
|
31
|
+
mime = Marcel::Magic.by_path(uri.basename)
|
32
32
|
mime.to_s if mime
|
33
33
|
end
|
34
34
|
end
|
data/lib/kudzu/config.rb
CHANGED
@@ -5,7 +5,7 @@ module Kudzu
|
|
5
5
|
class Config
|
6
6
|
SIMPLE_CONFIGS = [:config_file,
|
7
7
|
:user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
|
8
|
-
:max_connection, :max_redirect, :max_depth, :default_request_header,
|
8
|
+
:max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
|
9
9
|
:politeness_delay, :handle_cookie,
|
10
10
|
:respect_robots_txt, :respect_nofollow, :respect_noindex,
|
11
11
|
:filters]
|
@@ -16,6 +16,7 @@ module Kudzu
|
|
16
16
|
thread_num: 1,
|
17
17
|
max_connection: 10,
|
18
18
|
max_redirect: 3,
|
19
|
+
max_retry: 0,
|
19
20
|
politeness_delay: 0.5,
|
20
21
|
handle_cookie: true,
|
21
22
|
respect_robots_txt: true,
|
@@ -27,7 +28,7 @@ module Kudzu
|
|
27
28
|
def initialize(config = {}, &block)
|
28
29
|
self.filters = {}
|
29
30
|
DEFAULT_CONFIG.merge(config).each do |key, value|
|
30
|
-
send("#{key}=", value)
|
31
|
+
send("#{key}=", value) if respond_to?("#{key}=")
|
31
32
|
end
|
32
33
|
if config_file || block
|
33
34
|
delegator = Delegator.new(self)
|
data/lib/kudzu/crawler.rb
CHANGED
@@ -107,13 +107,16 @@ module Kudzu
|
|
107
107
|
|
108
108
|
def fetch(link, request_header)
|
109
109
|
response = nil
|
110
|
-
@
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
110
|
+
(@config.max_retry.to_i + 1).times do
|
111
|
+
@callback.around(:fetch, link, request_header, response) do
|
112
|
+
response = @agent.fetch(link.url, request_header)
|
113
|
+
end
|
114
|
+
if response.fetched?
|
115
|
+
Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
|
116
|
+
else
|
117
|
+
Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
|
118
|
+
end
|
119
|
+
break if !response.fetched? || response.status_success? || response.status_redirection?
|
117
120
|
end
|
118
121
|
response
|
119
122
|
rescue Exception => e
|
@@ -135,7 +138,7 @@ module Kudzu
|
|
135
138
|
|
136
139
|
if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
|
137
140
|
refs = @agent.extract_refs(response)
|
138
|
-
enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
|
141
|
+
enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
|
139
142
|
end
|
140
143
|
|
141
144
|
if @agent.filter_response?(response)
|
@@ -168,8 +171,8 @@ module Kudzu
|
|
168
171
|
end
|
169
172
|
end
|
170
173
|
|
171
|
-
def enqueue_links(links)
|
172
|
-
@callback.around(:enqueue, links) do
|
174
|
+
def enqueue_links(links, response = nil)
|
175
|
+
@callback.around(:enqueue, links, response) do
|
173
176
|
@frontier.enqueue(links)
|
174
177
|
end
|
175
178
|
end
|
data/lib/kudzu/model/link.rb
CHANGED
data/lib/kudzu/version.rb
CHANGED
data/lib/kudzu.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.9
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: marcel
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: webrick
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: rspec-rails
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -214,8 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
214
228
|
- !ruby/object:Gem::Version
|
215
229
|
version: '0'
|
216
230
|
requirements: []
|
217
|
-
|
218
|
-
rubygems_version: 2.7.6
|
231
|
+
rubygems_version: 3.3.3
|
219
232
|
signing_key:
|
220
233
|
specification_version: 4
|
221
234
|
summary: A simple web crawler for ruby
|