kudzu 1.1.11 → 1.2.0
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/config.rb +2 -1
- data/lib/kudzu/crawler.rb +13 -10
- data/lib/kudzu/model/link.rb +8 -0
- data/lib/kudzu/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 232a54685462f7415227ddf7ce2efd31d0046cb5bf7756ffbffb0b473836d45d
|
4
|
+
data.tar.gz: e601657db196282b54e89633eebbaa4633ae1f7951fd0e7a3aa4b6ea00109d48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a3dabef241ebaed7b9a80ed11abd8bbaee63e09fe674743931c4f2c36ca44e436be6e5b44ed44e68cc30899ff080faf803f5e2834074fdc3dfc9abba100248f
|
7
|
+
data.tar.gz: fa024d77336f25d03025d056a2d40f7c16f302658cbcf21afba5a637c9f14ab29f353c135b4c90217f6de46cbc9edb52a4bd0284982d264019762b4978b69c67
|
data/lib/kudzu/config.rb
CHANGED
@@ -5,7 +5,7 @@ module Kudzu
|
|
5
5
|
class Config
|
6
6
|
SIMPLE_CONFIGS = [:config_file,
|
7
7
|
:user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
|
8
|
-
:max_connection, :max_redirect, :max_depth, :default_request_header,
|
8
|
+
:max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
|
9
9
|
:politeness_delay, :handle_cookie,
|
10
10
|
:respect_robots_txt, :respect_nofollow, :respect_noindex,
|
11
11
|
:filters]
|
@@ -16,6 +16,7 @@ module Kudzu
|
|
16
16
|
thread_num: 1,
|
17
17
|
max_connection: 10,
|
18
18
|
max_redirect: 3,
|
19
|
+
max_retry: 0,
|
19
20
|
politeness_delay: 0.5,
|
20
21
|
handle_cookie: true,
|
21
22
|
respect_robots_txt: true,
|
data/lib/kudzu/crawler.rb
CHANGED
@@ -107,13 +107,16 @@ module Kudzu
|
|
107
107
|
|
108
108
|
def fetch(link, request_header)
|
109
109
|
response = nil
|
110
|
-
@
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
110
|
+
(@config.max_retry.to_i + 1).times do
|
111
|
+
@callback.around(:fetch, link, request_header, response) do
|
112
|
+
response = @agent.fetch(link.url, request_header)
|
113
|
+
end
|
114
|
+
if response.fetched?
|
115
|
+
Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
|
116
|
+
else
|
117
|
+
Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
|
118
|
+
end
|
119
|
+
break if !response.fetched? || response.status_success? || response.status_redirection?
|
117
120
|
end
|
118
121
|
response
|
119
122
|
rescue Exception => e
|
@@ -135,7 +138,7 @@ module Kudzu
|
|
135
138
|
|
136
139
|
if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
|
137
140
|
refs = @agent.extract_refs(response)
|
138
|
-
enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
|
141
|
+
enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
|
139
142
|
end
|
140
143
|
|
141
144
|
if @agent.filter_response?(response)
|
@@ -168,8 +171,8 @@ module Kudzu
|
|
168
171
|
end
|
169
172
|
end
|
170
173
|
|
171
|
-
def enqueue_links(links)
|
172
|
-
@callback.around(:enqueue, links) do
|
174
|
+
def enqueue_links(links, response = nil)
|
175
|
+
@callback.around(:enqueue, links, response) do
|
173
176
|
@frontier.enqueue(links)
|
174
177
|
end
|
175
178
|
end
|
data/lib/kudzu/model/link.rb
CHANGED
data/lib/kudzu/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.11
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
214
214
|
- !ruby/object:Gem::Version
|
215
215
|
version: '0'
|
216
216
|
requirements: []
|
217
|
-
rubygems_version: 3.
|
217
|
+
rubygems_version: 3.1.2
|
218
218
|
signing_key:
|
219
219
|
specification_version: 4
|
220
220
|
summary: A simple web crawler for ruby
|