kudzu 1.1.11 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kudzu/config.rb +2 -1
- data/lib/kudzu/crawler.rb +13 -10
- data/lib/kudzu/model/link.rb +8 -0
- data/lib/kudzu/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 232a54685462f7415227ddf7ce2efd31d0046cb5bf7756ffbffb0b473836d45d
|
4
|
+
data.tar.gz: e601657db196282b54e89633eebbaa4633ae1f7951fd0e7a3aa4b6ea00109d48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a3dabef241ebaed7b9a80ed11abd8bbaee63e09fe674743931c4f2c36ca44e436be6e5b44ed44e68cc30899ff080faf803f5e2834074fdc3dfc9abba100248f
|
7
|
+
data.tar.gz: fa024d77336f25d03025d056a2d40f7c16f302658cbcf21afba5a637c9f14ab29f353c135b4c90217f6de46cbc9edb52a4bd0284982d264019762b4978b69c67
|
data/lib/kudzu/config.rb
CHANGED
@@ -5,7 +5,7 @@ module Kudzu
|
|
5
5
|
class Config
|
6
6
|
SIMPLE_CONFIGS = [:config_file,
|
7
7
|
:user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
|
8
|
-
:max_connection, :max_redirect, :max_depth, :default_request_header,
|
8
|
+
:max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
|
9
9
|
:politeness_delay, :handle_cookie,
|
10
10
|
:respect_robots_txt, :respect_nofollow, :respect_noindex,
|
11
11
|
:filters]
|
@@ -16,6 +16,7 @@ module Kudzu
|
|
16
16
|
thread_num: 1,
|
17
17
|
max_connection: 10,
|
18
18
|
max_redirect: 3,
|
19
|
+
max_retry: 0,
|
19
20
|
politeness_delay: 0.5,
|
20
21
|
handle_cookie: true,
|
21
22
|
respect_robots_txt: true,
|
data/lib/kudzu/crawler.rb
CHANGED
@@ -107,13 +107,16 @@ module Kudzu
|
|
107
107
|
|
108
108
|
def fetch(link, request_header)
|
109
109
|
response = nil
|
110
|
-
@
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
110
|
+
(@config.max_retry.to_i + 1).times do
|
111
|
+
@callback.around(:fetch, link, request_header, response) do
|
112
|
+
response = @agent.fetch(link.url, request_header)
|
113
|
+
end
|
114
|
+
if response.fetched?
|
115
|
+
Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
|
116
|
+
else
|
117
|
+
Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
|
118
|
+
end
|
119
|
+
break if !response.fetched? || response.status_success? || response.status_redirection?
|
117
120
|
end
|
118
121
|
response
|
119
122
|
rescue Exception => e
|
@@ -135,7 +138,7 @@ module Kudzu
|
|
135
138
|
|
136
139
|
if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
|
137
140
|
refs = @agent.extract_refs(response)
|
138
|
-
enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
|
141
|
+
enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
|
139
142
|
end
|
140
143
|
|
141
144
|
if @agent.filter_response?(response)
|
@@ -168,8 +171,8 @@ module Kudzu
|
|
168
171
|
end
|
169
172
|
end
|
170
173
|
|
171
|
-
def enqueue_links(links)
|
172
|
-
@callback.around(:enqueue, links) do
|
174
|
+
def enqueue_links(links, response = nil)
|
175
|
+
@callback.around(:enqueue, links, response) do
|
173
176
|
@frontier.enqueue(links)
|
174
177
|
end
|
175
178
|
end
|
data/lib/kudzu/model/link.rb
CHANGED
data/lib/kudzu/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.11
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
214
214
|
- !ruby/object:Gem::Version
|
215
215
|
version: '0'
|
216
216
|
requirements: []
|
217
|
-
rubygems_version: 3.
|
217
|
+
rubygems_version: 3.1.2
|
218
218
|
signing_key:
|
219
219
|
specification_version: 4
|
220
220
|
summary: A simple web crawler for ruby
|