kudzu 1.1.11 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e644c892ed66d9f00577bc01c4a47ab8c2f8650f1ad412de119efd56d3027d4a
4
- data.tar.gz: afe03192b1a584ae8cf04b18404c3ff751edf5ac9a2c99289b84263891d6c187
3
+ metadata.gz: 232a54685462f7415227ddf7ce2efd31d0046cb5bf7756ffbffb0b473836d45d
4
+ data.tar.gz: e601657db196282b54e89633eebbaa4633ae1f7951fd0e7a3aa4b6ea00109d48
5
5
  SHA512:
6
- metadata.gz: b57e8d06b027f03dbee49cfb93bf961a1bf7bbe62a2660da865f9aec7905c21484ffe6127840d54812e33b25f51b26229aaf01df6f0618f83d977321db7c7390
7
- data.tar.gz: a39396c80a0efbfe21bc33492995c897f094d7b81c134b75bfa594a2247d980ab6b385d658a8b1a697a67cb30dbefaea92c3cfd490deb33dc00400aebadc9a9c
6
+ metadata.gz: 5a3dabef241ebaed7b9a80ed11abd8bbaee63e09fe674743931c4f2c36ca44e436be6e5b44ed44e68cc30899ff080faf803f5e2834074fdc3dfc9abba100248f
7
+ data.tar.gz: fa024d77336f25d03025d056a2d40f7c16f302658cbcf21afba5a637c9f14ab29f353c135b4c90217f6de46cbc9edb52a4bd0284982d264019762b4978b69c67
data/lib/kudzu/config.rb CHANGED
@@ -5,7 +5,7 @@ module Kudzu
5
5
  class Config
6
6
  SIMPLE_CONFIGS = [:config_file,
7
7
  :user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
8
- :max_connection, :max_redirect, :max_depth, :default_request_header,
8
+ :max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
9
9
  :politeness_delay, :handle_cookie,
10
10
  :respect_robots_txt, :respect_nofollow, :respect_noindex,
11
11
  :filters]
@@ -16,6 +16,7 @@ module Kudzu
16
16
  thread_num: 1,
17
17
  max_connection: 10,
18
18
  max_redirect: 3,
19
+ max_retry: 0,
19
20
  politeness_delay: 0.5,
20
21
  handle_cookie: true,
21
22
  respect_robots_txt: true,
data/lib/kudzu/crawler.rb CHANGED
@@ -107,13 +107,16 @@ module Kudzu
107
107
 
108
108
  def fetch(link, request_header)
109
109
  response = nil
110
- @callback.around(:fetch, link, request_header, response) do
111
- response = @agent.fetch(link.url, request_header)
112
- end
113
- if response.fetched?
114
- Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
- else
116
- Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
110
+ (@config.max_retry.to_i + 1).times do
111
+ @callback.around(:fetch, link, request_header, response) do
112
+ response = @agent.fetch(link.url, request_header)
113
+ end
114
+ if response.fetched?
115
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
116
+ else
117
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
118
+ end
119
+ break if !response.fetched? || response.status_success? || response.status_redirection?
117
120
  end
118
121
  response
119
122
  rescue Exception => e
@@ -135,7 +138,7 @@ module Kudzu
135
138
 
136
139
  if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
140
  refs = @agent.extract_refs(response)
138
- enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
141
+ enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
139
142
  end
140
143
 
141
144
  if @agent.filter_response?(response)
@@ -168,8 +171,8 @@ module Kudzu
168
171
  end
169
172
  end
170
173
 
171
- def enqueue_links(links)
172
- @callback.around(:enqueue, links) do
174
+ def enqueue_links(links, response = nil)
175
+ @callback.around(:enqueue, links, response) do
173
176
  @frontier.enqueue(links)
174
177
  end
175
178
  end
@@ -4,6 +4,14 @@ module Kudzu
4
4
  def uri
5
5
  Addressable::URI.parse(url)
6
6
  end
7
+
8
+ def status_success?
9
+ 200 <= status && status <= 299
10
+ end
11
+
12
+ def status_redirection?
13
+ 300 <= status && status <= 399
14
+ end
7
15
  end
8
16
  end
9
17
  end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.1.11'
2
+ VERSION = '1.2.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.11
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-17 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
214
  - !ruby/object:Gem::Version
215
215
  version: '0'
216
216
  requirements: []
217
- rubygems_version: 3.0.3
217
+ rubygems_version: 3.1.2
218
218
  signing_key:
219
219
  specification_version: 4
220
220
  summary: A simple web crawler for ruby