kudzu 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e644c892ed66d9f00577bc01c4a47ab8c2f8650f1ad412de119efd56d3027d4a
4
- data.tar.gz: afe03192b1a584ae8cf04b18404c3ff751edf5ac9a2c99289b84263891d6c187
3
+ metadata.gz: 232a54685462f7415227ddf7ce2efd31d0046cb5bf7756ffbffb0b473836d45d
4
+ data.tar.gz: e601657db196282b54e89633eebbaa4633ae1f7951fd0e7a3aa4b6ea00109d48
5
5
  SHA512:
6
- metadata.gz: b57e8d06b027f03dbee49cfb93bf961a1bf7bbe62a2660da865f9aec7905c21484ffe6127840d54812e33b25f51b26229aaf01df6f0618f83d977321db7c7390
7
- data.tar.gz: a39396c80a0efbfe21bc33492995c897f094d7b81c134b75bfa594a2247d980ab6b385d658a8b1a697a67cb30dbefaea92c3cfd490deb33dc00400aebadc9a9c
6
+ metadata.gz: 5a3dabef241ebaed7b9a80ed11abd8bbaee63e09fe674743931c4f2c36ca44e436be6e5b44ed44e68cc30899ff080faf803f5e2834074fdc3dfc9abba100248f
7
+ data.tar.gz: fa024d77336f25d03025d056a2d40f7c16f302658cbcf21afba5a637c9f14ab29f353c135b4c90217f6de46cbc9edb52a4bd0284982d264019762b4978b69c67
data/lib/kudzu/config.rb CHANGED
@@ -5,7 +5,7 @@ module Kudzu
5
5
  class Config
6
6
  SIMPLE_CONFIGS = [:config_file,
7
7
  :user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
8
- :max_connection, :max_redirect, :max_depth, :default_request_header,
8
+ :max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
9
9
  :politeness_delay, :handle_cookie,
10
10
  :respect_robots_txt, :respect_nofollow, :respect_noindex,
11
11
  :filters]
@@ -16,6 +16,7 @@ module Kudzu
16
16
  thread_num: 1,
17
17
  max_connection: 10,
18
18
  max_redirect: 3,
19
+ max_retry: 0,
19
20
  politeness_delay: 0.5,
20
21
  handle_cookie: true,
21
22
  respect_robots_txt: true,
data/lib/kudzu/crawler.rb CHANGED
@@ -107,13 +107,16 @@ module Kudzu
107
107
 
108
108
  def fetch(link, request_header)
109
109
  response = nil
110
- @callback.around(:fetch, link, request_header, response) do
111
- response = @agent.fetch(link.url, request_header)
112
- end
113
- if response.fetched?
114
- Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
- else
116
- Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
110
+ (@config.max_retry.to_i + 1).times do
111
+ @callback.around(:fetch, link, request_header, response) do
112
+ response = @agent.fetch(link.url, request_header)
113
+ end
114
+ if response.fetched?
115
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
116
+ else
117
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
118
+ end
119
+ break if !response.fetched? || response.status_success? || response.status_redirection?
117
120
  end
118
121
  response
119
122
  rescue Exception => e
@@ -135,7 +138,7 @@ module Kudzu
135
138
 
136
139
  if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
140
  refs = @agent.extract_refs(response)
138
- enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
141
+ enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
139
142
  end
140
143
 
141
144
  if @agent.filter_response?(response)
@@ -168,8 +171,8 @@ module Kudzu
168
171
  end
169
172
  end
170
173
 
171
- def enqueue_links(links)
172
- @callback.around(:enqueue, links) do
174
+ def enqueue_links(links, response = nil)
175
+ @callback.around(:enqueue, links, response) do
173
176
  @frontier.enqueue(links)
174
177
  end
175
178
  end
@@ -4,6 +4,14 @@ module Kudzu
4
4
  def uri
5
5
  Addressable::URI.parse(url)
6
6
  end
7
+
8
+ def status_success?
9
+ 200 <= status && status <= 299
10
+ end
11
+
12
+ def status_redirection?
13
+ 300 <= status && status <= 399
14
+ end
7
15
  end
8
16
  end
9
17
  end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.1.11'
2
+ VERSION = '1.2.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.11
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-17 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -214,7 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
214
  - !ruby/object:Gem::Version
215
215
  version: '0'
216
216
  requirements: []
217
- rubygems_version: 3.0.3
217
+ rubygems_version: 3.1.2
218
218
  signing_key:
219
219
  specification_version: 4
220
220
  summary: A simple web crawler for ruby