kudzu 1.1.7 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: adc0bdb2f1017f8f4e8abcc2f0e6416c88b2110ce9f345957c6233484505509c
4
- data.tar.gz: 49a6c3166b4d499177a144987623d17e516c4a6d84674af81d883dddbfee3b5b
3
+ metadata.gz: 232a54685462f7415227ddf7ce2efd31d0046cb5bf7756ffbffb0b473836d45d
4
+ data.tar.gz: e601657db196282b54e89633eebbaa4633ae1f7951fd0e7a3aa4b6ea00109d48
5
5
  SHA512:
6
- metadata.gz: 7eba18403646beab304d92b9826bb774f4d2b2f59b365402b2aeadb3ae5e05a50f0604a5c8ba7ccfe63176db82ea85bf7759697b1069adda860a8c2e4ef56bff
7
- data.tar.gz: 456b279651849f18308de4ab93cf76e4644988fdbb5ac023fa32b6d5cc163298236e374c8323227d2540854f4579d08e137caf844bf49c41235dd5a112b9dc6f
6
+ metadata.gz: 5a3dabef241ebaed7b9a80ed11abd8bbaee63e09fe674743931c4f2c36ca44e436be6e5b44ed44e68cc30899ff080faf803f5e2834074fdc3dfc9abba100248f
7
+ data.tar.gz: fa024d77336f25d03025d056a2d40f7c16f302658cbcf21afba5a637c9f14ab29f353c135b4c90217f6de46cbc9edb52a4bd0284982d264019762b4978b69c67
@@ -83,12 +83,18 @@ module Kudzu
83
83
  Response.new(url: url,
84
84
  status: response.code.to_i,
85
85
  body: fetched ? response.body.to_s : nil,
86
- response_header: Hash[response.each.to_a],
86
+ response_header: force_header_encoding(Hash[response.each.to_a]),
87
87
  response_time: response_time,
88
88
  redirect_from: redirect_from,
89
89
  fetched: fetched)
90
90
  end
91
91
 
92
+ def force_header_encoding(response_header)
93
+ response_header.each do |key, value|
94
+ response_header[key] = value.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace)
95
+ end
96
+ end
97
+
92
98
  def redirection?(code)
93
99
  code = code.to_i
94
100
  300 <= code && code <= 399
@@ -35,7 +35,7 @@ module Kudzu
35
35
  uri.path = uri.path.gsub(%r|/{2,}|, '/')
36
36
  uri.fragment = nil
37
37
 
38
- if uri.scheme.in?(%w(http https))
38
+ if uri.scheme.in?(%w(http https)) && Addressable::URI.parse(uri.to_s)
39
39
  uri.to_s
40
40
  else
41
41
  nil
data/lib/kudzu/common.rb CHANGED
@@ -15,7 +15,8 @@ module Kudzu
15
15
  if path.end_with?('/')
16
16
  path
17
17
  else
18
- File.dirname(path) + '/'
18
+ dir = File.dirname(path)
19
+ dir.end_with?('/') ? dir : dir + '/'
19
20
  end
20
21
  end
21
22
  end
data/lib/kudzu/config.rb CHANGED
@@ -5,7 +5,7 @@ module Kudzu
5
5
  class Config
6
6
  SIMPLE_CONFIGS = [:config_file,
7
7
  :user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
8
- :max_connection, :max_redirect, :max_depth, :default_request_header,
8
+ :max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
9
9
  :politeness_delay, :handle_cookie,
10
10
  :respect_robots_txt, :respect_nofollow, :respect_noindex,
11
11
  :filters]
@@ -16,6 +16,7 @@ module Kudzu
16
16
  thread_num: 1,
17
17
  max_connection: 10,
18
18
  max_redirect: 3,
19
+ max_retry: 0,
19
20
  politeness_delay: 0.5,
20
21
  handle_cookie: true,
21
22
  respect_robots_txt: true,
@@ -27,7 +28,7 @@ module Kudzu
27
28
  def initialize(config = {}, &block)
28
29
  self.filters = {}
29
30
  DEFAULT_CONFIG.merge(config).each do |key, value|
30
- send("#{key}=", value)
31
+ send("#{key}=", value) if respond_to?("#{key}=")
31
32
  end
32
33
  if config_file || block
33
34
  delegator = Delegator.new(self)
data/lib/kudzu/crawler.rb CHANGED
@@ -107,13 +107,16 @@ module Kudzu
107
107
 
108
108
  def fetch(link, request_header)
109
109
  response = nil
110
- @callback.around(:fetch, link, request_header, response) do
111
- response = @agent.fetch(link.url, request_header)
112
- end
113
- if response.fetched?
114
- Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
- else
116
- Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
110
+ (@config.max_retry.to_i + 1).times do
111
+ @callback.around(:fetch, link, request_header, response) do
112
+ response = @agent.fetch(link.url, request_header)
113
+ end
114
+ if response.fetched?
115
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
116
+ else
117
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
118
+ end
119
+ break if !response.fetched? || response.status_success? || response.status_redirection?
117
120
  end
118
121
  response
119
122
  rescue Exception => e
@@ -135,7 +138,7 @@ module Kudzu
135
138
 
136
139
  if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
140
  refs = @agent.extract_refs(response)
138
- enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
141
+ enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
139
142
  end
140
143
 
141
144
  if @agent.filter_response?(response)
@@ -168,8 +171,8 @@ module Kudzu
168
171
  end
169
172
  end
170
173
 
171
- def enqueue_links(links)
172
- @callback.around(:enqueue, links) do
174
+ def enqueue_links(links, response = nil)
175
+ @callback.around(:enqueue, links, response) do
173
176
  @frontier.enqueue(links)
174
177
  end
175
178
  end
@@ -4,6 +4,14 @@ module Kudzu
4
4
  def uri
5
5
  Addressable::URI.parse(url)
6
6
  end
7
+
8
+ def status_success?
9
+ 200 <= status && status <= 299
10
+ end
11
+
12
+ def status_redirection?
13
+ 300 <= status && status <= 399
14
+ end
7
15
  end
8
16
  end
9
17
  end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.1.7'
2
+ VERSION = '1.2.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-08 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -214,8 +214,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
214
  - !ruby/object:Gem::Version
215
215
  version: '0'
216
216
  requirements: []
217
- rubyforge_project:
218
- rubygems_version: 2.7.4
217
+ rubygems_version: 3.1.2
219
218
  signing_key:
220
219
  specification_version: 4
221
220
  summary: A simple web crawler for ruby