kudzu 1.1.9 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1235e19cd09b27f161411c47d33ef361732539bbd1ae1fa801fa764972590b0a
4
- data.tar.gz: 366f376338f82363e2f53b6bc5d0d2d89e695581e6dcc245e60893613334dcc1
3
+ metadata.gz: 16db7469597ff96bff30c59c9b6da73e8c4d33b535228424675c7e7a539e4ec2
4
+ data.tar.gz: ee0eb79b8e5c3e4e3d3832ed1f8e9d0dc291c60f3d626495eb21055e5e23341e
5
5
  SHA512:
6
- metadata.gz: 37d53924ac944bab707c73f5bb698ecfb267f273e33eda38b7f13ee01fe667b35fc6c4941c0d7864fdbe96fd80cdfe00b5699abe4a9749992cf5c426edafeade
7
- data.tar.gz: d6cd02bbd42b4be05070f1ba5a76f5a2220a633ae5184118cd74c6c753707bda6fa7e3f70935a19462e25295edbecdd4ab9657094e208ae3f8d14c5313a1e12a
6
+ metadata.gz: 6d3a060a674cbd7ec9de94db1d94dcb9c0e6eac1f0b6bb20a798547b8eb3f29a69c0d404ca45dd7efc7f11d91de2618ac214f0d316cbe58fd69276878e37a9bf
7
+ data.tar.gz: b423624bc1e7ea849a98cbea9d34b8a1dcfd22f5fd7bee8ce0791e900ac805d2d448776d0a4b7cb1e0be9a9ef033f008700cf16aed62ffa287bccacbb1db0db4
data/README.md CHANGED
@@ -12,7 +12,7 @@ A simple web crawler for ruby.
12
12
 
13
13
  ## Dependencies
14
14
 
15
- * ruby 2.3+
15
+ * ruby 2.5+
16
16
  * libicu
17
17
 
18
18
  ## Installation
@@ -35,7 +35,7 @@ module Kudzu
35
35
  uri.path = uri.path.gsub(%r|/{2,}|, '/')
36
36
  uri.fragment = nil
37
37
 
38
- if uri.scheme.in?(%w(http https))
38
+ if uri.scheme.in?(%w(http https)) && Addressable::URI.parse(uri.to_s)
39
39
  uri.to_s
40
40
  else
41
41
  nil
@@ -22,13 +22,13 @@ module Kudzu
22
22
  end
23
23
 
24
24
  def from_body(body)
25
- mime = MimeMagic.by_magic(StringIO.new(body))
25
+ mime = Marcel::Magic.by_magic(StringIO.new(body))
26
26
  mime.to_s if mime
27
27
  end
28
28
 
29
29
  def from_url(url)
30
30
  uri = Addressable::URI.parse(url)
31
- mime = MimeMagic.by_path(uri.basename)
31
+ mime = Marcel::Magic.by_path(uri.basename)
32
32
  mime.to_s if mime
33
33
  end
34
34
  end
data/lib/kudzu/config.rb CHANGED
@@ -5,7 +5,7 @@ module Kudzu
5
5
  class Config
6
6
  SIMPLE_CONFIGS = [:config_file,
7
7
  :user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
8
- :max_connection, :max_redirect, :max_depth, :default_request_header,
8
+ :max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
9
9
  :politeness_delay, :handle_cookie,
10
10
  :respect_robots_txt, :respect_nofollow, :respect_noindex,
11
11
  :filters]
@@ -16,6 +16,7 @@ module Kudzu
16
16
  thread_num: 1,
17
17
  max_connection: 10,
18
18
  max_redirect: 3,
19
+ max_retry: 0,
19
20
  politeness_delay: 0.5,
20
21
  handle_cookie: true,
21
22
  respect_robots_txt: true,
@@ -27,7 +28,7 @@ module Kudzu
27
28
  def initialize(config = {}, &block)
28
29
  self.filters = {}
29
30
  DEFAULT_CONFIG.merge(config).each do |key, value|
30
- send("#{key}=", value)
31
+ send("#{key}=", value) if respond_to?("#{key}=")
31
32
  end
32
33
  if config_file || block
33
34
  delegator = Delegator.new(self)
data/lib/kudzu/crawler.rb CHANGED
@@ -107,13 +107,16 @@ module Kudzu
107
107
 
108
108
  def fetch(link, request_header)
109
109
  response = nil
110
- @callback.around(:fetch, link, request_header, response) do
111
- response = @agent.fetch(link.url, request_header)
112
- end
113
- if response.fetched?
114
- Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
- else
116
- Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
110
+ (@config.max_retry.to_i + 1).times do
111
+ @callback.around(:fetch, link, request_header, response) do
112
+ response = @agent.fetch(link.url, request_header)
113
+ end
114
+ if response.fetched?
115
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
116
+ else
117
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
118
+ end
119
+ break if !response.fetched? || response.status_success? || response.status_redirection?
117
120
  end
118
121
  response
119
122
  rescue Exception => e
@@ -135,7 +138,7 @@ module Kudzu
135
138
 
136
139
  if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
140
  refs = @agent.extract_refs(response)
138
- enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
141
+ enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
139
142
  end
140
143
 
141
144
  if @agent.filter_response?(response)
@@ -168,8 +171,8 @@ module Kudzu
168
171
  end
169
172
  end
170
173
 
171
- def enqueue_links(links)
172
- @callback.around(:enqueue, links) do
174
+ def enqueue_links(links, response = nil)
175
+ @callback.around(:enqueue, links, response) do
173
176
  @frontier.enqueue(links)
174
177
  end
175
178
  end
@@ -4,6 +4,14 @@ module Kudzu
4
4
  def uri
5
5
  Addressable::URI.parse(url)
6
6
  end
7
+
8
+ def status_success?
9
+ 200 <= status && status <= 299
10
+ end
11
+
12
+ def status_redirection?
13
+ 300 <= status && status <= 399
14
+ end
7
15
  end
8
16
  end
9
17
  end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.1.9'
2
+ VERSION = '1.3.0'
3
3
  end
data/lib/kudzu.rb CHANGED
@@ -2,8 +2,7 @@ require 'net/http'
2
2
  require 'http-cookie'
3
3
  require 'addressable'
4
4
  require 'nokogiri'
5
- require 'mimemagic'
6
- require 'mimemagic/overlay'
5
+ require 'marcel'
7
6
  require 'charlock_holmes'
8
7
 
9
8
  require 'kudzu/version'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-27 00:00:00.000000000 Z
11
+ date: 2022-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -53,7 +53,7 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: mimemagic
56
+ name: marcel
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: webrick
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rspec-rails
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -214,8 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
228
  - !ruby/object:Gem::Version
215
229
  version: '0'
216
230
  requirements: []
217
- rubyforge_project:
218
- rubygems_version: 2.7.6
231
+ rubygems_version: 3.3.3
219
232
  signing_key:
220
233
  specification_version: 4
221
234
  summary: A simple web crawler for ruby