kudzu 1.1.9 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1235e19cd09b27f161411c47d33ef361732539bbd1ae1fa801fa764972590b0a
4
- data.tar.gz: 366f376338f82363e2f53b6bc5d0d2d89e695581e6dcc245e60893613334dcc1
3
+ metadata.gz: 16db7469597ff96bff30c59c9b6da73e8c4d33b535228424675c7e7a539e4ec2
4
+ data.tar.gz: ee0eb79b8e5c3e4e3d3832ed1f8e9d0dc291c60f3d626495eb21055e5e23341e
5
5
  SHA512:
6
- metadata.gz: 37d53924ac944bab707c73f5bb698ecfb267f273e33eda38b7f13ee01fe667b35fc6c4941c0d7864fdbe96fd80cdfe00b5699abe4a9749992cf5c426edafeade
7
- data.tar.gz: d6cd02bbd42b4be05070f1ba5a76f5a2220a633ae5184118cd74c6c753707bda6fa7e3f70935a19462e25295edbecdd4ab9657094e208ae3f8d14c5313a1e12a
6
+ metadata.gz: 6d3a060a674cbd7ec9de94db1d94dcb9c0e6eac1f0b6bb20a798547b8eb3f29a69c0d404ca45dd7efc7f11d91de2618ac214f0d316cbe58fd69276878e37a9bf
7
+ data.tar.gz: b423624bc1e7ea849a98cbea9d34b8a1dcfd22f5fd7bee8ce0791e900ac805d2d448776d0a4b7cb1e0be9a9ef033f008700cf16aed62ffa287bccacbb1db0db4
data/README.md CHANGED
@@ -12,7 +12,7 @@ A simple web crawler for ruby.
12
12
 
13
13
  ## Dependencies
14
14
 
15
- * ruby 2.3+
15
+ * ruby 2.5+
16
16
  * libicu
17
17
 
18
18
  ## Installation
@@ -35,7 +35,7 @@ module Kudzu
35
35
  uri.path = uri.path.gsub(%r|/{2,}|, '/')
36
36
  uri.fragment = nil
37
37
 
38
- if uri.scheme.in?(%w(http https))
38
+ if uri.scheme.in?(%w(http https)) && Addressable::URI.parse(uri.to_s)
39
39
  uri.to_s
40
40
  else
41
41
  nil
@@ -22,13 +22,13 @@ module Kudzu
22
22
  end
23
23
 
24
24
  def from_body(body)
25
- mime = MimeMagic.by_magic(StringIO.new(body))
25
+ mime = Marcel::Magic.by_magic(StringIO.new(body))
26
26
  mime.to_s if mime
27
27
  end
28
28
 
29
29
  def from_url(url)
30
30
  uri = Addressable::URI.parse(url)
31
- mime = MimeMagic.by_path(uri.basename)
31
+ mime = Marcel::Magic.by_path(uri.basename)
32
32
  mime.to_s if mime
33
33
  end
34
34
  end
data/lib/kudzu/config.rb CHANGED
@@ -5,7 +5,7 @@ module Kudzu
5
5
  class Config
6
6
  SIMPLE_CONFIGS = [:config_file,
7
7
  :user_agent, :thread_num, :open_timeout, :read_timeout, :keep_alive,
8
- :max_connection, :max_redirect, :max_depth, :default_request_header,
8
+ :max_connection, :max_redirect, :max_depth, :max_retry, :default_request_header,
9
9
  :politeness_delay, :handle_cookie,
10
10
  :respect_robots_txt, :respect_nofollow, :respect_noindex,
11
11
  :filters]
@@ -16,6 +16,7 @@ module Kudzu
16
16
  thread_num: 1,
17
17
  max_connection: 10,
18
18
  max_redirect: 3,
19
+ max_retry: 0,
19
20
  politeness_delay: 0.5,
20
21
  handle_cookie: true,
21
22
  respect_robots_txt: true,
@@ -27,7 +28,7 @@ module Kudzu
27
28
  def initialize(config = {}, &block)
28
29
  self.filters = {}
29
30
  DEFAULT_CONFIG.merge(config).each do |key, value|
30
- send("#{key}=", value)
31
+ send("#{key}=", value) if respond_to?("#{key}=")
31
32
  end
32
33
  if config_file || block
33
34
  delegator = Delegator.new(self)
data/lib/kudzu/crawler.rb CHANGED
@@ -107,13 +107,16 @@ module Kudzu
107
107
 
108
108
  def fetch(link, request_header)
109
109
  response = nil
110
- @callback.around(:fetch, link, request_header, response) do
111
- response = @agent.fetch(link.url, request_header)
112
- end
113
- if response.fetched?
114
- Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
- else
116
- Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
110
+ (@config.max_retry.to_i + 1).times do
111
+ @callback.around(:fetch, link, request_header, response) do
112
+ response = @agent.fetch(link.url, request_header)
113
+ end
114
+ if response.fetched?
115
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
116
+ else
117
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
118
+ end
119
+ break if !response.fetched? || response.status_success? || response.status_redirection?
117
120
  end
118
121
  response
119
122
  rescue Exception => e
@@ -135,7 +138,7 @@ module Kudzu
135
138
 
136
139
  if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
140
  refs = @agent.extract_refs(response)
138
- enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
141
+ enqueue_links(refs_to_links(refs, link.depth + 1), response) unless refs.empty?
139
142
  end
140
143
 
141
144
  if @agent.filter_response?(response)
@@ -168,8 +171,8 @@ module Kudzu
168
171
  end
169
172
  end
170
173
 
171
- def enqueue_links(links)
172
- @callback.around(:enqueue, links) do
174
+ def enqueue_links(links, response = nil)
175
+ @callback.around(:enqueue, links, response) do
173
176
  @frontier.enqueue(links)
174
177
  end
175
178
  end
@@ -4,6 +4,14 @@ module Kudzu
4
4
  def uri
5
5
  Addressable::URI.parse(url)
6
6
  end
7
+
8
+ def status_success?
9
+ 200 <= status && status <= 299
10
+ end
11
+
12
+ def status_redirection?
13
+ 300 <= status && status <= 399
14
+ end
7
15
  end
8
16
  end
9
17
  end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.1.9'
2
+ VERSION = '1.3.0'
3
3
  end
data/lib/kudzu.rb CHANGED
@@ -2,8 +2,7 @@ require 'net/http'
2
2
  require 'http-cookie'
3
3
  require 'addressable'
4
4
  require 'nokogiri'
5
- require 'mimemagic'
6
- require 'mimemagic/overlay'
5
+ require 'marcel'
7
6
  require 'charlock_holmes'
8
7
 
9
8
  require 'kudzu/version'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-27 00:00:00.000000000 Z
11
+ date: 2022-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -53,7 +53,7 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: mimemagic
56
+ name: marcel
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: webrick
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: rspec-rails
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -214,8 +228,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
214
228
  - !ruby/object:Gem::Version
215
229
  version: '0'
216
230
  requirements: []
217
- rubyforge_project:
218
- rubygems_version: 2.7.6
231
+ rubygems_version: 3.3.3
219
232
  signing_key:
220
233
  specification_version: 4
221
234
  summary: A simple web crawler for ruby