kudzu 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +2 -0
- data/lib/kudzu/adapter/memory/model/link.rb +2 -0
- data/lib/kudzu/adapter/memory/model/page.rb +2 -0
- data/lib/kudzu/adapter/memory/repository.rb +2 -0
- data/lib/kudzu/adapter/memory.rb +2 -0
- data/lib/kudzu/agent/all.rb +2 -0
- data/lib/kudzu/agent/fetcher.rb +2 -0
- data/lib/kudzu/agent/http/connection.rb +2 -0
- data/lib/kudzu/agent/http/connection_pool.rb +2 -0
- data/lib/kudzu/agent/page_filterer.rb +2 -0
- data/lib/kudzu/agent/reference.rb +2 -0
- data/lib/kudzu/agent/response.rb +2 -0
- data/lib/kudzu/agent/robots/parser.rb +2 -0
- data/lib/kudzu/agent/robots/txt.rb +2 -0
- data/lib/kudzu/agent/robots.rb +2 -0
- data/lib/kudzu/agent/sleeper.rb +2 -0
- data/lib/kudzu/agent/url_extractor.rb +2 -0
- data/lib/kudzu/agent/url_filterer.rb +2 -0
- data/lib/kudzu/agent/util/charset_detector.rb +2 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +2 -0
- data/lib/kudzu/agent/util/matcher.rb +2 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +2 -0
- data/lib/kudzu/agent/util/title_parser.rb +2 -0
- data/lib/kudzu/agent.rb +11 -1
- data/lib/kudzu/callback.rb +2 -0
- data/lib/kudzu/common.rb +2 -0
- data/lib/kudzu/config/filter.rb +2 -0
- data/lib/kudzu/config.rb +2 -0
- data/lib/kudzu/crawler.rb +2 -0
- data/lib/kudzu/model/all.rb +2 -0
- data/lib/kudzu/model/base.rb +2 -0
- data/lib/kudzu/model/link.rb +2 -0
- data/lib/kudzu/model/page.rb +2 -0
- data/lib/kudzu/thread_pool.rb +2 -0
- data/lib/kudzu/version.rb +3 -1
- data/lib/kudzu.rb +2 -0
- metadata +5 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a481432d23d78a3eb20587d82f440dc0e3e7375ad605ac8b7afc9f65250ba19f
|
|
4
|
+
data.tar.gz: e45cf1a70ca8f151b3e9b1298d33e07e5101a939aedea5aa959dfce3309bdcba
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4e31a988ca6f30ba60d4832c65376b62f3e285d3204da35ded55d52a4bbf64c9cef8a89b59e2653832f9369af7b57842d30fbf03d8f0003e2efdb0be3ac46b1e
|
|
7
|
+
data.tar.gz: a950a0462839e0b8b115f3b46fd267b55d56001c151cc4893f2c00d17910ee5a5c4cc109d70023d95bbabd89df354e0cd6167010c5b60491121a9386316aa09e
|
data/lib/kudzu/adapter/memory.rb
CHANGED
data/lib/kudzu/agent/all.rb
CHANGED
data/lib/kudzu/agent/fetcher.rb
CHANGED
data/lib/kudzu/agent/response.rb
CHANGED
data/lib/kudzu/agent/robots.rb
CHANGED
data/lib/kudzu/agent/sleeper.rb
CHANGED
data/lib/kudzu/agent.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require_relative 'agent/all'
|
|
2
4
|
|
|
3
5
|
module Kudzu
|
|
@@ -30,13 +32,21 @@ module Kudzu
|
|
|
30
32
|
end
|
|
31
33
|
|
|
32
34
|
def extract_refs(response)
|
|
35
|
+
return [] unless redirect_url_allowed?(response)
|
|
33
36
|
refs = @url_extractor.extract(response)
|
|
34
37
|
@url_filterer.filter(refs, response.url)
|
|
35
38
|
end
|
|
36
39
|
|
|
37
40
|
def filter_response?(response)
|
|
38
|
-
return
|
|
41
|
+
return true unless redirect_url_allowed?(response)
|
|
39
42
|
!@page_filterer.allowed?(response)
|
|
40
43
|
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
def redirect_url_allowed?(response)
|
|
48
|
+
return true if response.redirect_from.nil? || response.redirect_from.empty?
|
|
49
|
+
@url_filterer.allowed?(response.url, response.redirect_from)
|
|
50
|
+
end
|
|
41
51
|
end
|
|
42
52
|
end
|
data/lib/kudzu/callback.rb
CHANGED
data/lib/kudzu/common.rb
CHANGED
data/lib/kudzu/config/filter.rb
CHANGED
data/lib/kudzu/config.rb
CHANGED
data/lib/kudzu/crawler.rb
CHANGED
data/lib/kudzu/model/all.rb
CHANGED
data/lib/kudzu/model/base.rb
CHANGED
data/lib/kudzu/model/link.rb
CHANGED
data/lib/kudzu/model/page.rb
CHANGED
data/lib/kudzu/thread_pool.rb
CHANGED
data/lib/kudzu/version.rb
CHANGED
data/lib/kudzu.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kudzu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.0
|
|
4
|
+
version: 1.3.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yoshikazu Kaneta
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-06-23 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: addressable
|
|
@@ -213,7 +213,7 @@ homepage: https://github.com/kanety/kudzu
|
|
|
213
213
|
licenses:
|
|
214
214
|
- MIT
|
|
215
215
|
metadata: {}
|
|
216
|
-
post_install_message:
|
|
216
|
+
post_install_message:
|
|
217
217
|
rdoc_options: []
|
|
218
218
|
require_paths:
|
|
219
219
|
- lib
|
|
@@ -229,7 +229,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
229
229
|
version: '0'
|
|
230
230
|
requirements: []
|
|
231
231
|
rubygems_version: 3.3.3
|
|
232
|
-
signing_key:
|
|
232
|
+
signing_key:
|
|
233
233
|
specification_version: 4
|
|
234
234
|
summary: A simple web crawler for ruby
|
|
235
235
|
test_files: []
|