kudzu 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kudzu/adapter/memory/frontier.rb +1 -1
- data/lib/kudzu/adapter/memory/model/link.rb +2 -6
- data/lib/kudzu/adapter/memory/model/page.rb +3 -8
- data/lib/kudzu/adapter/memory/repository.rb +0 -2
- data/lib/kudzu/adapter/memory.rb +3 -4
- data/lib/kudzu/agent/all.rb +1 -1
- data/lib/kudzu/agent/fetcher.rb +46 -49
- data/lib/kudzu/agent/http/connection.rb +9 -0
- data/lib/kudzu/agent/http/connection_pool.rb +50 -0
- data/lib/kudzu/agent/page_filterer.rb +58 -0
- data/lib/kudzu/agent/reference.rb +9 -0
- data/lib/kudzu/agent/response.rb +14 -0
- data/lib/kudzu/agent/robots/parser.rb +91 -0
- data/lib/kudzu/agent/robots/txt.rb +34 -0
- data/lib/kudzu/agent/robots.rb +12 -123
- data/lib/kudzu/agent/sleeper.rb +2 -2
- data/lib/kudzu/agent/url_extractor.rb +60 -46
- data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
- data/lib/kudzu/agent/util/charset_detector.rb +84 -0
- data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
- data/lib/kudzu/agent/util/matcher.rb +25 -0
- data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
- data/lib/kudzu/agent/util/title_parser.rb +30 -0
- data/lib/kudzu/agent.rb +42 -0
- data/lib/kudzu/callback.rb +4 -2
- data/lib/kudzu/config/filter.rb +11 -11
- data/lib/kudzu/config.rb +20 -25
- data/lib/kudzu/crawler.rb +65 -146
- data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
- data/lib/kudzu/model/base.rb +9 -0
- data/lib/kudzu/model/link.rb +9 -0
- data/lib/kudzu/model/page.rb +112 -0
- data/lib/kudzu/thread_pool.rb +36 -0
- data/lib/kudzu/version.rb +1 -1
- data/lib/kudzu.rb +21 -3
- metadata +21 -19
- data/lib/kudzu/adapter/base/link.rb +0 -8
- data/lib/kudzu/adapter/base/page.rb +0 -106
- data/lib/kudzu/adapter/memory/all.rb +0 -3
- data/lib/kudzu/agent/charset_detector.rb +0 -84
- data/lib/kudzu/agent/filter.rb +0 -40
- data/lib/kudzu/agent/mime_type_detector.rb +0 -34
- data/lib/kudzu/agent/title_parser.rb +0 -16
- data/lib/kudzu/logger.rb +0 -20
- data/lib/kudzu/revisit/all.rb +0 -3
- data/lib/kudzu/revisit/scheduler.rb +0 -28
- data/lib/kudzu/util/all.rb +0 -3
- data/lib/kudzu/util/connection_pool.rb +0 -56
- data/lib/kudzu/util/content_type_parser.rb +0 -24
- data/lib/kudzu/util/matcher.rb +0 -21
- data/lib/kudzu/util/thread_pool.rb +0 -38
data/lib/kudzu/crawler.rb
CHANGED
@@ -1,57 +1,39 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative 'model/all'
|
2
|
+
require_relative 'adapter/memory'
|
3
|
+
require_relative 'agent'
|
4
|
+
require_relative 'callback'
|
3
5
|
require_relative 'common'
|
4
6
|
require_relative 'config'
|
5
|
-
require_relative '
|
6
|
-
require_relative 'logger'
|
7
|
-
require_relative 'adapter/memory'
|
8
|
-
require_relative 'util/all'
|
9
|
-
require_relative 'agent/all'
|
10
|
-
require_relative 'revisit/all'
|
7
|
+
require_relative 'thread_pool'
|
11
8
|
|
12
9
|
module Kudzu
|
13
10
|
class Crawler
|
14
11
|
attr_reader :uuid, :config
|
15
|
-
attr_reader :frontier, :repository
|
12
|
+
attr_reader :frontier, :repository, :agent
|
16
13
|
|
17
14
|
def initialize(options = {}, &block)
|
18
15
|
@uuid = options[:uuid] || SecureRandom.uuid
|
19
16
|
@config = Kudzu::Config.new(options, &block)
|
20
|
-
end
|
21
|
-
|
22
|
-
def prepare(&block)
|
23
|
-
@logger = Kudzu::Logger.new(@config.log_file, @config.log_level)
|
24
|
-
@callback = Kudzu::Callback.new(&block)
|
25
17
|
|
26
18
|
@frontier = Kudzu.adapter::Frontier.new(@uuid)
|
27
19
|
@repository = Kudzu.adapter::Repository.new
|
28
|
-
|
29
|
-
@robots = Kudzu::Agent::Robots.new(@config)
|
30
|
-
@page_fetcher = Kudzu::Agent::Fetcher.new(@config, @robots)
|
31
|
-
@page_filter = Kudzu::Agent::Filter.new(@config)
|
32
|
-
@charset_detector = Kudzu::Agent::CharsetDetector.new
|
33
|
-
@mime_type_detector = Kudzu::Agent::MimeTypeDetector.new
|
34
|
-
@title_parser = Kudzu::Agent::TitleParser.new
|
35
|
-
|
36
|
-
@url_extractor = Kudzu::Agent::UrlExtractor.new(@config)
|
37
|
-
@url_filter = Kudzu::Agent::UrlFilter.new(@config)
|
38
|
-
|
39
|
-
@revisit_scheduler = Kudzu::Revisit::Scheduler.new(@config)
|
20
|
+
@agent = Kudzu.agent.new(@config)
|
40
21
|
end
|
41
22
|
|
42
23
|
def run(seed_url, &block)
|
43
|
-
|
24
|
+
@callback = Kudzu::Callback.new(&block)
|
44
25
|
|
45
|
-
|
46
|
-
|
26
|
+
seed_refs = Array(seed_url).map { |url| Kudzu::Agent::Reference.new(url: url) }
|
27
|
+
enqueue_links(refs_to_links(seed_refs, 1))
|
47
28
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
29
|
+
@agent.start do
|
30
|
+
if @config.thread_num.to_i <= 1
|
31
|
+
single_thread
|
32
|
+
else
|
33
|
+
multi_thread(@config.thread_num)
|
34
|
+
end
|
52
35
|
end
|
53
36
|
|
54
|
-
@page_fetcher.pool.close
|
55
37
|
@frontier.clear
|
56
38
|
end
|
57
39
|
|
@@ -66,7 +48,7 @@ module Kudzu
|
|
66
48
|
end
|
67
49
|
|
68
50
|
def multi_thread(thread_num)
|
69
|
-
@thread_pool = Kudzu::
|
51
|
+
@thread_pool = Kudzu::ThreadPool.new(thread_num)
|
70
52
|
|
71
53
|
@thread_pool.start do |queue|
|
72
54
|
limit_num = [thread_num - queue.size, 0].max
|
@@ -82,22 +64,25 @@ module Kudzu
|
|
82
64
|
end
|
83
65
|
|
84
66
|
def visit_link(link)
|
85
|
-
|
86
|
-
response = fetch_link(link, build_request_header(page))
|
67
|
+
response = fetch(link, @config.default_request_header.to_h)
|
87
68
|
return unless response
|
88
69
|
|
89
|
-
page = @repository.find_by_url(response.url)
|
70
|
+
page = @repository.find_by_url(response.url)
|
90
71
|
page.url = response.url
|
91
72
|
page.status = response.status
|
92
|
-
page.response_time = response.
|
73
|
+
page.response_time = response.response_time
|
93
74
|
page.fetched_at = Time.now
|
94
75
|
|
95
|
-
if
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
76
|
+
if response.fetched?
|
77
|
+
if page.status_success?
|
78
|
+
handle_success(page, link, response)
|
79
|
+
elsif page.status_not_modified?
|
80
|
+
register_page(page)
|
81
|
+
elsif page.status_not_found? || page.status_gone?
|
82
|
+
delete_page(page)
|
83
|
+
end
|
84
|
+
else
|
85
|
+
page.filtered = true
|
101
86
|
delete_page(page)
|
102
87
|
end
|
103
88
|
|
@@ -120,113 +105,44 @@ module Kudzu
|
|
120
105
|
end
|
121
106
|
end
|
122
107
|
|
123
|
-
def
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
108
|
+
def fetch(link, request_header)
|
109
|
+
response = nil
|
110
|
+
@callback.around(:fetch, link, request_header, response) do
|
111
|
+
response = @agent.fetch(link.url, request_header)
|
112
|
+
end
|
113
|
+
if response.fetched?
|
114
|
+
Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
|
115
|
+
else
|
116
|
+
Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
|
128
117
|
end
|
129
|
-
header
|
130
|
-
end
|
131
|
-
|
132
|
-
def fetch_link(link, request_header)
|
133
|
-
response = @page_fetcher.fetch(link.url, request_header: request_header)
|
134
|
-
@logger.log :info, "page fetched: #{response.status} #{response.url}"
|
135
118
|
response
|
136
119
|
rescue Exception => e
|
137
|
-
|
120
|
+
Kudzu.log :warn, "failed to fetch page: #{link.url}", error: e
|
138
121
|
@callback.on(:failure, link, e)
|
139
122
|
nil
|
140
123
|
end
|
141
124
|
|
142
125
|
def handle_success(page, link, response)
|
143
|
-
|
144
|
-
@revisit_scheduler.schedule(page, modified: page.digest != digest)
|
145
|
-
|
146
|
-
page.response_header = response.header
|
126
|
+
page.response_header = response.response_header
|
147
127
|
page.body = response.body
|
148
|
-
page.size = response.
|
149
|
-
page.mime_type =
|
150
|
-
page.charset =
|
151
|
-
page.title =
|
152
|
-
page.redirect_from =
|
153
|
-
page.revised_at = Time.now if page.digest != digest
|
154
|
-
page.digest = digest
|
128
|
+
page.size = response.size
|
129
|
+
page.mime_type = response.mime_type
|
130
|
+
page.charset = response.charset
|
131
|
+
page.title = response.title
|
132
|
+
page.redirect_from = response.redirect_from
|
133
|
+
page.revised_at = Time.now if page.digest != response.digest
|
134
|
+
page.digest = response.digest
|
155
135
|
|
156
|
-
if
|
157
|
-
|
158
|
-
|
136
|
+
if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
|
137
|
+
refs = @agent.extract_refs(response)
|
138
|
+
enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
|
159
139
|
end
|
160
140
|
|
161
|
-
if
|
162
|
-
register_page(page)
|
163
|
-
else
|
141
|
+
if @agent.filter_response?(response)
|
164
142
|
page.filtered = true
|
165
143
|
delete_page(page)
|
166
|
-
end
|
167
|
-
end
|
168
|
-
|
169
|
-
def detect_mime_type(page)
|
170
|
-
@mime_type_detector.detect(page)
|
171
|
-
rescue => e
|
172
|
-
@logger.log :warn, "couldn't detect mime type for #{page.url}", error: e
|
173
|
-
nil
|
174
|
-
end
|
175
|
-
|
176
|
-
def detect_charset(page)
|
177
|
-
if page.text?
|
178
|
-
@charset_detector.detect(page)
|
179
|
-
else
|
180
|
-
nil
|
181
|
-
end
|
182
|
-
rescue => e
|
183
|
-
@logger.log :warn, "couldn't detect charset for #{page.url}", error: e
|
184
|
-
nil
|
185
|
-
end
|
186
|
-
|
187
|
-
def parse_title(page)
|
188
|
-
if page.html?
|
189
|
-
@title_parser.parse(page)
|
190
144
|
else
|
191
|
-
|
192
|
-
end
|
193
|
-
rescue => e
|
194
|
-
@logger.log :warn, "couldn't parse title for #{page.url}", error: e
|
195
|
-
nil
|
196
|
-
end
|
197
|
-
|
198
|
-
def follow_hrefs_from?(page, link)
|
199
|
-
(page.html? || page.xml?) && (@config.max_depth.nil? || link.depth < @config.max_depth.to_i)
|
200
|
-
end
|
201
|
-
|
202
|
-
def extract_hrefs(page, base_url)
|
203
|
-
hrefs = @url_extractor.extract(page, base_url)
|
204
|
-
passed, dropped = @url_filter.filter(hrefs, base_url)
|
205
|
-
|
206
|
-
if @config.respect_robots_txt
|
207
|
-
passed, dropped_by_robots = passed.partition { |href| @robots.allowed?(href[:url]) }
|
208
|
-
dropped += dropped_by_robots
|
209
|
-
end
|
210
|
-
|
211
|
-
if @config.log_level == :debug
|
212
|
-
passed.each { |href| @logger.log :debug, "url passed: #{href[:url]}" }
|
213
|
-
dropped.each { |href| @logger.log :debug, "url dropped: #{href[:url]}" }
|
214
|
-
end
|
215
|
-
|
216
|
-
passed
|
217
|
-
rescue => e
|
218
|
-
@logger.log :warn, "couldn't extract links from #{page.url}", error: e
|
219
|
-
[]
|
220
|
-
end
|
221
|
-
|
222
|
-
def allowed_page?(page)
|
223
|
-
if @page_filter.allowed?(page) &&
|
224
|
-
(!page.redirect_from || @url_filter.allowed?(page.url, page.redirect_from))
|
225
|
-
@logger.log :info, "page passed: #{page.url}"
|
226
|
-
true
|
227
|
-
else
|
228
|
-
@logger.log :info, "page dropped: #{page.url}"
|
229
|
-
false
|
145
|
+
register_page(page)
|
230
146
|
end
|
231
147
|
end
|
232
148
|
|
@@ -242,16 +158,19 @@ module Kudzu
|
|
242
158
|
end
|
243
159
|
end
|
244
160
|
|
245
|
-
def
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
161
|
+
def refs_to_links(refs, depth)
|
162
|
+
refs.map do |ref|
|
163
|
+
Kudzu.adapter::Link.new(uuid: @uuid,
|
164
|
+
url: ref.url,
|
165
|
+
title: ref.title,
|
166
|
+
state: 0,
|
167
|
+
depth: depth)
|
168
|
+
end
|
169
|
+
end
|
170
|
+
|
171
|
+
def enqueue_links(links)
|
253
172
|
@callback.around(:enqueue, links) do
|
254
|
-
@frontier.enqueue(links
|
173
|
+
@frontier.enqueue(links)
|
255
174
|
end
|
256
175
|
end
|
257
176
|
end
|
File without changes
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module Kudzu
  module Model
    # Mixin with derived helpers for a crawled page.
    #
    # The including class must provide +status+, +mime_type+, +charset+ and
    # +response_header+ readers; every method here only reads those values.
    # The module itself stores the raw body, the filtered flag and two
    # memoized views of the body (+decoded_body+ and +parsed_doc+).
    module Page
      # Parses the +last-modified+ response header.
      #
      # Best-effort: a missing header hash, a missing header value or an
      # unparsable value all yield nil instead of raising.
      #
      # @return [Time, nil] the header value converted to local time
      def last_modified
        last_modified = response_header['last-modified']
        Time.parse(last_modified).localtime if last_modified
      rescue
        nil
      end

      # @return [Object, nil] the +etag+ response header value, or nil when
      #   the header (or the whole header hash) is absent
      def etag
        header = response_header
        header['etag'] if header
      end

      # @return [Boolean] true when the mime type denotes HTML or XHTML
      def html?
        !mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
      end

      # @return [Boolean] true when the mime type denotes XML, RSS or Atom
      def xml?
        !mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
      end

      # @return [Boolean] true when the mime type denotes CSS
      def css?
        !mime_type.to_s.match(%r{text/css}).nil?
      end

      # @return [Boolean] true when the mime type denotes JavaScript
      def js?
        !mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
      end

      # @return [Boolean] true for HTML, XML and any other +text/*+ mime type
      def text?
        html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
      end

      # @return [Boolean] true for a 2xx status; false when status is nil
      def status_success?
        status_between?(200, 299)
      end

      # @return [Boolean] true for a 3xx status; false when status is nil
      def status_redirection?
        status_between?(300, 399)
      end

      # @return [Boolean] true for a 4xx status; false when status is nil
      def status_client_error?
        status_between?(400, 499)
      end

      # @return [Boolean] true for a 5xx status; false when status is nil
      def status_server_error?
        status_between?(500, 599)
      end

      # @return [Boolean] true when status is 304
      def status_not_modified?
        status == 304
      end

      # @return [Boolean] true when status is 404
      def status_not_found?
        status == 404
      end

      # @return [Boolean] true when status is 410
      def status_gone?
        status == 410
      end

      # @return [String, nil] the raw body as assigned through {#body=}
      def body
        @body
      end

      # Stores the raw body and drops the memoized views derived from the
      # previous body, so +decoded_body+ and +parsed_doc+ never return
      # content that belongs to an older body.
      #
      # @param body [String, nil] raw body
      def body=(body)
        @decoded_body = nil
        @parsed_doc = nil
        @body = body
      end

      # @return [Object] the value last assigned through {#filtered=}
      def filtered
        @filtered
      end

      # @param filtered [Object] flag value to store
      def filtered=(filtered)
        @filtered = filtered
      end

      # Memoized UTF-8 view of the body (see #decode_body).
      #
      # @return [String, nil]
      def decoded_body
        @decoded_body ||= decode_body(body)
      end

      # Memoized Nokogiri document built from +decoded_body+.
      #
      # @return [Object, nil] an HTML document for HTML pages, an XML
      #   document for XML pages, nil for every other mime type
      def parsed_doc
        @parsed_doc ||= if html?
                          Nokogiri::HTML(decoded_body)
                        elsif xml?
                          Nokogiri::XML(decoded_body)
                        end
      end

      private

      # Range test that answers false (instead of raising) when status is nil
      # or otherwise not comparable with an Integer.
      def status_between?(low, high)
        (low..high).cover?(status)
      end

      # Converts a textual body to UTF-8, replacing invalid and undefined
      # sequences. Non-text bodies and nil are returned untouched.
      #
      # @param body [String, nil] raw body
      # @return [String, nil]
      def decode_body(body)
        return body unless body && text?

        if find_encoding
          # force_encoding mutates its receiver: work on a copy so the stored
          # raw body keeps its encoding and frozen bodies are accepted.
          body.dup.force_encoding(charset).encode('utf-8', invalid: :replace, undef: :replace)
        else
          body.encode('utf-8', invalid: :replace, undef: :replace)
        end
      end

      # @return [Encoding, nil] the Encoding named by +charset+, or nil when
      #   the charset is missing or unknown to Ruby
      def find_encoding
        Encoding.find(charset)
      rescue
        nil
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Kudzu
  # Fixed-size pool of worker threads that share one Queue.
  #
  # Every worker repeatedly calls the block given to #start with the shared
  # queue and stops as soon as the block returns +:end+.
  class ThreadPool
    # @param size [Integer] number of worker threads created by #start
    def initialize(size)
      @size = size
      @queue = Queue.new
      @threads = []
    end

    # Spawns +size+ worker threads running the given block.
    #
    # @yieldparam queue [Queue] the queue shared by all workers
    # @yieldreturn [Object] +:end+ to terminate the calling worker
    # @return [Array<Thread>] the started threads
    def start(&block)
      @threads = 1.upto(@size).map { create_thread(&block) }
    end

    # Blocks until every worker that is still alive is waiting on the shared
    # queue (this includes the case where no worker is alive any more).
    #
    # @param interval [Numeric] seconds to sleep between two checks;
    #   defaults to 1, the previously hard-coded value
    # @return [nil]
    def wait(interval = 1)
      until @queue.num_waiting == @threads.count(&:alive?)
        Thread.pass
        sleep interval
      end
    end

    # Kills every worker thread and forgets them.
    #
    # @return [Array] the emptied thread list
    def shutdown
      @threads.each(&:kill)
      @threads = []
    end

    private

    # Starts one worker that calls the block with the shared queue until the
    # block returns +:end+.
    def create_thread(&block)
      Thread.start do
        loop do
          ret = block.call(@queue)
          break if ret == :end
        end
      end
    end
  end
end
|
data/lib/kudzu/version.rb
CHANGED
data/lib/kudzu.rb
CHANGED
@@ -1,8 +1,26 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'http-cookie'
|
3
|
+
require 'addressable'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'shared-mime-info'
|
6
|
+
require 'charlock_holmes'
|
7
|
+
|
8
|
+
require 'kudzu/version'
|
9
|
+
require 'kudzu/crawler'
|
10
|
+
|
1
11
|
module Kudzu
  class << self
    # adapter: storage adapter namespace used by the crawler
    # agent:   agent class instantiated by the crawler
    # logger:  optional logger; logging is disabled while it is nil
    attr_accessor :adapter, :agent, :logger

    # Writes a message to the configured logger, if any.
    #
    # @param level [Symbol] name of the logger method to call (e.g. :info)
    # @param message [String] text to log
    # @param error [Exception, nil] when given, its class, message and the
    #   first five backtrace lines are appended to the message
    # @return [Object, nil] the logger method's return value, or nil when no
    #   logger is configured
    def log(level, message, error: nil)
      return unless @logger
      if error
        message += " - #{error.class}: #{error.message}"
        # An exception that was never raised has no backtrace (nil).
        trace = error.backtrace
        message += " at #{trace.take(5).join("\n")}" if trace
      end
      # public_send: a log level must never reach a private method.
      @logger.public_send(level, message)
    end
  end
end
|
6
24
|
|
7
|
-
|
8
|
-
|
25
|
+
Kudzu.adapter = Kudzu::Adapter::Memory
|
26
|
+
Kudzu.agent = Kudzu::Agent
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -174,38 +174,40 @@ files:
|
|
174
174
|
- README.md
|
175
175
|
- Rakefile
|
176
176
|
- lib/kudzu.rb
|
177
|
-
- lib/kudzu/adapter/base/all.rb
|
178
|
-
- lib/kudzu/adapter/base/link.rb
|
179
|
-
- lib/kudzu/adapter/base/page.rb
|
180
177
|
- lib/kudzu/adapter/memory.rb
|
181
|
-
- lib/kudzu/adapter/memory/all.rb
|
182
178
|
- lib/kudzu/adapter/memory/frontier.rb
|
183
179
|
- lib/kudzu/adapter/memory/model/link.rb
|
184
180
|
- lib/kudzu/adapter/memory/model/page.rb
|
185
181
|
- lib/kudzu/adapter/memory/repository.rb
|
182
|
+
- lib/kudzu/agent.rb
|
186
183
|
- lib/kudzu/agent/all.rb
|
187
|
-
- lib/kudzu/agent/charset_detector.rb
|
188
184
|
- lib/kudzu/agent/fetcher.rb
|
189
|
-
- lib/kudzu/agent/
|
190
|
-
- lib/kudzu/agent/
|
185
|
+
- lib/kudzu/agent/http/connection.rb
|
186
|
+
- lib/kudzu/agent/http/connection_pool.rb
|
187
|
+
- lib/kudzu/agent/page_filterer.rb
|
188
|
+
- lib/kudzu/agent/reference.rb
|
189
|
+
- lib/kudzu/agent/response.rb
|
191
190
|
- lib/kudzu/agent/robots.rb
|
191
|
+
- lib/kudzu/agent/robots/parser.rb
|
192
|
+
- lib/kudzu/agent/robots/txt.rb
|
192
193
|
- lib/kudzu/agent/sleeper.rb
|
193
|
-
- lib/kudzu/agent/title_parser.rb
|
194
194
|
- lib/kudzu/agent/url_extractor.rb
|
195
|
-
- lib/kudzu/agent/
|
195
|
+
- lib/kudzu/agent/url_filterer.rb
|
196
|
+
- lib/kudzu/agent/util/charset_detector.rb
|
197
|
+
- lib/kudzu/agent/util/content_type_parser.rb
|
198
|
+
- lib/kudzu/agent/util/matcher.rb
|
199
|
+
- lib/kudzu/agent/util/mime_type_detector.rb
|
200
|
+
- lib/kudzu/agent/util/title_parser.rb
|
196
201
|
- lib/kudzu/callback.rb
|
197
202
|
- lib/kudzu/common.rb
|
198
203
|
- lib/kudzu/config.rb
|
199
204
|
- lib/kudzu/config/filter.rb
|
200
205
|
- lib/kudzu/crawler.rb
|
201
|
-
- lib/kudzu/
|
202
|
-
- lib/kudzu/
|
203
|
-
- lib/kudzu/
|
204
|
-
- lib/kudzu/
|
205
|
-
- lib/kudzu/
|
206
|
-
- lib/kudzu/util/content_type_parser.rb
|
207
|
-
- lib/kudzu/util/matcher.rb
|
208
|
-
- lib/kudzu/util/thread_pool.rb
|
206
|
+
- lib/kudzu/model/all.rb
|
207
|
+
- lib/kudzu/model/base.rb
|
208
|
+
- lib/kudzu/model/link.rb
|
209
|
+
- lib/kudzu/model/page.rb
|
210
|
+
- lib/kudzu/thread_pool.rb
|
209
211
|
- lib/kudzu/version.rb
|
210
212
|
homepage: https://github.com/kanety/kudzu
|
211
213
|
licenses:
|
@@ -1,106 +0,0 @@
|
|
1
|
-
module Kudzu
|
2
|
-
module Adapter
|
3
|
-
module Base
|
4
|
-
module Page
|
5
|
-
def last_modified
|
6
|
-
last_modified = response_header['last-modified']
|
7
|
-
Time.parse(last_modified).localtime if last_modified
|
8
|
-
rescue
|
9
|
-
nil
|
10
|
-
end
|
11
|
-
|
12
|
-
def etag
|
13
|
-
response_header['etag']
|
14
|
-
end
|
15
|
-
|
16
|
-
def html?
|
17
|
-
!mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
|
18
|
-
end
|
19
|
-
|
20
|
-
def xml?
|
21
|
-
!mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
|
22
|
-
end
|
23
|
-
|
24
|
-
def css?
|
25
|
-
!mime_type.to_s.match(%r{text/css}).nil?
|
26
|
-
end
|
27
|
-
|
28
|
-
def js?
|
29
|
-
!mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
|
30
|
-
end
|
31
|
-
|
32
|
-
def text?
|
33
|
-
html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
|
34
|
-
end
|
35
|
-
|
36
|
-
def status_success?
|
37
|
-
200 <= status && status <= 299
|
38
|
-
end
|
39
|
-
|
40
|
-
def status_redirection?
|
41
|
-
300 <= status && status <= 399
|
42
|
-
end
|
43
|
-
|
44
|
-
def status_client_error?
|
45
|
-
400 <= status && status <= 499
|
46
|
-
end
|
47
|
-
|
48
|
-
def status_server_error?
|
49
|
-
500 <= status && status <= 599
|
50
|
-
end
|
51
|
-
|
52
|
-
def status_not_modified?
|
53
|
-
status == 304
|
54
|
-
end
|
55
|
-
|
56
|
-
def status_not_found?
|
57
|
-
status == 404
|
58
|
-
end
|
59
|
-
|
60
|
-
def status_gone?
|
61
|
-
status == 410
|
62
|
-
end
|
63
|
-
|
64
|
-
def body
|
65
|
-
@body
|
66
|
-
end
|
67
|
-
|
68
|
-
def body=(body)
|
69
|
-
@body = body
|
70
|
-
end
|
71
|
-
|
72
|
-
def filtered
|
73
|
-
@filtered
|
74
|
-
end
|
75
|
-
|
76
|
-
def filtered=(filtered)
|
77
|
-
@filtered = filtered
|
78
|
-
end
|
79
|
-
|
80
|
-
def decoded_body
|
81
|
-
@decoded_body ||= decode_body(body)
|
82
|
-
end
|
83
|
-
|
84
|
-
private
|
85
|
-
|
86
|
-
def decode_body(body)
|
87
|
-
if text?
|
88
|
-
if find_encoding
|
89
|
-
body.force_encoding(charset).encode('utf-8', invalid: :replace, undef: :replace)
|
90
|
-
else
|
91
|
-
body.encode('utf-8', invalid: :replace, undef: :replace)
|
92
|
-
end
|
93
|
-
else
|
94
|
-
body
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def find_encoding
|
99
|
-
Encoding.find(charset)
|
100
|
-
rescue
|
101
|
-
nil
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|