kudzu 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/lib/kudzu.rb +8 -0
- data/lib/kudzu/adapter/base/all.rb +3 -0
- data/lib/kudzu/adapter/base/link.rb +8 -0
- data/lib/kudzu/adapter/base/page.rb +106 -0
- data/lib/kudzu/adapter/memory.rb +4 -0
- data/lib/kudzu/adapter/memory/all.rb +3 -0
- data/lib/kudzu/adapter/memory/frontier.rb +38 -0
- data/lib/kudzu/adapter/memory/model/link.rb +15 -0
- data/lib/kudzu/adapter/memory/model/page.rb +17 -0
- data/lib/kudzu/adapter/memory/repository.rb +27 -0
- data/lib/kudzu/agent/all.rb +3 -0
- data/lib/kudzu/agent/charset_detector.rb +84 -0
- data/lib/kudzu/agent/fetcher.rb +116 -0
- data/lib/kudzu/agent/filter.rb +40 -0
- data/lib/kudzu/agent/mime_type_detector.rb +34 -0
- data/lib/kudzu/agent/robots.rb +190 -0
- data/lib/kudzu/agent/sleeper.rb +44 -0
- data/lib/kudzu/agent/title_parser.rb +16 -0
- data/lib/kudzu/agent/url_extractor.rb +123 -0
- data/lib/kudzu/agent/url_filter.rb +65 -0
- data/lib/kudzu/callback.rb +41 -0
- data/lib/kudzu/common.rb +23 -0
- data/lib/kudzu/config.rb +79 -0
- data/lib/kudzu/config/filter.rb +39 -0
- data/lib/kudzu/crawler.rb +258 -0
- data/lib/kudzu/logger.rb +20 -0
- data/lib/kudzu/revisit/all.rb +3 -0
- data/lib/kudzu/revisit/scheduler.rb +28 -0
- data/lib/kudzu/util/all.rb +3 -0
- data/lib/kudzu/util/connection_pool.rb +56 -0
- data/lib/kudzu/util/content_type_parser.rb +24 -0
- data/lib/kudzu/util/matcher.rb +21 -0
- data/lib/kudzu/util/thread_pool.rb +38 -0
- data/lib/kudzu/version.rb +3 -0
- metadata +234 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
|
4
|
+
data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
|
7
|
+
data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Kudzu
|
2
|
+
|
3
|
+
A simple web crawler for Ruby.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
* Run single-threaded or multi-threaded.
|
8
|
+
* Pool HTTP connections.
|
9
|
+
* Restrict links by URL-based patterns.
|
10
|
+
* Respect robots.txt.
|
11
|
+
* Store page contents via adapter.
|
12
|
+
|
13
|
+
## Dependencies
|
14
|
+
|
15
|
+
* ruby 2.3+
|
16
|
+
* libicu
|
17
|
+
|
18
|
+
## Installation
|
19
|
+
|
20
|
+
Add to your application's Gemfile:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
gem 'kudzu'
|
24
|
+
```
|
25
|
+
|
26
|
+
Then run:
|
27
|
+
|
28
|
+
$ bundle install
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
Crawl HTML files on `example.com`:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
crawler = Kudzu::Crawler.new do
|
36
|
+
user_agent 'YOUR_AWESOME_APP'
|
37
|
+
add_filter do
|
38
|
+
focus_host true
|
39
|
+
allow_mime_type %w(text/html)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
crawler.run('http://example.com/') do
|
43
|
+
on_success do |page, link|
|
44
|
+
puts page.url
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
## Adapters
|
50
|
+
|
51
|
+
This gem supports only in-memory crawling by default. Use the following adapter to save page contents persistently:
|
52
|
+
|
53
|
+
* [kudzu-adapter-active_record](https://github.com/kanety/kudzu-adapter-active_record)
|
54
|
+
|
55
|
+
## Contributing
|
56
|
+
|
57
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
58
|
+
|
59
|
+
## License
|
60
|
+
|
61
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/lib/kudzu.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'time' # Time.parse lives in the 'time' stdlib extension

module Kudzu
  module Adapter
    module Base
      # Behaviour shared by page models.
      #
      # The including class must provide +response_header+, +mime_type+,
      # +status+ and +charset+ readers: this module calls them but does not
      # define them.
      module Page
        # Raw (undecoded) response body as last assigned through #body=.
        attr_reader :body

        # Filter flag/result stored by the caller; this module never interprets it.
        attr_accessor :filtered

        # Parses the "last-modified" response header.
        #
        # @return [Time, nil] the header value converted to local time, or nil
        #   when the header is absent or any error occurs while reading/parsing it
        def last_modified
          last_modified = response_header['last-modified']
          Time.parse(last_modified).localtime if last_modified
        rescue StandardError
          nil
        end

        # @return [Object, nil] the "etag" response header value
        def etag
          response_header['etag']
        end

        # @return [Boolean] true when the mime type names HTML or XHTML
        def html?
          !(mime_type.to_s =~ %r{text/html|application/xhtml\+xml}).nil?
        end

        # @return [Boolean] true when the mime type names XML, RSS or Atom
        def xml?
          !(mime_type.to_s =~ %r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
        end

        # @return [Boolean] true when the mime type names CSS
        def css?
          !(mime_type.to_s =~ %r{text/css}).nil?
        end

        # @return [Boolean] true when the mime type names JavaScript
        def js?
          !(mime_type.to_s =~ %r{text/javascript|application/javascript|application/x-javascript}).nil?
        end

        # @return [Boolean] true for HTML, XML and any other "text/" mime type
        def text?
          html? || xml? || !(mime_type.to_s =~ %r{text/}).nil?
        end

        # The status predicates use Range#cover? so that a page without a
        # status (nil) answers false instead of raising on the comparison.

        # @return [Boolean] true for a 2xx status
        def status_success?
          (200..299).cover?(status)
        end

        # @return [Boolean] true for a 3xx status
        def status_redirection?
          (300..399).cover?(status)
        end

        # @return [Boolean] true for a 4xx status
        def status_client_error?
          (400..499).cover?(status)
        end

        # @return [Boolean] true for a 5xx status
        def status_server_error?
          (500..599).cover?(status)
        end

        # @return [Boolean] true when the status is 304
        def status_not_modified?
          status == 304
        end

        # @return [Boolean] true when the status is 404
        def status_not_found?
          status == 404
        end

        # @return [Boolean] true when the status is 410
        def status_gone?
          status == 410
        end

        # Assigns the raw body and drops the memoized decoded copy so that
        # #decoded_body never returns text derived from a previous body.
        def body=(body)
          @decoded_body = nil
          @body = body
        end

        # Body converted to UTF-8 for textual pages; the raw body otherwise.
        # The result is memoized until #body= is called again.
        #
        # @return [String, nil]
        def decoded_body
          @decoded_body ||= decode_body(body)
        end

        private

        # Transcodes +body+ to UTF-8, replacing invalid/undefined characters.
        # Non-text pages and nil bodies are returned unchanged.
        def decode_body(body)
          return body if body.nil? || !text?

          # dup before force_encoding: re-tagging in place would change the
          # encoding of the raw body object held by the page (and would raise
          # on a frozen string).
          source = find_encoding ? body.dup.force_encoding(charset) : body
          source.encode('utf-8', invalid: :replace, undef: :replace)
        end

        # @return [Encoding, nil] the Encoding named by +charset+, or nil when
        #   the name is missing or unknown to Ruby
        def find_encoding
          Encoding.find(charset)
        rescue StandardError
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # In-memory crawl frontier: a FIFO queue of links plus a record of every
      # URL that has been enqueued, so each URL is queued at most once.
      # All access to the queue and the seen-URL record goes through one Monitor.
      class Frontier
        # @param uuid [Object] identifier stored on the frontier (not used by
        #   any method of this class)
        def initialize(uuid)
          @uuid = uuid
          @monitor = Monitor.new
          @queue = []
          @queued = {}
        end

        # Appends links whose URL has not been enqueued before.
        #
        # @param links [Object, Array] one link or a list of links; each must
        #   respond to +url+
        # @param depth [Integer] accepted for interface compatibility; this
        #   implementation does not use it
        def enqueue(links, depth: 1)
          @monitor.synchronize do
            Array(links).each do |link|
              next if @queued.key?(link.url)

              @queued[link.url] = true
              @queue << link
            end
          end
        end

        # Removes up to +limit+ links from the head of the queue and sets
        # their +state+ to 1.
        #
        # @param limit [Integer] maximum number of links to take
        # @return [Array] the dequeued links (empty when the queue is empty)
        def dequeue(limit: 1)
          @monitor.synchronize do
            links = @queue.shift(limit)
            links.each do |link|
              link.state = 1 # NOTE(review): presumably marks the link as fetched/in progress; confirm
            end
          end
        end

        # Empties the queue and forgets every seen URL. Runs under the same
        # monitor as #enqueue and #dequeue so that a concurrent enqueue cannot
        # interleave between the two clears.
        def clear
          @monitor.synchronize do
            @queue.clear
            @queued.clear
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory link record.
      class Link
        include Kudzu::Adapter::Base::Link

        attr_accessor :uuid, :url, :title, :state, :depth

        # Assigns every entry of +attr+ that has a matching public writer;
        # entries without one are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            public_send(writer, value) if respond_to?(writer)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory page record; predicates and body decoding come from
      # the included Base::Page module.
      class Page
        include Kudzu::Adapter::Base::Page

        attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest,
                      :response_header, :response_time, :redirect_from, :fetched_at, :revised_at,
                      :revisit_interval, :revisit_at

        # Assigns every entry of +attr+ that has a matching public writer;
        # entries without one are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            public_send(writer, value) if respond_to?(writer)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # In-memory page store keyed by URL. Also records the digest of every
      # registered page.
      class Repository
        # @return [Hash] registered pages, URL => page
        attr_reader :page

        def initialize
          @digest = {}
          @page = {}
        end

        # Returns the page registered under +url+, or a fresh (unregistered)
        # Page carrying only that URL when there is none.
        def find_by_url(url)
          stored = @page[url]
          return stored if stored

          Page.new(url: url)
        end

        # Stores +page+ under its URL and records its digest.
        def register(page)
          @page.store(page.url, page)
          @digest.store(page.digest, true)
        end

        # Removes the page stored under +page+'s URL; the recorded digest is
        # left untouched.
        #
        # @return [Object, nil] the removed page, or nil when none was stored
        def delete(page)
          key = page.url
          @page.delete(key)
        end
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'charlock_holmes'
|
3
|
+
|
4
|
+
module Kudzu
  class Agent
    # Works out the character set of a fetched page from declarations inside
    # the document (HTML <meta>, XML encoding) and, failing that, from
    # statistical detection on the raw bytes.
    class CharsetDetector
      # Common misspellings mapped to names Ruby's Encoding.find accepts.
      CORRECTION = {
        'utf_8' => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis' => 'shift_jis',
        'euc_jp' => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#html?, #xml?, #text?, #body] page to inspect
      # @return [String, nil] lower-case charset name, or nil when the page is
      #   not textual or nothing could be detected
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Looks for <meta charset="..."> first, then for
      # <meta http-equiv="content-type" content="...; charset=...">.
      #
      # @return [String, nil] the first declared charset Ruby recognises
      def from_html(body)
        # The markup is reduced to ASCII before parsing; only the declarations
        # are read from the parsed document.
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        if (node = doc.xpath('//meta/@charset').first)
          charset = correct(node.to_s)
          return charset if charset
        end

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'] =~ /content-type/i

          # Validate the charset parsed out of the content attribute (not the
          # <meta charset> node looked up above, which may well be nil here).
          declared = @parser.parse(meta[:content].to_s)[1][:charset]
          charset = correct(declared)
          return charset if charset
        end

        nil
      end

      # @return [String, nil] the encoding reported by the XML parser for the
      #   document, when Ruby recognises it
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        doc.encoding ? correct(doc.encoding) : nil
      end

      # Detects the charset from the bytes themselves.
      #
      # @return [String, nil] 'ascii' for pure-ASCII text, otherwise the
      #   lower-cased name reported by CharlockHolmes, or nil when it reports none
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        detection[:encoding].downcase if detection && detection.key?(:encoding)
      end

      # Normalises a declared charset name.
      #
      # @param charset [String, nil] name as declared in the document
      # @return [String, nil] lower-cased (and spelling-corrected) name, or nil
      #   when it is missing or Encoding.find does not know it
      def correct(charset)
        return nil if charset.nil?

        name = charset.to_s.downcase
        name = CORRECTION.fetch(name, name)
        Encoding.find(name) # raises for names Ruby does not know
        name
      rescue StandardError
        nil
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'http-cookie'
|
3
|
+
|
4
|
+
module Kudzu
  class Agent
    # Issues HTTP requests for the crawler: takes connections from a pool,
    # applies the politeness delay, optionally handles cookies and follows
    # redirects up to a limit.
    class Fetcher
      # Final outcome of a fetch.
      class Response
        attr_accessor :url, :status, :header, :body, :time, :redirected

        # @param attr [Hash] attribute name => value; every key must have a writer
        def initialize(attr = {})
          attr.each { |name, value| public_send("#{name}=", value) }
        end

        # @return [Boolean] whether at least one redirect was followed
        def redirected?
          redirected
        end
      end

      attr_reader :pool

      # @param config [Object] crawler configuration; this class reads
      #   max_connection, max_redirect, open_timeout, read_timeout, user_agent
      #   and handle_cookie from it
      # @param robots [Object, nil] handed to the Sleeper unchanged
      def initialize(config, robots = nil)
        @config = config
        @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
        @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
        @jar = HTTP::CookieJar.new
      end

      # Requests +url+ and follows redirects.
      #
      # @param url [String] absolute URL to request
      # @param request_header [Hash] extra request headers (name => value)
      # @param redirect [Integer] maximum number of redirects to follow
      # @param method [Symbol] HTTP method, e.g. :get or :head
      # @return [Response] the last response received; its +url+ is the URL
      #   that produced it and +redirected?+ tells whether a redirect was followed
      def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
        fetch_following_redirects(url, request_header: request_header, redirect: redirect,
                                       method: method, redirected: false)
      end

      private

      # Performs one request and recurses while the response is a redirect
      # with a Location header and the redirect budget is not exhausted.
      # +redirected+ records whether a hop has already been followed, so the
      # flag on the result does not depend on the budget the caller chose.
      def fetch_following_redirects(url, request_header:, redirect:, method:, redirected:)
        uri = Addressable::URI.parse(url)
        http = @pool.checkout(pool_name(uri)) { build_http(uri) }
        request = build_request(uri, request_header: request_header, method: method)

        append_cookie(url, request) if @config.handle_cookie

        @sleeper.politeness_delay(url)

        response = nil
        response_time = Benchmark.realtime { response = http.request(request) }

        parse_cookie(url, response) if @config.handle_cookie

        if redirection?(response.code) && response['location'] && redirect > 0
          # Keep the caller's HTTP method on the next hop (a HEAD stays a HEAD).
          fetch_following_redirects(uri.join(response['location']).to_s,
                                    request_header: request_header, redirect: redirect - 1,
                                    method: method, redirected: true)
        else
          res = build_response(url, response, response_time)
          res.redirected = redirected
          res
        end
      end

      # @return [Integer] configured redirect limit, 5 when unset
      def max_redirect
        @config.max_redirect || 5
      end

      # Pool key: one entry per scheme/host/port combination.
      def pool_name(uri)
        "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
      end

      # Opens a connection to the URI's host with the configured timeouts.
      #
      # @return [Net::HTTP] the started connection
      def build_http(uri)
        http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
        http.open_timeout = @config.open_timeout if @config.open_timeout
        http.read_timeout = @config.read_timeout if @config.read_timeout
        if uri.scheme == 'https'
          http.use_ssl = true
          # NOTE(review): certificate verification is switched off, so HTTPS
          # responses are not authenticated. Behaviour kept as is; confirm this
          # is intended before relying on fetched content.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end
        http.start
      end

      # Builds the request object: basic auth from the URI's userinfo, the
      # configured User-Agent, then the caller's headers (which may override it).
      def build_request(uri, request_header:, method:)
        request = request_klass_for(method).new(uri.request_uri)
        request.basic_auth uri.user, uri.password if uri.user && uri.password

        request['User-Agent'] = @config.user_agent
        request_header.each do |key, value|
          request[key] = value
        end
        request
      end

      # @return [Class] the Net::HTTP request class for +method+ (:get => Net::HTTP::Get)
      def request_klass_for(method)
        Object.const_get("Net::HTTP::#{method.capitalize}")
      end

      # Copies the parts of the raw response the crawler needs into a Response.
      def build_response(url, response, response_time)
        Response.new(url: url,
                     status: response.code.to_i,
                     header: Hash[response.each.to_a],
                     body: response.body.to_s,
                     time: response_time)
      end

      # @return [Boolean] true for a 3xx status code (String or Integer)
      def redirection?(code)
        (300..399).cover?(code.to_i)
      end

      # Stores cookies from the response's Set-Cookie header in the jar.
      def parse_cookie(url, response)
        @jar.parse(response['set-cookie'], url) if response['set-cookie']
      end

      # Adds the jar's cookies for +url+ to the request, appending to any
      # Cookie header that is already present.
      def append_cookie(url, request)
        cookies = @jar.cookies(url)
        return if cookies.empty?

        if request['Cookie']
          request['Cookie'] += '; ' + cookies.join('; ')
        else
          request['Cookie'] = cookies.join('; ')
        end
      end
    end
  end
end
|