kudzu 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +61 -0
- data/Rakefile +6 -0
- data/lib/kudzu.rb +8 -0
- data/lib/kudzu/adapter/base/all.rb +3 -0
- data/lib/kudzu/adapter/base/link.rb +8 -0
- data/lib/kudzu/adapter/base/page.rb +106 -0
- data/lib/kudzu/adapter/memory.rb +4 -0
- data/lib/kudzu/adapter/memory/all.rb +3 -0
- data/lib/kudzu/adapter/memory/frontier.rb +38 -0
- data/lib/kudzu/adapter/memory/model/link.rb +15 -0
- data/lib/kudzu/adapter/memory/model/page.rb +17 -0
- data/lib/kudzu/adapter/memory/repository.rb +27 -0
- data/lib/kudzu/agent/all.rb +3 -0
- data/lib/kudzu/agent/charset_detector.rb +84 -0
- data/lib/kudzu/agent/fetcher.rb +116 -0
- data/lib/kudzu/agent/filter.rb +40 -0
- data/lib/kudzu/agent/mime_type_detector.rb +34 -0
- data/lib/kudzu/agent/robots.rb +190 -0
- data/lib/kudzu/agent/sleeper.rb +44 -0
- data/lib/kudzu/agent/title_parser.rb +16 -0
- data/lib/kudzu/agent/url_extractor.rb +123 -0
- data/lib/kudzu/agent/url_filter.rb +65 -0
- data/lib/kudzu/callback.rb +41 -0
- data/lib/kudzu/common.rb +23 -0
- data/lib/kudzu/config.rb +79 -0
- data/lib/kudzu/config/filter.rb +39 -0
- data/lib/kudzu/crawler.rb +258 -0
- data/lib/kudzu/logger.rb +20 -0
- data/lib/kudzu/revisit/all.rb +3 -0
- data/lib/kudzu/revisit/scheduler.rb +28 -0
- data/lib/kudzu/util/all.rb +3 -0
- data/lib/kudzu/util/connection_pool.rb +56 -0
- data/lib/kudzu/util/content_type_parser.rb +24 -0
- data/lib/kudzu/util/matcher.rb +21 -0
- data/lib/kudzu/util/thread_pool.rb +38 -0
- data/lib/kudzu/version.rb +3 -0
- metadata +234 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
|
4
|
+
data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
|
7
|
+
data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Kudzu
|
2
|
+
|
3
|
+
A simple web crawler for Ruby.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
* Run single-threaded or multi-threaded.
|
8
|
+
* Pool HTTP connections.
|
9
|
+
* Restrict links by URL-based patterns.
|
10
|
+
* Respect robots.txt.
|
11
|
+
* Store page contents via adapter.
|
12
|
+
|
13
|
+
## Dependencies
|
14
|
+
|
15
|
+
* ruby 2.3+
|
16
|
+
* libicu
|
17
|
+
|
18
|
+
## Installation
|
19
|
+
|
20
|
+
Add to your application's Gemfile:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
gem 'kudzu'
|
24
|
+
```
|
25
|
+
|
26
|
+
Then run:
|
27
|
+
|
28
|
+
$ bundle install
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
Crawl HTML files on `example.com`:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
crawler = Kudzu::Crawler.new do
|
36
|
+
user_agent 'YOUR_AWESOME_APP'
|
37
|
+
add_filter do
|
38
|
+
focus_host true
|
39
|
+
allow_mime_type %w(text/html)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
crawler.run('http://example.com/') do
|
43
|
+
on_success do |page, link|
|
44
|
+
puts page.url
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
## Adapters
|
50
|
+
|
51
|
+
This gem supports only in-memory crawling by default. Use the following adapter to save page contents persistently:
|
52
|
+
|
53
|
+
* [kudzu-adapter-active_record](https://github.com/kanety/kudzu-adapter-active_record)
|
54
|
+
|
55
|
+
## Contributing
|
56
|
+
|
57
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
58
|
+
|
59
|
+
## License
|
60
|
+
|
61
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/lib/kudzu.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'time' # Time.parse is provided by the stdlib "time" extension, not by core Time

module Kudzu
  module Adapter
    module Base
      # Behaviour shared by page models.
      #
      # The including class must provide +response_header+, +mime_type+,
      # +status+ and +charset+; this module only reads them.
      module Page
        # Raw body as assigned by the caller.
        attr_reader :body

        # Filter flag / result as assigned by the caller.
        attr_accessor :filtered

        # Stores a new raw body and drops the memoized decoded copy so that
        # #decoded_body never returns text decoded from a previous body.
        def body=(body)
          @decoded_body = nil
          @body = body
        end

        # @return [Time, nil] the Last-Modified response header in local time,
        #   or nil when the header is absent or cannot be read/parsed
        def last_modified
          value = response_header['last-modified']
          Time.parse(value).localtime if value
        rescue StandardError
          # Best effort: a malformed date or a missing header hash yields nil.
          nil
        end

        # @return [String, nil] the ETag response header
        def etag
          response_header['etag']
        end

        def html?
          !mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
        end

        def xml?
          !mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
        end

        def css?
          !mime_type.to_s.match(%r{text/css}).nil?
        end

        def js?
          !mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
        end

        # True for HTML, XML and any other text/* MIME type.
        def text?
          html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
        end

        # The status predicates use Range#cover? so that a page without a
        # status (nil) answers false instead of raising.
        def status_success?
          (200..299).cover?(status)
        end

        def status_redirection?
          (300..399).cover?(status)
        end

        def status_client_error?
          (400..499).cover?(status)
        end

        def status_server_error?
          (500..599).cover?(status)
        end

        def status_not_modified?
          status == 304
        end

        def status_not_found?
          status == 404
        end

        def status_gone?
          status == 410
        end

        # @return [String, nil] the body converted to UTF-8 for text pages,
        #   the raw body for non-text pages, nil when there is no body.
        #   The result is memoized until #body= is called again.
        def decoded_body
          @decoded_body ||= decode_body(body)
        end

        private

        # Converts +body+ to UTF-8, replacing invalid or undefined sequences.
        # Non-text pages are returned untouched.
        def decode_body(body)
          return body if body.nil? || !text?

          # force_encoding mutates its receiver, so tag a copy: the stored raw
          # body keeps its original encoding and frozen bodies keep working.
          source = find_encoding ? body.dup.force_encoding(charset) : body
          source.encode('utf-8', invalid: :replace, undef: :replace)
        end

        # @return [Encoding, nil] the Ruby encoding named by +charset+, or nil
        #   when the charset is missing or unknown to Ruby
        def find_encoding
          Encoding.find(charset)
        rescue StandardError
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'monitor' # Monitor is a stdlib class and must be loaded explicitly

module Kudzu
  module Adapter
    module Memory
      # In-memory queue of links waiting to be crawled.
      #
      # Every URL is accepted at most once between two calls to #clear. All
      # operations that touch the queue run inside one Monitor.
      class Frontier
        # @param uuid [Object] identifier kept on the instance (not otherwise
        #   used by this class)
        def initialize(uuid)
          @uuid = uuid
          @monitor = Monitor.new
          @queue = []
          @queued = {}
        end

        # Appends links whose URL has not been queued before.
        #
        # @param links [Object, Array] one link or a list of links; each must
        #   respond to +url+
        # @param depth [Integer] accepted for interface compatibility; this
        #   in-memory implementation does not use it
        def enqueue(links, depth: 1)
          @monitor.synchronize do
            Array(links).each do |link|
              next if @queued.key?(link.url)

              @queued[link.url] = true
              @queue << link
            end
          end
        end

        # Removes up to +limit+ links from the head of the queue and sets
        # their +state+ to 1.
        #
        # @return [Array] the dequeued links (empty when nothing is queued)
        def dequeue(limit: 1)
          @monitor.synchronize do
            @queue.shift(limit).each { |link| link.state = 1 }
          end
        end

        # Empties the queue and forgets which URLs were already queued.
        # Runs under the same monitor as #enqueue and #dequeue so the two
        # collections are never observed half-cleared by another thread.
        def clear
          @monitor.synchronize do
            @queue.clear
            @queued.clear
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory link record.
      class Link
        include Kudzu::Adapter::Base::Link

        attr_accessor :uuid,
                      :url,
                      :title,
                      :state,
                      :depth

        # Builds a link from a hash of attributes. Keys without a matching
        # public writer on this object are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            next unless respond_to?(writer)

            public_send(writer, value)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory page record.
      class Page
        include Kudzu::Adapter::Base::Page

        attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest
        attr_accessor :response_header, :response_time, :redirect_from
        attr_accessor :fetched_at, :revised_at, :revisit_interval, :revisit_at

        # Builds a page from a hash of attributes. Keys without a matching
        # public writer on this object are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            next unless respond_to?(writer)

            public_send(writer, value)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module Memory
      # In-memory page store keyed by URL.
      class Repository
        # @return [Hash] registered pages, URL => page
        attr_reader :page

        def initialize
          @page = {}
          @digest = {}
        end

        # Returns the page registered for +url+; when there is none, a new
        # Page carrying only that URL is built (it is not registered).
        def find_by_url(url)
          known = @page[url]
          return known if known

          Page.new(url: url)
        end

        # Stores +page+ under its URL and records its digest.
        def register(page)
          @page.store(page.url, page)
          @digest.store(page.digest, true)
        end

        # Removes the page registered under the URL of +page+. The digest
        # recorded by #register is left in place.
        def delete(page)
          key = page.url
          @page.delete(key)
        end
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'charlock_holmes'
|
3
|
+
|
4
|
+
module Kudzu
  class Agent
    # Works out the character set of a fetched page from in-document
    # declarations, falling back to statistical detection of the bytes.
    class CharsetDetector
      # Charset spellings seen in documents, mapped to names that
      # Encoding.find accepts.
      CORRECTION = {
        'utf_8'     => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis'    => 'shift_jis',
        'euc_jp'    => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#html?, #xml?, #text?, #body]
      # @return [String, nil] a lower-case charset name, or nil when the page
      #   is not text or nothing could be determined
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Reads the charset declared by <meta charset="..."> or, failing that,
      # by <meta http-equiv="content-type" content="...; charset=...">.
      def from_html(body)
        # Only the ASCII markup matters here, so every other byte is replaced.
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        if (node = doc.xpath('//meta/@charset').first)
          charset = correct(node.to_s)
          return charset if charset
        end

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'] =~ /content-type/i

          # Validate the charset parsed from this tag's content attribute
          # (not the <meta charset> node looked up above).
          charset = correct(@parser.parse(meta[:content].to_s)[1][:charset])
          return charset if charset
        end

        nil
      end

      # Reads the encoding declared by the XML document, if any.
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        doc.encoding ? correct(doc.encoding) : nil
      end

      # Detects the encoding from the bytes themselves; pure ASCII text is
      # reported as 'ascii' without running the detector.
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        detection[:encoding].downcase if detection && detection.key?(:encoding)
      end

      # Normalizes a declared charset name.
      #
      # @param charset [String, nil]
      # @return [String, nil] the lower-cased (and, where listed in
      #   CORRECTION, corrected) name, or nil when it is blank or Ruby has no
      #   encoding of that name
      def correct(charset)
        name = charset.to_s.strip.downcase
        name = CORRECTION.fetch(name, name)

        Encoding.find(name)
        name
      rescue ArgumentError
        # Encoding.find raises ArgumentError for unknown or empty names.
        nil
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'http-cookie'
|
3
|
+
|
4
|
+
module Kudzu
  class Agent
    # Performs HTTP requests over pooled Net::HTTP connections, following
    # redirects and (when configured) keeping cookies between requests.
    class Fetcher
      # Value object describing the final response of a fetch.
      class Response
        attr_accessor :url, :status, :header, :body, :time, :redirected

        # @param attr [Hash] attribute name => value; every key must have a
        #   matching writer
        def initialize(attr = {})
          attr.each { |name, value| public_send("#{name}=", value) }
        end

        def redirected?
          redirected
        end
      end

      attr_reader :pool

      # @param config [Object] read for max_connection, max_redirect,
      #   open_timeout, read_timeout, user_agent and handle_cookie
      # @param robots [Object, nil] handed to the Sleeper
      def initialize(config, robots = nil)
        @config = config
        @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
        @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
        @jar = HTTP::CookieJar.new
      end

      # Requests +url+ and follows up to +redirect+ redirections.
      #
      # @param url [String]
      # @param request_header [Hash] extra request headers
      # @param redirect [Integer] remaining redirections to follow
      # @param method [Symbol, String] HTTP method name, e.g. :get or :head
      # @return [Response] the last response received; +redirected+ is true
      #   when +redirect+ differs from the configured maximum
      def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
        uri = Addressable::URI.parse(url)
        http = @pool.checkout(pool_name(uri)) { build_http(uri) }
        request = build_request(uri, request_header: request_header, method: method)

        append_cookie(url, request) if @config.handle_cookie

        @sleeper.politeness_delay(url)

        response = nil
        response_time = Benchmark.realtime { response = http.request(request) }

        parse_cookie(url, response) if @config.handle_cookie

        if redirection?(response.code) && response['location'] && redirect > 0
          fetch(uri.join(response['location']).to_s,
                request_header: request_header,
                redirect: redirect - 1,
                method: redirect_method(method))
        else
          res = build_response(url, response, response_time)
          res.redirected = (redirect != max_redirect)
          res
        end
      end

      private

      def max_redirect
        @config.max_redirect || 5
      end

      # Method used when following a redirect: a HEAD request stays HEAD so
      # that it is not silently turned into a body-downloading GET; every
      # other method is followed with GET.
      def redirect_method(method)
        method.to_s.downcase == 'head' ? method : :get
      end

      # One pool slot per scheme/host/port combination.
      def pool_name(uri)
        "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
      end

      # Opens a started Net::HTTP connection for the host of +uri+.
      def build_http(uri)
        http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
        http.open_timeout = @config.open_timeout if @config.open_timeout
        http.read_timeout = @config.read_timeout if @config.read_timeout
        if uri.scheme == 'https'
          http.use_ssl = true
          # NOTE(review): certificate verification is switched off for every
          # HTTPS host, so responses can be tampered with in transit.
          # Consider making this configurable.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end
        http.start
      end

      # Builds the request object, applying basic auth from the URI, the
      # configured User-Agent and then the caller's headers (which may
      # override the User-Agent).
      def build_request(uri, request_header:, method:)
        request = request_klass_for(method).new(uri.request_uri)
        request.basic_auth uri.user, uri.password if uri.user && uri.password

        request['User-Agent'] = @config.user_agent
        request_header.each { |key, value| request[key] = value }
        request
      end

      # Maps :get / :head / ... to Net::HTTP::Get / Net::HTTP::Head / ...
      # Only constants defined directly on Net::HTTP are considered.
      def request_klass_for(method)
        Net::HTTP.const_get(method.to_s.capitalize, false)
      end

      def build_response(url, response, response_time)
        Response.new(url: url,
                     status: response.code.to_i,
                     header: response.each_header.to_h,
                     body: response.body.to_s,
                     time: response_time)
      end

      def redirection?(code)
        (300..399).cover?(code.to_i)
      end

      # Feeds every Set-Cookie header to the jar separately.
      # response['set-cookie'] would join repeated headers with ", ", which
      # cannot be split again safely because cookie attributes such as
      # Expires contain commas themselves.
      def parse_cookie(url, response)
        Array(response.get_fields('set-cookie')).each do |set_cookie|
          @jar.parse(set_cookie, url)
        end
      end

      # Adds the jar's cookies for +url+ to the request, keeping any Cookie
      # header that is already present.
      def append_cookie(url, request)
        cookies = @jar.cookies(url)
        return if cookies.empty?

        request['Cookie'] = [request['Cookie'], cookies.join('; ')].compact.join('; ')
      end
    end
  end
end
|