kudzu 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
4
+ data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
5
+ SHA512:
6
+ metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
7
+ data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Kudzu
2
+
3
+ A simple web crawler for Ruby.
4
+
5
+ ## Features
6
+
7
+ * Run single-threaded or multi-threaded.
8
+ * Pool HTTP connections.
9
+ * Restrict links by url-based patterns.
10
+ * Respect robots.txt.
11
+ * Store page contents via adapter.
12
+
13
+ ## Dependencies
14
+
15
+ * Ruby 2.3+
16
+ * libicu
17
+
18
+ ## Installation
19
+
20
+ Add to your application's Gemfile:
21
+
22
+ ```ruby
23
+ gem 'kudzu'
24
+ ```
25
+
26
+ Then run:
27
+
28
+ $ bundle install
29
+
30
+ ## Usage
31
+
32
+ Crawl html files in `example.com`:
33
+
34
+ ```ruby
35
+ crawler = Kudzu::Crawler.new do
36
+ user_agent 'YOUR_AWESOME_APP'
37
+ add_filter do
38
+ focus_host true
39
+ allow_mime_type %w(text/html)
40
+ end
41
+ end
42
+ crawler.run('http://example.com/') do
43
+ on_success do |page, link|
44
+ puts page.url
45
+ end
46
+ end
47
+ ```
48
+
49
+ ## Adapters
50
+
51
+ By default, this gem only supports in-memory crawling. Use the following adapter to store page contents persistently:
52
+
53
+ * [kudzu-adapter-active_record](https://github.com/kanety/kudzu-adapter-active_record)
54
+
55
+ ## Contributing
56
+
57
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
58
+
59
+ ## License
60
+
61
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Define the `spec` task from RSpec's bundled Rake task class.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no task name runs the `spec` task.
task default: :spec
data/lib/kudzu.rb ADDED
@@ -0,0 +1,8 @@
1
+ module Kudzu
2
+ class << self
3
+ attr_accessor :adapter
4
+ end
5
+ end
6
+
7
+ require 'kudzu/version'
8
+ require 'kudzu/crawler'
@@ -0,0 +1,3 @@
1
# Load every Ruby file located directly in this file's directory.
pattern = File.join(__dir__, '*.rb')
Dir.glob(pattern).each { |path| require_relative path }
@@ -0,0 +1,8 @@
1
module Kudzu
  module Adapter
    module Base
      # Mixin for link models. It currently defines no methods and only
      # serves as a named module to include.
      module Link; end
    end
  end
end
@@ -0,0 +1,106 @@
1
require 'time'

module Kudzu
  module Adapter
    module Base
      # Behaviour shared by page models.
      #
      # The including class must provide +status+, +mime_type+, +charset+ and
      # +response_header+ readers. Every predicate tolerates those values
      # being nil, so a page that has not been fetched yet can be queried
      # safely.
      module Page
        # Flag stored on the page; this module only reads and writes it.
        attr_accessor :filtered

        # Raw response body as assigned through #body=.
        attr_reader :body

        # @return [Time, nil] the +last-modified+ header as a local Time, or
        #   nil when the header is absent or cannot be parsed
        def last_modified
          raw = header_value('last-modified')
          Time.parse(raw).localtime if raw
        rescue ArgumentError, TypeError
          nil
        end

        # @return [String, nil] the +etag+ header, or nil when there is none
        def etag
          header_value('etag')
        end

        def html?
          mime_type_matches?(%r{text/html|application/xhtml\+xml})
        end

        def xml?
          mime_type_matches?(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml})
        end

        def css?
          mime_type_matches?(%r{text/css})
        end

        def js?
          mime_type_matches?(%r{text/javascript|application/javascript|application/x-javascript})
        end

        # True for HTML, XML and any other +text/*+ MIME type.
        def text?
          html? || xml? || mime_type_matches?(%r{text/})
        end

        def status_success?
          status_within?(200..299)
        end

        def status_redirection?
          status_within?(300..399)
        end

        def status_client_error?
          status_within?(400..499)
        end

        def status_server_error?
          status_within?(500..599)
        end

        def status_not_modified?
          status == 304
        end

        def status_not_found?
          status == 404
        end

        def status_gone?
          status == 410
        end

        # Assigns the raw body and drops the memoized decoded copy so that
        # #decoded_body never returns text belonging to a previous body.
        def body=(body)
          @decoded_body = nil
          @body = body
        end

        # @return [String, nil] the body transcoded to UTF-8 for text pages,
        #   or the raw body for everything else (memoized)
        def decoded_body
          @decoded_body ||= decode_body(body)
        end

        private

        # Transcodes +body+ to UTF-8 when the page is textual. Invalid and
        # unmappable bytes are replaced rather than raising.
        def decode_body(body)
          return body if body.nil? || !text?

          source = find_encoding
          # Tag a copy with the declared charset: force_encoding mutates its
          # receiver and would otherwise change the caller's raw body.
          text = source ? body.dup.force_encoding(source) : body
          text.encode('utf-8', invalid: :replace, undef: :replace)
        end

        # @return [Encoding, nil] the Encoding named by +charset+, or nil when
        #   the charset is missing or unknown to Ruby
        def find_encoding
          Encoding.find(charset) if charset
        rescue ArgumentError, TypeError
          nil
        end

        # Looks up a response header, returning nil when no header hash is set.
        def header_value(name)
          response_header && response_header[name]
        end

        # Matches +pattern+ against the MIME type; a nil MIME type never matches.
        def mime_type_matches?(pattern)
          !(pattern =~ mime_type.to_s).nil?
        end

        # Range#cover? returns false (instead of raising) for a nil status.
        def status_within?(range)
          range.cover?(status)
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
# Load the base mixins first, then the in-memory adapter.
%w[base memory].each { |name| require_relative "#{name}/all" }

# Set the in-memory adapter as the module-level adapter.
Kudzu.adapter = Kudzu::Adapter::Memory
@@ -0,0 +1,3 @@
1
# Load every Ruby file under this file's directory, including subdirectories.
pattern = File.join(__dir__, '**/*.rb')
Dir.glob(pattern).each { |path| require_relative path }
@@ -0,0 +1,38 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module Memory
4
+ class Frontier
5
+ def initialize(uuid)
6
+ @uuid = uuid
7
+ @monitor = Monitor.new
8
+ @queue = []
9
+ @queued = {}
10
+ end
11
+
12
+ def enqueue(links, depth: 1)
13
+ @monitor.synchronize do
14
+ Array(links).each do |link|
15
+ next if @queued.key?(link.url)
16
+ @queued[link.url] = true
17
+ @queue << link
18
+ end
19
+ end
20
+ end
21
+
22
+ def dequeue(limit: 1)
23
+ @monitor.synchronize do
24
+ links = @queue.shift(limit)
25
+ links.each do |link|
26
+ link.state = 1
27
+ end
28
+ end
29
+ end
30
+
31
+ def clear
32
+ @queue.clear
33
+ @queued.clear
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,15 @@
1
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory link record.
      class Link
        include Kudzu::Adapter::Base::Link

        attr_accessor :uuid, :url, :title, :state, :depth

        # Builds a link from a hash of attributes. Keys that have no matching
        # public writer on this class are ignored.
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            public_send(writer, value) if respond_to?(writer)
          end
        end
      end
    end
  end
end
+ end
@@ -0,0 +1,17 @@
1
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory page record; derived behaviour comes from Base::Page.
      class Page
        include Kudzu::Adapter::Base::Page

        attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest
        attr_accessor :response_header, :response_time, :redirect_from
        attr_accessor :fetched_at, :revised_at, :revisit_interval, :revisit_at

        # Builds a page from a hash of attributes. Keys that have no matching
        # public writer are skipped.
        def initialize(attr = {})
          attr.each_pair do |name, value|
            setter = "#{name}="
            next unless respond_to?(setter)

            public_send(setter, value)
          end
        end
      end
    end
  end
end
+ end
@@ -0,0 +1,27 @@
1
module Kudzu
  module Adapter
    module Memory
      # In-memory page store keyed by URL.
      class Repository
        # Hash of URL => page for every registered page.
        attr_reader :page

        def initialize
          @page, @digest = {}, {}
        end

        # Returns the registered page for +url+, or a new unregistered Page
        # carrying only that URL.
        def find_by_url(url)
          known = @page[url]
          return known if known

          Page.new(url: url)
        end

        # Stores +page+ under its URL and records its digest.
        def register(page)
          @page.store(page.url, page)
          @digest.store(page.digest, true)
        end

        # Removes +page+ from the URL index; the recorded digest is kept.
        def delete(page)
          @page.delete(page.url)
        end
      end
    end
  end
end
+ end
@@ -0,0 +1,3 @@
1
# Require each Ruby file that sits in the same directory as this file.
Dir.glob(File.join(__dir__, '*.rb')).each do |path|
  require_relative path
end
@@ -0,0 +1,84 @@
1
+ require 'nokogiri'
2
+ require 'charlock_holmes'
3
+
4
module Kudzu
  class Agent
    # Determines the character encoding of a fetched page.
    #
    # HTML pages are inspected for <meta> declarations and XML pages for the
    # encoding Nokogiri reports; both fall back to statistical detection of
    # the raw bytes, which is also what plain text pages use.
    class CharsetDetector
      # Spellings seen in documents, mapped to names Ruby's Encoding accepts.
      CORRECTION = {
        'utf_8' => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis' => 'shift_jis',
        'euc_jp' => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # @param page [#html?, #xml?, #text?, #body] the fetched page
      # @return [String, nil] a lower-case charset name, or nil when the page
      #   is not textual or nothing could be detected
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Reads the charset from <meta charset="..."> or, failing that, from a
      # <meta http-equiv="content-type" content="...; charset=...">.
      def from_html(body)
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        node = doc.xpath('//meta/@charset').first
        charset = node && correct(node.to_s)
        return charset if charset

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'] =~ /content-type/i

          # Validate the charset parsed from this tag's content attribute.
          charset = correct(@parser.parse(meta[:content].to_s)[1][:charset])
          return charset if charset
        end

        nil
      end

      # Uses the encoding Nokogiri reports for the parsed XML document.
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        correct(doc.encoding)
      end

      # Detects the encoding from the bytes themselves.
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        detection[:encoding].downcase if detection && detection.key?(:encoding)
      end

      # Normalizes a declared charset name.
      #
      # @param charset [String, nil] the name as written in the document
      # @return [String, nil] the corrected lower-case name, or nil when it is
      #   blank or not an encoding Ruby knows
      def correct(charset)
        name = charset.to_s.strip.downcase
        return nil if name.empty?

        name = CORRECTION.fetch(name, name)
        Encoding.find(name) # raises ArgumentError for unknown names
        name
      rescue ArgumentError
        nil
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
+ require 'net/http'
2
+ require 'http-cookie'
3
+
4
module Kudzu
  class Agent
    # Performs HTTP requests over pooled connections, following redirects and
    # optionally carrying cookies between requests.
    class Fetcher
      # Outcome of a fetch: final URL, numeric status, header hash, body,
      # elapsed seconds of the last request and whether a redirect was followed.
      class Response
        attr_accessor :url, :status, :header, :body, :time, :redirected

        def initialize(attr = {})
          attr.each { |k, v| public_send("#{k}=", v) }
        end

        def redirected?
          redirected
        end
      end

      attr_reader :pool

      # @param config [Object] settings object; this class reads
      #   max_connection, max_redirect, open_timeout, read_timeout,
      #   user_agent and handle_cookie from it
      # @param robots [Object, nil] passed through to the Sleeper
      def initialize(config, robots = nil)
        @config = config
        @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
        @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
        @jar = HTTP::CookieJar.new
      end

      # Fetches +url+, following up to +redirect+ redirects.
      #
      # @param url [String] absolute URL to request
      # @param request_header [Hash] extra request headers
      # @param redirect [Integer] maximum number of redirects to follow
      # @param method [Symbol] HTTP method name, e.g. :get or :head
      # @return [Response] the final response; +redirected?+ is true only when
      #   at least one redirect was actually followed
      def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
        fetch_following_redirects(url, request_header, redirect, method, false)
      end

      private

      # One request/response cycle that recurses while the server redirects
      # and +remaining+ allows. +redirected+ records whether an earlier hop
      # was a redirect, independently of the budget the caller passed in.
      def fetch_following_redirects(url, request_header, remaining, method, redirected)
        uri = Addressable::URI.parse(url)
        http = @pool.checkout(pool_name(uri)) { build_http(uri) }
        request = build_request(uri, request_header: request_header, method: method)

        append_cookie(url, request) if @config.handle_cookie

        @sleeper.politeness_delay(url)

        response, elapsed = timed { http.request(request) }

        parse_cookie(url, response) if @config.handle_cookie

        if redirection?(response.code) && response['location'] && remaining > 0
          target = uri.join(response['location']).to_s
          fetch_following_redirects(target, request_header, remaining - 1, redirect_method(method), true)
        else
          res = build_response(url, response, elapsed)
          res.redirected = redirected
          res
        end
      end

      # Runs the block and returns [block result, elapsed seconds], measured
      # with the monotonic clock so wall-clock adjustments cannot skew it.
      def timed
        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        result = yield
        [result, Process.clock_gettime(Process::CLOCK_MONOTONIC) - started]
      end

      # Method used for the request that follows a redirect: HEAD stays HEAD
      # so that no body is downloaded; every other method follows with GET.
      def redirect_method(method)
        method.to_s.casecmp('head').zero? ? method : :get
      end

      def max_redirect
        @config.max_redirect || 5
      end

      # Connections are pooled per scheme, host and port.
      def pool_name(uri)
        "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
      end

      # Opens a started Net::HTTP connection for +uri+.
      def build_http(uri)
        http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
        http.open_timeout = @config.open_timeout if @config.open_timeout
        http.read_timeout = @config.read_timeout if @config.read_timeout
        if uri.scheme == 'https'
          http.use_ssl = true
          # NOTE(review): certificate verification is disabled, so HTTPS
          # peers are not authenticated. Confirm this is intentional.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end
        http.start
      end

      def build_request(uri, request_header:, method:)
        request = request_klass_for(method).new(uri.request_uri)
        request.basic_auth uri.user, uri.password if uri.user && uri.password

        request['User-Agent'] = @config.user_agent
        request_header.each { |key, value| request[key] = value }
        request
      end

      # Maps :get => Net::HTTP::Get, :head => Net::HTTP::Head, and so on.
      def request_klass_for(method)
        Net::HTTP.const_get(method.to_s.capitalize)
      end

      def build_response(url, response, response_time)
        Response.new(url: url,
                     status: response.code.to_i,
                     header: response.each_header.to_h,
                     body: response.body.to_s,
                     time: response_time)
      end

      def redirection?(code)
        code.to_i.between?(300, 399)
      end

      # Stores cookies from the response in the jar.
      def parse_cookie(url, response)
        @jar.parse(response['set-cookie'], url) if response['set-cookie']
      end

      # Adds the jar's cookies for +url+ to the request, after any Cookie
      # header already present.
      def append_cookie(url, request)
        cookies = @jar.cookies(url)
        return if cookies.empty?

        request['Cookie'] = [request['Cookie'], *cookies].compact.join('; ')
      end
    end
  end
end
+ end