kudzu 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
4
+ data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
5
+ SHA512:
6
+ metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
7
+ data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Kudzu
2
+
3
+ A simple web crawler for Ruby.
4
+
5
+ ## Features
6
+
7
+ * Run single-threaded or multi-threaded.
8
+ * Pool HTTP connections.
9
+ * Restrict links by URL-based patterns.
10
+ * Respect robots.txt.
11
+ * Store page contents via adapter.
12
+
13
+ ## Dependencies
14
+
15
+ * ruby 2.3+
16
+ * libicu
17
+
18
+ ## Installation
19
+
20
+ Add to your application's Gemfile:
21
+
22
+ ```ruby
23
+ gem 'kudzu'
24
+ ```
25
+
26
+ Then run:
27
+
28
+ $ bundle install
29
+
30
+ ## Usage
31
+
32
+ Crawl HTML pages on `example.com`:
33
+
34
+ ```ruby
35
+ crawler = Kudzu::Crawler.new do
36
+ user_agent 'YOUR_AWESOME_APP'
37
+ add_filter do
38
+ focus_host true
39
+ allow_mime_type %w(text/html)
40
+ end
41
+ end
42
+ crawler.run('http://example.com/') do
43
+ on_success do |page, link|
44
+ puts page.url
45
+ end
46
+ end
47
+ ```
48
+
49
+ ## Adapters
50
+
51
+ By default this gem only supports in-memory crawling. Use the following adapter to store page contents persistently:
52
+
53
+ * [kudzu-adapter-active_record](https://github.com/kanety/kudzu-adapter-active_record)
54
+
55
+ ## Contributing
56
+
57
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
58
+
59
+ ## License
60
+
61
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Define the `spec` rake task through RSpec's rake integration.
RSpec::Core::RakeTask.new(:spec)

# Make `spec` the task that runs when `rake` is invoked without arguments.
task default: :spec
data/lib/kudzu.rb ADDED
@@ -0,0 +1,8 @@
1
# Top-level namespace of the gem. It carries a single module-level setting,
# +adapter+, which is nil until something assigns it.
module Kudzu
  # Returns the object most recently assigned with +Kudzu.adapter=+.
  def self.adapter
    @adapter
  end

  # Stores +value+ as the module-level adapter.
  def self.adapter=(value)
    @adapter = value
  end
end
6
+
7
+ require 'kudzu/version'
8
+ require 'kudzu/crawler'
@@ -0,0 +1,3 @@
1
# Require every Ruby file that sits in the same directory as this file.
# The list is sorted explicitly: before Ruby 3.0 Dir[] returns entries in
# file-system order, so without the sort the load order could differ
# between machines.
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -0,0 +1,8 @@
1
module Kudzu
  module Adapter
    module Base
      # Mixin for link objects. It defines no methods of its own here; it only
      # gives link classes a common module to include.
      module Link; end
    end
  end
end
@@ -0,0 +1,106 @@
1
require 'time' # Time.parse (used by #last_modified) comes from the stdlib 'time' extension

module Kudzu
  module Adapter
    module Base
      # Behaviour shared by page objects.
      #
      # The including class is expected to provide readers for
      # +response_header+, +mime_type+, +status+ and +charset+; every method
      # below is derived from those plus the +body+ stored on the page.
      module Page
        # Raw body as assigned through #body=.
        attr_reader :body

        # Free-form flag/value stored on the page; this module only stores it.
        attr_accessor :filtered

        # Parses the +last-modified+ response header.
        #
        # @return [Time, nil] the header value converted to local time, or nil
        #   when the header is absent or cannot be parsed
        def last_modified
          value = (response_header || {})['last-modified']
          Time.parse(value).localtime if value
        rescue StandardError
          # Best effort: an unparsable date is treated as "unknown".
          nil
        end

        # @return [String, nil] value of the +etag+ response header, nil when
        #   there is no header hash or no such header
        def etag
          (response_header || {})['etag']
        end

        # True when the MIME type contains an HTML or XHTML media type.
        def html?
          !mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
        end

        # True when the MIME type contains a generic XML, RSS or Atom media type.
        def xml?
          !mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
        end

        # True when the MIME type contains +text/css+.
        def css?
          !mime_type.to_s.match(%r{text/css}).nil?
        end

        # True when the MIME type contains one of the JavaScript media types.
        def js?
          !mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
        end

        # True for HTML, XML and any other +text/*+ MIME type.
        def text?
          html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
        end

        # The status predicates use Range#cover?, which answers false for a
        # nil status instead of raising.

        # True for 2xx statuses.
        def status_success?
          (200..299).cover?(status)
        end

        # True for 3xx statuses.
        def status_redirection?
          (300..399).cover?(status)
        end

        # True for 4xx statuses.
        def status_client_error?
          (400..499).cover?(status)
        end

        # True for 5xx statuses.
        def status_server_error?
          (500..599).cover?(status)
        end

        # True when the status is 304.
        def status_not_modified?
          status == 304
        end

        # True when the status is 404.
        def status_not_found?
          status == 404
        end

        # True when the status is 410.
        def status_gone?
          status == 410
        end

        # Assigns the raw body and drops the memoized decoded copy so that
        # #decoded_body never returns text belonging to a previous body.
        def body=(body)
          @decoded_body = nil
          @body = body
        end

        # Body converted to UTF-8 for text pages; the raw body otherwise.
        # The result is memoized until #body= is called again.
        #
        # @return [String, nil]
        def decoded_body
          @decoded_body ||= decode_body(body)
        end

        private

        # Converts +body+ to UTF-8 when the page is textual.
        #
        # When +charset+ names an encoding Ruby knows, the bytes are
        # interpreted in that encoding; otherwise the string's current
        # encoding is used as the source. Invalid or unmappable bytes are
        # replaced rather than raising.
        def decode_body(body)
          return body if body.nil? || !text?

          if find_encoding
            # dup first: force_encoding works in place, and the caller's raw
            # body must keep its original encoding (and may be frozen).
            body.dup.force_encoding(charset).encode('utf-8', invalid: :replace, undef: :replace)
          else
            body.encode('utf-8', invalid: :replace, undef: :replace)
          end
        end

        # @return [Encoding, nil] the encoding named by +charset+, or nil when
        #   the charset is missing or unknown to Ruby
        def find_encoding
          Encoding.find(charset)
        rescue StandardError
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require_relative 'base/all'
2
+ require_relative 'memory/all'
3
+
4
+ Kudzu.adapter = Kudzu::Adapter::Memory
@@ -0,0 +1,3 @@
1
# Require every Ruby file below this file's directory, at any depth.
# The list is sorted explicitly: before Ruby 3.0 Dir[] returns entries in
# file-system order, so without the sort the load order could differ
# between machines.
Dir[File.join(__dir__, '**/*.rb')].sort.each do |file|
  require_relative file
end
@@ -0,0 +1,38 @@
1
require 'monitor' # Monitor is used for locking below

module Kudzu
  module Adapter
    module Memory
      # In-memory FIFO queue of links that remembers every URL it has ever
      # accepted, so the same URL is queued at most once until #clear is
      # called. All operations run under one Monitor.
      class Frontier
        # @param uuid [Object] identifier stored on the instance; it is not
        #   used by any method of this class
        def initialize(uuid)
          @uuid = uuid
          @monitor = Monitor.new
          @queue = []
          @queued = {}
        end

        # Appends the links whose +url+ has not been seen before.
        #
        # @param links [Object, Array] one link or a list of links; each must
        #   respond to +url+
        # @param depth [Integer] accepted for interface compatibility; this
        #   implementation does not use it
        def enqueue(links, depth: 1)
          @monitor.synchronize do
            Array(links).each do |link|
              next if @queued.key?(link.url)

              @queued[link.url] = true
              @queue << link
            end
          end
        end

        # Removes up to +limit+ links from the front of the queue and sets
        # each one's +state+ to 1.
        # NOTE(review): 1 presumably means "fetched/in progress" - confirm
        # against the code that reads Link#state.
        #
        # @return [Array] the removed links (empty when the queue is empty)
        def dequeue(limit: 1)
          @monitor.synchronize do
            links = @queue.shift(limit)
            links.each do |link|
              link.state = 1
            end
          end
        end

        # Empties both the queue and the seen-URL index. This runs under the
        # same monitor as #enqueue/#dequeue so that it cannot interleave with
        # a concurrent enqueue and leave the two structures inconsistent.
        def clear
          @monitor.synchronize do
            @queue.clear
            @queued.clear
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory link record.
      class Link
        include Kudzu::Adapter::Base::Link

        attr_accessor :uuid, :url, :title, :state, :depth

        # Builds a link from a hash of attributes. Keys without a matching
        # public writer on this object are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            public_send(writer, value) if respond_to?(writer)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Kudzu
  module Adapter
    module Memory
      # Plain in-memory page record; derived behaviour comes from the
      # included Kudzu::Adapter::Base::Page.
      class Page
        include Kudzu::Adapter::Base::Page

        attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest
        attr_accessor :response_header, :response_time, :redirect_from, :fetched_at, :revised_at
        attr_accessor :revisit_interval, :revisit_at

        # Builds a page from a hash of attributes. Keys without a matching
        # public writer on this object are ignored.
        #
        # @param attr [Hash] attribute name => value
        def initialize(attr = {})
          attr.each do |name, value|
            writer = "#{name}="
            public_send(writer, value) if respond_to?(writer)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Kudzu
  module Adapter
    module Memory
      # Keeps registered pages in a hash keyed by URL and records every
      # digest that has been registered.
      class Repository
        # @return [Hash] registered pages keyed by URL
        attr_reader :page

        def initialize
          @page = {}
          @digest = {}
        end

        # Returns the page registered under +url+, or a new, unregistered
        # Page carrying only that URL when none is stored.
        def find_by_url(url)
          stored = @page[url]
          return stored if stored

          Page.new(url: url)
        end

        # Stores +page+ under its URL and records its digest.
        #
        # @return [true]
        def register(page)
          @page.store(page.url, page)
          @digest.store(page.digest, true)
        end

        # Removes the page stored under +page.url+. The recorded digest is
        # left in place.
        #
        # @return [Object, nil] the removed page, or nil when none was stored
        def delete(page)
          @page.delete(page.url)
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Require every Ruby file that sits in the same directory as this file.
# The list is sorted explicitly: before Ruby 3.0 Dir[] returns entries in
# file-system order, so without the sort the load order could differ
# between machines.
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -0,0 +1,84 @@
1
+ require 'nokogiri'
2
+ require 'charlock_holmes'
3
+
4
module Kudzu
  class Agent
    # Works out the character encoding of a fetched page from its markup and,
    # failing that, from its bytes.
    class CharsetDetector
      # Charset labels seen in documents, mapped to the names that are looked
      # up with Encoding.find.
      CORRECTION = {
        'utf_8' => 'utf-8',
        'shift-jis' => 'shift_jis',
        'x-sjis' => 'shift_jis',
        'euc_jp' => 'euc-jp'
      }.freeze

      def initialize
        @parser = Kudzu::Util::ContentTypeParser.new
      end

      # Detects the charset of +page+.
      #
      # HTML pages are checked for a <meta> declaration, XML pages for the
      # encoding Nokogiri reports for the document; both fall back to
      # byte-level detection, which is also what other text pages use.
      #
      # @param page [#html?, #xml?, #text?, #body]
      # @return [String, nil] lower-case charset name, or nil for non-text
      #   pages and when nothing could be detected
      def detect(page)
        if page.html?
          from_html(page.body) || from_text(page.body)
        elsif page.xml?
          from_xml(page.body) || from_text(page.body)
        elsif page.text?
          from_text(page.body)
        end
      end

      private

      # Looks for <meta charset="..."> first, then for
      # <meta http-equiv="content-type" content="...; charset=...">.
      # The body is squeezed to ASCII before parsing, so only the markup
      # itself needs to survive.
      def from_html(body)
        doc = Nokogiri::HTML(body.encode('ascii', undef: :replace, invalid: :replace))

        if (node = doc.xpath('//meta/@charset').first)
          charset = correct(node.to_s)
          return charset if charset
        end

        doc.xpath('//meta[@http-equiv]').each do |meta|
          next unless meta['http-equiv'] =~ /content-type/i

          # Validate the charset parsed out of the content attribute itself
          # (not the <meta charset> node, which is nil whenever this branch
          # is reached without an early return).
          _type, params = @parser.parse(meta[:content].to_s)
          charset = correct((params || {})[:charset])
          return charset if charset
        end

        nil
      end

      # Returns the normalized encoding Nokogiri reports for the XML
      # document, or nil when it reports none.
      def from_xml(body)
        doc = Nokogiri::XML(body.encode('ascii', undef: :replace, invalid: :replace))
        doc.encoding ? correct(doc.encoding) : nil
      end

      # Byte-level detection: pure ASCII is reported as 'ascii', anything
      # else is handed to CharlockHolmes.
      def from_text(text)
        return 'ascii' if text.ascii_only?

        detection = CharlockHolmes::EncodingDetector.detect(text)
        return nil unless detection && detection.key?(:encoding)

        detection[:encoding].downcase
      end

      # Normalizes a charset label and checks that Ruby knows it.
      #
      # @param charset [String, nil]
      # @return [String, nil] the lower-cased (and, where listed in
      #   CORRECTION, corrected) name, or nil when +charset+ is nil or names
      #   no encoding known to Ruby
      def correct(charset)
        return nil if charset.nil?

        name = charset.to_s.downcase
        name = CORRECTION.fetch(name, name)
        Encoding.find(name) # raises for unknown names
        name
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
+ require 'net/http'
2
+ require 'http-cookie'
3
+
4
module Kudzu
  class Agent
    # Performs HTTP requests for the crawler: takes a connection from the
    # pool, applies the politeness delay, optionally handles cookies and
    # follows redirects.
    class Fetcher
      # Result of a fetch.
      class Response
        attr_accessor :url, :status, :header, :body, :time, :redirected

        # @param attr [Hash] attribute name => value; every key must have a
        #   public writer on this class
        def initialize(attr = {})
          attr.each { |k, v| public_send("#{k}=", v) }
        end

        # @return [Boolean, nil] whatever was assigned to +redirected+
        def redirected?
          redirected
        end
      end

      attr_reader :pool

      # @param config [Object] read here for max_connection, max_redirect,
      #   handle_cookie, user_agent, open_timeout and read_timeout
      # @param robots [Object, nil] passed through to the Sleeper
      def initialize(config, robots = nil)
        @config = config
        @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
        @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
        @jar = HTTP::CookieJar.new
      end

      # Requests +url+ and follows up to +redirect+ redirects.
      #
      # @param url [String]
      # @param request_header [Hash] extra request headers, sent on every hop
      # @param redirect [Integer] maximum number of redirects to follow
      # @param method [Symbol, String] HTTP method name, e.g. :get or :head
      # @return [Response] the final response; +redirected+ is true only if
      #   at least one redirect was actually followed, +time+ is the duration
      #   in seconds of the last request
      def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
        redirected = false

        loop do
          uri = Addressable::URI.parse(url)
          response, response_time = perform_request(uri, url, request_header, method)
          location = response['location']

          unless redirection?(response.code) && location && redirect > 0
            res = build_response(url, response, response_time)
            res.redirected = redirected
            return res
          end

          method = redirect_method(method, response.code)
          url = uri.join(location).to_s
          redirect -= 1
          redirected = true
        end
      end

      private

      def max_redirect
        @config.max_redirect || 5
      end

      # Sends one request and returns [raw response, seconds taken].
      def perform_request(uri, url, request_header, method)
        http = @pool.checkout(pool_name(uri)) { build_http(uri) }
        request = build_request(uri, request_header: request_header, method: method)

        append_cookie(url, request) if @config.handle_cookie

        @sleeper.politeness_delay(url)

        # Monotonic clock: immune to wall-clock adjustments and needs no
        # extra library.
        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        response = http.request(request)
        elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

        parse_cookie(url, response) if @config.handle_cookie

        [response, elapsed]
      end

      # Method to use for the request that follows a redirect: HEAD stays
      # HEAD, 307/308 keep the original method, everything else is re-issued
      # as GET.
      def redirect_method(method, code)
        return method if method.to_s.casecmp('head').zero?

        [307, 308].include?(code.to_i) ? method : :get
      end

      # Key under which connections for this scheme/host/port are pooled.
      def pool_name(uri)
        "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
      end

      # Opens a started Net::HTTP connection for +uri+.
      def build_http(uri)
        http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
        http.open_timeout = @config.open_timeout if @config.open_timeout
        http.read_timeout = @config.read_timeout if @config.read_timeout
        if uri.scheme == 'https'
          http.use_ssl = true
          # NOTE(review): certificate verification is switched off, so HTTPS
          # peers are not authenticated. Kept as in the original; decide
          # whether this should become configurable.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end
        http.start
      end

      # Builds the request object with basic auth taken from the URL's
      # userinfo, the configured User-Agent, and then the caller's headers
      # (which therefore win over User-Agent).
      def build_request(uri, request_header:, method:)
        request = request_klass_for(method).new(uri.request_uri)
        request.basic_auth uri.user, uri.password if uri.user && uri.password

        request['User-Agent'] = @config.user_agent
        request_header.each do |key, value|
          request[key] = value
        end
        request
      end

      # Maps :get to Net::HTTP::Get, :head to Net::HTTP::Head, and so on.
      # The lookup is confined to constants defined directly on Net::HTTP.
      def request_klass_for(method)
        Net::HTTP.const_get(method.to_s.capitalize, false)
      end

      def build_response(url, response, response_time)
        Response.new(url: url,
                     status: response.code.to_i,
                     header: Hash[response.each.to_a],
                     body: response.body.to_s,
                     time: response_time)
      end

      def redirection?(code)
        code = code.to_i
        300 <= code && code <= 399
      end

      # Stores cookies from the response's Set-Cookie header in the jar.
      def parse_cookie(url, response)
        @jar.parse(response['set-cookie'], url) if response['set-cookie']
      end

      # Adds the jar's cookies for +url+ to the request, after any Cookie
      # header the request already carries.
      def append_cookie(url, request)
        cookies = @jar.cookies(url)
        return if cookies.empty?

        if request['Cookie']
          request['Cookie'] += '; ' + cookies.join('; ')
        else
          request['Cookie'] = cookies.join('; ')
        end
      end
    end
  end
end