grucrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2c1969c40c532caad82628f59ac6cd088d069af2
4
+ data.tar.gz: 27e7781412201b02ef9356b28f0119a2cef5d69d
5
+ SHA512:
6
+ metadata.gz: 2f8704b18e02eedd2efaa39649a2e61e60bc6cfcd17a8e5e77e28e6e793d4475f5e29d887441cb0f632cd785d03e34e0c661059f810460be190ade23da2a7925
7
+ data.tar.gz: e10692a0971c5b427b501018fb85419d2810880225add5f2a7d7358abd4bd82b32f5ce5bc17781c582bdcde61d8429e040755d88d6cad9cacac8b7d2cc5f490c
data/.gitignore ADDED
@@ -0,0 +1,25 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
23
+
24
+ /.idea
25
+ /examples
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in grucrawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Slava Vishnyakov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,78 @@
1
+ # Grucrawler
2
+
3
+ ```ruby
4
+ require 'grucrawler'
5
+
6
+ class ItalianCrawler
7
+ def options
8
+ {
9
+ visit_urls_only_once: true,
10
+ follow_redirects: true,
11
+ concurrency: 5
12
+ }
13
+ end
14
+
15
+ def on_init(crawler)
16
+ @crawler = crawler
17
+ end
18
+
19
+ def on_page_received(typhoeus_response, nokogiri_html)
20
+ puts "GOT #{typhoeus_response.effective_url.green}"
21
+
22
+ # typhoeus_response.body
23
+ # typhoeus_response.request.url
24
+ # typhoeus_response.effective_url
25
+ # nokogiri_html.css('a').each { |a| puts a.text }
26
+ end
27
+
28
+ def follow_link(target_url, typhoeus_response, nokogiri_html)
29
+ return true if target_url.include? '.it'
30
+
31
+ false
32
+ end
33
+
34
+ def debug(message)
35
+ #puts message.blue
36
+ end
37
+
38
+ def log_info(message)
39
+ puts message.yellow
40
+ end
41
+
42
+ def log_error(typhoeus_response, exception)
43
+ puts exception.to_s.red
44
+ end
45
+ end
46
+
47
+ c = GruCrawler.new(ItalianCrawler.new)
48
+ # c.reset() # deletes all memory of all events - useful for restarting crawl
49
+ c.add_url('http://www.oneworlditaliano.com/english/italian/news-in-italian.htm')
50
+ c.run()
51
+ ```
52
+
53
+
54
+
55
+ ## Installation
56
+
57
+ [!] Requires a local Redis server at the moment. Note: the `.green`/`.yellow`/`.red` string helpers in the example above come from a string-colorizing gem (e.g. `colorize`) that is not declared as a dependency — install it separately or drop the color calls.
58
+
59
+ Add this line to your application's Gemfile:
60
+
61
+ gem 'grucrawler'
62
+
63
+ And then execute:
64
+
65
+ $ bundle
66
+
67
+ Or install it yourself as:
68
+
69
+ $ gem install grucrawler
70
+
71
+
72
+ ## Contributing
73
+
74
+ 1. Fork it ( https://github.com/[my-github-username]/grucrawler/fork )
75
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
76
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
77
+ 4. Push to the branch (`git push origin my-new-feature`)
78
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,28 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'grucrawler/version'

# Gem packaging manifest for grucrawler.
Gem::Specification.new do |s|
  s.name          = "grucrawler"
  s.version       = GruCrawler::VERSION
  s.authors       = ["Slava Vishnyakov"]
  s.email         = ["bomboze@gmail.com"]
  s.summary       = "Simple crawler using Redis as backend"
  s.description   = "Simple crawler using Redis as backend"
  s.homepage      = "https://github.com/slava-vishnyakov/grucrawler"
  s.license       = "MIT"

  # Ship every git-tracked file; anything under bin/ becomes an executable.
  s.files         = `git ls-files -z`.split("\x0")
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  s.add_development_dependency "bundler", "~> 1.6"
  s.add_development_dependency "rake"

  # Runtime dependencies: HTTP client, queue backend, HTML parsing.
  s.add_dependency "typhoeus"
  s.add_dependency "redis"
  s.add_dependency "nokogiri"
end
@@ -0,0 +1,120 @@
1
class GruCrawler
  # Raised by user callbacks to stop following links from the current page.
  # NOTE(review): StandardError would be the idiomatic superclass, but the
  # same class is declared identically in lib/grucrawler.rb — both
  # declarations must change together or Ruby raises a superclass mismatch.
  class DoNotCrawlFurther < Exception
  end

  # Redis-backed crawl queue with per-domain politeness throttling.
  # All Redis keys are prefixed with "<namespace>:" so several crawlers can
  # share one Redis instance.
  class Queue
    VISITED_ALREADY_KEY = 'visited_already'
    DOMAIN_VISITS_KEY = 'domain_visits'
    QUEUE_KEY = 'queue'

    # Minimum number of seconds between two requests to the same domain.
    MIN_TIME_TO_WAIT = 20

    # namespace  - String used to prefix all Redis keys (the crawler class name).
    # visit_once - when truthy, URLs are remembered and never crawled twice.
    def initialize(namespace, visit_once)
      @redis = Redis.new
      @rns = namespace + ':'
      @concurrent_requests = 0
      @tmp_block = {}                     # URLs currently in flight, blocked from re-pick
      @domains_throttle = Hash.new(0.0)
      @visit_once = visit_once
    end

    # Deletes all crawl state from Redis — useful for restarting a crawl.
    def reset
      @redis.del(@rns + DOMAIN_VISITS_KEY)
      @redis.del(@rns + QUEUE_KEY)
      @redis.del(@rns + VISITED_ALREADY_KEY)
    end

    # Picks a crawlable URL from the queue, or nil when none is available.
    # Samples up to 100 random queue members, skipping URLs already visited
    # or whose domain was hit too recently.
    def next_url
      url = nil

      100.times do
        candidate = random_url_from_queue()

        # Empty queue: nothing to inspect (previously nil leaked into
        # visited_already/domain and raised, and got stuck in @tmp_block).
        next if candidate.nil?
        next if visited_already(candidate) or not can_visit_now(candidate)

        url = candidate
        break
      end

      @tmp_block[url] = true if url

      url
    end

    # True when the URL is not in flight and its domain has cooled down.
    def can_visit_now(url)
      return false if @tmp_block[url]

      last_visit = last_visit_to_domain(url)
      time_passed = Time.now.to_f - last_visit

      time_passed > MIN_TIME_TO_WAIT
    end

    # Marks the URL's domain as just-visited and counts the request as active.
    def started(url)
      set_last_visit_to_domain(url)

      @concurrent_requests += 1
    end

    # Releases the in-flight block, records the visit, and frees a slot.
    def finished(url)
      if url
        @tmp_block.delete(url)
        set_visited_already(url)
        remove_url_from_queue(url)
      end
      @concurrent_requests -= 1
    end

    # Number of requests currently in flight.
    def count
      @concurrent_requests
    end

    def set_last_visit_to_domain(url)
      time = Time.now.to_f
      @redis.hset(@rns + DOMAIN_VISITS_KEY, domain(url), time)
    end

    def last_visit_to_domain(url)
      @redis.hget(@rns + DOMAIN_VISITS_KEY, domain(url)).to_f
    end

    def remove_url_from_queue(url)
      @redis.srem(@rns + QUEUE_KEY, url)
    end

    def random_url_from_queue
      @redis.srandmember(@rns + QUEUE_KEY)
    end

    # Returns true when the URL was not in the queue yet.
    def push(url)
      @redis.sadd(@rns + QUEUE_KEY, url) == 1
    end

    def visited_already(url)
      return false unless @visit_once
      @redis.sismember(@rns + VISITED_ALREADY_KEY, url)
    end

    def set_visited_already(url)
      return unless @visit_once
      @redis.sadd(@rns + VISITED_ALREADY_KEY, url)
    end

    # Extracts a crude "registrable domain" (last two dot-separated labels,
    # with a leading "www." stripped). Returns nil for unparsable URLs,
    # URLs without a host, or hosts without a dot — the original raised
    # NoMethodError on e.g. "http://localhost/".
    # TODO: use the public_suffix gem for multi-label TLDs (.co.uk etc.).
    def domain(url)
      begin
        uri = URI.parse(url)
      rescue URI::InvalidURIError
        return nil
      end

      return nil if uri.host.nil?
      host = uri.host.downcase
      host = host.start_with?('www.') ? host[4..-1] : host
      match = host.match(/\w+\.\w+$/)
      match && match[0]
    end
  end
end
@@ -0,0 +1,3 @@
1
class GruCrawler
  # Gem version, referenced from the gemspec. Frozen so the constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/grucrawler.rb ADDED
@@ -0,0 +1,94 @@
1
+ require "grucrawler/version"
2
+ require "grucrawler/queue"
3
+ require "typhoeus"
4
+ require "redis"
5
+ require "uri"
6
+ require "nokogiri"
7
+
8
class GruCrawler
  # Raised by user callbacks to stop following links from the current page.
  # NOTE(review): subclassing Exception (not StandardError) is unidiomatic,
  # but the same class is declared identically in queue.rb — change both
  # declarations together to avoid a superclass mismatch at load time.
  class DoNotCrawlFurther < Exception
  end

  # rules - user object implementing the callback protocol shown in the
  # README: options, on_init, on_page_received, follow_link, debug,
  # log_info, log_error.
  def initialize(rules)
    @crawler = rules
    @options = @crawler.options()
    @queue = GruCrawler::Queue.new(@crawler.class.name, @options[:visit_urls_only_once])

    @crawler.on_init(self)
  end

  # Runs the crawl; blocks until the request pipeline drains.
  def run
    @hydra = Typhoeus::Hydra.new()
    @concurrency = @options[:concurrency] || 5
    crawl_more()
    @hydra.run
  end

  # Queues a URL; returns the queue's push result (truthy when it was new).
  def add_url(url)
    @queue.push(url)
  end

  # Takes one URL off the queue and hands it to Typhoeus.
  # Returns false when no crawlable URL is available right now.
  def add_from_queue
    url = @queue.next_url()
    return false unless url

    request = Typhoeus::Request.new(url, followlocation: @options[:follow_redirects], accept_encoding: 'gzip')
    @queue.started(url)

    request.on_complete do |response|
      on_response(response)
    end

    @crawler.debug("#{Time.now} started URL #{url}")
    @hydra.queue(request)

    true
  end

  # Forgets all crawl state (queue, visited set, domain throttle).
  def reset
    @queue.reset
  end

  # Typhoeus completion callback: notify the user callback, queue the
  # page's links, and keep the request pipeline topped up.
  def on_response(response)
    @crawler.debug("#{Time.now} ended URL #{response.request.url}")
    @queue.finished(response.request.url)

    crawl_more()

    nokogiri = Nokogiri::HTML(response.body)

    begin
      @crawler.on_page_received(response, nokogiri)
    rescue DoNotCrawlFurther
      # The callback asked us not to follow this page's links. Previously
      # this escaped the bare rescue below (it subclasses Exception) and
      # aborted the entire crawl instead of skipping one page.
      crawl_more()
      return
    rescue => error
      @crawler.log_error(response, error)
    end

    queue_links(response, nokogiri)

    crawl_more()
  end

  # Tops the pipeline up to @concurrency in-flight requests.
  def crawl_more
    while @queue.count < @concurrency
      break unless add_from_queue()
    end
  end

  # Resolves every <a href> on the page and queues those approved by the
  # user's follow_link callback.
  def queue_links(response, nokogiri)
    nokogiri.css('a').each do |link|
      next unless link['href']

      begin
        url = URI.join(response.effective_url, link['href']).to_s
      rescue StandardError
        # Malformed href — skip it rather than kill the crawl.
        next
      end
      if @crawler.follow_link(url, response, nokogiri)
        added = add_url(url)
        @crawler.debug("#{Time.now} queued #{url}") if added
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: grucrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Slava Vishnyakov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: typhoeus
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: redis
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Simple crawler using Redis as backend
84
+ email:
85
+ - bomboze@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - .gitignore
91
+ - Gemfile
92
+ - LICENSE.txt
93
+ - README.md
94
+ - Rakefile
95
+ - grucrawler.gemspec
96
+ - lib/grucrawler.rb
97
+ - lib/grucrawler/queue.rb
98
+ - lib/grucrawler/version.rb
99
+ homepage: https://github.com/slava-vishnyakov/grucrawler
100
+ licenses:
101
+ - MIT
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.2.2
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Simple crawler using Redis as backend
123
+ test_files: []