ranunculus 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 92b1b11c7cdbeca56636d6da9a841d02506b95e9
4
+ data.tar.gz: 84b35bb769296e265f7bfdb8256fef3ebc6f8719
5
+ SHA512:
6
+ metadata.gz: 96ab0a4bc0ae61ac78e1cc7b6bef8b2ad88a5f9327f3f3237966d72ef2f1d31e4c9744f89abe02a864c5abbcc42dcd40fcf84ddb6e316898881c4fd0efb17f2a
7
+ data.tar.gz: b7ba1eebbfc008ddd8b4a356249c7a78cf9547e2cc8fcf19e2c92e341919c17174b3952bab40f0a5cb5d3385e87d8e639e265843f5f09f0ec25c9f45c894777e
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ before_install: gem install bundler -v 1.11.2
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing others' private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer via https://github.com/k-kawa/ranunculus-ruby. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ranunculus.gemspec
4
+ gemspec
@@ -0,0 +1,49 @@
1
+ # Ranunculus
2
+
3
+ Another web crawler that runs on Amazon SQS and ElastiCache (Redis).
4
+ You also need to run a worker, which is available on GitHub.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'ranunculus'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install ranunculus
21
+
22
+ ## Usage
23
+
24
+ require 'aws-sdk'
25
+ require 'redis'
26
+ require 'ranunculus'
27
+
28
+ sqs = Aws::SQS::Client.new(
29
+ access_key_id: ENV['ACCESS_KEY_ID'],
30
+ secret_access_key: ENV['SECRET_ACCESS_KEY'],
31
+ region: 'ap-northeast-1'
32
+ )
33
+ redis = Redis.new(:host => "localhost", :port => 6379, :db => 0)
34
+
35
+ URL_QUEUE_URL = "https://sqs.ap-northeast-1.amazonaws.com/12345/UrlQueue"
36
+ RESULT_QUEUE_URL = "https://sqs.ap-northeast-1.amazonaws.com/12345/ResultQueue"
37
+
38
+ c = Ranunculus::Crawler.new(sqs, redis, URL_QUEUE_URL, RESULT_QUEUE_URL)
39
+
40
+ page = Ranunculus::Page.new("http://www.yahoo.com")
41
+ c.on_every_page do |page|
42
+ puts page.url
43
+ end
44
+ c.start(page, 1, "Ranunculus")
45
+
46
+ ## Contributing
47
+
48
+ Bug reports and pull requests are welcome on GitHub at https://github.com/k-kawa/ranunculus-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
49
+
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "ranunculus"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,5 @@
1
+ require 'aws-sdk-core'
2
+
3
+ require "ranunculus/version.rb"
4
+ require "ranunculus/page"
5
+ require "ranunculus/crawler"
@@ -0,0 +1,104 @@
1
require 'json'

module Ranunculus
  # Coordinates a crawl over two SQS queues and a Redis store.
  #
  # The crawler pushes page jobs (JSON) onto the job queue, then polls the
  # result queue. For every result flagged with "HasBody" it reads the page
  # body from Redis (keyed by the page URL), extracts links and pushes the
  # ones that are neither skipped nor already pushed, until every pushed URL
  # has produced a result message.
  #
  # NOTE(review): the component that consumes the job queue and fills the
  # result queue / Redis is not part of this file (the README calls it a
  # "worker"); its exact message contract should be confirmed there.
  class Crawler
    # @return [String, nil] User-Agent passed to #start; nil before #start.
    attr_reader :user_agent

    # @param sqs [#send_message, #receive_message, #delete_message] SQS client
    # @param redis [#get] store holding page bodies keyed by URL
    # @param job_queue_url [String] queue that receives pages to fetch
    # @param result_queue_url [String] queue that delivers fetch results
    def initialize(sqs, redis, job_queue_url, result_queue_url)
      @sqs = sqs
      @redis = redis
      @job_queue_url = job_queue_url
      @result_queue_url = result_queue_url

      @on_every_page_blocks = []

      @result_count = 0
      @result_pages = {}
      @pushed_urls = {}
      @skip_patterns = []
    end

    # Replaces the list of patterns whose matching URLs are never enqueued.
    #
    # @param patterns [Array<Regexp, String>, Regexp, String, nil]
    #   a single pattern is wrapped into a list; nil clears the list
    # @return [Array] the stored pattern list
    def skip_if(patterns)
      @skip_patterns = Array(patterns)
    end

    # @param url [String, URI] link to test
    # @return [Boolean] true when the URL matches a skip pattern or has
    #   already been pushed to the job queue
    def skip?(url)
      # Links arrive as URI objects from Page#fetch_urls; URI has no #match,
      # so compare on the string form.
      link = url.to_s
      @skip_patterns.any? { |pattern| !link.match(pattern).nil? } || visit_link?(link)
    end

    # Enqueues +page+ and blocks until every pushed URL has produced a result.
    #
    # This loops for as long as results are outstanding, so it does not
    # return if a pushed job never yields a result message.
    #
    # @param page [Ranunculus::Page] seed page
    # @param max_depth [Integer] links are only followed from pages whose
    #   depth is below this value
    # @param user_agent [String] exposed to pages via #user_agent
    # @return [nil]
    def start(page, max_depth, user_agent)
      @max_depth = max_depth
      @user_agent = user_agent

      enqueue(page)
      work while @result_count < @pushed_urls.size
    end

    # Handles at most one message from the result queue: counts it, loads the
    # body from Redis when the result says one exists, deletes the message,
    # then (for non-empty bodies) enqueues new links and runs the callbacks.
    #
    # @return [nil]
    def work
      response = @sqs.receive_message(
        queue_url: @result_queue_url,
        wait_time_seconds: 20,
        visibility_timeout: 10,
        max_number_of_messages: 1
      )
      message = response.messages.first
      return if message.nil?

      @result_count += 1

      result = JSON.parse(message.body)
      body = result['HasBody'] ? @redis.get(result['Url']) : nil

      @sqs.delete_message(
        queue_url: @result_queue_url,
        receipt_handle: message.receipt_handle
      )

      return if body.nil? || body.empty?

      page = Ranunculus::Page.create_from_queue(result, body)
      @result_pages[page.url] = page

      enqueue_links(page) if page.depth < @max_depth

      do_page_blocks(page)
      page.discard_doc!
    end

    # Registers a callback invoked with each page that came back with a body.
    #
    # @yieldparam page [Ranunculus::Page]
    # @return [self] to allow chaining
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    # @param url [String, URI]
    # @return [Boolean] true when the URL has already been pushed
    def visit_link?(url)
      @pushed_urls.key?(url.to_s)
    end

    # Runs every callback registered through #on_every_page.
    #
    # @param page [Ranunculus::Page]
    def do_page_blocks(page)
      @on_every_page_blocks.each { |callback| callback.call(page) }
    end
    # The method was originally defined under this name while #work called
    # the plural form; keep both so existing callers continue to work.
    alias_method :do_page_block, :do_page_blocks

    private

    # Records the page URL as pushed and sends the page to the job queue.
    def enqueue(page)
      # Keys are always strings so that a seed given as a String and a link
      # discovered as a URI count as the same URL.
      @pushed_urls[page.url.to_s] = 1
      @sqs.send_message(
        queue_url: @job_queue_url,
        message_body: page.to_sqs_hash(self).to_json
      )
    end

    # Enqueues, one level deeper, each link of +page+ that is not skipped.
    # skip? is evaluated per link, after earlier links were recorded, so a
    # URL repeated within one page is pushed once. The page's own link list
    # is left untouched.
    def enqueue_links(page)
      page.fetch_urls.each do |url|
        next if skip?(url)

        enqueue(Ranunculus::Page.new(url.to_s, page.depth + 1))
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
require 'uri'

module Ranunculus
  # A page in the crawl: its URL, its depth in the link graph and, once
  # fetched, its body. Knows how to serialise itself into a job message and
  # how to extract absolute links from its body.
  class Page
    # @return [String] url as given to the constructor
    # @return [Integer] depth (0 for the seed page by default)
    # @return [String, nil] body, nil when the page has not been fetched
    attr_reader :url, :depth, :body

    # Builds a page from a parsed result-queue message.
    #
    # @param sqs_result [Hash] parsed message; reads the "Url" and "Depth" keys
    # @param body [String, nil] page body
    # @return [Ranunculus::Page]
    def self.create_from_queue(sqs_result, body)
      new(sqs_result['Url'], sqs_result['Depth'], body)
    end

    # @param url [String]
    # @param depth [Integer]
    # @param body [String, nil]
    def initialize(url, depth = 0, body = nil)
      @url = url
      @depth = depth
      @body = body

      @links = nil
    end

    # Starts a crawl seeded with this page.
    #
    # The defaults mirror the values used in the README usage example.
    #
    # @param crawler [#start] object that runs the crawl
    # @param max_depth [Integer] forwarded to crawler.start
    # @param user_agent [String] forwarded to crawler.start
    def crawl(crawler, max_depth = 1, user_agent = 'Ranunculus')
      crawler.start(self, max_depth, user_agent)
    end

    # Hash form of the job message for this page (string keys throughout).
    #
    # @param crawler [#user_agent] supplies the User-Agent header value
    # @return [Hash]
    def to_sqs_hash(crawler)
      {
        'Url' => @url,
        'Depth' => @depth,
        'Headers' => { 'User-Agent' => crawler.user_agent }
      }
    end

    # Parsed HTML document for the body, memoized until #discard_doc!.
    #
    # @return [Nokogiri::HTML::Document, nil] nil when there is no body
    def doc
      return nil if @body.nil?

      @doc ||= begin
        # Loaded on first use: only pages that are actually parsed need it.
        require 'nokogiri' unless defined?(::Nokogiri)
        Nokogiri::HTML(@body)
      end
    end

    # Drops the memoized document so it can be garbage collected.
    def discard_doc!
      @doc = nil
    end

    # Absolute, de-duplicated targets of every <a href> in the body.
    # The result is memoized; a page without a body yields an empty list.
    #
    # @return [Array<URI>]
    def fetch_urls
      return @links unless @links.nil?

      document = doc
      return @links = [] if document.nil?

      hrefs = document.search("//a[@href]").map { |anchor| anchor['href'] }
      @links = hrefs.reject { |href| href.nil? || href.strip.empty? }
                    .map { |href| to_absolute(href) }
                    .compact
                    .uniq(&:to_s)
    end

    # Resolves +link+ against this page's URL.
    #
    # @param link [String, nil] href value, absolute or relative
    # @return [URI, nil] the resolved URI without fragment and with "/" as
    #   path when the path is empty; nil for nil input, mailto: links,
    #   non-hierarchical links (e.g. tel:) and anything URI cannot parse
    def to_absolute(link)
      return nil if link.nil?

      href = link.to_s.strip
      # Only real mailto: links are dropped; "mail.html" is an ordinary page.
      return nil if href =~ /\Amailto:/i

      resolved = URI(@url).merge(URI.parse(href))
      # Opaque URIs carry no path, so there is nothing to normalise or fetch.
      return nil if resolved.opaque

      # The fragment never identifies a different document.
      resolved.fragment = nil
      resolved.path = '/' if resolved.path.nil? || resolved.path.empty?
      resolved
    rescue URI::Error
      # URI::Error is the common ancestor of InvalidURIError,
      # InvalidComponentError and BadURIError.
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Ranunculus
2
+ VERSION = "0.1.1"
3
+ end
@@ -0,0 +1,40 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'ranunculus/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "ranunculus"
7
+ spec.version = Ranunculus::VERSION
8
+ spec.authors = ["Kohei KAWASAKI"]
9
+ spec.email = []
10
+
11
+ spec.summary = %q{Another Web crawler running with Amazon SQS and ElastiCache(Redis)}
12
+ spec.description = %q{}
13
+ spec.homepage = ""
14
+
15
+ # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
16
+ # delete this section to allow pushing this gem to any host.
17
+ if spec.respond_to?(:metadata)
18
+ # spec.metadata['allowed_push_host'] = "http://localhost.com"
19
+ else
20
+ raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
21
+ end
22
+
23
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
24
+ spec.bindir = "exe"
25
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
26
+ spec.require_paths = ["lib"]
27
+
28
+ spec.add_development_dependency "bundler", "~> 1.11"
29
+ spec.add_development_dependency "rake", "~> 10.0"
30
+ spec.add_development_dependency "rspec", "~> 3.0"
31
+ spec.add_development_dependency "byebug"
32
+ spec.add_development_dependency "pry-doc"
33
+ spec.add_development_dependency "pry-byebug"
34
+ spec.add_development_dependency "pry-rails"
35
+ spec.add_development_dependency "pry-stack_explorer"
36
+
37
+ spec.add_dependency "nokogiri", ">= 1.3.0"
38
+ spec.add_dependency "aws-sdk", "~> 2"
39
+ spec.add_dependency "redis", "~>3.2"
40
+ end
metadata ADDED
@@ -0,0 +1,211 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ranunculus
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Kohei KAWASAKI
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-04-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.11'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.11'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-doc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry-byebug
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry-rails
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry-stack_explorer
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: 1.3.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: 1.3.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: aws-sdk
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2'
153
+ - !ruby/object:Gem::Dependency
154
+ name: redis
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.2'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.2'
167
+ description: ''
168
+ email: []
169
+ executables: []
170
+ extensions: []
171
+ extra_rdoc_files: []
172
+ files:
173
+ - ".gitignore"
174
+ - ".rspec"
175
+ - ".travis.yml"
176
+ - CODE_OF_CONDUCT.md
177
+ - Gemfile
178
+ - README.md
179
+ - Rakefile
180
+ - bin/console
181
+ - bin/setup
182
+ - lib/ranunculus.rb
183
+ - lib/ranunculus/crawler.rb
184
+ - lib/ranunculus/page.rb
185
+ - lib/ranunculus/version.rb
186
+ - ranunculus.gemspec
187
+ homepage: ''
188
+ licenses: []
189
+ metadata: {}
190
+ post_install_message:
191
+ rdoc_options: []
192
+ require_paths:
193
+ - lib
194
+ required_ruby_version: !ruby/object:Gem::Requirement
195
+ requirements:
196
+ - - ">="
197
+ - !ruby/object:Gem::Version
198
+ version: '0'
199
+ required_rubygems_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - ">="
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ requirements: []
205
+ rubyforge_project:
206
+ rubygems_version: 2.4.5.1
207
+ signing_key:
208
+ specification_version: 4
209
+ summary: Another Web crawler running with Amazon SQS and ElastiCache(Redis)
210
+ test_files: []
211
+ has_rdoc: