cosmicrawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +23 -0
- data/Gemfile +16 -0
- data/README.md +57 -0
- data/Rakefile +8 -0
- data/cosmicrawler.gemspec +27 -0
- data/lib/cosmicrawler.rb +19 -0
- data/lib/cosmicrawler/em.rb +27 -0
- data/lib/cosmicrawler/em/crawler.rb +17 -0
- data/lib/cosmicrawler/em/http_crawler.rb +20 -0
- data/lib/cosmicrawler/version.rb +3 -0
- data/spec/cosmicrawler_spec.rb +94 -0
- data/spec/spec_helper.rb +3 -0
- metadata +131 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1c51e03fd72a7db8bad8d0addc1195c5fd96a021
|
4
|
+
data.tar.gz: 2aff46f71f44585ef2bfb00a8c88dd2f6be0009d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 84f47ea22ed2c920e3f480485d6fbe36377d579131d7706f168c2f888155aab02be99bd0bb38d981dcb1d15934fbed0bb621c024619904c108602e77906ac1f9
|
7
|
+
data.tar.gz: 5f522e9dc55e3f335e5c84e26f9760f199cdb0329fc56f264abf732a191eb92a2e0ccd22e13057aa5bbabf0ffd51d4dada32f9426fdc8b10ebb53d7116f10f2f
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
rvm:
|
2
|
+
- 1.9.3
|
3
|
+
- jruby-18mode
|
4
|
+
- jruby-19mode
|
5
|
+
- rbx-18mode
|
6
|
+
- rbx-19mode
|
7
|
+
- ruby-head
|
8
|
+
- jruby-head
|
9
|
+
- 1.8.7
|
10
|
+
- ree
|
11
|
+
jdk:
|
12
|
+
- openjdk7
|
13
|
+
- oraclejdk7
|
14
|
+
- openjdk6
|
15
|
+
env:
|
16
|
+
- ISOLATED=true
|
17
|
+
- ISOLATED=false
|
18
|
+
matrix:
|
19
|
+
exclude:
|
20
|
+
- rvm: 1.9.3
|
21
|
+
jdk: openjdk7
|
22
|
+
- rvm: 1.9.3
|
23
|
+
jdk: oraclejdk7
|
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Bundler manifest used when working on the cosmicrawler gem itself.
source 'https://rubygems.org'

# Pull in the dependencies declared in cosmicrawler.gemspec.
gemspec

# Tools needed only while developing and testing the gem.
group :development do
  %w[bundler rspec webmock pry].each { |tool| gem tool }
end

# The EventMachine stack; the gemspec lists the same gems as runtime dependencies.
%w[eventmachine em-http-request em-synchrony].each { |dependency| gem dependency }
|
16
|
+
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
|
2
|
+
# Cosmicrawler
|
3
|
+
|
4
|
+
Cosmicrawler is a crawler library for Ruby. It provides scalable asynchronous crawling over HTTP, files, etc. using [EventMachine](https://github.com/eventmachine/eventmachine).
|
5
|
+
|
6
|
+
[Build Status](https://travis-ci.org/bash0C7/cosmicrawler)
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
gem 'cosmicrawler'
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install cosmicrawler
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
### http
|
25
|
+
|
26
|
+
````ruby
|
27
|
+
require 'cosmicrawler'
|
28
|
+
|
29
|
+
Cosmicrawler.http_crawl(%w(http://example.com/1 http://example.com/2)) {|request|
|
30
|
+
get = request.get
|
31
|
+
puts get.response if get.response_header.status == 200
|
32
|
+
}
|
33
|
+
````
|
34
|
+
|
35
|
+
````ruby
|
36
|
+
require 'cosmicrawler'
|
37
|
+
require 'em-http-request'
|
38
|
+
|
39
|
+
Cosmicrawler.each(%w(http://example.com/1 http://example.com/2)) {|item|
|
40
|
+
request = EM::HttpRequest.new(item)
|
41
|
+
get = request.get
|
42
|
+
puts get.response if get.response_header.status == 200
|
43
|
+
}
|
44
|
+
|
45
|
+
````
|
46
|
+
|
47
|
+
## Contributing
|
48
|
+
|
49
|
+
1. Fork it
|
50
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
51
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
52
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
53
|
+
5. Create a new Pull Request
|
54
|
+
|
55
|
+
## License
|
56
|
+
|
57
|
+
Ruby's License
|
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for cosmicrawler.

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'cosmicrawler/version'

Gem::Specification.new do |s|
  # The same sentence is published as both the description and the summary.
  blurb = 'Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous crawling by (http|file|etc) using EventMachine.'

  s.name        = 'cosmicrawler'
  s.version     = Cosmicrawler::VERSION
  s.authors     = ['Toshiaki Koshiba']
  s.email       = ['koshiba+github@4038nullpointer.com']
  s.description = blurb
  s.summary     = blurb
  s.homepage    = 'https://github.com/bash0C7/cosmicrawler'
  s.license     = 'Ruby'

  # Package every file tracked by git; executables and test files are
  # picked out of that list by their leading directory.
  s.files         = `git ls-files`.split($/)
  s.executables   = s.files.grep(%r{^bin/}) { |path| File.basename(path) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ['lib']

  s.add_development_dependency 'bundler', '~> 1.3'
  s.add_development_dependency 'rake'

  # Runtime dependencies, declared without version constraints.
  %w[eventmachine em-http-request em-synchrony].each do |runtime_gem|
    s.add_runtime_dependency(runtime_gem)
  end
end
|
data/lib/cosmicrawler.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Top-level namespace and public entry points of the cosmicrawler gem.
module Cosmicrawler
  # Load the version file and the EventMachine-backed crawlers.
  require_relative 'cosmicrawler/version'
  require_relative 'cosmicrawler/em'

  # Concurrency handed to a crawler when the caller does not supply one.
  DEFAULT_CONCURRENCY = 8

  # Builds an Em::Crawler with +concurrency+ and forwards +collection+ and
  # the block to its #each, returning whatever that call returns.
  def self.each(collection, concurrency = DEFAULT_CONCURRENCY, &block)
    Em::Crawler.new(concurrency).each(collection, &block)
  end

  # Builds an Em::HttpCrawler with +concurrency+ and forwards +urls+ and
  # the block to its #crawl, returning whatever that call returns.
  def self.http_crawl(urls, concurrency = DEFAULT_CONCURRENCY, &block)
    Em::HttpCrawler.new(concurrency).crawl(urls, &block)
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require'eventmachine'
|
2
|
+
require'em-synchrony'
|
3
|
+
require "em-synchrony/fiber_iterator"
|
4
|
+
|
5
|
+
require 'ostruct'
|
6
|
+
|
7
|
+
require_relative 'em/crawler.rb'
|
8
|
+
require_relative 'em/http_crawler.rb'
|
9
|
+
|
10
|
+
module Cosmicrawler
  # Mixin for the crawler classes: stores the concurrency given to the
  # constructor and provides the private #iterate helper.
  module Em
    # concurrency - value passed on to EM::Synchrony::FiberIterator.
    def initialize(concurrency)
      @concurrency = concurrency
    end

    private

    # Starts an EventMachine.synchrony reactor, calls +block+ with each item
    # of +collection+ through a FiberIterator built with @concurrency, then
    # stops the event loop.
    #
    # Returns self.
    def iterate(collection, &block)
      EventMachine.synchrony do
        fibers = EM::Synchrony::FiberIterator.new(collection, @concurrency)
        fibers.each { |item| block.call(item) }
        # Reached once the iterator's #each has returned.
        EM.stop_event_loop
      end
      self
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require'em-http-request'
|
2
|
+
require'em-synchrony/em-http'
|
3
|
+
|
4
|
+
module Cosmicrawler
  module Em
    # Crawler that wraps every URL in an EM::HttpRequest before yielding it.
    class HttpCrawler
      include Em

      # Yields one ::EM::HttpRequest per entry of +urls+ to the block.
      #
      # urls  - collection whose entries are each passed to
      #         ::EM::HttpRequest.new.
      # block - required; called with the request built for each entry.
      #
      # Returns the result of #iterate.
      # Raises RuntimeError when no block is given.
      def crawl(urls, &block)
        # Same exception class as the former bare `raise`, but with a message
        # that says what went wrong instead of "unhandled exception".
        raise 'a block is required' unless block_given?

        iterate(urls) do |url|
          block.call(::EM::HttpRequest.new(url))
        end
      end
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'cosmicrawler'
|
3
|
+
|
4
|
+
# Exercises both public entry points against WebMock-stubbed HTTP.
describe Cosmicrawler do
  # Every example starts from a clean WebMock state in which any GET
  # is answered with an empty 200 response.
  before :each do
    WebMock.reset!
    stub_request(:get, /.+/).to_return(:status => 200, :body => "", :headers => {})
  end

  describe :each do
    it 'block is required' do
      lambda { Cosmicrawler.each([]) }.should raise_error
    end

    context [] do
      it 'block is not called' do
        Cosmicrawler.each([]) do |_item|
          EM::HttpRequest.new('http://example.com/').get
        end

        WebMock.should_not have_requested(:get, "http://example.com/")
      end
    end

    context ['http://example.com/1'] do
      it 'block is called once' do
        Cosmicrawler.each(['http://example.com/1']) do |item|
          EM::HttpRequest.new(item).get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
      end
    end

    context ['http://example.com/1', 'http://example.com/2'] do
      it 'block is called twice' do
        Cosmicrawler.each(['http://example.com/1', 'http://example.com/2']) do |item|
          EM::HttpRequest.new(item).get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end
  end

  describe :http_crawl do
    it 'block is required' do
      lambda { Cosmicrawler.http_crawl([]) }.should raise_error
    end

    context [] do
      it 'block is not called' do
        Cosmicrawler.http_crawl([]) do |_request|
          EM::HttpRequest.new('http://example.com/').get
        end

        WebMock.should_not have_requested(:get, "http://example.com/")
      end
    end

    context ['http://example.com/1'] do
      it 'block is called once' do
        Cosmicrawler.http_crawl(['http://example.com/1']) { |request| request.get }

        WebMock.should have_requested(:get, "http://example.com/1")
      end
    end

    context ['http://example.com/1', 'http://example.com/2'] do
      it 'block is called twice' do
        Cosmicrawler.http_crawl(['http://example.com/1', 'http://example.com/2']) { |request| request.get }

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end

    # The URLs may also be given as URI objects instead of strings.
    context 'URI' do
      it do
        urls = [URI.parse('http://example.com/1'), URI.parse('http://example.com/2')]

        Cosmicrawler.http_crawl(urls) { |request| request.get }

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cosmicrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Toshiaki Koshiba
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: eventmachine
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: em-http-request
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: em-synchrony
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
|
84
|
+
crawling by (http|file|etc) using EventMachine.
|
85
|
+
email:
|
86
|
+
- koshiba+github@4038nullpointer.com
|
87
|
+
executables: []
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- .gitignore
|
92
|
+
- .travis.yml
|
93
|
+
- Gemfile
|
94
|
+
- README.md
|
95
|
+
- Rakefile
|
96
|
+
- cosmicrawler.gemspec
|
97
|
+
- lib/cosmicrawler.rb
|
98
|
+
- lib/cosmicrawler/em.rb
|
99
|
+
- lib/cosmicrawler/em/crawler.rb
|
100
|
+
- lib/cosmicrawler/em/http_crawler.rb
|
101
|
+
- lib/cosmicrawler/version.rb
|
102
|
+
- spec/cosmicrawler_spec.rb
|
103
|
+
- spec/spec_helper.rb
|
104
|
+
homepage: https://github.com/bash0C7/cosmicrawler
|
105
|
+
licenses:
|
106
|
+
- Ruby
|
107
|
+
metadata: {}
|
108
|
+
post_install_message:
|
109
|
+
rdoc_options: []
|
110
|
+
require_paths:
|
111
|
+
- lib
|
112
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - '>='
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
+
requirements:
|
119
|
+
- - '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
122
|
+
requirements: []
|
123
|
+
rubyforge_project:
|
124
|
+
rubygems_version: 2.0.0
|
125
|
+
signing_key:
|
126
|
+
specification_version: 4
|
127
|
+
summary: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
|
128
|
+
crawling by (http|file|etc) using EventMachine.
|
129
|
+
test_files:
|
130
|
+
- spec/cosmicrawler_spec.rb
|
131
|
+
- spec/spec_helper.rb
|