cosmicrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1c51e03fd72a7db8bad8d0addc1195c5fd96a021
4
+ data.tar.gz: 2aff46f71f44585ef2bfb00a8c88dd2f6be0009d
5
+ SHA512:
6
+ metadata.gz: 84f47ea22ed2c920e3f480485d6fbe36377d579131d7706f168c2f888155aab02be99bd0bb38d981dcb1d15934fbed0bb621c024619904c108602e77906ac1f9
7
+ data.tar.gz: 5f522e9dc55e3f335e5c84e26f9760f199cdb0329fc56f264abf732a191eb92a2e0ccd22e13057aa5bbabf0ffd51d4dada32f9426fdc8b10ebb53d7116f10f2f
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ /nbproject
@@ -0,0 +1,23 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - jruby-18mode
4
+ - jruby-19mode
5
+ - rbx-18mode
6
+ - rbx-19mode
7
+ - ruby-head
8
+ - jruby-head
9
+ - 1.8.7
10
+ - ree
11
+ jdk:
12
+ - openjdk7
13
+ - oraclejdk7
14
+ - openjdk6
15
+ env:
16
+ - ISOLATED=true
17
+ - ISOLATED=false
18
+ matrix:
19
+ exclude:
20
+ - rvm: 1.9.3
21
+ jdk: openjdk7
22
+ - rvm: 1.9.3
23
+ jdk: oraclejdk7
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in cosmicrawler.gemspec
4
+ gemspec
5
+
6
+ group :development do
7
+ gem "bundler"
8
+ gem "rspec"
9
+ gem 'webmock'
10
+ gem 'pry'
11
+ end
12
+
13
+ gem 'eventmachine'
14
+ gem 'em-http-request'
15
+ gem 'em-synchrony'
16
+
@@ -0,0 +1,57 @@
1
+
2
+ # Cosmicrawler
3
+
4
+ Cosmicrawler is a crawler library for Ruby. It provides scalable asynchronous crawling over HTTP, files, etc. using [EventMachine](https://github.com/eventmachine/eventmachine).
5
+
6
+ [![Build Status](https://travis-ci.org/bash0C7/cosmicrawler.png)](https://travis-ci.org/bash0C7/cosmicrawler)
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'cosmicrawler'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install cosmicrawler
21
+
22
+ ## Usage
23
+
24
+ ### http
25
+
26
+ ````ruby
27
+ require 'cosmicrawler'
28
+
29
+ Cosmicrawler.http_crawl(%w(http://example.com/1 http://example.com/2)) {|request|
30
+ get = request.get
31
+ puts get.response if get.response_header.status == 200
32
+ }
33
+ ````
34
+
35
+ ````ruby
36
+ require 'cosmicrawler'
37
+ require 'em-http-request'
38
+
39
+ Cosmicrawler.each(%w(http://example.com/1 http://example.com/2)) {|item|
40
+ request = EM::HttpRequest.new(item)
41
+ get = request.get
42
+ puts get.response if get.response_header.status == 200
43
+ }
44
+
45
+ ````
46
+
47
+ ## Contributing
48
+
49
+ 1. Fork it
50
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
51
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
52
+ 4. Push to the branch (`git push origin my-new-feature`)
53
+ 5. Create a new Pull Request
54
+
55
+ ## License
56
+
57
+ Ruby's License
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'cosmicrawler/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "cosmicrawler"
8
+ spec.version = Cosmicrawler::VERSION
9
+ spec.authors = ["Toshiaki Koshiba"]
10
+ spec.email = ["koshiba+github@4038nullpointer.com"]
11
+ spec.description = 'Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous crawling by (http|file|etc) using EventMachine.'
12
+ spec.summary = 'Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous crawling by (http|file|etc) using EventMachine.'
13
+ spec.homepage = "https://github.com/bash0C7/cosmicrawler"
14
+ spec.license = "Ruby"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+
24
+ spec.add_runtime_dependency("eventmachine")
25
+ spec.add_runtime_dependency("em-http-request")
26
+ spec.add_runtime_dependency("em-synchrony")
27
+ end
@@ -0,0 +1,19 @@
1
+ module Cosmicrawler
2
+ %w(version em).each do |lib|
3
+ require_relative 'cosmicrawler/' + lib
4
+ end
5
+
6
+ DEFAULT_CONCURRENCY = 8
7
+
8
+ class << self
9
+ def each collection, concurrency = DEFAULT_CONCURRENCY, &block
10
+ c = Em::Crawler.new concurrency
11
+ c.each collection, &block
12
+ end
13
+
14
+ def http_crawl urls, concurrency = DEFAULT_CONCURRENCY, &block
15
+ c = Em::HttpCrawler.new concurrency
16
+ c.crawl urls, &block
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,27 @@
1
+ require'eventmachine'
2
+ require'em-synchrony'
3
+ require "em-synchrony/fiber_iterator"
4
+
5
+ require 'ostruct'
6
+
7
+ require_relative 'em/crawler.rb'
8
+ require_relative 'em/http_crawler.rb'
9
+
10
+ module Cosmicrawler
11
+ module Em
12
+ def initialize concurrency
13
+ @concurrency = concurrency
14
+ end
15
+
16
+ private
17
+ def iterate collection, &block
18
+ EventMachine.synchrony do
19
+ EM::Synchrony::FiberIterator.new(collection, @concurrency).each do |item|
20
+ block.call item
21
+ end
22
+ EM.stop_event_loop
23
+ end #EventMachine.synchrony
24
+ self
25
+ end #iterate
26
+ end
27
+ end
@@ -0,0 +1,17 @@
1
+ module Cosmicrawler
2
+ module Em
3
+ class Crawler
4
+ include Em
5
+
6
+ def each collection, &block
7
+ raise unless block_given?
8
+
9
+ iterate(collection) do |item|
10
+ block.call item
11
+ end
12
+
13
+ self
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,20 @@
1
+ require'em-http-request'
2
+ require'em-synchrony/em-http'
3
+
4
+ module Cosmicrawler
5
+ module Em
6
+ class HttpCrawler
7
+ include Em
8
+
9
+ def crawl urls, &block
10
+ raise unless block_given?
11
+
12
+ iterate(urls) do |url|
13
+ request = ::EM::HttpRequest.new(url)
14
+ block.call request
15
+ end
16
+
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,3 @@
1
+ module Cosmicrawler
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,94 @@
1
+ require 'spec_helper'
2
+ require 'cosmicrawler'
3
+
4
+ describe Cosmicrawler do
5
+ before :each do
6
+ WebMock.reset!
7
+ stub_request(:get, /.+/).to_return(:status => 200, :body => "", :headers => {})
8
+ end
9
+
10
+ describe :each do
11
+ it 'block is required' do
12
+ lambda {Cosmicrawler.each(%w())}.should raise_error
13
+ end
14
+
15
+ context %w() do
16
+ it 'block is not called' do
17
+ Cosmicrawler.each(%w()) {|item|
18
+ request = EM::HttpRequest.new('http://example.com/')
19
+ request.get
20
+ }
21
+ WebMock.should_not have_requested(:get, "http://example.com/")
22
+ end
23
+ end
24
+
25
+ context %w(http://example.com/1) do
26
+ it 'block is called once' do
27
+ Cosmicrawler.each(%w(http://example.com/1)) {|item|
28
+ request = EM::HttpRequest.new(item)
29
+ request.get
30
+
31
+ }
32
+ WebMock.should have_requested(:get, "http://example.com/1")
33
+ end
34
+ end
35
+
36
+ context %w(http://example.com/1 http://example.com/2) do
37
+ it 'block is called twice' do
38
+ Cosmicrawler.each(%w(http://example.com/1 http://example.com/2)) {|item|
39
+ request = EM::HttpRequest.new(item)
40
+ request.get
41
+ }
42
+ WebMock.should have_requested(:get, "http://example.com/1")
43
+ WebMock.should have_requested(:get, "http://example.com/2")
44
+ end
45
+ end
46
+ end
47
+
48
+ describe :http_crawl do
49
+ it 'block is required' do
50
+ lambda {Cosmicrawler.http_crawl(%w())}.should raise_error
51
+ end
52
+
53
+ context %w() do
54
+ it 'block is not called' do
55
+ Cosmicrawler.http_crawl(%w()) {|request|
56
+ EM::HttpRequest.new('http://example.com/').get
57
+ }
58
+ WebMock.should_not have_requested(:get, "http://example.com/")
59
+ end
60
+ end
61
+
62
+ context %w(http://example.com/1) do
63
+ it 'block is called once' do
64
+ Cosmicrawler.http_crawl(%w(http://example.com/1)) {|request|
65
+ request.get
66
+
67
+ }
68
+ WebMock.should have_requested(:get, "http://example.com/1")
69
+ end
70
+ end
71
+
72
+ context %w(http://example.com/1 http://example.com/2) do
73
+ it 'block is called twice' do
74
+ Cosmicrawler.http_crawl(%w(http://example.com/1 http://example.com/2)) {|request|
75
+ request.get
76
+ }
77
+ WebMock.should have_requested(:get, "http://example.com/1")
78
+ WebMock.should have_requested(:get, "http://example.com/2")
79
+ end
80
+ end
81
+
82
+ context 'URI' do
83
+ it do
84
+ urls = %w(http://example.com/1 http://example.com/2).map {|str| URI.parse str}
85
+
86
+ Cosmicrawler.http_crawl(urls) {|request|
87
+ request.get
88
+ }
89
+ WebMock.should have_requested(:get, "http://example.com/1")
90
+ WebMock.should have_requested(:get, "http://example.com/2")
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,3 @@
1
+ require 'rubygems'
2
+ require 'pry'
3
+ require 'webmock/rspec'
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cosmicrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Toshiaki Koshiba
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: eventmachine
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: em-http-request
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: em-synchrony
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
84
+ crawling by (http|file|etc) using EventMachine.
85
+ email:
86
+ - koshiba+github@4038nullpointer.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - .travis.yml
93
+ - Gemfile
94
+ - README.md
95
+ - Rakefile
96
+ - cosmicrawler.gemspec
97
+ - lib/cosmicrawler.rb
98
+ - lib/cosmicrawler/em.rb
99
+ - lib/cosmicrawler/em/crawler.rb
100
+ - lib/cosmicrawler/em/http_crawler.rb
101
+ - lib/cosmicrawler/version.rb
102
+ - spec/cosmicrawler_spec.rb
103
+ - spec/spec_helper.rb
104
+ homepage: https://github.com/bash0C7/cosmicrawler
105
+ licenses:
106
+ - Ruby
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.0.0
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
128
+ crawling by (http|file|etc) using EventMachine.
129
+ test_files:
130
+ - spec/cosmicrawler_spec.rb
131
+ - spec/spec_helper.rb