cosmicrawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1c51e03fd72a7db8bad8d0addc1195c5fd96a021
4
+ data.tar.gz: 2aff46f71f44585ef2bfb00a8c88dd2f6be0009d
5
+ SHA512:
6
+ metadata.gz: 84f47ea22ed2c920e3f480485d6fbe36377d579131d7706f168c2f888155aab02be99bd0bb38d981dcb1d15934fbed0bb621c024619904c108602e77906ac1f9
7
+ data.tar.gz: 5f522e9dc55e3f335e5c84e26f9760f199cdb0329fc56f264abf732a191eb92a2e0ccd22e13057aa5bbabf0ffd51d4dada32f9426fdc8b10ebb53d7116f10f2f
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ /nbproject
@@ -0,0 +1,23 @@
1
+ rvm:
2
+ - 1.9.3
3
+ - jruby-18mode
4
+ - jruby-19mode
5
+ - rbx-18mode
6
+ - rbx-19mode
7
+ - ruby-head
8
+ - jruby-head
9
+ - 1.8.7
10
+ - ree
11
+ jdk:
12
+ - openjdk7
13
+ - oraclejdk7
14
+ - openjdk6
15
+ env:
16
+ - ISOLATED=true
17
+ - ISOLATED=false
18
+ matrix:
19
+ exclude:
20
+ - rvm: 1.9.3
21
+ jdk: openjdk7
22
+ - rvm: 1.9.3
23
+ jdk: oraclejdk7
data/Gemfile ADDED
@@ -0,0 +1,16 @@
source 'https://rubygems.org'

# Pull in the dependencies declared in cosmicrawler.gemspec.
gemspec

# Gems used only while developing and testing.
group :development do
  %w[bundler rspec webmock pry].each { |name| gem name }
end

# EventMachine-related gems, listed here in addition to the gemspec.
%w[eventmachine em-http-request em-synchrony].each { |name| gem name }
@@ -0,0 +1,57 @@
1
+
2
+ # Cosmicrawler
3
+
4
+ Cosmicrawler is a crawler library for Ruby. It provides scalable, asynchronous crawling over HTTP, files, etc. using [EventMachine](https://github.com/eventmachine/eventmachine).
5
+
6
+ [![Build Status](https://travis-ci.org/bash0C7/cosmicrawler.png)](https://travis-ci.org/bash0C7/cosmicrawler)
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'cosmicrawler'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install cosmicrawler
21
+
22
+ ## Usage
23
+
24
+ ### http
25
+
26
+ ````ruby
27
+ require 'cosmicrawler'
28
+
29
+ Cosmicrawler.http_crawl(%w(http://example.com/1 http://example.com/2)) {|request|
30
+ get = request.get
31
+ puts get.response if get.response_header.status == 200
32
+ }
33
+ ````
34
+
35
+ ````ruby
36
+ require 'cosmicrawler'
37
+ require 'em-http-request'
38
+
39
+ Cosmicrawler.each(%w(http://example.com/1 http://example.com/2)) {|item|
40
+ request = EM::HttpRequest.new(item)
41
+ get = request.get
42
+ puts get.response if get.response_header.status == 200
43
+ }
44
+
45
+ ````
46
+
47
+ ## Contributing
48
+
49
+ 1. Fork it
50
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
51
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
52
+ 4. Push to the branch (`git push origin my-new-feature`)
53
+ 5. Create new Pull Request
54
+
55
+ ## License
56
+
57
+ Ruby's License
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core'
4
+ require 'rspec/core/rake_task'
5
+
# Define the `spec` Rake task through RSpec's task class.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments runs the `spec` task.
task :default => [:spec]
@@ -0,0 +1,27 @@
# coding: utf-8
# Make lib/ loadable so the version constant can be read below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'cosmicrawler/version'

Gem::Specification.new do |spec|
  spec.name          = 'cosmicrawler'
  spec.version       = Cosmicrawler::VERSION
  spec.authors       = ['Toshiaki Koshiba']
  spec.email         = ['koshiba+github@4038nullpointer.com']
  # Summary is a short one-liner; description carries the longer text.
  # Keeping them distinct avoids RubyGems' "description and summary are
  # identical" build warning.
  spec.summary       = 'Asynchronous crawler library for Ruby built on EventMachine.'
  spec.description   = 'Cosmicrawler is a crawler library for Ruby. It provides scalable asynchronous crawling by (http|file|etc) using EventMachine.'
  spec.homepage      = 'https://github.com/bash0C7/cosmicrawler'
  spec.license       = 'Ruby'

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'

  spec.add_runtime_dependency 'eventmachine'
  spec.add_runtime_dependency 'em-http-request'
  spec.add_runtime_dependency 'em-synchrony'
end
@@ -0,0 +1,19 @@
# Top-level namespace exposing convenience entry points for the crawlers.
module Cosmicrawler
  require_relative 'cosmicrawler/version'
  require_relative 'cosmicrawler/em'

  # Concurrency used when the caller does not supply one.
  DEFAULT_CONCURRENCY = 8

  # Builds an Em::Crawler and runs +block+ over +collection+ with it.
  #
  # @param collection items forwarded to Em::Crawler#each
  # @param concurrency value passed to Em::Crawler.new
  # @return the result of Em::Crawler#each
  def self.each(collection, concurrency = DEFAULT_CONCURRENCY, &block)
    Em::Crawler.new(concurrency).each(collection, &block)
  end

  # Builds an Em::HttpCrawler and runs +block+ over +urls+ with it.
  #
  # @param urls URLs forwarded to Em::HttpCrawler#crawl
  # @param concurrency value passed to Em::HttpCrawler.new
  # @return the result of Em::HttpCrawler#crawl
  def self.http_crawl(urls, concurrency = DEFAULT_CONCURRENCY, &block)
    Em::HttpCrawler.new(concurrency).crawl(urls, &block)
  end
end
@@ -0,0 +1,27 @@
1
+ require'eventmachine'
2
+ require'em-synchrony'
3
+ require "em-synchrony/fiber_iterator"
4
+
5
+ require 'ostruct'
6
+
7
+ require_relative 'em/crawler.rb'
8
+ require_relative 'em/http_crawler.rb'
9
+
module Cosmicrawler
  # Mixin for the crawler classes: stores the concurrency setting and
  # provides the private #iterate helper.
  module Em
    # @param concurrency value later passed as the second argument to
    #   EM::Synchrony::FiberIterator.new
    def initialize(concurrency)
      @concurrency = concurrency
    end

    private

    # Runs inside EventMachine.synchrony: calls +block+ with every item of
    # +collection+ through a FiberIterator, then stops the event loop.
    #
    # @param collection items handed to the FiberIterator
    # @return [self]
    def iterate(collection, &block)
      EventMachine.synchrony do
        fibers = EM::Synchrony::FiberIterator.new(collection, @concurrency)
        fibers.each { |item| block.call(item) }
        EM.stop_event_loop
      end

      self
    end
  end
end
@@ -0,0 +1,17 @@
module Cosmicrawler
  module Em
    # Crawler that hands each item of a collection to a caller-supplied block.
    class Crawler
      include Em

      # Calls +block+ once per item of +collection+ via Em#iterate.
      #
      # @param collection items passed one at a time to the block
      # @yield [item] one element of +collection+
      # @return [self]
      # @raise [RuntimeError] when no block is given
      def each(collection, &block)
        # Still a RuntimeError, as with the previous bare `raise`, but now
        # with a message that explains the failure.
        raise 'a block is required' unless block_given?

        iterate(collection, &block)
        self
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require'em-http-request'
2
+ require'em-synchrony/em-http'
3
+
module Cosmicrawler
  module Em
    # Crawler that wraps each URL in an EM::HttpRequest and hands it to a
    # caller-supplied block.
    class HttpCrawler
      include Em

      # Builds an EM::HttpRequest per URL and calls +block+ with it via
      # Em#iterate.
      #
      # @param urls values passed one at a time to EM::HttpRequest.new
      # @yield [request] the EM::HttpRequest built for one URL
      # @return the result of Em#iterate
      # @raise [RuntimeError] when no block is given
      def crawl(urls, &block)
        # Still a RuntimeError, as with the previous bare `raise`, but now
        # with a message that explains the failure.
        raise 'a block is required' unless block_given?

        iterate(urls) do |url|
          block.call(::EM::HttpRequest.new(url))
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Cosmicrawler
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,94 @@
1
+ require 'spec_helper'
2
+ require 'cosmicrawler'
3
+
# Specs for the module-level entry points Cosmicrawler.each and
# Cosmicrawler.http_crawl. Every GET is stubbed by WebMock to return 200.
describe Cosmicrawler do
  before :each do
    WebMock.reset!
    stub_request(:get, /.+/).to_return(:status => 200, :body => "", :headers => {})
  end

  describe :each do
    it 'block is required' do
      lambda { Cosmicrawler.each(%w()) }.should raise_error
    end

    context %w() do
      it 'block is not called' do
        # With an empty collection the block must never run, so no request is made.
        Cosmicrawler.each(%w()) do |_item|
          EM::HttpRequest.new('http://example.com/').get
        end

        WebMock.should_not have_requested(:get, "http://example.com/")
      end
    end

    context %w(http://example.com/1) do
      it 'block is called once' do
        Cosmicrawler.each(%w(http://example.com/1)) do |item|
          EM::HttpRequest.new(item).get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
      end
    end

    context %w(http://example.com/1 http://example.com/2) do
      it 'block is called twice' do
        Cosmicrawler.each(%w(http://example.com/1 http://example.com/2)) do |item|
          EM::HttpRequest.new(item).get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end
  end

  describe :http_crawl do
    it 'block is required' do
      lambda { Cosmicrawler.http_crawl(%w()) }.should raise_error
    end

    context %w() do
      it 'block is not called' do
        # With no URLs the block must never run, so no request is made.
        Cosmicrawler.http_crawl(%w()) do |_request|
          EM::HttpRequest.new('http://example.com/').get
        end

        WebMock.should_not have_requested(:get, "http://example.com/")
      end
    end

    context %w(http://example.com/1) do
      it 'block is called once' do
        Cosmicrawler.http_crawl(%w(http://example.com/1)) do |request|
          request.get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
      end
    end

    context %w(http://example.com/1 http://example.com/2) do
      it 'block is called twice' do
        Cosmicrawler.http_crawl(%w(http://example.com/1 http://example.com/2)) do |request|
          request.get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end

    context 'URI' do
      it do
        # URI objects are accepted in place of plain strings.
        urls = %w(http://example.com/1 http://example.com/2).map { |str| URI.parse(str) }

        Cosmicrawler.http_crawl(urls) do |request|
          request.get
        end

        WebMock.should have_requested(:get, "http://example.com/1")
        WebMock.should have_requested(:get, "http://example.com/2")
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'rubygems'
2
+ require 'pry'
3
+ require 'webmock/rspec'
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cosmicrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Toshiaki Koshiba
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: eventmachine
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: em-http-request
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: em-synchrony
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
84
+ crawling by (http|file|etc) using EventMachine.
85
+ email:
86
+ - koshiba+github@4038nullpointer.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - .travis.yml
93
+ - Gemfile
94
+ - README.md
95
+ - Rakefile
96
+ - cosmicrawler.gemspec
97
+ - lib/cosmicrawler.rb
98
+ - lib/cosmicrawler/em.rb
99
+ - lib/cosmicrawler/em/crawler.rb
100
+ - lib/cosmicrawler/em/http_crawler.rb
101
+ - lib/cosmicrawler/version.rb
102
+ - spec/cosmicrawler_spec.rb
103
+ - spec/spec_helper.rb
104
+ homepage: https://github.com/bash0C7/cosmicrawler
105
+ licenses:
106
+ - Ruby
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.0.0
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: Cosmicrawler is crawler library for Ruby. It provides scalable asynchronous
128
+ crawling by (http|file|etc) using EventMachine.
129
+ test_files:
130
+ - spec/cosmicrawler_spec.rb
131
+ - spec/spec_helper.rb