em-http-fetcher 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +89 -0
- data/em-http-fetcher.gemspec +31 -0
- data/lib/em-http-fetcher.rb +1 -0
- data/lib/em/http-fetcher/fetcher.rb +163 -0
- metadata +82 -0
data/README.rdoc
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
= em-http-fetcher
|
2
|
+
|
3
|
+
HTTP fetch client based on Ruby EventMachine and EM-HTTP-Request
|
4
|
+
that has configurable concurrency regardless of EM's thread pool.
|
5
|
+
|
6
|
+
== Example
|
7
|
+
|
8
|
+
EM.run do
|
9
|
+
trap(:INT) { EM.stop }
|
10
|
+
fetcher = EM::HttpFetcher.new
|
11
|
+
fetcher.callback do |req| # req is HttpRequest instance
|
12
|
+
# This is the global callback block, called for every request
|
13
|
+
p "Fetch success! #{req.last_effective_url} (#{req.response.size} bytes)"
|
14
|
+
end
|
15
|
+
|
16
|
+
%w(
|
17
|
+
http://www.google.com/
|
18
|
+
http://heroku.com/
|
19
|
+
http://sourceforge.net/
|
20
|
+
http://github.com/
|
21
|
+
).each do |url|
|
22
|
+
fetcher.request url
|
23
|
+
end
|
24
|
+
|
25
|
+
req = fetcher.request 'http://www.ruby-lang.org/'
|
26
|
+
req.callback do
|
27
|
+
# This is an additional callback block for this request only.
|
28
|
+
# Global callback block will also be called.
|
29
|
+
puts "Hello Ruby!"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
== Install
|
34
|
+
|
35
|
+
After em-http-request *1.0.4* *or* *above* is released, you can install with gem.
|
36
|
+
Currently em-http-fetcher depends on em-http-request (> 1.0.3), which intentionally prevents installation until that release is available.
|
37
|
+
|
38
|
+
$ gem install em-http-fetcher
|
39
|
+
|
40
|
+
=== Workaround with bundler
|
41
|
+
|
42
|
+
gem install bundler
|
43
|
+
|
44
|
+
Then create a Gemfile that fetches the development version of em-http-request.
|
45
|
+
|
46
|
+
source "http://rubygems.org"
|
47
|
+
gem 'em-http-request', :git => 'git://github.com/igrigorik/em-http-request.git'
|
48
|
+
|
49
|
+
Then run bundle to install gems.
|
50
|
+
|
51
|
+
$ bundle
|
52
|
+
|
53
|
+
Finally run your script with "bundle exec".
|
54
|
+
|
55
|
+
$ bundle exec YOUR_SCRIPT
|
56
|
+
|
57
|
+
== Usage
|
58
|
+
|
59
|
+
=== Options for HttpFetcher.new
|
60
|
+
|
61
|
+
[:concurrency] Maximum number of concurrent requests overall (default: 40).
|
62
|
+
[:host_concurrency] Maximum number of concurrent requests per host (default: 2).
|
63
|
+
[:host_request_wait] Seconds to wait after each request before its per-host connection is reused (default: 0.2).
|
64
|
+
[(all other keys)] Passed through to HttpRequest.new
|
65
|
+
|
66
|
+
=== Options for HttpFetcher#request
|
67
|
+
|
68
|
+
[:uri] Target URI (String or URI object)
|
69
|
+
[:method] Request method (get/head/put...) (default=:get)
|
70
|
+
[(all other keys)] Passed through to HttpRequest#(get/head/put...)
|
71
|
+
|
72
|
+
If the first argument is not a hash, it is treated as :uri, and an optional second hash argument supplies the remaining options.
|
73
|
+
|
74
|
+
== Limitations
|
75
|
+
|
76
|
+
* :host_concurrency is checked only for the initial URI.
|
77
|
+
When request is redirected, number of parallel requests for
|
78
|
+
one host may be over host_concurrency.
|
79
|
+
* Redirections will not work until issue #230 of em-http-request
|
80
|
+
is resolved; https://github.com/igrigorik/em-http-request/pull/230
|
81
|
+
|
82
|
+
== License
|
83
|
+
|
84
|
+
Same as Ruby 2.0 (2-clause BSDL or Ruby original license)
|
85
|
+
|
86
|
+
== See Also
|
87
|
+
|
88
|
+
EventMachine:: http://rubyeventmachine.com/
|
89
|
+
EM-HTTP-Request:: https://github.com/igrigorik/em-http-request
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "em-http-fetcher"
|
4
|
+
|
5
|
+
# Gem packaging metadata for em-http-fetcher.
Gem::Specification.new do |s|
  s.name        = "em-http-fetcher"
  # The version is taken from the library itself, which is why this file
  # requires "em-http-fetcher" before building the specification.
  s.version     = EventMachine::HttpFetcher::VERSION

  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Tatsuki Sugiura"]
  s.email       = ["sugi@nemui.org"]
  s.homepage    = "http://github.com/sugi/em-http-fetcher"
  # Spelling of "EventMachine" and "configurable" corrected; these strings are
  # what users see on the gem index.
  s.summary     = "HTTP fetch client based on Ruby EventMachine and EM-HTTP-Request"
  s.description = "HTTP fetch client based on Ruby EventMachine and EM-HTTP-Request that has configurable concurrency regardless of EM's thread pool."

  # s.rubyforge_project = ""

  s.required_ruby_version = '>= 1.9.0'

  s.add_dependency "addressable", ">= 2.2.3"
  # Strictly greater than 1.0.3: the README explains that the gem is meant to
  # be uninstallable until a newer em-http-request is released.
  s.add_dependency "em-http-request", "> 1.0.3"

  # s.add_development_dependency "rspec"
  # s.add_development_dependency "rake"

  # File lists come from git, so the gem must be built from a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
31
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'em/http-fetcher/fetcher.rb'
|
@@ -0,0 +1,163 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'eventmachine'
|
3
|
+
require 'em/pool'
|
4
|
+
require 'em-http-request'
|
5
|
+
require 'addressable/uri'
|
6
|
+
|
7
|
+
module EventMachine
  # HTTP fetch client built on EventMachine and em-http-request.
  #
  # Requests are throttled twice: by a global limit (+:concurrency+) and by a
  # per-origin limit (+:host_concurrency+). After a request finishes, its
  # connection is held back for +:host_request_wait+ seconds before reuse.
  #
  # The request pool registers EventMachine timers when it is first used, so
  # #request is expected to be called from inside EM.run (as the README
  # example does).
  class HttpFetcher
    VERSION = "0.1.0"

    # Option keys consumed by HttpFetcher itself; every other key given to
    # HttpFetcher.new is handed to EM::HttpRequest.new.
    POOL_OPTION_KEYS = [:concurrency, :host_concurrency, :host_request_wait].freeze

    # Hands out keep-alive connections while enforcing the global and the
    # per-origin concurrency limits.
    class RequestPool
      # Ports implied by the scheme; they are left out of origin keys.
      DEFAULT_PORTS = { 'http' => 80, 'https' => 443 }.freeze

      # Builds the "scheme://host[:port]" key that identifies a host pool.
      # The port is included only when it is explicit and differs from the
      # scheme's default, so that "http://h:8080/" is not fetched from port 80.
      #
      # uri:: object responding to +scheme+, +host+ and +port+, or nil
      # Returns the key String, or nil when +uri+ is nil.
      def self.origin(uri)
        return nil unless uri
        scheme = uri.scheme
        port   = uri.port
        base   = "#{scheme}://#{uri.host}"
        if port.nil? || port == DEFAULT_PORTS[scheme.to_s.downcase]
          base
        else
          "#{base}:#{port}"
        end
      end

      # total_size::         maximum number of requests in flight overall
      # host_resource_size:: number of connections created per origin
      # host_reuse_wait::    seconds a connection rests before it is reused
      # opts::               options passed to every EM::HttpRequest.new
      def initialize(total_size, host_resource_size, host_reuse_wait = 0, opts = {})
        @total_size = total_size
        @host_reuse_wait = host_reuse_wait
        @host_resource_size = host_resource_size
        @conn_opts = opts

        # One token per allowed in-flight request.
        @total_queue = EM::Queue.new
        total_size.times { @total_queue.push true }

        # Host pools are created lazily on first access.
        @host_pools = Hash.new do |pools, host|
          pool = EM::Pool.new
          host_resource_size.times { pool.add new_connection(host) }
          pools[host] = { pool: pool, last_used: Time.now }
        end
        run
      end

      # Runs the block with a connection for +host+ once both a host
      # connection and a global token are available. The block must return
      # the deferrable of the request it started.
      #
      # EM::Pool needs a deferrable from its work block straight away, but
      # the real request may have to wait for a global token. A placeholder
      # deferrable is therefore returned to the pool and completed only when
      # the connection may be reused; EM::Pool then requeues it itself.
      def perform(host, &b)
        work = EM::Callback(&b)
        @host_pools[host][:pool].perform do |conn|
          released = EM::DefaultDeferrable.new
          @total_queue.pop do |token|
            @host_pools[host][:last_used] = Time.now
            start_request(work, host, conn, token, released)
          end
          released
        end
      end

      # Installs the periodic timer that drops host pools which are idle
      # (all connections back in the pool) and unused for five minutes.
      def run
        EM.add_periodic_timer(10) do
          hrsize = @host_resource_size
          expiry = Time.now.to_i - 5 * 60
          @host_pools.delete_if do |_host, info|
            # EM::Pool exposes no idle count, so peek at its resource queue.
            idle = info[:pool].instance_variable_get(:@resources).size >= hrsize
            idle && info[:last_used].to_i <= expiry
          end
        end
      end

      private

      # Creates a connection object for +host+ with the configured options.
      def new_connection(host)
        EM::HttpRequest.new(host, @conn_opts)
      end

      # Invokes the caller's work block and arranges for the token and the
      # connection to be given back when the request completes or fails.
      def start_request(work, host, conn, token, released)
        started = false
        finish = proc do |completed|
          @total_queue.push token
          recycle(host, conn, completed, released)
        end
        req = work.call(conn)
        req.callback(&finish)
        req.errback(&finish)
        started = true
      ensure
        # The work block raised or returned a non-deferrable: do not leak
        # the global token or the connection.
        unless started
          @total_queue.push token
          released.fail
        end
      end

      # Gives +conn+ back to its host pool after the reuse wait. When the
      # request ended on another origin (redirect), the connection is
      # replaced by a fresh one for +host+.
      def recycle(host, conn, completed, released)
        hand_back = proc do
          if redirected_away?(completed, host)
            pool = @host_pools[host][:pool]
            pool.remove conn
            pool.add new_connection(host)
          end
          released.succeed
        end

        if @host_reuse_wait > 0
          EM.add_timer(@host_reuse_wait, &hand_back)
        else
          hand_back.call
        end
      end

      # True unless the finished request reports a last effective URL on the
      # same origin as +host+.
      def redirected_away?(completed, host)
        url = completed.respond_to?(:last_effective_url) ? completed.last_effective_url : nil
        self.class.origin(url) != host
      end
    end

    # opts:: +:concurrency+ (default 40), +:host_concurrency+ (default 2),
    #        +:host_request_wait+ (default 0.2); all other keys are kept as
    #        connection options for EM::HttpRequest.new.
    def initialize(opts = {})
      @concurrency = opts[:concurrency] || 40
      @host_concurrency = opts[:host_concurrency] || 2
      @host_request_wait = opts[:host_request_wait] || 0.2
      @request_pool = nil
      @default_callbacks = []
      @default_errbacks = []
      @req_opts = opts.reject { |key, _| POOL_OPTION_KEYS.include?(key) }
    end

    # Lazily built RequestPool shared by all requests of this fetcher.
    def request_pool
      @request_pool ||= RequestPool.new(@concurrency, @host_concurrency, @host_request_wait, @req_opts)
    end

    # Registers a block that is attached as callback to every request.
    def callback(&block)
      @default_callbacks << block
    end

    # Registers a block that is attached as errback to every request.
    def errback(&block)
      @default_errbacks << block
    end

    # Queues a request.
    #
    # Call as <tt>request(uri, opts = {})</tt> or <tt>request(opts)</tt> with
    # +:uri+ in the hash. +:method+ selects the HTTP verb (default +:get+);
    # remaining keys are passed to the verb method of the connection.
    #
    # Returns the request object when it could be started before this method
    # returns, otherwise nil (the request is then started later by the pool).
    # Raises ArgumentError when the URI has no scheme or host.
    def request(*args)
      if args.first.kind_of?(Hash)
        # Work on a copy so the caller's hash keeps its :uri entry.
        opts = args.first.dup
        uri = opts.delete(:uri)
      else
        uri = args.first
        opts = args[1].kind_of?(Hash) ? args[1] : {}
      end

      uri = Addressable::URI.parse(uri.to_s) unless uri.kind_of?(Addressable::URI)
      unless uri.scheme && uri.host
        raise ArgumentError, "absolute URI required: #{uri.to_s.inspect}"
      end

      path = uri.path.to_s
      opts = {
        :keepalive => true,
        :redirects => 20,
        :path      => path.empty? ? '/' : path,
      }.merge(opts)
      method = opts.delete(:method) || :get
      opts[:query] = uri.query if uri.query

      started = nil
      request_pool.perform(RequestPool.origin(uri)) do |conn|
        started = conn.__send__(method, opts)
        @default_callbacks.each { |cb| started.callback(&cb) }
        @default_errbacks.each { |cb| started.errback(&cb) }
        started
      end
      started
    end
  end
end
|
139
|
+
|
140
|
+
|
141
|
+
# Demo driver, run only when this file is executed directly: reads URLs (one
# per line) from ARGF, requests each of them and prints the outcome.
if __FILE__ == $0
  trap(:INT) { EM.stop }
  EM.run do
    r = EM::HttpFetcher.new
    r.callback do |req|
      p [:success, req.last_effective_url, req.response.size]
    end
    r.errback do |req|
      p [:err, req.last_effective_url, req.response.size]
    end

    ARGF.each { |line|
      line.chomp!
      # `line` is never nil here, so blank input has to be tested explicitly.
      line.empty? and next
      req = r.request(line)
      # request returns nil when the pool has not started the request yet,
      # so only attach the extra callback when a request object came back.
      if req && line == 'http://www.yahoo.co.jp/'
        req.callback do
          p :yahoo!
        end
      end
    }
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: em-http-fetcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tatsuki Sugiura
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-05-11 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: addressable
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.2.3
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 2.2.3
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: em-http-request
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>'
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.0.3
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>'
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.0.3
|
46
|
+
description: HTTP fetch client based on ruby EventMachne and EM-HTTP-Request that
|
47
|
+
has configureable concurrency regardless of EM's thread pool.
|
48
|
+
email:
|
49
|
+
- sugi@nemui.org
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- README.rdoc
|
55
|
+
- em-http-fetcher.gemspec
|
56
|
+
- lib/em-http-fetcher.rb
|
57
|
+
- lib/em/http-fetcher/fetcher.rb
|
58
|
+
homepage: http://github.com/sugi/em-http-fetcher
|
59
|
+
licenses: []
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 1.9.0
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.8.23
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: HTTP fetch client based on ruby EventMachne and EM-HTTP-Request
|
82
|
+
test_files: []
|