blat 0.1.0a
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/blat.rb +16 -0
- data/lib/blat/batch.rb +121 -0
- data/lib/blat/formats.rb +118 -0
- data/lib/blat/pool.rb +432 -0
- metadata +61 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 878ccb8c27eeeeea95bf2771feb5164f0ea9f33b
|
4
|
+
data.tar.gz: 6e06378de9ddd8947ceb87ecced8c1418477a0c7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b9a812fa61e9627a5167e6595dcfde2c616349cac35b67428ba4bdebecbd28c524cb9888cabb99f639f0f0edc43ec371b1d8da9a6b776079f4ccd3e5bf844869
|
7
|
+
data.tar.gz: 5f8d35878a6993d6ace44d9a642e590054fff062d5f631893dd42f996ccb88011d34b3b52f0e457af592e680509cedc550bef3297d12186ce3f026bacf5770a0
|
data/lib/blat.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
require 'blat/pool'
|
3
|
+
require 'blat/batch'
|
4
|
+
require 'blat/formats'
|
5
|
+
|
6
|
+
# Blat is a hugely parallel wrapper for cURL designed to download data as
|
7
|
+
# aggressively as possible.
|
8
|
+
#
|
9
|
+
# Blat makes use of many threads at once in a producer-consumer pattern, and
|
10
|
+
# accepts tasks in the form of Blat::Jobs, which contain configuration and
|
11
|
+
# results from each request.
|
12
|
+
module Blat

  # Version of the gem. Frozen so the shared constant cannot be mutated in
  # place by callers.
  VERSION = '0.1.0a'.freeze

end
|
data/lib/blat/batch.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'blat/pool'
|
4
|
+
|
5
|
+
module Blat
|
6
|
+
|
7
|
+
# The batch downloader is a simpler wrapper around Pool that runs in a
|
8
|
+
# blocking manner. The idea of this is that you can put a list of URLs in,
|
9
|
+
# run #run and then retrieve the results easily and quickly.
|
10
|
+
#
|
11
|
+
# example:
|
12
|
+
#
|
13
|
+
# urls = File.read('url.list').lines
|
14
|
+
#   b = Blat::Batch.new( urls )
|
15
|
+
# b.run(10)
|
16
|
+
# puts "Results: #{b.results}"
|
17
|
+
#
|
18
|
+
class Batch

  # Create a new batch downloader for a given list of URLs, and a given set
  # of configuration options.
  #
  # [:urls] An array of URLs to download. The array is copied, so the
  #         caller's list is left untouched when the batch consumes it.
  # [:config] (optional) configuration to pass to the Jobs. See Blat::Job
  #           for more information.
  def initialize(urls, config = {})

    # Worker configuration handed to every Job built in #run
    @config = config

    # URLs still waiting to be handed to a worker. #run pops entries off
    # this list, hence the private copy.
    @urls    = urls.dup
    @urls_mx = Mutex.new

    # Stores results as Job objects
    @results    = []
    @results_mx = Mutex.new

    # Size of the original list; used to tell when everything has finished
    @url_count = @urls.length
  end

  # Run a batch with a given number of workers. Blocks until the pool
  # reports that every worker is idle, then closes the pool.
  #
  # If a block is provided, it is called with the curl object just before
  # requests are made. This is to allow setting of various parameters, e.g.:
  #
  #  batch.run(10){ |c|
  #     c.follow_location = true
  #  }
  #
  def run(workers, &block)

    # Never start more workers than there are URLs to fetch
    workers = [workers, @urls.length].min

    # Every finished job is appended to the results list
    pool = Blat::Pool.new(workers) do |job|
      @results_mx.synchronize { @results << job }
    end

    # Dispatcher: hand out one Job per remaining URL, then nil
    pool.work do
      url = @urls_mx.synchronize { @urls.pop }

      if url
        Blat::Job.new(@config) do |c|
          # Let the caller configure curl first; the URL is set afterwards
          # so that it cannot be overridden by the block.
          block.call(c) if block

          c.url = url
        end
      else
        # Returning nil sets the worker to idle
        nil
      end
    end

    # Wait until workers are idle
    pool.wait_until_idle

    # Close them all.
    pool.close
  end

  # Is the batch complete? True once one result has been recorded for
  # every URL originally supplied.
  def complete?
    @results_mx.synchronize do
      @results.length == @url_count
    end
  end

  # Report progress as four values, returned in this order:
  #
  #  remaining (yet to do)
  #  complete (completed)
  #  in_progress (currently running)
  #  total (remaining + complete + in progress)
  def progress
    remaining = @urls_mx.synchronize { @urls.length }
    complete  = @results_mx.synchronize { @results.length }

    return remaining,
           complete,
           (@url_count - complete - remaining),
           @url_count
  end

  # Get results as a list of Job objects. This is the live internal array,
  # not a copy.
  def results
    @results_mx.synchronize do
      return @results
    end
  end

end
|
120
|
+
|
121
|
+
end
|
data/lib/blat/formats.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Blat
|
4
|
+
|
5
|
+
# Blat::Job represents a single download task, both as a request and response.
|
6
|
+
#
|
7
|
+
# Jobs are provided to workers in a pool by a dispatcher block. Each job
|
8
|
+
# contains:
|
9
|
+
#
|
10
|
+
# * Configuration for the worker. Current configuration supported is
|
11
|
+
# detailed below and in the Pool documentation
|
12
|
+
# * A way of configuring a curl request (in order to set the url and other
|
13
|
+
# parameters)
|
14
|
+
# * Data returned by the download. This is stored as a hash in the #data
|
15
|
+
# parameter.
|
16
|
+
#
|
17
|
+
# == Worker Configuration
|
18
|
+
#
|
19
|
+
# Workers are configured by setting values in a hash. This hash is sent to
|
20
|
+
# the worker from the Job class, and contains options that affect the process
|
21
|
+
# of downloading. This is in addition to configuration on the curl object
|
22
|
+
# performed through Blat::Job.configure()
|
23
|
+
#
|
24
|
+
# Workers currently support the following configuration options:
|
25
|
+
#
|
26
|
+
# [:max_body_size] If set, downloads will cease after this many bytes have
|
27
|
+
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
28
|
+
# be set to true.
|
29
|
+
#
|
30
|
+
# == Returned Values
|
31
|
+
#
|
32
|
+
# When a job has been finalised, its #data property will be set to a hash
|
33
|
+
# left by the worker. This is currently specified as:
|
34
|
+
#
|
35
|
+
# [:head] The head string returned from the server (response.header_str)
|
36
|
+
# [:body] The body string returned from the server (response.body)
|
37
|
+
# [:response_properties] A hash with metadata in. Partially specified by the
|
38
|
+
# worker configuration, this contains things such as the number of bytes
|
39
|
+
# downloaded and duration of the request.
|
40
|
+
# [:response] The raw response from curl
|
41
|
+
# [:error] Any errors encountered during download, such as network errors.
|
42
|
+
# If this is nil the request was successful.
|
43
|
+
#
|
44
|
+
# Response properties are currently set to:
|
45
|
+
#
|
46
|
+
# response_properties = {
|
47
|
+
# round_trip_time: res.total_time,
|
48
|
+
# redirect_time: res.redirect_time,
|
49
|
+
# dns_lookup_time: res.name_lookup_time,
|
50
|
+
# effective_uri: res.last_effective_url,
|
51
|
+
# code: res.response_code,
|
52
|
+
# download_speed: res.download_speed,
|
53
|
+
# downloaded_bytes: res.downloaded_bytes || 0,
|
54
|
+
# truncated: ignore == true
|
55
|
+
# }
|
56
|
+
#
|
57
|
+
class Job

  # config:: the worker configuration hash given at construction.
  # data::   the result hash stored by #finalise! (nil until then).
  attr_reader :config, :data

  # Construct a new Job with a block for configuring curl options.
  #
  # [:config] (optional) worker configuration hash.
  #
  # Raises a RuntimeError if no block is given.
  def initialize(config = {}, &block)
    raise 'No curl configuration block given' unless block

    @curl_config_block = block
    @config            = config
    @finalised         = false
  end

  # Configure a curl object to make the request by passing it to the block
  # given at construction.
  def configure(curl)
    @curl_config_block.call(curl)
  end

  # Has this job been completed? False until #finalise! has been called.
  def finalised?
    @finalised
  end

  # Allow people to use closed? instead.
  alias :closed? :finalised?

  # Write result and prevent further editing.
  #
  # Raises a RuntimeError if the job has already been finalised.
  def finalise!(data = {})
    raise 'Job is already finalised.' if finalised?

    @data      = data
    @finalised = true
  end
end
|
90
|
+
|
91
|
+
# --------------------------------------------------------------------------
|
92
|
+
|
93
|
+
|
94
|
+
# SimpleJob is a quick and easy way of wrapping a URL to create a job.
|
95
|
+
#
|
96
|
+
# It accepts:
|
97
|
+
#
|
98
|
+
# [:url] The URL to download
|
99
|
+
# [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
|
100
|
+
# [:config] The worker configuration properties.
|
101
|
+
class SimpleJob < Job

  # Build a job that downloads +url+.
  #
  # [:url] The URL to download
  # [:curl_config] A hash of properties to set on the curl object. Each key
  #                is turned into a "key=" setter call; an Array value is
  #                splatted into multiple arguments.
  # [:config] The worker configuration properties.
  def initialize(url, curl_config = {}, config = {})
    # Work on a merged copy so the caller's hash is not modified
    settings = curl_config.merge(url: url)

    super(config) do |curl|
      settings.each do |key, value|
        setter = "#{key}="

        if value.is_a?(Array)
          curl.send(setter, *value)
        else
          curl.send(setter, value)
        end
      end
    end
  end
end
|
116
|
+
|
117
|
+
|
118
|
+
end
|
data/lib/blat/pool.rb
ADDED
@@ -0,0 +1,432 @@
|
|
1
|
+
|
2
|
+
require 'thread'
|
3
|
+
require 'curl'
|
4
|
+
|
5
|
+
require 'blat/formats'
|
6
|
+
|
7
|
+
module Blat
|
8
|
+
|
9
|
+
# The Blat::Pool class controls a number of workers as they go about running
|
10
|
+
# curl Jobs. This is the main class of Blat, and is the most flexible way of
|
11
|
+
# using the gem (Batch is simpler but less full-featured).
|
12
|
+
#
|
13
|
+
# == Workflow
|
14
|
+
#
|
15
|
+
# The pool is created with a size and a callback to present results to.
|
16
|
+
# This callback may be presented as a proc object or as a block, and is
|
17
|
+
# called with a finalised Blat::Job object upon completion of each request.
|
18
|
+
#
|
19
|
+
# x = Blat::Pool.new(100){ |job|
|
20
|
+
# puts "#{job.data[:body]}"
|
21
|
+
# }
|
22
|
+
#
|
23
|
+
# Once a pool is configured, it may be commanded to start downloading by
|
24
|
+
# presenting it with a dispatcher. This is a procedure that returns either a
|
25
|
+
# Blat::Job object or nil---workers will call this block in order to acquire
|
26
|
+
# work, and will enter an idle state when nil is returned.
|
27
|
+
#
|
28
|
+
# job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
|
29
|
+
#
|
30
|
+
# x.work{
|
31
|
+
# job_list.pop
|
32
|
+
# }
|
33
|
+
#
|
34
|
+
# Downloading can be waited upon any number of ways. The status of the pool
|
35
|
+
# may be requested with #count_idle and #all_idle? , and it's possible to
|
36
|
+
# wait until idle using #wait_until_idle :
|
37
|
+
#
|
38
|
+
# x.wait_until_idle
|
39
|
+
# x.close
|
40
|
+
#
|
41
|
+
# == Worker Configuration
|
42
|
+
#
|
43
|
+
# Workers are configured by setting values in a hash. This hash is sent to
|
44
|
+
# the worker from the Job class, and contains options that affect the process
|
45
|
+
# of downloading. This is in addition to configuration on the curl object
|
46
|
+
# performed through Blat::Job.configure()
|
47
|
+
#
|
48
|
+
# Workers currently support the following configuration options:
|
49
|
+
#
|
50
|
+
# [:max_body_size] If set, downloads will cease after this many bytes have
|
51
|
+
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
52
|
+
# be set to true.
|
53
|
+
#
|
54
|
+
# == Returned Values
|
55
|
+
#
|
56
|
+
# When a job has been finalised, its #data property will be set to a hash
|
57
|
+
# left by the worker. This is currently specified as:
|
58
|
+
#
|
59
|
+
# [:head] The head string returned from the server (response.header_str)
|
60
|
+
# [:body] The body string returned from the server (response.body)
|
61
|
+
# [:response_properties] A hash with metadata in. Partially specified by the
|
62
|
+
# worker configuration, this contains things such as the number of bytes
|
63
|
+
# downloaded and duration of the request.
|
64
|
+
# [:response] The raw response from curl
|
65
|
+
# [:error] Any errors encountered during download, such as network errors.
|
66
|
+
# If this is nil the request was successful.
|
67
|
+
#
|
68
|
+
# Response properties are currently set to:
|
69
|
+
#
|
70
|
+
# response_properties = {
|
71
|
+
# round_trip_time: res.total_time,
|
72
|
+
# redirect_time: res.redirect_time,
|
73
|
+
# dns_lookup_time: res.name_lookup_time,
|
74
|
+
# effective_uri: res.last_effective_url,
|
75
|
+
# code: res.response_code,
|
76
|
+
# download_speed: res.download_speed,
|
77
|
+
# downloaded_bytes: res.downloaded_bytes || 0,
|
78
|
+
# truncated: ignore == true
|
79
|
+
# }
|
80
|
+
#
|
81
|
+
class Pool
|
82
|
+
|
83
|
+
# Construct a new pool with a given size and a callback used to output
|
84
|
+
# data.
|
85
|
+
#
|
86
|
+
# x = Blat::Pool.new(100){ |job|
|
87
|
+
# puts "Job complete: #{job}"
|
88
|
+
# }
|
89
|
+
#
|
90
|
+
def initialize(size, finalise_callback = nil, &block)
  # Bookkeeping shared between the pool and its workers
  @m          = Mutex.new   # Data mutex for "producer" status
  @t          = {}          # worker => thread
  @w          = []          # worker objects
  @idle       = []          # idle flag per worker id
  @idle_mutex = Mutex.new
  @size       = size.to_i   # number of simultaneous workers

  # A block takes precedence over a proc passed as the second argument
  handler = block
  handler ||= finalise_callback if finalise_callback.is_a?(Proc)
  raise 'No callback given for final data' unless handler

  @finalise_callback = handler
end
|
109
|
+
|
110
|
+
# ------------------------------------------------------------------------
|
111
|
+
# Workers call these to report status
|
112
|
+
#
|
113
|
+
|
114
|
+
# Workers can register as active by calling this
|
115
|
+
def worker_active(worker_id)
  # Clear this worker's idle flag while holding the idle lock
  @idle_mutex.synchronize do
    @idle[worker_id] = false
  end
end
|
118
|
+
|
119
|
+
# Workers can register as idle by calling this
|
120
|
+
def worker_idle(worker_id)
  # Set this worker's idle flag while holding the idle lock
  @idle_mutex.synchronize do
    @idle[worker_id] = true
  end
end
|
123
|
+
|
124
|
+
# Workers can register that they have completed
|
125
|
+
# a job by calling this.
|
126
|
+
def work_complete(job)
  # Pass the finished job to the callback supplied when the pool was built
  handler = @finalise_callback
  handler.call(job)
end
|
129
|
+
|
130
|
+
# ------------------------------------------------------------------------
|
131
|
+
# Worker status
|
132
|
+
#
|
133
|
+
|
134
|
+
# check to see if all workers are idle
|
135
|
+
# Returns true when every registered worker is flagged idle (also true when
# no workers are registered), false otherwise. Always a strict boolean.
def all_idle?
  @idle_mutex.synchronize { @idle.all? }
end
|
138
|
+
|
139
|
+
# Return the number of idle workers
|
140
|
+
def count_idle
  # Count the slots whose flag is exactly true, under the idle lock
  @idle_mutex.synchronize do
    @idle.count { |flag| flag == true }
  end
end
|
143
|
+
|
144
|
+
# ------------------------------------------------------------------------
|
145
|
+
# Set work and initialise workers
|
146
|
+
#
|
147
|
+
|
148
|
+
# Create workers without running them.
|
149
|
+
#
|
150
|
+
# This is usually not very useful to call on its own, and is called by
|
151
|
+
# #work when creating threads.
|
152
|
+
def init_workers
  # Always start from an empty worker list, then build @size workers with
  # ids 0...@size, flagging each one idle as it is created.
  @w = []
  0.upto(@size - 1) do |slot|
    @w.push(Worker.new(slot, self))
    @idle[slot] = true
  end
end
|
161
|
+
|
162
|
+
# Run a worker over every point competitively.
|
163
|
+
# Will create @size workers if they do not already exist (there is no need
|
164
|
+
# to also call init_workers)
|
165
|
+
# Start one thread per worker, each pulling jobs from the dispatcher.
#
# [:dispatcher] (optional) a Proc that returns a Blat::Job, or nil when there
#               is currently no work. May be given as a block instead; if
#               both are given the explicit argument is used.
#
# Raises a RuntimeError if neither a Proc nor a block is supplied.
def work(dispatcher = nil, &block)

  raise "No dispatcher provided" unless block || dispatcher.is_a?(Proc)

  init_workers

  # Make things do the work
  @start_time = Time.now
  source      = dispatcher || block

  @w.each do |worker|
    # Give each worker a handle back to the dispatcher to get data.
    @t[worker] = Thread.new(source) do |d|
      begin
        worker.work(d)
      rescue SignalException
        # On a signal, kill the worker threads, then re-raise it
        kill_workers
        raise
      end
    end

    # Pass exceptions up
    @t[worker].abort_on_exception = true
  end
end
|
192
|
+
|
193
|
+
# ------------------------------------------------------------------------
|
194
|
+
# Wait on conditions and close the pool
|
195
|
+
#
|
196
|
+
|
197
|
+
# Block until all workers are idle, checking every poll_rate seconds.
|
198
|
+
def wait_until_idle(poll_rate = 0.5)
  # Always sleep once before the first check, then keep polling until
  # every worker reports idle.
  loop do
    sleep(poll_rate)
    break if all_idle?
  end
end
|
203
|
+
|
204
|
+
# Wait for threads to complete.
|
205
|
+
def wait_until_closed
  # Join every worker thread; returns once all of them have finished
  @t.each_value(&:join)
end
|
210
|
+
|
211
|
+
# Tell workers to die forcibly
|
212
|
+
# Forcibly kill every worker thread.
#
# @t maps worker => thread, so only the hash values are iterated.
def kill_workers
  @t.each_value(&:kill)
end
|
217
|
+
|
218
|
+
# Close all workers' connections to the servers cleanly,
|
219
|
+
#
|
220
|
+
# This is non-blocking. Call #close or #wait_until_closed to block:
|
221
|
+
#
|
222
|
+
# pool.close_nonblock
|
223
|
+
# pool.wait_until_closed
|
224
|
+
#
|
225
|
+
def close_nonblock
  # Ask each worker to close; does not wait for the threads to exit
  @w.each(&:close)
end
|
229
|
+
|
230
|
+
# Cleanly close the pool, waiting for workers to end their
|
231
|
+
# current request. Blocks, unlike #close_nonblock.
|
232
|
+
def close
  # Ask every worker to close ...
  close_nonblock

  # ... then block until their threads have finished
  wait_until_closed
end
|
236
|
+
|
237
|
+
private
|
238
|
+
|
239
|
+
# Workers are instantiated and maintained by a Blat::Pool and continually
|
240
|
+
# poll for available work, passing it off for integration with the final
|
241
|
+
# results set.
|
242
|
+
#
|
243
|
+
# Though it is possible to create your own, I would recommend instead using
|
244
|
+
# a pool.
|
245
|
+
#
|
246
|
+
# == Worker Configuration
|
247
|
+
#
|
248
|
+
# Workers are configured by setting values in a hash. This hash is sent to
|
249
|
+
# the worker from the Job class, and contains options that affect the process
|
250
|
+
# of downloading. This is in addition to configuration on the curl object
|
251
|
+
# performed through Blat::Job.configure()
|
252
|
+
#
|
253
|
+
# Workers currently support the following configuration options:
|
254
|
+
#
|
255
|
+
# [:max_body_size] If set, downloads will cease after this many bytes have
|
256
|
+
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
257
|
+
# be set to true.
|
258
|
+
#
|
259
|
+
# == Returned Values
|
260
|
+
#
|
261
|
+
# When a job has been finalised, its #data property will be set to a hash
|
262
|
+
# left by the worker. This is currently specified as:
|
263
|
+
#
|
264
|
+
# [:head] The head string returned from the server (response.header_str)
|
265
|
+
# [:body] The body string returned from the server (response.body)
|
266
|
+
# [:response_properties] A hash with metadata in. Partially specified by the
|
267
|
+
# worker configuration, this contains things such as the number of bytes
|
268
|
+
# downloaded and duration of the request.
|
269
|
+
# [:response] The raw response from curl
|
270
|
+
# [:error] Any errors encountered during download, such as network errors.
|
271
|
+
# If this is nil the request was successful.
|
272
|
+
#
|
273
|
+
# Response properties are currently set to:
|
274
|
+
#
|
275
|
+
# response_properties = {
|
276
|
+
# round_trip_time: res.total_time,
|
277
|
+
# redirect_time: res.redirect_time,
|
278
|
+
# dns_lookup_time: res.name_lookup_time,
|
279
|
+
# effective_uri: res.last_effective_url,
|
280
|
+
# code: res.response_code,
|
281
|
+
# download_speed: res.download_speed,
|
282
|
+
# downloaded_bytes: res.downloaded_bytes || 0,
|
283
|
+
# truncated: ignore == true
|
284
|
+
# }
|
285
|
+
#
|
286
|
+
class Worker

  # Construct a new worker with a given ID and linked to a given pool.
  #
  # The pool will be called to report idle/working states
  # (#worker_active, #worker_idle) and finished jobs (#work_complete).
  def initialize(id, pool)
    @id    = id
    @pool  = pool
    @abort = false
  end

  # Should be run in a thread. Repeatedly calls the dispatcher for Jobs and
  # downloads them; when the dispatcher returns anything that is not a Job
  # the worker reports itself idle, sleeps for a second and asks again.
  # Returns once #close has been called.
  def work(dispatcher)
    # start idle
    idle = true

    loop do
      while (job = dispatcher.call).is_a?(Job) do

        # Tell the pool only on the idle -> active transition
        if idle
          @pool.worker_active(@id)
          idle = false
        end

        # Make the request
        complete_request(job, new_curl(job), job.config)

        return if @abort
      end
      return if @abort

      # Nothing to do right now: report idle and back off.
      # TODO: configurable
      @pool.worker_idle(@id)
      idle = true
      sleep(1)
    end
  end

  # Ask the worker to stop. The flag is checked after each request and
  # after each empty poll of the dispatcher.
  def close
    @abort = true
  end

  private

  # Datapoint is complete: store the result hash on the job and hand the
  # job to the pool.
  def finalise(job, head, body, response_properties, response, error)
    job.finalise!(
      head: head,
      body: body,
      response_properties: response_properties,
      response: response,
      error: error
    )

    @pool.work_complete(job)
  end

  # ---------- called by workers below this line

  # Perform the request on +res+ (a configured curl object) and finalise
  # +job+ with the outcome.
  #
  # [:config] worker configuration; :max_body_size limits the stored body
  #           to that many bytes.
  #
  # A StandardError raised while downloading is stored in the job's
  # :error field (response_properties is then nil). The job is finalised
  # exactly once; errors raised by finalisation itself are not rescued.
  # SignalException is not a StandardError, so it propagates untouched.
  def complete_request(job, res, config)
    limit               = config[:max_body_size]
    body                = ''
    truncated           = false
    response_properties = nil
    error               = nil

    begin
      # If limiting body size, use a callback to handle incoming data
      if limit
        res.on_body do |chunk|
          unless truncated
            room = [limit - body.bytesize, 0].max

            if chunk.bytesize > room
              # Keep only as many bytes as still fit under the limit
              body << chunk.byteslice(0, room)
              truncated = true
            else
              body << chunk
            end
          end

          # Have to return number of bytes to curb, even when ignoring data
          chunk.bytesize
        end
      end

      # Perform a request prepared elsewhere,
      # can run alongside other requests
      res.perform

      # Load body directly from response if not using the system above
      body = res.body_str unless limit

      # Load stuff out of response object.
      response_properties = {
        round_trip_time:  res.total_time,
        redirect_time:    res.redirect_time,
        dns_lookup_time:  res.name_lookup_time,
        effective_uri:    res.last_effective_url,
        code:             res.response_code,
        download_speed:   res.download_speed,
        downloaded_bytes: res.downloaded_bytes || 0,
        truncated:        truncated
      }
    rescue StandardError => e
      error = e
    end

    # write to datapoint list
    finalise(job, res.header_str, body, response_properties, res, error)
  end

  # Returns a new curl object, configured by the job, to use for the
  # download.
  def new_curl(job)
    c = Curl::Easy.new
    job.configure(c)
    c
  end

end
|
428
|
+
|
429
|
+
end
|
430
|
+
|
431
|
+
end
|
432
|
+
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: blat
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0a
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Stephen Wattam
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-06-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: curb
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.8'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.8'
|
27
|
+
description: A very parallel cURL wrapper for ongoing download tasks
|
28
|
+
email: stephenwattam@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/blat/pool.rb
|
34
|
+
- lib/blat/formats.rb
|
35
|
+
- lib/blat/batch.rb
|
36
|
+
- ./lib/blat.rb
|
37
|
+
homepage: http://stephenwattam.com/projects/blat
|
38
|
+
licenses:
|
39
|
+
- Beerware
|
40
|
+
metadata: {}
|
41
|
+
post_install_message: Thanks for installing Blat!
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '1.9'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>'
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.3.1
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.0.0
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: Aggressive parallel web request library
|
61
|
+
test_files: []
|