blat 0.1.0a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/blat.rb +16 -0
- data/lib/blat/batch.rb +121 -0
- data/lib/blat/formats.rb +118 -0
- data/lib/blat/pool.rb +432 -0
- metadata +61 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 878ccb8c27eeeeea95bf2771feb5164f0ea9f33b
|
|
4
|
+
data.tar.gz: 6e06378de9ddd8947ceb87ecced8c1418477a0c7
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: b9a812fa61e9627a5167e6595dcfde2c616349cac35b67428ba4bdebecbd28c524cb9888cabb99f639f0f0edc43ec371b1d8da9a6b776079f4ccd3e5bf844869
|
|
7
|
+
data.tar.gz: 5f8d35878a6993d6ace44d9a642e590054fff062d5f631893dd42f996ccb88011d34b3b52f0e457af592e680509cedc550bef3297d12186ce3f026bacf5770a0
|
data/lib/blat.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
require 'blat/pool'
|
|
3
|
+
require 'blat/batch'
|
|
4
|
+
require 'blat/formats'
|
|
5
|
+
|
|
6
|
+
# Blat is a hugely parallel wrapper for cURL designed to download data as
|
|
7
|
+
# aggressively as possible.
|
|
8
|
+
#
|
|
9
|
+
# Blat makes use of many threads at once in a producer-consumer pattern, and
|
|
10
|
+
# accepts tasks in the form of Blat::Jobs, which contain configuration and
|
|
11
|
+
# results from each request.
|
|
12
|
+
# Top-level namespace for the Blat gem: a hugely parallel cURL wrapper
# designed to download data as aggressively as possible.
module Blat

  # Gem version string, mirrored in the gemspec.
  VERSION = '0.1.0a'

end
|
data/lib/blat/batch.rb
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
require 'blat/pool'
|
|
4
|
+
|
|
5
|
+
module Blat
|
|
6
|
+
|
|
7
|
+
# The batch downloader is a simpler wrapper around Pool that runs in a
|
|
8
|
+
# blocking manner. The idea of this is that you can put a list of URLs in,
|
|
9
|
+
# run #run and then retrieve the results easily and quickly.
|
|
10
|
+
#
|
|
11
|
+
# example:
|
|
12
|
+
#
|
|
13
|
+
# urls = File.read('url.list').lines
|
|
14
|
+
# b = Blat::Batch.new( list )
|
|
15
|
+
# b.run(10)
|
|
16
|
+
# puts "Results: #{b.results}"
|
|
17
|
+
#
|
|
18
|
+
# Simple blocking wrapper around Blat::Pool. Feed it a list of URLs,
# call #run, then read the finished jobs back out of #results.
#
# Example:
#
#   urls = File.read('url.list').lines
#   batch = Blat::Batch.new(urls)
#   batch.run(10)
#   puts "Results: #{batch.results}"
#
class Batch

  # Build a batch for a given URL list.
  #
  # [:urls] An array of URLs to download (consumed destructively by #run).
  # [:config] (optional) per-job worker configuration hash. See Blat::Job
  # for more information.
  def initialize(urls, config = {})
    # Per-job worker configuration
    @config = config

    # Outstanding URLs, mutex-guarded because workers pop concurrently
    @urls = urls
    @urls_mx = Mutex.new

    # Finalised Job objects accumulate here
    @results = []
    @results_mx = Mutex.new

    # Original list size, used by #complete? and #progress
    @url_count = urls.length
  end

  # Run the batch with up to +workers+ parallel workers, blocking until
  # every URL has been attempted.
  #
  # If a block is provided it is called with each curl handle just before
  # the request is made, to allow setting parameters, e.g.:
  #
  #   batch.run(10) { |c| c.follow_location = true }
  #
  def run(workers, &block)
    # No point spinning up more workers than there are URLs
    worker_count = [workers, @urls.length].min

    # Completed jobs get stashed on the results list
    pool = Blat::Pool.new(worker_count) do |job|
      @results_mx.synchronize { @results << job }
    end

    # Dispatcher: hand out one URL per call, nil once exhausted
    pool.work do
      next_url = @urls_mx.synchronize { @urls.pop }

      if next_url
        Blat::Job.new(@config) do |c|
          # Let the caller tweak the curl handle first
          yield(c) if block_given?

          c.url = next_url
        end
      else
        # Returning nil sends the worker idle
        nil
      end
    end

    # Block until everything is done, then shut the pool down
    pool.wait_until_idle
    pool.close
  end

  # True once every URL has produced a result.
  def complete?
    @results_mx.synchronize { @results.length == @url_count }
  end

  # Progress snapshot, returned as four values:
  #
  #   remaining, complete, in_progress, total
  #
  # where total == remaining + complete + in_progress.
  def progress
    remaining = @urls_mx.synchronize { @urls.length }
    done      = @results_mx.synchronize { @results.length }
    [remaining, done, @url_count - done - remaining, @url_count]
  end

  # All finalised Job objects so far, as an array.
  def results
    @results_mx.synchronize { @results }
  end

end
|
|
120
|
+
|
|
121
|
+
end
|
data/lib/blat/formats.rb
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
module Blat
|
|
4
|
+
|
|
5
|
+
# Blat::Job represents a single download task, both as a request and response.
|
|
6
|
+
#
|
|
7
|
+
# Jobs are provided to workers in a pool by a dispatcher block. Each job
|
|
8
|
+
# contains:
|
|
9
|
+
#
|
|
10
|
+
# * Configuration for the worker. Current configuration supported is
|
|
11
|
+
# detailed below and in the Pool documentation
|
|
12
|
+
# * A way of configuring a curl request (in order to set the url and other
|
|
13
|
+
# parameters)
|
|
14
|
+
# * Data returned by the download. This is stored as a hash in the #data
|
|
15
|
+
# parameter.
|
|
16
|
+
#
|
|
17
|
+
# == Worker Configuration
|
|
18
|
+
#
|
|
19
|
+
# Workers are configured by setting values in a hash. This hash is sent to
|
|
20
|
+
# the worker from the Job class, and contains options that affect the process
|
|
21
|
+
# of downloading. This is in addition to configuration on the curl object
|
|
22
|
+
# performed through Blat::Job.configure()
|
|
23
|
+
#
|
|
24
|
+
# Workers currently support the following configuration options:
|
|
25
|
+
#
|
|
26
|
+
# [:max_body_size] If set, downloads will cease after this many bytes have
|
|
27
|
+
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
|
28
|
+
# be set to true.
|
|
29
|
+
#
|
|
30
|
+
# == Returned Values
|
|
31
|
+
#
|
|
32
|
+
# When a job has been finalised, its #data property will be set to a hash
|
|
33
|
+
# left by the worker. This is currently specified as:
|
|
34
|
+
#
|
|
35
|
+
# [:head] The head string returned from the server (response.header_str)
|
|
36
|
+
# [:body] The body string returned from the server (response.body)
|
|
37
|
+
# [:response_properties] A hash with metadata in. Partially specified by the
|
|
38
|
+
# worker configuration, this contains things such as the number of bytes
|
|
39
|
+
# downloaded and duration of the request.
|
|
40
|
+
# [:response] The raw response from curl
|
|
41
|
+
# [:error] Any errors encountered during download, such as network errors.
|
|
42
|
+
# If this is nil the request was successful.
|
|
43
|
+
#
|
|
44
|
+
# Response properties are currently set to:
|
|
45
|
+
#
|
|
46
|
+
# response_properties = {
|
|
47
|
+
# round_trip_time: res.total_time,
|
|
48
|
+
# redirect_time: res.redirect_time,
|
|
49
|
+
# dns_lookup_time: res.name_lookup_time,
|
|
50
|
+
# effective_uri: res.last_effective_url,
|
|
51
|
+
# code: res.response_code,
|
|
52
|
+
# download_speed: res.download_speed,
|
|
53
|
+
# downloaded_bytes: res.downloaded_bytes || 0,
|
|
54
|
+
# truncated: ignore == true
|
|
55
|
+
# }
|
|
56
|
+
#
|
|
57
|
+
# Blat::Job represents a single download task: configuration in, results out.
#
# A Job wraps a block that configures a curl handle (setting the URL and any
# other options) plus a hash of worker configuration (e.g. :max_body_size).
# Once a worker has completed the request it calls #finalise! with a result
# hash, after which the job is read-only and #data holds:
#
# [:head] The head string returned from the server
# [:body] The body string returned from the server
# [:response_properties] Metadata hash (timings, byte counts, truncation)
# [:response] The raw curl response object
# [:error] Any error raised during download, or nil on success
class Job

  attr_reader :config, :data

  # Construct a new Job.
  #
  # [config] worker configuration hash (see class docs)
  # A block is required; it receives the curl object via #configure.
  def initialize(config = {}, &block)
    raise 'No curl configuration block given' unless block_given?

    @curl_config_block = block
    @config = config
    @finalised = false
  end

  # Apply this job's configuration block to a curl object.
  def configure(curl)
    @curl_config_block.call(curl)
  end

  # Has this job been completed?
  def finalised?
    # BUG FIX: previously read `@finalise` (never assigned, always nil),
    # so jobs could be silently finalised more than once.
    @finalised
  end

  # Allow people to use closed? instead.
  alias :closed? :finalised?

  # Record the result hash and lock the job against further writes.
  # Raises if the job has already been finalised.
  def finalise!(data = {})
    raise 'Job is already finalised.' if finalised?
    @data = data
    @finalised = true
  end
end
|
|
90
|
+
|
|
91
|
+
# --------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# SimpleJob is a quick and easy way of wrapping a URL to create a job.
|
|
95
|
+
#
|
|
96
|
+
# It accepts:
|
|
97
|
+
#
|
|
98
|
+
# [:url] The URL to download
|
|
99
|
+
# [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
|
|
100
|
+
# [:config] The worker configuration properties.
|
|
101
|
+
# SimpleJob is a quick and easy way of wrapping a URL to create a job.
#
# It accepts:
#
# [:url] The URL to download
# [:curl_config] A hash of properties to set on the curl object, for
#                example: {'follow_location' => true}. Array values are
#                splatted into the setter.
# [:config] The worker configuration properties, passed through to Job.
class SimpleJob < Job
  def initialize(url, curl_config = {}, config = {})
    curl_config.merge!(url: url)

    super(config) do |c|
      curl_config.each do |key, value|
        # BUG FIX: the original called an undefined local `curl` here
        # (the block parameter is `c`), raising NameError on configure.
        if value.is_a?(Array)
          c.send("#{key}=", *value)
        else
          c.send("#{key}=", value)
        end
      end
    end
  end
end
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
end
|
data/lib/blat/pool.rb
ADDED
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
|
|
2
|
+
require 'thread'
|
|
3
|
+
require 'curl'
|
|
4
|
+
|
|
5
|
+
require 'blat/formats'
|
|
6
|
+
|
|
7
|
+
module Blat
|
|
8
|
+
|
|
9
|
+
# The Blat::Pool class controls a number of workers as they go about running
|
|
10
|
+
# curl Jobs. This is the main class of Blat, and is the most flexible way of
|
|
11
|
+
# using the gem (Batch is simpler but less full-featured).
|
|
12
|
+
#
|
|
13
|
+
# == Workflow
|
|
14
|
+
#
|
|
15
|
+
# The pool is created with a size and a callback to present results to.
|
|
16
|
+
# This callback may be presented as a proc object or as a block, and is
|
|
17
|
+
# called with a finalised Blat::Job object upon completion of each request.
|
|
18
|
+
#
|
|
19
|
+
# x = Blat::Pool.new(100){ |job|
|
|
20
|
+
# puts "#{job.data[:body]}"
|
|
21
|
+
# }
|
|
22
|
+
#
|
|
23
|
+
# Once a pool is configured, it may be commanded to start downloading by
|
|
24
|
+
# presenting it with a dispatcher. This is a procedure that returns either a
|
|
25
|
+
# Blat::Job object or nil---workers will call this block in order to acquire
|
|
26
|
+
# work, and will enter an idle state when nil is returned.
|
|
27
|
+
#
|
|
28
|
+
# job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
|
|
29
|
+
#
|
|
30
|
+
# x.work{
|
|
31
|
+
# job_list.pop
|
|
32
|
+
# }
|
|
33
|
+
#
|
|
34
|
+
# Downloading can be waited upon any number of ways. The status of the pool
|
|
35
|
+
# may be requested with #count_idle and #all_idle? , and it's possible to
|
|
36
|
+
# wait until idle using #wait_until_idle :
|
|
37
|
+
#
|
|
38
|
+
# x.wait_until_idle
|
|
39
|
+
# x.close
|
|
40
|
+
#
|
|
41
|
+
# == Worker Configuration
|
|
42
|
+
#
|
|
43
|
+
# Workers are configured by setting values in a hash. This hash is sent to
|
|
44
|
+
# the worker from the Job class, and contains options that affect the process
|
|
45
|
+
# of downloading. This is in addition to configuration on the curl object
|
|
46
|
+
# performed through Blat::Job.configure()
|
|
47
|
+
#
|
|
48
|
+
# Workers currently support the following configuration options:
|
|
49
|
+
#
|
|
50
|
+
# [:max_body_size] If set, downloads will cease after this many bytes have
|
|
51
|
+
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
|
52
|
+
# be set to true.
|
|
53
|
+
#
|
|
54
|
+
# == Returned Values
|
|
55
|
+
#
|
|
56
|
+
# When a job has been finalised, its #data property will be set to a hash
|
|
57
|
+
# left by the worker. This is currently specified as:
|
|
58
|
+
#
|
|
59
|
+
# [:head] The head string returned from the server (response.header_str)
|
|
60
|
+
# [:body] The body string returned from the server (response.body)
|
|
61
|
+
# [:response_properties] A hash with metadata in. Partially specified by the
|
|
62
|
+
# worker configuration, this contains things such as the number of bytes
|
|
63
|
+
# downloaded and duration of the request.
|
|
64
|
+
# [:response] The raw response from curl
|
|
65
|
+
# [:error] Any errors encountered during download, such as network errors.
|
|
66
|
+
# If this is nil the request was successful.
|
|
67
|
+
#
|
|
68
|
+
# Response properties are currently set to:
|
|
69
|
+
#
|
|
70
|
+
# response_properties = {
|
|
71
|
+
# round_trip_time: res.total_time,
|
|
72
|
+
# redirect_time: res.redirect_time,
|
|
73
|
+
# dns_lookup_time: res.name_lookup_time,
|
|
74
|
+
# effective_uri: res.last_effective_url,
|
|
75
|
+
# code: res.response_code,
|
|
76
|
+
# download_speed: res.download_speed,
|
|
77
|
+
# downloaded_bytes: res.downloaded_bytes || 0,
|
|
78
|
+
# truncated: ignore == true
|
|
79
|
+
# }
|
|
80
|
+
#
|
|
81
|
+
# The Blat::Pool class controls a number of workers as they go about running
# curl Jobs. This is the main class of Blat (Batch is simpler but less
# full-featured).
#
# A pool is created with a size and a callback that receives each finalised
# Blat::Job (as a proc argument or a block):
#
#   x = Blat::Pool.new(100) { |job| puts job.data[:body] }
#
# Downloading starts when #work is given a dispatcher: a proc returning
# either a Blat::Job or nil. Workers call it to acquire work and go idle
# when it returns nil:
#
#   x.work { job_list.pop }
#
# Completion can be awaited with #wait_until_idle, then #close.
#
# Per-job worker configuration (via Job#config):
#
# [:max_body_size] If set, downloads cease after this many bytes; when
#                  truncated, data[:response_properties][:truncated] is true.
class Pool

  # Construct a new pool with a given size and a callback used to output
  # data. The callback may be a Proc argument or a block; one is required.
  #
  #   x = Blat::Pool.new(100) { |job| puts "Job complete: #{job}" }
  #
  def initialize(size, finalise_callback = nil, &block)
    @m = Mutex.new     # Data mutex for "producer" status
    @t = {}            # worker => thread
    @w = []            # workers
    @idle = []         # per-worker idle flags, indexed by worker id
    @idle_mutex = Mutex.new
    @size = size.to_i  # number of simultaneous workers

    # Accept the completion handler as a block or a Proc
    if block
      @finalise_callback = block
    elsif finalise_callback && finalise_callback.is_a?(Proc)
      @finalise_callback = finalise_callback
    else
      raise 'No callback given for final data'
    end
  end

  # ------------------------------------------------------------------------
  # Workers call these to report status
  #

  # Workers register as active by calling this.
  def worker_active(worker_id)
    @idle_mutex.synchronize { @idle[worker_id] = false }
  end

  # Workers register as idle by calling this.
  def worker_idle(worker_id)
    @idle_mutex.synchronize { @idle[worker_id] = true }
  end

  # Workers report a completed job here; forwarded to the pool's callback.
  def work_complete(job)
    @finalise_callback.call(job)
  end

  # ------------------------------------------------------------------------
  # Worker status
  #

  # Check to see if all workers are idle.
  def all_idle?
    @idle_mutex.synchronize { @idle.inject(true) { |m, o| m && o } }
  end

  # Return the number of idle workers.
  def count_idle
    @idle_mutex.synchronize { @idle.count(true) }
  end

  # ------------------------------------------------------------------------
  # Set work and initialise workers
  #

  # Create workers without running them.
  #
  # This is usually not very useful to call on its own; #work calls it
  # when creating threads.
  def init_workers
    @w = []
    @size.times do |s|
      @w << Worker.new(s, self)
      @idle[s] = true
    end
  end

  # Start every worker in its own thread, pulling jobs from +dispatcher+
  # (a Proc) or from the given block. The dispatcher must return a
  # Blat::Job, or nil to idle the worker. Creates workers itself — there
  # is no need to also call #init_workers.
  def work(dispatcher = nil, &block)
    # BUG FIX: was `dispatcher.is_?(Proc)`, which raised NoMethodError
    # whenever a Proc dispatcher was supplied instead of a block.
    raise 'No dispatcher provided' unless block_given? || (dispatcher && dispatcher.is_a?(Proc))

    init_workers

    @start_time = Time.now
    @w.each do |w|
      # Give each worker a handle back to the dispatcher to get data.
      @t[w] = Thread.new(dispatcher || block) do |d|
        begin
          w.work(d)
        rescue SignalException => e
          # Kill workers before passing the signal up.
          kill_workers
          raise e
        end
      end

      # Pass exceptions up
      @t[w].abort_on_exception = true
    end
  end

  # ------------------------------------------------------------------------
  # Wait on conditions and close the pool
  #

  # Block until all workers are idle, checking every poll_rate seconds.
  def wait_until_idle(poll_rate = 0.5)
    sleep(poll_rate)
    sleep(poll_rate) until all_idle?
  end

  # Block until all worker threads have terminated.
  def wait_until_closed
    @t.each_value(&:join)
  end

  # Tell workers to die forcibly.
  def kill_workers
    # BUG FIX: @t is a Hash, so a single-arg block received [worker, thread]
    # pairs and Array#kill raised NoMethodError. Iterate values only.
    @t.each_value(&:kill)
  end

  # Ask all workers to close their connections cleanly after the current
  # request. Non-blocking; pair with #wait_until_closed, or use #close:
  #
  #   pool.close_nonblock
  #   pool.wait_until_closed
  #
  def close_nonblock
    @w.each { |w| w.close }
  end

  # Cleanly close the pool, blocking until workers end their current
  # requests (unlike #close_nonblock).
  def close
    close_nonblock
    wait_until_closed
  end

  private

  # Workers are instantiated and maintained by a Blat::Pool. Each one
  # continually polls the dispatcher for jobs, runs the request, and hands
  # the finalised job back to the pool. Prefer using a Pool over creating
  # these directly.
  class Worker

    # Construct a new worker with a given ID, reporting idle/active state
    # back to +pool+.
    def initialize(id, pool)
      @id = id
      @pool = pool
      @abort = false
    end

    # Main loop; should be run in a thread. Pulls jobs from the dispatcher
    # until #close is called, idling (polling every second) while the
    # dispatcher returns nil.
    def work(dispatcher)
      # start idle
      last_idle_state = true

      loop do
        while (job = dispatcher.call).is_a?(Job)
          # If we were idle last, tell the pool we've gone active
          @pool.worker_active(@id) if last_idle_state == true
          last_idle_state = false

          # Make the request
          complete_request(job, new_curl(job), job.config)

          return if @abort
        end
        return if @abort

        # Nothing to do: report idle and poll again shortly.
        # TODO: make the poll interval configurable
        @pool.worker_idle(@id)
        last_idle_state = true
        sleep(1)
      end
    end

    # Ask the worker to stop after its current request.
    def close
      @abort = true
    end

    private

    # Job is complete: write results into it and hand it to the pool.
    def finalise(job, head, body, response_properties, response, error)
      job.finalise!(
        head: head,
        body: body,
        response_properties: response_properties,
        response: response,
        error: error
      )

      @pool.work_complete(job)
    end

    # Perform the request held in +res+ (a configured curl handle) and
    # finalise +job+ with the outcome. Network/curl errors are captured
    # into the job's :error slot rather than raised.
    def complete_request(job, res, config)
      # Somewhere to store the body in a size-aware way.
      # ignore/response_properties are pre-initialised so the rescue path
      # below is always well-defined.
      body = ''
      ignore = false
      response_properties = {}

      # If limiting body size, use a callback to handle incoming data
      if config[:max_body_size]
        res.on_body do |str|
          limit = config[:max_body_size]
          if !ignore && (body.length + str.length) > limit
            # BUG FIX: keep exactly the bytes up to the limit; the original
            # slice arithmetic retained the wrong number of bytes.
            body << str[0...(limit - body.length)]
            ignore = true
          elsif !ignore
            body << str
          end
          # else: over limit — discard data

          # Have to return the number of bytes consumed to curb
          str.length
        end
      end

      # Perform a request prepared elsewhere; can run alongside others.
      res.perform

      # Load body directly from the response if not using the callback above
      body = res.body_str unless config[:max_body_size]

      # Load metadata out of the response object.
      response_properties = {
        round_trip_time: res.total_time,
        redirect_time: res.redirect_time,
        dns_lookup_time: res.name_lookup_time,
        effective_uri: res.last_effective_url,
        code: res.response_code,
        download_speed: res.download_speed,
        downloaded_bytes: res.downloaded_bytes || 0,
        truncated: ignore == true
      }

      # write to datapoint list
      finalise(job, res.header_str, body, response_properties, res, nil)

    rescue SignalException => e
      raise e
    rescue StandardError => e
      # Record the failure; the header may itself be unreadable after a
      # failed transfer, so read it defensively.
      head = begin
               res.header_str
             rescue StandardError
               nil
             end
      finalise(job, head, body, response_properties, res, e)
    end

    # Build a fresh curl handle, configured by +job+, ready for work.
    def new_curl(job)
      c = Curl::Easy.new
      job.configure(c)
      c
    end

  end

end
|
|
430
|
+
|
|
431
|
+
end
|
|
432
|
+
|
metadata
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: blat
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0a
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Stephen Wattam
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2013-06-02 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: curb
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ~>
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.8'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.8'
|
|
27
|
+
description: A very parallel cURL wrapper for ongoing download tasks
|
|
28
|
+
email: stephenwattam@gmail.com
|
|
29
|
+
executables: []
|
|
30
|
+
extensions: []
|
|
31
|
+
extra_rdoc_files: []
|
|
32
|
+
files:
|
|
33
|
+
- lib/blat/pool.rb
|
|
34
|
+
- lib/blat/formats.rb
|
|
35
|
+
- lib/blat/batch.rb
|
|
36
|
+
- ./lib/blat.rb
|
|
37
|
+
homepage: http://stephenwattam.com/projects/blat
|
|
38
|
+
licenses:
|
|
39
|
+
- Beerware
|
|
40
|
+
metadata: {}
|
|
41
|
+
post_install_message: Thanks for installing Blat!
|
|
42
|
+
rdoc_options: []
|
|
43
|
+
require_paths:
|
|
44
|
+
- lib
|
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - '>='
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: '1.9'
|
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - '>'
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: 1.3.1
|
|
55
|
+
requirements: []
|
|
56
|
+
rubyforge_project:
|
|
57
|
+
rubygems_version: 2.0.0
|
|
58
|
+
signing_key:
|
|
59
|
+
specification_version: 4
|
|
60
|
+
summary: Aggressive parallel web request library
|
|
61
|
+
test_files: []
|