blat 0.1.0a → 0.1.0b
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/blat.rb +3 -8
- data/lib/blat/batch.rb +25 -102
- data/lib/blat/queue.rb +152 -0
- metadata +5 -5
- data/lib/blat/formats.rb +0 -118
- data/lib/blat/pool.rb +0 -432
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a18d2f9b96d5a2d9b6e7e8ae085e1bfed1855d32
|
4
|
+
data.tar.gz: df5cc403ee46326005387f73432d73edb0fb69bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f324fa27a4c93dd576a2663b87a047c3f0c74cf2387ad8fdf362eb4cf884b156a09232421101a7ca6cd947c03ad3d7fe5870ff6a1d7a95897a107fd31ca66fa7
|
7
|
+
data.tar.gz: cf89d6d6a43935fbd03dd811a770ff53ba091318cb89ee88037b9017efaed91e178dd1dcd8387624383b0d476b35f9bf9f5dc22d8fe3046876b7e07362d4c51b
|
data/lib/blat.rb
CHANGED
@@ -1,16 +1,11 @@
|
|
1
1
|
|
2
|
-
require 'blat/pool'
|
3
2
|
require 'blat/batch'
|
4
|
-
require 'blat/
|
3
|
+
require 'blat/queue'
|
5
4
|
|
6
|
-
# Blat is a
|
5
|
+
# Blat is a simple wrapper for cURL::Multi designed to download data as
|
7
6
|
# aggressively as possible.
|
8
|
-
#
|
9
|
-
# Blat makes use of many threads at once in a producer-consumer pattern, and
|
10
|
-
# accepts tasks in the form of Blat::Jobs, which contain configuration and
|
11
|
-
# results from each request.
|
12
7
|
module Blat
|
13
8
|
|
14
|
-
VERSION = '0.1.0a'
|
9
|
+
VERSION = '0.1.0b'
|
15
10
|
|
16
11
|
end
|
data/lib/blat/batch.rb
CHANGED
@@ -1,121 +1,44 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'curl'
|
4
4
|
|
5
5
|
module Blat
|
6
6
|
|
7
|
-
|
8
|
-
# blocking manner. The idea of this is that you can put a list of URLs in,
|
9
|
-
# run #run and then retrieve the results easily and quickly.
|
10
|
-
#
|
11
|
-
# example:
|
12
|
-
#
|
13
|
-
# urls = File.read('url.list').lines
|
14
|
-
# b = Blat::Batch.new( list )
|
15
|
-
# b.run(10)
|
16
|
-
# puts "Results: #{b.results}"
|
17
|
-
#
|
18
|
-
class Batch
|
7
|
+
module Blat::Batch
|
19
8
|
|
20
|
-
#
|
21
|
-
#
|
9
|
+
# Blat::Batch::run takes a list of links and downloads them all before
|
10
|
+
# returning. It is a very simple interface to Curl::Multi for smallish
|
11
|
+
# tasks.
|
22
12
|
#
|
23
|
-
# [
|
24
|
-
# [
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
# Config for each object
|
29
|
-
@config = config
|
30
|
-
|
31
|
-
# URLS in as a string
|
32
|
-
@urls = urls
|
33
|
-
@urls_mx = Mutex.new
|
34
|
-
|
35
|
-
# Stores results as Job objects
|
36
|
-
@results = []
|
37
|
-
@results_mx = Mutex.new
|
38
|
-
|
39
|
-
# Keep this to see if we have finished
|
40
|
-
@url_count = urls.length
|
41
|
-
end
|
42
|
-
|
43
|
-
# Run a batch with a given number of workers.
|
13
|
+
# [max_connections] Defines how many parallel connections to use
|
14
|
+
# [links] Is the list of strings or Curl::Easy objects to download. The list object must support #map and #each
|
15
|
+
# [pipeline] Indicates if Curl::Multi should pipeline its HTTP requests
|
16
|
+
# [&block] If given, this block is called to configure each Curl::Easy object prior to it being pushed into the queue.
|
44
17
|
#
|
45
|
-
|
46
|
-
|
47
|
-
#
|
48
|
-
# batch.run(10){ |c|
|
49
|
-
# c.follow_location = true
|
50
|
-
# }
|
51
|
-
#
|
52
|
-
def run(workers, &block)
|
53
|
-
|
54
|
-
# Figure out if people have overestimated the workers needed
|
55
|
-
workers = [workers, @urls.length].min
|
56
|
-
|
57
|
-
# Construct a pool
|
58
|
-
x = Blat::Pool.new(workers) do |job|
|
59
|
-
@results_mx.synchronize { @results << job }
|
60
|
-
end
|
18
|
+
def self.run(max_connections, links, pipeline = true, &block)
|
19
|
+
multi = Curl::Multi.new
|
61
20
|
|
62
|
-
# Set
|
63
|
-
|
21
|
+
# Set options
|
22
|
+
multi.max_connects = max_connections.to_i
|
23
|
+
multi.pipeline = (pipeline == true)
|
64
24
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
if url
|
70
|
-
Blat::Job.new(@config) do |c|
|
71
|
-
|
72
|
-
# Configure with block if appropriate
|
73
|
-
yield(c) if block_given?
|
74
|
-
|
75
|
-
c.url= url
|
76
|
-
end
|
77
|
-
else
|
78
|
-
# If not, return nil to set the worker to idle
|
79
|
-
nil
|
80
|
-
end
|
25
|
+
curls = links.map do |l|
|
26
|
+
c = l
|
27
|
+
c = Curl::Easy.new(l) unless l.is_a?(Curl::Easy)
|
28
|
+
c
|
81
29
|
end
|
82
30
|
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
x.close
|
88
|
-
end
|
89
|
-
|
90
|
-
# Is the batch complete?
|
91
|
-
def complete?
|
92
|
-
@results_mx.synchronize do
|
93
|
-
@results.length == @url_count
|
31
|
+
# Pump links in
|
32
|
+
curls.each do |c|
|
33
|
+
yield(c) if block_given?
|
34
|
+
multi.add(c)
|
94
35
|
end
|
95
|
-
end
|
96
36
|
|
97
|
-
|
98
|
-
|
99
|
-
# remaining (yet to do)
|
100
|
-
# complete (completed)
|
101
|
-
# in_progress (currently running)
|
102
|
-
# total (remaining + complete + in progress)
|
103
|
-
def progress
|
104
|
-
remaining = @urls_mx.synchronize { @urls.length }
|
105
|
-
complete = @results_mx.synchronize { @results.length }
|
106
|
-
return remaining,
|
107
|
-
complete,
|
108
|
-
(@url_count - complete - remaining),
|
109
|
-
@url_count
|
110
|
-
end
|
37
|
+
# Wait
|
38
|
+
multi.perform
|
111
39
|
|
112
|
-
|
113
|
-
def results
|
114
|
-
@results_mx.synchronize do
|
115
|
-
return @results
|
116
|
-
end
|
40
|
+
return curls
|
117
41
|
end
|
118
|
-
|
119
42
|
end
|
120
43
|
|
121
44
|
end
|
data/lib/blat/queue.rb
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
|
2
|
+
require 'curl'
|
3
|
+
|
4
|
+
module Blat
|
5
|
+
|
6
|
+
# The Blat::Queue class represents a download queue that handles requests
|
7
|
+
# using Curl::Multi. It, and its descendants, accept a large number of
|
8
|
+
# Curl::Easy objects and download them in parallel.
|
9
|
+
#
|
10
|
+
# In order to know when each request has completed, use
|
11
|
+
# Curl::Easy::on_complete. This is made simpler by Queue#add, which will
|
12
|
+
# yield to a block on completion of each download.
|
13
|
+
#
|
14
|
+
class Queue
|
15
|
+
|
16
|
+
attr_reader :max_connections, :pipeline
|
17
|
+
|
18
|
+
# Create a new Blat::Queue with a given number of maximum connections.
|
19
|
+
#
|
20
|
+
# The 'pipeline' option controls Curl::Multi's pipelining feature, which
|
21
|
+
# tries to use the same http connection for many requests to the same server.
|
22
|
+
def initialize(max_connections, pipeline = true)
|
23
|
+
@multi = Curl::Multi.new
|
24
|
+
|
25
|
+
# Set properties
|
26
|
+
@max_connects = max_connections.to_i
|
27
|
+
@pipeline = (pipeline == true)
|
28
|
+
@multi.max_connects = @max_connects
|
29
|
+
@multi.pipeline = @pipeline
|
30
|
+
end
|
31
|
+
|
32
|
+
# Add a URL or a Curl::Easy object to the queue.
|
33
|
+
#
|
34
|
+
# Optionally, provide a callback for calling when requests are complete,
|
35
|
+
# e.g.:
|
36
|
+
#
|
37
|
+
# q.add('http://google.com') do |c|
|
38
|
+
# puts "Complete request: #{c}"
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
def add(curl_or_link, &block)
|
42
|
+
# Convert to curl if necessary
|
43
|
+
curl = curl_or_link.is_a?(Curl::Easy) ? curl_or_link : Curl::Easy.new(curl_or_link)
|
44
|
+
curl.on_complete { |c| block.yield(c) } if block_given?
|
45
|
+
|
46
|
+
# Add
|
47
|
+
@multi.add(curl)
|
48
|
+
|
49
|
+
# Return
|
50
|
+
return curl
|
51
|
+
end
|
52
|
+
|
53
|
+
# Returns the number of active requests
|
54
|
+
def request_count
|
55
|
+
requests.length
|
56
|
+
end
|
57
|
+
|
58
|
+
# Returns a list of active requests
|
59
|
+
def requests
|
60
|
+
@multi.requests
|
61
|
+
end
|
62
|
+
|
63
|
+
# Remove a request from the queue.
|
64
|
+
#
|
65
|
+
# This needn't be called if a request has completed.
|
66
|
+
def remove(curl)
|
67
|
+
@multi.remove(curl)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Wait for all requests to finish (blocking).
|
71
|
+
#
|
72
|
+
# If a block is given it is executed repeatedly whilst waiting.
|
73
|
+
def wait(&block)
|
74
|
+
@multi.perform do
|
75
|
+
yield if block_given?
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
alias_method :perform, :wait
|
80
|
+
|
81
|
+
# Is the queue idle?
|
82
|
+
def idle?
|
83
|
+
@multi.idle?
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
|
88
|
+
# Similar to a queue, except that it explicitly calls a block in order to
|
89
|
+
# acquire new URLs.
|
90
|
+
#
|
91
|
+
# This makes it suitable for use in producer/consumer patterns.
|
92
|
+
class ConsumingQueue < Queue
|
93
|
+
|
94
|
+
# Executes the given block in order to keep the curl pool working at its
|
95
|
+
# maximum capacity.
|
96
|
+
#
|
97
|
+
# #consume blocks (i.e. does not return) as long as links are being downloaded, as it relies on
|
98
|
+
# Curl::Multi#perform
|
99
|
+
#
|
100
|
+
# Note that blocks providing links must also perform their own
|
101
|
+
# configuration, e.g.:
|
102
|
+
#
|
103
|
+
# q.consume do
|
104
|
+
# url = get_url
|
105
|
+
# if(url)
|
106
|
+
# c = Curl::Easy.new(url)
|
107
|
+
# c.follow_location = true
|
108
|
+
# c.on_complete{ |c| puts "Retrieved: #{c.body_str}" }
|
109
|
+
# c
|
110
|
+
# else
|
111
|
+
# nil
|
112
|
+
# end
|
113
|
+
# end
|
114
|
+
#
|
115
|
+
def consume(connections = @max_connects, &block)
|
116
|
+
@multi.perform do
|
117
|
+
while request_count < connections && new_link = yield
|
118
|
+
add(new_link) if new_link
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
124
|
+
|
125
|
+
# The ListConsumingQueue is similar to the ConsumingQueue except that
|
126
|
+
# it takes its argument in the form of an Enumerable object.
|
127
|
+
class ListConsumingQueue < ConsumingQueue
|
128
|
+
|
129
|
+
# Download all of the URLs or Curl::Easy objects in the given list, and
|
130
|
+
# optionally execute the given block on completion for each
|
131
|
+
def consume(list, connections = @max_connects)
|
132
|
+
item = 0 # Start at item 0
|
133
|
+
list = list.to_a # Ensure we can address with []
|
134
|
+
|
135
|
+
@multi.perform do
|
136
|
+
while request_count < connections && new_link = list[item]
|
137
|
+
|
138
|
+
item += 1
|
139
|
+
|
140
|
+
# Add, registering the caller's block as a completion callback if one was given
|
141
|
+
if block_given?
|
142
|
+
add(new_link) { |req| yield(req) }
|
143
|
+
else
|
144
|
+
add(new_link)
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0a
|
4
|
+
version: 0.1.0b
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stephen Wattam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: curb
|
@@ -24,15 +24,15 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0.8'
|
27
|
-
description:
|
27
|
+
description: Curl::Multi wrapper for high-performance and/or long-running download
|
28
|
+
tasks
|
28
29
|
email: stephenwattam@gmail.com
|
29
30
|
executables: []
|
30
31
|
extensions: []
|
31
32
|
extra_rdoc_files: []
|
32
33
|
files:
|
33
|
-
- lib/blat/pool.rb
|
34
|
-
- lib/blat/formats.rb
|
35
34
|
- lib/blat/batch.rb
|
35
|
+
- lib/blat/queue.rb
|
36
36
|
- ./lib/blat.rb
|
37
37
|
homepage: http://stephenwattam.com/projects/blat
|
38
38
|
licenses:
|
data/lib/blat/formats.rb
DELETED
@@ -1,118 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module Blat
|
4
|
-
|
5
|
-
# Blat::Job represents a single download task, both as a request and response.
|
6
|
-
#
|
7
|
-
# Jobs are provided to workers in a pool by a dispatcher block. Each job
|
8
|
-
# contains:
|
9
|
-
#
|
10
|
-
# * Configuration for the worker. Current configuration supported is
|
11
|
-
# detailed below and in the Pool documentation
|
12
|
-
# * A way of configuring a curl request (in order to set the url and other
|
13
|
-
# parameters)
|
14
|
-
# * Data returned by the download. This is stored as a hash in the #data
|
15
|
-
# parameter.
|
16
|
-
#
|
17
|
-
# == Worker Configuration
|
18
|
-
#
|
19
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
20
|
-
# the worker from the Job class, and contains options that affect the process
|
21
|
-
# of downloading. This is in addition to configuration on the curl object
|
22
|
-
# performed through Blat::Job.configure()
|
23
|
-
#
|
24
|
-
# Workers currently support the following configuration options:
|
25
|
-
#
|
26
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
27
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
28
|
-
# be set to true.
|
29
|
-
#
|
30
|
-
# == Returned Values
|
31
|
-
#
|
32
|
-
# When a job has been finalised, its #data property will be set to a hash
|
33
|
-
# left by the worker. This is currently specified as:
|
34
|
-
#
|
35
|
-
# [:head] The head string returned from the server (response.header_str)
|
36
|
-
# [:body] The body string returned from the server (response.body)
|
37
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
38
|
-
# worker configuration, this contains things such as the number of bytes
|
39
|
-
# downloaded and duration of the request.
|
40
|
-
# [:response] The raw response from curl
|
41
|
-
# [:error] Any errors encountered during download, such as network errors.
|
42
|
-
# If this is nil the request was successful.
|
43
|
-
#
|
44
|
-
# Response properties are currently set to:
|
45
|
-
#
|
46
|
-
# response_properties = {
|
47
|
-
# round_trip_time: res.total_time,
|
48
|
-
# redirect_time: res.redirect_time,
|
49
|
-
# dns_lookup_time: res.name_lookup_time,
|
50
|
-
# effective_uri: res.last_effective_url,
|
51
|
-
# code: res.response_code,
|
52
|
-
# download_speed: res.download_speed,
|
53
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
54
|
-
# truncated: ignore == true
|
55
|
-
# }
|
56
|
-
#
|
57
|
-
class Job
|
58
|
-
|
59
|
-
attr_reader :config, :data
|
60
|
-
|
61
|
-
# Construct a new Job with a block for configuring curl options.
|
62
|
-
def initialize(config = {}, &block)
|
63
|
-
raise 'No curl configuration block given' unless block_given?
|
64
|
-
|
65
|
-
@curl_config_block = block
|
66
|
-
@config = config
|
67
|
-
@finalised = false
|
68
|
-
end
|
69
|
-
|
70
|
-
# Configure a curl object to make the request
|
71
|
-
def configure(curl)
|
72
|
-
@curl_config_block.yield(curl)
|
73
|
-
end
|
74
|
-
|
75
|
-
# Has this job been completed?
|
76
|
-
def finalised?
|
77
|
-
@finalise
|
78
|
-
end
|
79
|
-
|
80
|
-
# Allow people to use closed? instead.
|
81
|
-
alias :closed? :finalised?
|
82
|
-
|
83
|
-
# Write result and prevent further editing
|
84
|
-
def finalise!(data = {})
|
85
|
-
raise 'Job is already finalised.' if finalised?
|
86
|
-
@data = data
|
87
|
-
@finalised = true
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
# --------------------------------------------------------------------------
|
92
|
-
|
93
|
-
|
94
|
-
# SimpleJob is a quick and easy way of wrapping a URL to create a job.
|
95
|
-
#
|
96
|
-
# It accepts:
|
97
|
-
#
|
98
|
-
# [:url] The URL to download
|
99
|
-
# [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
|
100
|
-
# [:config] The worker configuration properties.
|
101
|
-
class SimpleJob < Job
|
102
|
-
def initialize(url, curl_config = {}, config = {})
|
103
|
-
curl_config.merge!({url: url})
|
104
|
-
|
105
|
-
super(config){ |c|
|
106
|
-
curl_config.each do |k,v|
|
107
|
-
if v.is_a?(Array)
|
108
|
-
curl.send(k.to_s + '=', *v)
|
109
|
-
else
|
110
|
-
curl.send(k.to_s + '=', v)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
}
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
|
118
|
-
end
|
data/lib/blat/pool.rb
DELETED
@@ -1,432 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'thread'
|
3
|
-
require 'curl'
|
4
|
-
|
5
|
-
require 'blat/formats'
|
6
|
-
|
7
|
-
module Blat
|
8
|
-
|
9
|
-
# The Blat::Pool class controls a number of workers as they go about running
|
10
|
-
# curl Jobs. This is the main class of Blat, and is the most flexible way of
|
11
|
-
# using the gem (Batch is simpler but less full-featured).
|
12
|
-
#
|
13
|
-
# == Workflow
|
14
|
-
#
|
15
|
-
# The pool is created with a size and a callback to present results to.
|
16
|
-
# This callback may be presented as a proc object or as a block, and is
|
17
|
-
# called with a finalised Blat::Job object upon completion of each request.
|
18
|
-
#
|
19
|
-
# x = Blat::Pool.new(100){ |job|
|
20
|
-
# puts "#{job.data[:body]}"
|
21
|
-
# }
|
22
|
-
#
|
23
|
-
# Once a pool is configured, it may be commanded to start downloading by
|
24
|
-
# presenting it with a dispatcher. This is a procedure that returns either a
|
25
|
-
# Blat::Job object or nil---workers will call this block in order to acquire
|
26
|
-
# work, and will enter an idle state when nil is returned.
|
27
|
-
#
|
28
|
-
# job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
|
29
|
-
#
|
30
|
-
# x.work{
|
31
|
-
# job_list.pop
|
32
|
-
# }
|
33
|
-
#
|
34
|
-
# Downloading can be waited upon any number of ways. The status of the pool
|
35
|
-
# may be requested with #count_idle and #all_idle? , and it's possible to
|
36
|
-
# wait until idle using #wait_until_idle :
|
37
|
-
#
|
38
|
-
# x.wait_until_idle
|
39
|
-
# x.close
|
40
|
-
#
|
41
|
-
# == Worker Configuration
|
42
|
-
#
|
43
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
44
|
-
# the worker from the Job class, and contains options that affect the process
|
45
|
-
# of downloading. This is in addition to configuration on the curl object
|
46
|
-
# performed through Blat::Job.configure()
|
47
|
-
#
|
48
|
-
# Workers currently support the following configuration options:
|
49
|
-
#
|
50
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
51
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
52
|
-
# be set to true.
|
53
|
-
#
|
54
|
-
# == Returned Values
|
55
|
-
#
|
56
|
-
# When a job has been finalised, its #data property will be set to a hash
|
57
|
-
# left by the worker. This is currently specified as:
|
58
|
-
#
|
59
|
-
# [:head] The head string returned from the server (response.header_str)
|
60
|
-
# [:body] The body string returned from the server (response.body)
|
61
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
62
|
-
# worker configuration, this contains things such as the number of bytes
|
63
|
-
# downloaded and duration of the request.
|
64
|
-
# [:response] The raw response from curl
|
65
|
-
# [:error] Any errors encountered during download, such as network errors.
|
66
|
-
# If this is nil the request was successful.
|
67
|
-
#
|
68
|
-
# Response properties are currently set to:
|
69
|
-
#
|
70
|
-
# response_properties = {
|
71
|
-
# round_trip_time: res.total_time,
|
72
|
-
# redirect_time: res.redirect_time,
|
73
|
-
# dns_lookup_time: res.name_lookup_time,
|
74
|
-
# effective_uri: res.last_effective_url,
|
75
|
-
# code: res.response_code,
|
76
|
-
# download_speed: res.download_speed,
|
77
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
78
|
-
# truncated: ignore == true
|
79
|
-
# }
|
80
|
-
#
|
81
|
-
class Pool
|
82
|
-
|
83
|
-
# Construct a new pool with a given size and a callback used to output
|
84
|
-
# data.
|
85
|
-
#
|
86
|
-
# x = Blat::Pool.new(100){ |job|
|
87
|
-
# puts "Job complete: #{job}"
|
88
|
-
# }
|
89
|
-
#
|
90
|
-
def initialize(size, finalise_callback = nil, &block)
|
91
|
-
|
92
|
-
@m = Mutex.new # Data mutex for "producer" status
|
93
|
-
@t = {} # threads
|
94
|
-
@w = [] # workers
|
95
|
-
@idle = []
|
96
|
-
@idle_mutex = Mutex.new
|
97
|
-
@size = size.to_i # number of simultaneous workers
|
98
|
-
|
99
|
-
# Pass a block for handling returns
|
100
|
-
if block
|
101
|
-
@finalise_callback = block
|
102
|
-
elsif finalise_callback && finalise_callback.is_a?(Proc)
|
103
|
-
@finalise_callback = finalise_callback
|
104
|
-
else
|
105
|
-
raise 'No callback given for final data'
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|
109
|
-
|
110
|
-
# ------------------------------------------------------------------------
|
111
|
-
# Workers call these to report status
|
112
|
-
#
|
113
|
-
|
114
|
-
# Workers can register as active by calling this
|
115
|
-
def worker_active(worker_id)
|
116
|
-
@idle_mutex.synchronize{ @idle[worker_id] = false }
|
117
|
-
end
|
118
|
-
|
119
|
-
# Workers can register as idle by calling this
|
120
|
-
def worker_idle(worker_id)
|
121
|
-
@idle_mutex.synchronize{ @idle[worker_id] = true }
|
122
|
-
end
|
123
|
-
|
124
|
-
# Workers can register that they have completed
|
125
|
-
# a job by calling this.
|
126
|
-
def work_complete(job)
|
127
|
-
@finalise_callback.call(job)
|
128
|
-
end
|
129
|
-
|
130
|
-
# ------------------------------------------------------------------------
|
131
|
-
# Worker status
|
132
|
-
#
|
133
|
-
|
134
|
-
# check to see if all workers are idle
|
135
|
-
def all_idle?
|
136
|
-
@idle_mutex.synchronize{ @idle.inject(true) { |m, o| m && o} }
|
137
|
-
end
|
138
|
-
|
139
|
-
# Return the number of idle workers
|
140
|
-
def count_idle
|
141
|
-
@idle_mutex.synchronize{ @idle.count(true) }
|
142
|
-
end
|
143
|
-
|
144
|
-
# ------------------------------------------------------------------------
|
145
|
-
# Set work and initialise workers
|
146
|
-
#
|
147
|
-
|
148
|
-
# Create workers without running them.
|
149
|
-
#
|
150
|
-
# This is usually not very useful to call on its own, and is called by
|
151
|
-
# #work when creating threads.
|
152
|
-
def init_workers
|
153
|
-
#$log.debug "Maintaining #{@size} worker object[s] (#{@w.length} currently active)."
|
154
|
-
@w = []
|
155
|
-
(@size - @w.length).times do |s|
|
156
|
-
@w << Worker.new(s, self)
|
157
|
-
@idle[s] = true
|
158
|
-
end
|
159
|
-
#$log.info "#{@w.length} worker[s] created."
|
160
|
-
end
|
161
|
-
|
162
|
-
# Run a worker over every point competitively.
|
163
|
-
# Will create @size workers if they do not already exist (there is no need
|
164
|
-
# to also call init_workers)
|
165
|
-
def work(dispatcher = nil, &block)
|
166
|
-
|
167
|
-
raise "No dispatcher provided" unless block_given? || (dispatcher && dispatcher.is_?(Proc))
|
168
|
-
|
169
|
-
init_workers
|
170
|
-
|
171
|
-
# Make things do the work
|
172
|
-
#$log.debug "Starting threads..."
|
173
|
-
@start_time = Time.now
|
174
|
-
@w.each do |w|
|
175
|
-
# Give each worker a handle back to the dispatcher to get data.
|
176
|
-
@t[w] = Thread.new(dispatcher || block) do |d|
|
177
|
-
begin
|
178
|
-
w.work(d)
|
179
|
-
rescue SignalException => e
|
180
|
-
#$log.fatal "Signal caught: #{e.message}"
|
181
|
-
#$log.fatal "Since I'm sampling right now, I will kill workers before shutdown."
|
182
|
-
kill_workers
|
183
|
-
raise e
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
187
|
-
# Pass exceptions up
|
188
|
-
@t[w].abort_on_exception = true
|
189
|
-
end
|
190
|
-
#$log.info "#{@t.length} download thread[s] started."
|
191
|
-
end
|
192
|
-
|
193
|
-
# ------------------------------------------------------------------------
|
194
|
-
# Wait on conditions and close the pool
|
195
|
-
#
|
196
|
-
|
197
|
-
# Block until all workers are idle, checking every poll_rate seconds.
|
198
|
-
def wait_until_idle(poll_rate = 0.5)
|
199
|
-
#$log.debug "Waiting until idle, polling every #{poll_rate}s..."
|
200
|
-
sleep(poll_rate)
|
201
|
-
sleep(poll_rate) until all_idle?
|
202
|
-
end
|
203
|
-
|
204
|
-
# Wait for threads to complete.
|
205
|
-
def wait_until_closed
|
206
|
-
#$log.debug "Waiting for #{@t.length} worker[s] to close."
|
207
|
-
@t.each { |w, t| t.join }
|
208
|
-
#$log.info "Workers all terminated naturally."
|
209
|
-
end
|
210
|
-
|
211
|
-
# Tell workers to die forcibly
|
212
|
-
def kill_workers
|
213
|
-
#$log.debug "Forcing #{@t.length} worker threads to die..."
|
214
|
-
@t.each { |t| t.kill }
|
215
|
-
#$log.info "Worker threads killed."
|
216
|
-
end
|
217
|
-
|
218
|
-
# Close all workers' connections to the servers cleanly,
|
219
|
-
#
|
220
|
-
# This is non-blocking. Call #close or #wait to block:
|
221
|
-
#
|
222
|
-
# pool.close_nonblock
|
223
|
-
# pool.wait_until_closed
|
224
|
-
#
|
225
|
-
def close_nonblock
|
226
|
-
#$log.debug "Requesting closure of #{@w.length} worker[s]..."
|
227
|
-
@w.each { |w| w.close }
|
228
|
-
end
|
229
|
-
|
230
|
-
# Cleanly close the pool, waiting for workers to end their
|
231
|
-
# current request. Blocks, unlike #close.
|
232
|
-
def close
|
233
|
-
close_nonblock
|
234
|
-
wait_until_closed
|
235
|
-
end
|
236
|
-
|
237
|
-
private
|
238
|
-
|
239
|
-
# Workers are instantiated and maintained by a Blat::Pool and continually
|
240
|
-
# poll for available work, passing it off for integration with the final
|
241
|
-
# results set.
|
242
|
-
#
|
243
|
-
# Though it is possible to create your own, I would recommend instead using
|
244
|
-
# a pool.
|
245
|
-
#
|
246
|
-
# == Worker Configuration
|
247
|
-
#
|
248
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
249
|
-
# the worker from the Job class, and contains options that affect the process
|
250
|
-
# of downloading. This is in addition to configuration on the curl object
|
251
|
-
# performed through Blat::Job.configure()
|
252
|
-
#
|
253
|
-
# Workers currently support the following configuration options:
|
254
|
-
#
|
255
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
256
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
257
|
-
# be set to true.
|
258
|
-
#
|
259
|
-
# == Returned Values
|
260
|
-
#
|
261
|
-
# When a job has been finalised, its #data property will be set to a hash
|
262
|
-
# left by the worker. This is currently specified as:
|
263
|
-
#
|
264
|
-
# [:head] The head string returned from the server (response.header_str)
|
265
|
-
# [:body] The body string returned from the server (response.body)
|
266
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
267
|
-
# worker configuration, this contains things such as the number of bytes
|
268
|
-
# downloaded and duration of the request.
|
269
|
-
# [:response] The raw response from curl
|
270
|
-
# [:error] Any errors encountered during download, such as network errors.
|
271
|
-
# If this is nil the request was successful.
|
272
|
-
#
|
273
|
-
# Response properties are currently set to:
|
274
|
-
#
|
275
|
-
# response_properties = {
|
276
|
-
# round_trip_time: res.total_time,
|
277
|
-
# redirect_time: res.redirect_time,
|
278
|
-
# dns_lookup_time: res.name_lookup_time,
|
279
|
-
# effective_uri: res.last_effective_url,
|
280
|
-
# code: res.response_code,
|
281
|
-
# download_speed: res.download_speed,
|
282
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
283
|
-
# truncated: ignore == true
|
284
|
-
# }
|
285
|
-
#
|
286
|
-
class Worker
|
287
|
-
|
288
|
-
# Construct a new worker with a given ID and linked to a given pool.
|
289
|
-
#
|
290
|
-
# The pool will be called to report idle/working states.
|
291
|
-
def initialize(id, pool)
|
292
|
-
@id = id
|
293
|
-
@pool = pool
|
294
|
-
@abort = false
|
295
|
-
end
|
296
|
-
|
297
|
-
# Should be run in a thread. Performs work until the dispatcher runs
|
298
|
-
# out of data.
|
299
|
-
def work(dispatcher)
|
300
|
-
# start idle
|
301
|
-
last_idle_state = true
|
302
|
-
|
303
|
-
loop do
|
304
|
-
while (job = dispatcher.call).is_a?(Job) do
|
305
|
-
|
306
|
-
# If we were idle last, tell the pool
|
307
|
-
@pool.worker_active(@id) if last_idle_state == true
|
308
|
-
|
309
|
-
# tell people
|
310
|
-
#$log.debug "W#{@id}: Downloading job #{job}"
|
311
|
-
|
312
|
-
# Make the request
|
313
|
-
complete_request(job, new_curl(job), job.config)
|
314
|
-
|
315
|
-
return if @abort
|
316
|
-
end
|
317
|
-
return if @abort
|
318
|
-
|
319
|
-
# TODO: configurable
|
320
|
-
@pool.worker_idle(@id)
|
321
|
-
last_idle_state = true
|
322
|
-
sleep(1)
|
323
|
-
end
|
324
|
-
|
325
|
-
# rescue StandardError => e
|
326
|
-
#$log.warn "W#{@id}: Error: #{e}"
|
327
|
-
#$log.debug "#{e.backtrace.join("\n")}"
|
328
|
-
end
|
329
|
-
|
330
|
-
# Closes the connection to the server
|
331
|
-
def close
|
332
|
-
@abort = true
|
333
|
-
end
|
334
|
-
|
335
|
-
private
|
336
|
-
|
337
|
-
# Datapoint is complete, run callback
|
338
|
-
def finalise(job, head, body, response_properties, response, error)
|
339
|
-
job.finalise!(
|
340
|
-
head: head,
|
341
|
-
body: body,
|
342
|
-
response_properties: response_properties,
|
343
|
-
response: response,
|
344
|
-
error: error
|
345
|
-
)
|
346
|
-
|
347
|
-
@pool.work_complete(job)
|
348
|
-
end
|
349
|
-
|
350
|
-
# ---------- called by workers below this line
|
351
|
-
|
352
|
-
# Submit a complete dp to the pool
|
353
|
-
def complete_request(job, res, config)
|
354
|
-
|
355
|
-
# Somewhere to store the body in a size-aware way
|
356
|
-
body = ''
|
357
|
-
|
358
|
-
# If limiting body size, use a callback to handle incoming data
|
359
|
-
if config[:max_body_size]
|
360
|
-
ignore = false
|
361
|
-
|
362
|
-
res.on_body do |str|
|
363
|
-
# Read up to the limit of bytes
|
364
|
-
if !ignore && config[:max_body_size] && (body.length + str.length) > config[:max_body_size]
|
365
|
-
body += str[0..(body.length + str.length) - config[:max_body_size]]
|
366
|
-
#$log.warn "W#{@id}: Job #{job} exceeded byte limit (#{config[:max_body_size]}b)"
|
367
|
-
ignore = true
|
368
|
-
elsif not ignore
|
369
|
-
body += str
|
370
|
-
else
|
371
|
-
# ignore data
|
372
|
-
end
|
373
|
-
|
374
|
-
# Have to return number of bytes to curb
|
375
|
-
str.length
|
376
|
-
end
|
377
|
-
end
|
378
|
-
|
379
|
-
# Perform a request prepared elsewhere,
|
380
|
-
# can run alongside other requests
|
381
|
-
res.perform
|
382
|
-
|
383
|
-
# Load body directly from response if not using the system above
|
384
|
-
body = res.body_str unless config[:max_body_size]
|
385
|
-
|
386
|
-
# Load stuff out of response object.
|
387
|
-
response_properties = {
|
388
|
-
round_trip_time: res.total_time,
|
389
|
-
redirect_time: res.redirect_time,
|
390
|
-
dns_lookup_time: res.name_lookup_time,
|
391
|
-
effective_uri: res.last_effective_url,
|
392
|
-
code: res.response_code,
|
393
|
-
download_speed: res.download_speed,
|
394
|
-
downloaded_bytes: res.downloaded_bytes || 0,
|
395
|
-
truncated: ignore == true
|
396
|
-
}
|
397
|
-
|
398
|
-
# write to datapoint list
|
399
|
-
finalise(job, res.header_str, body, response_properties, res, nil)
|
400
|
-
|
401
|
-
rescue SignalException => e
|
402
|
-
raise e
|
403
|
-
rescue StandardError => e
|
404
|
-
# if e.class.to_s =~ /^Curl::Err::/ then
|
405
|
-
# #$log.debug "W#{@id}: Job #{job}: #{e.to_s[11..-1]}"
|
406
|
-
# else
|
407
|
-
# #$log.error "W#{@id}: Exception retrieving #{job}: #{e.to_s}."
|
408
|
-
# #$log.debug "#{e.backtrace.join("\n")}"
|
409
|
-
# end
|
410
|
-
|
411
|
-
# write to datapoint list
|
412
|
-
finalise(job, res.header_str, body, response_properties, res, e)
|
413
|
-
end
|
414
|
-
|
415
|
-
# Returns a new curl object to use downloading things.
|
416
|
-
def new_curl(job)
|
417
|
-
# Set up curl
|
418
|
-
c = Curl::Easy.new
|
419
|
-
|
420
|
-
# Configure the curl object
|
421
|
-
job.configure(c)
|
422
|
-
|
423
|
-
# Return it for work
|
424
|
-
return c
|
425
|
-
end
|
426
|
-
|
427
|
-
end
|
428
|
-
|
429
|
-
end
|
430
|
-
|
431
|
-
end
|
432
|
-
|