blat 0.1.0a → 0.1.0b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/blat.rb +3 -8
- data/lib/blat/batch.rb +25 -102
- data/lib/blat/queue.rb +152 -0
- metadata +5 -5
- data/lib/blat/formats.rb +0 -118
- data/lib/blat/pool.rb +0 -432
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a18d2f9b96d5a2d9b6e7e8ae085e1bfed1855d32
|
4
|
+
data.tar.gz: df5cc403ee46326005387f73432d73edb0fb69bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f324fa27a4c93dd576a2663b87a047c3f0c74cf2387ad8fdf362eb4cf884b156a09232421101a7ca6cd947c03ad3d7fe5870ff6a1d7a95897a107fd31ca66fa7
|
7
|
+
data.tar.gz: cf89d6d6a43935fbd03dd811a770ff53ba091318cb89ee88037b9017efaed91e178dd1dcd8387624383b0d476b35f9bf9f5dc22d8fe3046876b7e07362d4c51b
|
data/lib/blat.rb
CHANGED
@@ -1,16 +1,11 @@
|
|
1
1
|
|
2
|
-
require 'blat/pool'
|
3
2
|
require 'blat/batch'
|
4
|
-
require 'blat/
|
3
|
+
require 'blat/queue'
|
5
4
|
|
6
|
-
# Blat is a
|
5
|
+
# Blat is a simple wrapper for cURL::Multi designed to download data as
|
7
6
|
# aggressively as possible.
|
8
|
-
#
|
9
|
-
# Blat makes use of many threads at once in a producer-consumer pattern, and
|
10
|
-
# accepts tasks in the form of Blat::Jobs, which contain configuration and
|
11
|
-
# results from each request.
|
12
7
|
module Blat
|
13
8
|
|
14
|
-
VERSION = '0.1.
|
9
|
+
VERSION = '0.1.0b'
|
15
10
|
|
16
11
|
end
|
data/lib/blat/batch.rb
CHANGED
@@ -1,121 +1,44 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'curl'
|
4
4
|
|
5
5
|
module Blat
|
6
6
|
|
7
|
-
|
8
|
-
# blocking manner. The idea of this is that you can put a list of URLs in,
|
9
|
-
# run #run and then retrieve the results easily and quickly.
|
10
|
-
#
|
11
|
-
# example:
|
12
|
-
#
|
13
|
-
# urls = File.read('url.list').lines
|
14
|
-
# b = Blat::Batch.new( list )
|
15
|
-
# b.run(10)
|
16
|
-
# puts "Results: #{b.results}"
|
17
|
-
#
|
18
|
-
class Batch
|
7
|
+
module Blat::Batch
|
19
8
|
|
20
|
-
#
|
21
|
-
#
|
9
|
+
# Blat::Batch::run takes a list of links and downloads them all before
|
10
|
+
# returning. It is a very simple interface to Curl::Multi for smallish
|
11
|
+
# tasks.
|
22
12
|
#
|
23
|
-
# [
|
24
|
-
# [
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
# Config for each object
|
29
|
-
@config = config
|
30
|
-
|
31
|
-
# URLS in as a string
|
32
|
-
@urls = urls
|
33
|
-
@urls_mx = Mutex.new
|
34
|
-
|
35
|
-
# Stores results as Job objects
|
36
|
-
@results = []
|
37
|
-
@results_mx = Mutex.new
|
38
|
-
|
39
|
-
# Keep this to see if we have finished
|
40
|
-
@url_count = urls.length
|
41
|
-
end
|
42
|
-
|
43
|
-
# Run a batch with a given number of workers.
|
13
|
+
# [max_connections] Defines how many parallel connections to use
|
14
|
+
# [links] Is the list of strings or Curl::Easy objects to download. The list object must support #map and #each
|
15
|
+
# [pipeline] Indicates if Curl::Multi should pipeline its HTTP requests
|
16
|
+
# [&block] If given, this block is called to configure each Curl::Easy object prior to it being pushed into the queue.
|
44
17
|
#
|
45
|
-
|
46
|
-
|
47
|
-
#
|
48
|
-
# batch.run(10){ |c|
|
49
|
-
# c.follow_location = true
|
50
|
-
# }
|
51
|
-
#
|
52
|
-
def run(workers, &block)
|
53
|
-
|
54
|
-
# Figure out if people have overestimated the workers needed
|
55
|
-
workers = [workers, @urls.length].min
|
56
|
-
|
57
|
-
# Construct a pool
|
58
|
-
x = Blat::Pool.new(workers) do |job|
|
59
|
-
@results_mx.synchronize { @results << job }
|
60
|
-
end
|
18
|
+
def self.run(max_connections, links, pipeline = true, &block)
|
19
|
+
multi = Curl::Multi.new
|
61
20
|
|
62
|
-
# Set
|
63
|
-
|
21
|
+
# Set options
|
22
|
+
multi.max_connects = max_connections.to_i
|
23
|
+
multi.pipeline = (pipeline == true)
|
64
24
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
if url
|
70
|
-
Blat::Job.new(@config) do |c|
|
71
|
-
|
72
|
-
# Configure with block if appropriate
|
73
|
-
yield(c) if block_given?
|
74
|
-
|
75
|
-
c.url= url
|
76
|
-
end
|
77
|
-
else
|
78
|
-
# If not, return nil to set the worker to idle
|
79
|
-
nil
|
80
|
-
end
|
25
|
+
curls = links.map do |l|
|
26
|
+
c = l
|
27
|
+
c = Curl::Easy.new(l) unless l.is_a?(Curl::Easy)
|
28
|
+
c
|
81
29
|
end
|
82
30
|
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
x.close
|
88
|
-
end
|
89
|
-
|
90
|
-
# Is the batch complete?
|
91
|
-
def complete?
|
92
|
-
@results_mx.synchronize do
|
93
|
-
@results.length == @url_count
|
31
|
+
# Pump links in
|
32
|
+
curls.each do |c|
|
33
|
+
yield(c) if block_given?
|
34
|
+
multi.add(c)
|
94
35
|
end
|
95
|
-
end
|
96
36
|
|
97
|
-
|
98
|
-
|
99
|
-
# remaining (yet to do)
|
100
|
-
# complete (completed)
|
101
|
-
# in_progress (currently running)
|
102
|
-
# total (remaining + complete + in progress)
|
103
|
-
def progress
|
104
|
-
remaining = @urls_mx.synchronize { @urls.length }
|
105
|
-
complete = @results_mx.synchronize { @results.length }
|
106
|
-
return remaining,
|
107
|
-
complete,
|
108
|
-
(@url_count - complete - remaining),
|
109
|
-
@url_count
|
110
|
-
end
|
37
|
+
# Wait
|
38
|
+
multi.perform
|
111
39
|
|
112
|
-
|
113
|
-
def results
|
114
|
-
@results_mx.synchronize do
|
115
|
-
return @results
|
116
|
-
end
|
40
|
+
return curls
|
117
41
|
end
|
118
|
-
|
119
42
|
end
|
120
43
|
|
121
44
|
end
|
data/lib/blat/queue.rb
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
|
2
|
+
require 'curl'
|
3
|
+
|
4
|
+
module Blat
|
5
|
+
|
6
|
+
# The Blat::Queue class represents a download queue that handles requests
|
7
|
+
# using Curl::Multi. It, and its descendants, accept a large number of
|
8
|
+
# Curl::Easy objects and download them in parallel.
|
9
|
+
#
|
10
|
+
# In order to know when each request has completed, use
|
11
|
+
# Curl::Easy::on_complete. This is made simpler by Queue#add, which will
|
12
|
+
# yield to a block on completion of each download.
|
13
|
+
#
|
14
|
+
class Queue
|
15
|
+
|
16
|
+
attr_reader :max_connections, :pipeline
|
17
|
+
|
18
|
+
# Create a new Blat::Queue with a given number of maximum connections.
|
19
|
+
#
|
20
|
+
# The 'pipeline' option controls Curl::Multi's pipelining feature, which
|
21
|
+
# tries to use the same http connection for many requests to the same server.
|
22
|
+
def initialize(max_connections, pipeline = true)
|
23
|
+
@multi = Curl::Multi.new
|
24
|
+
|
25
|
+
# Set properties
|
26
|
+
@max_connects = max_connections.to_i
|
27
|
+
@pipeline = (pipeline == true)
|
28
|
+
@multi.max_connects = @max_connects
|
29
|
+
@multi.pipeline = @pipeline
|
30
|
+
end
|
31
|
+
|
32
|
+
# Add a URL or a Curl::Easy object to the queue.
|
33
|
+
#
|
34
|
+
# Optionally, provide a callback for calling when requests are complete,
|
35
|
+
# e.g.:
|
36
|
+
#
|
37
|
+
# q.add('http://google.com') do |c|
|
38
|
+
# puts "Complete request: #{c}"
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
def add(curl_or_link, &block)
|
42
|
+
# Convert to curl if necessary
|
43
|
+
curl = curl_or_link.is_a?(Curl::Easy) ? curl_or_link : Curl::Easy.new(curl_or_link)
|
44
|
+
curl.on_complete { |c| block.yield(c) } if block_given?
|
45
|
+
|
46
|
+
# Add
|
47
|
+
@multi.add(curl)
|
48
|
+
|
49
|
+
# Return
|
50
|
+
return curl
|
51
|
+
end
|
52
|
+
|
53
|
+
# Returns the number of active requests
|
54
|
+
def request_count
|
55
|
+
requests.length
|
56
|
+
end
|
57
|
+
|
58
|
+
# Returns a list of active requests
|
59
|
+
def requests
|
60
|
+
@multi.requests
|
61
|
+
end
|
62
|
+
|
63
|
+
# Remove a request from the queue.
|
64
|
+
#
|
65
|
+
# This needn't be called if a request has completed.
|
66
|
+
def remove(curl)
|
67
|
+
@multi.remove(curl)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Wait for all requests to finish (blocking).
|
71
|
+
#
|
72
|
+
# If a block is given it is executed repeatedly whilst waiting.
|
73
|
+
def wait(&block)
|
74
|
+
@multi.perform do
|
75
|
+
yield if block_given?
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
alias_method :perform, :wait
|
80
|
+
|
81
|
+
# Is the queue idle?
|
82
|
+
def idle?
|
83
|
+
@multi.idle?
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
|
88
|
+
# Similar to a queue, except that it explicitly calls a block in order to
|
89
|
+
# acquire new URLs.
|
90
|
+
#
|
91
|
+
# This makes it suitable for use in producer/consumer patterns.
|
92
|
+
class ConsumingQueue < Queue
|
93
|
+
|
94
|
+
# Executes the given block in order to keep the curl pool working at its
|
95
|
+
# maximum capacity.
|
96
|
+
#
|
97
|
+
# #consume blocks (does not return) for as long as links are being downloaded, as it relies on
|
98
|
+
# Curl::Multi#perform
|
99
|
+
#
|
100
|
+
# Note that blocks providing links must also perform their own
|
101
|
+
# configuration, e.g.:
|
102
|
+
#
|
103
|
+
# q.consume do
|
104
|
+
# url = get_url
|
105
|
+
# if(url)
|
106
|
+
# c = Curl::Easy.new(url)
|
107
|
+
# c.follow_location = true
|
108
|
+
# c.on_complete{ |c| puts "Retrieved: #{c.body_str}" }
|
109
|
+
# c
|
110
|
+
# else
|
111
|
+
# nil
|
112
|
+
# end
|
113
|
+
# end
|
114
|
+
#
|
115
|
+
def consume(connections = @max_connects, &block)
|
116
|
+
@multi.perform do
|
117
|
+
while request_count < connections && new_link = yield
|
118
|
+
add(new_link) if new_link
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
end
|
124
|
+
|
125
|
+
# The ListConsumingQueue is similar to the ConsumingQueue except that
|
126
|
+
# it takes its argument in the form of an Enumerable object.
|
127
|
+
class ListConsumingQueue < ConsumingQueue
|
128
|
+
|
129
|
+
# Download all of the URLs or Curl::Easy objects in the given list, and
|
130
|
+
# optionally execute the given block on completion of each request.
|
131
|
+
def consume(list, connections = @max_connects)
|
132
|
+
item = 0 # Start at item 0
|
133
|
+
list = list.to_a # Ensure we can address with []
|
134
|
+
|
135
|
+
@multi.perform do
|
136
|
+
while request_count < connections && new_link = list[item]
|
137
|
+
|
138
|
+
item += 1
|
139
|
+
|
140
|
+
# Add with a completion callback if a block was given
|
141
|
+
if block_given?
|
142
|
+
add(new_link) { |req| yield(req) }
|
143
|
+
else
|
144
|
+
add(new_link)
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.0b
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stephen Wattam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: curb
|
@@ -24,15 +24,15 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0.8'
|
27
|
-
description:
|
27
|
+
description: Curl::Multi wrapper for high-performance and/or long-running download
|
28
|
+
tasks
|
28
29
|
email: stephenwattam@gmail.com
|
29
30
|
executables: []
|
30
31
|
extensions: []
|
31
32
|
extra_rdoc_files: []
|
32
33
|
files:
|
33
|
-
- lib/blat/pool.rb
|
34
|
-
- lib/blat/formats.rb
|
35
34
|
- lib/blat/batch.rb
|
35
|
+
- lib/blat/queue.rb
|
36
36
|
- ./lib/blat.rb
|
37
37
|
homepage: http://stephenwattam.com/projects/blat
|
38
38
|
licenses:
|
data/lib/blat/formats.rb
DELETED
@@ -1,118 +0,0 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module Blat
|
4
|
-
|
5
|
-
# Blat::Job represents a single download task, both as a request and response.
|
6
|
-
#
|
7
|
-
# Jobs are provided to workers in a pool by a dispatcher block. Each job
|
8
|
-
# contains:
|
9
|
-
#
|
10
|
-
# * Configuration for the worker. Current configuration supported is
|
11
|
-
# detailed below and in the Pool documentation
|
12
|
-
# * A way of configuring a curl request (in order to set the url and other
|
13
|
-
# parameters)
|
14
|
-
# * Data returned by the download. This is stored as a hash in the #data
|
15
|
-
# parameter.
|
16
|
-
#
|
17
|
-
# == Worker Configuration
|
18
|
-
#
|
19
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
20
|
-
# the worker from the Job class, and contains options that affect the process
|
21
|
-
# of downloading. This is in addition to configuration on the curl object
|
22
|
-
# performed through Blat::Job.configure()
|
23
|
-
#
|
24
|
-
# Workers currently support the following configuration options:
|
25
|
-
#
|
26
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
27
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
28
|
-
# be set to true.
|
29
|
-
#
|
30
|
-
# == Returned Values
|
31
|
-
#
|
32
|
-
# When a job has been finalised, its #data property will be set to a hash
|
33
|
-
# left by the worker. This is currently specified as:
|
34
|
-
#
|
35
|
-
# [:head] The head string returned from the server (response.header_str)
|
36
|
-
# [:body] The body string returned from the server (response.body)
|
37
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
38
|
-
# worker configuration, this contains things such as the number of bytes
|
39
|
-
# downloaded and duration of the request.
|
40
|
-
# [:response] The raw response from curl
|
41
|
-
# [:error] Any errors encountered during download, such as network errors.
|
42
|
-
# If this is nil the request was successful.
|
43
|
-
#
|
44
|
-
# Response properties are currently set to:
|
45
|
-
#
|
46
|
-
# response_properties = {
|
47
|
-
# round_trip_time: res.total_time,
|
48
|
-
# redirect_time: res.redirect_time,
|
49
|
-
# dns_lookup_time: res.name_lookup_time,
|
50
|
-
# effective_uri: res.last_effective_url,
|
51
|
-
# code: res.response_code,
|
52
|
-
# download_speed: res.download_speed,
|
53
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
54
|
-
# truncated: ignore == true
|
55
|
-
# }
|
56
|
-
#
|
57
|
-
class Job
|
58
|
-
|
59
|
-
attr_reader :config, :data
|
60
|
-
|
61
|
-
# Construct a new Job with a block for configuring curl options.
|
62
|
-
def initialize(config = {}, &block)
|
63
|
-
raise 'No curl configuration block given' unless block_given?
|
64
|
-
|
65
|
-
@curl_config_block = block
|
66
|
-
@config = config
|
67
|
-
@finalised = false
|
68
|
-
end
|
69
|
-
|
70
|
-
# Configure a curl object to make the request
|
71
|
-
def configure(curl)
|
72
|
-
@curl_config_block.yield(curl)
|
73
|
-
end
|
74
|
-
|
75
|
-
# Has this job been completed?
|
76
|
-
def finalised?
|
77
|
-
@finalise
|
78
|
-
end
|
79
|
-
|
80
|
-
# Allow people to use closed? instead.
|
81
|
-
alias :closed? :finalised?
|
82
|
-
|
83
|
-
# Write result and prevent further editing
|
84
|
-
def finalise!(data = {})
|
85
|
-
raise 'Job is already finalised.' if finalised?
|
86
|
-
@data = data
|
87
|
-
@finalised = true
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
# --------------------------------------------------------------------------
|
92
|
-
|
93
|
-
|
94
|
-
# SimpleJob is a quick and easy way of wrapping a URL to create a job.
|
95
|
-
#
|
96
|
-
# It accepts:
|
97
|
-
#
|
98
|
-
# [:url] The URL to download
|
99
|
-
# [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
|
100
|
-
# [:config] The worker configuration properties.
|
101
|
-
class SimpleJob < Job
|
102
|
-
def initialize(url, curl_config = {}, config = {})
|
103
|
-
curl_config.merge!({url: url})
|
104
|
-
|
105
|
-
super(config){ |c|
|
106
|
-
curl_config.each do |k,v|
|
107
|
-
if v.is_a?(Array)
|
108
|
-
curl.send(k.to_s + '=', *v)
|
109
|
-
else
|
110
|
-
curl.send(k.to_s + '=', v)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
}
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
|
118
|
-
end
|
data/lib/blat/pool.rb
DELETED
@@ -1,432 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'thread'
|
3
|
-
require 'curl'
|
4
|
-
|
5
|
-
require 'blat/formats'
|
6
|
-
|
7
|
-
module Blat
|
8
|
-
|
9
|
-
# The Blat::Pool class controls a number of workers as they go about running
|
10
|
-
# curl Jobs. This is the main class of Blat, and is the most flexible way of
|
11
|
-
# using the gem (Batch is simpler but less full-featured).
|
12
|
-
#
|
13
|
-
# == Workflow
|
14
|
-
#
|
15
|
-
# The pool is created with a size and a callback to present results to.
|
16
|
-
# This callback may be presented as a proc object or as a block, and is
|
17
|
-
# called with a finalised Blat::Job object upon completion of each request.
|
18
|
-
#
|
19
|
-
# x = Blat::Pool.new(100){ |job|
|
20
|
-
# puts "#{job.data[:body]}"
|
21
|
-
# }
|
22
|
-
#
|
23
|
-
# Once a pool is configured, it may be commanded to start downloading by
|
24
|
-
# presenting it with a dispatcher. This is a procedure that returns either a
|
25
|
-
# Blat::Job object or nil---workers will call this block in order to acquire
|
26
|
-
# work, and will enter an idle state when nil is returned.
|
27
|
-
#
|
28
|
-
# job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
|
29
|
-
#
|
30
|
-
# x.work{
|
31
|
-
# job_list.pop
|
32
|
-
# }
|
33
|
-
#
|
34
|
-
# Downloading can be waited upon any number of ways. The status of the pool
|
35
|
-
# may be requested with #count_idle and #all_idle? , and it's possible to
|
36
|
-
# wait until idle using #wait_until_idle :
|
37
|
-
#
|
38
|
-
# x.wait_until_idle
|
39
|
-
# x.close
|
40
|
-
#
|
41
|
-
# == Worker Configuration
|
42
|
-
#
|
43
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
44
|
-
# the worker from the Job class, and contains options that affect the process
|
45
|
-
# of downloading. This is in addition to configuration on the curl object
|
46
|
-
# performed through Blat::Job.configure()
|
47
|
-
#
|
48
|
-
# Workers currently support the following configuration options:
|
49
|
-
#
|
50
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
51
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
52
|
-
# be set to true.
|
53
|
-
#
|
54
|
-
# == Returned Values
|
55
|
-
#
|
56
|
-
# When a job has been finalised, its #data property will be set to a hash
|
57
|
-
# left by the worker. This is currently specified as:
|
58
|
-
#
|
59
|
-
# [:head] The head string returned from the server (response.header_str)
|
60
|
-
# [:body] The body string returned from the server (response.body)
|
61
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
62
|
-
# worker configuration, this contains things such as the number of bytes
|
63
|
-
# downloaded and duration of the request.
|
64
|
-
# [:response] The raw response from curl
|
65
|
-
# [:error] Any errors encountered during download, such as network errors.
|
66
|
-
# If this is nil the request was successful.
|
67
|
-
#
|
68
|
-
# Response properties are currently set to:
|
69
|
-
#
|
70
|
-
# response_properties = {
|
71
|
-
# round_trip_time: res.total_time,
|
72
|
-
# redirect_time: res.redirect_time,
|
73
|
-
# dns_lookup_time: res.name_lookup_time,
|
74
|
-
# effective_uri: res.last_effective_url,
|
75
|
-
# code: res.response_code,
|
76
|
-
# download_speed: res.download_speed,
|
77
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
78
|
-
# truncated: ignore == true
|
79
|
-
# }
|
80
|
-
#
|
81
|
-
class Pool
|
82
|
-
|
83
|
-
# Construct a new pool with a given size and a callback used to output
|
84
|
-
# data.
|
85
|
-
#
|
86
|
-
# x = Blat::Pool.new(100){ |job|
|
87
|
-
# puts "Job complete: #{job}"
|
88
|
-
# }
|
89
|
-
#
|
90
|
-
def initialize(size, finalise_callback = nil, &block)
|
91
|
-
|
92
|
-
@m = Mutex.new # Data mutex for "producer" status
|
93
|
-
@t = {} # threads
|
94
|
-
@w = [] # workers
|
95
|
-
@idle = []
|
96
|
-
@idle_mutex = Mutex.new
|
97
|
-
@size = size.to_i # number of simultaneous workers
|
98
|
-
|
99
|
-
# Pass a block for handling returns
|
100
|
-
if block
|
101
|
-
@finalise_callback = block
|
102
|
-
elsif finalise_callback && finalise_callback.is_a?(Proc)
|
103
|
-
@finalise_callback = finalise_callback
|
104
|
-
else
|
105
|
-
raise 'No callback given for final data'
|
106
|
-
end
|
107
|
-
|
108
|
-
end
|
109
|
-
|
110
|
-
# ------------------------------------------------------------------------
|
111
|
-
# Workers call these to report status
|
112
|
-
#
|
113
|
-
|
114
|
-
# Workers can register as active by calling this
|
115
|
-
def worker_active(worker_id)
|
116
|
-
@idle_mutex.synchronize{ @idle[worker_id] = false }
|
117
|
-
end
|
118
|
-
|
119
|
-
# Workers can register as idle by calling this
|
120
|
-
def worker_idle(worker_id)
|
121
|
-
@idle_mutex.synchronize{ @idle[worker_id] = true }
|
122
|
-
end
|
123
|
-
|
124
|
-
# Workers can register that they have completed
|
125
|
-
# a job by calling this.
|
126
|
-
def work_complete(job)
|
127
|
-
@finalise_callback.call(job)
|
128
|
-
end
|
129
|
-
|
130
|
-
# ------------------------------------------------------------------------
|
131
|
-
# Worker status
|
132
|
-
#
|
133
|
-
|
134
|
-
# check to see if all workers are idle
|
135
|
-
def all_idle?
|
136
|
-
@idle_mutex.synchronize{ @idle.inject(true) { |m, o| m && o} }
|
137
|
-
end
|
138
|
-
|
139
|
-
# Return the number of idle workers
|
140
|
-
def count_idle
|
141
|
-
@idle_mutex.synchronize{ @idle.count(true) }
|
142
|
-
end
|
143
|
-
|
144
|
-
# ------------------------------------------------------------------------
|
145
|
-
# Set work and initialise workers
|
146
|
-
#
|
147
|
-
|
148
|
-
# Create workers without running them.
|
149
|
-
#
|
150
|
-
# This is usually not very useful to call on its own, and is called by
|
151
|
-
# #work when creating threads.
|
152
|
-
def init_workers
|
153
|
-
#$log.debug "Maintaining #{@size} worker object[s] (#{@w.length} currently active)."
|
154
|
-
@w = []
|
155
|
-
(@size - @w.length).times do |s|
|
156
|
-
@w << Worker.new(s, self)
|
157
|
-
@idle[s] = true
|
158
|
-
end
|
159
|
-
#$log.info "#{@w.length} worker[s] created."
|
160
|
-
end
|
161
|
-
|
162
|
-
# Run a worker over every point competitively.
|
163
|
-
# Will create @size workers if they do not already exist (there is no need
|
164
|
-
# to also call init_workers)
|
165
|
-
def work(dispatcher = nil, &block)
|
166
|
-
|
167
|
-
raise "No dispatcher provided" unless block_given? || (dispatcher && dispatcher.is_?(Proc))
|
168
|
-
|
169
|
-
init_workers
|
170
|
-
|
171
|
-
# Make things do the work
|
172
|
-
#$log.debug "Starting threads..."
|
173
|
-
@start_time = Time.now
|
174
|
-
@w.each do |w|
|
175
|
-
# Give each worker a handle back to the dispatcher to get data.
|
176
|
-
@t[w] = Thread.new(dispatcher || block) do |d|
|
177
|
-
begin
|
178
|
-
w.work(d)
|
179
|
-
rescue SignalException => e
|
180
|
-
#$log.fatal "Signal caught: #{e.message}"
|
181
|
-
#$log.fatal "Since I'm sampling right now, I will kill workers before shutdown."
|
182
|
-
kill_workers
|
183
|
-
raise e
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
187
|
-
# Pass exceptions up
|
188
|
-
@t[w].abort_on_exception = true
|
189
|
-
end
|
190
|
-
#$log.info "#{@t.length} download thread[s] started."
|
191
|
-
end
|
192
|
-
|
193
|
-
# ------------------------------------------------------------------------
|
194
|
-
# Wait on conditions and close the pool
|
195
|
-
#
|
196
|
-
|
197
|
-
# Block until all workers are idle, checking every poll_rate seconds.
|
198
|
-
def wait_until_idle(poll_rate = 0.5)
|
199
|
-
#$log.debug "Waiting until idle, polling every #{poll_rate}s..."
|
200
|
-
sleep(poll_rate)
|
201
|
-
sleep(poll_rate) until all_idle?
|
202
|
-
end
|
203
|
-
|
204
|
-
# Wait for threads to complete.
|
205
|
-
def wait_until_closed
|
206
|
-
#$log.debug "Waiting for #{@t.length} worker[s] to close."
|
207
|
-
@t.each { |w, t| t.join }
|
208
|
-
#$log.info "Workers all terminated naturally."
|
209
|
-
end
|
210
|
-
|
211
|
-
# Tell workers to die forcibly
|
212
|
-
def kill_workers
|
213
|
-
#$log.debug "Forcing #{@t.length} worker threads to die..."
|
214
|
-
@t.each { |t| t.kill }
|
215
|
-
#$log.info "Worker threads killed."
|
216
|
-
end
|
217
|
-
|
218
|
-
# Close all workers' connections to the servers cleanly,
|
219
|
-
#
|
220
|
-
# This is non-blocking. Call #close or #wait to block:
|
221
|
-
#
|
222
|
-
# pool.close_nonblock
|
223
|
-
# pool.wait_until_closed
|
224
|
-
#
|
225
|
-
def close_nonblock
|
226
|
-
#$log.debug "Requesting closure of #{@w.length} worker[s]..."
|
227
|
-
@w.each { |w| w.close }
|
228
|
-
end
|
229
|
-
|
230
|
-
# Cleanly close the pool, waiting for workers to end their
|
231
|
-
# current request. Blocks, unlike #close.
|
232
|
-
def close
|
233
|
-
close_nonblock
|
234
|
-
wait_until_closed
|
235
|
-
end
|
236
|
-
|
237
|
-
private
|
238
|
-
|
239
|
-
# Workers are instantiated and maintained by a Blat::Pool and continually
|
240
|
-
# poll for available work, passing it off for integration with the final
|
241
|
-
# results set.
|
242
|
-
#
|
243
|
-
# Though it is possible to create your own, I would recommend instead using
|
244
|
-
# a pool.
|
245
|
-
#
|
246
|
-
# == Worker Configuration
|
247
|
-
#
|
248
|
-
# Workers are configured by setting values in a hash. This hash is sent to
|
249
|
-
# the worker from the Job class, and contains options that affect the process
|
250
|
-
# of downloading. This is in addition to configuration on the curl object
|
251
|
-
# performed through Blat::Job.configure()
|
252
|
-
#
|
253
|
-
# Workers currently support the following configuration options:
|
254
|
-
#
|
255
|
-
# [:max_body_size] If set, downloads will cease after this many bytes have
|
256
|
-
# been downloaded. If truncated, data[:response_properties][:truncated] will
|
257
|
-
# be set to true.
|
258
|
-
#
|
259
|
-
# == Returned Values
|
260
|
-
#
|
261
|
-
# When a job has been finalised, its #data property will be set to a hash
|
262
|
-
# left by the worker. This is currently specified as:
|
263
|
-
#
|
264
|
-
# [:head] The head string returned from the server (response.header_str)
|
265
|
-
# [:body] The body string returned from the server (response.body)
|
266
|
-
# [:response_properties] A hash with metadata in. Partially specified by the
|
267
|
-
# worker configuration, this contains things such as the number of bytes
|
268
|
-
# downloaded and duration of the request.
|
269
|
-
# [:response] The raw response from curl
|
270
|
-
# [:error] Any errors encountered during download, such as network errors.
|
271
|
-
# If this is nil the request was successful.
|
272
|
-
#
|
273
|
-
# Response properties are currently set to:
|
274
|
-
#
|
275
|
-
# response_properties = {
|
276
|
-
# round_trip_time: res.total_time,
|
277
|
-
# redirect_time: res.redirect_time,
|
278
|
-
# dns_lookup_time: res.name_lookup_time,
|
279
|
-
# effective_uri: res.last_effective_url,
|
280
|
-
# code: res.response_code,
|
281
|
-
# download_speed: res.download_speed,
|
282
|
-
# downloaded_bytes: res.downloaded_bytes || 0,
|
283
|
-
# truncated: ignore == true
|
284
|
-
# }
|
285
|
-
#
|
286
|
-
class Worker
|
287
|
-
|
288
|
-
# Construct a new worker with a given ID and linked to a given pool.
|
289
|
-
#
|
290
|
-
# The pool will be called to report idle/working states.
|
291
|
-
def initialize(id, pool)
|
292
|
-
@id = id
|
293
|
-
@pool = pool
|
294
|
-
@abort = false
|
295
|
-
end
|
296
|
-
|
297
|
-
# Should be run in a thread. Performs work until the dispatcher runs
|
298
|
-
# out of data.
|
299
|
-
def work(dispatcher)
|
300
|
-
# start idle
|
301
|
-
last_idle_state = true
|
302
|
-
|
303
|
-
loop do
|
304
|
-
while (job = dispatcher.call).is_a?(Job) do
|
305
|
-
|
306
|
-
# If we were idle last, tell the pool
|
307
|
-
@pool.worker_active(@id) if last_idle_state == true
|
308
|
-
|
309
|
-
# tell people
|
310
|
-
#$log.debug "W#{@id}: Downloading job #{job}"
|
311
|
-
|
312
|
-
# Make the request
|
313
|
-
complete_request(job, new_curl(job), job.config)
|
314
|
-
|
315
|
-
return if @abort
|
316
|
-
end
|
317
|
-
return if @abort
|
318
|
-
|
319
|
-
# TODO: configurable
|
320
|
-
@pool.worker_idle(@id)
|
321
|
-
last_idle_state = true
|
322
|
-
sleep(1)
|
323
|
-
end
|
324
|
-
|
325
|
-
# rescue StandardError => e
|
326
|
-
#$log.warn "W#{@id}: Error: #{e}"
|
327
|
-
#$log.debug "#{e.backtrace.join("\n")}"
|
328
|
-
end
|
329
|
-
|
330
|
-
# Closes the connection to the server
|
331
|
-
def close
|
332
|
-
@abort = true
|
333
|
-
end
|
334
|
-
|
335
|
-
private
|
336
|
-
|
337
|
-
# Datapoint is complete, run callback
|
338
|
-
def finalise(job, head, body, response_properties, response, error)
|
339
|
-
job.finalise!(
|
340
|
-
head: head,
|
341
|
-
body: body,
|
342
|
-
response_properties: response_properties,
|
343
|
-
response: response,
|
344
|
-
error: error
|
345
|
-
)
|
346
|
-
|
347
|
-
@pool.work_complete(job)
|
348
|
-
end
|
349
|
-
|
350
|
-
# ---------- called by workers below this line
|
351
|
-
|
352
|
-
# Submit a complete dp to the pool
|
353
|
-
def complete_request(job, res, config)
|
354
|
-
|
355
|
-
# Somewhere to store the body in a size-aware way
|
356
|
-
body = ''
|
357
|
-
|
358
|
-
# If limiting body size, use a callback to handle incoming data
|
359
|
-
if config[:max_body_size]
|
360
|
-
ignore = false
|
361
|
-
|
362
|
-
res.on_body do |str|
|
363
|
-
# Read up to the limit of bytes
|
364
|
-
if !ignore && config[:max_body_size] && (body.length + str.length) > config[:max_body_size]
|
365
|
-
body += str[0..(body.length + str.length) - config[:max_body_size]]
|
366
|
-
#$log.warn "W#{@id}: Job #{job} exceeded byte limit (#{config[:max_body_size]}b)"
|
367
|
-
ignore = true
|
368
|
-
elsif not ignore
|
369
|
-
body += str
|
370
|
-
else
|
371
|
-
# ignore data
|
372
|
-
end
|
373
|
-
|
374
|
-
# Have to return number of bytes to curb
|
375
|
-
str.length
|
376
|
-
end
|
377
|
-
end
|
378
|
-
|
379
|
-
# Perform a request prepared elsewhere,
|
380
|
-
# can run alongside other requests
|
381
|
-
res.perform
|
382
|
-
|
383
|
-
# Load body directly from response if not using the system above
|
384
|
-
body = res.body_str unless config[:max_body_size]
|
385
|
-
|
386
|
-
# Load stuff out of response object.
|
387
|
-
response_properties = {
|
388
|
-
round_trip_time: res.total_time,
|
389
|
-
redirect_time: res.redirect_time,
|
390
|
-
dns_lookup_time: res.name_lookup_time,
|
391
|
-
effective_uri: res.last_effective_url,
|
392
|
-
code: res.response_code,
|
393
|
-
download_speed: res.download_speed,
|
394
|
-
downloaded_bytes: res.downloaded_bytes || 0,
|
395
|
-
truncated: ignore == true
|
396
|
-
}
|
397
|
-
|
398
|
-
# write to datapoint list
|
399
|
-
finalise(job, res.header_str, body, response_properties, res, nil)
|
400
|
-
|
401
|
-
rescue SignalException => e
|
402
|
-
raise e
|
403
|
-
rescue StandardError => e
|
404
|
-
# if e.class.to_s =~ /^Curl::Err::/ then
|
405
|
-
# #$log.debug "W#{@id}: Job #{job}: #{e.to_s[11..-1]}"
|
406
|
-
# else
|
407
|
-
# #$log.error "W#{@id}: Exception retrieving #{job}: #{e.to_s}."
|
408
|
-
# #$log.debug "#{e.backtrace.join("\n")}"
|
409
|
-
# end
|
410
|
-
|
411
|
-
# write to datapoint list
|
412
|
-
finalise(job, res.header_str, body, response_properties, res, e)
|
413
|
-
end
|
414
|
-
|
415
|
-
# Returns a new curl object to use downloading things.
|
416
|
-
def new_curl(job)
|
417
|
-
# Set up curl
|
418
|
-
c = Curl::Easy.new
|
419
|
-
|
420
|
-
# Configure the curl object
|
421
|
-
job.configure(c)
|
422
|
-
|
423
|
-
# Return it for work
|
424
|
-
return c
|
425
|
-
end
|
426
|
-
|
427
|
-
end
|
428
|
-
|
429
|
-
end
|
430
|
-
|
431
|
-
end
|
432
|
-
|