blat 0.1.0a → 0.1.0b

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 878ccb8c27eeeeea95bf2771feb5164f0ea9f33b
4
- data.tar.gz: 6e06378de9ddd8947ceb87ecced8c1418477a0c7
3
+ metadata.gz: a18d2f9b96d5a2d9b6e7e8ae085e1bfed1855d32
4
+ data.tar.gz: df5cc403ee46326005387f73432d73edb0fb69bf
5
5
  SHA512:
6
- metadata.gz: b9a812fa61e9627a5167e6595dcfde2c616349cac35b67428ba4bdebecbd28c524cb9888cabb99f639f0f0edc43ec371b1d8da9a6b776079f4ccd3e5bf844869
7
- data.tar.gz: 5f8d35878a6993d6ace44d9a642e590054fff062d5f631893dd42f996ccb88011d34b3b52f0e457af592e680509cedc550bef3297d12186ce3f026bacf5770a0
6
+ metadata.gz: f324fa27a4c93dd576a2663b87a047c3f0c74cf2387ad8fdf362eb4cf884b156a09232421101a7ca6cd947c03ad3d7fe5870ff6a1d7a95897a107fd31ca66fa7
7
+ data.tar.gz: cf89d6d6a43935fbd03dd811a770ff53ba091318cb89ee88037b9017efaed91e178dd1dcd8387624383b0d476b35f9bf9f5dc22d8fe3046876b7e07362d4c51b
@@ -1,16 +1,11 @@
1
1
 
2
- require 'blat/pool'
3
2
  require 'blat/batch'
4
- require 'blat/formats'
3
+ require 'blat/queue'
5
4
 
6
- # Blat is a hugely parallel wrapper for cURL designed to download data as
5
+ # Blat is a simple wrapper for cURL::Multi designed to download data as
7
6
  # aggressively as possible.
8
- #
9
- # Blat makes use of many threads at once in a producer-consumer pattern, and
10
- # accepts tasks in the form of Blat::Jobs, which contain configuration and
11
- # results from each request.
12
7
  module Blat
13
8
 
14
- VERSION = '0.1.0a'
9
+ VERSION = '0.1.0b'
15
10
 
16
11
  end
@@ -1,121 +1,44 @@
1
1
 
2
2
 
3
- require 'blat/pool'
3
+ require 'curl'
4
4
 
5
5
  module Blat
6
6
 
7
- # The batch downloader is a simpler wrapper around Pool that runs in a
8
- # blocking manner. The idea of this is that you can put a list of URLs in,
9
- # run #run and then retrieve the results easily and quickly.
10
- #
11
- # example:
12
- #
13
- # urls = File.read('url.list').lines
14
- # b = Blat::Batch.new( list )
15
- # b.run(10)
16
- # puts "Results: #{b.results}"
17
- #
18
- class Batch
7
+ module Blat::Batch
19
8
 
20
- # Create a new batch downloader for a given list of URLS, and a given set
21
- # of configuration options.
9
+ # Blat::Batch::run takes a list of links and downloads them all before
10
+ # returning. It is a very simple interface to Curl::Multi for smallish
11
+ # tasks.
22
12
  #
23
- # [:urls] An array of URLs to download.
24
- # [:config] (optional) configuration to pass to the Jobs. See Blat::Job
25
- # for more information.
26
- def initialize(urls, config = {})
27
-
28
- # Config for each object
29
- @config = config
30
-
31
- # URLS in as a string
32
- @urls = urls
33
- @urls_mx = Mutex.new
34
-
35
- # Stores results as Job objects
36
- @results = []
37
- @results_mx = Mutex.new
38
-
39
- # Keep this to see if we have finished
40
- @url_count = urls.length
41
- end
42
-
43
- # Run a batch with a given number of workers.
13
+ # [max_connections] Defines how many parallel connections to use
14
+ # [links] Is the list of strings or Curl::Easy objects to download. The list object must support #map and #each
15
+ # [pipeline] Indicates if Curl::Multi should pipeline its HTTP requests
16
+ # [&block] If given, this block is called to configure each Curl::Easy object prior to it being pushed into the queue.
44
17
  #
45
- # If a block is provided, it is called with the curl object just before
46
- # requests are made. This is to allow setting of various parameters, e.g.:
47
- #
48
- # batch.run(10){ |c|
49
- # c.follow_location = true
50
- # }
51
- #
52
- def run(workers, &block)
53
-
54
- # Figure out if people have overestimated the workers needed
55
- workers = [workers, @urls.length].min
56
-
57
- # Construct a pool
58
- x = Blat::Pool.new(workers) do |job|
59
- @results_mx.synchronize { @results << job }
60
- end
18
+ def self.run(max_connections, links, pipeline = true, &block)
19
+ multi = Curl::Multi.new
61
20
 
62
- # Set work to do
63
- x.work do
21
+ # Set options
22
+ multi.max_connects = max_connections.to_i
23
+ multi.pipeline = (pipeline == true)
64
24
 
65
- # Get the URL from the list
66
- url = @urls_mx.synchronize { @urls.pop }
67
-
68
- # If it's set, configure and return a job
69
- if url
70
- Blat::Job.new(@config) do |c|
71
-
72
- # Configure with block if appropriate
73
- yield(c) if block_given?
74
-
75
- c.url= url
76
- end
77
- else
78
- # If not, return nil to set the worker to idle
79
- nil
80
- end
25
+ curls = links.map do |l|
26
+ c = l
27
+ c = Curl::Easy.new(l) unless l.is_a?(Curl::Easy)
28
+ c
81
29
  end
82
30
 
83
- # Wait until workers are idle
84
- x.wait_until_idle
85
-
86
- # Close them all.
87
- x.close
88
- end
89
-
90
- # Is the batch complete?
91
- def complete?
92
- @results_mx.synchronize do
93
- @results.length == @url_count
31
+ # Pump links in
32
+ curls.each do |c|
33
+ yield(c) if block_given?
34
+ multi.add(c)
94
35
  end
95
- end
96
36
 
97
- # Report progress with three vars
98
- #
99
- # remaining (yet to do)
100
- # complete (completed)
101
- # in_progress (currently running)
102
- # total (remaining + complete + in progress)
103
- def progress
104
- remaining = @urls_mx.synchronize { @urls.length }
105
- complete = @results_mx.synchronize { @results.length }
106
- return remaining,
107
- complete,
108
- (@url_count - complete - remaining),
109
- @url_count
110
- end
37
+ # Wait
38
+ multi.perform
111
39
 
112
- # Get results as a list
113
- def results
114
- @results_mx.synchronize do
115
- return @results
116
- end
40
+ return curls
117
41
  end
118
-
119
42
  end
120
43
 
121
44
  end
@@ -0,0 +1,152 @@
1
+
2
+ require 'curl'
3
+
4
+ module Blat
5
+
6
+ # The Blat::Queue class represents a download queue that handles requests
7
+ # using Curl::Multi. It, and its descendants, accept a large number of
8
+ # Curl::Easy objects and download them in parallel.
9
+ #
10
+ # In order to know when each request has completed, use
11
+ # Curl::Easy#on_complete. This is made simpler by Queue#add, which will
12
+ # yield to a block on completion of each download.
13
+ #
14
+ class Queue
15
+
16
+ attr_reader :max_connections, :pipeline
17
+
18
+ # Create a new Blat::Queue with a given number of maximum connections.
19
+ #
20
+ # The 'pipeline' option controls Curl::Multi's pipelining feature, which
21
+ # tries to use the same http connection for many requests to the same server.
22
+ def initialize(max_connections, pipeline = true)
23
+ @multi = Curl::Multi.new
24
+
25
+ # Set properties
26
+ @max_connects = max_connections.to_i
27
+ @pipeline = (pipeline == true)
28
+ @multi.max_connects = @max_connects
29
+ @multi.pipeline = @pipeline
30
+ end
31
+
32
+ # Add a URL or a Curl::Easy object to the queue.
33
+ #
34
+ # Optionally, provide a callback for calling when requests are complete,
35
+ # e.g.:
36
+ #
37
+ # q.add('http://google.com') do |c|
38
+ # puts "Complete request: #{c}"
39
+ # end
40
+ #
41
+ def add(curl_or_link, &block)
42
+ # Convert to curl if necessary
43
+ curl = curl_or_link.is_a?(Curl::Easy) ? curl_or_link : Curl::Easy.new(curl_or_link)
44
+ curl.on_complete { |c| block.yield(c) } if block_given?
45
+
46
+ # Add
47
+ @multi.add(curl)
48
+
49
+ # Return
50
+ return curl
51
+ end
52
+
53
+ # Returns the number of active requests
54
+ def request_count
55
+ requests.length
56
+ end
57
+
58
+ # Returns a list of active requests
59
+ def requests
60
+ @multi.requests
61
+ end
62
+
63
+ # Remove a request from the queue.
64
+ #
65
+ # This needn't be called if a request has completed.
66
+ def remove(curl)
67
+ @multi.remove(curl)
68
+ end
69
+
70
+ # Wait for all requests to finish (blocking).
71
+ #
72
+ # If a block is given it is executed repeatedly whilst waiting.
73
+ def wait(&block)
74
+ @multi.perform do
75
+ yield if block_given?
76
+ end
77
+ end
78
+
79
+ alias_method :perform, :wait
80
+
81
+ # Is the queue idle?
82
+ def idle?
83
+ @multi.idle?
84
+ end
85
+
86
+ end
87
+
88
+ # Similar to a queue, except that it explicitly calls a block in order to
89
+ # acquire new URLs.
90
+ #
91
+ # This makes it suitable for use in producer/consumer patterns.
92
+ class ConsumingQueue < Queue
93
+
94
+ # Executes the given block in order to keep the curl pool working at its
95
+ # maximum capacity.
96
+ #
97
+ # Calling consume blocks for as long as links are being downloaded, as it relies on
98
+ # Curl::Multi#perform
99
+ #
100
+ # Note that blocks providing links must also perform their own
101
+ # configuration, e.g.:
102
+ #
103
+ # q.consume do
104
+ # url = get_url
105
+ # if(url)
106
+ # c = Curl::Easy.new(url)
107
+ # c.follow_location = true
108
+ # c.on_complete{ |c| puts "Retrieved: #{c.body_str}" }
109
+ # c
110
+ # else
111
+ # nil
112
+ # end
113
+ # end
114
+ #
115
+ def consume(connections = @max_connects, &block)
116
+ @multi.perform do
117
+ while request_count < connections && new_link = yield
118
+ add(new_link) if new_link
119
+ end
120
+ end
121
+ end
122
+
123
+ end
124
+
125
+ # The ListConsumingQueue is similar to the ConsumingQueue except that
126
+ # it takes its argument in the form of an Enumerable object.
127
+ class ListConsumingQueue < ConsumingQueue
128
+
129
+ # Download all of the URLs or Curl::Easy objects in the given list, and
130
+ # optionally execute the given block on completion for each
131
+ def consume(list, connections = @max_connects)
132
+ item = 0 # Start at item 0
133
+ list = list.to_a # Ensure we can address with []
134
+
135
+ @multi.perform do
136
+ while request_count < connections && new_link = list[item]
137
+
138
+ item += 1
139
+
140
+ # Add with a completion callback if a block was given
141
+ if block_given?
142
+ add(new_link) { |req| yield(req) }
143
+ else
144
+ add(new_link)
145
+ end
146
+
147
+ end
148
+ end
149
+ end
150
+ end
151
+
152
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0a
4
+ version: 0.1.0b
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stephen Wattam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-06-02 00:00:00.000000000 Z
11
+ date: 2013-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: curb
@@ -24,15 +24,15 @@ dependencies:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.8'
27
- description: A very parallel cURL wrapper for ongoing download tasks
27
+ description: Curl::Multi wrapper for high-performance and/or long-running download
28
+ tasks
28
29
  email: stephenwattam@gmail.com
29
30
  executables: []
30
31
  extensions: []
31
32
  extra_rdoc_files: []
32
33
  files:
33
- - lib/blat/pool.rb
34
- - lib/blat/formats.rb
35
34
  - lib/blat/batch.rb
35
+ - lib/blat/queue.rb
36
36
  - ./lib/blat.rb
37
37
  homepage: http://stephenwattam.com/projects/blat
38
38
  licenses:
@@ -1,118 +0,0 @@
1
-
2
-
3
- module Blat
4
-
5
- # Blat::Job represents a single download task, both as a request and response.
6
- #
7
- # Jobs are provided to workers in a pool by a dispatcher block. Each job
8
- # contains:
9
- #
10
- # * Configuration for the worker. Current configuration supported is
11
- # detailed below and in the Pool documentation
12
- # * A way of configuring a curl request (in order to set the url and other
13
- # parameters)
14
- # * Data returned by the download. This is stored as a hash in the #data
15
- # parameter.
16
- #
17
- # == Worker Configuration
18
- #
19
- # Workers are configured by setting values in a hash. This hash is sent to
20
- # the worker from the Job class, and contains options that affect the process
21
- # of downloading. This is in addition to configuration on the curl object
22
- # performed through Blat::Job.configure()
23
- #
24
- # Workers currently support the following configuration options:
25
- #
26
- # [:max_body_size] If set, downloads will cease after this many bytes have
27
- # been downloaded. If truncated, data[:response_properties][:truncated] will
28
- # be set to true.
29
- #
30
- # == Returned Values
31
- #
32
- # When a job has been finalised, its #data property will be set to a hash
33
- # left by the worker. This is currently specified as:
34
- #
35
- # [:head] The head string returned from the server (response.header_str)
36
- # [:body] The body string returned from the server (response.body)
37
- # [:response_properties] A hash with metadata in. Partially specified by the
38
- # worker configuration, this contains things such as the number of bytes
39
- # downloaded and duration of the request.
40
- # [:response] The raw response from curl
41
- # [:error] Any errors encountered during download, such as network errors.
42
- # If this is nil the request was successful.
43
- #
44
- # Response properties are currently set to:
45
- #
46
- # response_properties = {
47
- # round_trip_time: res.total_time,
48
- # redirect_time: res.redirect_time,
49
- # dns_lookup_time: res.name_lookup_time,
50
- # effective_uri: res.last_effective_url,
51
- # code: res.response_code,
52
- # download_speed: res.download_speed,
53
- # downloaded_bytes: res.downloaded_bytes || 0,
54
- # truncated: ignore == true
55
- # }
56
- #
57
- class Job
58
-
59
- attr_reader :config, :data
60
-
61
- # Construct a new Job with a block for configuring curl options.
62
- def initialize(config = {}, &block)
63
- raise 'No curl configuration block given' unless block_given?
64
-
65
- @curl_config_block = block
66
- @config = config
67
- @finalised = false
68
- end
69
-
70
- # Configure a curl object to make the request
71
- def configure(curl)
72
- @curl_config_block.yield(curl)
73
- end
74
-
75
- # Has this job been completed?
76
- def finalised?
77
- @finalise
78
- end
79
-
80
- # Allow people to use closed? instead.
81
- alias :closed? :finalised?
82
-
83
- # Write result and prevent further editing
84
- def finalise!(data = {})
85
- raise 'Job is already finalised.' if finalised?
86
- @data = data
87
- @finalised = true
88
- end
89
- end
90
-
91
- # --------------------------------------------------------------------------
92
-
93
-
94
- # SimpleJob is a quick and easy way of wrapping a URL to create a job.
95
- #
96
- # It accepts:
97
- #
98
- # [:url] The URL to download
99
- # [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
100
- # [:config] The worker configuration properties.
101
- class SimpleJob < Job
102
- def initialize(url, curl_config = {}, config = {})
103
- curl_config.merge!({url: url})
104
-
105
- super(config){ |c|
106
- curl_config.each do |k,v|
107
- if v.is_a?(Array)
108
- curl.send(k.to_s + '=', *v)
109
- else
110
- curl.send(k.to_s + '=', v)
111
- end
112
- end
113
- }
114
- end
115
- end
116
-
117
-
118
- end
@@ -1,432 +0,0 @@
1
-
2
- require 'thread'
3
- require 'curl'
4
-
5
- require 'blat/formats'
6
-
7
- module Blat
8
-
9
- # The Blat::Pool class controls a number of workers as they go about running
10
- # curl Jobs. This is the main class of Blat, and is the most flexible way of
11
- # using the gem (Batch is simpler but less full-featured).
12
- #
13
- # == Workflow
14
- #
15
- # The pool is created with a size and a callback to present results to.
16
- # This callback may be presented as a proc object or as a block, and is
17
- # called with a finalised Blat::Job object upon completion of each request.
18
- #
19
- # x = Blat::Pool.new(100){ |job|
20
- # puts "#{job.data[:body]}"
21
- # }
22
- #
23
- # Once a pool is configured, it may be commanded to start downloading by
24
- # presenting it with a dispatcher. This is a procedure that returns either a
25
- # Blat::Job object or nil---workers will call this block in order to acquire
26
- # work, and will enter an idle state when nil is returned.
27
- #
28
- # job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
29
- #
30
- # x.work{
31
- # job_list.pop
32
- # }
33
- #
34
- # Downloading can be waited upon any number of ways. The status of the pool
35
- # may be requested with #count_idle and #all_idle? , and it's possible to
36
- # wait until idle using #wait_until_idle :
37
- #
38
- # x.wait_until_idle
39
- # x.close
40
- #
41
- # == Worker Configuration
42
- #
43
- # Workers are configured by setting values in a hash. This hash is sent to
44
- # the worker from the Job class, and contains options that affect the process
45
- # of downloading. This is in addition to configuration on the curl object
46
- # performed through Blat::Job.configure()
47
- #
48
- # Workers currently support the following configuration options:
49
- #
50
- # [:max_body_size] If set, downloads will cease after this many bytes have
51
- # been downloaded. If truncated, data[:response_properties][:truncated] will
52
- # be set to true.
53
- #
54
- # == Returned Values
55
- #
56
- # When a job has been finalised, its #data property will be set to a hash
57
- # left by the worker. This is currently specified as:
58
- #
59
- # [:head] The head string returned from the server (response.header_str)
60
- # [:body] The body string returned from the server (response.body)
61
- # [:response_properties] A hash with metadata in. Partially specified by the
62
- # worker configuration, this contains things such as the number of bytes
63
- # downloaded and duration of the request.
64
- # [:response] The raw response from curl
65
- # [:error] Any errors encountered during download, such as network errors.
66
- # If this is nil the request was successful.
67
- #
68
- # Response properties are currently set to:
69
- #
70
- # response_properties = {
71
- # round_trip_time: res.total_time,
72
- # redirect_time: res.redirect_time,
73
- # dns_lookup_time: res.name_lookup_time,
74
- # effective_uri: res.last_effective_url,
75
- # code: res.response_code,
76
- # download_speed: res.download_speed,
77
- # downloaded_bytes: res.downloaded_bytes || 0,
78
- # truncated: ignore == true
79
- # }
80
- #
81
- class Pool
82
-
83
- # Construct a new pool with a given size and a callback used to output
84
- # data.
85
- #
86
- # x = Blat::Pool.new(100){ |job|
87
- # puts "Job complete: #{job}"
88
- # }
89
- #
90
- def initialize(size, finalise_callback = nil, &block)
91
-
92
- @m = Mutex.new # Data mutex for "producer" status
93
- @t = {} # threads
94
- @w = [] # workers
95
- @idle = []
96
- @idle_mutex = Mutex.new
97
- @size = size.to_i # number of simultaneous workers
98
-
99
- # Pass a block for handling returns
100
- if block
101
- @finalise_callback = block
102
- elsif finalise_callback && finalise_callback.is_a?(Proc)
103
- @finalise_callback = finalise_callback
104
- else
105
- raise 'No callback given for final data'
106
- end
107
-
108
- end
109
-
110
- # ------------------------------------------------------------------------
111
- # Workers call these to report status
112
- #
113
-
114
- # Workers can register as active by calling this
115
- def worker_active(worker_id)
116
- @idle_mutex.synchronize{ @idle[worker_id] = false }
117
- end
118
-
119
- # Workers can register as idle by calling this
120
- def worker_idle(worker_id)
121
- @idle_mutex.synchronize{ @idle[worker_id] = true }
122
- end
123
-
124
- # Workers can register that they have completed
125
- # a job by calling this.
126
- def work_complete(job)
127
- @finalise_callback.call(job)
128
- end
129
-
130
- # ------------------------------------------------------------------------
131
- # Worker status
132
- #
133
-
134
- # check to see if all workers are idle
135
- def all_idle?
136
- @idle_mutex.synchronize{ @idle.inject(true) { |m, o| m && o} }
137
- end
138
-
139
- # Return the number of idle workers
140
- def count_idle
141
- @idle_mutex.synchronize{ @idle.count(true) }
142
- end
143
-
144
- # ------------------------------------------------------------------------
145
- # Set work and initialise workers
146
- #
147
-
148
- # Create workers without running them.
149
- #
150
- # This is usually not very useful to call on its own, and is called by
151
- # #work when creating threads.
152
- def init_workers
153
- #$log.debug "Maintaining #{@size} worker object[s] (#{@w.length} currently active)."
154
- @w = []
155
- (@size - @w.length).times do |s|
156
- @w << Worker.new(s, self)
157
- @idle[s] = true
158
- end
159
- #$log.info "#{@w.length} worker[s] created."
160
- end
161
-
162
- # Run a worker over every point competitively.
163
- # Will create @size workers if they do not already exist (there is no need
164
- # to also call init_workers)
165
- def work(dispatcher = nil, &block)
166
-
167
- raise "No dispatcher provided" unless block_given? || (dispatcher && dispatcher.is_?(Proc))
168
-
169
- init_workers
170
-
171
- # Make things do the work
172
- #$log.debug "Starting threads..."
173
- @start_time = Time.now
174
- @w.each do |w|
175
- # Give each worker a handle back to the dispatcher to get data.
176
- @t[w] = Thread.new(dispatcher || block) do |d|
177
- begin
178
- w.work(d)
179
- rescue SignalException => e
180
- #$log.fatal "Signal caught: #{e.message}"
181
- #$log.fatal "Since I'm sampling right now, I will kill workers before shutdown."
182
- kill_workers
183
- raise e
184
- end
185
- end
186
-
187
- # Pass exceptions up
188
- @t[w].abort_on_exception = true
189
- end
190
- #$log.info "#{@t.length} download thread[s] started."
191
- end
192
-
193
- # ------------------------------------------------------------------------
194
- # Wait on conditions and close the pool
195
- #
196
-
197
- # Block until all workers are idle, checking every poll_rate seconds.
198
- def wait_until_idle(poll_rate = 0.5)
199
- #$log.debug "Waiting until idle, polling every #{poll_rate}s..."
200
- sleep(poll_rate)
201
- sleep(poll_rate) until all_idle?
202
- end
203
-
204
- # Wait for threads to complete.
205
- def wait_until_closed
206
- #$log.debug "Waiting for #{@t.length} worker[s] to close."
207
- @t.each { |w, t| t.join }
208
- #$log.info "Workers all terminated naturally."
209
- end
210
-
211
- # Tell workers to die forcibly
212
- def kill_workers
213
- #$log.debug "Forcing #{@t.length} worker threads to die..."
214
- @t.each { |t| t.kill }
215
- #$log.info "Worker threads killed."
216
- end
217
-
218
- # Close all workers' connections to the servers cleanly,
219
- #
220
- # This is non-blocking. Call #close or #wait to block:
221
- #
222
- # pool.close_nonblock
223
- # pool.wait_until_closed
224
- #
225
- def close_nonblock
226
- #$log.debug "Requesting closure of #{@w.length} worker[s]..."
227
- @w.each { |w| w.close }
228
- end
229
-
230
- # Cleanly close the pool, waiting for workers to end their
231
- # current request. Blocks, unlike #close.
232
- def close
233
- close_nonblock
234
- wait_until_closed
235
- end
236
-
237
- private
238
-
239
- # Workers are instantiated and maintained by a Blat::Pool and continually
240
- # poll for available work, passing it off for integration with the final
241
- # results set.
242
- #
243
- # Though it is possible to create your own, I would recommend instead using
244
- # a pool.
245
- #
246
- # == Worker Configuration
247
- #
248
- # Workers are configured by setting values in a hash. This hash is sent to
249
- # the worker from the Job class, and contains options that affect the process
250
- # of downloading. This is in addition to configuration on the curl object
251
- # performed through Blat::Job.configure()
252
- #
253
- # Workers currently support the following configuration options:
254
- #
255
- # [:max_body_size] If set, downloads will cease after this many bytes have
256
- # been downloaded. If truncated, data[:response_properties][:truncated] will
257
- # be set to true.
258
- #
259
- # == Returned Values
260
- #
261
- # When a job has been finalised, its #data property will be set to a hash
262
- # left by the worker. This is currently specified as:
263
- #
264
- # [:head] The head string returned from the server (response.header_str)
265
- # [:body] The body string returned from the server (response.body)
266
- # [:response_properties] A hash with metadata in. Partially specified by the
267
- # worker configuration, this contains things such as the number of bytes
268
- # downloaded and duration of the request.
269
- # [:response] The raw response from curl
270
- # [:error] Any errors encountered during download, such as network errors.
271
- # If this is nil the request was successful.
272
- #
273
- # Response properties are currently set to:
274
- #
275
- # response_properties = {
276
- # round_trip_time: res.total_time,
277
- # redirect_time: res.redirect_time,
278
- # dns_lookup_time: res.name_lookup_time,
279
- # effective_uri: res.last_effective_url,
280
- # code: res.response_code,
281
- # download_speed: res.download_speed,
282
- # downloaded_bytes: res.downloaded_bytes || 0,
283
- # truncated: ignore == true
284
- # }
285
- #
286
- class Worker
287
-
288
- # Construct a new worker with a given ID and linked to a given pool.
289
- #
290
- # The pool will be called to report idle/working states.
291
- def initialize(id, pool)
292
- @id = id
293
- @pool = pool
294
- @abort = false
295
- end
296
-
297
- # Should be run in a thread. Performs work until the dispatcher runs
298
- # out of data.
299
- def work(dispatcher)
300
- # start idle
301
- last_idle_state = true
302
-
303
- loop do
304
- while (job = dispatcher.call).is_a?(Job) do
305
-
306
- # If we were idle last, tell the pool
307
- @pool.worker_active(@id) if last_idle_state == true
308
-
309
- # tell people
310
- #$log.debug "W#{@id}: Downloading job #{job}"
311
-
312
- # Make the request
313
- complete_request(job, new_curl(job), job.config)
314
-
315
- return if @abort
316
- end
317
- return if @abort
318
-
319
- # TODO: configurable
320
- @pool.worker_idle(@id)
321
- last_idle_state = true
322
- sleep(1)
323
- end
324
-
325
- # rescue StandardError => e
326
- #$log.warn "W#{@id}: Error: #{e}"
327
- #$log.debug "#{e.backtrace.join("\n")}"
328
- end
329
-
330
- # Closes the connection to the server
331
- def close
332
- @abort = true
333
- end
334
-
335
- private
336
-
337
- # Datapoint is complete, run callback
338
- def finalise(job, head, body, response_properties, response, error)
339
- job.finalise!(
340
- head: head,
341
- body: body,
342
- response_properties: response_properties,
343
- response: response,
344
- error: error
345
- )
346
-
347
- @pool.work_complete(job)
348
- end
349
-
350
- # ---------- called by workers below this line
351
-
352
- # Submit a complete dp to the pool
353
- def complete_request(job, res, config)
354
-
355
- # Somewhere to store the body in a size-aware way
356
- body = ''
357
-
358
- # If limiting body size, use a callback to handle incoming data
359
- if config[:max_body_size]
360
- ignore = false
361
-
362
- res.on_body do |str|
363
- # Read up to the limit of bytes
364
- if !ignore && config[:max_body_size] && (body.length + str.length) > config[:max_body_size]
365
- body += str[0..(body.length + str.length) - config[:max_body_size]]
366
- #$log.warn "W#{@id}: Job #{job} exceeded byte limit (#{config[:max_body_size]}b)"
367
- ignore = true
368
- elsif not ignore
369
- body += str
370
- else
371
- # ignore data
372
- end
373
-
374
- # Have to return number of bytes to curb
375
- str.length
376
- end
377
- end
378
-
379
- # Perform a request prepared elsewhere,
380
- # can run alongside other requests
381
- res.perform
382
-
383
- # Load body directly from response if not using the system above
384
- body = res.body_str unless config[:max_body_size]
385
-
386
- # Load stuff out of response object.
387
- response_properties = {
388
- round_trip_time: res.total_time,
389
- redirect_time: res.redirect_time,
390
- dns_lookup_time: res.name_lookup_time,
391
- effective_uri: res.last_effective_url,
392
- code: res.response_code,
393
- download_speed: res.download_speed,
394
- downloaded_bytes: res.downloaded_bytes || 0,
395
- truncated: ignore == true
396
- }
397
-
398
- # write to datapoint list
399
- finalise(job, res.header_str, body, response_properties, res, nil)
400
-
401
- rescue SignalException => e
402
- raise e
403
- rescue StandardError => e
404
- # if e.class.to_s =~ /^Curl::Err::/ then
405
- # #$log.debug "W#{@id}: Job #{job}: #{e.to_s[11..-1]}"
406
- # else
407
- # #$log.error "W#{@id}: Exception retrieving #{job}: #{e.to_s}."
408
- # #$log.debug "#{e.backtrace.join("\n")}"
409
- # end
410
-
411
- # write to datapoint list
412
- finalise(job, res.header_str, body, response_properties, res, e)
413
- end
414
-
415
- # Returns a new curl object to use downloading things.
416
- def new_curl(job)
417
- # Set up curl
418
- c = Curl::Easy.new
419
-
420
- # Configure the curl object
421
- job.configure(c)
422
-
423
- # Return it for work
424
- return c
425
- end
426
-
427
- end
428
-
429
- end
430
-
431
- end
432
-