blat 0.1.0a → 0.1.0b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 878ccb8c27eeeeea95bf2771feb5164f0ea9f33b
4
- data.tar.gz: 6e06378de9ddd8947ceb87ecced8c1418477a0c7
3
+ metadata.gz: a18d2f9b96d5a2d9b6e7e8ae085e1bfed1855d32
4
+ data.tar.gz: df5cc403ee46326005387f73432d73edb0fb69bf
5
5
  SHA512:
6
- metadata.gz: b9a812fa61e9627a5167e6595dcfde2c616349cac35b67428ba4bdebecbd28c524cb9888cabb99f639f0f0edc43ec371b1d8da9a6b776079f4ccd3e5bf844869
7
- data.tar.gz: 5f8d35878a6993d6ace44d9a642e590054fff062d5f631893dd42f996ccb88011d34b3b52f0e457af592e680509cedc550bef3297d12186ce3f026bacf5770a0
6
+ metadata.gz: f324fa27a4c93dd576a2663b87a047c3f0c74cf2387ad8fdf362eb4cf884b156a09232421101a7ca6cd947c03ad3d7fe5870ff6a1d7a95897a107fd31ca66fa7
7
+ data.tar.gz: cf89d6d6a43935fbd03dd811a770ff53ba091318cb89ee88037b9017efaed91e178dd1dcd8387624383b0d476b35f9bf9f5dc22d8fe3046876b7e07362d4c51b
@@ -1,16 +1,11 @@
1
1
 
2
- require 'blat/pool'
3
2
  require 'blat/batch'
4
- require 'blat/formats'
3
+ require 'blat/queue'
5
4
 
6
- # Blat is a hugely parallel wrapper for cURL designed to download data as
5
+ # Blat is a simple wrapper for cURL::Multi designed to download data as
7
6
  # aggressively as possible.
8
- #
9
- # Blat makes use of many threads at once in a producer-consumer pattern, and
10
- # accepts tasks in the form of Blat::Jobs, which contain configuration and
11
- # results from each request.
12
7
  module Blat
13
8
 
14
- VERSION = '0.1.0a'
9
+ VERSION = '0.1.0b'
15
10
 
16
11
  end
@@ -1,121 +1,44 @@
1
1
 
2
2
 
3
- require 'blat/pool'
3
+ require 'curl'
4
4
 
5
5
  module Blat
6
6
 
7
- # The batch downloader is a simpler wrapper around Pool that runs in a
8
- # blocking manner. The idea of this is that you can put a list of URLs in,
9
- # run #run and then retrieve the results easily and quickly.
10
- #
11
- # example:
12
- #
13
- # urls = File.read('url.list').lines
14
- # b = Blat::Batch.new( list )
15
- # b.run(10)
16
- # puts "Results: #{b.results}"
17
- #
18
- class Batch
7
+ module Blat::Batch
19
8
 
20
- # Create a new batch downloader for a given list of URLS, and a given set
21
- # of configuration options.
9
+ # Blat::Batch::run takes a list of links and downloads them all before
10
+ # returning. It is a very simple interface to Curl::Multi for smallish
11
+ # tasks.
22
12
  #
23
- # [:urls] An array of URLs to download.
24
- # [:config] (optional) configuration to pass to the Jobs. See Blat::Job
25
- # for more information.
26
- def initialize(urls, config = {})
27
-
28
- # Config for each object
29
- @config = config
30
-
31
- # URLS in as a string
32
- @urls = urls
33
- @urls_mx = Mutex.new
34
-
35
- # Stores results as Job objects
36
- @results = []
37
- @results_mx = Mutex.new
38
-
39
- # Keep this to see if we have finished
40
- @url_count = urls.length
41
- end
42
-
43
- # Run a batch with a given number of workers.
13
+ # [max_connections] Defines how many parallel connections to use
14
+ # [links] Is the list of strings or Curl::Easy objects to download. The list object must support #map and #each
15
+ # [pipeline] Indicates if Curl::Multi should pipeline its HTTP requests
16
+ # [&block] If given, this block is called to configure each Curl::Easy object prior to it being pushed into the queue.
44
17
  #
45
- # If a block is provided, it is called with the curl object just before
46
- # requests are made. This is to allow setting of various parameters, e.g.:
47
- #
48
- # batch.run(10){ |c|
49
- # c.follow_location = true
50
- # }
51
- #
52
- def run(workers, &block)
53
-
54
- # Figure out if people have overestimated the workers needed
55
- workers = [workers, @urls.length].min
56
-
57
- # Construct a pool
58
- x = Blat::Pool.new(workers) do |job|
59
- @results_mx.synchronize { @results << job }
60
- end
18
+ def self.run(max_connections, links, pipeline = true, &block)
19
+ multi = Curl::Multi.new
61
20
 
62
- # Set work to do
63
- x.work do
21
+ # Set options
22
+ multi.max_connects = max_connections.to_i
23
+ multi.pipeline = (pipeline == true)
64
24
 
65
- # Get the URL from the list
66
- url = @urls_mx.synchronize { @urls.pop }
67
-
68
- # If it's set, configure and return a job
69
- if url
70
- Blat::Job.new(@config) do |c|
71
-
72
- # Configure with block if appropriate
73
- yield(c) if block_given?
74
-
75
- c.url= url
76
- end
77
- else
78
- # If not, return nil to set the worker to idle
79
- nil
80
- end
25
+ curls = links.map do |l|
26
+ c = l
27
+ c = Curl::Easy.new(l) unless l.is_a?(Curl::Easy)
28
+ c
81
29
  end
82
30
 
83
- # Wait until workers are idle
84
- x.wait_until_idle
85
-
86
- # Close them all.
87
- x.close
88
- end
89
-
90
- # Is the batch complete?
91
- def complete?
92
- @results_mx.synchronize do
93
- @results.length == @url_count
31
+ # Pump links in
32
+ curls.each do |c|
33
+ yield(c) if block_given?
34
+ multi.add(c)
94
35
  end
95
- end
96
36
 
97
- # Report progress with three vars
98
- #
99
- # remaining (yet to do)
100
- # complete (completed)
101
- # in_progress (currently running)
102
- # total (remaining + complete + in progress)
103
- def progress
104
- remaining = @urls_mx.synchronize { @urls.length }
105
- complete = @results_mx.synchronize { @results.length }
106
- return remaining,
107
- complete,
108
- (@url_count - complete - remaining),
109
- @url_count
110
- end
37
+ # Wait
38
+ multi.perform
111
39
 
112
- # Get results as a list
113
- def results
114
- @results_mx.synchronize do
115
- return @results
116
- end
40
+ return curls
117
41
  end
118
-
119
42
  end
120
43
 
121
44
  end
@@ -0,0 +1,152 @@
1
+
2
+ require 'curl'
3
+
4
+ module Blat
5
+
6
+ # The Blat::Queue class represents a download queue that handles requests
7
+ # using Curl::Multi. It, and its descendants, accept a large number of
8
+ # Curl::Easy objects and download them in parallel.
9
+ #
10
+ # In order to know when each request has completed, use
11
+ # Curl::Easy::on_complete. This is made simpler by Queue#add, which will
12
+ # yield to a block on completion of each download.
13
+ #
14
+ class Queue
15
+
16
+ attr_reader :max_connections, :pipeline
17
+
18
+ # Create a new Blat::Queue with a given number of maximum connections.
19
+ #
20
+ # The 'pipeline' option controls Curl::Multi's pipelining feature, which
21
+ # tries to use the same http connection for many requests to the same server.
22
+ def initialize(max_connections, pipeline = true)
23
+ @multi = Curl::Multi.new
24
+
25
+ # Set properties
26
+ @max_connects = max_connections.to_i
27
+ @pipeline = (pipeline == true)
28
+ @multi.max_connects = @max_connects
29
+ @multi.pipeline = @pipeline
30
+ end
31
+
32
+ # Add a URL or a Curl::Easy object to the queue.
33
+ #
34
+ # Optionally, provide a callback for calling when requests are complete,
35
+ # e.g.:
36
+ #
37
+ # q.add('http://google.com') do |c|
38
+ # puts "Complete request: #{c}"
39
+ # end
40
+ #
41
+ def add(curl_or_link, &block)
42
+ # Convert to curl if necessary
43
+ curl = curl_or_link.is_a?(Curl::Easy) ? curl_or_link : Curl::Easy.new(curl_or_link)
44
+ curl.on_complete { |c| block.yield(c) } if block_given?
45
+
46
+ # Add
47
+ @multi.add(curl)
48
+
49
+ # Return
50
+ return curl
51
+ end
52
+
53
+ # Returns the number of active requests
54
+ def request_count
55
+ requests.length
56
+ end
57
+
58
+ # Returns a list of active requests
59
+ def requests
60
+ @multi.requests
61
+ end
62
+
63
+ # Remove a request from the queue.
64
+ #
65
+ # This needn't be called if a request has completed.
66
+ def remove(curl)
67
+ @multi.remove(curl)
68
+ end
69
+
70
+ # Wait for all requests to finish (blocking).
71
+ #
72
+ # If a block is given it is executed repeatedly whilst waiting.
73
+ def wait(&block)
74
+ @multi.perform do
75
+ yield if block_given?
76
+ end
77
+ end
78
+
79
+ alias_method :perform, :wait
80
+
81
+ # Is the queue idle?
82
+ def idle?
83
+ @multi.idle?
84
+ end
85
+
86
+ end
87
+
88
+ # Similar to a queue, except that it explicitly calls a block in order to
89
+ # acquire new URLs.
90
+ #
91
+ # This makes it suitable for use in producer/consumer patterns.
92
+ class ConsumingQueue < Queue
93
+
94
+ # Executes the given block in order to keep the curl pool working at its
95
+ # maximum capacity.
96
+ #
97
+ # This method blocks for as long as links are being downloaded, as it relies on
98
+ # Curl::Multi#perform
99
+ #
100
+ # Note that blocks providing links must also perform their own
101
+ # configuration, e.g.:
102
+ #
103
+ # q.consume do
104
+ # url = get_url
105
+ # if(url)
106
+ # c = Curl::Easy.new(url)
107
+ # c.follow_location = true
108
+ # c.on_complete{ |c| puts "Retrieved: #{c.body_str}" }
109
+ # c
110
+ # else
111
+ # nil
112
+ # end
113
+ # end
114
+ #
115
+ def consume(connections = @max_connects, &block)
116
+ @multi.perform do
117
+ while request_count < connections && new_link = yield
118
+ add(new_link) if new_link
119
+ end
120
+ end
121
+ end
122
+
123
+ end
124
+
125
+ # The ListConsumingQueue is similar to the ConsumingQueue except that
126
+ # it takes its argument in the form of an Enumerable object.
127
+ class ListConsumingQueue < ConsumingQueue
128
+
129
+ # Download all of the URLs or Curl::Easy objects in the given list, and
130
+ # optionally execute the given block on completion for each
131
+ def consume(list, connections = @max_connects)
132
+ item = 0 # Start at item 0
133
+ list = list.to_a # Ensure we can address with []
134
+
135
+ @multi.perform do
136
+ while request_count < connections && new_link = list[item]
137
+
138
+ item += 1
139
+
140
+ # Add with completion callback if appropriate
141
+ if block_given?
142
+ add(new_link) { |req| yield(req) }
143
+ else
144
+ add(new_link)
145
+ end
146
+
147
+ end
148
+ end
149
+ end
150
+ end
151
+
152
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0a
4
+ version: 0.1.0b
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stephen Wattam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-06-02 00:00:00.000000000 Z
11
+ date: 2013-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: curb
@@ -24,15 +24,15 @@ dependencies:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.8'
27
- description: A very parallel cURL wrapper for ongoing download tasks
27
+ description: Curl::Multi wrapper for high-performance and/or long-running download
28
+ tasks
28
29
  email: stephenwattam@gmail.com
29
30
  executables: []
30
31
  extensions: []
31
32
  extra_rdoc_files: []
32
33
  files:
33
- - lib/blat/pool.rb
34
- - lib/blat/formats.rb
35
34
  - lib/blat/batch.rb
35
+ - lib/blat/queue.rb
36
36
  - ./lib/blat.rb
37
37
  homepage: http://stephenwattam.com/projects/blat
38
38
  licenses:
@@ -1,118 +0,0 @@
1
-
2
-
3
- module Blat
4
-
5
- # Blat::Job represents a single download task, both as a request and response.
6
- #
7
- # Jobs are provided to workers in a pool by a dispatcher block. Each job
8
- # contains:
9
- #
10
- # * Configuration for the worker. Current configuration supported is
11
- # detailed below and in the Pool documentation
12
- # * A way of configuring a curl request (in order to set the url and other
13
- # parameters)
14
- # * Data returned by the download. This is stored as a hash in the #data
15
- # parameter.
16
- #
17
- # == Worker Configuration
18
- #
19
- # Workers are configured by setting values in a hash. This hash is sent to
20
- # the worker from the Job class, and contains options that affect the process
21
- # of downloading. This is in addition to configuration on the curl object
22
- # performed through Blat::Job.configure()
23
- #
24
- # Workers currently support the following configuration options:
25
- #
26
- # [:max_body_size] If set, downloads will cease after this many bytes have
27
- # been downloaded. If truncated, data[:response_properties][:truncated] will
28
- # be set to true.
29
- #
30
- # == Returned Values
31
- #
32
- # When a job has been finalised, its #data property will be set to a hash
33
- # left by the worker. This is currently specified as:
34
- #
35
- # [:head] The head string returned from the server (response.header_str)
36
- # [:body] The body string returned from the server (response.body)
37
- # [:response_properties] A hash with metadata in. Partially specified by the
38
- # worker configuration, this contains things such as the number of bytes
39
- # downloaded and duration of the request.
40
- # [:response] The raw response from curl
41
- # [:error] Any errors encountered during download, such as network errors.
42
- # If this is nil the request was successful.
43
- #
44
- # Response properties are currently set to:
45
- #
46
- # response_properties = {
47
- # round_trip_time: res.total_time,
48
- # redirect_time: res.redirect_time,
49
- # dns_lookup_time: res.name_lookup_time,
50
- # effective_uri: res.last_effective_url,
51
- # code: res.response_code,
52
- # download_speed: res.download_speed,
53
- # downloaded_bytes: res.downloaded_bytes || 0,
54
- # truncated: ignore == true
55
- # }
56
- #
57
- class Job
58
-
59
- attr_reader :config, :data
60
-
61
- # Construct a new Job with a block for configuring curl options.
62
- def initialize(config = {}, &block)
63
- raise 'No curl configuration block given' unless block_given?
64
-
65
- @curl_config_block = block
66
- @config = config
67
- @finalised = false
68
- end
69
-
70
- # Configure a curl object to make the request
71
- def configure(curl)
72
- @curl_config_block.yield(curl)
73
- end
74
-
75
- # Has this job been completed?
76
- def finalised?
77
- @finalise
78
- end
79
-
80
- # Allow people to use closed? instead.
81
- alias :closed? :finalised?
82
-
83
- # Write result and prevent further editing
84
- def finalise!(data = {})
85
- raise 'Job is already finalised.' if finalised?
86
- @data = data
87
- @finalised = true
88
- end
89
- end
90
-
91
- # --------------------------------------------------------------------------
92
-
93
-
94
- # SimpleJob is a quick and easy way of wrapping a URL to create a job.
95
- #
96
- # It accepts:
97
- #
98
- # [:url] The URL to download
99
- # [:curl_config] A hash of properties to set on the curl object, for example: {'follow_location' => true}
100
- # [:config] The worker configuration properties.
101
- class SimpleJob < Job
102
- def initialize(url, curl_config = {}, config = {})
103
- curl_config.merge!({url: url})
104
-
105
- super(config){ |c|
106
- curl_config.each do |k,v|
107
- if v.is_a?(Array)
108
- curl.send(k.to_s + '=', *v)
109
- else
110
- curl.send(k.to_s + '=', v)
111
- end
112
- end
113
- }
114
- end
115
- end
116
-
117
-
118
- end
@@ -1,432 +0,0 @@
1
-
2
- require 'thread'
3
- require 'curl'
4
-
5
- require 'blat/formats'
6
-
7
- module Blat
8
-
9
- # The Blat::Pool class controls a number of workers as they go about running
10
- # curl Jobs. This is the main class of Blat, and is the most flexible way of
11
- # using the gem (Batch is simpler but less full-featured).
12
- #
13
- # == Workflow
14
- #
15
- # The pool is created with a size and a callback to present results to.
16
- # This callback may be presented as a proc object or as a block, and is
17
- # called with a finalised Blat::Job object upon completion of each request.
18
- #
19
- # x = Blat::Pool.new(100){ |job|
20
- # puts "#{job.data[:body]}"
21
- # }
22
- #
23
- # Once a pool is configured, it may be commanded to start downloading by
24
- # presenting it with a dispatcher. This is a procedure that returns either a
25
- # Blat::Job object or nil---workers will call this block in order to acquire
26
- # work, and will enter an idle state when nil is returned.
27
- #
28
- # job_list = File.read('urls').lines.map{ |l| Blat::SimpleJob.new(l) }
29
- #
30
- # x.work{
31
- # job_list.pop
32
- # }
33
- #
34
- # Downloading can be waited upon any number of ways. The status of the pool
35
- # may be requested with #count_idle and #all_idle? , and it's possible to
36
- # wait until idle using #wait_until_idle :
37
- #
38
- # x.wait_until_idle
39
- # x.close
40
- #
41
- # == Worker Configuration
42
- #
43
- # Workers are configured by setting values in a hash. This hash is sent to
44
- # the worker from the Job class, and contains options that affect the process
45
- # of downloading. This is in addition to configuration on the curl object
46
- # performed through Blat::Job.configure()
47
- #
48
- # Workers currently support the following configuration options:
49
- #
50
- # [:max_body_size] If set, downloads will cease after this many bytes have
51
- # been downloaded. If truncated, data[:response_properties][:truncated] will
52
- # be set to true.
53
- #
54
- # == Returned Values
55
- #
56
- # When a job has been finalised, its #data property will be set to a hash
57
- # left by the worker. This is currently specified as:
58
- #
59
- # [:head] The head string returned from the server (response.header_str)
60
- # [:body] The body string returned from the server (response.body)
61
- # [:response_properties] A hash with metadata in. Partially specified by the
62
- # worker configuration, this contains things such as the number of bytes
63
- # downloaded and duration of the request.
64
- # [:response] The raw response from curl
65
- # [:error] Any errors encountered during download, such as network errors.
66
- # If this is nil the request was successful.
67
- #
68
- # Response properties are currently set to:
69
- #
70
- # response_properties = {
71
- # round_trip_time: res.total_time,
72
- # redirect_time: res.redirect_time,
73
- # dns_lookup_time: res.name_lookup_time,
74
- # effective_uri: res.last_effective_url,
75
- # code: res.response_code,
76
- # download_speed: res.download_speed,
77
- # downloaded_bytes: res.downloaded_bytes || 0,
78
- # truncated: ignore == true
79
- # }
80
- #
81
- class Pool
82
-
83
- # Construct a new pool with a given size and a callback used to output
84
- # data.
85
- #
86
- # x = Blat::Pool.new(100){ |job|
87
- # puts "Job complete: #{job}"
88
- # }
89
- #
90
- def initialize(size, finalise_callback = nil, &block)
91
-
92
- @m = Mutex.new # Data mutex for "producer" status
93
- @t = {} # threads
94
- @w = [] # workers
95
- @idle = []
96
- @idle_mutex = Mutex.new
97
- @size = size.to_i # number of simultaneous workers
98
-
99
- # Pass a block for handling returns
100
- if block
101
- @finalise_callback = block
102
- elsif finalise_callback && finalise_callback.is_a?(Proc)
103
- @finalise_callback = finalise_callback
104
- else
105
- raise 'No callback given for final data'
106
- end
107
-
108
- end
109
-
110
- # ------------------------------------------------------------------------
111
- # Workers call these to report status
112
- #
113
-
114
- # Workers can register as active by calling this
115
- def worker_active(worker_id)
116
- @idle_mutex.synchronize{ @idle[worker_id] = false }
117
- end
118
-
119
- # Workers can register as idle by calling this
120
- def worker_idle(worker_id)
121
- @idle_mutex.synchronize{ @idle[worker_id] = true }
122
- end
123
-
124
- # Workers can register that they have completed
125
- # a job by calling this.
126
- def work_complete(job)
127
- @finalise_callback.call(job)
128
- end
129
-
130
- # ------------------------------------------------------------------------
131
- # Worker status
132
- #
133
-
134
- # check to see if all workers are idle
135
- def all_idle?
136
- @idle_mutex.synchronize{ @idle.inject(true) { |m, o| m && o} }
137
- end
138
-
139
- # Return the number of idle workers
140
- def count_idle
141
- @idle_mutex.synchronize{ @idle.count(true) }
142
- end
143
-
144
- # ------------------------------------------------------------------------
145
- # Set work and initialise workers
146
- #
147
-
148
- # Create workers without running them.
149
- #
150
- # This is usually not very useful to call on its own, and is called by
151
- # #work when creating threads.
152
- def init_workers
153
- #$log.debug "Maintaining #{@size} worker object[s] (#{@w.length} currently active)."
154
- @w = []
155
- (@size - @w.length).times do |s|
156
- @w << Worker.new(s, self)
157
- @idle[s] = true
158
- end
159
- #$log.info "#{@w.length} worker[s] created."
160
- end
161
-
162
- # Run a worker over every point competitively.
163
- # Will create @size workers if they do not already exist (there is no need
164
- # to also call init_workers)
165
- def work(dispatcher = nil, &block)
166
-
167
- raise "No dispatcher provided" unless block_given? || (dispatcher && dispatcher.is_?(Proc))
168
-
169
- init_workers
170
-
171
- # Make things do the work
172
- #$log.debug "Starting threads..."
173
- @start_time = Time.now
174
- @w.each do |w|
175
- # Give each worker a handle back to the dispatcher to get data.
176
- @t[w] = Thread.new(dispatcher || block) do |d|
177
- begin
178
- w.work(d)
179
- rescue SignalException => e
180
- #$log.fatal "Signal caught: #{e.message}"
181
- #$log.fatal "Since I'm sampling right now, I will kill workers before shutdown."
182
- kill_workers
183
- raise e
184
- end
185
- end
186
-
187
- # Pass exceptions up
188
- @t[w].abort_on_exception = true
189
- end
190
- #$log.info "#{@t.length} download thread[s] started."
191
- end
192
-
193
- # ------------------------------------------------------------------------
194
- # Wait on conditions and close the pool
195
- #
196
-
197
- # Block until all workers are idle, checking every poll_rate seconds.
198
- def wait_until_idle(poll_rate = 0.5)
199
- #$log.debug "Waiting until idle, polling every #{poll_rate}s..."
200
- sleep(poll_rate)
201
- sleep(poll_rate) until all_idle?
202
- end
203
-
204
- # Wait for threads to complete.
205
- def wait_until_closed
206
- #$log.debug "Waiting for #{@t.length} worker[s] to close."
207
- @t.each { |w, t| t.join }
208
- #$log.info "Workers all terminated naturally."
209
- end
210
-
211
- # Tell workers to die forcibly
212
- def kill_workers
213
- #$log.debug "Forcing #{@t.length} worker threads to die..."
214
- @t.each { |t| t.kill }
215
- #$log.info "Worker threads killed."
216
- end
217
-
218
- # Close all workers' connections to the servers cleanly,
219
- #
220
- # This is non-blocking. Call #close or #wait to block:
221
- #
222
- # pool.close_nonblock
223
- # pool.wait_until_closed
224
- #
225
- def close_nonblock
226
- #$log.debug "Requesting closure of #{@w.length} worker[s]..."
227
- @w.each { |w| w.close }
228
- end
229
-
230
- # Cleanly close the pool, waiting for workers to end their
231
- # current request. Blocks, unlike #close.
232
- def close
233
- close_nonblock
234
- wait_until_closed
235
- end
236
-
237
- private
238
-
239
- # Workers are instantiated and maintained by a Blat::Pool and continually
240
- # poll for available work, passing it off for integration with the final
241
- # results set.
242
- #
243
- # Though it is possible to create your own, I would recommend instead using
244
- # a pool.
245
- #
246
- # == Worker Configuration
247
- #
248
- # Workers are configured by setting values in a hash. This hash is sent to
249
- # the worker from the Job class, and contains options that affect the process
250
- # of downloading. This is in addition to configuration on the curl object
251
- # performed through Blat::Job.configure()
252
- #
253
- # Workers currently support the following configuration options:
254
- #
255
- # [:max_body_size] If set, downloads will cease after this many bytes have
256
- # been downloaded. If truncated, data[:response_properties][:truncated] will
257
- # be set to true.
258
- #
259
- # == Returned Values
260
- #
261
- # When a job has been finalised, its #data property will be set to a hash
262
- # left by the worker. This is currently specified as:
263
- #
264
- # [:head] The head string returned from the server (response.header_str)
265
- # [:body] The body string returned from the server (response.body)
266
- # [:response_properties] A hash with metadata in. Partially specified by the
267
- # worker configuration, this contains things such as the number of bytes
268
- # downloaded and duration of the request.
269
- # [:response] The raw response from curl
270
- # [:error] Any errors encountered during download, such as network errors.
271
- # If this is nil the request was successful.
272
- #
273
- # Response properties are currently set to:
274
- #
275
- # response_properties = {
276
- # round_trip_time: res.total_time,
277
- # redirect_time: res.redirect_time,
278
- # dns_lookup_time: res.name_lookup_time,
279
- # effective_uri: res.last_effective_url,
280
- # code: res.response_code,
281
- # download_speed: res.download_speed,
282
- # downloaded_bytes: res.downloaded_bytes || 0,
283
- # truncated: ignore == true
284
- # }
285
- #
286
- class Worker
287
-
288
- # Construct a new worker with a given ID and linked to a given pool.
289
- #
290
- # The pool will be called to report idle/working states.
291
- def initialize(id, pool)
292
- @id = id
293
- @pool = pool
294
- @abort = false
295
- end
296
-
297
- # Should be run in a thread. Performs work until the dispatcher runs
298
- # out of data.
299
- def work(dispatcher)
300
- # start idle
301
- last_idle_state = true
302
-
303
- loop do
304
- while (job = dispatcher.call).is_a?(Job) do
305
-
306
- # If we were idle last, tell the pool
307
- @pool.worker_active(@id) if last_idle_state == true
308
-
309
- # tell people
310
- #$log.debug "W#{@id}: Downloading job #{job}"
311
-
312
- # Make the request
313
- complete_request(job, new_curl(job), job.config)
314
-
315
- return if @abort
316
- end
317
- return if @abort
318
-
319
- # TODO: configurable
320
- @pool.worker_idle(@id)
321
- last_idle_state = true
322
- sleep(1)
323
- end
324
-
325
- # rescue StandardError => e
326
- #$log.warn "W#{@id}: Error: #{e}"
327
- #$log.debug "#{e.backtrace.join("\n")}"
328
- end
329
-
330
- # Closes the connection to the server
331
- def close
332
- @abort = true
333
- end
334
-
335
- private
336
-
337
- # Datapoint is complete, run callback
338
- def finalise(job, head, body, response_properties, response, error)
339
- job.finalise!(
340
- head: head,
341
- body: body,
342
- response_properties: response_properties,
343
- response: response,
344
- error: error
345
- )
346
-
347
- @pool.work_complete(job)
348
- end
349
-
350
- # ---------- called by workers below this line
351
-
352
- # Submit a complete dp to the pool
353
- def complete_request(job, res, config)
354
-
355
- # Somewhere to store the body in a size-aware way
356
- body = ''
357
-
358
- # If limiting body size, use a callback to handle incoming data
359
- if config[:max_body_size]
360
- ignore = false
361
-
362
- res.on_body do |str|
363
- # Read up to the limit of bytes
364
- if !ignore && config[:max_body_size] && (body.length + str.length) > config[:max_body_size]
365
- body += str[0..(body.length + str.length) - config[:max_body_size]]
366
- #$log.warn "W#{@id}: Job #{job} exceeded byte limit (#{config[:max_body_size]}b)"
367
- ignore = true
368
- elsif not ignore
369
- body += str
370
- else
371
- # ignore data
372
- end
373
-
374
- # Have to return number of bytes to curb
375
- str.length
376
- end
377
- end
378
-
379
- # Perform a request prepared elsewhere,
380
- # can run alongside other requests
381
- res.perform
382
-
383
- # Load body directly from response if not using the system above
384
- body = res.body_str unless config[:max_body_size]
385
-
386
- # Load stuff out of response object.
387
- response_properties = {
388
- round_trip_time: res.total_time,
389
- redirect_time: res.redirect_time,
390
- dns_lookup_time: res.name_lookup_time,
391
- effective_uri: res.last_effective_url,
392
- code: res.response_code,
393
- download_speed: res.download_speed,
394
- downloaded_bytes: res.downloaded_bytes || 0,
395
- truncated: ignore == true
396
- }
397
-
398
- # write to datapoint list
399
- finalise(job, res.header_str, body, response_properties, res, nil)
400
-
401
- rescue SignalException => e
402
- raise e
403
- rescue StandardError => e
404
- # if e.class.to_s =~ /^Curl::Err::/ then
405
- # #$log.debug "W#{@id}: Job #{job}: #{e.to_s[11..-1]}"
406
- # else
407
- # #$log.error "W#{@id}: Exception retrieving #{job}: #{e.to_s}."
408
- # #$log.debug "#{e.backtrace.join("\n")}"
409
- # end
410
-
411
- # write to datapoint list
412
- finalise(job, res.header_str, body, response_properties, res, e)
413
- end
414
-
415
- # Returns a new curl object to use downloading things.
416
- def new_curl(job)
417
- # Set up curl
418
- c = Curl::Easy.new
419
-
420
- # Configure the curl object
421
- job.configure(c)
422
-
423
- # Return it for work
424
- return c
425
- end
426
-
427
- end
428
-
429
- end
430
-
431
- end
432
-