get_them_all 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/bin/gta CHANGED
@@ -32,14 +32,9 @@ class GtaRunner < Thor
32
32
  fail("file #{script_path} should define class #{class_name} !") unless Object.const_defined?( class_name.to_sym )
33
33
 
34
34
  info("Started with config file #{File.basename(script_path)}")
35
-
36
- # CTRL+C
37
- trap("INT") do
38
- EM::stop_event_loop()
39
- end
40
-
35
+
41
36
  # create the instance (and start download)
42
- class_name.constantize.new(
37
+ crawler = class_name.constantize.new(
43
38
  :storage => {
44
39
  :type => 'file',
45
40
  :params => {
@@ -47,7 +42,14 @@ class GtaRunner < Thor
47
42
  }
48
43
  },
49
44
  :extensions => [GetThemAll::ActionLogger.new]
50
- ).start()
45
+ )
46
+
47
+ # CTRL+C
48
+ trap("INT") do
49
+ crawler.stop()
50
+ end
51
+
52
+ crawler.start()
51
53
  end
52
54
  end
53
55
 
@@ -7,7 +7,7 @@ module GetThemAll
7
7
 
8
8
  include EM::Deferrable
9
9
 
10
- def initialize(downloader, h, params = {})
10
+ def initialize(downloader, h = {}, params = {})
11
11
  @downloader = downloader
12
12
 
13
13
  @storage = @downloader.storage
@@ -6,11 +6,33 @@ module GetThemAll
6
6
  #
7
7
  class ActionLogger < Extension
8
8
  def initialize
9
+
10
+ #
11
+ # reactor events
12
+ #
9
13
  register_handler('downloader.started') do |name, downloader|
10
14
  @skipped_files = 0
11
15
  @download_files = 0
12
16
  end
13
-
17
+
18
+ register_handler('downloader.stopping') do |name, downloader|
19
+ log("Engine stopping...")
20
+ end
21
+
22
+ register_handler('downloader.stopped') do |name, downloader|
23
+ log("Engine stopped")
24
+ end
25
+
26
+ register_handler('downloader.completed') do |name, worker, downloader|
27
+ log ""
28
+ log "Downloaded #{@download_files} files"
29
+ log "Skipped: #{@skipped_files}"
30
+ end
31
+
32
+
33
+ #
34
+ # examine events
35
+ #
14
36
  register_handler('action.examine.started') do |name, worker, action|
15
37
  log("Examining[#{action.level}] #{action.url}")
16
38
  end
@@ -23,8 +45,11 @@ module GetThemAll
23
45
  register_handler('action.examine.success') do |name, worker, action|
24
46
  # do nothing
25
47
  end
26
-
27
-
48
+
49
+
50
+ #
51
+ # download events
52
+ #
28
53
  register_handler('action.download.started') do |name, worker, action|
29
54
  log("Downloading #{action.url}")
30
55
  end
@@ -41,13 +66,7 @@ module GetThemAll
41
66
  @download_files += 1
42
67
  log("File downloaded: #{destpath}")
43
68
  end
44
-
45
- register_handler('downloader.completed') do |name, worker, downloader|
46
- log ""
47
- log "Downloaded #{@download_files} files"
48
- log "Skipped: #{@skipped_files}"
49
- end
50
-
69
+
51
70
  end
52
71
 
53
72
  def log(str)
@@ -1,4 +1,6 @@
1
1
 
2
+ require 'fiber'
3
+
2
4
  require 'addressable/uri'
3
5
  require 'active_support/hash_with_indifferent_access'
4
6
 
@@ -14,9 +16,20 @@ module GetThemAll
14
16
  class SiteDownloader
15
17
  include Notifier
16
18
 
19
+ # number of workers for each task
17
20
  class_attribute :examiners_count, :downloaders_count
18
- class_attribute :config
19
-
21
+
22
+ # delay between each action for one worker
23
+ class_attribute :examiners_delay, :downloaders_delay
24
+
25
+
26
+ self.examiners_count = 1
27
+ self.downloaders_count = 1
28
+
29
+ # default: 100 to 200ms between actions
30
+ self.downloaders_delay = [100, 200]
31
+ self.examiners_delay = [100, 200]
32
+
20
33
  ##
21
34
  # Determine what will be stored in the history file,
22
35
  # the default is to store the last url before the download
@@ -28,9 +41,6 @@ module GetThemAll
28
41
  #
29
42
  class_attribute :history_tracking
30
43
 
31
- self.examiners_count = 1
32
- self.downloaders_count = 1
33
-
34
44
  self.history_tracking = :default
35
45
 
36
46
  attr_reader :base_url, :storage, :history
@@ -62,6 +72,7 @@ module GetThemAll
62
72
  @base_url= args.delete(:base_url)
63
73
  @start_url = args.delete(:start_url) || '/'
64
74
  @folder_name= args.delete(:folder_name)
75
+ @login_request = args.delete(:login_request)
65
76
 
66
77
  # keep a pointer to each extension
67
78
  @extensions = args.delete(:extensions) || [ActionLogger]
@@ -105,10 +116,10 @@ module GetThemAll
105
116
  notify('downloader.started', self)
106
117
 
107
118
  EM::run do
108
- EM::add_periodic_timer(5) do
109
- if (EM::connection_count() == 0) && !@storage.working?
110
- debug("no connections, exiting")
111
- EM::stop_event_loop()
119
+ @exit_timer = EM::add_periodic_timer(2) do
120
+ # if all workers are idle
121
+ if @examiners.all?(&:idle?) && @downloaders.all?(&:idle?)
122
+ self.stop()
112
123
  end
113
124
  end
114
125
 
@@ -122,37 +133,93 @@ module GetThemAll
122
133
  end
123
134
  end
124
135
  end
125
-
126
136
 
127
- # queue the first action to start crawling
128
- #
129
- @examine_queue.push(ExamineAction.new(self,
130
- :url => @start_url,
131
- :destination_folder => '/',
132
- :level => 0,
133
- ), 0)
137
+ # authenticate connection if required
138
+ if @login_request
139
+ open_url(*@login_request) do |req, doc|
140
+ after_login()
141
+ end
142
+ else
143
+ after_login()
144
+ end
134
145
 
146
+ end
135
147
 
136
- # now that actions are queued, start handling them
137
- # start each "worker"
138
- # dequeuing is priority based, the download actions
139
- # first and then the higher the level the higher the
140
- # priority for examine actions, this is done this way
141
- # to give work to the download workers asap.
142
- #
143
- 1.upto(self.class.examiners_count) do |n|
144
- Worker.new(:examiner, n - 1, self, @examine_queue)
145
- end
148
+ notify('downloader.completed', self)
149
+ end
150
+
151
+ def after_login
152
+ # queue the first action to start crawling
153
+ #
154
+ @examine_queue.push(ExamineAction.new(self,
155
+ :url => @start_url,
156
+ :destination_folder => '/',
157
+ :level => 0,
158
+ ), 0)
146
159
 
147
- 1.upto(self.class.downloaders_count) do |n|
148
- Worker.new(:downloader, n - 1, self, @download_queue)
149
- end
150
-
160
+
161
+ # now that actions are queued, start handling them
162
+ # start each "worker"
163
+ # dequeuing is priority based, the download actions
164
+ # first and then the higher the level the higher the
165
+ # priority for examine actions, this is done this way
166
+ # to give work to the download workers asap.
167
+ #
168
+
169
+ @examiners = []
170
+ @downloaders = []
171
+
172
+ 1.upto(self.class.examiners_count) do |n|
173
+ @examiners << Worker.new(:examiner, n - 1, @examine_queue, self.class.examiners_delay)
151
174
  end
152
-
153
- save_history()
154
175
 
155
- notify('downloader.completed', self)
176
+ 1.upto(self.class.downloaders_count) do |n|
177
+ @downloaders << Worker.new(:downloader, n - 1, @download_queue, self.class.downloaders_delay)
178
+ end
179
+ end
180
+
181
+ ##
182
+ # Cleanly stop the engine and ensure the history file is
183
+ # written.
184
+ #
185
+ def stop
186
+ return if @stopping
187
+
188
+ # first stop the exit timer, no longer needed once we are here
189
+ @exit_timer.cancel()
190
+ @stopping = true
191
+
192
+ Fiber.new do
193
+ fiber = Fiber.current
194
+
195
+ notify('downloader.stopping', self)
196
+
197
+ # first ask every worker to stop its work
198
+ # starting with examiners
199
+ @examiners.each do |worker|
200
+ debug "Stopping Examiner #{worker.index}..."
201
+ worker.request_stop { fiber.resume }
202
+ Fiber.yield
203
+ debug "Stopped Examiner #{worker.index}"
204
+ end
205
+
206
+ @downloaders.each do |worker|
207
+ debug "Stopping Downloader #{worker.index}..."
208
+ worker.request_stop { fiber.resume }
209
+ Fiber.yield
210
+ debug "Stopped Downloader #{worker.index}"
211
+ end
212
+
213
+ # now that every worker is stopped, write the history
214
+ deferrable = save_history()
215
+ deferrable.callback{ fiber.resume }
216
+ Fiber.yield
217
+
218
+ notify('downloader.stopped', self)
219
+
220
+ # and stop the reactor
221
+ EM::stop_event_loop()
222
+ end.resume
156
223
  end
157
224
 
158
225
  class AssertionFailed < RuntimeError; end
@@ -18,8 +18,11 @@ module GetThemAll
18
18
  open(destpath, "wb") do |f|
19
19
  f.write( data )
20
20
  end
21
-
22
- deferrable.succeed
21
+
22
+ # allow the caller time to schedule
23
+ # callbacks on the deferrable
24
+
25
+ EM::next_tick{ deferrable.succeed }
23
26
  deferrable
24
27
 
25
28
  rescue Errno::EINVAL
@@ -1,3 +1,3 @@
1
1
  module GetThemAll
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.1"
3
3
  end
@@ -4,45 +4,133 @@ module GetThemAll
4
4
  # each time an action is put in the queue it will handle it
5
5
  #
6
6
  class Worker
7
+ include Notifier
8
+
7
9
  attr_reader :type, :index
8
-
9
- def initialize(type, index, downloader, queue)
10
+
11
+ ##
12
+ # Create a worker.
13
+ #
14
+ # @param [String,Symbol] type Name assigned to the worker
15
+ # the only real use is to identify the worker.
16
+ # @param [Integer] index additional way to identify the worker.
17
+ # @param [EM::Queue] queue the queue from which this worker
18
+ # will take its jobs.
19
+ # @param [Integer,Array] delay Number of milliseconds between two
20
+ # actions, if an array is provided the value will be randomized
21
+ # between the two first values in the array.
22
+ #
23
+ def initialize(type, index, queue, delay = 0)
10
24
  @type = type
11
25
  @index = index
26
+ @delay = delay
27
+
28
+ # ensure delay is valid
29
+ unless @delay.is_a?(Integer) || (@delay.is_a?(Array) && @delay.size >= 2)
30
+ raise "invalid value for delay: #{@delay}"
31
+ end
12
32
 
13
- @downloader = downloader
14
33
  @queue = queue
15
-
34
+ @idle = true
35
+
36
+
37
+ @stop_requested = false
38
+
39
+ notify('worker.started', self)
40
+
16
41
  @queue.pop do |action|
17
42
  handle_action(action)
18
43
  end
19
44
 
20
45
  end
21
-
46
+
47
+ ##
48
+ # when called the worker will
49
+ # finish the current job and then stop taking
50
+ # new jobs.
51
+ #
52
+ # if a block is given it will be called when
53
+ # the worker is no longer taking actions.
54
+ #
55
+ def request_stop(&block)
56
+ @stop_requested = true
57
+ notify('worker.stop_requested', self)
58
+
59
+ if @idle
60
+ # we are already stopped, just call the block
61
+ EM::next_tick{ block.call }
62
+ else
63
+ @stop_requested_block = block
64
+ end
65
+ end
66
+
67
+ def idle?
68
+ @idle
69
+ end
70
+
22
71
  ##
23
72
  # Take the next action in queue
24
73
  #
25
74
  def take_next_job
26
- EM::next_tick do
27
- @queue.pop{|act| handle_action(act) }
75
+ @idle = true
76
+
77
+ if @stop_requested
78
+ # do not take new jobs and call
79
+ # the passed block if any
80
+ @stop_requested_block.call if @stop_requested_block
81
+ notify('worker.stopped', self)
82
+ else
83
+ delay = delay_before_next_action()
84
+ EM::add_timer(delay / 1000) do
85
+ @queue.pop do |act|
86
+ handle_action(act)
87
+ end
88
+ end
89
+
28
90
  end
29
91
  end
30
92
 
31
93
 
32
94
  def handle_action(action)
33
- action.callback( &method(:take_next_job) )
34
- action.errback do
35
- # in case of failure, try again later (with slightly lower priority)
36
- EM::add_timer(50) do
37
- @queue.push(action, [action.level - 1, 0].max)
38
- end
95
+ @idle = false
96
+ @current_action = action
39
97
 
40
- # and take the next job
41
- take_next_job()
42
- end
43
-
98
+ # register callbacks
99
+ action.callback( &method(:action_succeeded) )
100
+ action.errback( &method(:action_failed) )
101
+
102
+ # and start the action
44
103
  action.do_action(self)
45
104
  end
46
105
 
106
+ private
107
+ def action_failed
108
+ # in case of failure, try again later (with slightly lower priority)
109
+ @queue.push(@current_action, [@current_action.level - 1, 0].max)
110
+
111
+ # and take the next job
112
+ take_next_job()
113
+ end
114
+
115
+ def action_succeeded
116
+ take_next_job()
117
+ end
118
+
119
+
120
+ ##
121
+ # Compute the delay before the next action can
122
+ # take place.
123
+ #
124
+ # @return [Integer] Number of milliseconds to wait
125
+ #
126
+ def delay_before_next_action
127
+ case @delay
128
+ when Integer then @delay
129
+ when Array then rand(@delay[1] - @delay[0]) + @delay[0]
130
+ else
131
+ 0
132
+ end
133
+ end
134
+
47
135
  end
48
136
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: get_them_all
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-10-16 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: thor
16
- requirement: &70256844459680 !ruby/object:Gem::Requirement
16
+ requirement: &70096048287860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70256844459680
24
+ version_requirements: *70096048287860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: em-http-request
27
- requirement: &70256844459180 !ruby/object:Gem::Requirement
27
+ requirement: &70096048287240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.0.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70256844459180
35
+ version_requirements: *70096048287240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-priority-queue
38
- requirement: &70256844458680 !ruby/object:Gem::Requirement
38
+ requirement: &70096048286640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.0.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70256844458680
46
+ version_requirements: *70096048286640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: hpricot
49
- requirement: &70256844458220 !ruby/object:Gem::Requirement
49
+ requirement: &70096048286060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.8.1
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70256844458220
57
+ version_requirements: *70096048286060
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: i18n
60
- requirement: &70256844457840 !ruby/object:Gem::Requirement
60
+ requirement: &70096048285560 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70256844457840
68
+ version_requirements: *70096048285560
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: activesupport
71
- requirement: &70256844457300 !ruby/object:Gem::Requirement
71
+ requirement: &70096048284900 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 3.1.0
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70256844457300
79
+ version_requirements: *70096048284900
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: therubyracer
82
- requirement: &70256844456800 !ruby/object:Gem::Requirement
82
+ requirement: &70096048284280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ~>
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 0.9.8
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70256844456800
90
+ version_requirements: *70096048284280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: dropbox
93
- requirement: &70256844456420 !ruby/object:Gem::Requirement
93
+ requirement: &70096048283860 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70256844456420
101
+ version_requirements: *70096048283860
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: girl_friday
104
- requirement: &70256844455800 !ruby/object:Gem::Requirement
104
+ requirement: &70096048283380 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70256844455800
112
+ version_requirements: *70096048283380
113
113
  description: Mass downloader useable as standalone or as a library
114
114
  email: []
115
115
  executables:
@@ -153,7 +153,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
153
153
  version: '0'
154
154
  segments:
155
155
  - 0
156
- hash: 3600791884245333025
156
+ hash: 2337480508786894492
157
157
  required_rubygems_version: !ruby/object:Gem::Requirement
158
158
  none: false
159
159
  requirements:
@@ -162,7 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
162
162
  version: '0'
163
163
  segments:
164
164
  - 0
165
- hash: 3600791884245333025
165
+ hash: 2337480508786894492
166
166
  requirements: []
167
167
  rubyforge_project: get_them_all
168
168
  rubygems_version: 1.8.11