get_them_all 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/gta CHANGED
@@ -32,14 +32,9 @@ class GtaRunner < Thor
32
32
  fail("file #{script_path} should define class #{class_name} !") unless Object.const_defined?( class_name.to_sym )
33
33
 
34
34
  info("Started with config file #{File.basename(script_path)}")
35
-
36
- # CTRL+C
37
- trap("INT") do
38
- EM::stop_event_loop()
39
- end
40
-
35
+
41
36
  # create the instance (and start download)
42
- class_name.constantize.new(
37
+ crawler = class_name.constantize.new(
43
38
  :storage => {
44
39
  :type => 'file',
45
40
  :params => {
@@ -47,7 +42,14 @@ class GtaRunner < Thor
47
42
  }
48
43
  },
49
44
  :extensions => [GetThemAll::ActionLogger.new]
50
- ).start()
45
+ )
46
+
47
+ # CTRL+C
48
+ trap("INT") do
49
+ crawler.stop()
50
+ end
51
+
52
+ crawler.start()
51
53
  end
52
54
  end
53
55
 
@@ -7,7 +7,7 @@ module GetThemAll
7
7
 
8
8
  include EM::Deferrable
9
9
 
10
- def initialize(downloader, h, params = {})
10
+ def initialize(downloader, h = {}, params = {})
11
11
  @downloader = downloader
12
12
 
13
13
  @storage = @downloader.storage
@@ -6,11 +6,33 @@ module GetThemAll
6
6
  #
7
7
  class ActionLogger < Extension
8
8
  def initialize
9
+
10
+ #
11
+ # reactor events
12
+ #
9
13
  register_handler('downloader.started') do |name, downloader|
10
14
  @skipped_files = 0
11
15
  @download_files = 0
12
16
  end
13
-
17
+
18
+ register_handler('downloader.stopping') do |name, downloader|
19
+ log("Engine stopping...")
20
+ end
21
+
22
+ register_handler('downloader.stopped') do |name, downloader|
23
+ log("Engine stopped")
24
+ end
25
+
26
+ register_handler('downloader.completed') do |name, worker, downloader|
27
+ log ""
28
+ log "Downloaded #{@download_files} files"
29
+ log "Skipped: #{@skipped_files}"
30
+ end
31
+
32
+
33
+ #
34
+ # examine events
35
+ #
14
36
  register_handler('action.examine.started') do |name, worker, action|
15
37
  log("Examining[#{action.level}] #{action.url}")
16
38
  end
@@ -23,8 +45,11 @@ module GetThemAll
23
45
  register_handler('action.examine.success') do |name, worker, action|
24
46
  # do nothing
25
47
  end
26
-
27
-
48
+
49
+
50
+ #
51
+ # download events
52
+ #
28
53
  register_handler('action.download.started') do |name, worker, action|
29
54
  log("Downloading #{action.url}")
30
55
  end
@@ -41,13 +66,7 @@ module GetThemAll
41
66
  @download_files += 1
42
67
  log("File downloaded: #{destpath}")
43
68
  end
44
-
45
- register_handler('downloader.completed') do |name, worker, downloader|
46
- log ""
47
- log "Downloaded #{@download_files} files"
48
- log "Skipped: #{@skipped_files}"
49
- end
50
-
69
+
51
70
  end
52
71
 
53
72
  def log(str)
@@ -1,4 +1,6 @@
1
1
 
2
+ require 'fiber'
3
+
2
4
  require 'addressable/uri'
3
5
  require 'active_support/hash_with_indifferent_access'
4
6
 
@@ -14,9 +16,20 @@ module GetThemAll
14
16
  class SiteDownloader
15
17
  include Notifier
16
18
 
19
+ # number of workers for each task
17
20
  class_attribute :examiners_count, :downloaders_count
18
- class_attribute :config
19
-
21
+
22
+ # delay between each action for one worker
23
+ class_attribute :examiners_delay, :downloaders_delay
24
+
25
+
26
+ self.examiners_count = 1
27
+ self.downloaders_count = 1
28
+
29
+ # default: 100 to 200ms between actions
30
+ self.downloaders_delay = [100, 200]
31
+ self.examiners_delay = [100, 200]
32
+
20
33
  ##
21
34
  # Determine what will be stored in the history file,
22
35
  # the default is to store the last url before the download
@@ -28,9 +41,6 @@ module GetThemAll
28
41
  #
29
42
  class_attribute :history_tracking
30
43
 
31
- self.examiners_count = 1
32
- self.downloaders_count = 1
33
-
34
44
  self.history_tracking = :default
35
45
 
36
46
  attr_reader :base_url, :storage, :history
@@ -62,6 +72,7 @@ module GetThemAll
62
72
  @base_url= args.delete(:base_url)
63
73
  @start_url = args.delete(:start_url) || '/'
64
74
  @folder_name= args.delete(:folder_name)
75
+ @login_request = args.delete(:login_request)
65
76
 
66
77
  # keep a pointer to each extension
67
78
  @extensions = args.delete(:extensions) || [ActionLogger]
@@ -105,10 +116,10 @@ module GetThemAll
105
116
  notify('downloader.started', self)
106
117
 
107
118
  EM::run do
108
- EM::add_periodic_timer(5) do
109
- if (EM::connection_count() == 0) && !@storage.working?
110
- debug("no connections, exiting")
111
- EM::stop_event_loop()
119
+ @exit_timer = EM::add_periodic_timer(2) do
120
+ # if all workers are idle
121
+ if @examiners.all?(&:idle?) && @downloaders.all?(&:idle?)
122
+ self.stop()
112
123
  end
113
124
  end
114
125
 
@@ -122,37 +133,93 @@ module GetThemAll
122
133
  end
123
134
  end
124
135
  end
125
-
126
136
 
127
- # queue the first action to start crawling
128
- #
129
- @examine_queue.push(ExamineAction.new(self,
130
- :url => @start_url,
131
- :destination_folder => '/',
132
- :level => 0,
133
- ), 0)
137
+ # authenticate connection if required
138
+ if @login_request
139
+ open_url(*@login_request) do |req, doc|
140
+ after_login()
141
+ end
142
+ else
143
+ after_login()
144
+ end
134
145
 
146
+ end
135
147
 
136
- # now that actions are queued, start handling them
137
- # start each "worker"
138
- # dequeuing is priority based, the download actions
139
- # first and then the higher the level the higher the
140
- # priority for examine actions, this is done this way
141
- # to give work to the download workers asap.
142
- #
143
- 1.upto(self.class.examiners_count) do |n|
144
- Worker.new(:examiner, n - 1, self, @examine_queue)
145
- end
148
+ notify('downloader.completed', self)
149
+ end
150
+
151
+ def after_login
152
+ # queue the first action to start crawling
153
+ #
154
+ @examine_queue.push(ExamineAction.new(self,
155
+ :url => @start_url,
156
+ :destination_folder => '/',
157
+ :level => 0,
158
+ ), 0)
146
159
 
147
- 1.upto(self.class.downloaders_count) do |n|
148
- Worker.new(:downloader, n - 1, self, @download_queue)
149
- end
150
-
160
+
161
+ # now that actions are queued, start handling them
162
+ # start each "worker"
163
+ # dequeuing is priority based, the download actions
164
+ # first and then the higher the level the higher the
165
+ # priority for examine actions, this is done this way
166
+ # to give work to the download workers asap.
167
+ #
168
+
169
+ @examiners = []
170
+ @downloaders = []
171
+
172
+ 1.upto(self.class.examiners_count) do |n|
173
+ @examiners << Worker.new(:examiner, n - 1, @examine_queue, self.class.examiners_delay)
151
174
  end
152
-
153
- save_history()
154
175
 
155
- notify('downloader.completed', self)
176
+ 1.upto(self.class.downloaders_count) do |n|
177
+ @downloaders << Worker.new(:downloader, n - 1, @download_queue, self.class.downloaders_delay)
178
+ end
179
+ end
180
+
181
+ ##
182
+ # Cleanly stop the engine and ensure the history file is
183
+ # written.
184
+ #
185
+ def stop
186
+ return if @stopping
187
+
188
+ # first stop the exit timer, no longer needed once we are here
189
+ @exit_timer.cancel()
190
+ @stopping = true
191
+
192
+ Fiber.new do
193
+ fiber = Fiber.current
194
+
195
+ notify('downloader.stopping', self)
196
+
197
+ # first ask every worker to stop its work
198
+ # starting with examiners
199
+ @examiners.each do |worker|
200
+ debug "Stopping Examiner #{worker.index}..."
201
+ worker.request_stop { fiber.resume }
202
+ Fiber.yield
203
+ debug "Stopped Examiner #{worker.index}"
204
+ end
205
+
206
+ @downloaders.each do |worker|
207
+ debug "Stopping Downloader #{worker.index}..."
208
+ worker.request_stop { fiber.resume }
209
+ Fiber.yield
210
+ debug "Stopped Downloader #{worker.index}"
211
+ end
212
+
213
+ # now that every worker is stopped, write the history
214
+ deferrable = save_history()
215
+ deferrable.callback{ fiber.resume }
216
+ Fiber.yield
217
+
218
+ notify('downloader.stopped', self)
219
+
220
+ # and stop the reactor
221
+ EM::stop_event_loop()
222
+ end.resume
156
223
  end
157
224
 
158
225
  class AssertionFailed < RuntimeError; end
@@ -18,8 +18,11 @@ module GetThemAll
18
18
  open(destpath, "wb") do |f|
19
19
  f.write( data )
20
20
  end
21
-
22
- deferrable.succeed
21
+
22
+ # allow time for the caller to schedule
23
+ # callbacks on the deferrable
24
+
25
+ EM::next_tick{ deferrable.succeed }
23
26
  deferrable
24
27
 
25
28
  rescue Errno::EINVAL
@@ -1,3 +1,3 @@
1
1
  module GetThemAll
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.1"
3
3
  end
@@ -4,45 +4,133 @@ module GetThemAll
4
4
  # each time an action is put in the queue it will handle it
5
5
  #
6
6
  class Worker
7
+ include Notifier
8
+
7
9
  attr_reader :type, :index
8
-
9
- def initialize(type, index, downloader, queue)
10
+
11
+ ##
12
+ # Create a worker.
13
+ #
14
+ # @param [String, Symbol] type Name assigned to the worker
15
+ # the only real use is to identify the worker.
16
+ # @param [Integer] index additional way to identify the worker.
17
+ # @param [EM::Queue] queue the queue from which this worker
18
+ # will take its jobs.
19
+ # @param [Integer,Array] delay Number of milliseconds between two
20
+ # actions, if an array is provided the value will be randomized
21
+ # between the two first values in the array.
22
+ #
23
+ def initialize(type, index, queue, delay = 0)
10
24
  @type = type
11
25
  @index = index
26
+ @delay = delay
27
+
28
+ # ensure delay is valid
29
+ unless @delay.is_a?(Integer) || (@delay.is_a?(Array) && @delay.size >= 2)
30
+ raise "invalid value for delay: #{@delay}"
31
+ end
12
32
 
13
- @downloader = downloader
14
33
  @queue = queue
15
-
34
+ @idle = true
35
+
36
+
37
+ @stop_requested = false
38
+
39
+ notify('worker.started', self)
40
+
16
41
  @queue.pop do |action|
17
42
  handle_action(action)
18
43
  end
19
44
 
20
45
  end
21
-
46
+
47
+ ##
48
+ # when called the worker will
49
+ # finish the current job and then stop taking
50
+ # new jobs.
51
+ #
52
+ # if a block is given it will be called when
53
+ # the worker is no longer taking actions.
54
+ #
55
+ def request_stop(&block)
56
+ @stop_requested = true
57
+ notify('worker.stop_requested', self)
58
+
59
+ if @idle
60
+ # we are already stopped, just call the block
61
+ EM::next_tick{ block.call }
62
+ else
63
+ @stop_requested_block = block
64
+ end
65
+ end
66
+
67
+ def idle?
68
+ @idle
69
+ end
70
+
22
71
  ##
23
72
  # Take the next action in queue
24
73
  #
25
74
  def take_next_job
26
- EM::next_tick do
27
- @queue.pop{|act| handle_action(act) }
75
+ @idle = true
76
+
77
+ if @stop_requested
78
+ # do not take new jobs and call
79
+ # the passed block if any
80
+ @stop_requested_block.call if @stop_requested_block
81
+ notify('worker.stopped', self)
82
+ else
83
+ delay = delay_before_next_action()
84
+ EM::add_timer(delay / 1000) do
85
+ @queue.pop do |act|
86
+ handle_action(act)
87
+ end
88
+ end
89
+
28
90
  end
29
91
  end
30
92
 
31
93
 
32
94
  def handle_action(action)
33
- action.callback( &method(:take_next_job) )
34
- action.errback do
35
- # in case of failure, try again later (with slightly lower priority)
36
- EM::add_timer(50) do
37
- @queue.push(action, [action.level - 1, 0].max)
38
- end
95
+ @idle = false
96
+ @current_action = action
39
97
 
40
- # and take the next job
41
- take_next_job()
42
- end
43
-
98
+ # register callbacks
99
+ action.callback( &method(:action_succeeded) )
100
+ action.errback( &method(:action_failed) )
101
+
102
+ # and start the action
44
103
  action.do_action(self)
45
104
  end
46
105
 
106
+ private
107
+ def action_failed
108
+ # in case of failure, try again later (with slightly lower priority)
109
+ @queue.push(@current_action, [@current_action.level - 1, 0].max)
110
+
111
+ # and take the next job
112
+ take_next_job()
113
+ end
114
+
115
+ def action_succeeded
116
+ take_next_job()
117
+ end
118
+
119
+
120
+ ##
121
+ # Compute the delay before the next action can
122
+ # take place.
123
+ #
124
+ # @return [Integer] Number of milliseconds to wait
125
+ #
126
+ def delay_before_next_action
127
+ case @delay
128
+ when Integer then @delay
129
+ when Array then rand(@delay[1] - @delay[0]) + @delay[0]
130
+ else
131
+ 0
132
+ end
133
+ end
134
+
47
135
  end
48
136
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: get_them_all
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-10-16 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: thor
16
- requirement: &70256844459680 !ruby/object:Gem::Requirement
16
+ requirement: &70096048287860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70256844459680
24
+ version_requirements: *70096048287860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: em-http-request
27
- requirement: &70256844459180 !ruby/object:Gem::Requirement
27
+ requirement: &70096048287240 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.0.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70256844459180
35
+ version_requirements: *70096048287240
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: em-priority-queue
38
- requirement: &70256844458680 !ruby/object:Gem::Requirement
38
+ requirement: &70096048286640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.0.2
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70256844458680
46
+ version_requirements: *70096048286640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: hpricot
49
- requirement: &70256844458220 !ruby/object:Gem::Requirement
49
+ requirement: &70096048286060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 0.8.1
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70256844458220
57
+ version_requirements: *70096048286060
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: i18n
60
- requirement: &70256844457840 !ruby/object:Gem::Requirement
60
+ requirement: &70096048285560 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70256844457840
68
+ version_requirements: *70096048285560
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: activesupport
71
- requirement: &70256844457300 !ruby/object:Gem::Requirement
71
+ requirement: &70096048284900 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 3.1.0
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70256844457300
79
+ version_requirements: *70096048284900
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: therubyracer
82
- requirement: &70256844456800 !ruby/object:Gem::Requirement
82
+ requirement: &70096048284280 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ~>
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 0.9.8
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70256844456800
90
+ version_requirements: *70096048284280
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: dropbox
93
- requirement: &70256844456420 !ruby/object:Gem::Requirement
93
+ requirement: &70096048283860 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70256844456420
101
+ version_requirements: *70096048283860
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: girl_friday
104
- requirement: &70256844455800 !ruby/object:Gem::Requirement
104
+ requirement: &70096048283380 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70256844455800
112
+ version_requirements: *70096048283380
113
113
  description: Mass downloader useable as standalone or as a library
114
114
  email: []
115
115
  executables:
@@ -153,7 +153,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
153
153
  version: '0'
154
154
  segments:
155
155
  - 0
156
- hash: 3600791884245333025
156
+ hash: 2337480508786894492
157
157
  required_rubygems_version: !ruby/object:Gem::Requirement
158
158
  none: false
159
159
  requirements:
@@ -162,7 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
162
162
  version: '0'
163
163
  segments:
164
164
  - 0
165
- hash: 3600791884245333025
165
+ hash: 2337480508786894492
166
166
  requirements: []
167
167
  rubyforge_project: get_them_all
168
168
  rubygems_version: 1.8.11