get_them_all 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/gta +10 -8
- data/lib/get_them_all/action.rb +1 -1
- data/lib/get_them_all/extensions/action_logger.rb +29 -10
- data/lib/get_them_all/site_downloader.rb +101 -34
- data/lib/get_them_all/storage/file_storage.rb +5 -2
- data/lib/get_them_all/version.rb +1 -1
- data/lib/get_them_all/worker.rb +105 -17
- metadata +21 -21
data/bin/gta
CHANGED
@@ -32,14 +32,9 @@ class GtaRunner < Thor
|
|
32
32
|
fail("file #{script_path} should define class #{class_name} !") unless Object.const_defined?( class_name.to_sym )
|
33
33
|
|
34
34
|
info("Started with config file #{File.basename(script_path)}")
|
35
|
-
|
36
|
-
# CTRL+C
|
37
|
-
trap("INT") do
|
38
|
-
EM::stop_event_loop()
|
39
|
-
end
|
40
|
-
|
35
|
+
|
41
36
|
# create the instance (and start download)
|
42
|
-
class_name.constantize.new(
|
37
|
+
crawler = class_name.constantize.new(
|
43
38
|
:storage => {
|
44
39
|
:type => 'file',
|
45
40
|
:params => {
|
@@ -47,7 +42,14 @@ class GtaRunner < Thor
|
|
47
42
|
}
|
48
43
|
},
|
49
44
|
:extensions => [GetThemAll::ActionLogger.new]
|
50
|
-
)
|
45
|
+
)
|
46
|
+
|
47
|
+
# CTRL+C
|
48
|
+
trap("INT") do
|
49
|
+
crawler.stop()
|
50
|
+
end
|
51
|
+
|
52
|
+
crawler.start()
|
51
53
|
end
|
52
54
|
end
|
53
55
|
|
data/lib/get_them_all/action.rb
CHANGED
@@ -6,11 +6,33 @@ module GetThemAll
|
|
6
6
|
#
|
7
7
|
class ActionLogger < Extension
|
8
8
|
def initialize
|
9
|
+
|
10
|
+
#
|
11
|
+
# reactor events
|
12
|
+
#
|
9
13
|
register_handler('downloader.started') do |name, downloader|
|
10
14
|
@skipped_files = 0
|
11
15
|
@download_files = 0
|
12
16
|
end
|
13
|
-
|
17
|
+
|
18
|
+
register_handler('downloader.stopping') do |name, downloader|
|
19
|
+
log("Engine stopping...")
|
20
|
+
end
|
21
|
+
|
22
|
+
register_handler('downloader.stopped') do |name, downloader|
|
23
|
+
log("Engine stopped")
|
24
|
+
end
|
25
|
+
|
26
|
+
register_handler('downloader.completed') do |name, worker, downloader|
|
27
|
+
log ""
|
28
|
+
log "Downloaded #{@download_files} files"
|
29
|
+
log "Skipped: #{@skipped_files}"
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
#
|
34
|
+
# examine events
|
35
|
+
#
|
14
36
|
register_handler('action.examine.started') do |name, worker, action|
|
15
37
|
log("Examining[#{action.level}] #{action.url}")
|
16
38
|
end
|
@@ -23,8 +45,11 @@ module GetThemAll
|
|
23
45
|
register_handler('action.examine.success') do |name, worker, action|
|
24
46
|
# do nothing
|
25
47
|
end
|
26
|
-
|
27
|
-
|
48
|
+
|
49
|
+
|
50
|
+
#
|
51
|
+
# download events
|
52
|
+
#
|
28
53
|
register_handler('action.download.started') do |name, worker, action|
|
29
54
|
log("Downloading #{action.url}")
|
30
55
|
end
|
@@ -41,13 +66,7 @@ module GetThemAll
|
|
41
66
|
@download_files += 1
|
42
67
|
log("File downloaded: #{destpath}")
|
43
68
|
end
|
44
|
-
|
45
|
-
register_handler('downloader.completed') do |name, worker, downloader|
|
46
|
-
log ""
|
47
|
-
log "Downloaded #{@download_files} files"
|
48
|
-
log "Skipped: #{@skipped_files}"
|
49
|
-
end
|
50
|
-
|
69
|
+
|
51
70
|
end
|
52
71
|
|
53
72
|
def log(str)
|
@@ -1,4 +1,6 @@
|
|
1
1
|
|
2
|
+
require 'fiber'
|
3
|
+
|
2
4
|
require 'addressable/uri'
|
3
5
|
require 'active_support/hash_with_indifferent_access'
|
4
6
|
|
@@ -14,9 +16,20 @@ module GetThemAll
|
|
14
16
|
class SiteDownloader
|
15
17
|
include Notifier
|
16
18
|
|
19
|
+
# number of workers for each task
|
17
20
|
class_attribute :examiners_count, :downloaders_count
|
18
|
-
|
19
|
-
|
21
|
+
|
22
|
+
# delay between each action for one worker
|
23
|
+
class_attribute :examiners_delay, :downloaders_delay
|
24
|
+
|
25
|
+
|
26
|
+
self.examiners_count = 1
|
27
|
+
self.downloaders_count = 1
|
28
|
+
|
29
|
+
# default: 100 to 200ms between actions
|
30
|
+
self.downloaders_delay = [100, 200]
|
31
|
+
self.examiners_delay = [100, 200]
|
32
|
+
|
20
33
|
##
|
21
34
|
# Determine what will be stored in the history file,
|
22
35
|
# the default is to store the last url before the download
|
@@ -28,9 +41,6 @@ module GetThemAll
|
|
28
41
|
#
|
29
42
|
class_attribute :history_tracking
|
30
43
|
|
31
|
-
self.examiners_count = 1
|
32
|
-
self.downloaders_count = 1
|
33
|
-
|
34
44
|
self.history_tracking = :default
|
35
45
|
|
36
46
|
attr_reader :base_url, :storage, :history
|
@@ -62,6 +72,7 @@ module GetThemAll
|
|
62
72
|
@base_url= args.delete(:base_url)
|
63
73
|
@start_url = args.delete(:start_url) || '/'
|
64
74
|
@folder_name= args.delete(:folder_name)
|
75
|
+
@login_request = args.delete(:login_request)
|
65
76
|
|
66
77
|
# keep a pointer to each extension
|
67
78
|
@extensions = args.delete(:extensions) || [ActionLogger]
|
@@ -105,10 +116,10 @@ module GetThemAll
|
|
105
116
|
notify('downloader.started', self)
|
106
117
|
|
107
118
|
EM::run do
|
108
|
-
EM::add_periodic_timer(
|
109
|
-
if
|
110
|
-
|
111
|
-
|
119
|
+
@exit_timer = EM::add_periodic_timer(2) do
|
120
|
+
# if all workers are idle
|
121
|
+
if @examiners.all?(&:idle?) && @downloaders.all?(&:idle?)
|
122
|
+
self.stop()
|
112
123
|
end
|
113
124
|
end
|
114
125
|
|
@@ -122,37 +133,93 @@ module GetThemAll
|
|
122
133
|
end
|
123
134
|
end
|
124
135
|
end
|
125
|
-
|
126
136
|
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
)
|
137
|
+
# authenticate connection if required
|
138
|
+
if @login_request
|
139
|
+
open_url(*@login_request) do |req, doc|
|
140
|
+
after_login()
|
141
|
+
end
|
142
|
+
else
|
143
|
+
after_login()
|
144
|
+
end
|
134
145
|
|
146
|
+
end
|
135
147
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
148
|
+
notify('downloader.completed', self)
|
149
|
+
end
|
150
|
+
|
151
|
+
def after_login
|
152
|
+
# queue the first action to start crawling
|
153
|
+
#
|
154
|
+
@examine_queue.push(ExamineAction.new(self,
|
155
|
+
:url => @start_url,
|
156
|
+
:destination_folder => '/',
|
157
|
+
:level => 0,
|
158
|
+
), 0)
|
146
159
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
160
|
+
|
161
|
+
# now that actions are queued, start handling them
|
162
|
+
# start each "worker"
|
163
|
+
# dequeuing is priority based, the download actions
|
164
|
+
# first and then the higher the level the higher the
|
165
|
+
# priority for examine actions, this is done this way
|
166
|
+
# to give work to the download workers asap.
|
167
|
+
#
|
168
|
+
|
169
|
+
@examiners = []
|
170
|
+
@downloaders = []
|
171
|
+
|
172
|
+
1.upto(self.class.examiners_count) do |n|
|
173
|
+
@examiners << Worker.new(:examiner, n - 1, @examine_queue, self.class.examiners_delay)
|
151
174
|
end
|
152
|
-
|
153
|
-
save_history()
|
154
175
|
|
155
|
-
|
176
|
+
1.upto(self.class.downloaders_count) do |n|
|
177
|
+
@downloaders << Worker.new(:downloader, n - 1, @download_queue, self.class.downloaders_delay)
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
##
|
182
|
+
# Cleanly stop the engine and ensure the history file is
|
183
|
+
# written.
|
184
|
+
#
|
185
|
+
def stop
|
186
|
+
return if @stopping
|
187
|
+
|
188
|
+
# first stop the exit timer, no longer needed once we are here
|
189
|
+
@exit_timer.cancel()
|
190
|
+
@stopping = true
|
191
|
+
|
192
|
+
Fiber.new do
|
193
|
+
fiber = Fiber.current
|
194
|
+
|
195
|
+
notify('downloader.stopping', self)
|
196
|
+
|
197
|
+
# first ask every worker to stop its work
|
198
|
+
# starting with examiners
|
199
|
+
@examiners.each do |worker|
|
200
|
+
debug "Stopping Examiner #{worker.index}..."
|
201
|
+
worker.request_stop { fiber.resume }
|
202
|
+
Fiber.yield
|
203
|
+
debug "Stopped Examiner #{worker.index}"
|
204
|
+
end
|
205
|
+
|
206
|
+
@downloaders.each do |worker|
|
207
|
+
debug "Stopping Downloader #{worker.index}..."
|
208
|
+
worker.request_stop { fiber.resume }
|
209
|
+
Fiber.yield
|
210
|
+
debug "Stopped Downloader #{worker.index}"
|
211
|
+
end
|
212
|
+
|
213
|
+
# now that every worker is stopped, write the history
|
214
|
+
deferrable = save_history()
|
215
|
+
deferrable.callback{ fiber.resume }
|
216
|
+
Fiber.yield
|
217
|
+
|
218
|
+
notify('downloader.stopped', self)
|
219
|
+
|
220
|
+
# and stop the reactor
|
221
|
+
EM::stop_event_loop()
|
222
|
+
end.resume
|
156
223
|
end
|
157
224
|
|
158
225
|
class AssertionFailed < RuntimeError; end
|
@@ -18,8 +18,11 @@ module GetThemAll
|
|
18
18
|
open(destpath, "wb") do |f|
|
19
19
|
f.write( data )
|
20
20
|
end
|
21
|
-
|
22
|
-
|
21
|
+
|
22
|
+
# allow time for the caller to schedule
|
23
|
+
# callbacks on the deferrable
|
24
|
+
|
25
|
+
EM::next_tick{ deferrable.succeed }
|
23
26
|
deferrable
|
24
27
|
|
25
28
|
rescue Errno::EINVAL
|
data/lib/get_them_all/version.rb
CHANGED
data/lib/get_them_all/worker.rb
CHANGED
@@ -4,45 +4,133 @@ module GetThemAll
|
|
4
4
|
# each time an action is put in the queue it will handle it
|
5
5
|
#
|
6
6
|
class Worker
|
7
|
+
include Notifier
|
8
|
+
|
7
9
|
attr_reader :type, :index
|
8
|
-
|
9
|
-
|
10
|
+
|
11
|
+
##
|
12
|
+
# Create a worker.
|
13
|
+
#
|
14
|
+
# @param [String, Symbol] type Name assigned to the worker
|
15
|
+
# the only real use is to identify the worker.
|
16
|
+
# @param [Integer] index additional way to identify the worker.
|
17
|
+
# @param [EM::Queue] queue the queue from which this worker
|
18
|
+
# will take its jobs.
|
19
|
+
# @param [Integer,Array] delay Number of milliseconds between two
|
20
|
+
# actions, if an array is provided the value will be randomized
|
21
|
+
# between the two first values in the array.
|
22
|
+
#
|
23
|
+
def initialize(type, index, queue, delay = 0)
|
10
24
|
@type = type
|
11
25
|
@index = index
|
26
|
+
@delay = delay
|
27
|
+
|
28
|
+
# ensure delay is valid
|
29
|
+
unless @delay.is_a?(Integer) || (@delay.is_a?(Array) && @delay.size >= 2)
|
30
|
+
raise "invalid value for delay: #{@delay}"
|
31
|
+
end
|
12
32
|
|
13
|
-
@downloader = downloader
|
14
33
|
@queue = queue
|
15
|
-
|
34
|
+
@idle = true
|
35
|
+
|
36
|
+
|
37
|
+
@stop_requested = false
|
38
|
+
|
39
|
+
notify('worker.started', self)
|
40
|
+
|
16
41
|
@queue.pop do |action|
|
17
42
|
handle_action(action)
|
18
43
|
end
|
19
44
|
|
20
45
|
end
|
21
|
-
|
46
|
+
|
47
|
+
##
|
48
|
+
# when called the worker will
|
49
|
+
# finish the current job and then stop taking
|
50
|
+
# new jobs.
|
51
|
+
#
|
52
|
+
# if a block is given it will be called when
|
53
|
+
# the worker is no longer taking actions.
|
54
|
+
#
|
55
|
+
def request_stop(&block)
|
56
|
+
@stop_requested = true
|
57
|
+
notify('worker.stop_requested', self)
|
58
|
+
|
59
|
+
if @idle
|
60
|
+
# we are already stopped, just call the block
|
61
|
+
EM::next_tick{ block.call }
|
62
|
+
else
|
63
|
+
@stop_requested_block = block
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def idle?
|
68
|
+
@idle
|
69
|
+
end
|
70
|
+
|
22
71
|
##
|
23
72
|
# Take the next action in queue
|
24
73
|
#
|
25
74
|
def take_next_job
|
26
|
-
|
27
|
-
|
75
|
+
@idle = true
|
76
|
+
|
77
|
+
if @stop_requested
|
78
|
+
# do not take new jobs and call
|
79
|
+
# the passed block if any
|
80
|
+
@stop_requested_block.call if @stop_requested_block
|
81
|
+
notify('worker.stopped', self)
|
82
|
+
else
|
83
|
+
delay = delay_before_next_action()
|
84
|
+
EM::add_timer(delay / 1000) do
|
85
|
+
@queue.pop do |act|
|
86
|
+
handle_action(act)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
28
90
|
end
|
29
91
|
end
|
30
92
|
|
31
93
|
|
32
94
|
def handle_action(action)
|
33
|
-
|
34
|
-
action
|
35
|
-
# in case of failure, try again later (with slightly lower priority)
|
36
|
-
EM::add_timer(50) do
|
37
|
-
@queue.push(action, [action.level - 1, 0].max)
|
38
|
-
end
|
95
|
+
@idle = false
|
96
|
+
@current_action = action
|
39
97
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
98
|
+
# register callbacks
|
99
|
+
action.callback( &method(:action_succeeded) )
|
100
|
+
action.errback( &method(:action_failed) )
|
101
|
+
|
102
|
+
# and start the action
|
44
103
|
action.do_action(self)
|
45
104
|
end
|
46
105
|
|
106
|
+
private
|
107
|
+
def action_failed
|
108
|
+
# in case of failure, try again later (with slightly lower priority)
|
109
|
+
@queue.push(@current_action, [@current_action.level - 1, 0].max)
|
110
|
+
|
111
|
+
# and take the next job
|
112
|
+
take_next_job()
|
113
|
+
end
|
114
|
+
|
115
|
+
def action_succeeded
|
116
|
+
take_next_job()
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
##
|
121
|
+
# Compute the delay before the next action can
|
122
|
+
# take place.
|
123
|
+
#
|
124
|
+
# @return [Integer] Number of milliseconds to wait
|
125
|
+
#
|
126
|
+
def delay_before_next_action
|
127
|
+
case @delay
|
128
|
+
when Integer then @delay
|
129
|
+
when Array then rand(@delay[1] - @delay[0]) + @delay[0]
|
130
|
+
else
|
131
|
+
0
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
47
135
|
end
|
48
136
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: get_them_all
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-10-16 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: thor
|
16
|
-
requirement: &
|
16
|
+
requirement: &70096048287860 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70096048287860
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-http-request
|
27
|
-
requirement: &
|
27
|
+
requirement: &70096048287240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.0.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70096048287240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-priority-queue
|
38
|
-
requirement: &
|
38
|
+
requirement: &70096048286640 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.0.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70096048286640
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: hpricot
|
49
|
-
requirement: &
|
49
|
+
requirement: &70096048286060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.8.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70096048286060
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: i18n
|
60
|
-
requirement: &
|
60
|
+
requirement: &70096048285560 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70096048285560
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: activesupport
|
71
|
-
requirement: &
|
71
|
+
requirement: &70096048284900 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 3.1.0
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70096048284900
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: therubyracer
|
82
|
-
requirement: &
|
82
|
+
requirement: &70096048284280 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 0.9.8
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70096048284280
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: dropbox
|
93
|
-
requirement: &
|
93
|
+
requirement: &70096048283860 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70096048283860
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: girl_friday
|
104
|
-
requirement: &
|
104
|
+
requirement: &70096048283380 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70096048283380
|
113
113
|
description: Mass downloader useable as standalone or as a library
|
114
114
|
email: []
|
115
115
|
executables:
|
@@ -153,7 +153,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
153
153
|
version: '0'
|
154
154
|
segments:
|
155
155
|
- 0
|
156
|
-
hash:
|
156
|
+
hash: 2337480508786894492
|
157
157
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
158
|
none: false
|
159
159
|
requirements:
|
@@ -162,7 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
162
|
version: '0'
|
163
163
|
segments:
|
164
164
|
- 0
|
165
|
-
hash:
|
165
|
+
hash: 2337480508786894492
|
166
166
|
requirements: []
|
167
167
|
rubyforge_project: get_them_all
|
168
168
|
rubygems_version: 1.8.11
|