get_them_all 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/gta +10 -8
- data/lib/get_them_all/action.rb +1 -1
- data/lib/get_them_all/extensions/action_logger.rb +29 -10
- data/lib/get_them_all/site_downloader.rb +101 -34
- data/lib/get_them_all/storage/file_storage.rb +5 -2
- data/lib/get_them_all/version.rb +1 -1
- data/lib/get_them_all/worker.rb +105 -17
- metadata +21 -21
data/bin/gta
CHANGED
@@ -32,14 +32,9 @@ class GtaRunner < Thor
|
|
32
32
|
fail("file #{script_path} should define class #{class_name} !") unless Object.const_defined?( class_name.to_sym )
|
33
33
|
|
34
34
|
info("Started with config file #{File.basename(script_path)}")
|
35
|
-
|
36
|
-
# CTRL+C
|
37
|
-
trap("INT") do
|
38
|
-
EM::stop_event_loop()
|
39
|
-
end
|
40
|
-
|
35
|
+
|
41
36
|
# create the instance (and start download)
|
42
|
-
class_name.constantize.new(
|
37
|
+
crawler = class_name.constantize.new(
|
43
38
|
:storage => {
|
44
39
|
:type => 'file',
|
45
40
|
:params => {
|
@@ -47,7 +42,14 @@ class GtaRunner < Thor
|
|
47
42
|
}
|
48
43
|
},
|
49
44
|
:extensions => [GetThemAll::ActionLogger.new]
|
50
|
-
)
|
45
|
+
)
|
46
|
+
|
47
|
+
# CTRL+C
|
48
|
+
trap("INT") do
|
49
|
+
crawler.stop()
|
50
|
+
end
|
51
|
+
|
52
|
+
crawler.start()
|
51
53
|
end
|
52
54
|
end
|
53
55
|
|
data/lib/get_them_all/action.rb
CHANGED
@@ -6,11 +6,33 @@ module GetThemAll
|
|
6
6
|
#
|
7
7
|
class ActionLogger < Extension
|
8
8
|
def initialize
|
9
|
+
|
10
|
+
#
|
11
|
+
# reactor events
|
12
|
+
#
|
9
13
|
register_handler('downloader.started') do |name, downloader|
|
10
14
|
@skipped_files = 0
|
11
15
|
@download_files = 0
|
12
16
|
end
|
13
|
-
|
17
|
+
|
18
|
+
register_handler('downloader.stopping') do |name, downloader|
|
19
|
+
log("Engine stopping...")
|
20
|
+
end
|
21
|
+
|
22
|
+
register_handler('downloader.stopped') do |name, downloader|
|
23
|
+
log("Engine stopped")
|
24
|
+
end
|
25
|
+
|
26
|
+
register_handler('downloader.completed') do |name, worker, downloader|
|
27
|
+
log ""
|
28
|
+
log "Downloaded #{@download_files} files"
|
29
|
+
log "Skipped: #{@skipped_files}"
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
#
|
34
|
+
# examine events
|
35
|
+
#
|
14
36
|
register_handler('action.examine.started') do |name, worker, action|
|
15
37
|
log("Examining[#{action.level}] #{action.url}")
|
16
38
|
end
|
@@ -23,8 +45,11 @@ module GetThemAll
|
|
23
45
|
register_handler('action.examine.success') do |name, worker, action|
|
24
46
|
# do nothing
|
25
47
|
end
|
26
|
-
|
27
|
-
|
48
|
+
|
49
|
+
|
50
|
+
#
|
51
|
+
# download events
|
52
|
+
#
|
28
53
|
register_handler('action.download.started') do |name, worker, action|
|
29
54
|
log("Downloading #{action.url}")
|
30
55
|
end
|
@@ -41,13 +66,7 @@ module GetThemAll
|
|
41
66
|
@download_files += 1
|
42
67
|
log("File downloaded: #{destpath}")
|
43
68
|
end
|
44
|
-
|
45
|
-
register_handler('downloader.completed') do |name, worker, downloader|
|
46
|
-
log ""
|
47
|
-
log "Downloaded #{@download_files} files"
|
48
|
-
log "Skipped: #{@skipped_files}"
|
49
|
-
end
|
50
|
-
|
69
|
+
|
51
70
|
end
|
52
71
|
|
53
72
|
def log(str)
|
@@ -1,4 +1,6 @@
|
|
1
1
|
|
2
|
+
require 'fiber'
|
3
|
+
|
2
4
|
require 'addressable/uri'
|
3
5
|
require 'active_support/hash_with_indifferent_access'
|
4
6
|
|
@@ -14,9 +16,20 @@ module GetThemAll
|
|
14
16
|
class SiteDownloader
|
15
17
|
include Notifier
|
16
18
|
|
19
|
+
# number of workers for each task
|
17
20
|
class_attribute :examiners_count, :downloaders_count
|
18
|
-
|
19
|
-
|
21
|
+
|
22
|
+
# delay between each action for one worker
|
23
|
+
class_attribute :examiners_delay, :downloaders_delay
|
24
|
+
|
25
|
+
|
26
|
+
self.examiners_count = 1
|
27
|
+
self.downloaders_count = 1
|
28
|
+
|
29
|
+
# default: 100 to 200ms between actions
|
30
|
+
self.downloaders_delay = [100, 200]
|
31
|
+
self.examiners_delay = [100, 200]
|
32
|
+
|
20
33
|
##
|
21
34
|
# Determine what will be stored in the history file,
|
22
35
|
# the default is to store the last url before the download
|
@@ -28,9 +41,6 @@ module GetThemAll
|
|
28
41
|
#
|
29
42
|
class_attribute :history_tracking
|
30
43
|
|
31
|
-
self.examiners_count = 1
|
32
|
-
self.downloaders_count = 1
|
33
|
-
|
34
44
|
self.history_tracking = :default
|
35
45
|
|
36
46
|
attr_reader :base_url, :storage, :history
|
@@ -62,6 +72,7 @@ module GetThemAll
|
|
62
72
|
@base_url= args.delete(:base_url)
|
63
73
|
@start_url = args.delete(:start_url) || '/'
|
64
74
|
@folder_name= args.delete(:folder_name)
|
75
|
+
@login_request = args.delete(:login_request)
|
65
76
|
|
66
77
|
# keep a pointer to each extension
|
67
78
|
@extensions = args.delete(:extensions) || [ActionLogger]
|
@@ -105,10 +116,10 @@ module GetThemAll
|
|
105
116
|
notify('downloader.started', self)
|
106
117
|
|
107
118
|
EM::run do
|
108
|
-
EM::add_periodic_timer(
|
109
|
-
if
|
110
|
-
|
111
|
-
|
119
|
+
@exit_timer = EM::add_periodic_timer(2) do
|
120
|
+
# if all workers are idle
|
121
|
+
if @examiners.all?(&:idle?) && @downloaders.all?(&:idle?)
|
122
|
+
self.stop()
|
112
123
|
end
|
113
124
|
end
|
114
125
|
|
@@ -122,37 +133,93 @@ module GetThemAll
|
|
122
133
|
end
|
123
134
|
end
|
124
135
|
end
|
125
|
-
|
126
136
|
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
)
|
137
|
+
# authenticate connection if required
|
138
|
+
if @login_request
|
139
|
+
open_url(*@login_request) do |req, doc|
|
140
|
+
after_login()
|
141
|
+
end
|
142
|
+
else
|
143
|
+
after_login()
|
144
|
+
end
|
134
145
|
|
146
|
+
end
|
135
147
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
148
|
+
notify('downloader.completed', self)
|
149
|
+
end
|
150
|
+
|
151
|
+
def after_login
|
152
|
+
# queue the first action to start crawling
|
153
|
+
#
|
154
|
+
@examine_queue.push(ExamineAction.new(self,
|
155
|
+
:url => @start_url,
|
156
|
+
:destination_folder => '/',
|
157
|
+
:level => 0,
|
158
|
+
), 0)
|
146
159
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
160
|
+
|
161
|
+
# now that actions are queued, start handling them
|
162
|
+
# start each "worker"
|
163
|
+
# dequeuing is priority based, the download actions
|
164
|
+
# first and then the higher the level the higher the
|
165
|
+
# priority for examine actions, this is done this way
|
166
|
+
# to give work to the download workers asap.
|
167
|
+
#
|
168
|
+
|
169
|
+
@examiners = []
|
170
|
+
@downloaders = []
|
171
|
+
|
172
|
+
1.upto(self.class.examiners_count) do |n|
|
173
|
+
@examiners << Worker.new(:examiner, n - 1, @examine_queue, self.class.examiners_delay)
|
151
174
|
end
|
152
|
-
|
153
|
-
save_history()
|
154
175
|
|
155
|
-
|
176
|
+
1.upto(self.class.downloaders_count) do |n|
|
177
|
+
@downloaders << Worker.new(:downloader, n - 1, @download_queue, self.class.downloaders_delay)
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
##
|
182
|
+
# Cleanly stop the engine and ensure the history file is
|
183
|
+
# written.
|
184
|
+
#
|
185
|
+
def stop
|
186
|
+
return if @stopping
|
187
|
+
|
188
|
+
# first stop the exit timer, no longer needed once we are here
|
189
|
+
@exit_timer.cancel()
|
190
|
+
@stopping = true
|
191
|
+
|
192
|
+
Fiber.new do
|
193
|
+
fiber = Fiber.current
|
194
|
+
|
195
|
+
notify('downloader.stopping', self)
|
196
|
+
|
197
|
+
# first ask every worker to stop its work
|
198
|
+
# starting with examiners
|
199
|
+
@examiners.each do |worker|
|
200
|
+
debug "Stopping Examiner #{worker.index}..."
|
201
|
+
worker.request_stop { fiber.resume }
|
202
|
+
Fiber.yield
|
203
|
+
debug "Stopped Examiner #{worker.index}"
|
204
|
+
end
|
205
|
+
|
206
|
+
@downloaders.each do |worker|
|
207
|
+
debug "Stopping Downloader #{worker.index}..."
|
208
|
+
worker.request_stop { fiber.resume }
|
209
|
+
Fiber.yield
|
210
|
+
debug "Stopped Downloader #{worker.index}"
|
211
|
+
end
|
212
|
+
|
213
|
+
# now that every worker is stopped, write the history
|
214
|
+
deferrable = save_history()
|
215
|
+
deferrable.callback{ fiber.resume }
|
216
|
+
Fiber.yield
|
217
|
+
|
218
|
+
notify('downloader.stopped', self)
|
219
|
+
|
220
|
+
# and stop the reactor
|
221
|
+
EM::stop_event_loop()
|
222
|
+
end.resume
|
156
223
|
end
|
157
224
|
|
158
225
|
class AssertionFailed < RuntimeError; end
|
@@ -18,8 +18,11 @@ module GetThemAll
|
|
18
18
|
open(destpath, "wb") do |f|
|
19
19
|
f.write( data )
|
20
20
|
end
|
21
|
-
|
22
|
-
|
21
|
+
|
22
|
+
# allow time for the caller to schedule
|
23
|
+
# callbacks on the deferrable
|
24
|
+
|
25
|
+
EM::next_tick{ deferrable.succeed }
|
23
26
|
deferrable
|
24
27
|
|
25
28
|
rescue Errno::EINVAL
|
data/lib/get_them_all/version.rb
CHANGED
data/lib/get_them_all/worker.rb
CHANGED
@@ -4,45 +4,133 @@ module GetThemAll
|
|
4
4
|
# each time an action is put in the queue it will handle it
|
5
5
|
#
|
6
6
|
class Worker
|
7
|
+
include Notifier
|
8
|
+
|
7
9
|
attr_reader :type, :index
|
8
|
-
|
9
|
-
|
10
|
+
|
11
|
+
##
|
12
|
+
# Create a worker.
|
13
|
+
#
|
14
|
+
# @param [String,Symbol] type Name assigned to the worker
|
15
|
+
# the only real use is to identify the worker.
|
16
|
+
# @param [Integer] index additional way to identify the worker.
|
17
|
+
# @param [EM::Queue] queue the queue from which this worker
|
18
|
+
# will take its jobs.
|
19
|
+
# @param [Integer,Array] delay Number of milliseconds between two
|
20
|
+
# actions, if an array is provided the value will be randomized
|
21
|
+
# between the two first values in the array.
|
22
|
+
#
|
23
|
+
def initialize(type, index, queue, delay = 0)
|
10
24
|
@type = type
|
11
25
|
@index = index
|
26
|
+
@delay = delay
|
27
|
+
|
28
|
+
# ensure delay is valid
|
29
|
+
unless @delay.is_a?(Integer) || (@delay.is_a?(Array) && @delay.size >= 2)
|
30
|
+
raise "invalid value for delay: #{@delay}"
|
31
|
+
end
|
12
32
|
|
13
|
-
@downloader = downloader
|
14
33
|
@queue = queue
|
15
|
-
|
34
|
+
@idle = true
|
35
|
+
|
36
|
+
|
37
|
+
@stop_requested = false
|
38
|
+
|
39
|
+
notify('worker.started', self)
|
40
|
+
|
16
41
|
@queue.pop do |action|
|
17
42
|
handle_action(action)
|
18
43
|
end
|
19
44
|
|
20
45
|
end
|
21
|
-
|
46
|
+
|
47
|
+
##
|
48
|
+
# when called the worker will
|
49
|
+
# finish the current job and then stop taking
|
50
|
+
# new jobs.
|
51
|
+
#
|
52
|
+
# if a block is given it will be called when
|
53
|
+
# the worker is no longer taking actions.
|
54
|
+
#
|
55
|
+
def request_stop(&block)
|
56
|
+
@stop_requested = true
|
57
|
+
notify('worker.stop_requested', self)
|
58
|
+
|
59
|
+
if @idle
|
60
|
+
# we are already stopped, just call the block
|
61
|
+
EM::next_tick{ block.call }
|
62
|
+
else
|
63
|
+
@stop_requested_block = block
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def idle?
|
68
|
+
@idle
|
69
|
+
end
|
70
|
+
|
22
71
|
##
|
23
72
|
# Take the next action in queue
|
24
73
|
#
|
25
74
|
def take_next_job
|
26
|
-
|
27
|
-
|
75
|
+
@idle = true
|
76
|
+
|
77
|
+
if @stop_requested
|
78
|
+
# do not take new jobs and call
|
79
|
+
# the passed block if any
|
80
|
+
@stop_requested_block.call if @stop_requested_block
|
81
|
+
notify('worker.stopped', self)
|
82
|
+
else
|
83
|
+
delay = delay_before_next_action()
|
84
|
+
EM::add_timer(delay / 1000) do
|
85
|
+
@queue.pop do |act|
|
86
|
+
handle_action(act)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
28
90
|
end
|
29
91
|
end
|
30
92
|
|
31
93
|
|
32
94
|
def handle_action(action)
|
33
|
-
|
34
|
-
action
|
35
|
-
# in case of failure, try again later (with slightly lower priority)
|
36
|
-
EM::add_timer(50) do
|
37
|
-
@queue.push(action, [action.level - 1, 0].max)
|
38
|
-
end
|
95
|
+
@idle = false
|
96
|
+
@current_action = action
|
39
97
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
98
|
+
# register callbacks
|
99
|
+
action.callback( &method(:action_succeeded) )
|
100
|
+
action.errback( &method(:action_failed) )
|
101
|
+
|
102
|
+
# and start the action
|
44
103
|
action.do_action(self)
|
45
104
|
end
|
46
105
|
|
106
|
+
private
|
107
|
+
def action_failed
|
108
|
+
# in case of failure, try again later (with slightly lower priority)
|
109
|
+
@queue.push(@current_action, [@current_action.level - 1, 0].max)
|
110
|
+
|
111
|
+
# and take the next job
|
112
|
+
take_next_job()
|
113
|
+
end
|
114
|
+
|
115
|
+
def action_succeeded
|
116
|
+
take_next_job()
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
##
|
121
|
+
# Compute the delay before the next action can
|
122
|
+
# take place.
|
123
|
+
#
|
124
|
+
# @return [Integer] Number of milliseconds to wait
|
125
|
+
#
|
126
|
+
def delay_before_next_action
|
127
|
+
case @delay
|
128
|
+
when Integer then @delay
|
129
|
+
when Array then rand(@delay[1] - @delay[0]) + @delay[0]
|
130
|
+
else
|
131
|
+
0
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
47
135
|
end
|
48
136
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: get_them_all
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-10-16 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: thor
|
16
|
-
requirement: &
|
16
|
+
requirement: &70096048287860 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70096048287860
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: em-http-request
|
27
|
-
requirement: &
|
27
|
+
requirement: &70096048287240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.0.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70096048287240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: em-priority-queue
|
38
|
-
requirement: &
|
38
|
+
requirement: &70096048286640 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.0.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70096048286640
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: hpricot
|
49
|
-
requirement: &
|
49
|
+
requirement: &70096048286060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.8.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70096048286060
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: i18n
|
60
|
-
requirement: &
|
60
|
+
requirement: &70096048285560 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70096048285560
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: activesupport
|
71
|
-
requirement: &
|
71
|
+
requirement: &70096048284900 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 3.1.0
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70096048284900
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: therubyracer
|
82
|
-
requirement: &
|
82
|
+
requirement: &70096048284280 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 0.9.8
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70096048284280
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: dropbox
|
93
|
-
requirement: &
|
93
|
+
requirement: &70096048283860 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70096048283860
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: girl_friday
|
104
|
-
requirement: &
|
104
|
+
requirement: &70096048283380 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70096048283380
|
113
113
|
description: Mass downloader useable as standalone or as a library
|
114
114
|
email: []
|
115
115
|
executables:
|
@@ -153,7 +153,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
153
153
|
version: '0'
|
154
154
|
segments:
|
155
155
|
- 0
|
156
|
-
hash:
|
156
|
+
hash: 2337480508786894492
|
157
157
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
158
|
none: false
|
159
159
|
requirements:
|
@@ -162,7 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
162
162
|
version: '0'
|
163
163
|
segments:
|
164
164
|
- 0
|
165
|
-
hash:
|
165
|
+
hash: 2337480508786894492
|
166
166
|
requirements: []
|
167
167
|
rubyforge_project: get_them_all
|
168
168
|
rubygems_version: 1.8.11
|