skynet 0.9.2 → 0.9.3
- data/History.txt +49 -0
- data/Manifest.txt +84 -6
- data/README.txt +75 -64
- data/app_generators/skynet_install/skynet_install_generator.rb +14 -8
- data/app_generators/skynet_install/templates/migration.rb +1 -24
- data/app_generators/skynet_install/templates/skynet_config.rb +50 -0
- data/app_generators/skynet_install/templates/skynet_initializer.rb +1 -0
- data/app_generators/skynet_install/templates/{skynet_schema.sql → skynet_mysql_schema.sql} +1 -24
- data/bin/skynet +37 -10
- data/bin/skynet_install +5 -5
- data/bin/skynet_tuplespace_server +27 -19
- data/examples/dgrep/README +70 -0
- data/examples/dgrep/config/skynet_config.rb +26 -0
- data/examples/dgrep/data/shakespeare/README +2 -0
- data/examples/dgrep/data/shakespeare/poetry/loverscomplaint +381 -0
- data/examples/dgrep/data/shakespeare/poetry/rapeoflucrece +2199 -0
- data/examples/dgrep/data/shakespeare/poetry/sonnets +2633 -0
- data/examples/dgrep/data/shakespeare/poetry/various +640 -0
- data/examples/dgrep/data/shakespeare/poetry/venusandadonis +1423 -0
- data/examples/dgrep/data/testfile1.txt +1 -0
- data/examples/dgrep/data/testfile2.txt +1 -0
- data/examples/dgrep/data/testfile3.txt +1 -0
- data/examples/dgrep/data/testfile4.txt +1 -0
- data/examples/dgrep/lib/dgrep.rb +59 -0
- data/examples/dgrep/lib/mapreduce_test.rb +32 -0
- data/examples/dgrep/lib/most_common_words.rb +45 -0
- data/examples/dgrep/script/dgrep +75 -0
- data/examples/rails_mysql_example/README +66 -0
- data/examples/rails_mysql_example/Rakefile +10 -0
- data/examples/rails_mysql_example/app/controllers/application.rb +10 -0
- data/examples/rails_mysql_example/app/helpers/application_helper.rb +3 -0
- data/examples/rails_mysql_example/app/models/user.rb +21 -0
- data/examples/rails_mysql_example/app/models/user_favorite.rb +5 -0
- data/examples/rails_mysql_example/app/models/user_mailer.rb +12 -0
- data/examples/rails_mysql_example/app/views/user_mailer/welcome.erb +5 -0
- data/examples/rails_mysql_example/config/boot.rb +109 -0
- data/examples/rails_mysql_example/config/database.yml +42 -0
- data/examples/rails_mysql_example/config/environment.rb +59 -0
- data/examples/rails_mysql_example/config/environments/development.rb +18 -0
- data/examples/rails_mysql_example/config/environments/production.rb +19 -0
- data/examples/rails_mysql_example/config/environments/test.rb +22 -0
- data/examples/rails_mysql_example/config/initializers/inflections.rb +10 -0
- data/examples/rails_mysql_example/config/initializers/mime_types.rb +5 -0
- data/examples/rails_mysql_example/config/initializers/skynet.rb +1 -0
- data/examples/rails_mysql_example/config/routes.rb +35 -0
- data/examples/rails_mysql_example/config/skynet_config.rb +36 -0
- data/examples/rails_mysql_example/db/migrate/001_create_skynet_tables.rb +43 -0
- data/examples/rails_mysql_example/db/migrate/002_create_users.rb +16 -0
- data/examples/rails_mysql_example/db/migrate/003_create_user_favorites.rb +14 -0
- data/examples/rails_mysql_example/db/schema.rb +85 -0
- data/examples/rails_mysql_example/db/skynet_mysql_schema.sql +33 -0
- data/examples/rails_mysql_example/doc/README_FOR_APP +2 -0
- data/examples/rails_mysql_example/lib/tasks/rails_mysql_example.rake +20 -0
- data/examples/rails_mysql_example/public/.htaccess +40 -0
- data/examples/rails_mysql_example/public/404.html +30 -0
- data/examples/rails_mysql_example/public/422.html +30 -0
- data/examples/rails_mysql_example/public/500.html +30 -0
- data/examples/rails_mysql_example/public/dispatch.cgi +10 -0
- data/examples/rails_mysql_example/public/dispatch.fcgi +24 -0
- data/examples/rails_mysql_example/public/dispatch.rb +10 -0
- data/{log/debug.log → examples/rails_mysql_example/public/favicon.ico} +0 -0
- data/examples/rails_mysql_example/public/images/rails.png +0 -0
- data/examples/rails_mysql_example/public/index.html +277 -0
- data/examples/rails_mysql_example/public/javascripts/application.js +2 -0
- data/examples/rails_mysql_example/public/javascripts/controls.js +963 -0
- data/examples/rails_mysql_example/public/javascripts/dragdrop.js +972 -0
- data/examples/rails_mysql_example/public/javascripts/effects.js +1120 -0
- data/examples/rails_mysql_example/public/javascripts/prototype.js +4225 -0
- data/examples/rails_mysql_example/public/robots.txt +5 -0
- data/examples/rails_mysql_example/script/about +3 -0
- data/examples/rails_mysql_example/script/console +3 -0
- data/examples/rails_mysql_example/script/destroy +3 -0
- data/examples/rails_mysql_example/script/generate +3 -0
- data/examples/rails_mysql_example/script/performance/benchmarker +3 -0
- data/examples/rails_mysql_example/script/performance/profiler +3 -0
- data/examples/rails_mysql_example/script/performance/request +3 -0
- data/examples/rails_mysql_example/script/plugin +3 -0
- data/examples/rails_mysql_example/script/process/inspector +3 -0
- data/examples/rails_mysql_example/script/process/reaper +3 -0
- data/examples/rails_mysql_example/script/process/spawner +3 -0
- data/examples/rails_mysql_example/script/runner +3 -0
- data/examples/rails_mysql_example/script/server +3 -0
- data/examples/rails_mysql_example/test/fixtures/user_favorites.yml +9 -0
- data/examples/rails_mysql_example/test/fixtures/users.yml +11 -0
- data/examples/rails_mysql_example/test/test_helper.rb +38 -0
- data/examples/rails_mysql_example/test/unit/user_favorite_test.rb +8 -0
- data/examples/rails_mysql_example/test/unit/user_test.rb +8 -0
- data/extras/README +7 -0
- data/extras/init.d/skynet +87 -0
- data/extras/nagios/check_skynet.sh +121 -0
- data/extras/rails/controllers/skynet_controller.rb +43 -0
- data/extras/rails/views/skynet/index.rhtml +137 -0
- data/lib/skynet.rb +59 -1
- data/lib/skynet/mapreduce_helper.rb +2 -2
- data/lib/skynet/mapreduce_test.rb +32 -1
- data/lib/skynet/message_queue_adapters/mysql.rb +422 -539
- data/lib/skynet/message_queue_adapters/tuple_space.rb +45 -71
- data/lib/skynet/skynet_active_record_extensions.rb +22 -11
- data/lib/skynet/skynet_config.rb +54 -20
- data/lib/skynet/skynet_console.rb +4 -1
- data/lib/skynet/skynet_console_helper.rb +5 -1
- data/lib/skynet/skynet_debugger.rb +58 -4
- data/lib/skynet/skynet_job.rb +61 -24
- data/lib/skynet/skynet_launcher.rb +29 -3
- data/lib/skynet/skynet_logger.rb +11 -1
- data/lib/skynet/skynet_manager.rb +403 -240
- data/lib/skynet/skynet_message.rb +1 -3
- data/lib/skynet/skynet_message_queue.rb +42 -19
- data/lib/skynet/skynet_partitioners.rb +19 -15
- data/lib/skynet/skynet_ruby_extensions.rb +18 -0
- data/lib/skynet/skynet_tuplespace_server.rb +17 -14
- data/lib/skynet/skynet_worker.rb +132 -98
- data/lib/skynet/version.rb +1 -1
- data/script/destroy +0 -0
- data/script/generate +0 -0
- data/script/txt2html +0 -0
- data/test/test_helper.rb +2 -0
- data/test/test_skynet.rb +13 -5
- data/test/test_skynet_manager.rb +24 -9
- data/test/test_skynet_task.rb +1 -1
- data/website/index.html +77 -29
- data/website/index.txt +53 -24
- data/website/stylesheets/screen.css +12 -12
- metadata +156 -66
- data/app_generators/skynet_install/templates/skynet +0 -46
- data/log/skynet.log +0 -29
- data/log/skynet_tuplespace_server.log +0 -7
- data/log/skynet_worker.pid +0 -1
data/lib/skynet/skynet_console.rb CHANGED
@@ -30,7 +30,10 @@ class Skynet
     end
 
     IRB.setup(Skynet::CONFIG[:LAUNCHER_PATH])
-    IRB.conf[:
+    IRB.conf[:PROMPT][:SKYNET] = IRB.conf[:PROMPT][:SIMPLE].dup
+    IRB.conf[:PROMPT][:SKYNET][:PROMPT_I] = "skynet>>"
+
+    IRB.conf[:PROMPT_MODE] = :SKYNET
     irb = IRB::Irb.new()
     IRB.conf[:MAIN_CONTEXT] = irb.context
     irb.context.workspace.main.extend Skynet::ConsoleHelper
data/lib/skynet/skynet_console_helper.rb CHANGED
@@ -1,6 +1,10 @@
 module Skynet::ConsoleHelper
   # All of these commands can be run at the 'skynet console'.
 
+  def log
+    Skynet::Logger.get
+  end
+
   def mq
     @mq ||= Skynet::MessageQueue.new
   end
@@ -22,7 +26,7 @@ module Skynet::ConsoleHelper
   end
 
   def manager
-    @manager ||=
+    @manager ||= Skynet::Manager.get
   end
 
   def add_lib(lib)
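The new log helper and the Skynet::Manager.get-backed manager accessor above are meant for interactive use. A minimal sketch of a skynet console session using only the helpers visible in this diff (output omitted; setting the level assumes Skynet::Logger behaves like Ruby's standard Logger):

    skynet>> log.level = Logger::INFO   # log returns Skynet::Logger.get
    skynet>> mq                         # lazily built Skynet::MessageQueue
    skynet>> manager.ping               # DRb handle returned by Skynet::Manager.get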
data/lib/skynet/skynet_debugger.rb CHANGED
@@ -1,3 +1,36 @@
+# ==SkynetDebugger
+# The SkynetDebugger is a module you can include in any of your classes that will give you easy access
+# to the Skynet::Logger. including SkynetDebugger gives you a number of logging methods. Each logging method
+# lets you pass a message as well as an optional number of objects which will be pretty_printed after your message.
+# Log lines print with their log level, PID, time, class, and message. eg.
+#
+#   [WARN] #78002 2008-04-11 14:17:15.363167 <WORKER-78002> Exiting...
+#
+# You can set the log_level and log_file with (See Skynet::Config)
+#   Skynet::CONFIG[:SKYNET_LOG_FILE]
+#   Skynet::CONFIG[:SKYNET_LOG_LEVEL]
+#
+# Possible log levels include
+#   Logger::DEBUG
+#   Logger::INFO
+#   Logger::WARN
+#   Logger::ERROR
+#   Logger::FATAL
+#
+# ==Methods
+# log - returns the Skynet::Logger
+#
+# debug(msg,*objects_to_inspect)
+#
+# info(msg,*objects_to_inspect)
+#
+# warn(msg,*objects_to_inspect)
+#
+# error(msg,*objects_to_inspect)
+#
+# fatal(msg,*objects_to_inspect)
+#
+# printlog(msg,*objects_to_inspect) #printlog will ALWAYS print to the log as log level [LOG] regardless of the LOG_LEVEL
 module SkynetDebugger
 
   def self.included(base)
@@ -12,22 +45,18 @@ module SkynetDebugger
     self.class.args_pp(*args)
   end
 
-
   def debug(*args)
     self.class.debug(*args)
   end
 
-
   def info(*args)
     self.class.info(*args)
   end
 
-
   def warn(*args)
     self.class.warn(*args)
   end
 
-
   def error(*args)
     self.class.error(*args)
   end
@@ -35,11 +64,23 @@ module SkynetDebugger
   def fatal(*args)
     self.class.fatal(*args)
   end
+
+  def printlog(*args)
+    self.class.printlog(*args)
+  end
 
   def debug_header
     self.class.debug_header
   end
 
+  def stderr(*args)
+    self.class.stderr
+  end
+
+  def stdout(*args)
+    self.class.stdout
+  end
+
   module ClassMethods
 
     def debug_class_desc
@@ -51,6 +92,7 @@ module SkynetDebugger
       "##{$$} #{t.strftime("%Y-%m-%d %H:%M:%S")}.#{t.usec} <#{debug_class_desc}>"
     end
 
+    # log
     def log
       Skynet::Logger.get
     end
@@ -78,6 +120,18 @@ module SkynetDebugger
     def fatal(msg, *args)
       log.fatal "[FATAL] #{debug_header} #{msg} #{args_pp(*args)}"
     end
+
+    def printlog(msg, *args)
+      log.unknown "[LOG] #{debug_header} #{msg} #{args_pp(*args)}"
+    end
+
+    def stderr(msg, *args)
+      $stderr.puts "#{debug_header} #{msg} #{args_pp(*args)}"
+    end
+
+    def stdout(msg, *args)
+      $stdout.puts "#{debug_header} #{msg} #{args_pp(*args)}"
+    end
 
   end
 
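As a quick illustration of the documentation block added above, this is roughly how a class picks up the new helpers, including printlog, which always writes to the log; the class name and messages here are hypothetical:

    class InventoryCruncher
      include SkynetDebugger      # adds log, debug, info, warn, error, fatal, printlog, stderr, stdout

      def self.debug_class_desc
        "INVENTORY"               # appears in the <...> portion of each log line
      end

      def crunch(items)
        info "crunching", items   # objects passed after the message are pretty_printed
        printlog "always written, regardless of Skynet::CONFIG[:SKYNET_LOG_LEVEL]"
      end
    end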
data/lib/skynet/skynet_job.rb CHANGED
@@ -9,8 +9,10 @@ class Skynet
   # There are also many global configuration options which can be controlled through Skynet::CONFIG
   #
   # Example Usage:
+  # Create a file called mapreduce_test.rb with the following.
   #
-  # class
+  #   class MapreduceTest
+  #     include SkynetDebugger   ## This gives you logging methods such as log, error, info, fatal
   #
   #    def self.run
   #      job = Skynet::Job.new(
@@ -41,17 +43,28 @@ class Skynet
   #      end
   #    end
   #
-  #  MapReduceTest.run
   #
-  # You
-  #
-  #
-  #
-  #
-  #
-  #
+  # You need to make sure Skynet is running with your class loaded. That's is how Skynet works.
+  # Since there is no easy way to actually pass code around the network, each skynet worker must
+  # already have your code loaded. If you have skynet started, stop it and then start it with the -r flag
+  # to tell it where to find your class it should require.
+  #   $ skynet -r mapreduce_test.rb
+  # Then go into the skynet console to test running your map reduce task.
+  #   $ skynet console -r mapreduce_test.rb
+  #   skynet>> MapreduceTest.run    # returns {2=>2, 3=>1}
   #
-  #
+  # In the example above, you might notice that self.map and self.reduce both accept Arrays.
+  # If you do not want to deal with getting arrays of map_data or reduce_data, you can include MapreduceHelper
+  # into your class and then implement self.map_each and self.reduce_each methods.
+  # The included self.map and self.reduce methods will handle iterating over the map_data and reduce_data,
+  # passing each element to your map_each and reduce_each methods respectively. They will also handle error
+  # handling within that loop to make sure even if a single map or reduce fails, processing will continue.
+  # If you do not want processing to continue if a map fails, do not use the MapreduceHelper mixin.
+  #
+  # Since Skynet must have your code, you will probably want to install skynet into the application
+  # that skynet needs access to in order to run your jobs. See bin/skynet_install[link:files/bin/skynet_install.html] for more info.
+  #
+  # See new for the many other options to control various Skynet::Job settings.
   class Job
     include SkynetDebugger
     include Skynet::GuidGenerator
@@ -70,7 +83,7 @@ class Skynet
       :map, :map_partitioner, :reduce, :reduce_partition, :map_reduce_class,
       :master_retry, :map_retry, :reduce_retry,
       :keep_map_tasks, :keep_reduce_tasks,
-      :local_master, :async
+      :local_master, :async, :data_debug
     ]
 
     FIELDS.each do |method|
@@ -83,7 +96,7 @@ class Skynet
       end
     end
 
-    attr_accessor :use_local_queue
+    attr_accessor :use_local_queue, :data_debug
 
     Skynet::CONFIG[:JOB_DEFAULTS] = {
       :queue_id => 0,
@@ -220,7 +233,7 @@ class Skynet
     # If a number is provided, the master will run the reduce_tasks locally if there are
     # LESS THAN OR EQUAL TO the number provided.
     # You may also set Skynet::CONFIG[:DEFAILT_REDUCVE_MAP_TASKS] DEFAULT 1
-    def initialize(options = {})
+    def initialize(options = {})
       FIELDS.each do |field|
         if options.has_key?(field)
           self.send("#{field}=".to_sym,options[field])
@@ -314,7 +327,10 @@ class Skynet
       self.use_local_queue = map_local?
       if map_tasks
         number_of_tasks = 0
-        map_tasks.
+        size = map_tasks.size - 1
+        printlog "MESSAGES TO MAP ENQUEUE #{size}" if data_debug?
+        map_tasks.each_with_index do |task,ii|
+          printlog "#{size - ii} MAP TASKS LEFT TO ENQUEUE" if data_debug?
           number_of_tasks += 1
           enqueue_messages(tasks_to_messages(task))
         end
@@ -332,8 +348,9 @@ class Skynet
     end
 
     def partition_data(post_map_data)
-
+      info "RUN REDUCE 3.1 BEFORE PARTITION #{display_info} reducers: #{reducers}"
       debug "RUN REDUCE 3.1 : #{reducers} #{name}, job_id:#{job_id}", post_map_data
+      printlog "RUN REDUCE 3.1 : #{reducers} #{name}, job_id:#{job_id}", post_map_data if data_debug?
       return unless post_map_data
       partitioned_data = nil
       if not @reduce_partition
@@ -353,19 +370,23 @@ class Skynet
         partitioned_data = @reduce_partition.call(post_map_data, reducers)
       end
       partitioned_data.compact! if partitioned_data
-
+      info "RUN REDUCE 3.2 AFTER PARTITION #{display_info} reducers: #{reducers}"
       debug "RUN REDUCE 3.2 AFTER PARTITION #{display_info} data:", partitioned_data if partitioned_data
+      printlog "RUN REDUCE 3.2 AFTER PARTITION #{display_info} data:", partitioned_data if data_debug?
       partitioned_data
     end
 
     def reduce_enqueue(partitioned_data)
       return partitioned_data unless @reduce and reducers and reducers > 0
       debug "RUN REDUCE 3.3 CREATED REDUCE TASKS #{display_info}", partitioned_data
+      size = partitioned_data.size
+      printlog "REDUCE MESSAGES TO ENQUEUE #{size}" if data_debug?
 
       reduce_tasks = self.reduce_tasks(partitioned_data)
       self.use_local_queue = reduce_local?(reduce_tasks)
       number_of_tasks = 0
-      reduce_tasks.
+      reduce_tasks.each_with_index do |task,ii|
+        printlog "#{size - ii} REDUCE TASKS LEFT TO ENQUEUE" if data_debug?
         number_of_tasks += 1
         enqueue_messages(tasks_to_messages(task))
       end
@@ -374,20 +395,23 @@ class Skynet
 
     def reduce_results(number_of_tasks)
       results = gather_results(number_of_tasks, reduce_timeout, reduce_name)
-
-
-
-      results
-
-
+      printlog "REDUCE RESULTS", results if data_debug?
+      if results.is_a?(Array) and results.first.is_a?(Array)
+        final = []
+        results.each do |result|
+          final += result
+        end
+        results = final
       end
       debug "RUN REDUCE 3.4 AFTER REDUCE #{display_info} results size: #{results ? results.size : ''}"
       debug "RUN REDUCE 3.4 AFTER REDUCE #{display_info} results:", results if results
+      printlog "POST REDUCE RESULTS", results if data_debug?
       return results
     end
 
     def enqueue_messages(messages)
-      messages.
+      size = messages.size
+      messages.each_with_index do |message,ii|
         timeout = message.expiry || 5
         debug "RUN TASKS SUBMITTING #{message.name} job_id: #{job_id} #{message.payload.is_a?(Skynet::Task) ? 'task' + message.payload.task_id.to_s : ''}"
         debug "RUN TASKS WORKER MESSAGE #{message.name} job_id: #{job_id}", message.to_a
@@ -395,6 +419,14 @@ class Skynet
       end
     end
 
+    # Given a job_id, returns the results from the message queue. Used to retrieve results of asyncronous jobs.
+    def self.results_by_job_id(job_id,timeout=2)
+      result_message = mq.take_result(job_id,timeout)
+      result = result_message.payload
+      return nil unless result
+      return result
+    end
+
     def gather_results(number_of_tasks, timeout=nil, description=nil)
       debug "GATHER RESULTS job_id: #{job_id} - NOT AN ASYNC JOB"
       results = {}
@@ -415,6 +447,7 @@ class Skynet
           debug "RESULT returned TASKID: #{result_message.task_id} #{results[result_message.task_id].inspect}"
         end
         debug "RESULT collected: #{(results.keys + errors.keys).size}, remaining: #{(number_of_tasks - (results.keys + errors.keys).uniq.size)}"
+        printlog "RESULT collected: #{(results.keys + errors.keys).size}, remaining: #{(number_of_tasks - (results.keys + errors.keys).uniq.size)}" if data_debug?
         break if (number_of_tasks - (results.keys + errors.keys).uniq.size) <= 0
       end
     rescue Skynet::RequestExpiredError => e
@@ -582,6 +615,10 @@ class Skynet
 
     def single?
       @single
+    end
+
+    def data_debug?
+      @data_debug || Skynet::CONFIG[:SKYNET_JOB_DEBUG_DATA_LEVEL]
     end
 
     def reset!
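Two of the Job additions are easiest to see together: the new :data_debug field, which gates the printlog calls above, and Skynet::Job.results_by_job_id, which pulls an asynchronous job's payload off the message queue. A hedged sketch reusing the MapreduceTest class from the documentation; the :map_data and :async option names follow Skynet's documented job options, and run returning only the job_id for an :async job is an assumption here:

    job = Skynet::Job.new(
      :map_reduce_class => MapreduceTest,
      :map_data         => [3, 2, 2],
      :async            => true,
      :data_debug       => true   # or set Skynet::CONFIG[:SKYNET_JOB_DEBUG_DATA_LEVEL]
    )
    job_id = job.run                                  # assumed: an async job returns its job_id

    # Later, fetch whatever results were posted for that job (timeout defaults to 2 seconds).
    results = Skynet::Job.results_by_job_id(job_id)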
data/lib/skynet/skynet_launcher.rb CHANGED
@@ -1,14 +1,40 @@
 # FIXME: should be a module
 class Skynet
   include SkynetDebugger
-  def self.
+  def self.start(options={})
+    begin
+      mq = Skynet::MessageQueue.new
+    rescue Skynet::ConnectionError
+      if Skynet::MessageQueue.adapter == :tuplespace
+        ts_port = Skynet::CONFIG[:TS_SERVER_HOSTS].first.split(':').last
+        # puts "trying to make ts skynet_tuplespace_server --port=#{ts_port} --logfile=#{Skynet.config.logfile_location} --piddir=#{Skynet.config.skynet_pid_dir} --use_ringserver=#{Skynet.config.ts_use_ringserver} --drburi=#{Skynet.config.ts_drburi} start"
+        cmd = "skynet_tuplespace_server --port=#{ts_port} --logfile=#{Skynet.config.logfile_location} --piddir=#{Skynet.config.skynet_pid_dir} --use_ringserver=#{Skynet.config.ts_use_ringserver} --drburi=#{Skynet.config.ts_drburi} start"
+        pid = fork do
+          exec(cmd)
+        end
+        sleep Skynet::CONFIG[:TS_SERVER_START_DELAY]
+      end
+    end
+
+    options[:script_path] = Skynet::CONFIG[:LAUNCHER_PATH]
+
     if ARGV.detect {|a| a == 'console' }
       ARGV.delete('console')
       Skynet::Console.start
     elsif options[:worker_type] or ARGV.detect {|a| a =~ /worker_type/ }
       Skynet::Worker.start(options)
     else
-
+      if ARGV.include?('stop')
+        Skynet::Manager.stop(options)
+      else
+        options["daemonize"] = true if ARGV.include?('start')
+        Skynet::Manager.start(options)
+      end
     end
   end
-
+
+  def self.new(options={})
+    warn("Skynet.new is deprecated, please use Skynet.start instead")
+    start(options)
+  end
+end
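The rewritten launcher gives a single entry point that now understands daemonizing and stopping. A short sketch of how it is driven; the start/stop words and the deprecation warning come straight from the code above, while the claim that bin/skynet calls Skynet.start is an assumption:

    # bin/skynet presumably calls Skynet.start, so on the command line:
    #   skynet -r mapreduce_test.rb          run in the foreground
    #   skynet start -r mapreduce_test.rb    sets options["daemonize"] and starts the manager
    #   skynet stop                          calls Skynet::Manager.stop
    Skynet.start(:worker_type => :any)       # with :worker_type set, Skynet::Worker.start runs instead
    Skynet.new                               # still works, but warns that Skynet.start is preferred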
data/lib/skynet/skynet_logger.rb CHANGED
@@ -16,7 +16,7 @@ class Skynet
 
     def self.get
       if not @@log
-        @@log = self.new(Skynet::
+        @@log = self.new(Skynet::Config.new.logfile_location)
         @@log.level = Skynet::CONFIG[:SKYNET_LOG_LEVEL]
       end
       @@log
@@ -25,7 +25,13 @@ class Skynet
     def self.log=(log)
       @@log = log
     end
+
+    def printlog(*args)
+      self.class.get.unknown(*args)
+    end
+
   end
+
 
   # This module can be mixed in to add logging methods to your class.
   module Loggable
@@ -48,5 +54,9 @@ class Skynet
     def fatal
       log = Skynet::Logger.get
     end
+
+    def unknown
+      log = Skynet::Logger.get
+    end
   end
 end
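A small sketch of how the reworked logger is used directly; log.unknown is the level printlog writes at, while the other calls assume Skynet::Logger behaves like Ruby's standard Logger:

    log = Skynet::Logger.get                       # opens Skynet::Config.new.logfile_location
    log.level = Skynet::CONFIG[:SKYNET_LOG_LEVEL]
    log.info "normal message"                      # suppressed below the configured level
    log.unknown "always written"                   # same level the [LOG] lines use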
data/lib/skynet/skynet_manager.rb CHANGED
@@ -1,64 +1,87 @@
-
-begin
-  require 'fastthread'
-rescue LoadError
-  # puts 'fastthread not installed, using thread instead'
-  require 'thread'
-end
+require 'yaml'
 
+class Skynet
   class Manager
-
+
     class Error < StandardError
     end
-
+
     include SkynetDebugger
 
     Skynet::CONFIG[:PERCENTAGE_OF_TASK_ONLY_WORKERS]   ||= 0.7
     Skynet::CONFIG[:PERCENTAGE_OF_MASTER_ONLY_WORKERS] ||= 0.2
-
+
     def self.debug_class_desc
       "MANAGER"
-    end
-
+    end
+
     attr_accessor :required_libs, :queue_id
-    attr_reader :config
-
-    def initialize(options)
+    attr_reader :config, :worker_queue, :wqts
+
+    def initialize(options)
       raise Error.new("You must provide a script path to Skynet::Manager.new.") unless options[:script_path]
-      @script_path = options[:script_path]
+      @script_path         = options[:script_path] || Skynet::CONFIG[:LAUNCHER_PATH]
       # info "Skynet Launcher Path: [#{@script_path}]"
       @workers_requested   = options[:workers] || 4
-      @required_libs = options[:
+      @required_libs       = options[:required_libs] || []
       @queue_id            = options[:queue_id] || 0
       @number_of_workers   = 0
       @workers_by_type     = {:master => [], :task => [], :any => []}
       @signaled_workers    = []
-      @
+      @worker_queue        = {}
       @workers_restarting  = 0
-      @all_workers_started = false
-      @config = Skynet::Config.new
+      @all_workers_started = false
+      @config              = Skynet::Config.new
       @mutex               = Mutex.new
-
+      @wqts                = Queue.new
+    end
+
+    def worker_notify(item)
+      @wqts.push(item)
+    end
+
+    def start_worker_queue_thread
+      Thread.new do
+        last_save_time = Time.now
+        loop do
+          task = @wqts.pop
+          begin
+            status = Skynet::WorkerStatusMessage.new(task)
+            status.started_at = status.started_at.to_i
+            @mutex.synchronize do
+              @worker_queue[status.worker_id] = status
+            end
+            if last_save_time < Time.now - 60
+              save_worker_queue_to_file
+              last_save_time = Time.now
+            end
+          rescue Exception => e
+            error "Error in worker queue thread #{e.inspect} #{e.backtrace.join("\n")}"
+          end
+        end
+      end
+    end
 
     def start_workers
+      load_worker_queue_from_file
+      start_worker_queue_thread
+
       setup_signals
-
+
       starting = workers_to_start(@workers_requested)
       warn "Starting #{starting} workers.  QUEUE: #{config.queue_name_by_id(queue_id)} #{@workers_requested - starting} already running."
       add_worker(starting)
     end
-
+
     ### maybe workers_to_start should be a method
     def workers_to_start(workers_to_start)
-
-
-
-      if worker_alive?(worker_pid)
-        @workers_running[worker_pid] = Time.now
+      if not worker_pids.empty?
+        worker_pids.each do |worker_pid|
+          if worker_alive?(worker_pid)
            @number_of_workers += 1
            workers_to_start   -= 1
          else
-
+            mark_worker_as_stopped(worker_pid)
          end
          return 0 if workers_to_start < 1
        end
@@ -67,242 +90,214 @@ class Skynet
     end
 
     def check_started_workers
-      workers = []
       begin
         100.times do |ii|
-          workers
-
-
-
-        end
+          warn "Checking started workers, #{active_workers.size} out of #{@number_of_workers} after the #{(ii+1)}th try..."
+          break if active_workers.size >= @number_of_workers
+          sleep (@number_of_workers - active_workers.size)
+        end
       rescue Exception => e
         fatal "Something bad happened #{e.inspect} #{e.backtrace.join("\n")}"
       end
 
       @all_workers_started = true
 
-
-      if
-        warn "EXPECTED #{@number_of_workers}"
-        @number_of_workers =
+      printlog "FINISHED STARTING ALL #{active_workers.size} WORKERS"
+      if active_workers.size > @number_of_workers
+        warn "EXPECTED #{@number_of_workers}"
+        @number_of_workers = active_workers.size
       end
     end
-
+
     # the main application loop
     def run
-      loop do
+      loop do
         next unless @all_workers_started
         begin
           check_workers
           sleep Skynet::CONFIG[:WORKER_CHECK_DELAY]
         rescue SystemExit, Interrupt => e
-
+          printlog "Manager Exiting!"
           exit
         rescue Exception => e
           fatal "Something bad happened #{e.inspect} #{e.backtrace.join("\n")}"
         end
       end
     end
-
+
     def check_workers
-
-
-
-
-
-
-
-    def check_running_pids(worker_queue)
-      # There are workers running that are not in the queue. When does this happen?
-      q_pids = worker_queue_pids(worker_queue) || []
-      if @workers_running.keys.size > q_pids.size
-        (@workers_running.keys - q_pids).each do |wpid|
-          error "Missing worker #{wpid} from worker queue. Removing and/or killing."
-          Process.kill("TERM",wpid) if worker_alive?(wpid)
-          @workers_running.delete(wpid)
-          q_pids.delete(wpid)
-        end
-      end
-
-      q_pids.each do |wpid|
+      debug "Checking on #{@number_of_workers} workers..." unless @shutdown
+      check_running_pids
+      check_number_of_workers
+      true
+    end
+
+    def check_running_pids
+      worker_pids.each do |wpid|
         if not worker_alive?(wpid)
-
-
-
+          if @shutdown
+            info "Worker #{wpid} shut down gracefully. Removing from queue."
+          else
+            error "Worker #{wpid} was in queue and but was not running. Removing from queue."
+          end
+          mark_worker_as_stopped(wpid)
           @number_of_workers -= 1
-          q_pids.delete(wpid)
         end
       end
-
-    end
-
-    def check_number_of_workers
-
-
-
-      if q_pids.size < 1
+      worker_pids
+    end
+
+    def check_number_of_workers
+      if @shutdown
+        worker_shutdown
+        if worker_pids.size < 1
           exit
-        end
+        end
       elsif @workers_restarting > 0
-        if @workers_requested -
-          restarting = @workers_requested -
-          warn "RESTART MODE: Expected #{@number_of_workers} workers. #{
-        else
-          warn "RESTART MODE: Expected #{@number_of_workers} workers. #{
-        end
-        @workers_restarting = @workers_requested -
-
-      elsif
-        starting = 0
-        if
-          starting = @workers_requested -
-          error "Expected #{@number_of_workers} workers. #{
-          @number_of_workers =
-          add_worker(starting)
-        else
-
-          error "Expected #{@number_of_workers} workers. #{
-          @number_of_workers =
+        if @workers_requested - worker_pids.size != 0
+          restarting = @workers_requested - worker_pids.size
+          warn "RESTART MODE: Expected #{@number_of_workers} workers. #{worker_pids.size} running. #{restarting} are still restarting"
+        else
+          warn "RESTART MODE: Expected #{@number_of_workers} workers. #{worker_pids.size} running."
+        end
+        @workers_restarting = @workers_requested - worker_pids.size
+
+      elsif worker_pids.size != @number_of_workers
+        starting = 0
+        if worker_pids.size.to_f / @workers_requested.to_f < 0.85
+          starting = @workers_requested - worker_pids.size
+          error "Expected #{@number_of_workers} workers. #{worker_pids.size} running. Starting #{starting}"
+          @number_of_workers = worker_pids.size
+          add_worker(starting)
+        else
+
+          error "Expected #{@number_of_workers} workers. #{worker_pids.size} running."
+          @number_of_workers = worker_pids.size
         end
       end
     end
-
-    def worker_shutdown
-      q_pids = worker_queue_pids(worker_queue) || []
+
+    def worker_shutdown
       if not @masters_dead
-        workers_to_kill =
-          w.map_or_reduce == "master" and
-        end
-        warn "Shutting down masters. #{
+        workers_to_kill = active_workers.select do |w|
+          w.map_or_reduce == "master" and active_workers.detect{|status| status.process_id == w.process_id and worker_alive?(w.process_id)}
+        end
+        warn "Shutting down masters. #{worker_pids.size} workers still running." if worker_pids.size > 0
 
         worker_pids_to_kill = workers_to_kill.collect { |w| w.process_id }
         if worker_pids_to_kill and not worker_pids_to_kill.empty?
-          warn "FOUND MORE RUNNING MASTERS WE HAVEN'T KILLED:", worker_pids_to_kill
-          remove_worker(worker_pids_to_kill)
+          warn "FOUND MORE RUNNING MASTERS WE HAVEN'T KILLED:", worker_pids_to_kill
+          remove_worker(worker_pids_to_kill)
         end
 
-        if not
-          signal_workers("
+        if not active_workers.detect { |w| w.map_or_reduce == "master" }
+          signal_workers("TERM")
           @masters_dead = true
-          sleep 1
-          return check_number_of_workers()
         else
-
-          return check_number_of_workers()
+          return check_number_of_workers
         end
-      else
-        warn "Shutting down. #{q_pids.size} workers still running." if q_pids.size > 0
       end
-      if
+      if worker_pids.size < 1
         info "No more workers running."
-
-
-
-    def take_worker_status(worker_process_id)
-      begin
-        mq.take_worker_status({
-          :hostname   => hostname,
-          :process_id => worker_process_id
-        },0.00001)
-      rescue Skynet::QueueTimeout => e
-        error "Couldnt take worker status for #{hostname} #{worker_process_id}"
+      else
+        warn "Shutting down. #{worker_pids.size} workers still running." if worker_pids.size > 0
       end
-    end
-
+    end
+
     def worker_alive?(worker_pid)
-
-
-          return ps.detect {|line| line =~ /worker_type/}
-        end
-      rescue Errno::ENOENT => e
-        return false
-      end
-      false
-    end
-
+      Skynet.process_alive?(worker_pid)
+    end
 
     def add_workers(*args)
       add_worker(*args)
     end
-
+
     def add_worker(workers=1)
       num_task_only_workers   = (workers * Skynet::CONFIG[:PERCENTAGE_OF_TASK_ONLY_WORKERS]).to_i
       num_master_only_workers = (workers * Skynet::CONFIG[:PERCENTAGE_OF_MASTER_ONLY_WORKERS]).to_i
       warn "Adding #{workers} WORKERS. Task Workers: #{num_task_only_workers}, Master Workers: #{num_master_only_workers} Master & Task Workers: #{workers - num_task_only_workers - num_master_only_workers}"
-
+
       @all_workers_started = false
       worker_types = {:task => 0, :master => 0, :any => 0}
       (1..workers).collect do |ii|
         worker_type = :any
-        if (ii <= num_master_only_workers)
-          worker_type = :master
+        if (ii <= num_master_only_workers)
+          worker_type = :master
           worker_types[:master] += 1
         elsif (ii > num_master_only_workers and ii <= num_master_only_workers + num_task_only_workers)
           worker_type = :task
           worker_types[:task] += 1
         else
          worker_types[:any] += 1
-        end
+        end
         cmd = "#{@script_path} --worker_type=#{worker_type}"
+        cmd << " --config='#{Skynet::CONFIG[:CONFIG_FILE]}'" if Skynet::CONFIG[:CONFIG_FILE]
         cmd << " --queue_id=#{queue_id}"
         cmd << " -r #{required_libs.join(' -r ')}" if required_libs and not required_libs.empty?
-        wpid =
+        wpid = Skynet.fork_and_exec(cmd)
+        Skynet.close_console
         @workers_by_type[worker_type] ||= []
         @workers_by_type[worker_type] << wpid
         warn "Adding Worker ##{ii} PID: #{wpid} QUEUE: #{queue_id}, WORKER_TYPE?:#{worker_type}"
         @mutex.synchronize do
           @number_of_workers += 1
         end
-        @workers_running[wpid] = Time.now
         sleep 0.01
         wpid
-      end
-      info "
-      check_started_workers
+      end
+      info "Worker Distribution", worker_types
+      check_started_workers
     end
-
+
     def remove_workers(workers=1)
-      pids =
+      pids = worker_pids[0...workers]
       remove_worker(pids)
     end
 
     def remove_worker(pids = nil)
       pids = [pids] unless pids.kind_of?(Array)
       info "Removing workers #{pids.join(",")} from worker queue. They will die gracefully when they finish what they're doing."
-      wq = worker_queue
       pids.collect do |wpid|
-
+        Process.kill("INT",wpid)
+        mark_worker_as_stopped(wpid)
         @number_of_workers -= 1
-        @workers_running.delete(wpid)
         warn "REMOVING WORKER #{wpid}"
         @signaled_workers << wpid
-
-      end
+      end
       pids
     end
 
-    def
-      worker_queue.
-
-
-
-
-      end
-    end
-
-    def restart_all_workers
-      hostnames = {}
-      mq.read_all_worker_statuses.each do |status|
-        hostnames[status.hostname] = true
+    def mark_worker_as_stopped(wpid)
+      worker = @worker_queue.values.detect {|status| status.process_id == wpid}
+      if worker and not worker_alive?(wpid)
+        @worker_queue.delete_if {|worker_id, status| status.process_id == wpid }
+        worker_pids.delete(worker.process_id)
+        worker.started_at = Time.now.to_f
+        worker.process_id = nil
       end
-
-
-
+    end
+
+    def signal_workers(signal,worker_type=[])
+      worker_types = [worker_type].flatten
+      active_workers.each do |worker|
+        worker_types.each do |worker_type|
+          if worker_type == :idle
+            next if worker_type and worker.task_id
+          else
+            next if worker_type and not @workers_by_type[worker_type].include?(worker.process_id)
+          end
+        end
+        warn "SHUTTING DOWN #{worker.process_id} MR: #{worker.map_or_reduce} SIG: #{signal}"
+        begin
+          Process.kill(signal,worker.process_id)
+        rescue Errno::ESRCH
+          warn "Tried to kill a process that didn't exist #{worker.process_id}"
+        end
+        # mark_worker_as_stopped(worker.process_id)
+        @signaled_workers << worker.process_id
       end
     end
-
+
     def hard_restart_workers
       @all_workers_started = false
       signal_workers("TERM")
@@ -319,34 +314,33 @@ class Skynet
     def restart_worker(wpid)
       info "RESTARTING WORKER #{wpid}"
       @mutex.synchronize do
-
+        Process.kill("HUP",wpid)
+        mark_worker_as_stopped(wpid)
         @workers_restarting += 1
       end
-      Process.kill("HUP",wpid)
       sleep Skynet::CONFIG[:WORKER_CHECK_DELAY]
     end
 
     def restart_workers
       @all_workers_started = false
       signal_workers("HUP")
-      @workers_running = {}
       sleep @number_of_workers
       check_started_workers
     end
 
     def setup_signals
-      Signal.trap("HUP") do
+      Signal.trap("HUP")  do
         restart_workers
       end
       Signal.trap("TERM") do
-        if @term
+        if @term
           terminate
         else
           @term=true
           shutdown
         end
       end
-
+
       Signal.trap("INT") do
         if @shutdown
           terminate
@@ -359,116 +353,210 @@ class Skynet
     def shutdown
       info(:shutdown)
       @shutdown = true
-      signal_workers("
-      signal_workers("INT",:any)
+      signal_workers("TERM",[:idle,:master,:any])
     end
 
-    def terminate
-      info(:terminate)
-      signal_workers("
+    def terminate
+      info(:terminate)
+      signal_workers("KILL")
+      sleep 1
       exit
     end
 
-    def
-
-
-
+    def save_worker_queue_to_file
+      debug "Writing worker queue to file #{Skynet.config.manager_statfile_location}"
+      File.open(Skynet.config.manager_statfile_location,"w") do |f|
+        f.write(YAML.dump(@worker_queue))
       end
-      Process.detach(pid) if (pid != 0)
-      pid
     end
 
-    def
-
-
+    def load_worker_queue_from_file
+      if File.exists?(Skynet.config.manager_statfile_location)
+        File.open(Skynet.config.manager_statfile_location,"r") do |f|
+          begin
+            @worker_queue = YAML.load(f.read)
+            raise Error.new("Bad Manager File returned type #{@worker_queue.class}") unless @worker_queue.is_a?(Hash)
+          rescue Exception => e
+            error "Error loading manager stats file: #{f}", e
+            @worker_queue = {}
+            save_worker_queue_to_file
+          end
+        end
+      end
+    end
 
-    def
-
+    def prune_inactive_worker_stats
+      @worker_queue.delete_if{|worker_id, worker| !worker.process_id.is_a?(Fixnum) }
+      stats
+    end
+
+    def self.stats_for_hosts(manager_hosts=nil)
+      manager_hosts ||= Skynet::CONFIG[:MANAGER_HOSTS] || ["localhost"]
+      stats = {
+        :servers           => {},
+        :processed         => 0,
+        :number_of_workers => 0,
+        :active_workers    => 0,
+        :idle_workers      => 0,
+        :hosts             => 0,
+        :masters           => 0,
+        :taskworkers       => 0,
+        :time              => Time.now.to_f
+      }
+      servers = {}
+      manager_hosts.each do |manager_host|
+        begin
+          manager = DRbObject.new(nil,"druby://#{manager_host}:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}")
+          manager_stats = manager.stats
+          servers[manager_host] = manager_stats
+          manager_stats.each do |key,value|
+            next unless value.is_a?(Fixnum)
+            stats[key] ||= 0
+            stats[key] += value
+          end
+        rescue DRb::DRbConnError, Errno::ECONNREFUSED => e
+          warn "Couldn't get stats from manager at druby://#{manager_host}:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
+        end
+      end
+      stats[:servers] = servers
+      stats[:hosts]   = manager_hosts
+      stats
+    end
+
+    def stats
+      started_times        = @worker_queue.values.collect{|worker| worker.started_at }.sort
+      active_started_times = active_workers.collect{|worker|worker.started_at }.sort
+      stats = {
+        :hostname                    => hostname,
+        :earliest_update             => started_times.first,
+        :latest_update               => started_times.last,
+        :active_earliest_update      => active_started_times.first,
+        :active_latest_update        => active_started_times.last,
+        :processed                   => 0,
+        :processed_by_active_workers => 0,
+        :number_of_workers           => 0,
+        :idle_workers                => 0,
+        :shutdown_workers            => 0,
+      }
+      @worker_queue.values.collect{|worker|stats[:processed] += worker.processed}
+      active_workers.collect{|worker|stats[:processed_by_active_workers] += worker.processed}
+      currently_active_workers, idle_workers = active_workers.partition{|worker| worker.map_or_reduce }
+      stats[:number_of_workers] = active_workers.size
+      stats[:active_workers]    = currently_active_workers.size
+      stats[:idle_workers]      = idle_workers.size
+      stats[:shutdown_workers]  = inactive_workers.size
+      stats[:masters]           = active_workers.select{|worker|worker.tasktype.to_s == "master"}.size
+      stats[:master_or_task_workers] = active_workers.select{|worker|worker.tasktype.to_s == "any"}.size
+      stats[:taskworkers]       = active_workers.select{|worker|worker.tasktype.to_s == "task"}.size
+      stats[:active_masters]    = currently_active_workers.select{|worker|worker.tasktype.to_s == "master"}.size
+      stats[:active_master_or_task_workers] = currently_active_workers.select{|worker|worker.tasktype.to_s == "any"}.size
+      stats[:active_taskworkers] = currently_active_workers.select{|worker|worker.tasktype.to_s == "task"}.size
+      stats[:idle_masters]      = idle_workers.select{|worker|worker.tasktype.to_s == "master"}.size
+      stats[:idle_master_or_task_workers] = idle_workers.select{|worker|worker.tasktype.to_s == "any"}.size
+      stats[:idle_taskworkers]  = idle_workers.select{|worker|worker.tasktype.to_s == "task"}.size
+      stats
+    end
+
+    def active_workers
+      @worker_queue.values.select{|status| status.process_id.is_a?(Fixnum) }
+    end
+
+    def inactive_workers
+      @worker_queue.values.select{|status| !status.process_id.is_a?(Fixnum) }
     end
-
-    def worker_queue_pids(worker_queue=self.worker_queue)
-      worker_queue.collect {|w| w.process_id}
-    end
 
     def worker_pids
-
-    end
-
+      active_workers.collect {|w| w.process_id}
+    end
+
     def parent_pid
       $$
     end
 
     def hostname
       @machine_name ||= Socket.gethostname
-    end
+    end
 
     def ping
       true
     end
 
+    def self.local_manager_uri
+      "druby://localhost:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
+    end
+
+    def self.get
+      DRbObject.new(nil,local_manager_uri)
+    end
+
     def self.start(options={})
       options[:add_workers]    ||= nil
       options[:remove_workers] ||= nil
       options[:use_rails]      ||= false
       options[:required_libs]  ||= []
-
-      options[:pid_file] ||= Skynet::CONFIG[:SKYNET_PIDS_FILE]
-      options[:script_path] ||= Skynet::CONFIG[:LAUNCHER_PATH]
-
+
       config = Skynet::Config.new
 
       OptionParser.new do |opt|
-        opt.banner = %{Usage:
+        opt.banner = %{Usage:
 > skynet [options]
 
+OR to daemonize
+
+> skynet [options] start
+> skynet stop
+
 You can also run:
 > skynet console [options]
 }
-        opt.on('
+        opt.on('--restart-all-workers', 'Restart All Workers') do |v|
           puts "Restarting ALL workers on ALL machines."
           begin
-            manager =
+            manager = self.get
             manager.restart_all_workers
             exit
           rescue DRb::DRbConnError => e
-            puts "No manager running at #{
+            puts "No manager running at #{local_manager_uri}  ERROR: #{e.inspect}"
             exit
           end
         end
-        opt.on('
+        opt.on('--restart-workers', 'Restart Workers') do |v|
           puts "Restarting workers on this machine."
          begin
-            manager =
+            manager = self.get
            manager.restart_workers
            exit
          rescue DRb::DRbConnError => e
-            puts "No manager running at #{
+            puts "No manager running at #{local_manager_uri}  ERROR: #{e.inspect}"
            exit
          end
        end
-        opt.on('
+        opt.on('--increment-worker-version', 'Increment Worker Version') do |v|
          ver = Skynet::MessageQueue.new.increment_worker_version
          puts "Incrementing Worker Version to #{ver}"
          exit
        end
-        opt.on('
+        opt.on('--add-workers=WORKERS', 'Number of workers to add.') do |v|
          options[:add_workers] = v.to_i
        end
-        opt.on('
+        opt.on('--remove-workers=WORKERS', 'Number of workers to remove.') do |v|
          options[:remove_workers] = v.to_i
        end
-        opt.on('
+        opt.on('--workers=WORKERS', 'Number of workers to start.') do |v|
          options[:workers] = v.to_i
-        end
+        end
        opt.on('-r', '--required LIBRARY', 'Require the specified libraries') do |v|
          options[:required_libs] << File.expand_path(v)
        end
-        opt.on('
+        opt.on('--config=CONFIG_FILE', 'Where to find the skynet.rb config file') do |v|
+          options[:config_file] = File.expand_path(v)
+        end
+        opt.on('--queue=QUEUE_NAME', 'Which queue should these workers use (default "default").') do |v|
          options[:queue] = v
-        end
-        opt.on('
+        end
+        opt.on('--queue_id=queue_id', 'Which queue should these workers use (default 0).') do |v|
          options[:queue_id] = v.to_i
-        end
+        end
        opt.parse!(ARGV)
      end
      if options[:queue]
@@ -477,9 +565,9 @@ class Skynet
       end
       options[:queue_id] = config.queue_id_by_name(options[:queue])
     else
-      options[:queue_id] ||= 0
+      options[:queue_id] ||= 0
     end
-
+
     options[:required_libs].each do |adlib|
       begin
         require adlib
@@ -487,12 +575,28 @@ class Skynet
         error "The included lib #{adlib} was not found: #{e.inspect}"
         exit
       end
-      end
+      end
+
+      options[:config_file] ||= Skynet::CONFIG[:CONFIG_FILE]
+      if options[:config_file]
+        begin
+          require options[:config_file]
+        rescue MissingSourceFile => e
+          error "The config file at #{options[:config_file]} was not found: #{e.inspect}"
+          exit
+        end
+      elsif Skynet::CONFIG[:SYSTEM_RUNNER]
+        error "Config file missing. Please add a config/skynet_config.rb before starting."
+      end
+
+      options[:workers]     ||= Skynet::CONFIG[:NUMBER_OF_WORKERS] || 4
+      options[:pid_file]    ||= Skynet::Config.pidfile_location
+      options[:script_path] ||= Skynet::CONFIG[:LAUNCHER_PATH]
 
       # Handle add or remove workers
       if options[:add_workers] or options[:remove_workers]
         begin
-          manager =
+          manager = self.get
           if options[:add_workers]
             pids = manager.add_worker(options[:add_workers])
             warn "ADDING #{options[:add_workers]} workers PIDS: #{pids.inspect}"
@@ -501,7 +605,7 @@ class Skynet
             warn "REMOVING #{options[:remove_workers]} workers PIDS: #{pids.inspect}"
           end
         rescue DRb::DRbConnError => e
-          warn "Couldnt add or remove workers. There are probably no workers running. At least I couldn't find a skynet_manager around at #{
+          warn "Couldnt add or remove workers. There are probably no workers running. At least I couldn't find a skynet_manager around at #{local_manager_uri} #{e.inspect}"
         rescue Exception => e
           warn "Couldnt add or remove workers #{e.inspect} #{e.backtrace.join("\n")}"
         end
@@ -519,25 +623,84 @@ class Skynet
 
       debug "CONTINUING TO START : There IS an available MessageQueue", options
 
-
-
-
-
+      begin
+        if oldpid = read_pid_file
+          errmsg = nil
+          if Skynet.process_alive?(oldpid)
+            errmsg = "Another Skynet Manager is running at pid: #{oldpid}"
+            warn errmsg
+            stderr errmsg
+            exit
+          else
+            errmsg = "Deleting stale pidfile #{Skynet::Config.pidfile_location}"
+            warn errmsg
+            stderr errmsg
+            File.unlink(Skynet::Config.pidfile_location) if File.exist?(Skynet::Config.pidfile_location)
+          end
+        end
 
-
-
-
-
-
-
-
-
+        printlog "STARTING THE MANAGER!!!!!!!!!!! port: #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
+        puts "Starting Skynet..."
+        printlog "Skynet Stopped"
+        if options["daemonize"]
+          Skynet.safefork do
+            sess_id = Process.setsid
+            write_pid_file
+            Skynet.close_console
+            run_manager(options)
+            exit!
+          end
+        else
+          write_pid_file
+          run_manager(options)
+        end
       rescue SystemExit, Interrupt
       rescue Exception => e
-        fatal("Error in Manager. Manager Dying. #{e.inspect} #{e.backtrace}")
+        fatal("Error in Manager. Manager Dying. #{e.inspect} #{e.backtrace}")
       end
     end
   end
 
+    def self.run_manager(options)
+      @manager = Skynet::Manager.new(options)
+      @drb_manager = DRb.start_service("druby://:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}", @manager)
+      @manager.start_workers
+      info "MANAGER STARTED ON PORT: #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
+      @manager.run
+    end
+
+    # stop the daemon, nicely at first, and then forcefully if necessary
+    def self.stop(options = {})
+      pid = read_pid_file
+      if not pid
+        puts "The Skynet Manager is not running. No PID found in #{Skynet::Config.pidfile_location}"
+        exit
+      end
+      $stdout.puts "Stopping Skynet"
+      printlog "Stopping Skynet"
+      Process.kill("TERM", pid)
+      180.times { Process.kill(0, pid); sleep(1) }
+      Process.kill("TERM", pid)
+      180.times { Process.kill(0, pid); sleep(1) }
+      $stdout.puts("using kill -9 #{pid}")
+      Process.kill("KILL", pid)
+    rescue Errno::ESRCH => e
+      printlog "Skynet Stopped"
+    ensure
+      File.unlink(Skynet::Config.pidfile_location) if File.exist?(Skynet::Config.pidfile_location)
+    end
+
+    def self.read_pid_file
+      pidfile = Skynet::Config.pidfile_location
+      File.read(pidfile).to_i if File.exist?(pidfile)
+    end
+
+    def self.write_pid_file
+      pidfile = Skynet::Config.pidfile_location
+      info "Writing PIDFILE to #{pidfile}"
+      open(pidfile, "w") {|f| f << Process.pid << "\n"}
+      at_exit { File.unlink(pidfile) if read_pid_file == Process.pid }
+    end
+
   end
 end