skynet 0.9.2 → 0.9.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (128) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +84 -6
  3. data/README.txt +75 -64
  4. data/app_generators/skynet_install/skynet_install_generator.rb +14 -8
  5. data/app_generators/skynet_install/templates/migration.rb +1 -24
  6. data/app_generators/skynet_install/templates/skynet_config.rb +50 -0
  7. data/app_generators/skynet_install/templates/skynet_initializer.rb +1 -0
  8. data/app_generators/skynet_install/templates/{skynet_schema.sql → skynet_mysql_schema.sql} +1 -24
  9. data/bin/skynet +37 -10
  10. data/bin/skynet_install +5 -5
  11. data/bin/skynet_tuplespace_server +27 -19
  12. data/examples/dgrep/README +70 -0
  13. data/examples/dgrep/config/skynet_config.rb +26 -0
  14. data/examples/dgrep/data/shakespeare/README +2 -0
  15. data/examples/dgrep/data/shakespeare/poetry/loverscomplaint +381 -0
  16. data/examples/dgrep/data/shakespeare/poetry/rapeoflucrece +2199 -0
  17. data/examples/dgrep/data/shakespeare/poetry/sonnets +2633 -0
  18. data/examples/dgrep/data/shakespeare/poetry/various +640 -0
  19. data/examples/dgrep/data/shakespeare/poetry/venusandadonis +1423 -0
  20. data/examples/dgrep/data/testfile1.txt +1 -0
  21. data/examples/dgrep/data/testfile2.txt +1 -0
  22. data/examples/dgrep/data/testfile3.txt +1 -0
  23. data/examples/dgrep/data/testfile4.txt +1 -0
  24. data/examples/dgrep/lib/dgrep.rb +59 -0
  25. data/examples/dgrep/lib/mapreduce_test.rb +32 -0
  26. data/examples/dgrep/lib/most_common_words.rb +45 -0
  27. data/examples/dgrep/script/dgrep +75 -0
  28. data/examples/rails_mysql_example/README +66 -0
  29. data/examples/rails_mysql_example/Rakefile +10 -0
  30. data/examples/rails_mysql_example/app/controllers/application.rb +10 -0
  31. data/examples/rails_mysql_example/app/helpers/application_helper.rb +3 -0
  32. data/examples/rails_mysql_example/app/models/user.rb +21 -0
  33. data/examples/rails_mysql_example/app/models/user_favorite.rb +5 -0
  34. data/examples/rails_mysql_example/app/models/user_mailer.rb +12 -0
  35. data/examples/rails_mysql_example/app/views/user_mailer/welcome.erb +5 -0
  36. data/examples/rails_mysql_example/config/boot.rb +109 -0
  37. data/examples/rails_mysql_example/config/database.yml +42 -0
  38. data/examples/rails_mysql_example/config/environment.rb +59 -0
  39. data/examples/rails_mysql_example/config/environments/development.rb +18 -0
  40. data/examples/rails_mysql_example/config/environments/production.rb +19 -0
  41. data/examples/rails_mysql_example/config/environments/test.rb +22 -0
  42. data/examples/rails_mysql_example/config/initializers/inflections.rb +10 -0
  43. data/examples/rails_mysql_example/config/initializers/mime_types.rb +5 -0
  44. data/examples/rails_mysql_example/config/initializers/skynet.rb +1 -0
  45. data/examples/rails_mysql_example/config/routes.rb +35 -0
  46. data/examples/rails_mysql_example/config/skynet_config.rb +36 -0
  47. data/examples/rails_mysql_example/db/migrate/001_create_skynet_tables.rb +43 -0
  48. data/examples/rails_mysql_example/db/migrate/002_create_users.rb +16 -0
  49. data/examples/rails_mysql_example/db/migrate/003_create_user_favorites.rb +14 -0
  50. data/examples/rails_mysql_example/db/schema.rb +85 -0
  51. data/examples/rails_mysql_example/db/skynet_mysql_schema.sql +33 -0
  52. data/examples/rails_mysql_example/doc/README_FOR_APP +2 -0
  53. data/examples/rails_mysql_example/lib/tasks/rails_mysql_example.rake +20 -0
  54. data/examples/rails_mysql_example/public/.htaccess +40 -0
  55. data/examples/rails_mysql_example/public/404.html +30 -0
  56. data/examples/rails_mysql_example/public/422.html +30 -0
  57. data/examples/rails_mysql_example/public/500.html +30 -0
  58. data/examples/rails_mysql_example/public/dispatch.cgi +10 -0
  59. data/examples/rails_mysql_example/public/dispatch.fcgi +24 -0
  60. data/examples/rails_mysql_example/public/dispatch.rb +10 -0
  61. data/{log/debug.log → examples/rails_mysql_example/public/favicon.ico} +0 -0
  62. data/examples/rails_mysql_example/public/images/rails.png +0 -0
  63. data/examples/rails_mysql_example/public/index.html +277 -0
  64. data/examples/rails_mysql_example/public/javascripts/application.js +2 -0
  65. data/examples/rails_mysql_example/public/javascripts/controls.js +963 -0
  66. data/examples/rails_mysql_example/public/javascripts/dragdrop.js +972 -0
  67. data/examples/rails_mysql_example/public/javascripts/effects.js +1120 -0
  68. data/examples/rails_mysql_example/public/javascripts/prototype.js +4225 -0
  69. data/examples/rails_mysql_example/public/robots.txt +5 -0
  70. data/examples/rails_mysql_example/script/about +3 -0
  71. data/examples/rails_mysql_example/script/console +3 -0
  72. data/examples/rails_mysql_example/script/destroy +3 -0
  73. data/examples/rails_mysql_example/script/generate +3 -0
  74. data/examples/rails_mysql_example/script/performance/benchmarker +3 -0
  75. data/examples/rails_mysql_example/script/performance/profiler +3 -0
  76. data/examples/rails_mysql_example/script/performance/request +3 -0
  77. data/examples/rails_mysql_example/script/plugin +3 -0
  78. data/examples/rails_mysql_example/script/process/inspector +3 -0
  79. data/examples/rails_mysql_example/script/process/reaper +3 -0
  80. data/examples/rails_mysql_example/script/process/spawner +3 -0
  81. data/examples/rails_mysql_example/script/runner +3 -0
  82. data/examples/rails_mysql_example/script/server +3 -0
  83. data/examples/rails_mysql_example/test/fixtures/user_favorites.yml +9 -0
  84. data/examples/rails_mysql_example/test/fixtures/users.yml +11 -0
  85. data/examples/rails_mysql_example/test/test_helper.rb +38 -0
  86. data/examples/rails_mysql_example/test/unit/user_favorite_test.rb +8 -0
  87. data/examples/rails_mysql_example/test/unit/user_test.rb +8 -0
  88. data/extras/README +7 -0
  89. data/extras/init.d/skynet +87 -0
  90. data/extras/nagios/check_skynet.sh +121 -0
  91. data/extras/rails/controllers/skynet_controller.rb +43 -0
  92. data/extras/rails/views/skynet/index.rhtml +137 -0
  93. data/lib/skynet.rb +59 -1
  94. data/lib/skynet/mapreduce_helper.rb +2 -2
  95. data/lib/skynet/mapreduce_test.rb +32 -1
  96. data/lib/skynet/message_queue_adapters/mysql.rb +422 -539
  97. data/lib/skynet/message_queue_adapters/tuple_space.rb +45 -71
  98. data/lib/skynet/skynet_active_record_extensions.rb +22 -11
  99. data/lib/skynet/skynet_config.rb +54 -20
  100. data/lib/skynet/skynet_console.rb +4 -1
  101. data/lib/skynet/skynet_console_helper.rb +5 -1
  102. data/lib/skynet/skynet_debugger.rb +58 -4
  103. data/lib/skynet/skynet_job.rb +61 -24
  104. data/lib/skynet/skynet_launcher.rb +29 -3
  105. data/lib/skynet/skynet_logger.rb +11 -1
  106. data/lib/skynet/skynet_manager.rb +403 -240
  107. data/lib/skynet/skynet_message.rb +1 -3
  108. data/lib/skynet/skynet_message_queue.rb +42 -19
  109. data/lib/skynet/skynet_partitioners.rb +19 -15
  110. data/lib/skynet/skynet_ruby_extensions.rb +18 -0
  111. data/lib/skynet/skynet_tuplespace_server.rb +17 -14
  112. data/lib/skynet/skynet_worker.rb +132 -98
  113. data/lib/skynet/version.rb +1 -1
  114. data/script/destroy +0 -0
  115. data/script/generate +0 -0
  116. data/script/txt2html +0 -0
  117. data/test/test_helper.rb +2 -0
  118. data/test/test_skynet.rb +13 -5
  119. data/test/test_skynet_manager.rb +24 -9
  120. data/test/test_skynet_task.rb +1 -1
  121. data/website/index.html +77 -29
  122. data/website/index.txt +53 -24
  123. data/website/stylesheets/screen.css +12 -12
  124. metadata +156 -66
  125. data/app_generators/skynet_install/templates/skynet +0 -46
  126. data/log/skynet.log +0 -29
  127. data/log/skynet_tuplespace_server.log +0 -7
  128. data/log/skynet_worker.pid +0 -1
@@ -30,7 +30,10 @@ class Skynet
30
30
  end
31
31
 
32
32
  IRB.setup(Skynet::CONFIG[:LAUNCHER_PATH])
33
- IRB.conf[:PROMPT_MODE] = :SIMPLE
33
+ IRB.conf[:PROMPT][:SKYNET] = IRB.conf[:PROMPT][:SIMPLE].dup
34
+ IRB.conf[:PROMPT][:SKYNET][:PROMPT_I] = "skynet>>"
35
+
36
+ IRB.conf[:PROMPT_MODE] = :SKYNET
34
37
  irb = IRB::Irb.new()
35
38
  IRB.conf[:MAIN_CONTEXT] = irb.context
36
39
  irb.context.workspace.main.extend Skynet::ConsoleHelper
@@ -1,6 +1,10 @@
1
1
  module Skynet::ConsoleHelper
2
2
  # All of these commands can be run at the 'skynet console'.
3
3
 
4
+ def log
5
+ Skynet::Logger.get
6
+ end
7
+
4
8
  def mq
5
9
  @mq ||= Skynet::MessageQueue.new
6
10
  end
@@ -22,7 +26,7 @@ module Skynet::ConsoleHelper
22
26
  end
23
27
 
24
28
  def manager
25
- @manager ||= DRbObject.new(nil,Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL])
29
+ @manager ||= Skynet::Manager.get
26
30
  end
27
31
 
28
32
  def add_lib(lib)
@@ -1,3 +1,36 @@
1
+ # ==SkynetDebugger
2
+ # The SkynetDebugger is a module you can include in any of your classes that will give you easy access
3
+ # to the Skynet::Logger. Including SkynetDebugger gives you a number of logging methods. Each logging method
4
+ # lets you pass a message as well as an optional number of objects which will be pretty_printed after your message.
5
+ # Log lines print with their log level, PID, time, class, and message, e.g.
6
+ #
7
+ # [WARN] #78002 2008-04-11 14:17:15.363167 <WORKER-78002> Exiting...
8
+ #
9
+ # You can set the log_level and log_file with (See Skynet::Config)
10
+ # Skynet::CONFIG[:SKYNET_LOG_FILE]
11
+ # Skynet::CONFIG[:SKYNET_LOG_LEVEL]
12
+ #
13
+ # Possible log levels include
14
+ # Logger::DEBUG
15
+ # Logger::INFO
16
+ # Logger::WARN
17
+ # Logger::ERROR
18
+ # Logger::FATAL
19
+ #
20
+ # ==Methods
21
+ # log - returns the Skynet::Logger
22
+ #
23
+ # debug(msg,*objects_to_inspect)
24
+ #
25
+ # info(msg,*objects_to_inspect)
26
+ #
27
+ # warn(msg,*objects_to_inspect)
28
+ #
29
+ # error(msg,*objects_to_inspect)
30
+ #
31
+ # fatal(msg,*objects_to_inspect)
32
+ #
33
+ # printlog(msg,*objects_to_inspect) #printlog will ALWAYS print to the log as log level [LOG] regardless of the LOG_LEVEL
1
34
  module SkynetDebugger
2
35
 
3
36
  def self.included(base)
@@ -12,22 +45,18 @@ module SkynetDebugger
12
45
  self.class.args_pp(*args)
13
46
  end
14
47
 
15
-
16
48
  def debug(*args)
17
49
  self.class.debug(*args)
18
50
  end
19
51
 
20
-
21
52
  def info(*args)
22
53
  self.class.info(*args)
23
54
  end
24
55
 
25
-
26
56
  def warn(*args)
27
57
  self.class.warn(*args)
28
58
  end
29
59
 
30
-
31
60
  def error(*args)
32
61
  self.class.error(*args)
33
62
  end
@@ -35,11 +64,23 @@ module SkynetDebugger
35
64
  def fatal(*args)
36
65
  self.class.fatal(*args)
37
66
  end
67
+
68
+ def printlog(*args)
69
+ self.class.printlog(*args)
70
+ end
38
71
 
39
72
  def debug_header
40
73
  self.class.debug_header
41
74
  end
42
75
 
76
+ def stderr(*args)
77
+ self.class.stderr
78
+ end
79
+
80
+ def stdout(*args)
81
+ self.class.stdout
82
+ end
83
+
43
84
  module ClassMethods
44
85
 
45
86
  def debug_class_desc
@@ -51,6 +92,7 @@ module SkynetDebugger
51
92
  "##{$$} #{t.strftime("%Y-%m-%d %H:%M:%S")}.#{t.usec} <#{debug_class_desc}>"
52
93
  end
53
94
 
95
+ # log
54
96
  def log
55
97
  Skynet::Logger.get
56
98
  end
@@ -78,6 +120,18 @@ module SkynetDebugger
78
120
  def fatal(msg, *args)
79
121
  log.fatal "[FATAL] #{debug_header} #{msg} #{args_pp(*args)}"
80
122
  end
123
+
124
+ def printlog(msg, *args)
125
+ log.unknown "[LOG] #{debug_header} #{msg} #{args_pp(*args)}"
126
+ end
127
+
128
+ def stderr(msg, *args)
129
+ $stderr.puts "#{debug_header} #{msg} #{args_pp(*args)}"
130
+ end
131
+
132
+ def stdout(msg, *args)
133
+ $stdout.puts "#{debug_header} #{msg} #{args_pp(*args)}"
134
+ end
81
135
 
82
136
  end
83
137
 
@@ -9,8 +9,10 @@ class Skynet
9
9
  # There are also many global configuration options which can be controlled through Skynet::CONFIG
10
10
  #
11
11
  # Example Usage:
12
+ # Create a file called mapreduce_test.rb with the following.
12
13
  #
13
- # class MapReduceTest
14
+ # class MapreduceTest
15
+ # include SkynetDebugger ## This gives you logging methods such as log, error, info, fatal
14
16
  #
15
17
  # def self.run
16
18
  # job = Skynet::Job.new(
@@ -41,17 +43,28 @@ class Skynet
41
43
  # end
42
44
  # end
43
45
  #
44
- # MapReduceTest.run
45
46
  #
46
- # You might notice that self.map and self.reduce both accept Arrays. If you do not want to deal with
47
- # getting arrays of map_data or reduce_data, you can include MapreduceHelper into your class and then
48
- # implement self.map_each and self.reduce_each methods. The included self.map and self.reduce methods
49
- # will handle iterating over the map_data and reduce_data, passing each element to your map_each and
50
- # reduce_each methods respectively. They will also handle error handling within that loop to make sure
51
- # even if a single map or reduce fails, processing will continue. If you do not want processing to
52
- # continue if a map fails, do not use the MapreduceHelper mixin.
47
+ # You need to make sure Skynet is running with your class loaded. That is how Skynet works.
48
+ # Since there is no easy way to actually pass code around the network, each skynet worker must
49
+ # already have your code loaded. If you have skynet started, stop it and then start it with the -r flag
50
+ # to tell it which file to require in order to load your class.
51
+ # $ skynet -r mapreduce_test.rb
52
+ # Then go into the skynet console to test running your map reduce task.
53
+ # $ skynet console -r mapreduce_test.rb
54
+ # skynet>> MapreduceTest.run # returns {2=>2, 3=>1}
53
55
  #
54
- # There are many other options to control various defaults and timeouts.
56
+ # In the example above, you might notice that self.map and self.reduce both accept Arrays.
57
+ # If you do not want to deal with getting arrays of map_data or reduce_data, you can include MapreduceHelper
58
+ # into your class and then implement self.map_each and self.reduce_each methods.
59
+ # The included self.map and self.reduce methods will handle iterating over the map_data and reduce_data,
60
+ # passing each element to your map_each and reduce_each methods respectively. They will also handle error
61
+ # handling within that loop to make sure even if a single map or reduce fails, processing will continue.
62
+ # If you do not want processing to continue if a map fails, do not use the MapreduceHelper mixin.
63
+ #
64
+ # Since Skynet must have your code, you will probably want to install skynet into the application
65
+ # that skynet needs access to in order to run your jobs. See bin/skynet_install[link:files/bin/skynet_install.html] for more info.
66
+ #
67
+ # See new for the many other options to control various Skynet::Job settings.
55
68
  class Job
56
69
  include SkynetDebugger
57
70
  include Skynet::GuidGenerator
@@ -70,7 +83,7 @@ class Skynet
70
83
  :map, :map_partitioner, :reduce, :reduce_partition, :map_reduce_class,
71
84
  :master_retry, :map_retry, :reduce_retry,
72
85
  :keep_map_tasks, :keep_reduce_tasks,
73
- :local_master, :async
86
+ :local_master, :async, :data_debug
74
87
  ]
75
88
 
76
89
  FIELDS.each do |method|
@@ -83,7 +96,7 @@ class Skynet
83
96
  end
84
97
  end
85
98
 
86
- attr_accessor :use_local_queue
99
+ attr_accessor :use_local_queue, :data_debug
87
100
 
88
101
  Skynet::CONFIG[:JOB_DEFAULTS] = {
89
102
  :queue_id => 0,
@@ -220,7 +233,7 @@ class Skynet
220
233
  # If a number is provided, the master will run the reduce_tasks locally if there are
221
234
  # LESS THAN OR EQUAL TO the number provided.
222
235
  # You may also set Skynet::CONFIG[:DEFAILT_REDUCVE_MAP_TASKS] DEFAULT 1
223
- def initialize(options = {})
236
+ def initialize(options = {})
224
237
  FIELDS.each do |field|
225
238
  if options.has_key?(field)
226
239
  self.send("#{field}=".to_sym,options[field])
@@ -314,7 +327,10 @@ class Skynet
314
327
  self.use_local_queue = map_local?
315
328
  if map_tasks
316
329
  number_of_tasks = 0
317
- map_tasks.each do |task|
330
+ size = map_tasks.size - 1
331
+ printlog "MESSAGES TO MAP ENQUEUE #{size}" if data_debug?
332
+ map_tasks.each_with_index do |task,ii|
333
+ printlog "#{size - ii} MAP TASKS LEFT TO ENQUEUE" if data_debug?
318
334
  number_of_tasks += 1
319
335
  enqueue_messages(tasks_to_messages(task))
320
336
  end
@@ -332,8 +348,9 @@ class Skynet
332
348
  end
333
349
 
334
350
  def partition_data(post_map_data)
335
- debug "RUN REDUCE 3.1 BEFORE PARTITION #{display_info} reducers: #{reducers}"
351
+ info "RUN REDUCE 3.1 BEFORE PARTITION #{display_info} reducers: #{reducers}"
336
352
  debug "RUN REDUCE 3.1 : #{reducers} #{name}, job_id:#{job_id}", post_map_data
353
+ printlog "RUN REDUCE 3.1 : #{reducers} #{name}, job_id:#{job_id}", post_map_data if data_debug?
337
354
  return unless post_map_data
338
355
  partitioned_data = nil
339
356
  if not @reduce_partition
@@ -353,19 +370,23 @@ class Skynet
353
370
  partitioned_data = @reduce_partition.call(post_map_data, reducers)
354
371
  end
355
372
  partitioned_data.compact! if partitioned_data
356
- debug "RUN REDUCE 3.2 AFTER PARTITION #{display_info} reducers: #{reducers}"
373
+ info "RUN REDUCE 3.2 AFTER PARTITION #{display_info} reducers: #{reducers}"
357
374
  debug "RUN REDUCE 3.2 AFTER PARTITION #{display_info} data:", partitioned_data if partitioned_data
375
+ printlog "RUN REDUCE 3.2 AFTER PARTITION #{display_info} data:", partitioned_data if data_debug?
358
376
  partitioned_data
359
377
  end
360
378
 
361
379
  def reduce_enqueue(partitioned_data)
362
380
  return partitioned_data unless @reduce and reducers and reducers > 0
363
381
  debug "RUN REDUCE 3.3 CREATED REDUCE TASKS #{display_info}", partitioned_data
382
+ size = partitioned_data.size
383
+ printlog "REDUCE MESSAGES TO ENQUEUE #{size}" if data_debug?
364
384
 
365
385
  reduce_tasks = self.reduce_tasks(partitioned_data)
366
386
  self.use_local_queue = reduce_local?(reduce_tasks)
367
387
  number_of_tasks = 0
368
- reduce_tasks.each do |task|
388
+ reduce_tasks.each_with_index do |task,ii|
389
+ printlog "#{size - ii} REDUCE TASKS LEFT TO ENQUEUE" if data_debug?
369
390
  number_of_tasks += 1
370
391
  enqueue_messages(tasks_to_messages(task))
371
392
  end
@@ -374,20 +395,23 @@ class Skynet
374
395
 
375
396
  def reduce_results(number_of_tasks)
376
397
  results = gather_results(number_of_tasks, reduce_timeout, reduce_name)
377
- if results.is_a?(Array) and results.first.is_a?(Hash)
378
- hash_results = Hash.new
379
- results.each {|h| hash_results.merge!(h) if h.class == Hash}
380
- results = hash_results
381
- elsif results.is_a?(Array) and results.first.is_a?(Array)
382
- results = results.compact
398
+ printlog "REDUCE RESULTS", results if data_debug?
399
+ if results.is_a?(Array) and results.first.is_a?(Array)
400
+ final = []
401
+ results.each do |result|
402
+ final += result
403
+ end
404
+ results = final
383
405
  end
384
406
  debug "RUN REDUCE 3.4 AFTER REDUCE #{display_info} results size: #{results ? results.size : ''}"
385
407
  debug "RUN REDUCE 3.4 AFTER REDUCE #{display_info} results:", results if results
408
+ printlog "POST REDUCE RESULTS", results if data_debug?
386
409
  return results
387
410
  end
388
411
 
389
412
  def enqueue_messages(messages)
390
- messages.each do |message|
413
+ size = messages.size
414
+ messages.each_with_index do |message,ii|
391
415
  timeout = message.expiry || 5
392
416
  debug "RUN TASKS SUBMITTING #{message.name} job_id: #{job_id} #{message.payload.is_a?(Skynet::Task) ? 'task' + message.payload.task_id.to_s : ''}"
393
417
  debug "RUN TASKS WORKER MESSAGE #{message.name} job_id: #{job_id}", message.to_a
@@ -395,6 +419,14 @@ class Skynet
395
419
  end
396
420
  end
397
421
 
422
+ # Given a job_id, returns the results from the message queue. Used to retrieve results of asynchronous jobs.
423
+ def self.results_by_job_id(job_id,timeout=2)
424
+ result_message = mq.take_result(job_id,timeout)
425
+ result = result_message.payload
426
+ return nil unless result
427
+ return result
428
+ end
429
+
398
430
  def gather_results(number_of_tasks, timeout=nil, description=nil)
399
431
  debug "GATHER RESULTS job_id: #{job_id} - NOT AN ASYNC JOB"
400
432
  results = {}
@@ -415,6 +447,7 @@ class Skynet
415
447
  debug "RESULT returned TASKID: #{result_message.task_id} #{results[result_message.task_id].inspect}"
416
448
  end
417
449
  debug "RESULT collected: #{(results.keys + errors.keys).size}, remaining: #{(number_of_tasks - (results.keys + errors.keys).uniq.size)}"
450
+ printlog "RESULT collected: #{(results.keys + errors.keys).size}, remaining: #{(number_of_tasks - (results.keys + errors.keys).uniq.size)}" if data_debug?
418
451
  break if (number_of_tasks - (results.keys + errors.keys).uniq.size) <= 0
419
452
  end
420
453
  rescue Skynet::RequestExpiredError => e
@@ -582,6 +615,10 @@ class Skynet
582
615
 
583
616
  def single?
584
617
  @single
618
+ end
619
+
620
+ def data_debug?
621
+ @data_debug || Skynet::CONFIG[:SKYNET_JOB_DEBUG_DATA_LEVEL]
585
622
  end
586
623
 
587
624
  def reset!
@@ -1,14 +1,40 @@
1
1
  # FIXME: should be a module
2
2
  class Skynet
3
3
  include SkynetDebugger
4
- def self.new(options={})
4
+ def self.start(options={})
5
+ begin
6
+ mq = Skynet::MessageQueue.new
7
+ rescue Skynet::ConnectionError
8
+ if Skynet::MessageQueue.adapter == :tuplespace
9
+ ts_port = Skynet::CONFIG[:TS_SERVER_HOSTS].first.split(':').last
10
+ # puts "trying to make ts skynet_tuplespace_server --port=#{ts_port} --logfile=#{Skynet.config.logfile_location} --piddir=#{Skynet.config.skynet_pid_dir} --use_ringserver=#{Skynet.config.ts_use_ringserver} --drburi=#{Skynet.config.ts_drburi} start"
11
+ cmd = "skynet_tuplespace_server --port=#{ts_port} --logfile=#{Skynet.config.logfile_location} --piddir=#{Skynet.config.skynet_pid_dir} --use_ringserver=#{Skynet.config.ts_use_ringserver} --drburi=#{Skynet.config.ts_drburi} start"
12
+ pid = fork do
13
+ exec(cmd)
14
+ end
15
+ sleep Skynet::CONFIG[:TS_SERVER_START_DELAY]
16
+ end
17
+ end
18
+
19
+ options[:script_path] = Skynet::CONFIG[:LAUNCHER_PATH]
20
+
5
21
  if ARGV.detect {|a| a == 'console' }
6
22
  ARGV.delete('console')
7
23
  Skynet::Console.start
8
24
  elsif options[:worker_type] or ARGV.detect {|a| a =~ /worker_type/ }
9
25
  Skynet::Worker.start(options)
10
26
  else
11
- Skynet::Manager.start(options)
27
+ if ARGV.include?('stop')
28
+ Skynet::Manager.stop(options)
29
+ else
30
+ options["daemonize"] = true if ARGV.include?('start')
31
+ Skynet::Manager.start(options)
32
+ end
12
33
  end
13
34
  end
14
- end
35
+
36
+ def self.new(options={})
37
+ warn("Skynet.new is deprecated, please use Skynet.start instead")
38
+ start(options)
39
+ end
40
+ end
@@ -16,7 +16,7 @@ class Skynet
16
16
 
17
17
  def self.get
18
18
  if not @@log
19
- @@log = self.new(Skynet::CONFIG[:SKYNET_LOG_FILE])
19
+ @@log = self.new(Skynet::Config.new.logfile_location)
20
20
  @@log.level = Skynet::CONFIG[:SKYNET_LOG_LEVEL]
21
21
  end
22
22
  @@log
@@ -25,7 +25,13 @@ class Skynet
25
25
  def self.log=(log)
26
26
  @@log = log
27
27
  end
28
+
29
+ def printlog(*args)
30
+ self.class.get.unknown(*args)
31
+ end
32
+
28
33
  end
34
+
29
35
 
30
36
  # This module can be mixed in to add logging methods to your class.
31
37
  module Loggable
@@ -48,5 +54,9 @@ class Skynet
48
54
  def fatal
49
55
  log = Skynet::Logger.get
50
56
  end
57
+
58
+ def unknown
59
+ log = Skynet::Logger.get
60
+ end
51
61
  end
52
62
  end
@@ -1,64 +1,87 @@
1
- class Skynet
2
- begin
3
- require 'fastthread'
4
- rescue LoadError
5
- # puts 'fastthread not installed, using thread instead'
6
- require 'thread'
7
- end
1
+ require 'yaml'
8
2
 
3
+ class Skynet
9
4
  class Manager
10
-
5
+
11
6
  class Error < StandardError
12
7
  end
13
-
8
+
14
9
  include SkynetDebugger
15
10
 
16
11
  Skynet::CONFIG[:PERCENTAGE_OF_TASK_ONLY_WORKERS] ||= 0.7
17
12
  Skynet::CONFIG[:PERCENTAGE_OF_MASTER_ONLY_WORKERS] ||= 0.2
18
-
13
+
19
14
  def self.debug_class_desc
20
15
  "MANAGER"
21
- end
22
-
16
+ end
17
+
23
18
  attr_accessor :required_libs, :queue_id
24
- attr_reader :config
25
-
26
- def initialize(options)
19
+ attr_reader :config, :worker_queue, :wqts
20
+
21
+ def initialize(options)
27
22
  raise Error.new("You must provide a script path to Skynet::Manager.new.") unless options[:script_path]
28
- @script_path = options[:script_path]
23
+ @script_path = options[:script_path] || Skynet::CONFIG[:LAUNCHER_PATH]
29
24
  # info "Skynet Launcher Path: [#{@script_path}]"
30
25
  @workers_requested = options[:workers] || 4
31
- @required_libs = options[:adlibs] || []
26
+ @required_libs = options[:required_libs] || []
32
27
  @queue_id = options[:queue_id] || 0
33
28
  @number_of_workers = 0
34
29
  @workers_by_type = {:master => [], :task => [], :any => []}
35
30
  @signaled_workers = []
36
- @workers_running = {}
31
+ @worker_queue = {}
37
32
  @workers_restarting = 0
38
- @all_workers_started = false
39
- @config = Skynet::Config.new
33
+ @all_workers_started = false
34
+ @config = Skynet::Config.new
40
35
  @mutex = Mutex.new
41
- end
36
+ @wqts = Queue.new
37
+ end
38
+
39
+ def worker_notify(item)
40
+ @wqts.push(item)
41
+ end
42
+
43
+ def start_worker_queue_thread
44
+ Thread.new do
45
+ last_save_time = Time.now
46
+ loop do
47
+ task = @wqts.pop
48
+ begin
49
+ status = Skynet::WorkerStatusMessage.new(task)
50
+ status.started_at = status.started_at.to_i
51
+ @mutex.synchronize do
52
+ @worker_queue[status.worker_id] = status
53
+ end
54
+ if last_save_time < Time.now - 60
55
+ save_worker_queue_to_file
56
+ last_save_time = Time.now
57
+ end
58
+ rescue Exception => e
59
+ error "Error in worker queue thread #{e.inspect} #{e.backtrace.join("\n")}"
60
+ end
61
+ end
62
+ end
63
+ end
42
64
 
43
65
  def start_workers
66
+ load_worker_queue_from_file
67
+ start_worker_queue_thread
68
+
44
69
  setup_signals
45
-
70
+
46
71
  starting = workers_to_start(@workers_requested)
47
72
  warn "Starting #{starting} workers. QUEUE: #{config.queue_name_by_id(queue_id)} #{@workers_requested - starting} already running."
48
73
  add_worker(starting)
49
74
  end
50
-
75
+
51
76
  ### maybe workers_to_start should be a method
52
77
  def workers_to_start(workers_to_start)
53
- pids = worker_queue_pids
54
- if not pids.empty?
55
- pids.each do |worker_pid|
56
- if worker_alive?(worker_pid)
57
- @workers_running[worker_pid] = Time.now
78
+ if not worker_pids.empty?
79
+ worker_pids.each do |worker_pid|
80
+ if worker_alive?(worker_pid)
58
81
  @number_of_workers += 1
59
82
  workers_to_start -= 1
60
83
  else
61
- take_worker_status(worker_pid)
84
+ mark_worker_as_stopped(worker_pid)
62
85
  end
63
86
  return 0 if workers_to_start < 1
64
87
  end
@@ -67,242 +90,214 @@ class Skynet
67
90
  end
68
91
 
69
92
  def check_started_workers
70
- workers = []
71
93
  begin
72
94
  100.times do |ii|
73
- workers = worker_queue
74
- warn "Checking started workers, #{workers.size} out of #{@number_of_workers} after the #{(ii+1)}th try..."
75
- break if workers.size >= @number_of_workers
76
- sleep (@number_of_workers - workers.size)
77
- end
95
+ warn "Checking started workers, #{active_workers.size} out of #{@number_of_workers} after the #{(ii+1)}th try..."
96
+ break if active_workers.size >= @number_of_workers
97
+ sleep (@number_of_workers - active_workers.size)
98
+ end
78
99
  rescue Exception => e
79
100
  fatal "Something bad happened #{e.inspect} #{e.backtrace.join("\n")}"
80
101
  end
81
102
 
82
103
  @all_workers_started = true
83
104
 
84
- warn "FINISHED STARTING ALL #{workers.size} WORKERS"
85
- if workers.size > @number_of_workers
86
- warn "EXPECTED #{@number_of_workers}"
87
- @number_of_workers = workers.size
105
+ printlog "FINISHED STARTING ALL #{active_workers.size} WORKERS"
106
+ if active_workers.size > @number_of_workers
107
+ warn "EXPECTED #{@number_of_workers}"
108
+ @number_of_workers = active_workers.size
88
109
  end
89
110
  end
90
-
111
+
91
112
  # the main application loop
92
113
  def run
93
- loop do
114
+ loop do
94
115
  next unless @all_workers_started
95
116
  begin
96
117
  check_workers
97
118
  sleep Skynet::CONFIG[:WORKER_CHECK_DELAY]
98
119
  rescue SystemExit, Interrupt => e
99
- fatal "Manager Exiting!"
120
+ printlog "Manager Exiting!"
100
121
  exit
101
122
  rescue Exception => e
102
123
  fatal "Something bad happened #{e.inspect} #{e.backtrace.join("\n")}"
103
124
  end
104
125
  end
105
126
  end
106
-
127
+
107
128
  def check_workers
108
- worker_queue = self.worker_queue
109
- q_pids = worker_queue_pids(worker_queue) || []
110
- info "Checking on #{@number_of_workers} workers..." unless @shutdown
111
- check_running_pids(worker_queue)
112
- check_number_of_workers(worker_queue)
113
- true
114
- end
115
-
116
- def check_running_pids(worker_queue)
117
- # There are workers running that are not in the queue. When does this happen?
118
- q_pids = worker_queue_pids(worker_queue) || []
119
- if @workers_running.keys.size > q_pids.size
120
- (@workers_running.keys - q_pids).each do |wpid|
121
- error "Missing worker #{wpid} from worker queue. Removing and/or killing."
122
- Process.kill("TERM",wpid) if worker_alive?(wpid)
123
- @workers_running.delete(wpid)
124
- q_pids.delete(wpid)
125
- end
126
- end
127
-
128
- q_pids.each do |wpid|
129
+ debug "Checking on #{@number_of_workers} workers..." unless @shutdown
130
+ check_running_pids
131
+ check_number_of_workers
132
+ true
133
+ end
134
+
135
+ def check_running_pids
136
+ worker_pids.each do |wpid|
129
137
  if not worker_alive?(wpid)
130
- error "Worker #{wpid} was in queue and but was not running. Removing from queue."
131
- take_worker_status(wpid)
132
- @workers_running.delete(wpid)
138
+ if @shutdown
139
+ info "Worker #{wpid} shut down gracefully. Removing from queue."
140
+ else
141
+ error "Worker #{wpid} was in queue and but was not running. Removing from queue."
142
+ end
143
+ mark_worker_as_stopped(wpid)
133
144
  @number_of_workers -= 1
134
- q_pids.delete(wpid)
135
145
  end
136
146
  end
137
- q_pids
138
- end
139
-
140
- def check_number_of_workers(worker_queue=self.worker_queue)
141
- q_pids = worker_queue_pids(worker_queue) || []
142
- if @shutdown
143
- worker_shutdown(worker_queue)
144
- if q_pids.size < 1
147
+ worker_pids
148
+ end
149
+
150
+ def check_number_of_workers
151
+ if @shutdown
152
+ worker_shutdown
153
+ if worker_pids.size < 1
145
154
  exit
146
- end
155
+ end
147
156
  elsif @workers_restarting > 0
148
- if @workers_requested - q_pids.size != 0
149
- restarting = @workers_requested - q_pids.size
150
- warn "RESTART MODE: Expected #{@number_of_workers} workers. #{q_pids.size} running. #{restarting} are still restarting"
151
- else
152
- warn "RESTART MODE: Expected #{@number_of_workers} workers. #{q_pids.size} running."
153
- end
154
- @workers_restarting = @workers_requested - q_pids.size
155
-
156
- elsif q_pids.size != @number_of_workers
157
- starting = 0
158
- if q_pids.size.to_f / @workers_requested.to_f < 0.85
159
- starting = @workers_requested - q_pids.size
160
- error "Expected #{@number_of_workers} workers. #{q_pids.size} running. Starting #{starting}"
161
- @number_of_workers = q_pids.size
162
- add_worker(starting)
163
- else
164
-
165
- error "Expected #{@number_of_workers} workers. #{q_pids.size} running."
166
- @number_of_workers = q_pids.size
157
+ if @workers_requested - worker_pids.size != 0
158
+ restarting = @workers_requested - worker_pids.size
159
+ warn "RESTART MODE: Expected #{@number_of_workers} workers. #{worker_pids.size} running. #{restarting} are still restarting"
160
+ else
161
+ warn "RESTART MODE: Expected #{@number_of_workers} workers. #{worker_pids.size} running."
162
+ end
163
+ @workers_restarting = @workers_requested - worker_pids.size
164
+
165
+ elsif worker_pids.size != @number_of_workers
166
+ starting = 0
167
+ if worker_pids.size.to_f / @workers_requested.to_f < 0.85
168
+ starting = @workers_requested - worker_pids.size
169
+ error "Expected #{@number_of_workers} workers. #{worker_pids.size} running. Starting #{starting}"
170
+ @number_of_workers = worker_pids.size
171
+ add_worker(starting)
172
+ else
173
+
174
+ error "Expected #{@number_of_workers} workers. #{worker_pids.size} running."
175
+ @number_of_workers = worker_pids.size
167
176
  end
168
177
  end
169
178
  end
170
-
171
- def worker_shutdown(worker_queue)
172
- q_pids = worker_queue_pids(worker_queue) || []
179
+
180
+ def worker_shutdown
173
181
  if not @masters_dead
174
- workers_to_kill = worker_queue.select do |w|
175
- w.map_or_reduce == "master" and @workers_running.include?(w.process_id)
176
- end
177
- warn "Shutting down masters. #{q_pids.size} workers still running." if q_pids.size > 0
182
+ workers_to_kill = active_workers.select do |w|
183
+ w.map_or_reduce == "master" and active_workers.detect{|status| status.process_id == w.process_id and worker_alive?(w.process_id)}
184
+ end
185
+ warn "Shutting down masters. #{worker_pids.size} workers still running." if worker_pids.size > 0
178
186
 
179
187
  worker_pids_to_kill = workers_to_kill.collect { |w| w.process_id }
180
188
  if worker_pids_to_kill and not worker_pids_to_kill.empty?
181
- warn "FOUND MORE RUNNING MASTERS WE HAVEN'T KILLED:", worker_pids_to_kill
182
- remove_worker(worker_pids_to_kill)
189
+ warn "FOUND MORE RUNNING MASTERS WE HAVEN'T KILLED:", worker_pids_to_kill
190
+ remove_worker(worker_pids_to_kill)
183
191
  end
184
192
 
185
- if not worker_queue.detect { |w| w.map_or_reduce == "master" }
186
- signal_workers("INT")
193
+ if not active_workers.detect { |w| w.map_or_reduce == "master" }
194
+ signal_workers("TERM")
187
195
  @masters_dead = true
188
- sleep 1
189
- return check_number_of_workers()
190
196
  else
191
- sleep 4
192
- return check_number_of_workers()
197
+ return check_number_of_workers
193
198
  end
194
- else
195
- warn "Shutting down. #{q_pids.size} workers still running." if q_pids.size > 0
196
199
  end
197
- if q_pids.size < 1
200
+ if worker_pids.size < 1
198
201
  info "No more workers running."
199
- end
200
- end
201
-
202
- def take_worker_status(worker_process_id)
203
- begin
204
- mq.take_worker_status({
205
- :hostname => hostname,
206
- :process_id => worker_process_id
207
- },0.00001)
208
- rescue Skynet::QueueTimeout => e
209
- error "Couldnt take worker status for #{hostname} #{worker_process_id}"
202
+ else
203
+ warn "Shutting down. #{worker_pids.size} workers still running." if worker_pids.size > 0
210
204
  end
211
- end
212
-
205
+ end
206
+
213
207
  def worker_alive?(worker_pid)
214
- begin
215
- IO.popen("ps -o pid,command -p #{worker_pid}", "r") do |ps|
216
- return ps.detect {|line| line =~ /worker_type/}
217
- end
218
- rescue Errno::ENOENT => e
219
- return false
220
- end
221
- false
222
- end
223
-
208
+ Skynet.process_alive?(worker_pid)
209
+ end
224
210
 
225
211
  def add_workers(*args)
226
212
  add_worker(*args)
227
213
  end
228
-
214
+
229
215
  def add_worker(workers=1)
230
216
  num_task_only_workers = (workers * Skynet::CONFIG[:PERCENTAGE_OF_TASK_ONLY_WORKERS]).to_i
231
217
  num_master_only_workers = (workers * Skynet::CONFIG[:PERCENTAGE_OF_MASTER_ONLY_WORKERS]).to_i
232
218
  warn "Adding #{workers} WORKERS. Task Workers: #{num_task_only_workers}, Master Workers: #{num_master_only_workers} Master & Task Workers: #{workers - num_task_only_workers - num_master_only_workers}"
233
-
219
+
234
220
  @all_workers_started = false
235
221
  worker_types = {:task => 0, :master => 0, :any => 0}
236
222
  (1..workers).collect do |ii|
237
223
  worker_type = :any
238
- if (ii <= num_master_only_workers)
239
- worker_type = :master
224
+ if (ii <= num_master_only_workers)
225
+ worker_type = :master
240
226
  worker_types[:master] += 1
241
227
  elsif (ii > num_master_only_workers and ii <= num_master_only_workers + num_task_only_workers)
242
228
  worker_type = :task
243
229
  worker_types[:task] += 1
244
230
  else
245
231
  worker_types[:any] += 1
246
- end
232
+ end
247
233
  cmd = "#{@script_path} --worker_type=#{worker_type}"
234
+ cmd << " --config='#{Skynet::CONFIG[:CONFIG_FILE]}'" if Skynet::CONFIG[:CONFIG_FILE]
248
235
  cmd << " --queue_id=#{queue_id}"
249
236
  cmd << " -r #{required_libs.join(' -r ')}" if required_libs and not required_libs.empty?
250
- wpid = self.fork_and_exec(cmd)
237
+ wpid = Skynet.fork_and_exec(cmd)
238
+ Skynet.close_console
251
239
  @workers_by_type[worker_type] ||= []
252
240
  @workers_by_type[worker_type] << wpid
253
241
  warn "Adding Worker ##{ii} PID: #{wpid} QUEUE: #{queue_id}, WORKER_TYPE?:#{worker_type}"
254
242
  @mutex.synchronize do
255
243
  @number_of_workers += 1
256
244
  end
257
- @workers_running[wpid] = Time.now
258
245
  sleep 0.01
259
246
  wpid
260
- end
261
- info "DISTRO", worker_types
262
- check_started_workers
247
+ end
248
+ info "Worker Distribution", worker_types
249
+ check_started_workers
263
250
  end
264
-
251
+
265
252
  def remove_workers(workers=1)
266
- pids = worker_queue_pids[0...workers]
253
+ pids = worker_pids[0...workers]
267
254
  remove_worker(pids)
268
255
  end
269
256
 
270
257
  def remove_worker(pids = nil)
271
258
  pids = [pids] unless pids.kind_of?(Array)
272
259
  info "Removing workers #{pids.join(",")} from worker queue. They will die gracefully when they finish what they're doing."
273
- wq = worker_queue
274
260
  pids.collect do |wpid|
275
- @workers_running.delete(wpid)
261
+ Process.kill("INT",wpid)
262
+ mark_worker_as_stopped(wpid)
276
263
  @number_of_workers -= 1
277
- @workers_running.delete(wpid)
278
264
  warn "REMOVING WORKER #{wpid}"
279
265
  @signaled_workers << wpid
280
- Process.kill("INT",wpid)
281
- end
266
+ end
282
267
  pids
283
268
  end
284
269
 
285
- def signal_workers(signal,worker_type=nil)
286
- worker_queue.each do |worker|
287
- next if worker_type and not @workers_by_type[worker_type].include?(worker.process_id)
288
- warn "SHUTTING DOWN #{worker.process_id} MR: #{worker.map_or_reduce}"
289
- @workers_running.delete(worker.process_id)
290
- Process.kill(signal,worker.process_id)
291
- @signaled_workers << worker.process_id
292
- end
293
- end
294
-
295
- def restart_all_workers
296
- hostnames = {}
297
- mq.read_all_worker_statuses.each do |status|
298
- hostnames[status.hostname] = true
270
+ def mark_worker_as_stopped(wpid)
271
+ worker = @worker_queue.values.detect {|status| status.process_id == wpid}
272
+ if worker and not worker_alive?(wpid)
273
+ @worker_queue.delete_if {|worker_id, status| status.process_id == wpid }
274
+ worker_pids.delete(worker.process_id)
275
+ worker.started_at = Time.now.to_f
276
+ worker.process_id = nil
299
277
  end
300
- hostnames.keys.each do |remote_hostname|
301
- manager = DRbObject.new(nil,"druby://#{remote_hostname}:40000")
302
- manager.restart_workers
278
+ end
279
+
280
+ def signal_workers(signal,worker_type=[])
281
+ worker_types = [worker_type].flatten
282
+ active_workers.each do |worker|
283
+ worker_types.each do |worker_type|
284
+ if worker_type == :idle
285
+ next if worker_type and worker.task_id
286
+ else
287
+ next if worker_type and not @workers_by_type[worker_type].include?(worker.process_id)
288
+ end
289
+ end
290
+ warn "SHUTTING DOWN #{worker.process_id} MR: #{worker.map_or_reduce} SIG: #{signal}"
291
+ begin
292
+ Process.kill(signal,worker.process_id)
293
+ rescue Errno::ESRCH
294
+ warn "Tried to kill a process that didn't exist #{worker.process_id}"
295
+ end
296
+ # mark_worker_as_stopped(worker.process_id)
297
+ @signaled_workers << worker.process_id
303
298
  end
304
299
  end
305
-
300
+
306
301
  def hard_restart_workers
307
302
  @all_workers_started = false
308
303
  signal_workers("TERM")
@@ -319,34 +314,33 @@ class Skynet
319
314
  def restart_worker(wpid)
320
315
  info "RESTARTING WORKER #{wpid}"
321
316
  @mutex.synchronize do
322
- @workers_running.delete(wpid)
317
+ Process.kill("HUP",wpid)
318
+ mark_worker_as_stopped(wpid)
323
319
  @workers_restarting += 1
324
320
  end
325
- Process.kill("HUP",wpid)
326
321
  sleep Skynet::CONFIG[:WORKER_CHECK_DELAY]
327
322
  end
328
323
 
329
324
  def restart_workers
330
325
  @all_workers_started = false
331
326
  signal_workers("HUP")
332
- @workers_running = {}
333
327
  sleep @number_of_workers
334
328
  check_started_workers
335
329
  end
336
330
 
337
331
  def setup_signals
338
- Signal.trap("HUP") do
332
+ Signal.trap("HUP") do
339
333
  restart_workers
340
334
  end
341
335
  Signal.trap("TERM") do
342
- if @term
336
+ if @term
343
337
  terminate
344
338
  else
345
339
  @term=true
346
340
  shutdown
347
341
  end
348
342
  end
349
-
343
+
350
344
  Signal.trap("INT") do
351
345
  if @shutdown
352
346
  terminate
@@ -359,116 +353,210 @@ class Skynet
359
353
  def shutdown
360
354
  info(:shutdown)
361
355
  @shutdown = true
362
- signal_workers("INT",:master)
363
- signal_workers("INT",:any)
356
+ signal_workers("TERM",[:idle,:master,:any])
364
357
  end
365
358
 
366
- def terminate
367
- info(:terminate)
368
- signal_workers("TERM")
359
+ def terminate
360
+ info(:terminate)
361
+ signal_workers("KILL")
362
+ sleep 1
369
363
  exit
370
364
  end
371
365
 
372
- def fork_and_exec(command)
373
- pid = fork do
374
- exec("/bin/sh -c \"#{command}\"")
375
- exit
366
+ def save_worker_queue_to_file
367
+ debug "Writing worker queue to file #{Skynet.config.manager_statfile_location}"
368
+ File.open(Skynet.config.manager_statfile_location,"w") do |f|
369
+ f.write(YAML.dump(@worker_queue))
376
370
  end
377
- Process.detach(pid) if (pid != 0)
378
- pid
379
371
  end
380
372
 
381
- def mq
382
- @mq ||= Skynet::MessageQueue.new
383
- end
373
+ def load_worker_queue_from_file
374
+ if File.exists?(Skynet.config.manager_statfile_location)
375
+ File.open(Skynet.config.manager_statfile_location,"r") do |f|
376
+ begin
377
+ @worker_queue = YAML.load(f.read)
378
+ raise Error.new("Bad Manager File returned type #{@worker_queue.class}") unless @worker_queue.is_a?(Hash)
379
+ rescue Exception => e
380
+ error "Error loading manager stats file: #{f}", e
381
+ @worker_queue = {}
382
+ save_worker_queue_to_file
383
+ end
384
+ end
385
+ end
386
+ end
384
387
 
385
- def worker_queue
386
- mq.read_all_worker_statuses(hostname)
388
+ def prune_inactive_worker_stats
389
+ @worker_queue.delete_if{|worker_id, worker| !worker.process_id.is_a?(Fixnum) }
390
+ stats
391
+ end
392
+
393
+ def self.stats_for_hosts(manager_hosts=nil)
394
+ manager_hosts ||= Skynet::CONFIG[:MANAGER_HOSTS] || ["localhost"]
395
+ stats = {
396
+ :servers => {},
397
+ :processed => 0,
398
+ :number_of_workers => 0,
399
+ :active_workers => 0,
400
+ :idle_workers => 0,
401
+ :hosts => 0,
402
+ :masters => 0,
403
+ :taskworkers => 0,
404
+ :time => Time.now.to_f
405
+ }
406
+ servers = {}
407
+ manager_hosts.each do |manager_host|
408
+ begin
409
+ manager = DRbObject.new(nil,"druby://#{manager_host}:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}")
410
+ manager_stats = manager.stats
411
+ servers[manager_host] = manager_stats
412
+ manager_stats.each do |key,value|
413
+ next unless value.is_a?(Fixnum)
414
+ stats[key] ||= 0
415
+ stats[key] += value
416
+ end
417
+ rescue DRb::DRbConnError, Errno::ECONNREFUSED => e
418
+ warn "Couldn't get stats from manager at druby://#{manager_host}:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
419
+ end
420
+ end
421
+ stats[:servers] = servers
422
+ stats[:hosts] = manager_hosts
423
+ stats
424
+ end
425
+
426
+ def stats
427
+ started_times = @worker_queue.values.collect{|worker| worker.started_at }.sort
428
+ active_started_times = active_workers.collect{|worker|worker.started_at }.sort
429
+ stats = {
430
+ :hostname => hostname,
431
+ :earliest_update => started_times.first,
432
+ :latest_update => started_times.last,
433
+ :active_earliest_update => active_started_times.first,
434
+ :active_latest_update => active_started_times.last,
435
+ :processed => 0,
436
+ :processed_by_active_workers => 0,
437
+ :number_of_workers => 0,
438
+ :idle_workers => 0,
439
+ :shutdown_workers => 0,
440
+ }
441
+ @worker_queue.values.collect{|worker|stats[:processed] += worker.processed}
442
+ active_workers.collect{|worker|stats[:processed_by_active_workers] += worker.processed}
443
+ currently_active_workers, idle_workers = active_workers.partition{|worker| worker.map_or_reduce }
444
+ stats[:number_of_workers] = active_workers.size
445
+ stats[:active_workers] = currently_active_workers.size
446
+ stats[:idle_workers] = idle_workers.size
447
+ stats[:shutdown_workers] = inactive_workers.size
448
+ stats[:masters] = active_workers.select{|worker|worker.tasktype.to_s == "master"}.size
449
+ stats[:master_or_task_workers] = active_workers.select{|worker|worker.tasktype.to_s == "any"}.size
450
+ stats[:taskworkers] = active_workers.select{|worker|worker.tasktype.to_s == "task"}.size
451
+ stats[:active_masters] = currently_active_workers.select{|worker|worker.tasktype.to_s == "master"}.size
452
+ stats[:active_master_or_task_workers] = currently_active_workers.select{|worker|worker.tasktype.to_s == "any"}.size
453
+ stats[:active_taskworkers] = currently_active_workers.select{|worker|worker.tasktype.to_s == "task"}.size
454
+ stats[:idle_masters] = idle_workers.select{|worker|worker.tasktype.to_s == "master"}.size
455
+ stats[:idle_master_or_task_workers] = idle_workers.select{|worker|worker.tasktype.to_s == "any"}.size
456
+ stats[:idle_taskworkers] = idle_workers.select{|worker|worker.tasktype.to_s == "task"}.size
457
+ stats
458
+ end
459
+
460
+ def active_workers
461
+ @worker_queue.values.select{|status| status.process_id.is_a?(Fixnum) }
462
+ end
463
+
464
+ def inactive_workers
465
+ @worker_queue.values.select{|status| !status.process_id.is_a?(Fixnum) }
387
466
  end
388
-
389
- def worker_queue_pids(worker_queue=self.worker_queue)
390
- worker_queue.collect {|w| w.process_id}
391
- end
392
467
 
393
468
  def worker_pids
394
- worker_queue_pids
395
- end
396
-
469
+ active_workers.collect {|w| w.process_id}
470
+ end
471
+
397
472
  def parent_pid
398
473
  $$
399
474
  end
400
475
 
401
476
  def hostname
402
477
  @machine_name ||= Socket.gethostname
403
- end
478
+ end
404
479
 
405
480
  def ping
406
481
  true
407
482
  end
408
483
 
484
+ def self.local_manager_uri
485
+ "druby://localhost:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
486
+ end
487
+
488
+ def self.get
489
+ DRbObject.new(nil,local_manager_uri)
490
+ end
491
+
409
492
  def self.start(options={})
410
493
  options[:add_workers] ||= nil
411
494
  options[:remove_workers] ||= nil
412
495
  options[:use_rails] ||= false
413
496
  options[:required_libs] ||= []
414
- options[:workers] ||= Skynet::CONFIG[:NUMBER_OF_WORKERS] || 4
415
- options[:pid_file] ||= Skynet::CONFIG[:SKYNET_PIDS_FILE]
416
- options[:script_path] ||= Skynet::CONFIG[:LAUNCHER_PATH]
417
-
497
+
418
498
  config = Skynet::Config.new
419
499
 
420
500
  OptionParser.new do |opt|
421
- opt.banner = %{Usage:
501
+ opt.banner = %{Usage:
422
502
  > skynet [options]
423
503
 
504
+ OR to daemonize
505
+
506
+ > skynet [options] start
507
+ > skynet stop
508
+
424
509
  You can also run:
425
510
  > skynet console [options]
426
511
  }
427
- opt.on('', '--restart-all-workers', 'Restart All Workers') do |v|
512
+ opt.on('--restart-all-workers', 'Restart All Workers') do |v|
428
513
  puts "Restarting ALL workers on ALL machines."
429
514
  begin
430
- manager = DRbObject.new(nil, Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL])
515
+ manager = self.get
431
516
  manager.restart_all_workers
432
517
  exit
433
518
  rescue DRb::DRbConnError => e
434
- puts "No manager running at #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL]} ERROR: #{e.inspect}"
519
+ puts "No manager running at #{local_manager_uri} ERROR: #{e.inspect}"
435
520
  exit
436
521
  end
437
522
  end
438
- opt.on('', '--restart-workers', 'Restart Workers') do |v|
523
+ opt.on('--restart-workers', 'Restart Workers') do |v|
439
524
  puts "Restarting workers on this machine."
440
525
  begin
441
- manager = DRbObject.new(nil, Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL])
526
+ manager = self.get
442
527
  manager.restart_workers
443
528
  exit
444
529
  rescue DRb::DRbConnError => e
445
- puts "No manager running at #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL]} ERROR: #{e.inspect}"
530
+ puts "No manager running at #{local_manager_uri} ERROR: #{e.inspect}"
446
531
  exit
447
532
  end
448
533
  end
449
- opt.on('-i', '--increment-worker-version', 'Increment Worker Version') do |v|
534
+ opt.on('--increment-worker-version', 'Increment Worker Version') do |v|
450
535
  ver = Skynet::MessageQueue.new.increment_worker_version
451
536
  puts "Incrementing Worker Version to #{ver}"
452
537
  exit
453
538
  end
454
- opt.on('-a', '--add-workers WORKERS', 'Number of workers to add.') do |v|
539
+ opt.on('--add-workers=WORKERS', 'Number of workers to add.') do |v|
455
540
  options[:add_workers] = v.to_i
456
541
  end
457
- opt.on('-k', '--remove-workers WORKERS', 'Number of workers to remove.') do |v|
542
+ opt.on('--remove-workers=WORKERS', 'Number of workers to remove.') do |v|
458
543
  options[:remove_workers] = v.to_i
459
544
  end
460
- opt.on('-w', '--workers WORKERS', 'Number of workers to start.') do |v|
545
+ opt.on('--workers=WORKERS', 'Number of workers to start.') do |v|
461
546
  options[:workers] = v.to_i
462
- end
547
+ end
463
548
  opt.on('-r', '--required LIBRARY', 'Require the specified libraries') do |v|
464
549
  options[:required_libs] << File.expand_path(v)
465
550
  end
466
- opt.on('-q', '--queue QUEUE_NAME', 'Which queue should these workers use (default "default").') do |v|
551
+ opt.on('--config=CONFIG_FILE', 'Where to find the skynet.rb config file') do |v|
552
+ options[:config_file] = File.expand_path(v)
553
+ end
554
+ opt.on('--queue=QUEUE_NAME', 'Which queue should these workers use (default "default").') do |v|
467
555
  options[:queue] = v
468
- end
469
- opt.on('-i', '--queue_id queue_id', 'Which queue should these workers use (default 0).') do |v|
556
+ end
557
+ opt.on('--queue_id=queue_id', 'Which queue should these workers use (default 0).') do |v|
470
558
  options[:queue_id] = v.to_i
471
- end
559
+ end
472
560
  opt.parse!(ARGV)
473
561
  end
474
562
  if options[:queue]
@@ -477,9 +565,9 @@ class Skynet
477
565
  end
478
566
  options[:queue_id] = config.queue_id_by_name(options[:queue])
479
567
  else
480
- options[:queue_id] ||= 0
568
+ options[:queue_id] ||= 0
481
569
  end
482
-
570
+
483
571
  options[:required_libs].each do |adlib|
484
572
  begin
485
573
  require adlib
@@ -487,12 +575,28 @@ class Skynet
487
575
  error "The included lib #{adlib} was not found: #{e.inspect}"
488
576
  exit
489
577
  end
490
- end
578
+ end
579
+
580
+ options[:config_file] ||= Skynet::CONFIG[:CONFIG_FILE]
581
+ if options[:config_file]
582
+ begin
583
+ require options[:config_file]
584
+ rescue MissingSourceFile => e
585
+ error "The config file at #{options[:config_file]} was not found: #{e.inspect}"
586
+ exit
587
+ end
588
+ elsif Skynet::CONFIG[:SYSTEM_RUNNER]
589
+ error "Config file missing. Please add a config/skynet_config.rb before starting."
590
+ end
591
+
592
+ options[:workers] ||= Skynet::CONFIG[:NUMBER_OF_WORKERS] || 4
593
+ options[:pid_file] ||= Skynet::Config.pidfile_location
594
+ options[:script_path] ||= Skynet::CONFIG[:LAUNCHER_PATH]
491
595
 
492
596
  # Handle add or remove workers
493
597
  if options[:add_workers] or options[:remove_workers]
494
598
  begin
495
- manager = DRbObject.new(nil, Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL])
599
+ manager = self.get
496
600
  if options[:add_workers]
497
601
  pids = manager.add_worker(options[:add_workers])
498
602
  warn "ADDING #{options[:add_workers]} workers PIDS: #{pids.inspect}"
@@ -501,7 +605,7 @@ class Skynet
501
605
  warn "REMOVING #{options[:remove_workers]} workers PIDS: #{pids.inspect}"
502
606
  end
503
607
  rescue DRb::DRbConnError => e
504
- warn "Couldnt add or remove workers. There are probably no workers running. At least I couldn't find a skynet_manager around at #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL]} #{e.inspect}"
608
+ warn "Couldnt add or remove workers. There are probably no workers running. At least I couldn't find a skynet_manager around at #{local_manager_uri} #{e.inspect}"
505
609
  rescue Exception => e
506
610
  warn "Couldnt add or remove workers #{e.inspect} #{e.backtrace.join("\n")}"
507
611
  end
@@ -519,25 +623,84 @@ class Skynet
519
623
 
520
624
  debug "CONTINUING TO START : There IS an available MessageQueue", options
521
625
 
522
- # create main pid file
523
- File.open(options[:pid_file], 'w') do |file|
524
- file.puts($$)
525
- end
626
+ begin
627
+ if oldpid = read_pid_file
628
+ errmsg = nil
629
+ if Skynet.process_alive?(oldpid)
630
+ errmsg = "Another Skynet Manager is running at pid: #{oldpid}"
631
+ warn errmsg
632
+ stderr errmsg
633
+ exit
634
+ else
635
+ errmsg = "Deleting stale pidfile #{Skynet::Config.pidfile_location}"
636
+ warn errmsg
637
+ stderr errmsg
638
+ File.unlink(Skynet::Config.pidfile_location) if File.exist?(Skynet::Config.pidfile_location)
639
+ end
640
+ end
526
641
 
527
- begin
528
- info "STARTING THE MANAGER!!!!!!!!!!!"
529
- @manager = Skynet::Manager.new(options)
530
- DRb.start_service(Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_URL], @manager)
531
- info "WORKER MANAGER URI: #{DRb.uri}"
532
- @manager.start_workers
533
- @manager.run
534
- DRb.thread.join
642
+ printlog "STARTING THE MANAGER!!!!!!!!!!! port: #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
643
+ puts "Starting Skynet..."
644
+ printlog "Skynet Stopped"
645
+ if options["daemonize"]
646
+ Skynet.safefork do
647
+ sess_id = Process.setsid
648
+ write_pid_file
649
+ Skynet.close_console
650
+ run_manager(options)
651
+ exit!
652
+ end
653
+ else
654
+ write_pid_file
655
+ run_manager(options)
656
+ end
535
657
  rescue SystemExit, Interrupt
536
658
  rescue Exception => e
537
- fatal("Error in Manager. Manager Dying. #{e.inspect} #{e.backtrace}")
659
+ fatal("Error in Manager. Manager Dying. #{e.inspect} #{e.backtrace}")
538
660
  end
539
661
  end
540
662
  end
541
663
 
664
+ def self.run_manager(options)
665
+ @manager = Skynet::Manager.new(options)
666
+ @drb_manager = DRb.start_service("druby://:#{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}", @manager)
667
+ @manager.start_workers
668
+ info "MANAGER STARTED ON PORT: #{Skynet::CONFIG[:SKYNET_LOCAL_MANAGER_PORT]}"
669
+ @manager.run
670
+ end
671
+
672
+ # stop the daemon, nicely at first, and then forcefully if necessary
673
+ def self.stop(options = {})
674
+ pid = read_pid_file
675
+ if not pid
676
+ puts "The Skynet Manager is not running. No PID found in #{Skynet::Config.pidfile_location}"
677
+ exit
678
+ end
679
+ $stdout.puts "Stopping Skynet"
680
+ printlog "Stopping Skynet"
681
+ Process.kill("TERM", pid)
682
+ 180.times { Process.kill(0, pid); sleep(1) }
683
+ Process.kill("TERM", pid)
684
+ 180.times { Process.kill(0, pid); sleep(1) }
685
+ $stdout.puts("using kill -9 #{pid}")
686
+ Process.kill("KILL", pid)
687
+ rescue Errno::ESRCH => e
688
+ printlog "Skynet Stopped"
689
+ ensure
690
+ File.unlink(Skynet::Config.pidfile_location) if File.exist?(Skynet::Config.pidfile_location)
691
+ end
692
+
693
+ def self.read_pid_file
694
+ pidfile = Skynet::Config.pidfile_location
695
+ File.read(pidfile).to_i if File.exist?(pidfile)
696
+ end
697
+
698
+ def self.write_pid_file
699
+ pidfile = Skynet::Config.pidfile_location
700
+ info "Writing PIDFILE to #{pidfile}"
701
+ open(pidfile, "w") {|f| f << Process.pid << "\n"}
702
+ at_exit { File.unlink(pidfile) if read_pid_file == Process.pid }
703
+ end
704
+
542
705
  end
543
706
  end