naf 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/.gitignore +16 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +17 -0
  4. data/Gemfile +17 -0
  5. data/LICENSE +2 -0
  6. data/README.rdoc +22 -0
  7. data/RELEASE_NOTES.rdoc +18 -0
  8. data/Rakefile +43 -0
  9. data/app/assets/images/bg-grad.png +0 -0
  10. data/app/assets/images/clock.png +0 -0
  11. data/app/assets/images/control_play_blue.png +0 -0
  12. data/app/assets/images/down_arrow.gif +0 -0
  13. data/app/assets/images/papertrail_job.png +0 -0
  14. data/app/assets/images/papertrail_machine.png +0 -0
  15. data/app/assets/images/papertrail_machine_runner.png +0 -0
  16. data/app/assets/images/terminate.png +0 -0
  17. data/app/assets/images/ui-bg_flat_0_aaaaaa_40x100.png +0 -0
  18. data/app/assets/images/ui-bg_flat_0_ffffff_40x100.png +0 -0
  19. data/app/assets/images/ui-bg_flat_75_ffffff_40x100.png +0 -0
  20. data/app/assets/images/ui-bg_glass_0_f4f4f4_1x400.png +0 -0
  21. data/app/assets/images/ui-bg_glass_55_fbf9ee_1x400.png +0 -0
  22. data/app/assets/images/ui-bg_glass_65_f4f4f4_1x400.png +0 -0
  23. data/app/assets/images/ui-bg_glass_65_ffffff_1x400.png +0 -0
  24. data/app/assets/images/ui-bg_glass_75_dadada_1x400.png +0 -0
  25. data/app/assets/images/ui-bg_glass_75_e6e6e6_1x400.png +0 -0
  26. data/app/assets/images/ui-bg_glass_75_f4f4f4_1x400.png +0 -0
  27. data/app/assets/images/ui-bg_glass_95_fef1ec_1x400.png +0 -0
  28. data/app/assets/images/ui-bg_highlight-soft_0_f4f4f4_1x100.png +0 -0
  29. data/app/assets/images/ui-bg_highlight-soft_75_cccccc_1x100.png +0 -0
  30. data/app/assets/images/ui-icons_222222_256x240.png +0 -0
  31. data/app/assets/images/ui-icons_2e83ff_256x240.png +0 -0
  32. data/app/assets/images/ui-icons_454545_256x240.png +0 -0
  33. data/app/assets/images/ui-icons_888888_256x240.png +0 -0
  34. data/app/assets/images/ui-icons_cd0a0a_256x240.png +0 -0
  35. data/app/assets/images/up_arrow.gif +0 -0
  36. data/app/assets/javascripts/dataTablesTemplates/applications.js +94 -0
  37. data/app/assets/javascripts/dataTablesTemplates/jobs.js +163 -0
  38. data/app/assets/javascripts/dataTablesTemplates/machine_runner_invocations.js +60 -0
  39. data/app/assets/javascripts/dataTablesTemplates/machine_runners.js +82 -0
  40. data/app/assets/javascripts/dataTablesTemplates/machines.js +93 -0
  41. data/app/assets/javascripts/date.js +104 -0
  42. data/app/assets/javascripts/iso8601.js +41 -0
  43. data/app/assets/javascripts/jquery.dataTables.custom.js +62 -0
  44. data/app/assets/javascripts/jquery.dataTables.js +6862 -0
  45. data/app/assets/javascripts/naf.js +30 -0
  46. data/app/assets/javascripts/underscore.js +713 -0
  47. data/app/assets/stylesheets/jquery_ui/jquery-ui-1.8.5.custom.css.erb +572 -0
  48. data/app/assets/stylesheets/min_naf.css +14 -0
  49. data/app/assets/stylesheets/min_naf/layout.css.scss +355 -0
  50. data/app/assets/stylesheets/naf.css +14 -0
  51. data/app/assets/stylesheets/naf/layout.css.scss +497 -0
  52. data/app/controllers/naf/affinities_controller.rb +61 -0
  53. data/app/controllers/naf/application_controller.rb +43 -0
  54. data/app/controllers/naf/application_schedule_affinity_tabs_controller.rb +75 -0
  55. data/app/controllers/naf/applications_controller.rb +153 -0
  56. data/app/controllers/naf/historical_job_affinity_tabs_controller.rb +65 -0
  57. data/app/controllers/naf/historical_jobs_controller.rb +159 -0
  58. data/app/controllers/naf/janitorial_assignments_controller.rb +77 -0
  59. data/app/controllers/naf/logger_names_controller.rb +58 -0
  60. data/app/controllers/naf/logger_styles_controller.rb +59 -0
  61. data/app/controllers/naf/machine_affinity_slots_controller.rb +69 -0
  62. data/app/controllers/naf/machine_runner_invocations_controller.rb +59 -0
  63. data/app/controllers/naf/machine_runners_controller.rb +26 -0
  64. data/app/controllers/naf/machines_controller.rb +95 -0
  65. data/app/helpers/naf/application_helper.rb +275 -0
  66. data/app/models/log4r/papertrail_outputter.rb +19 -0
  67. data/app/models/logical/naf/application.rb +183 -0
  68. data/app/models/logical/naf/construction_zone/ad_hoc_work_order.rb +22 -0
  69. data/app/models/logical/naf/construction_zone/application_schedule_work_order.rb +15 -0
  70. data/app/models/logical/naf/construction_zone/application_work_order.rb +25 -0
  71. data/app/models/logical/naf/construction_zone/boss.rb +123 -0
  72. data/app/models/logical/naf/construction_zone/foreman.rb +53 -0
  73. data/app/models/logical/naf/construction_zone/proletariat.rb +40 -0
  74. data/app/models/logical/naf/construction_zone/work_order.rb +100 -0
  75. data/app/models/logical/naf/create_infrastructure.rb +48 -0
  76. data/app/models/logical/naf/job.rb +357 -0
  77. data/app/models/logical/naf/job_creator.rb +155 -0
  78. data/app/models/logical/naf/job_fetcher.rb +167 -0
  79. data/app/models/logical/naf/job_statuses/errored.rb +27 -0
  80. data/app/models/logical/naf/job_statuses/finished.rb +26 -0
  81. data/app/models/logical/naf/job_statuses/finished_less_minute.rb +25 -0
  82. data/app/models/logical/naf/job_statuses/queued.rb +32 -0
  83. data/app/models/logical/naf/job_statuses/running.rb +34 -0
  84. data/app/models/logical/naf/job_statuses/terminated.rb +25 -0
  85. data/app/models/logical/naf/job_statuses/waiting.rb +43 -0
  86. data/app/models/logical/naf/machine.rb +85 -0
  87. data/app/models/logical/naf/machine_runner.rb +46 -0
  88. data/app/models/logical/naf/machine_runner_invocation.rb +50 -0
  89. data/app/models/logical/naf/pickler.rb +74 -0
  90. data/app/models/logical/naf/unpickler.rb +98 -0
  91. data/app/models/naf/affinity.rb +145 -0
  92. data/app/models/naf/affinity_classification.rb +44 -0
  93. data/app/models/naf/application.rb +100 -0
  94. data/app/models/naf/application_run_group_restriction.rb +39 -0
  95. data/app/models/naf/application_schedule.rb +181 -0
  96. data/app/models/naf/application_schedule_affinity_tab.rb +86 -0
  97. data/app/models/naf/application_schedule_prerequisite.rb +50 -0
  98. data/app/models/naf/application_type.rb +72 -0
  99. data/app/models/naf/by_historical_job_id.rb +86 -0
  100. data/app/models/naf/historical_job.rb +334 -0
  101. data/app/models/naf/historical_job_affinity_tab.rb +61 -0
  102. data/app/models/naf/historical_job_prerequisite.rb +19 -0
  103. data/app/models/naf/janitorial_archive_assignment.rb +36 -0
  104. data/app/models/naf/janitorial_assignment.rb +37 -0
  105. data/app/models/naf/janitorial_create_assignment.rb +36 -0
  106. data/app/models/naf/janitorial_drop_assignment.rb +36 -0
  107. data/app/models/naf/logger_level.rb +21 -0
  108. data/app/models/naf/logger_name.rb +23 -0
  109. data/app/models/naf/logger_style.rb +58 -0
  110. data/app/models/naf/logger_style_name.rb +28 -0
  111. data/app/models/naf/machine.rb +257 -0
  112. data/app/models/naf/machine_affinity_slot.rb +78 -0
  113. data/app/models/naf/machine_runner.rb +51 -0
  114. data/app/models/naf/machine_runner_invocation.rb +71 -0
  115. data/app/models/naf/naf_base.rb +9 -0
  116. data/app/models/naf/queued_job.rb +164 -0
  117. data/app/models/naf/running_job.rb +80 -0
  118. data/app/models/process/naf/application.rb +164 -0
  119. data/app/models/process/naf/janitor.rb +117 -0
  120. data/app/models/process/naf/machine_manager.rb +150 -0
  121. data/app/models/process/naf/machine_upgrader.rb +112 -0
  122. data/app/models/process/naf/runner.rb +539 -0
  123. data/app/views/naf/affinities/_form.html.erb +50 -0
  124. data/app/views/naf/affinities/edit.html.erb +11 -0
  125. data/app/views/naf/affinities/index.html.erb +57 -0
  126. data/app/views/naf/affinities/new.html.erb +15 -0
  127. data/app/views/naf/affinities/show.html.erb +48 -0
  128. data/app/views/naf/application_schedule_affinity_tabs/_form.html.erb +31 -0
  129. data/app/views/naf/application_schedule_affinity_tabs/edit.html.erb +12 -0
  130. data/app/views/naf/application_schedule_affinity_tabs/new.html.erb +11 -0
  131. data/app/views/naf/applications/_application_schedule.html.erb +80 -0
  132. data/app/views/naf/applications/_application_schedule_prerequisites.html.erb +14 -0
  133. data/app/views/naf/applications/_form.html.erb +109 -0
  134. data/app/views/naf/applications/_search_container.html.erb +94 -0
  135. data/app/views/naf/applications/_show.html.erb +34 -0
  136. data/app/views/naf/applications/edit.html.erb +11 -0
  137. data/app/views/naf/applications/index.html.erb +51 -0
  138. data/app/views/naf/applications/index.json.erb +11 -0
  139. data/app/views/naf/applications/new.html.erb +11 -0
  140. data/app/views/naf/applications/show.html.erb +203 -0
  141. data/app/views/naf/datatable.html.erb +49 -0
  142. data/app/views/naf/historical_job_affinity_tabs/_form.html.erb +36 -0
  143. data/app/views/naf/historical_job_affinity_tabs/edit.html.erb +11 -0
  144. data/app/views/naf/historical_job_affinity_tabs/new.html.erb +11 -0
  145. data/app/views/naf/historical_jobs/_form.html.erb +94 -0
  146. data/app/views/naf/historical_jobs/_runners.html.erb +22 -0
  147. data/app/views/naf/historical_jobs/_search_container.html.erb +140 -0
  148. data/app/views/naf/historical_jobs/edit.html.erb +11 -0
  149. data/app/views/naf/historical_jobs/index.html.erb +48 -0
  150. data/app/views/naf/historical_jobs/index.json.erb +26 -0
  151. data/app/views/naf/historical_jobs/new.html.erb +61 -0
  152. data/app/views/naf/historical_jobs/show.html.erb +201 -0
  153. data/app/views/naf/janitorial_assignments/_form.html.erb +38 -0
  154. data/app/views/naf/janitorial_assignments/_rows.html.erb +17 -0
  155. data/app/views/naf/janitorial_assignments/edit.html.erb +11 -0
  156. data/app/views/naf/janitorial_assignments/index.html.erb +56 -0
  157. data/app/views/naf/janitorial_assignments/index.js.erb +1 -0
  158. data/app/views/naf/janitorial_assignments/new.html.erb +11 -0
  159. data/app/views/naf/layouts/jquery_datatables.json.erb +6 -0
  160. data/app/views/naf/logger_names/_form.html.erb +18 -0
  161. data/app/views/naf/logger_names/edit.html.erb +11 -0
  162. data/app/views/naf/logger_names/new.html.erb +11 -0
  163. data/app/views/naf/logger_names/show.html.erb +44 -0
  164. data/app/views/naf/logger_styles/_form.html.erb +30 -0
  165. data/app/views/naf/logger_styles/_logger_style_names.html.erb +19 -0
  166. data/app/views/naf/logger_styles/edit.html.erb +11 -0
  167. data/app/views/naf/logger_styles/new.html.erb +11 -0
  168. data/app/views/naf/logger_styles/show.html.erb +48 -0
  169. data/app/views/naf/machine_affinity_slots/_form.html.erb +36 -0
  170. data/app/views/naf/machine_affinity_slots/edit.html.erb +11 -0
  171. data/app/views/naf/machine_affinity_slots/new.html.erb +11 -0
  172. data/app/views/naf/machine_runner_invocations/_filter.html.erb +21 -0
  173. data/app/views/naf/machine_runner_invocations/index.html.erb +36 -0
  174. data/app/views/naf/machine_runner_invocations/index.json.erb +16 -0
  175. data/app/views/naf/machine_runner_invocations/show.html.erb +91 -0
  176. data/app/views/naf/machine_runners/index.html.erb +82 -0
  177. data/app/views/naf/machine_runners/index.json.erb +16 -0
  178. data/app/views/naf/machine_runners/show.html.erb +113 -0
  179. data/app/views/naf/machines/_filter.html.erb +26 -0
  180. data/app/views/naf/machines/_form.html.erb +62 -0
  181. data/app/views/naf/machines/_show.html.erb +169 -0
  182. data/app/views/naf/machines/edit.html.erb +11 -0
  183. data/app/views/naf/machines/index.html.erb +51 -0
  184. data/app/views/naf/machines/index.json.erb +23 -0
  185. data/app/views/naf/machines/new.html.erb +11 -0
  186. data/app/views/naf/machines/show.html.erb +92 -0
  187. data/app/views/naf/record.html.erb +46 -0
  188. data/app/views/naf/shared/_application.html.erb +50 -0
  189. data/app/views/naf/shared/_information_container.html.erb +19 -0
  190. data/app/views/naf/shared/_select_per_page.html.erb +72 -0
  191. data/ci/test-build.sh +17 -0
  192. data/ci/travis.sh +26 -0
  193. data/config/initializers/naf.rb +3 -0
  194. data/config/routes.rb +38 -0
  195. data/db/migrate/20120820023848_naf_schema.rb +413 -0
  196. data/doc/README_FOR_APP +2 -0
  197. data/lib/generators/naf_generator.rb +45 -0
  198. data/lib/generators/templates/config/logging/af.yml +26 -0
  199. data/lib/generators/templates/config/logging/naf.yml +22 -0
  200. data/lib/generators/templates/config/logging/nafjob.yml +16 -0
  201. data/lib/generators/templates/config/logging/nafrunner.yml +17 -0
  202. data/lib/generators/templates/naf.rb +11 -0
  203. data/lib/generators/templates/naf_layout.html.erb +15 -0
  204. data/lib/naf.rb +48 -0
  205. data/lib/naf/configuration.rb +23 -0
  206. data/lib/naf/engine.rb +18 -0
  207. data/lib/naf/version.rb +3 -0
  208. data/lib/tasks/naf_tasks.rake +370 -0
  209. data/naf.gemspec +30 -0
  210. data/script/rails +10 -0
  211. data/spec/controllers/naf/affinities_controller_spec.rb +79 -0
  212. data/spec/controllers/naf/application_controller_spec.rb +10 -0
  213. data/spec/controllers/naf/application_schedule_affinity_tabs_controller_spec.rb +106 -0
  214. data/spec/controllers/naf/applications_controller_spec.rb +109 -0
  215. data/spec/controllers/naf/historical_job_affinity_tabs_controller_spec.rb +96 -0
  216. data/spec/controllers/naf/historical_jobs_controller_spec.rb +19 -0
  217. data/spec/controllers/naf/machine_affinity_slots_controller_spec.rb +109 -0
  218. data/spec/controllers/naf/machines_controller_spec.rb +74 -0
  219. data/spec/dummy/.gitignore +12 -0
  220. data/spec/dummy/README +19 -0
  221. data/spec/dummy/Rakefile +7 -0
  222. data/spec/dummy/app/assets/javascripts/application.js +16 -0
  223. data/spec/dummy/app/assets/stylesheets/application.css +14 -0
  224. data/spec/dummy/app/controllers/application_controller.rb +3 -0
  225. data/spec/dummy/app/helpers/application_helper.rb +2 -0
  226. data/spec/dummy/app/models/my_script.rb +8 -0
  227. data/spec/dummy/app/models/other/base.rb.sample +10 -0
  228. data/spec/dummy/app/views/layouts/application.html.erb +15 -0
  229. data/spec/dummy/app/views/layouts/naf_layout.html.erb +15 -0
  230. data/spec/dummy/config.ru +4 -0
  231. data/spec/dummy/config/application.rb +62 -0
  232. data/spec/dummy/config/boot.rb +10 -0
  233. data/spec/dummy/config/database-non_primary.yml +20 -0
  234. data/spec/dummy/config/database-primary.yml +16 -0
  235. data/spec/dummy/config/environment.rb +5 -0
  236. data/spec/dummy/config/environments/development.rb +37 -0
  237. data/spec/dummy/config/environments/production.rb +67 -0
  238. data/spec/dummy/config/environments/test.rb +37 -0
  239. data/spec/dummy/config/initializers/backtrace_silencers.rb +7 -0
  240. data/spec/dummy/config/initializers/inflections.rb +15 -0
  241. data/spec/dummy/config/initializers/mime_types.rb +5 -0
  242. data/spec/dummy/config/initializers/naf.rb.non_primary +4 -0
  243. data/spec/dummy/config/initializers/naf.rb.primary +3 -0
  244. data/spec/dummy/config/initializers/secret_token.rb +7 -0
  245. data/spec/dummy/config/initializers/session_store.rb +8 -0
  246. data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
  247. data/spec/dummy/config/locales/en.yml +5 -0
  248. data/spec/dummy/config/logging/af.yml +26 -0
  249. data/spec/dummy/config/logging/naf.yml +22 -0
  250. data/spec/dummy/config/logging/nafjob.yml +16 -0
  251. data/spec/dummy/config/logging/nafrunner.yml +17 -0
  252. data/spec/dummy/config/routes.rb +5 -0
  253. data/spec/dummy/db/.gitignore +2 -0
  254. data/spec/dummy/lib/tasks/dummy.rake +60 -0
  255. data/spec/dummy/public/404.html +26 -0
  256. data/spec/dummy/public/422.html +26 -0
  257. data/spec/dummy/public/500.html +25 -0
  258. data/spec/dummy/public/favicon.ico +0 -0
  259. data/spec/dummy/script/rails +6 -0
  260. data/spec/factories/naf.rb +433 -0
  261. data/spec/helpers/naf/application_helper_spec.rb +0 -0
  262. data/spec/models/logical/naf/application_spec.rb +69 -0
  263. data/spec/models/logical/naf/job_creator_spec.rb +32 -0
  264. data/spec/models/logical/naf/job_fetcher_spec.rb +140 -0
  265. data/spec/models/logical/naf/job_spec.rb +282 -0
  266. data/spec/models/logical/naf/machine_spec.rb +61 -0
  267. data/spec/models/naf/affinity_classification_spec.rb +56 -0
  268. data/spec/models/naf/affinity_spec.rb +100 -0
  269. data/spec/models/naf/application_run_group_restriction_spec.rb +57 -0
  270. data/spec/models/naf/application_schedule_affinity_tab_spec.rb +85 -0
  271. data/spec/models/naf/application_schedule_prerequisite_spec.rb +35 -0
  272. data/spec/models/naf/application_schedule_spec.rb +166 -0
  273. data/spec/models/naf/application_spec.rb +128 -0
  274. data/spec/models/naf/application_type_spec.rb +104 -0
  275. data/spec/models/naf/historical_job_affinity_tab_spec.rb +59 -0
  276. data/spec/models/naf/historical_job_prerequisite_spec.rb +25 -0
  277. data/spec/models/naf/historical_job_spec.rb +334 -0
  278. data/spec/models/naf/logger_level_spec.rb +34 -0
  279. data/spec/models/naf/logger_name_spec.rb +35 -0
  280. data/spec/models/naf/logger_style_name_spec.rb +39 -0
  281. data/spec/models/naf/logger_style_spec.rb +89 -0
  282. data/spec/models/naf/machine_affinity_slot_spec.rb +77 -0
  283. data/spec/models/naf/machine_runner_invocation_spec.rb +38 -0
  284. data/spec/models/naf/machine_runner_spec.rb +37 -0
  285. data/spec/models/naf/machine_spec.rb +425 -0
  286. data/spec/models/naf/naf_base_spec.rb +14 -0
  287. data/spec/models/naf/queued_job_spec.rb +171 -0
  288. data/spec/models/naf/running_job_spec.rb +107 -0
  289. data/spec/models/process/naf/application_spec.rb +8 -0
  290. data/spec/models/process/naf/janitor_spec.rb +10 -0
  291. data/spec/models/process/naf/runner_spec.rb +10 -0
  292. data/spec/spec_helper.rb +32 -0
  293. data/spec/support/engine_routing.rb +27 -0
  294. data/spec/support/script_spec_helper.rb +58 -0
  295. metadata +590 -0
@@ -0,0 +1,539 @@
1
+ require 'timeout'
2
+
3
+ module Process::Naf
4
+ class Runner < ::Af::Application
5
+
6
+ #----------------
7
+ # *** Options ***
8
+ #+++++++++++++++++
9
+
10
# Command-line options for the runner. The methods of this class read each
# value through an instance variable of the same name (e.g. @loop_sleep_time,
# @server_address); presumably `opt` is provided by ::Af::Application -- verify.
opt :wait_time_for_processes_to_terminate,
    "time between asking processes to terminate and sending kill signals",
    argument_note: "SECONDS",
    default: 120
opt :check_schedules_period,
    "time between checking schedules",
    argument_note: "MINUTES",
    default: 1
opt :schedule_fudge_scale,
    "amount of time to look back in schedule for run_start_minute schedules (scaled to --check-schedule-period)",
    default: 5
opt :runner_stale_period,
    "amount of time to consider a machine out of touch if it hasn't updated its machine entry",
    argument_note: "MINUTES",
    default: 10
opt :loop_sleep_time,
    "runner main loop sleep time",
    argument_note: "SECONDS",
    default: 30
opt :server_address,
    "set the machines server address (dangerous)",
    type: :string,
    default: ::Naf::Machine.machine_ip_address,
    hidden: true
opt :minimum_memory_free,
    "percentage of memory free below which will limit process spawning",
    default: 15.0,
    argument_note: "PERCENT"
opt :disable_gc_modifications,
    "don't modify ruby GC parameters",
    default: false
opt :kill_all_runners,
    "don't wait for runners to wind down and finish running their jobs",
    default: false
44
+
45
# Sets up the runner: declares the default list of logging configuration
# files (a plain and a per-Rails-environment file for each base name) and
# resets the remembered machine log level.
def initialize
  super
  base_names = ["af", "naf", "nafrunner", af_name]
  configuration_files = base_names.flat_map do |base_name|
    ["#{base_name}.yml", "#{base_name}-#{Rails.env}.yml"]
  end
  opt :log_configuration_files, default: configuration_files
  @last_machine_log_level = nil
end
57
+
58
# Entry point of the runner process.
#
# Looks up the ::Naf::Machine row for @server_address and exits with status 1
# when there is none. While holding the machine's runner lock it flags the
# machine's other runner invocations as winding down (or dead), finds or
# creates the machine runner for the current working directory and records a
# new invocation for this process together with the git metadata of the
# checkout. The lock is released before the main loop starts; the invocation
# is stamped with dead_at once work_machine returns or raises.
def work
  unless @disable_gc_modifications
    # These configuration changes will help forked processes, not the runner
    ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
    ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
    ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
    ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
  end

  machine = ::Naf::Machine.find_by_server_address(@server_address)

  unless machine.present?
    logger.fatal "This machine is not configured correctly (ipaddress: #{@server_address})."
    logger.fatal "Please update #{::Naf::Machine.table_name} with an entry for this machine."
    logger.fatal "Exiting..."
    exit 1
  end

  machine.lock_for_runner_use
  begin
    # Wind down other runners
    wind_down_other_runner_invocations(machine)

    # Create a machine runner, if it doesn't exist
    machine_runner = ::Naf::MachineRunner.
      find_or_create_by_machine_id_and_runner_cwd(machine_id: machine.id,
                                                  runner_cwd: Dir.pwd)

    # Create an invocation for this runner
    invocation = ::Naf::MachineRunnerInvocation.create!(
      machine_runner_id: machine_runner.id,
      pid: Process.pid,
      repository_name: git_repository_name,
      branch_name: git_command_output('git rev-parse --abbrev-ref HEAD'),
      commit_information: git_command_output('git log --pretty="%H" -n 1'),
      deployment_tag: git_command_output('git describe --abbrev=0 --tag')
    )
  ensure
    machine.unlock_for_runner_use
  end

  begin
    work_machine(machine, invocation)
  ensure
    invocation.dead_at = Time.zone.now
    invocation.save!
  end
end

# Probes every invocation of this machine's runners that has no dead_at yet
# with signal 0: a live pid gets wind_down_at set, a missing pid gets dead_at.
def wind_down_other_runner_invocations(machine)
  machine.machine_runners.each do |machine_runner|
    machine_runner.machine_runner_invocations.each do |invocation|
      next unless invocation.dead_at.blank?

      begin
        retval = Process.kill(0, invocation.pid)
        logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
        invocation.wind_down_at = Time.zone.now
        invocation.save!
      rescue Errno::ESRCH
        logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
        invocation.dead_at = Time.zone.now
        invocation.save!
      end
    end
  end
end

# Runs a git command through the shell and returns its stripped stdout, or nil
# when the command exits non-zero or cannot be started. Success is decided by
# the exit status rather than by searching the output for "fatal", so names
# that merely contain that word are kept and error text is never stored.
def git_command_output(command)
  output = `#{command} 2>/dev/null`
  $?.success? ? output.strip : nil
rescue SystemCallError
  nil
end

# Derives the repository name from the first ":<non-space>..." run in the
# output of `git remote -v`: the first ".git" is removed and the leading ":"
# dropped. Returns nil when git fails or no such run exists.
def git_repository_name
  remote_listing = git_command_output('git remote -v')
  return nil if remote_listing.nil?

  url_path = remote_listing.slice(/:\S+/)
  return nil if url_path.nil?

  url_path.sub('.git', '')[1..-1]
end
138
+
139
# Runs the main loop for this machine until work_machine_loop reports that the
# runner should stop. Marks the machine alive and up first, optionally
# terminates jobs left over on the machine, and registers an at_exit hook that
# triggers emergency_teardown on the application singleton.
def work_machine(machine, invocation)
  machine.mark_alive
  machine.mark_up

  # Make sure no processes are thought to be running on this machine
  if @kill_all_runners
    terminate_old_processes(machine)
  end

  logger.info "working: #{machine}"

  @children = {}

  at_exit do
    ::Af::Application.singleton.emergency_teardown
  end

  @job_fetcher = ::Logical::Naf::JobFetcher.new(machine)

  # One garbage collection pass after every iteration that asks to continue.
  GC.start while work_machine_loop(machine, invocation)

  logger.info "runner quitting"
end
163
+
164
# One iteration of the runner's main loop.
#
# Reloads the machine and invocation, applies a changed machine log level,
# checks schedules (unless the invocation is winding down), reaps exited
# children or sleeps @loop_sleep_time seconds when there are none, and finally
# starts new jobs.
#
# Returns false when the runner should stop: the machine is disabled or marked
# down, or the invocation was asked to wind down and no children remain.
# Returns true otherwise.
def work_machine_loop(machine, invocation)
  machine.reload

  # Check machine status
  if !machine.enabled
    logger.warn "this machine is disabled #{machine}"
    return false
  elsif machine.marked_down
    logger.warn "this machine is marked down #{machine}"
    return false
  end

  machine.mark_alive

  if machine.log_level != @last_machine_log_level
    @last_machine_log_level = machine.log_level
    unless @last_machine_log_level.blank?
      logging_configurator.parse_and_set_logger_levels(@last_machine_log_level)
    end
  end

  invocation.reload
  if invocation.wind_down_at.present?
    logger.warn "invocation asked to wind down"
    return false if @children.length == 0
  end

  check_schedules(machine) if invocation.wind_down_at.blank?

  # clean up children that have exited
  logger.detail "cleaning up dead children: #{@children.length}"

  if @children.length > 0
    reap_exited_children(machine)
  else
    logger.detail "sleeping in loop: #{@loop_sleep_time} seconds"
    sleep(@loop_sleep_time)
  end

  start_new_jobs(machine, invocation)

  return true
end

# Waits for children to exit and cleans each one up, until no children remain
# or a wait of @loop_sleep_time seconds times out.
def reap_exited_children(machine)
  while @children.length > 0
    pid = nil
    status = nil
    begin
      Timeout::timeout(@loop_sleep_time) do
        pid, status = Process.waitpid2(-1)
      end
    rescue Timeout::Error
      # XXX is there a race condition where a child process exits
      # XXX has not set pid or status yet and timeout fires?
      # XXX i bet there is
      # XXX so this code is here:
      dead_children = @children.values.reject { |child| is_job_process_alive?(child) }

      unless dead_children.blank?
        logger.error "#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}"
        logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
      end

      break
    rescue Errno::ECHILD => e
      logger.error "#{machine} No child when we thought we had children #{@children.inspect}"
      logger.warn e
      pid = @children.first.try(:first)
      status = nil
      logger.warn "pulling first child off list to clean it up: pid=#{pid}"
    end

    clean_up_exited_child(pid, status) if pid
  end
end

# Removes pid from @children and finishes its job when the status says it
# exited or was signaled (or when no status is available). ActiveRecord errors
# propagate; any other StandardError is logged and swallowed.
def clean_up_exited_child(pid, status)
  child_job = @children.delete(pid)

  if child_job.present?
    # Update job tags
    child_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])

    if status.nil? || status.exited? || status.signaled?
      logger.info { "cleaning up dead child: #{child_job.reload}" }
      finish_job(child_job,
                 { exit_status: (status && status.exitstatus), termination_signal: (status && status.termsig) })
    else
      # this can happen if the child is sigstopped
      logger.warn "child waited for did not exit: #{child_job}, status: #{status.inspect}"
    end
  else
    # XXX ERROR no child for returned pid -- this can't happen
    logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
  end
rescue ActiveRecord::ActiveRecordError
  raise
rescue StandardError => e
  # XXX just incase a job control failure -- more code here
  logger.error "some failure during child clean up"
  logger.warn e
end

# Fetches and spawns jobs while the machine has fewer running jobs than its
# thread_pool_size, memory_available_to_spawn? holds and the invocation is not
# winding down. ActiveRecord errors propagate; any other StandardError raised
# while starting a job is logged and the loop continues.
def start_new_jobs(machine, invocation)
  logger.detail "starting new jobs, num children: #{@children.length}/#{machine.thread_pool_size}"
  # XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? && invocation.wind_down_at.blank?
  while ::Naf::RunningJob.where(:started_on_machine_id => machine.id).count < machine.thread_pool_size &&
        memory_available_to_spawn? && invocation.wind_down_at.blank?

    logger.debug_gross "fetching jobs because: children: #{@children.length} < #{machine.thread_pool_size} (poolsize)"
    begin
      running_job = @job_fetcher.fetch_next_job

      unless running_job.present?
        logger.debug_gross "no more jobs to run"
        break
      end

      logger.info "starting new job : #{running_job}"

      pid = running_job.historical_job.spawn
      if pid
        @children[pid] = running_job
        running_job.pid = pid
        running_job.historical_job.pid = pid
        running_job.historical_job.failed_to_start = false
        running_job.historical_job.machine_runner_invocation_id = invocation.id
        logger.info "job started : #{running_job}"
        running_job.save!
        running_job.historical_job.save!
      else
        # should never get here (well, hopefully)
        logger.error "#{machine}: failed to execute #{running_job}"

        finish_job(running_job, { failed_to_start: true })
      end
    rescue ActiveRecord::ActiveRecordError
      raise
    rescue StandardError => e
      # XXX rescue for various issues
      logger.error "#{machine}: failure during job start"
      logger.warn e
    end
  end
  logger.debug_gross "done starting jobs"
end
312
+
313
# When the schedule-check period has elapsed and the schedules lock can be
# taken, records the check on this machine, releases the lock, enqueues every
# application schedule returned by should_be_queued and marks stale runner
# machines as down. A schedule whose enqueueing raises JobPrerequisiteLoop is
# disabled and an alarm is logged.
def check_schedules(machine)
  return unless ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)

  logger.debug "it's time to check schedules"
  return unless ::Naf::ApplicationSchedule.try_lock_schedules

  logger.debug_gross "checking schedules"
  machine.mark_checked_schedule
  ::Naf::ApplicationSchedule.unlock_schedules

  # check scheduled tasks
  should_be_queued(machine).each do |application_schedule|
    logger.info "scheduled application: #{application_schedule}"
    begin
      naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
      # this doesn't work very well for run_group_limits in the thousands
      enqueue_count = application_schedule.application_run_group_limit || 1
      (0...enqueue_count).each do
        naf_boss.enqueue_application_schedule(application_schedule)
      end
    rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
      logger.error "#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}"
      logger.warn jpl
      application_schedule.enabled = false
      application_schedule.save!
      logger.alarm "Application Schedule disabled due to loop: #{application_schedule}"
    end
  end

  # check the runner machines
  ::Naf::Machine.enabled.up.each do |runner_to_check|
    next unless runner_to_check.is_stale?(@runner_stale_period.minutes)

    logger.alarm "runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}"
    runner_to_check.mark_machine_down(machine)
  end
end
349
+
350
+ # XXX update_all doesn't support "from_partition" so we have this helper
351
# Issues a parameterized UPDATE against the partition table that
# ::Naf::HistoricalJob.partition_table_name reports for historical_job_id.
#
# updates           - Hash of column name => new value. An :updated_at entry
#                     holding the current time is added to a copy; the
#                     caller's hash is left untouched.
# historical_job_id - id of the row to update.
#
# Returns whatever ::Naf::HistoricalJob.find_by_sql returns for the statement.
def update_historical_job(updates, historical_job_id)
  stamped_updates = updates.merge(updated_at: Time.zone.now)
  assignments = stamped_updates.keys.map { |column| "#{column} = ?" }.join(", ")
  update_sql = <<-SQL
    UPDATE
      #{::Naf::HistoricalJob.partition_table_name(historical_job_id)}
    SET
      #{assignments}
    WHERE
      id = ?
  SQL
  ::Naf::HistoricalJob.find_by_sql([update_sql] + stamped_updates.values + [historical_job_id])
end
364
+
365
# Marks a running job as finished: replaces the historical job's tags with the
# cleanup system tag, then -- inside one transaction -- writes `updates` plus
# finished_at to the historical job row and deletes the running job, and
# finally removes the cleanup tag again.
def finish_job(running_job, updates = {})
  cleanup_tag = ::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]

  running_job.historical_job.remove_all_tags
  running_job.historical_job.add_tags([cleanup_tag])

  ::Naf::HistoricalJob.transaction do
    finished_updates = updates.merge(finished_at: Time.zone.now)
    update_historical_job(finished_updates, running_job.id)
    running_job.delete
  end

  running_job.historical_job.remove_tags([cleanup_tag])
end
376
+
377
+ # kill(0, pid) seems to fail during at_exit block
378
+ # so this shoots from the hip
379
# Last-resort shutdown of the jobs tracked in @children: sends TERM to each,
# waits two seconds, then sends KILL to each and forces the job to finished.
# Iterates over copies of @children. Does nothing when there are no children.
def emergency_teardown
  return if @children.length == 0

  logger.warn "emergency teardown of #{@children.length} job(s)"
  @children.clone.each_value { |child| send_signal_and_maybe_clean_up(child, "TERM") }

  sleep(2)

  @children.clone.each_value do |child|
    send_signal_and_maybe_clean_up(child, "KILL")

    # force job down
    finish_job(child)
  end
end
393
+
394
# Escalating shutdown of the jobs still assigned to this machine:
#   1. set request_to_terminate on each job and wait up to
#      @wait_time_for_processes_to_terminate seconds,
#   2. send TERM to the survivors and wait up to 5 seconds,
#   3. send KILL to whatever is left and force those jobs to finished.
# Returns early as soon as no assigned jobs remain.
def terminate_old_processes(machine)
  # check if any processes are hanging around and ask them
  # politely if they will please terminate
  lingering_jobs = assigned_jobs(machine)
  if lingering_jobs.length.zero?
    logger.detail "no jobs to remove"
    return
  end

  logger.info "number of old jobs to sift through: #{lingering_jobs.length}"
  lingering_jobs.each do |job|
    logger.detail "job still around: #{job}"
    next unless job.request_to_terminate == false

    logger.warn "politely asking process: #{job.pid} to terminate itself"
    job.request_to_terminate = true
    job.save!
  end

  # wait
  1.upto(@wait_time_for_processes_to_terminate) do |second|
    remaining = assigned_jobs(machine).length
    return if remaining.zero?

    logger.debug_medium "#{second}/#{@wait_time_for_processes_to_terminate}: sleeping 1 second while we wait for #{remaining} assigned job(s) to terminate as requested"
    sleep(1)
  end

  # nudge them to terminate
  stubborn_jobs = assigned_jobs(machine)
  if stubborn_jobs.length.zero?
    logger.debug_gross "assigned jobs have exited after asking to terminate nicely"
    return
  end

  stubborn_jobs.each do |job|
    logger.warn "sending SIG_TERM to process: #{job}"
    send_signal_and_maybe_clean_up(job, "TERM")
  end

  # wait
  1.upto(5) do |second|
    remaining = assigned_jobs(machine).length
    return if remaining.zero?

    logger.debug_medium "#{second}/5: sleeping 1 second while we wait for #{remaining} assigned job(s) to terminate from SIG_TERM"
    sleep(1)
  end

  # kill with fire
  assigned_jobs(machine).each do |job|
    logger.alarm "sending SIG_KILL to process: #{job}"
    send_signal_and_maybe_clean_up(job, "KILL")

    # job force job down
    finish_job(job)
  end
end
449
+
450
# Sends +signal+ to the process behind +job+ and finishes the job when there
# is no process to signal.
#
# job    - a job record responding to #pid.
# signal - a signal name or number accepted by Process.kill.
#
# Returns true when Process.kill succeeded. Returns false, after calling
# finish_job(job), when the job has no pid or the kill raised Errno::ESRCH.
# Any other error raised by Process.kill is not rescued here.
def send_signal_and_maybe_clean_up(job, signal)
  delivered =
    if job.pid.nil?
      false
    else
      begin
        result = Process.kill(signal, job.pid)
        logger.detail "#{result} = kill(#{signal}, #{job.pid})"
        true
      rescue Errno::ESRCH
        logger.detail "ESRCH = kill(#{signal}, #{job.pid})"
        false
      end
    end

  # nothing to signal (no pid, or the process does not exist) -- mark it finished
  finish_job(job) unless delivered
  delivered
end
470
+
471
# Probes the job's process by passing signal 0 to
# send_signal_and_maybe_clean_up and returns that call's result.
#
# Side effect inherited from that call: a job with no pid, or whose process
# does not exist, is marked finished.
def is_job_process_alive?(job)
  send_signal_and_maybe_clean_up(job, 0)
end
474
+
475
# Returns the running jobs assigned to +machine+ for which
# is_job_process_alive? is true. Because of that probe, jobs whose process is
# gone are finished as a side effect of calling this method.
def assigned_jobs(machine)
  candidates = ::Naf::RunningJob.assigned_jobs(machine)
  candidates.select { |job| is_job_process_alive?(job) }
end
480
+
481
# Returns the enabled application schedules that should be queued right now.
#
# machine - not used by this method; kept so existing callers keep working.
#
# The clock is read exactly once. All comparisons below use that single
# instant, so "today", "should have run by now" and "within the fudge period"
# can never disagree with one another when the call straddles midnight.
def should_be_queued(machine)
  now = Time.zone.now
  today = now.to_date

  # application_id => a not-yet-finished job that was queued within
  # JOB_STALE_TIME and has not been asked to terminate
  not_finished_applications = ::Naf::HistoricalJob.
    queued_between(now - Naf::HistoricalJob::JOB_STALE_TIME, now).
    where("finished_at IS NULL AND request_to_terminate = false").
    find_all { |job| job.application_id.present? }.
    index_by { |job| job.application_id }

  # application_id => the job returned by HistoricalJob.application_last_runs
  application_last_runs = ::Naf::HistoricalJob.application_last_runs.
    index_by { |job| job.application_id }

  # find the run_interval based schedules that should be queued:
  # nothing unfinished for the application, and either it has no recorded
  # last run or that run finished more than run_interval minutes ago
  relative_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).relative_schedules.select do |schedule|
    application_id = schedule.application_id
    last_run = application_last_runs[application_id]

    not_finished_applications[application_id].nil? &&
      (last_run.nil? || (now - last_run.finished_at) > schedule.run_interval.minutes)
  end

  # find the run_start_minute based schedules; select anything that
  #   isn't currently running (or queued) AND
  #   hasn't run since today's run start time AND
  #   should have been run by now AND
  #   should have run within the fudge period
  exact_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).exact_schedules.select do |schedule|
    application_id = schedule.application_id
    # bail out before touching run_start_minute, as the original
    # short-circuit evaluation did
    next false unless not_finished_applications[application_id].nil?

    last_run = application_last_runs[application_id]
    run_start_time = today + schedule.run_start_minute.minutes
    elapsed = now - run_start_time

    (last_run.nil? || run_start_time >= last_run.finished_at) &&
      elapsed >= 0.seconds &&
      elapsed <= (@check_schedules_period * @schedule_fudge_scale).minutes
  end

  # drop schedules whose run group limit is reached, unless the schedule
  # has enqueue_backlogs set
  foreman = ::Logical::Naf::ConstructionZone::Foreman.new
  (relative_schedules_what_need_queuin + exact_schedules_what_need_queuin).select do |schedule|
    schedule.enqueue_backlogs ||
      !foreman.limited_by_run_group?(schedule.application_run_group_restriction,
                                     schedule.application_run_group_name,
                                     schedule.application_run_group_limit)
  end
end
522
+
523
# Reports whether the percentage of free memory reported by Facter is at
# least @minimum_memory_free.
#
# Returns true (logging at detail level) when it is, and false (logging an
# alarm that names the host) when it is not.
def memory_available_to_spawn?
  # presumably drops Facter's cached facts so the readings below are fresh
  Facter.clear
  total_mb = Facter.memorysize_mb.to_f
  free_mb = Facter.memoryfree_mb.to_f
  free_percentage = (free_mb / total_mb) * 100.0

  unless free_percentage >= @minimum_memory_free
    logger.alarm "#{Facter.hostname}.#{Facter.domain}: not enough memory to spawn: #{free_percentage}% (free) < #{@minimum_memory_free}% (min percent)"
    return false
  end

  logger.detail "memory available: #{free_percentage}% (free) >= #{@minimum_memory_free}% (min percent)"
  true
end
537
+
538
+ end
539
+ end