naf 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/.gitignore +16 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +17 -0
  4. data/Gemfile +17 -0
  5. data/LICENSE +2 -0
  6. data/README.rdoc +22 -0
  7. data/RELEASE_NOTES.rdoc +18 -0
  8. data/Rakefile +43 -0
  9. data/app/assets/images/bg-grad.png +0 -0
  10. data/app/assets/images/clock.png +0 -0
  11. data/app/assets/images/control_play_blue.png +0 -0
  12. data/app/assets/images/down_arrow.gif +0 -0
  13. data/app/assets/images/papertrail_job.png +0 -0
  14. data/app/assets/images/papertrail_machine.png +0 -0
  15. data/app/assets/images/papertrail_machine_runner.png +0 -0
  16. data/app/assets/images/terminate.png +0 -0
  17. data/app/assets/images/ui-bg_flat_0_aaaaaa_40x100.png +0 -0
  18. data/app/assets/images/ui-bg_flat_0_ffffff_40x100.png +0 -0
  19. data/app/assets/images/ui-bg_flat_75_ffffff_40x100.png +0 -0
  20. data/app/assets/images/ui-bg_glass_0_f4f4f4_1x400.png +0 -0
  21. data/app/assets/images/ui-bg_glass_55_fbf9ee_1x400.png +0 -0
  22. data/app/assets/images/ui-bg_glass_65_f4f4f4_1x400.png +0 -0
  23. data/app/assets/images/ui-bg_glass_65_ffffff_1x400.png +0 -0
  24. data/app/assets/images/ui-bg_glass_75_dadada_1x400.png +0 -0
  25. data/app/assets/images/ui-bg_glass_75_e6e6e6_1x400.png +0 -0
  26. data/app/assets/images/ui-bg_glass_75_f4f4f4_1x400.png +0 -0
  27. data/app/assets/images/ui-bg_glass_95_fef1ec_1x400.png +0 -0
  28. data/app/assets/images/ui-bg_highlight-soft_0_f4f4f4_1x100.png +0 -0
  29. data/app/assets/images/ui-bg_highlight-soft_75_cccccc_1x100.png +0 -0
  30. data/app/assets/images/ui-icons_222222_256x240.png +0 -0
  31. data/app/assets/images/ui-icons_2e83ff_256x240.png +0 -0
  32. data/app/assets/images/ui-icons_454545_256x240.png +0 -0
  33. data/app/assets/images/ui-icons_888888_256x240.png +0 -0
  34. data/app/assets/images/ui-icons_cd0a0a_256x240.png +0 -0
  35. data/app/assets/images/up_arrow.gif +0 -0
  36. data/app/assets/javascripts/dataTablesTemplates/applications.js +94 -0
  37. data/app/assets/javascripts/dataTablesTemplates/jobs.js +163 -0
  38. data/app/assets/javascripts/dataTablesTemplates/machine_runner_invocations.js +60 -0
  39. data/app/assets/javascripts/dataTablesTemplates/machine_runners.js +82 -0
  40. data/app/assets/javascripts/dataTablesTemplates/machines.js +93 -0
  41. data/app/assets/javascripts/date.js +104 -0
  42. data/app/assets/javascripts/iso8601.js +41 -0
  43. data/app/assets/javascripts/jquery.dataTables.custom.js +62 -0
  44. data/app/assets/javascripts/jquery.dataTables.js +6862 -0
  45. data/app/assets/javascripts/naf.js +30 -0
  46. data/app/assets/javascripts/underscore.js +713 -0
  47. data/app/assets/stylesheets/jquery_ui/jquery-ui-1.8.5.custom.css.erb +572 -0
  48. data/app/assets/stylesheets/min_naf.css +14 -0
  49. data/app/assets/stylesheets/min_naf/layout.css.scss +355 -0
  50. data/app/assets/stylesheets/naf.css +14 -0
  51. data/app/assets/stylesheets/naf/layout.css.scss +497 -0
  52. data/app/controllers/naf/affinities_controller.rb +61 -0
  53. data/app/controllers/naf/application_controller.rb +43 -0
  54. data/app/controllers/naf/application_schedule_affinity_tabs_controller.rb +75 -0
  55. data/app/controllers/naf/applications_controller.rb +153 -0
  56. data/app/controllers/naf/historical_job_affinity_tabs_controller.rb +65 -0
  57. data/app/controllers/naf/historical_jobs_controller.rb +159 -0
  58. data/app/controllers/naf/janitorial_assignments_controller.rb +77 -0
  59. data/app/controllers/naf/logger_names_controller.rb +58 -0
  60. data/app/controllers/naf/logger_styles_controller.rb +59 -0
  61. data/app/controllers/naf/machine_affinity_slots_controller.rb +69 -0
  62. data/app/controllers/naf/machine_runner_invocations_controller.rb +59 -0
  63. data/app/controllers/naf/machine_runners_controller.rb +26 -0
  64. data/app/controllers/naf/machines_controller.rb +95 -0
  65. data/app/helpers/naf/application_helper.rb +275 -0
  66. data/app/models/log4r/papertrail_outputter.rb +19 -0
  67. data/app/models/logical/naf/application.rb +183 -0
  68. data/app/models/logical/naf/construction_zone/ad_hoc_work_order.rb +22 -0
  69. data/app/models/logical/naf/construction_zone/application_schedule_work_order.rb +15 -0
  70. data/app/models/logical/naf/construction_zone/application_work_order.rb +25 -0
  71. data/app/models/logical/naf/construction_zone/boss.rb +123 -0
  72. data/app/models/logical/naf/construction_zone/foreman.rb +53 -0
  73. data/app/models/logical/naf/construction_zone/proletariat.rb +40 -0
  74. data/app/models/logical/naf/construction_zone/work_order.rb +100 -0
  75. data/app/models/logical/naf/create_infrastructure.rb +48 -0
  76. data/app/models/logical/naf/job.rb +357 -0
  77. data/app/models/logical/naf/job_creator.rb +155 -0
  78. data/app/models/logical/naf/job_fetcher.rb +167 -0
  79. data/app/models/logical/naf/job_statuses/errored.rb +27 -0
  80. data/app/models/logical/naf/job_statuses/finished.rb +26 -0
  81. data/app/models/logical/naf/job_statuses/finished_less_minute.rb +25 -0
  82. data/app/models/logical/naf/job_statuses/queued.rb +32 -0
  83. data/app/models/logical/naf/job_statuses/running.rb +34 -0
  84. data/app/models/logical/naf/job_statuses/terminated.rb +25 -0
  85. data/app/models/logical/naf/job_statuses/waiting.rb +43 -0
  86. data/app/models/logical/naf/machine.rb +85 -0
  87. data/app/models/logical/naf/machine_runner.rb +46 -0
  88. data/app/models/logical/naf/machine_runner_invocation.rb +50 -0
  89. data/app/models/logical/naf/pickler.rb +74 -0
  90. data/app/models/logical/naf/unpickler.rb +98 -0
  91. data/app/models/naf/affinity.rb +145 -0
  92. data/app/models/naf/affinity_classification.rb +44 -0
  93. data/app/models/naf/application.rb +100 -0
  94. data/app/models/naf/application_run_group_restriction.rb +39 -0
  95. data/app/models/naf/application_schedule.rb +181 -0
  96. data/app/models/naf/application_schedule_affinity_tab.rb +86 -0
  97. data/app/models/naf/application_schedule_prerequisite.rb +50 -0
  98. data/app/models/naf/application_type.rb +72 -0
  99. data/app/models/naf/by_historical_job_id.rb +86 -0
  100. data/app/models/naf/historical_job.rb +334 -0
  101. data/app/models/naf/historical_job_affinity_tab.rb +61 -0
  102. data/app/models/naf/historical_job_prerequisite.rb +19 -0
  103. data/app/models/naf/janitorial_archive_assignment.rb +36 -0
  104. data/app/models/naf/janitorial_assignment.rb +37 -0
  105. data/app/models/naf/janitorial_create_assignment.rb +36 -0
  106. data/app/models/naf/janitorial_drop_assignment.rb +36 -0
  107. data/app/models/naf/logger_level.rb +21 -0
  108. data/app/models/naf/logger_name.rb +23 -0
  109. data/app/models/naf/logger_style.rb +58 -0
  110. data/app/models/naf/logger_style_name.rb +28 -0
  111. data/app/models/naf/machine.rb +257 -0
  112. data/app/models/naf/machine_affinity_slot.rb +78 -0
  113. data/app/models/naf/machine_runner.rb +51 -0
  114. data/app/models/naf/machine_runner_invocation.rb +71 -0
  115. data/app/models/naf/naf_base.rb +9 -0
  116. data/app/models/naf/queued_job.rb +164 -0
  117. data/app/models/naf/running_job.rb +80 -0
  118. data/app/models/process/naf/application.rb +164 -0
  119. data/app/models/process/naf/janitor.rb +117 -0
  120. data/app/models/process/naf/machine_manager.rb +150 -0
  121. data/app/models/process/naf/machine_upgrader.rb +112 -0
  122. data/app/models/process/naf/runner.rb +539 -0
  123. data/app/views/naf/affinities/_form.html.erb +50 -0
  124. data/app/views/naf/affinities/edit.html.erb +11 -0
  125. data/app/views/naf/affinities/index.html.erb +57 -0
  126. data/app/views/naf/affinities/new.html.erb +15 -0
  127. data/app/views/naf/affinities/show.html.erb +48 -0
  128. data/app/views/naf/application_schedule_affinity_tabs/_form.html.erb +31 -0
  129. data/app/views/naf/application_schedule_affinity_tabs/edit.html.erb +12 -0
  130. data/app/views/naf/application_schedule_affinity_tabs/new.html.erb +11 -0
  131. data/app/views/naf/applications/_application_schedule.html.erb +80 -0
  132. data/app/views/naf/applications/_application_schedule_prerequisites.html.erb +14 -0
  133. data/app/views/naf/applications/_form.html.erb +109 -0
  134. data/app/views/naf/applications/_search_container.html.erb +94 -0
  135. data/app/views/naf/applications/_show.html.erb +34 -0
  136. data/app/views/naf/applications/edit.html.erb +11 -0
  137. data/app/views/naf/applications/index.html.erb +51 -0
  138. data/app/views/naf/applications/index.json.erb +11 -0
  139. data/app/views/naf/applications/new.html.erb +11 -0
  140. data/app/views/naf/applications/show.html.erb +203 -0
  141. data/app/views/naf/datatable.html.erb +49 -0
  142. data/app/views/naf/historical_job_affinity_tabs/_form.html.erb +36 -0
  143. data/app/views/naf/historical_job_affinity_tabs/edit.html.erb +11 -0
  144. data/app/views/naf/historical_job_affinity_tabs/new.html.erb +11 -0
  145. data/app/views/naf/historical_jobs/_form.html.erb +94 -0
  146. data/app/views/naf/historical_jobs/_runners.html.erb +22 -0
  147. data/app/views/naf/historical_jobs/_search_container.html.erb +140 -0
  148. data/app/views/naf/historical_jobs/edit.html.erb +11 -0
  149. data/app/views/naf/historical_jobs/index.html.erb +48 -0
  150. data/app/views/naf/historical_jobs/index.json.erb +26 -0
  151. data/app/views/naf/historical_jobs/new.html.erb +61 -0
  152. data/app/views/naf/historical_jobs/show.html.erb +201 -0
  153. data/app/views/naf/janitorial_assignments/_form.html.erb +38 -0
  154. data/app/views/naf/janitorial_assignments/_rows.html.erb +17 -0
  155. data/app/views/naf/janitorial_assignments/edit.html.erb +11 -0
  156. data/app/views/naf/janitorial_assignments/index.html.erb +56 -0
  157. data/app/views/naf/janitorial_assignments/index.js.erb +1 -0
  158. data/app/views/naf/janitorial_assignments/new.html.erb +11 -0
  159. data/app/views/naf/layouts/jquery_datatables.json.erb +6 -0
  160. data/app/views/naf/logger_names/_form.html.erb +18 -0
  161. data/app/views/naf/logger_names/edit.html.erb +11 -0
  162. data/app/views/naf/logger_names/new.html.erb +11 -0
  163. data/app/views/naf/logger_names/show.html.erb +44 -0
  164. data/app/views/naf/logger_styles/_form.html.erb +30 -0
  165. data/app/views/naf/logger_styles/_logger_style_names.html.erb +19 -0
  166. data/app/views/naf/logger_styles/edit.html.erb +11 -0
  167. data/app/views/naf/logger_styles/new.html.erb +11 -0
  168. data/app/views/naf/logger_styles/show.html.erb +48 -0
  169. data/app/views/naf/machine_affinity_slots/_form.html.erb +36 -0
  170. data/app/views/naf/machine_affinity_slots/edit.html.erb +11 -0
  171. data/app/views/naf/machine_affinity_slots/new.html.erb +11 -0
  172. data/app/views/naf/machine_runner_invocations/_filter.html.erb +21 -0
  173. data/app/views/naf/machine_runner_invocations/index.html.erb +36 -0
  174. data/app/views/naf/machine_runner_invocations/index.json.erb +16 -0
  175. data/app/views/naf/machine_runner_invocations/show.html.erb +91 -0
  176. data/app/views/naf/machine_runners/index.html.erb +82 -0
  177. data/app/views/naf/machine_runners/index.json.erb +16 -0
  178. data/app/views/naf/machine_runners/show.html.erb +113 -0
  179. data/app/views/naf/machines/_filter.html.erb +26 -0
  180. data/app/views/naf/machines/_form.html.erb +62 -0
  181. data/app/views/naf/machines/_show.html.erb +169 -0
  182. data/app/views/naf/machines/edit.html.erb +11 -0
  183. data/app/views/naf/machines/index.html.erb +51 -0
  184. data/app/views/naf/machines/index.json.erb +23 -0
  185. data/app/views/naf/machines/new.html.erb +11 -0
  186. data/app/views/naf/machines/show.html.erb +92 -0
  187. data/app/views/naf/record.html.erb +46 -0
  188. data/app/views/naf/shared/_application.html.erb +50 -0
  189. data/app/views/naf/shared/_information_container.html.erb +19 -0
  190. data/app/views/naf/shared/_select_per_page.html.erb +72 -0
  191. data/ci/test-build.sh +17 -0
  192. data/ci/travis.sh +26 -0
  193. data/config/initializers/naf.rb +3 -0
  194. data/config/routes.rb +38 -0
  195. data/db/migrate/20120820023848_naf_schema.rb +413 -0
  196. data/doc/README_FOR_APP +2 -0
  197. data/lib/generators/naf_generator.rb +45 -0
  198. data/lib/generators/templates/config/logging/af.yml +26 -0
  199. data/lib/generators/templates/config/logging/naf.yml +22 -0
  200. data/lib/generators/templates/config/logging/nafjob.yml +16 -0
  201. data/lib/generators/templates/config/logging/nafrunner.yml +17 -0
  202. data/lib/generators/templates/naf.rb +11 -0
  203. data/lib/generators/templates/naf_layout.html.erb +15 -0
  204. data/lib/naf.rb +48 -0
  205. data/lib/naf/configuration.rb +23 -0
  206. data/lib/naf/engine.rb +18 -0
  207. data/lib/naf/version.rb +3 -0
  208. data/lib/tasks/naf_tasks.rake +370 -0
  209. data/naf.gemspec +30 -0
  210. data/script/rails +10 -0
  211. data/spec/controllers/naf/affinities_controller_spec.rb +79 -0
  212. data/spec/controllers/naf/application_controller_spec.rb +10 -0
  213. data/spec/controllers/naf/application_schedule_affinity_tabs_controller_spec.rb +106 -0
  214. data/spec/controllers/naf/applications_controller_spec.rb +109 -0
  215. data/spec/controllers/naf/historical_job_affinity_tabs_controller_spec.rb +96 -0
  216. data/spec/controllers/naf/historical_jobs_controller_spec.rb +19 -0
  217. data/spec/controllers/naf/machine_affinity_slots_controller_spec.rb +109 -0
  218. data/spec/controllers/naf/machines_controller_spec.rb +74 -0
  219. data/spec/dummy/.gitignore +12 -0
  220. data/spec/dummy/README +19 -0
  221. data/spec/dummy/Rakefile +7 -0
  222. data/spec/dummy/app/assets/javascripts/application.js +16 -0
  223. data/spec/dummy/app/assets/stylesheets/application.css +14 -0
  224. data/spec/dummy/app/controllers/application_controller.rb +3 -0
  225. data/spec/dummy/app/helpers/application_helper.rb +2 -0
  226. data/spec/dummy/app/models/my_script.rb +8 -0
  227. data/spec/dummy/app/models/other/base.rb.sample +10 -0
  228. data/spec/dummy/app/views/layouts/application.html.erb +15 -0
  229. data/spec/dummy/app/views/layouts/naf_layout.html.erb +15 -0
  230. data/spec/dummy/config.ru +4 -0
  231. data/spec/dummy/config/application.rb +62 -0
  232. data/spec/dummy/config/boot.rb +10 -0
  233. data/spec/dummy/config/database-non_primary.yml +20 -0
  234. data/spec/dummy/config/database-primary.yml +16 -0
  235. data/spec/dummy/config/environment.rb +5 -0
  236. data/spec/dummy/config/environments/development.rb +37 -0
  237. data/spec/dummy/config/environments/production.rb +67 -0
  238. data/spec/dummy/config/environments/test.rb +37 -0
  239. data/spec/dummy/config/initializers/backtrace_silencers.rb +7 -0
  240. data/spec/dummy/config/initializers/inflections.rb +15 -0
  241. data/spec/dummy/config/initializers/mime_types.rb +5 -0
  242. data/spec/dummy/config/initializers/naf.rb.non_primary +4 -0
  243. data/spec/dummy/config/initializers/naf.rb.primary +3 -0
  244. data/spec/dummy/config/initializers/secret_token.rb +7 -0
  245. data/spec/dummy/config/initializers/session_store.rb +8 -0
  246. data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
  247. data/spec/dummy/config/locales/en.yml +5 -0
  248. data/spec/dummy/config/logging/af.yml +26 -0
  249. data/spec/dummy/config/logging/naf.yml +22 -0
  250. data/spec/dummy/config/logging/nafjob.yml +16 -0
  251. data/spec/dummy/config/logging/nafrunner.yml +17 -0
  252. data/spec/dummy/config/routes.rb +5 -0
  253. data/spec/dummy/db/.gitignore +2 -0
  254. data/spec/dummy/lib/tasks/dummy.rake +60 -0
  255. data/spec/dummy/public/404.html +26 -0
  256. data/spec/dummy/public/422.html +26 -0
  257. data/spec/dummy/public/500.html +25 -0
  258. data/spec/dummy/public/favicon.ico +0 -0
  259. data/spec/dummy/script/rails +6 -0
  260. data/spec/factories/naf.rb +433 -0
  261. data/spec/helpers/naf/application_helper_spec.rb +0 -0
  262. data/spec/models/logical/naf/application_spec.rb +69 -0
  263. data/spec/models/logical/naf/job_creator_spec.rb +32 -0
  264. data/spec/models/logical/naf/job_fetcher_spec.rb +140 -0
  265. data/spec/models/logical/naf/job_spec.rb +282 -0
  266. data/spec/models/logical/naf/machine_spec.rb +61 -0
  267. data/spec/models/naf/affinity_classification_spec.rb +56 -0
  268. data/spec/models/naf/affinity_spec.rb +100 -0
  269. data/spec/models/naf/application_run_group_restriction_spec.rb +57 -0
  270. data/spec/models/naf/application_schedule_affinity_tab_spec.rb +85 -0
  271. data/spec/models/naf/application_schedule_prerequisite_spec.rb +35 -0
  272. data/spec/models/naf/application_schedule_spec.rb +166 -0
  273. data/spec/models/naf/application_spec.rb +128 -0
  274. data/spec/models/naf/application_type_spec.rb +104 -0
  275. data/spec/models/naf/historical_job_affinity_tab_spec.rb +59 -0
  276. data/spec/models/naf/historical_job_prerequisite_spec.rb +25 -0
  277. data/spec/models/naf/historical_job_spec.rb +334 -0
  278. data/spec/models/naf/logger_level_spec.rb +34 -0
  279. data/spec/models/naf/logger_name_spec.rb +35 -0
  280. data/spec/models/naf/logger_style_name_spec.rb +39 -0
  281. data/spec/models/naf/logger_style_spec.rb +89 -0
  282. data/spec/models/naf/machine_affinity_slot_spec.rb +77 -0
  283. data/spec/models/naf/machine_runner_invocation_spec.rb +38 -0
  284. data/spec/models/naf/machine_runner_spec.rb +37 -0
  285. data/spec/models/naf/machine_spec.rb +425 -0
  286. data/spec/models/naf/naf_base_spec.rb +14 -0
  287. data/spec/models/naf/queued_job_spec.rb +171 -0
  288. data/spec/models/naf/running_job_spec.rb +107 -0
  289. data/spec/models/process/naf/application_spec.rb +8 -0
  290. data/spec/models/process/naf/janitor_spec.rb +10 -0
  291. data/spec/models/process/naf/runner_spec.rb +10 -0
  292. data/spec/spec_helper.rb +32 -0
  293. data/spec/support/engine_routing.rb +27 -0
  294. data/spec/support/script_spec_helper.rb +58 -0
  295. metadata +590 -0
@@ -0,0 +1,539 @@
1
+ require 'timeout'
2
+
3
+ module Process::Naf
4
+ class Runner < ::Af::Application
5
+
6
+ #----------------
7
+ # *** Options ***
8
+ #+++++++++++++++++
9
+
10
# Command-line options for the runner, declared with the `opt` DSL that this
# class gets from ::Af::Application. The methods below read several of them
# through instance variables of the same name (e.g. @loop_sleep_time,
# @server_address) -- presumably populated by Af when options are parsed.
opt :wait_time_for_processes_to_terminate,
    "time between asking processes to terminate and sending kill signals",
    argument_note: "SECONDS",
    default: 120
opt :check_schedules_period,
    "time between checking schedules",
    argument_note: "MINUTES",
    default: 1
opt :schedule_fudge_scale,
    "amount of time to look back in schedule for run_start_minute schedules (scaled to --check-schedules-period)",
    default: 5
opt :runner_stale_period,
    "amount of time to consider a machine out of touch if it hasn't updated its machine entry",
    argument_note: "MINUTES",
    default: 10
opt :loop_sleep_time,
    "runner main loop sleep time",
    argument_note: "SECONDS",
    default: 30
# Hidden from help output; defaults to the address reported by
# ::Naf::Machine.machine_ip_address.
opt :server_address,
    "set the machines server address (dangerous)",
    type: :string,
    default: ::Naf::Machine.machine_ip_address,
    hidden: true
opt :minimum_memory_free,
    "percentage of memory free below which will limit process spawning",
    default: 15.0,
    argument_note: "PERCENT"
opt :disable_gc_modifications,
    "don't modify ruby GC parameters",
    default: false
opt :kill_all_runners,
    "don't wait for runners to wind down and finish running their jobs",
    default: false
44
+
45
# Sets up the runner: declares the default list of logging configuration
# files and clears the cached machine log level.
#
# For each base name (af, naf, nafrunner and this application's af_name) the
# list contains the generic file followed by its Rails-environment variant.
def initialize
  super

  environment = Rails.env
  configuration_files = ["af", "naf", "nafrunner", af_name].flat_map do |base_name|
    ["#{base_name}.yml", "#{base_name}-#{environment}.yml"]
  end
  opt :log_configuration_files, default: configuration_files

  # Compared against machine.log_level on every pass of the main loop.
  @last_machine_log_level = nil
end
57
+
58
# Entry point of the runner.
#
# Looks up the ::Naf::Machine row for @server_address (exits with status 1
# when there is none), then, while holding the machine's runner lock:
#   * flags every other not-yet-dead runner invocation of this machine as
#     winding down (process alive) or dead (process gone),
#   * finds or creates the MachineRunner row for the current directory,
#   * records a MachineRunnerInvocation for this process, including the git
#     details of the checkout.
# It then hands control to work_machine and stamps the invocation dead on the
# way out, whether work_machine returned or raised.
def work
  unless @disable_gc_modifications
    # These configuration changes will help forked processes, not the runner
    ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
    ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
    ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
    ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
  end

  machine = ::Naf::Machine.find_by_server_address(@server_address)

  unless machine.present?
    logger.fatal "This machine is not configued correctly (ipaddress: #{@server_address})."
    logger.fatal "Please update #{::Naf::Machine.table_name} with an entry for this machine."
    logger.fatal "Exiting..."
    exit 1
  end

  machine.lock_for_runner_use
  begin
    wind_down_other_runners(machine)

    # Create a machine runner, if it doesn't exist
    machine_runner = ::Naf::MachineRunner.
      find_or_create_by_machine_id_and_runner_cwd(machine_id: machine.id,
                                                  runner_cwd: Dir.pwd)

    # Create an invocation for this runner
    invocation = ::Naf::MachineRunnerInvocation.create!(
      machine_runner_id: machine_runner.id,
      pid: Process.pid,
      repository_name: git_repository_name,
      branch_name: git_output('git rev-parse --abbrev-ref HEAD'),
      commit_information: git_output('git log --pretty="%H" -n 1'),
      deployment_tag: git_output('git describe --abbrev=0 --tag 2>&1',
                                 /fatal: No names found, cannot describe anything/))
  ensure
    machine.unlock_for_runner_use
  end

  begin
    work_machine(machine, invocation)
  ensure
    invocation.dead_at = Time.zone.now
    invocation.save!
  end
end

# Marks every invocation of this machine's runners that is not yet flagged
# dead: winding down when its pid still answers signal 0, dead when the pid
# no longer exists (Errno::ESRCH).
def wind_down_other_runners(machine)
  machine.machine_runners.each do |machine_runner|
    machine_runner.machine_runner_invocations.each do |invocation|
      next unless invocation.dead_at.blank?

      begin
        retval = Process.kill(0, invocation.pid)
        logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
        invocation.wind_down_at = Time.zone.now
        invocation.save!
      rescue Errno::ESRCH
        logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
        invocation.dead_at = Time.zone.now
        invocation.save!
      end
    end
  end
end

# Extracts the repository name from the first `git remote -v` line: the text
# after the first ':' up to the next whitespace, with '.git' removed.
# Returns nil when that cannot be determined for any reason.
def git_repository_name
  name = `git remote -v`.slice(/:\S+/).sub('.git', '')[1..-1]
  name.match(/fatal/) ? nil : name
rescue StandardError
  nil
end

# Runs a git command and returns its stripped output, or nil when the output
# matches failure_pattern or the command cannot be run at all. A command with
# no shell metacharacters is executed directly, so a missing git binary
# raises Errno::ENOENT; that is treated as "unknown" here instead of
# aborting the runner.
def git_output(command, failure_pattern = /fatal/)
  output = `#{command}`.strip
  output.match(failure_pattern) ? nil : output
rescue StandardError
  nil
end

private :wind_down_other_runners, :git_repository_name, :git_output
138
+
139
# Runs the main loop for this machine until work_machine_loop returns a
# falsy value.
#
# machine    - the ::Naf::Machine this runner works for
# invocation - the MachineRunnerInvocation record of this process
def work_machine(machine, invocation)
  machine.mark_alive
  machine.mark_up

  # Make sure no processes are thought to be running on this machine
  if @kill_all_runners
    terminate_old_processes(machine)
  end

  logger.info "working: #{machine}"

  # pid => running job, for every job spawned by this runner
  @children = {}

  # Last-resort cleanup of children when the process exits.
  at_exit { ::Af::Application.singleton.emergency_teardown }

  @job_fetcher = ::Logical::Naf::JobFetcher.new(machine)

  # Collect garbage after every pass that asks to keep going.
  GC.start while work_machine_loop(machine, invocation)

  logger.info "runner quitting"
end
163
+
164
# One pass of the runner's main loop.
#
# Reloads the machine and invocation records, applies a changed machine log
# level, checks schedules (unless the invocation is winding down), reaps
# exited children and finally starts new jobs.
#
# Returns false when the runner should stop -- the machine is disabled or
# marked down, or the invocation was asked to wind down and no children are
# left -- and true otherwise.
def work_machine_loop(machine, invocation)
  machine.reload

  # Check machine status
  if !machine.enabled
    logger.warn "this machine is disabled #{machine}"
    return false
  elsif machine.marked_down
    logger.warn "this machine is marked down #{machine}"
    return false
  end

  machine.mark_alive
  apply_machine_log_level(machine)

  invocation.reload
  if invocation.wind_down_at.present?
    logger.warn "invocation asked to wind down"
    return false if @children.length == 0
  end

  check_schedules(machine) if invocation.wind_down_at.blank?

  reap_exited_children(machine)
  start_new_jobs(machine, invocation)

  true
end

# Re-applies logger levels when the machine's log_level differs from the one
# seen on the previous pass; a blank level is remembered but not applied.
def apply_machine_log_level(machine)
  return if machine.log_level == @last_machine_log_level

  @last_machine_log_level = machine.log_level
  return if @last_machine_log_level.blank?

  logging_configurator.parse_and_set_logger_levels(@last_machine_log_level)
end

# Cleans up children that have exited. With no children it just sleeps for
# @loop_sleep_time; otherwise it keeps waiting for exits until either no
# children remain or a wait times out, so the wait doubles as the loop delay.
def reap_exited_children(machine)
  logger.detail "cleaning up dead children: #{@children.length}"

  if @children.length == 0
    logger.detail "sleeping in loop: #{@loop_sleep_time} seconds"
    sleep(@loop_sleep_time)
    return
  end

  while @children.length > 0
    reaped = wait_for_exited_child(machine)
    break if reaped.nil?

    pid, status = reaped
    clean_up_child(pid, status) if pid
  end
end

# Waits up to @loop_sleep_time seconds for any child process to exit.
#
# Returns [pid, status] for the reaped child, nil when the wait timed out, or
# [first tracked pid, nil] when the OS reports no children at all
# (Errno::ECHILD) even though @children is not empty.
def wait_for_exited_child(machine)
  pid = nil
  status = nil
  Timeout::timeout(@loop_sleep_time) do
    pid, status = Process.waitpid2(-1)
  end
  [pid, status]
rescue Timeout::Error
  # XXX is there a race condition where a child process exits
  # XXX has not set pid or status yet and timeout fires?
  # XXX i bet there is
  # XXX so this code is here:
  dead_children = @children.values.reject { |child| is_job_process_alive?(child) }

  unless dead_children.empty?
    logger.error "#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}"
    logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
  end

  nil
rescue Errno::ECHILD => e
  logger.error "#{machine} No child when we thought we had children #{@children.inspect}"
  logger.warn e
  orphan_pid = @children.first.try(:first)
  logger.warn "pulling first child off list to clean it up: pid=#{orphan_pid}"
  [orphan_pid, nil]
end

# Stops tracking pid and, if it belonged to one of our jobs, finishes that
# job. status is the Process::Status from waitpid2, or nil when unknown.
# ActiveRecord errors propagate; any other StandardError is logged and
# swallowed.
def clean_up_child(pid, status)
  child_job = @children.delete(pid)

  if child_job.present?
    # Update job tags
    child_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])

    if status.nil? || status.exited? || status.signaled?
      logger.info { "cleaning up dead child: #{child_job.reload}" }
      finish_job(child_job,
                 { exit_status: (status && status.exitstatus),
                   termination_signal: (status && status.termsig) })
    else
      # this can happen if the child is sigstopped
      logger.warn "child waited for did not exit: #{child_job}, status: #{status.inspect}"
    end
  else
    # XXX ERROR no child for returned pid -- this can't happen
    logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
  end
rescue ActiveRecord::ActiveRecordError
  raise
rescue StandardError => e
  # XXX just in case of a job control failure -- more code here
  logger.error "some failure during child clean up"
  logger.warn e
end

# Fetches and spawns jobs while the number of RunningJob rows started on this
# machine is below machine.thread_pool_size, memory_available_to_spawn? holds
# and the invocation is not winding down. Capacity is measured from the
# database rather than from @children.length. ActiveRecord errors propagate;
# any other StandardError is logged and the loop carries on.
def start_new_jobs(machine, invocation)
  logger.detail "starting new jobs, num children: #{@children.length}/#{machine.thread_pool_size}"

  while ::Naf::RunningJob.where(:started_on_machine_id => machine.id).count < machine.thread_pool_size &&
        memory_available_to_spawn? && invocation.wind_down_at.blank?

    logger.debug_gross "fetching jobs because: children: #{@children.length} < #{machine.thread_pool_size} (poolsize)"
    begin
      running_job = @job_fetcher.fetch_next_job

      unless running_job.present?
        logger.debug_gross "no more jobs to run"
        break
      end

      logger.info "starting new job : #{running_job}"

      pid = running_job.historical_job.spawn
      if pid
        @children[pid] = running_job
        running_job.pid = pid
        running_job.historical_job.pid = pid
        running_job.historical_job.failed_to_start = false
        running_job.historical_job.machine_runner_invocation_id = invocation.id
        logger.info "job started : #{running_job}"
        running_job.save!
        running_job.historical_job.save!
      else
        # should never get here (well, hopefully)
        logger.error "#{machine}: failed to execute #{running_job}"

        finish_job(running_job, { failed_to_start: true })
      end
    rescue ActiveRecord::ActiveRecordError
      raise
    rescue StandardError => e
      # XXX rescue for various issues
      logger.error "#{machine}: failure during job start"
      logger.warn e
    end
  end

  logger.debug_gross "done starting jobs"
end

private :apply_machine_log_level, :reap_exited_children, :wait_for_exited_child,
        :clean_up_child, :start_new_jobs
312
+
313
# Queues due application schedules and marks stale runner machines down.
#
# Does nothing unless ::Naf::Machine says it is time to check schedules
# (period: @check_schedules_period minutes) and the schedule lock can be
# taken. The lock is held only while the machine is stamped as having checked
# the schedule, and is released even if that stamping raises.
#
# machine - the ::Naf::Machine this runner works for
def check_schedules(machine)
  return unless ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)

  logger.debug "it's time to check schedules"
  return unless ::Naf::ApplicationSchedule.try_lock_schedules

  logger.debug_gross "checking schedules"
  begin
    machine.mark_checked_schedule
  ensure
    ::Naf::ApplicationSchedule.unlock_schedules
  end

  # check scheduled tasks
  should_be_queued(machine).each do |application_schedule|
    logger.info "scheduled application: #{application_schedule}"
    begin
      naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
      # Enqueue once per slot of the run group limit (once when no limit is
      # set); this doesn't work very well for run_group_limits in the thousands
      Range.new(0, application_schedule.application_run_group_limit || 1, true).each do
        naf_boss.enqueue_application_schedule(application_schedule)
      end
    rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
      # A prerequisite cycle can never be satisfied, so the schedule is
      # disabled rather than retried on every check.
      logger.error "#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}"
      logger.warn jpl
      application_schedule.enabled = false
      application_schedule.save!
      logger.alarm "Application Schedule disabled due to loop: #{application_schedule}"
    end
  end

  # check the runner machines
  ::Naf::Machine.enabled.up.each do |runner_to_check|
    if runner_to_check.is_stale?(@runner_stale_period.minutes)
      logger.alarm "runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}"
      runner_to_check.mark_machine_down(machine)
    end
  end
end
349
+
350
+ # XXX update_all doesn't support "from_partition" so we have this helper
351
# Issues a raw UPDATE against the partition table that holds the given
# historical job, setting every column in `updates` plus updated_at.
#
# updates           - Hash of column name => new value; not modified. Values
#                     are bound as parameters, but column names are
#                     interpolated into the SQL, so they must come from
#                     trusted code.
# historical_job_id - id of the row to update; also used to pick the
#                     partition table.
#
# Returns whatever ::Naf::HistoricalJob.find_by_sql returns for the statement.
def update_historical_job(updates, historical_job_id)
  # Work on a copy so the caller's hash does not gain an :updated_at key.
  assignments = updates.merge(updated_at: Time.zone.now)
  set_clause = assignments.keys.map { |column| "#{column} = ?" }.join(", ")
  update_sql = <<-SQL
    UPDATE
      #{::Naf::HistoricalJob.partition_table_name(historical_job_id)}
    SET
      #{set_clause}
    WHERE
      id = ?
  SQL
  ::Naf::HistoricalJob.find_by_sql([update_sql] + assignments.values + [historical_job_id])
end
364
+
365
# Closes out a running job: stamps the historical job row as finished (plus
# any extra column updates) and deletes the running-job row, both inside one
# transaction.
#
# The historical job's tags are cleared first, the :cleanup system tag is set
# for the duration of the work and removed again at the end.
#
# running_job - the running job record to finish
# updates     - extra historical-job columns to write (default: none)
def finish_job(running_job, updates = {})
  running_job.historical_job.remove_all_tags

  cleanup_tag = ::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]
  running_job.historical_job.add_tags([cleanup_tag])

  ::Naf::HistoricalJob.transaction do
    completion = updates.merge(finished_at: Time.zone.now)
    update_historical_job(completion, running_job.id)
    running_job.delete
  end

  running_job.historical_job.remove_tags([cleanup_tag])
end
376
+
377
+ # kill(0, pid) seems to fail during at_exit block
378
+ # so this shoots from the hip
379
# Best-effort shutdown of every child this runner still tracks.
#
# Sends TERM to each child, waits two seconds, then sends KILL to the ones
# that were still signalable and forces those jobs to finished.
# send_signal_and_maybe_clean_up returns false when it has already finished
# the job itself (no pid, or the process is gone), so each job is finished
# exactly once. Does nothing when no children are tracked, including when
# the child table was never set up.
def emergency_teardown
  return if @children.nil? || @children.empty?

  logger.warn "emergency teardown of #{@children.length} job(s)"

  # Snapshot the jobs up front; keep only those whose process took the signal.
  survivors = @children.values.select do |child|
    send_signal_and_maybe_clean_up(child, "TERM")
  end
  return if survivors.empty?

  sleep(2)

  survivors.each do |child|
    # force job down -- unless the signal helper already did
    finish_job(child) if send_signal_and_maybe_clean_up(child, "KILL")
  end
end
393
+
394
# Clears out jobs still assigned to this machine, escalating in three steps:
#   1. flag each job with request_to_terminate and wait up to
#      @wait_time_for_processes_to_terminate seconds,
#   2. send TERM and wait up to 5 seconds,
#   3. send KILL and force the job to finished.
# Returns as soon as no assigned jobs remain.
#
# machine - the ::Naf::Machine whose assigned jobs are terminated
def terminate_old_processes(machine)
  # check if any processes are hanging around and ask them
  # politely if they will please terminate
  jobs = assigned_jobs(machine)
  if jobs.length == 0
    logger.detail "no jobs to remove"
    return
  end

  logger.info "number of old jobs to sift through: #{jobs.length}"
  jobs.each do |job|
    logger.detail "job still around: #{job}"
    if job.request_to_terminate == false
      logger.warn "politely asking process: #{job.pid} to terminate itself"
      job.request_to_terminate = true
      job.save!
    end
  end

  return if wait_for_assigned_jobs_to_exit(machine, @wait_time_for_processes_to_terminate, "as requested")

  # nudge them to terminate
  jobs = assigned_jobs(machine)
  if jobs.length == 0
    logger.debug_gross "assigned jobs have exited after asking to terminate nicely"
    return
  end
  jobs.each do |job|
    logger.warn "sending SIG_TERM to process: #{job}"
    send_signal_and_maybe_clean_up(job, "TERM")
  end

  return if wait_for_assigned_jobs_to_exit(machine, 5, "from SIG_TERM")

  # kill with fire
  assigned_jobs(machine).each do |job|
    logger.alarm "sending SIG_KILL to process: #{job}"
    # force job down -- send_signal_and_maybe_clean_up returns false when it
    # has already finished the job (no pid / process gone), so don't finish
    # it a second time
    finish_job(job) if send_signal_and_maybe_clean_up(job, "KILL")
  end
end

# Polls once per second, for at most `seconds` seconds, until no jobs are
# assigned to the machine. `reason` only feeds the log line.
#
# Returns true as soon as no assigned jobs remain, false when time ran out.
def wait_for_assigned_jobs_to_exit(machine, seconds, reason)
  (1..seconds).each do |i|
    num_assigned_jobs = assigned_jobs(machine).length
    return true if num_assigned_jobs == 0
    logger.debug_medium "#{i}/#{seconds}: sleeping 1 second while we wait for " +
                        "#{num_assigned_jobs} assigned job(s) to terminate #{reason}"
    sleep(1)
  end
  false
end

private :wait_for_assigned_jobs_to_exit
449
+
450
# Delivers +signal+ to the job's process.
#
# job    - object responding to #pid (and accepted by finish_job).
# signal - signal name or number handed to Process.kill.
#
# Returns true when Process.kill succeeded. Returns false -- after marking
# the job finished via finish_job -- when the job has no pid or the process
# no longer exists (Errno::ESRCH). Any other error from Process.kill
# propagates to the caller.
def send_signal_and_maybe_clean_up(job, signal)
  unless job.pid.nil?
    begin
      result = Process.kill(signal, job.pid)
      logger.detail "#{result} = kill(#{signal}, #{job.pid})"
      return true
    rescue Errno::ESRCH
      logger.detail "ESRCH = kill(#{signal}, #{job.pid})"
    end
  end

  # no pid recorded, or the process does not exist -- mark the job finished
  finish_job(job)

  false
end
470
+
471
# Liveness probe: sends signal 0 to the job's process through
# send_signal_and_maybe_clean_up and returns its result. As a consequence a
# job with no pid, or whose process is gone, is marked finished by that
# helper and false is returned.
def is_job_process_alive?(job)
  send_signal_and_maybe_clean_up(job, 0)
end
474
+
475
# Running jobs assigned to +machine+ whose process still answers the
# liveness probe. Jobs that fail the probe are cleaned up by the probe
# itself and are left out of the result.
def assigned_jobs(machine)
  candidates = ::Naf::RunningJob.assigned_jobs(machine)
  candidates.select { |job| is_job_process_alive?(job) }
end
480
+
481
# Works out which enabled application schedules should be queued right now.
#
# The clock is read exactly once so that every comparison below (and the
# derived date) refers to the same instant, even if the call straddles
# midnight.
#
# machine - accepted for interface compatibility; not used by this method.
#
# Returns an Array of schedules that are due and either enqueue backlogs or
# are not limited by their run group according to the Foreman.
def should_be_queued(machine)
  now = Time.zone.now

  # applications with a recently queued job that has neither finished nor
  # been asked to terminate, keyed by application_id
  not_finished_applications = ::Naf::HistoricalJob.
    queued_between(now - ::Naf::HistoricalJob::JOB_STALE_TIME, now).
    where("finished_at IS NULL AND request_to_terminate = false").
    find_all { |job| job.application_id.present? }.
    index_by { |job| job.application_id }

  application_last_runs = ::Naf::HistoricalJob.application_last_runs.
    index_by { |job| job.application_id }

  # find the run_interval based schedules that should be queued
  # select anything that isn't currently running and completed
  # running more than run_interval minutes ago
  relative_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).relative_schedules.select do |schedule|
    last_run = application_last_runs[schedule.application_id]

    not_finished_applications[schedule.application_id].nil? &&
      (last_run.nil? || (now - last_run.finished_at) > schedule.run_interval.minutes)
  end

  # find the run_start_minute based schedules
  # select anything that
  #  isn't currently running (or queued) AND
  #  hasn't run since run_start_time AND
  #  should have been run by now AND
  #  should have run within the fudge period
  exact_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).exact_schedules.select do |schedule|
    last_run = application_last_runs[schedule.application_id]
    run_start_time = now.to_date + schedule.run_start_minute.minutes
    since_run_start = now - run_start_time

    not_finished_applications[schedule.application_id].nil? &&
      (last_run.nil? || run_start_time >= last_run.finished_at) &&
      since_run_start >= 0 &&
      since_run_start <= (@check_schedules_period * @schedule_fudge_scale).minutes
  end

  foreman = ::Logical::Naf::ConstructionZone::Foreman.new
  (relative_schedules_what_need_queuin + exact_schedules_what_need_queuin).select do |schedule|
    schedule.enqueue_backlogs || !foreman.limited_by_run_group?(schedule.application_run_group_restriction,
                                                                schedule.application_run_group_name,
                                                                schedule.application_run_group_limit)
  end
end
522
+
523
# Decides whether enough memory is free to spawn another job.
#
# Calls Facter.clear (presumably to drop cached facts -- confirm against the
# Facter docs), reads the total and free memory in MB from Facter, and
# compares the free percentage with @minimum_memory_free.
#
# Returns true (logging at detail level) when the free percentage is at
# least @minimum_memory_free; otherwise raises an alarm in the log and
# returns false.
def memory_available_to_spawn?
  Facter.clear
  total_mb = Facter.memorysize_mb.to_f
  free_mb = Facter.memoryfree_mb.to_f
  memory_free_percentage = (free_mb / total_mb) * 100.0

  unless memory_free_percentage >= @minimum_memory_free
    logger.alarm "#{Facter.hostname}.#{Facter.domain}: not enough memory to spawn: #{memory_free_percentage}% (free) < #{@minimum_memory_free}% (min percent)"

    return false
  end

  logger.detail "memory available: #{memory_free_percentage}% (free) >= #{@minimum_memory_free}% (min percent)"
  true
end
537
+
538
+ end
539
+ end