naf 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +16 -0
- data/.rspec +1 -0
- data/.travis.yml +17 -0
- data/Gemfile +17 -0
- data/LICENSE +2 -0
- data/README.rdoc +22 -0
- data/RELEASE_NOTES.rdoc +18 -0
- data/Rakefile +43 -0
- data/app/assets/images/bg-grad.png +0 -0
- data/app/assets/images/clock.png +0 -0
- data/app/assets/images/control_play_blue.png +0 -0
- data/app/assets/images/down_arrow.gif +0 -0
- data/app/assets/images/papertrail_job.png +0 -0
- data/app/assets/images/papertrail_machine.png +0 -0
- data/app/assets/images/papertrail_machine_runner.png +0 -0
- data/app/assets/images/terminate.png +0 -0
- data/app/assets/images/ui-bg_flat_0_aaaaaa_40x100.png +0 -0
- data/app/assets/images/ui-bg_flat_0_ffffff_40x100.png +0 -0
- data/app/assets/images/ui-bg_flat_75_ffffff_40x100.png +0 -0
- data/app/assets/images/ui-bg_glass_0_f4f4f4_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_55_fbf9ee_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_65_f4f4f4_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_65_ffffff_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_75_dadada_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_75_e6e6e6_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_75_f4f4f4_1x400.png +0 -0
- data/app/assets/images/ui-bg_glass_95_fef1ec_1x400.png +0 -0
- data/app/assets/images/ui-bg_highlight-soft_0_f4f4f4_1x100.png +0 -0
- data/app/assets/images/ui-bg_highlight-soft_75_cccccc_1x100.png +0 -0
- data/app/assets/images/ui-icons_222222_256x240.png +0 -0
- data/app/assets/images/ui-icons_2e83ff_256x240.png +0 -0
- data/app/assets/images/ui-icons_454545_256x240.png +0 -0
- data/app/assets/images/ui-icons_888888_256x240.png +0 -0
- data/app/assets/images/ui-icons_cd0a0a_256x240.png +0 -0
- data/app/assets/images/up_arrow.gif +0 -0
- data/app/assets/javascripts/dataTablesTemplates/applications.js +94 -0
- data/app/assets/javascripts/dataTablesTemplates/jobs.js +163 -0
- data/app/assets/javascripts/dataTablesTemplates/machine_runner_invocations.js +60 -0
- data/app/assets/javascripts/dataTablesTemplates/machine_runners.js +82 -0
- data/app/assets/javascripts/dataTablesTemplates/machines.js +93 -0
- data/app/assets/javascripts/date.js +104 -0
- data/app/assets/javascripts/iso8601.js +41 -0
- data/app/assets/javascripts/jquery.dataTables.custom.js +62 -0
- data/app/assets/javascripts/jquery.dataTables.js +6862 -0
- data/app/assets/javascripts/naf.js +30 -0
- data/app/assets/javascripts/underscore.js +713 -0
- data/app/assets/stylesheets/jquery_ui/jquery-ui-1.8.5.custom.css.erb +572 -0
- data/app/assets/stylesheets/min_naf.css +14 -0
- data/app/assets/stylesheets/min_naf/layout.css.scss +355 -0
- data/app/assets/stylesheets/naf.css +14 -0
- data/app/assets/stylesheets/naf/layout.css.scss +497 -0
- data/app/controllers/naf/affinities_controller.rb +61 -0
- data/app/controllers/naf/application_controller.rb +43 -0
- data/app/controllers/naf/application_schedule_affinity_tabs_controller.rb +75 -0
- data/app/controllers/naf/applications_controller.rb +153 -0
- data/app/controllers/naf/historical_job_affinity_tabs_controller.rb +65 -0
- data/app/controllers/naf/historical_jobs_controller.rb +159 -0
- data/app/controllers/naf/janitorial_assignments_controller.rb +77 -0
- data/app/controllers/naf/logger_names_controller.rb +58 -0
- data/app/controllers/naf/logger_styles_controller.rb +59 -0
- data/app/controllers/naf/machine_affinity_slots_controller.rb +69 -0
- data/app/controllers/naf/machine_runner_invocations_controller.rb +59 -0
- data/app/controllers/naf/machine_runners_controller.rb +26 -0
- data/app/controllers/naf/machines_controller.rb +95 -0
- data/app/helpers/naf/application_helper.rb +275 -0
- data/app/models/log4r/papertrail_outputter.rb +19 -0
- data/app/models/logical/naf/application.rb +183 -0
- data/app/models/logical/naf/construction_zone/ad_hoc_work_order.rb +22 -0
- data/app/models/logical/naf/construction_zone/application_schedule_work_order.rb +15 -0
- data/app/models/logical/naf/construction_zone/application_work_order.rb +25 -0
- data/app/models/logical/naf/construction_zone/boss.rb +123 -0
- data/app/models/logical/naf/construction_zone/foreman.rb +53 -0
- data/app/models/logical/naf/construction_zone/proletariat.rb +40 -0
- data/app/models/logical/naf/construction_zone/work_order.rb +100 -0
- data/app/models/logical/naf/create_infrastructure.rb +48 -0
- data/app/models/logical/naf/job.rb +357 -0
- data/app/models/logical/naf/job_creator.rb +155 -0
- data/app/models/logical/naf/job_fetcher.rb +167 -0
- data/app/models/logical/naf/job_statuses/errored.rb +27 -0
- data/app/models/logical/naf/job_statuses/finished.rb +26 -0
- data/app/models/logical/naf/job_statuses/finished_less_minute.rb +25 -0
- data/app/models/logical/naf/job_statuses/queued.rb +32 -0
- data/app/models/logical/naf/job_statuses/running.rb +34 -0
- data/app/models/logical/naf/job_statuses/terminated.rb +25 -0
- data/app/models/logical/naf/job_statuses/waiting.rb +43 -0
- data/app/models/logical/naf/machine.rb +85 -0
- data/app/models/logical/naf/machine_runner.rb +46 -0
- data/app/models/logical/naf/machine_runner_invocation.rb +50 -0
- data/app/models/logical/naf/pickler.rb +74 -0
- data/app/models/logical/naf/unpickler.rb +98 -0
- data/app/models/naf/affinity.rb +145 -0
- data/app/models/naf/affinity_classification.rb +44 -0
- data/app/models/naf/application.rb +100 -0
- data/app/models/naf/application_run_group_restriction.rb +39 -0
- data/app/models/naf/application_schedule.rb +181 -0
- data/app/models/naf/application_schedule_affinity_tab.rb +86 -0
- data/app/models/naf/application_schedule_prerequisite.rb +50 -0
- data/app/models/naf/application_type.rb +72 -0
- data/app/models/naf/by_historical_job_id.rb +86 -0
- data/app/models/naf/historical_job.rb +334 -0
- data/app/models/naf/historical_job_affinity_tab.rb +61 -0
- data/app/models/naf/historical_job_prerequisite.rb +19 -0
- data/app/models/naf/janitorial_archive_assignment.rb +36 -0
- data/app/models/naf/janitorial_assignment.rb +37 -0
- data/app/models/naf/janitorial_create_assignment.rb +36 -0
- data/app/models/naf/janitorial_drop_assignment.rb +36 -0
- data/app/models/naf/logger_level.rb +21 -0
- data/app/models/naf/logger_name.rb +23 -0
- data/app/models/naf/logger_style.rb +58 -0
- data/app/models/naf/logger_style_name.rb +28 -0
- data/app/models/naf/machine.rb +257 -0
- data/app/models/naf/machine_affinity_slot.rb +78 -0
- data/app/models/naf/machine_runner.rb +51 -0
- data/app/models/naf/machine_runner_invocation.rb +71 -0
- data/app/models/naf/naf_base.rb +9 -0
- data/app/models/naf/queued_job.rb +164 -0
- data/app/models/naf/running_job.rb +80 -0
- data/app/models/process/naf/application.rb +164 -0
- data/app/models/process/naf/janitor.rb +117 -0
- data/app/models/process/naf/machine_manager.rb +150 -0
- data/app/models/process/naf/machine_upgrader.rb +112 -0
- data/app/models/process/naf/runner.rb +539 -0
- data/app/views/naf/affinities/_form.html.erb +50 -0
- data/app/views/naf/affinities/edit.html.erb +11 -0
- data/app/views/naf/affinities/index.html.erb +57 -0
- data/app/views/naf/affinities/new.html.erb +15 -0
- data/app/views/naf/affinities/show.html.erb +48 -0
- data/app/views/naf/application_schedule_affinity_tabs/_form.html.erb +31 -0
- data/app/views/naf/application_schedule_affinity_tabs/edit.html.erb +12 -0
- data/app/views/naf/application_schedule_affinity_tabs/new.html.erb +11 -0
- data/app/views/naf/applications/_application_schedule.html.erb +80 -0
- data/app/views/naf/applications/_application_schedule_prerequisites.html.erb +14 -0
- data/app/views/naf/applications/_form.html.erb +109 -0
- data/app/views/naf/applications/_search_container.html.erb +94 -0
- data/app/views/naf/applications/_show.html.erb +34 -0
- data/app/views/naf/applications/edit.html.erb +11 -0
- data/app/views/naf/applications/index.html.erb +51 -0
- data/app/views/naf/applications/index.json.erb +11 -0
- data/app/views/naf/applications/new.html.erb +11 -0
- data/app/views/naf/applications/show.html.erb +203 -0
- data/app/views/naf/datatable.html.erb +49 -0
- data/app/views/naf/historical_job_affinity_tabs/_form.html.erb +36 -0
- data/app/views/naf/historical_job_affinity_tabs/edit.html.erb +11 -0
- data/app/views/naf/historical_job_affinity_tabs/new.html.erb +11 -0
- data/app/views/naf/historical_jobs/_form.html.erb +94 -0
- data/app/views/naf/historical_jobs/_runners.html.erb +22 -0
- data/app/views/naf/historical_jobs/_search_container.html.erb +140 -0
- data/app/views/naf/historical_jobs/edit.html.erb +11 -0
- data/app/views/naf/historical_jobs/index.html.erb +48 -0
- data/app/views/naf/historical_jobs/index.json.erb +26 -0
- data/app/views/naf/historical_jobs/new.html.erb +61 -0
- data/app/views/naf/historical_jobs/show.html.erb +201 -0
- data/app/views/naf/janitorial_assignments/_form.html.erb +38 -0
- data/app/views/naf/janitorial_assignments/_rows.html.erb +17 -0
- data/app/views/naf/janitorial_assignments/edit.html.erb +11 -0
- data/app/views/naf/janitorial_assignments/index.html.erb +56 -0
- data/app/views/naf/janitorial_assignments/index.js.erb +1 -0
- data/app/views/naf/janitorial_assignments/new.html.erb +11 -0
- data/app/views/naf/layouts/jquery_datatables.json.erb +6 -0
- data/app/views/naf/logger_names/_form.html.erb +18 -0
- data/app/views/naf/logger_names/edit.html.erb +11 -0
- data/app/views/naf/logger_names/new.html.erb +11 -0
- data/app/views/naf/logger_names/show.html.erb +44 -0
- data/app/views/naf/logger_styles/_form.html.erb +30 -0
- data/app/views/naf/logger_styles/_logger_style_names.html.erb +19 -0
- data/app/views/naf/logger_styles/edit.html.erb +11 -0
- data/app/views/naf/logger_styles/new.html.erb +11 -0
- data/app/views/naf/logger_styles/show.html.erb +48 -0
- data/app/views/naf/machine_affinity_slots/_form.html.erb +36 -0
- data/app/views/naf/machine_affinity_slots/edit.html.erb +11 -0
- data/app/views/naf/machine_affinity_slots/new.html.erb +11 -0
- data/app/views/naf/machine_runner_invocations/_filter.html.erb +21 -0
- data/app/views/naf/machine_runner_invocations/index.html.erb +36 -0
- data/app/views/naf/machine_runner_invocations/index.json.erb +16 -0
- data/app/views/naf/machine_runner_invocations/show.html.erb +91 -0
- data/app/views/naf/machine_runners/index.html.erb +82 -0
- data/app/views/naf/machine_runners/index.json.erb +16 -0
- data/app/views/naf/machine_runners/show.html.erb +113 -0
- data/app/views/naf/machines/_filter.html.erb +26 -0
- data/app/views/naf/machines/_form.html.erb +62 -0
- data/app/views/naf/machines/_show.html.erb +169 -0
- data/app/views/naf/machines/edit.html.erb +11 -0
- data/app/views/naf/machines/index.html.erb +51 -0
- data/app/views/naf/machines/index.json.erb +23 -0
- data/app/views/naf/machines/new.html.erb +11 -0
- data/app/views/naf/machines/show.html.erb +92 -0
- data/app/views/naf/record.html.erb +46 -0
- data/app/views/naf/shared/_application.html.erb +50 -0
- data/app/views/naf/shared/_information_container.html.erb +19 -0
- data/app/views/naf/shared/_select_per_page.html.erb +72 -0
- data/ci/test-build.sh +17 -0
- data/ci/travis.sh +26 -0
- data/config/initializers/naf.rb +3 -0
- data/config/routes.rb +38 -0
- data/db/migrate/20120820023848_naf_schema.rb +413 -0
- data/doc/README_FOR_APP +2 -0
- data/lib/generators/naf_generator.rb +45 -0
- data/lib/generators/templates/config/logging/af.yml +26 -0
- data/lib/generators/templates/config/logging/naf.yml +22 -0
- data/lib/generators/templates/config/logging/nafjob.yml +16 -0
- data/lib/generators/templates/config/logging/nafrunner.yml +17 -0
- data/lib/generators/templates/naf.rb +11 -0
- data/lib/generators/templates/naf_layout.html.erb +15 -0
- data/lib/naf.rb +48 -0
- data/lib/naf/configuration.rb +23 -0
- data/lib/naf/engine.rb +18 -0
- data/lib/naf/version.rb +3 -0
- data/lib/tasks/naf_tasks.rake +370 -0
- data/naf.gemspec +30 -0
- data/script/rails +10 -0
- data/spec/controllers/naf/affinities_controller_spec.rb +79 -0
- data/spec/controllers/naf/application_controller_spec.rb +10 -0
- data/spec/controllers/naf/application_schedule_affinity_tabs_controller_spec.rb +106 -0
- data/spec/controllers/naf/applications_controller_spec.rb +109 -0
- data/spec/controllers/naf/historical_job_affinity_tabs_controller_spec.rb +96 -0
- data/spec/controllers/naf/historical_jobs_controller_spec.rb +19 -0
- data/spec/controllers/naf/machine_affinity_slots_controller_spec.rb +109 -0
- data/spec/controllers/naf/machines_controller_spec.rb +74 -0
- data/spec/dummy/.gitignore +12 -0
- data/spec/dummy/README +19 -0
- data/spec/dummy/Rakefile +7 -0
- data/spec/dummy/app/assets/javascripts/application.js +16 -0
- data/spec/dummy/app/assets/stylesheets/application.css +14 -0
- data/spec/dummy/app/controllers/application_controller.rb +3 -0
- data/spec/dummy/app/helpers/application_helper.rb +2 -0
- data/spec/dummy/app/models/my_script.rb +8 -0
- data/spec/dummy/app/models/other/base.rb.sample +10 -0
- data/spec/dummy/app/views/layouts/application.html.erb +15 -0
- data/spec/dummy/app/views/layouts/naf_layout.html.erb +15 -0
- data/spec/dummy/config.ru +4 -0
- data/spec/dummy/config/application.rb +62 -0
- data/spec/dummy/config/boot.rb +10 -0
- data/spec/dummy/config/database-non_primary.yml +20 -0
- data/spec/dummy/config/database-primary.yml +16 -0
- data/spec/dummy/config/environment.rb +5 -0
- data/spec/dummy/config/environments/development.rb +37 -0
- data/spec/dummy/config/environments/production.rb +67 -0
- data/spec/dummy/config/environments/test.rb +37 -0
- data/spec/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/spec/dummy/config/initializers/inflections.rb +15 -0
- data/spec/dummy/config/initializers/mime_types.rb +5 -0
- data/spec/dummy/config/initializers/naf.rb.non_primary +4 -0
- data/spec/dummy/config/initializers/naf.rb.primary +3 -0
- data/spec/dummy/config/initializers/secret_token.rb +7 -0
- data/spec/dummy/config/initializers/session_store.rb +8 -0
- data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/spec/dummy/config/locales/en.yml +5 -0
- data/spec/dummy/config/logging/af.yml +26 -0
- data/spec/dummy/config/logging/naf.yml +22 -0
- data/spec/dummy/config/logging/nafjob.yml +16 -0
- data/spec/dummy/config/logging/nafrunner.yml +17 -0
- data/spec/dummy/config/routes.rb +5 -0
- data/spec/dummy/db/.gitignore +2 -0
- data/spec/dummy/lib/tasks/dummy.rake +60 -0
- data/spec/dummy/public/404.html +26 -0
- data/spec/dummy/public/422.html +26 -0
- data/spec/dummy/public/500.html +25 -0
- data/spec/dummy/public/favicon.ico +0 -0
- data/spec/dummy/script/rails +6 -0
- data/spec/factories/naf.rb +433 -0
- data/spec/helpers/naf/application_helper_spec.rb +0 -0
- data/spec/models/logical/naf/application_spec.rb +69 -0
- data/spec/models/logical/naf/job_creator_spec.rb +32 -0
- data/spec/models/logical/naf/job_fetcher_spec.rb +140 -0
- data/spec/models/logical/naf/job_spec.rb +282 -0
- data/spec/models/logical/naf/machine_spec.rb +61 -0
- data/spec/models/naf/affinity_classification_spec.rb +56 -0
- data/spec/models/naf/affinity_spec.rb +100 -0
- data/spec/models/naf/application_run_group_restriction_spec.rb +57 -0
- data/spec/models/naf/application_schedule_affinity_tab_spec.rb +85 -0
- data/spec/models/naf/application_schedule_prerequisite_spec.rb +35 -0
- data/spec/models/naf/application_schedule_spec.rb +166 -0
- data/spec/models/naf/application_spec.rb +128 -0
- data/spec/models/naf/application_type_spec.rb +104 -0
- data/spec/models/naf/historical_job_affinity_tab_spec.rb +59 -0
- data/spec/models/naf/historical_job_prerequisite_spec.rb +25 -0
- data/spec/models/naf/historical_job_spec.rb +334 -0
- data/spec/models/naf/logger_level_spec.rb +34 -0
- data/spec/models/naf/logger_name_spec.rb +35 -0
- data/spec/models/naf/logger_style_name_spec.rb +39 -0
- data/spec/models/naf/logger_style_spec.rb +89 -0
- data/spec/models/naf/machine_affinity_slot_spec.rb +77 -0
- data/spec/models/naf/machine_runner_invocation_spec.rb +38 -0
- data/spec/models/naf/machine_runner_spec.rb +37 -0
- data/spec/models/naf/machine_spec.rb +425 -0
- data/spec/models/naf/naf_base_spec.rb +14 -0
- data/spec/models/naf/queued_job_spec.rb +171 -0
- data/spec/models/naf/running_job_spec.rb +107 -0
- data/spec/models/process/naf/application_spec.rb +8 -0
- data/spec/models/process/naf/janitor_spec.rb +10 -0
- data/spec/models/process/naf/runner_spec.rb +10 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/engine_routing.rb +27 -0
- data/spec/support/script_spec_helper.rb +58 -0
- metadata +590 -0
@@ -0,0 +1,539 @@
|
|
1
|
+
require 'timeout'
|
2
|
+
|
3
|
+
module Process::Naf
|
4
|
+
class Runner < ::Af::Application
|
5
|
+
|
6
|
+
#----------------
|
7
|
+
# *** Options ***
|
8
|
+
#+++++++++++++++++
|
9
|
+
|
10
|
+
# Command-line options for the runner. Elsewhere in this class each option is
# read back through an instance variable of the same name (for example
# @loop_sleep_time), which appears to be populated by Af::Application.

# Upper bound on the number of one-second waits terminate_old_processes spends
# after requesting termination, before it starts sending signals.
opt :wait_time_for_processes_to_terminate,
    "time between asking processes to terminate and sending kill signals",
    argument_note: "SECONDS",
    default: 120
# Converted with `.minutes` and passed to
# Naf::Machine.is_it_time_to_check_schedules? in check_schedules.
opt :check_schedules_period,
    "time between checking schedules",
    argument_note: "MINUTES",
    default: 1
# NOTE(review): not referenced by the code visible here; presumably consumed
# when deciding which schedules should be queued -- confirm.
opt :schedule_fudge_scale,
    "amount of time to look back in schedule for run_start_minute schedules (scaled to --check-schedules-period)",
    default: 5
# Converted with `.minutes` and passed to Machine#is_stale? in check_schedules.
opt :runner_stale_period,
    "amount of time to consider a machine out of touch if it hasn't updated its machine entry",
    argument_note: "MINUTES",
    default: 10
# Used both as the waitpid2 timeout and as the idle sleep in work_machine_loop.
opt :loop_sleep_time,
    "runner main loop sleep time",
    argument_note: "SECONDS",
    default: 30
# Address used by #work to look up this machine's Naf::Machine record.
opt :server_address,
    "set the machine's server address (dangerous)",
    type: :string,
    default: ::Naf::Machine.machine_ip_address,
    hidden: true
# NOTE(review): presumably read by memory_available_to_spawn?, which is not
# visible here -- confirm.
opt :minimum_memory_free,
    "percentage of memory free below which will limit process spawning",
    default: 15.0,
    argument_note: "PERCENT"
# When set, #work leaves the RUBY_* GC environment variables untouched.
opt :disable_gc_modifications,
    "don't modify ruby GC parameters",
    default: false
# When set, #work_machine calls terminate_old_processes before looping.
opt :kill_all_runners,
    "don't wait for runners to wind down and finish running their jobs",
    default: false
|
44
|
+
|
45
|
+
# Sets up the runner: runs the parent initializer, declares the default list
# of logging configuration files, and clears the remembered machine log level.
#
# For each base name (af, naf, nafrunner and this application's af_name) the
# list holds the plain file followed by its Rails-environment-specific variant.
def initialize
  super
  base_names = ["af", "naf", "nafrunner", af_name]
  default_log_files = base_names.flat_map do |base_name|
    ["#{base_name}.yml", "#{base_name}-#{Rails.env}.yml"]
  end
  opt :log_configuration_files, default: default_log_files
  @last_machine_log_level = nil
end
|
57
|
+
|
58
|
+
# Runner entry point.
#
# 1. Unless @disable_gc_modifications is set, exports GC tuning variables for
#    the benefit of forked job processes.
# 2. Looks up the machine record for @server_address; logs fatally and exits
#    with status 1 when there is none.
# 3. While holding the machine's runner lock: marks every other live runner
#    invocation as winding down (or dead when its pid no longer exists), finds
#    or creates the MachineRunner for the current directory, and records a new
#    invocation for this process together with git metadata.
# 4. Runs work_machine and always stamps the invocation's dead_at afterwards.
#
# Git metadata is best effort: each value is nil when git is unavailable, the
# command fails, or it prints nothing.
def work
  unless @disable_gc_modifications
    # These configuration changes will help forked processes, not the runner
    ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
    ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
    ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
    ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
  end

  machine = ::Naf::Machine.find_by_server_address(@server_address)

  unless machine.present?
    logger.fatal "This machine is not configued correctly (ipaddress: #{@server_address})."
    logger.fatal "Please update #{::Naf::Machine.table_name} with an entry for this machine."
    logger.fatal "Exiting..."
    exit 1
  end

  # Runs a git command and returns its stripped output, or nil on any failure.
  # stderr is folded into stdout so that git's "fatal: ..." diagnostics are
  # seen here (and discarded) instead of being missed by the checks; the exit
  # status is the primary failure signal.
  git_output = lambda do |command|
    begin
      output = `#{command} 2>&1`.strip
      if $?.success? && !output.empty? && !output.start_with?('fatal:')
        output
      end
    rescue SystemCallError
      # e.g. the command could not be executed at all
      nil
    end
  end

  machine.lock_for_runner_use
  begin
    # Wind down other runners
    machine.machine_runners.each do |other_runner|
      other_runner.machine_runner_invocations.each do |previous|
        next unless previous.dead_at.blank?

        begin
          # Signal 0 only probes for the existence of the process
          retval = Process.kill(0, previous.pid)
          logger.detail "#{retval} = kill(0, #{previous.pid}) -- process alive, marking runner invocation as winding down"
          previous.wind_down_at = Time.zone.now
          previous.save!
        rescue Errno::ESRCH
          logger.detail "ESRCH = kill(0, #{previous.pid}) -- marking runner invocation as not running"
          previous.dead_at = Time.zone.now
          previous.save!
        end
      end
    end

    # Create a machine runner, if it doesn't exist
    machine_runner = ::Naf::MachineRunner.
      find_or_create_by_machine_id_and_runner_cwd(machine_id: machine.id,
                                                  runner_cwd: Dir.pwd)

    # Repository name is whatever follows the first ':' in the remote listing,
    # with '.git' removed and the leading character dropped.
    remotes = git_output.call('git remote -v')
    remote_path = remotes && remotes.slice(/:\S+/)
    repository_name = remote_path && remote_path.sub('.git', '')[1..-1]

    branch_name = git_output.call('git rev-parse --abbrev-ref HEAD')
    commit_information = git_output.call('git log --pretty="%H" -n 1')
    deployment_tag = git_output.call('git describe --abbrev=0 --tag')

    # Create an invocation for this runner
    invocation = ::Naf::MachineRunnerInvocation.create!(machine_runner_id: machine_runner.id,
                                                        pid: Process.pid,
                                                        repository_name: repository_name,
                                                        branch_name: branch_name,
                                                        commit_information: commit_information,
                                                        deployment_tag: deployment_tag)
  ensure
    machine.unlock_for_runner_use
  end

  begin
    work_machine(machine, invocation)
  ensure
    # Record the end of this invocation however work_machine finished
    invocation.dead_at = Time.zone.now
    invocation.save!
  end
end
|
138
|
+
|
139
|
+
# Marks the machine alive and up, optionally terminates leftover job
# processes, then repeats work_machine_loop (running the garbage collector
# after every successful pass) until it returns a falsy value.
#
# Also resets the pid => running job map (@children), creates the job fetcher,
# and registers an at_exit hook that tears down any remaining children.
def work_machine(machine, invocation)
  machine.mark_alive
  machine.mark_up

  # Make sure no processes are thought to be running on this machine
  if @kill_all_runners
    terminate_old_processes(machine)
  end

  logger.info "working: #{machine}"

  @children = {}

  at_exit do
    ::Af::Application.singleton.emergency_teardown
  end

  @job_fetcher = ::Logical::Naf::JobFetcher.new(machine)

  while work_machine_loop(machine, invocation)
    GC.start
  end

  logger.info "runner quitting"
end
|
163
|
+
|
164
|
+
# Runs one pass of the runner's main loop for +machine+.
#
# A pass, in order:
# * reloads the machine and stops when it is disabled or marked down;
# * marks the machine alive and applies a changed machine log level;
# * reloads the invocation and stops when it was asked to wind down and no
#   children remain;
# * checks schedules (only while not winding down);
# * reaps exited children, or sleeps @loop_sleep_time when there are none;
# * starts new jobs while the machine has capacity.
#
# Returns false when the caller should stop looping, true otherwise.
# ActiveRecord errors are re-raised; other StandardErrors raised while cleaning
# up a child or starting a job are logged and swallowed.
def work_machine_loop(machine, invocation)
  machine.reload

  # Check machine status
  unless machine.enabled
    logger.warn "this machine is disabled #{machine}"
    return false
  end
  if machine.marked_down
    logger.warn "this machine is marked down #{machine}"
    return false
  end

  machine.mark_alive

  current_log_level = machine.log_level
  if current_log_level != @last_machine_log_level
    @last_machine_log_level = current_log_level
    if @last_machine_log_level.present?
      logging_configurator.parse_and_set_logger_levels(@last_machine_log_level)
    end
  end

  invocation.reload
  winding_down = invocation.wind_down_at.present?
  if winding_down
    logger.warn "invocation asked to wind down"
    return false if @children.empty?
  end

  check_schedules(machine) unless winding_down

  # clean up children that have exited
  logger.detail "cleaning up dead children: #{@children.length}"

  if @children.empty?
    logger.detail "sleeping in loop: #{@loop_sleep_time} seconds"
    sleep(@loop_sleep_time)
  else
    until @children.empty?
      reaped_pid = nil
      exit_state = nil
      begin
        reaped_pid, exit_state = Timeout.timeout(@loop_sleep_time) { Process.waitpid2(-1) }
      rescue Timeout::Error
        # XXX is there a race condition where a child process exits
        # XXX has not set pid or status yet and timeout fires?
        # XXX i bet there is
        # XXX so this code is here:
        dead_children = @children.values.reject { |job| is_job_process_alive?(job) }

        if dead_children.present?
          logger.error "#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}"
          logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
        end

        # Nothing exited within the timeout; leave reaping for the next pass
        break
      rescue Errno::ECHILD => no_child_error
        logger.error "#{machine} No child when we thought we had children #{@children.inspect}"
        logger.warn no_child_error
        reaped_pid = @children.first.try(:first)
        exit_state = nil
        logger.warn "pulling first child off list to clean it up: pid=#{reaped_pid}"
      end

      next unless reaped_pid

      begin
        child_job = @children.delete(reaped_pid)

        if child_job.blank?
          # XXX ERROR no child for returned pid -- this can't happen
          logger.warn "child pid: #{reaped_pid}, status: #{exit_state.inspect}, not managed by this runner"
        else
          # Update job tags
          child_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])

          if exit_state.nil? || exit_state.exited? || exit_state.signaled?
            logger.info { "cleaning up dead child: #{child_job.reload}" }
            finish_job(child_job,
                       { exit_status: (exit_state && exit_state.exitstatus),
                         termination_signal: (exit_state && exit_state.termsig) })
          else
            # this can happen if the child is sigstopped
            logger.warn "child waited for did not exit: #{child_job}, status: #{exit_state.inspect}"
          end
        end
      rescue ActiveRecord::ActiveRecordError
        raise
      rescue StandardError => cleanup_error
        # XXX just incase a job control failure -- more code here
        logger.error "some failure during child clean up"
        logger.warn cleanup_error
      end
    end
  end

  # start new jobs
  logger.detail "starting new jobs, num children: #{@children.length}/#{machine.thread_pool_size}"
  # XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? && invocation.wind_down_at.blank?
  while ::Naf::RunningJob.where(:started_on_machine_id => machine.id).count < machine.thread_pool_size &&
        memory_available_to_spawn? &&
        invocation.wind_down_at.blank?

    logger.debug_gross "fetching jobs because: children: #{@children.length} < #{machine.thread_pool_size} (poolsize)"
    begin
      running_job = @job_fetcher.fetch_next_job

      if running_job.blank?
        logger.debug_gross "no more jobs to run"
        break
      end

      logger.info "starting new job : #{running_job}"

      historical_job = running_job.historical_job
      spawned_pid = historical_job.spawn
      if spawned_pid
        @children[spawned_pid] = running_job
        running_job.pid = spawned_pid
        historical_job.pid = spawned_pid
        historical_job.failed_to_start = false
        historical_job.machine_runner_invocation_id = invocation.id
        logger.info "job started : #{running_job}"
        running_job.save!
        historical_job.save!
      else
        # should never get here (well, hopefully)
        logger.error "#{machine}: failed to execute #{running_job}"

        finish_job(running_job, { failed_to_start: true })
      end
    rescue ActiveRecord::ActiveRecordError
      raise
    rescue StandardError => start_error
      # XXX rescue for various issues
      logger.error "#{machine}: failure during job start"
      logger.warn start_error
    end
  end
  logger.debug_gross "done starting jobs"

  true
end
|
312
|
+
|
313
|
+
# Queues applications whose schedules are due and marks stale runner machines
# as down. Does nothing unless Naf::Machine reports that a schedule check is
# due (per @check_schedules_period minutes) and the schedule lock is obtained.
#
# The schedule lock is held only while recording on +machine+ that it checked
# the schedule, and is released in an ensure so a failure there cannot leave
# the lock held. A schedule that raises JobPrerequisiteLoop while being queued
# is disabled and an alarm is logged.
#
# The return value is not meaningful.
def check_schedules(machine)
  return unless ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)

  logger.debug "it's time to check schedules"
  return unless ::Naf::ApplicationSchedule.try_lock_schedules

  begin
    logger.debug_gross "checking schedules"
    machine.mark_checked_schedule
  ensure
    # Always release the lock, even when marking the machine fails
    ::Naf::ApplicationSchedule.unlock_schedules
  end

  # check scheduled tasks
  should_be_queued(machine).each do |application_schedule|
    logger.info "scheduled application: #{application_schedule}"
    begin
      naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
      # this doesn't work very well for run_group_limits in the thousands
      (application_schedule.application_run_group_limit || 1).times do
        naf_boss.enqueue_application_schedule(application_schedule)
      end
    rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
      logger.error "#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}"
      logger.warn jpl
      application_schedule.enabled = false
      application_schedule.save!
      logger.alarm "Application Schedule disabled due to loop: #{application_schedule}"
    end
  end

  # check the runner machines
  ::Naf::Machine.enabled.up.each do |runner_to_check|
    next unless runner_to_check.is_stale?(@runner_stale_period.minutes)

    logger.alarm "runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}"
    runner_to_check.mark_machine_down(machine)
  end
end
|
349
|
+
|
350
|
+
# XXX update_all doesn't support "from_partition" so we have this helper
|
351
|
+
# Issues a parameterized UPDATE against the partition table that
# Naf::HistoricalJob reports for +historical_job_id+.
#
# updates           - Hash of column name => new value. It is not modified;
#                     an updated_at of the current time is added to a copy.
# historical_job_id - id of the row to update, also used to pick the partition.
#
# Returns whatever Naf::HistoricalJob.find_by_sql returns for the statement.
def update_historical_job(updates, historical_job_id)
  # Work on a copy so the caller's hash (possibly frozen) is left untouched
  column_values = updates.merge(updated_at: Time.zone.now)
  assignments = column_values.keys.map { |column| "#{column} = ?" }.join(", ")
  update_sql = <<-SQL
    UPDATE
      #{::Naf::HistoricalJob.partition_table_name(historical_job_id)}
    SET
      #{assignments}
    WHERE
      id = ?
  SQL
  ::Naf::HistoricalJob.find_by_sql([update_sql] + column_values.values + [historical_job_id])
end
|
364
|
+
|
365
|
+
# Finishes +running_job+: replaces all tags on its historical job with the
# cleanup system tag, then, inside one transaction, writes +updates+ plus a
# finished_at timestamp to the historical job row and deletes the running job
# row, and finally removes the cleanup tag again.
#
# updates - optional Hash of extra historical job columns to set.
def finish_job(running_job, updates = {})
  historical_job = running_job.historical_job
  cleanup_tags = [::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]]

  historical_job.remove_all_tags
  historical_job.add_tags(cleanup_tags)

  ::Naf::HistoricalJob.transaction do
    final_updates = updates.merge({ finished_at: Time.zone.now })
    update_historical_job(final_updates, running_job.id)
    running_job.delete
  end

  historical_job.remove_tags(cleanup_tags)
end
|
376
|
+
|
377
|
+
# kill(0, pid) seems to fail during at_exit block
|
378
|
+
# so this shoots from the hip
|
379
|
+
# Last-ditch shutdown of every tracked child job: sends TERM to each, waits
# two seconds, then sends KILL to each child still tracked and finishes its
# job record. Returns immediately when no children are tracked.
#
# Each pass iterates over a snapshot of @children.
def emergency_teardown
  return if @children.empty?

  logger.warn "emergency teardown of #{@children.length} job(s)"

  @children.values.each do |child|
    send_signal_and_maybe_clean_up(child, "TERM")
  end

  sleep(2)

  @children.values.each do |child|
    send_signal_and_maybe_clean_up(child, "KILL")

    # force job down
    finish_job(child)
  end
end
|
393
|
+
|
394
|
+
# Shuts down jobs that are still assigned to +machine+, escalating in stages:
#
#   1. set request_to_terminate on each job and wait up to
#      @wait_time_for_processes_to_terminate seconds,
#   2. send TERM to whatever is left and wait up to 5 seconds,
#   3. send KILL to whatever is still left and record those jobs as finished.
#
# Returns early as soon as assigned_jobs(machine) comes back empty.
def terminate_old_processes(machine)
  # check if any processes are hanging around and ask them
  # politely if they will please terminate
  jobs = assigned_jobs(machine)
  if jobs.empty?
    logger.detail "no jobs to remove"
    return
  end

  logger.info "number of old jobs to sift through: #{jobs.length}"
  jobs.each do |job|
    logger.detail "job still around: #{job}"
    if job.request_to_terminate == false
      logger.warn "politely asking process: #{job.pid} to terminate itself"
      job.request_to_terminate = true
      job.save!
    end
  end

  # wait
  (1..@wait_time_for_processes_to_terminate).each do |i|
    num_assigned_jobs = assigned_jobs(machine).length
    return if num_assigned_jobs == 0
    logger.debug_medium "#{i}/#{@wait_time_for_processes_to_terminate}: sleeping 1 second while we wait for " +
      "#{num_assigned_jobs} assigned job(s) to terminate as requested"
    sleep(1)
  end

  # nudge them to terminate
  jobs = assigned_jobs(machine)
  if jobs.empty?
    logger.debug_gross "assigned jobs have exited after asking to terminate nicely"
    return
  end

  jobs.each do |job|
    logger.warn "sending SIG_TERM to process: #{job}"
    send_signal_and_maybe_clean_up(job, "TERM")
  end

  # wait
  (1..5).each do |i|
    num_assigned_jobs = assigned_jobs(machine).length
    return if num_assigned_jobs == 0
    logger.debug_medium "#{i}/5: sleeping 1 second while we wait for #{num_assigned_jobs} assigned job(s) to terminate from SIG_TERM"
    sleep(1)
  end

  # kill with fire
  assigned_jobs(machine).each do |job|
    logger.alarm "sending SIG_KILL to process: #{job}"

    # force the job down -- but only when the KILL was actually delivered; when
    # the helper returns false it has already finished the job itself, so
    # finishing it again here would just repeat the same writes.
    finish_job(job) if send_signal_and_maybe_clean_up(job, "KILL")
  end
end
|
449
|
+
|
450
|
+
# Sends +signal+ to the process behind +job+, finishing the job when there is
# no process to signal.
#
# job    - job record exposing #pid.
# signal - anything Process.kill accepts as a signal.
#
# Returns true when Process.kill succeeded. Returns false, after calling
# finish_job(job), when the job has no pid or the kill raised Errno::ESRCH.
def send_signal_and_maybe_clean_up(job, signal)
  delivered =
    if job.pid.nil?
      false
    else
      begin
        result = Process.kill(signal, job.pid)
        logger.detail "#{result} = kill(#{signal}, #{job.pid})"
        true
      rescue Errno::ESRCH
        logger.detail "ESRCH = kill(#{signal}, #{job.pid})"
        false
      end
    end

  # no pid, or the process does not exist -- mark the job finished
  finish_job(job) unless delivered
  delivered
end
|
470
|
+
|
471
|
+
# Probes the process behind +job+ with signal 0 and returns the result of
# send_signal_and_maybe_clean_up. Note the side effect: that helper calls
# finish_job on a job with no pid or whose process no longer exists.
def is_job_process_alive?(job)
  send_signal_and_maybe_clean_up(job, 0)
end
|
474
|
+
|
475
|
+
# Returns the jobs ::Naf::RunningJob reports as assigned to +machine+ for
# which is_job_process_alive? is true. The liveness probe may finish jobs
# whose process is gone as a side effect.
def assigned_jobs(machine)
  candidates = ::Naf::RunningJob.assigned_jobs(machine)
  candidates.select { |candidate| is_job_process_alive?(candidate) }
end
|
480
|
+
|
481
|
+
# Picks the enabled application schedules that are due to be queued now.
#
# machine - accepted for interface compatibility; not used by this method.
#
# Returns an Array of schedules: the due run_interval-based ones followed by
# the due run_start_minute-based ones, keeping only those that either have
# enqueue_backlogs set or are not limited by their run group.
def should_be_queued(machine)
  # Read the clock once so every comparison in this pass uses the same instant
  # (and the same calendar day) instead of re-reading it per expression.
  now = Time.zone.now
  today = now.to_date

  # Applications with a recently queued, unfinished job that has not been
  # asked to terminate, keyed by application id.
  not_finished_applications = ::Naf::HistoricalJob.
    queued_between(now - Naf::HistoricalJob::JOB_STALE_TIME, now).
    where("finished_at IS NULL AND request_to_terminate = false").
    find_all { |job| job.application_id.present? }.
    index_by { |job| job.application_id }

  application_last_runs = ::Naf::HistoricalJob.application_last_runs.
    index_by { |job| job.application_id }

  # find the run_interval based schedules that should be queued
  # select anything that isn't currently running and completed
  # running more than run_interval minutes ago
  relative_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).relative_schedules.select do |schedule|
    next false unless not_finished_applications[schedule.application_id].nil?

    last_run = application_last_runs[schedule.application_id]
    last_run.nil? || (now - last_run.finished_at) > schedule.run_interval.minutes
  end

  # find the run_start_minute based schedules
  # select anything that
  #   isn't currently running (or queued) AND
  #   hasn't run since run_start_time AND
  #   should have been run by now AND
  #   that should have run within fudge period
  exact_schedules_what_need_queuin = ::Naf::ApplicationSchedule.where(enabled: true).exact_schedules.select do |schedule|
    next false unless not_finished_applications[schedule.application_id].nil?

    last_run = application_last_runs[schedule.application_id]
    scheduled_at = today + schedule.run_start_minute.minutes
    next false unless last_run.nil? || scheduled_at >= last_run.finished_at

    overdue_by = now - scheduled_at
    overdue_by >= 0.seconds &&
      overdue_by <= (@check_schedules_period * @schedule_fudge_scale).minutes
  end

  foreman = ::Logical::Naf::ConstructionZone::Foreman.new
  (relative_schedules_what_need_queuin + exact_schedules_what_need_queuin).select do |schedule|
    schedule.enqueue_backlogs || !foreman.limited_by_run_group?(schedule.application_run_group_restriction,
                                                                schedule.application_run_group_name,
                                                                schedule.application_run_group_limit)
  end
end
|
522
|
+
|
523
|
+
# Checks whether the percentage of free memory reported by Facter is at least
# @minimum_memory_free.
#
# Returns true (logging a detail line) when it is, false (logging an alarm
# that names the host) when it is not.
def memory_available_to_spawn?
  # NOTE(review): presumably cleared so the memory facts are re-read on every
  # call rather than served from Facter's cache -- confirm.
  Facter.clear
  total_mb = Facter.memorysize_mb.to_f
  free_mb = Facter.memoryfree_mb.to_f
  memory_free_percentage = (free_mb / total_mb) * 100.0

  enough_memory = memory_free_percentage >= @minimum_memory_free
  if enough_memory
    logger.detail "memory available: #{memory_free_percentage}% (free) >= #{@minimum_memory_free}% (min percent)"
  else
    logger.alarm "#{Facter.hostname}.#{Facter.domain}: not enough memory to spawn: #{memory_free_percentage}% (free) < #{@minimum_memory_free}% (min percent)"
  end

  enough_memory
end
|
537
|
+
|
538
|
+
end
|
539
|
+
end
|