naf 1.1.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/app/assets/images/{papertrail_job.png → job.png} +0 -0
- data/app/assets/images/{papertrail_machine.png → machine.png} +0 -0
- data/app/assets/images/{papertrail_machine_runner.png → machine_runner.png} +0 -0
- data/app/assets/javascripts/col_reorder_with_resize.js +1228 -0
- data/app/assets/javascripts/dataTablesTemplates/applications.js +2 -1
- data/app/assets/javascripts/dataTablesTemplates/jobs.js +2 -1
- data/app/assets/javascripts/dataTablesTemplates/machine_runner_invocations.js +2 -1
- data/app/assets/javascripts/dataTablesTemplates/machine_runners.js +2 -1
- data/app/assets/javascripts/dataTablesTemplates/machines.js +2 -1
- data/app/assets/javascripts/jquery.dataTables.js +10339 -5103
- data/app/assets/javascripts/naf.js +1 -0
- data/app/assets/stylesheets/jquery_ui/jquery-ui-1.8.5.custom.css.erb +6 -6
- data/app/assets/stylesheets/min_naf/layout.css.scss +94 -43
- data/app/assets/stylesheets/naf/layout.css.scss +94 -43
- data/app/controllers/naf/affinities_controller.rb +1 -1
- data/app/controllers/naf/applications_controller.rb +3 -0
- data/app/controllers/naf/historical_job_affinity_tabs_controller.rb +1 -1
- data/app/controllers/naf/historical_jobs_controller.rb +2 -5
- data/app/controllers/naf/log_parsers_controller.rb +16 -0
- data/app/controllers/naf/log_viewer_controller.rb +19 -0
- data/app/controllers/naf/machine_affinity_slots_controller.rb +1 -1
- data/app/controllers/naf/machine_runners_controller.rb +12 -0
- data/app/controllers/naf/machines_controller.rb +8 -10
- data/app/controllers/naf/status_controller.rb +12 -0
- data/app/helpers/naf/application_helper.rb +19 -38
- data/app/helpers/naf/time_helper.rb +37 -0
- data/app/models/logical/naf/application.rb +13 -19
- data/app/models/logical/naf/construction_zone/boss.rb +1 -1
- data/app/models/logical/naf/construction_zone/foreman.rb +1 -1
- data/app/models/logical/naf/job.rb +39 -34
- data/app/models/logical/naf/job_creator.rb +19 -23
- data/app/models/logical/naf/job_fetcher.rb +36 -6
- data/app/models/logical/naf/log_file.rb +70 -0
- data/app/models/logical/naf/log_parser/base.rb +272 -0
- data/app/models/logical/naf/log_parser/job.rb +65 -0
- data/app/models/logical/naf/log_parser/machine.rb +64 -0
- data/app/models/logical/naf/log_parser/runner.rb +72 -0
- data/app/models/logical/naf/log_reader.rb +85 -0
- data/app/models/logical/naf/machine.rb +39 -1
- data/app/models/naf/affinity.rb +18 -0
- data/app/models/naf/application_schedule_affinity_tab.rb +1 -0
- data/app/models/naf/application_type.rb +2 -1
- data/app/models/naf/historical_job.rb +9 -29
- data/app/models/naf/machine.rb +8 -0
- data/app/models/naf/machine_runner.rb +11 -2
- data/app/models/naf/machine_runner_invocation.rb +9 -1
- data/app/models/naf/running_job.rb +40 -1
- data/app/models/process/naf/application.rb +3 -3
- data/app/models/process/naf/log_archiver.rb +78 -0
- data/app/models/process/naf/machine_manager.rb +3 -1
- data/app/models/process/naf/runner.rb +286 -162
- data/app/models/process/naf/runner_log.rb +26 -0
- data/app/views/naf/application_schedule_affinity_tabs/_form.html.erb +1 -5
- data/app/views/naf/applications/show.html.erb +1 -1
- data/app/views/naf/historical_job_affinity_tabs/_form.html.erb +1 -5
- data/app/views/naf/historical_jobs/_form.html.erb +1 -1
- data/app/views/naf/historical_jobs/_runners.html.erb +21 -12
- data/app/views/naf/historical_jobs/_search_container.html.erb +1 -2
- data/app/views/naf/historical_jobs/index.html.erb +0 -1
- data/app/views/naf/historical_jobs/index.json.erb +4 -4
- data/app/views/naf/historical_jobs/show.html.erb +57 -51
- data/app/views/naf/log_viewer/_job_logs.html.erb +65 -0
- data/app/views/naf/log_viewer/_log_display.html.erb +259 -0
- data/app/views/naf/log_viewer/_log_layout.html.erb +59 -0
- data/app/views/naf/log_viewer/_machine_logs.html.erb +62 -0
- data/app/views/naf/log_viewer/_runner_logs.html.erb +62 -0
- data/app/views/naf/log_viewer/_search_options.html.erb +36 -0
- data/app/views/naf/log_viewer/_update_page_title.html.erb +9 -0
- data/app/views/naf/log_viewer/index.html.erb +1 -0
- data/app/views/naf/logger_names/_form.html.erb +1 -2
- data/app/views/naf/machine_affinity_slots/_form.html.erb +1 -5
- data/app/views/naf/machine_runner_invocations/show.html.erb +4 -0
- data/app/views/naf/machine_runners/show.html.erb +44 -34
- data/app/views/naf/machines/index.json.erb +14 -6
- data/app/views/naf/machines/show.html.erb +44 -40
- data/app/views/naf/shared/_auto_resize_width.html.erb +7 -0
- data/app/views/naf/shared/_date_select.html.erb +65 -0
- data/app/views/naf/shared/_select_per_page.html.erb +48 -13
- data/app/views/naf/status/index.html.erb +27 -0
- data/bin/naf +26 -0
- data/config/initializers/naf.rb +13 -1
- data/config/routes.rb +16 -2
- data/db/migrate/20131106162436_add_uuid_column_to_machine_runner_invocations.rb +15 -0
- data/db/migrate/20131121185222_move_tabs_column_from_historical_jobs_to_running_jobs.rb +15 -0
- data/lib/generators/templates/config/logging/naf.yml +0 -8
- data/lib/generators/templates/config/logging/nafjob.yml +0 -8
- data/lib/generators/templates/config/logging/nafrunner.yml +0 -8
- data/lib/generators/templates/naf.rb +0 -8
- data/lib/naf.rb +0 -8
- data/lib/naf/configuration.rb +0 -4
- data/lib/naf/version.rb +1 -1
- data/lib/tasks/naf_tasks.rake +18 -0
- data/naf.gemspec +3 -1
- data/spec/controllers/naf/affinities_controller_spec.rb +0 -1
- data/spec/controllers/naf/applications_controller_spec.rb +3 -2
- data/spec/controllers/naf/machine_affinity_slots_controller_spec.rb +0 -1
- data/spec/controllers/naf/machines_controller_spec.rb +1 -1
- data/spec/dummy/config/logging/naf.yml +0 -8
- data/spec/dummy/config/logging/nafjob.yml +0 -9
- data/spec/dummy/config/logging/nafrunner.yml +0 -10
- data/spec/factories/naf.rb +4 -0
- data/spec/models/logical/naf/application_spec.rb +3 -4
- data/spec/models/logical/naf/job_creator_spec.rb +91 -21
- data/spec/models/logical/naf/job_spec.rb +19 -6
- data/spec/models/logical/naf/log_file_spec.rb +105 -0
- data/spec/models/logical/naf/machine_runner_invocation_spec.rb +41 -0
- data/spec/models/logical/naf/machine_runner_spec.rb +42 -0
- data/spec/models/logical/naf/machine_spec.rb +98 -28
- data/spec/models/naf/affinity_classification_spec.rb +20 -0
- data/spec/models/naf/affinity_spec.rb +21 -0
- data/spec/models/naf/historical_job_spec.rb +2 -44
- data/spec/models/naf/machine_runner_invocation_spec.rb +17 -1
- data/spec/models/naf/running_job_spec.rb +64 -1
- metadata +40 -9
- data/app/models/log4r/papertrail_outputter.rb +0 -19
- data/app/views/naf/historical_jobs/edit.html.erb +0 -11
- data/app/views/naf/machines/_show.html.erb +0 -169
data/app/models/naf/machine.rb
CHANGED
|
@@ -42,9 +42,18 @@ module Naf
|
|
|
42
42
|
#{::Naf.schema_name}.machine_runner_invocations.wind_down_at IS NOT NULL")
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
def self.
|
|
45
|
+
def self.dead_count
|
|
46
46
|
(::Naf::MachineRunner.joins(:machine).where("#{::Naf.schema_name}.machines.enabled IS TRUE").pluck(:machine_id) -
|
|
47
|
-
::Naf::MachineRunner.running.pluck(:machine_id)
|
|
47
|
+
::Naf::MachineRunner.running.pluck(:machine_id) -
|
|
48
|
+
::Naf::MachineRunner.winding_down.pluck(:machine_id)).uniq.count
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
#-------------------------
|
|
52
|
+
# *** Instance Methods ***
|
|
53
|
+
#+++++++++++++++++++++++++
|
|
54
|
+
|
|
55
|
+
def current_invocation
|
|
56
|
+
machine_runner_invocations.where(dead_at: nil, wind_down_at: nil).order(:id).last
|
|
48
57
|
end
|
|
49
58
|
|
|
50
59
|
end
|
|
@@ -8,7 +8,8 @@ module Naf
|
|
|
8
8
|
:commit_information,
|
|
9
9
|
:branch_name,
|
|
10
10
|
:repository_name,
|
|
11
|
-
:deployment_tag
|
|
11
|
+
:deployment_tag,
|
|
12
|
+
:uuid
|
|
12
13
|
|
|
13
14
|
#---------------------
|
|
14
15
|
# *** Associations ***
|
|
@@ -49,6 +50,13 @@ module Naf
|
|
|
49
50
|
end
|
|
50
51
|
end
|
|
51
52
|
|
|
53
|
+
def self.recently_marked_dead(time)
|
|
54
|
+
where("
|
|
55
|
+
#{::Naf.schema_name}.machine_runner_invocations.dead_at IS NOT NULL AND
|
|
56
|
+
#{::Naf.schema_name}.machine_runner_invocations.dead_at > ?", Time.zone.now - time
|
|
57
|
+
)
|
|
58
|
+
end
|
|
59
|
+
|
|
52
60
|
#-------------------------
|
|
53
61
|
# *** Instance Methods ***
|
|
54
62
|
#+++++++++++++++++++++++++
|
|
@@ -12,7 +12,8 @@ module Naf
|
|
|
12
12
|
:request_to_terminate,
|
|
13
13
|
:marked_dead_by_machine_id,
|
|
14
14
|
:log_level,
|
|
15
|
-
:started_at
|
|
15
|
+
:started_at,
|
|
16
|
+
:tags
|
|
16
17
|
|
|
17
18
|
#---------------------
|
|
18
19
|
# *** Associations ***
|
|
@@ -48,6 +49,11 @@ module Naf
|
|
|
48
49
|
where(started_on_machine_id: machine.id)
|
|
49
50
|
end
|
|
50
51
|
|
|
52
|
+
def self.started_on_invocation(invocation_id)
|
|
53
|
+
joins(:historical_job).
|
|
54
|
+
where("#{::Naf.schema_name}.historical_jobs.machine_runner_invocation_id = #{invocation_id}")
|
|
55
|
+
end
|
|
56
|
+
|
|
51
57
|
def self.in_run_group(run_group_name)
|
|
52
58
|
where(application_run_group_name: run_group_name)
|
|
53
59
|
end
|
|
@@ -76,5 +82,38 @@ module Naf
|
|
|
76
82
|
job_weights
|
|
77
83
|
end
|
|
78
84
|
|
|
85
|
+
#-------------------------
|
|
86
|
+
# *** Instance Methods ***
|
|
87
|
+
#+++++++++++++++++++++++++
|
|
88
|
+
|
|
89
|
+
def add_tags(tags_to_add)
|
|
90
|
+
tags_array = nil
|
|
91
|
+
if self.tags.present?
|
|
92
|
+
tags_array = self.tags.gsub(/[{}]/,'').split(',')
|
|
93
|
+
new_tags = '{' + (tags_array | tags_to_add).join(',') + '}'
|
|
94
|
+
else
|
|
95
|
+
new_tags = '{' + tags_to_add.join(',') + '}'
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
self.tags = new_tags
|
|
99
|
+
self.save!
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def remove_tags(tags_to_remove)
|
|
103
|
+
if self.tags.present?
|
|
104
|
+
tags_array = self.tags.gsub(/[{}]/,'').split(',')
|
|
105
|
+
new_tags = '{' + (tags_array - tags_to_remove).join(',') + '}'
|
|
106
|
+
|
|
107
|
+
self.tags = new_tags
|
|
108
|
+
self.save!
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def remove_all_tags
|
|
113
|
+
self.tags = '{}'
|
|
114
|
+
self.save!
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
|
|
79
118
|
end
|
|
80
119
|
end
|
|
@@ -7,7 +7,7 @@ module Process::Naf
|
|
|
7
7
|
def initialize(job, reason)
|
|
8
8
|
@job = job
|
|
9
9
|
@reason = reason
|
|
10
|
-
super("Requested to terminate: #{reason}")
|
|
10
|
+
super("Requested to terminate by Naf: #{reason}")
|
|
11
11
|
end
|
|
12
12
|
end
|
|
13
13
|
|
|
@@ -93,7 +93,7 @@ module Process::Naf
|
|
|
93
93
|
end
|
|
94
94
|
|
|
95
95
|
def job_tag_block(*tags, &block)
|
|
96
|
-
job = fetch_naf_job
|
|
96
|
+
job = fetch_naf_job.try(:running_job)
|
|
97
97
|
begin
|
|
98
98
|
if job
|
|
99
99
|
add_job_tags(*tags)
|
|
@@ -107,7 +107,7 @@ module Process::Naf
|
|
|
107
107
|
end
|
|
108
108
|
|
|
109
109
|
def update_job_tags(old_tags, new_tags)
|
|
110
|
-
job = fetch_naf_job
|
|
110
|
+
job = fetch_naf_job.try(:running_job)
|
|
111
111
|
if job
|
|
112
112
|
job.remove_tags(old_tags.map(&:to_s))
|
|
113
113
|
job.add_tags(new_tags.map(&:to_s))
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require 'aws'
|
|
2
|
+
|
|
3
|
+
module Process::Naf
|
|
4
|
+
class LogArchiver < ::Process::Naf::Application
|
|
5
|
+
|
|
6
|
+
NAF_JOBS_LOG_PATH = "#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/jobs/"
|
|
7
|
+
NAF_RUNNERS_LOG_PATH = "#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/runners/*/*"
|
|
8
|
+
DATE_REGEX = /\d{8}_\d{6}/
|
|
9
|
+
LOG_RETENTION = 1
|
|
10
|
+
|
|
11
|
+
def work
|
|
12
|
+
# Use AWS credentials to access S3
|
|
13
|
+
s3 = AWS::S3.new(access_key_id: AWS_ID,
|
|
14
|
+
secret_access_key: AWS_KEY,
|
|
15
|
+
ssl_verify_peer: false)
|
|
16
|
+
|
|
17
|
+
# Each project will have a specific bucket
|
|
18
|
+
bucket = s3.buckets[NAF_BUCKET]
|
|
19
|
+
files = log_files
|
|
20
|
+
|
|
21
|
+
logger.info 'Starting to save files to s3...'
|
|
22
|
+
files.each do |file|
|
|
23
|
+
# Write file if not existent
|
|
24
|
+
object = bucket.objects["naf/#{project_name}/#{Rails.env}/#{creation_time}" + file[12..-1]]
|
|
25
|
+
if !object.exists?
|
|
26
|
+
# Write file to S3
|
|
27
|
+
result = object.write(File.open(file).read)
|
|
28
|
+
logger.info "File #{file} saved to S3"
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
logger.info 'Starting to archive files...'
|
|
33
|
+
archive_old_files(files)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
private
|
|
37
|
+
|
|
38
|
+
def project_name
|
|
39
|
+
(`git remote -v`).slice(/\/\S+/).sub('.git','')[1..-1]
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def log_files
|
|
43
|
+
files = Dir[NAF_JOBS_LOG_PATH + "*/*"]
|
|
44
|
+
files += Dir[NAF_RUNNERS_LOG_PATH + "*/*"]
|
|
45
|
+
# Sort log files based on time
|
|
46
|
+
files = files.sort { |x, y| Time.parse(y.scan(DATE_REGEX).first) <=> Time.parse(x.scan(DATE_REGEX).first) }
|
|
47
|
+
|
|
48
|
+
return files
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def creation_time
|
|
52
|
+
::Naf::ApplicationType.first.created_at.strftime("%Y%m%d_%H%M%S")
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def archive_old_files(files)
|
|
56
|
+
copy_files
|
|
57
|
+
today = Time.zone.now.to_date
|
|
58
|
+
files.each do |file|
|
|
59
|
+
if (today - Time.parse(file.scan(DATE_REGEX).first).to_date).to_i > LOG_RETENTION
|
|
60
|
+
logger.info "Archived file: #{file}"
|
|
61
|
+
`rm #{file}`
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def copy_files
|
|
67
|
+
if File.directory?(Naf::LOGGING_ROOT_DIRECTORY + "/naf")
|
|
68
|
+
# Each archive will have a unique path based on the time archived
|
|
69
|
+
time = Time.zone.now.to_s
|
|
70
|
+
FileUtils.mkdir_p(Naf::LOGGING_ROOT_DIRECTORY + Naf::LOGGING_ARCHIVE_DIRECTORY + "/#{time}")
|
|
71
|
+
|
|
72
|
+
# Move the naf logs into the archive directory
|
|
73
|
+
`cp -r #{Naf::LOGGING_ROOT_DIRECTORY}/naf #{Naf::LOGGING_ROOT_DIRECTORY + Naf::LOGGING_ARCHIVE_DIRECTORY}/#{time.gsub(' ', '\ ')}`
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -27,7 +27,9 @@ module Process::Naf
|
|
|
27
27
|
if @update_machine
|
|
28
28
|
machine = ::Naf::Machine.find_by_server_address(@server_address)
|
|
29
29
|
if machine.blank?
|
|
30
|
-
|
|
30
|
+
server_name = (`hostname`).strip
|
|
31
|
+
machine = ::Naf::Machine.create(server_address: @server_address,
|
|
32
|
+
server_name: server_name)
|
|
31
33
|
add_default_affinities(machine)
|
|
32
34
|
end
|
|
33
35
|
|
|
@@ -3,6 +3,9 @@ require 'timeout'
|
|
|
3
3
|
module Process::Naf
|
|
4
4
|
class Runner < ::Af::Application
|
|
5
5
|
|
|
6
|
+
attr_accessor :machine,
|
|
7
|
+
:current_invocation
|
|
8
|
+
|
|
6
9
|
#----------------
|
|
7
10
|
# *** Options ***
|
|
8
11
|
#+++++++++++++++++
|
|
@@ -41,6 +44,9 @@ module Process::Naf
|
|
|
41
44
|
opt :kill_all_runners,
|
|
42
45
|
"don't wait for runners to wind down and finish running their jobs",
|
|
43
46
|
default: false
|
|
47
|
+
opt :invocation_uuid,
|
|
48
|
+
"unique identifer used for runner logs",
|
|
49
|
+
default: `uuidgen`
|
|
44
50
|
|
|
45
51
|
def initialize
|
|
46
52
|
super
|
|
@@ -56,15 +62,9 @@ module Process::Naf
|
|
|
56
62
|
end
|
|
57
63
|
|
|
58
64
|
def work
|
|
59
|
-
|
|
60
|
-
# These configuration changes will help forked processes, not the runner
|
|
61
|
-
ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
|
|
62
|
-
ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
|
|
63
|
-
ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
|
|
64
|
-
ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
|
|
65
|
-
end
|
|
65
|
+
check_gc_configurations
|
|
66
66
|
|
|
67
|
-
machine = ::Naf::Machine.find_by_server_address(@server_address)
|
|
67
|
+
@machine = ::Naf::Machine.find_by_server_address(@server_address)
|
|
68
68
|
|
|
69
69
|
unless machine.present?
|
|
70
70
|
logger.fatal "This machine is not configued correctly (ipaddress: #{@server_address})."
|
|
@@ -75,77 +75,110 @@ module Process::Naf
|
|
|
75
75
|
|
|
76
76
|
machine.lock_for_runner_use
|
|
77
77
|
begin
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if invocation.dead_at.blank?
|
|
82
|
-
begin
|
|
83
|
-
retval = Process.kill(0, invocation.pid)
|
|
84
|
-
logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
|
|
85
|
-
invocation.wind_down_at = Time.zone.now
|
|
86
|
-
invocation.save!
|
|
87
|
-
rescue Errno::ESRCH
|
|
88
|
-
logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
|
|
89
|
-
invocation.dead_at = Time.zone.now
|
|
90
|
-
invocation.save!
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
end
|
|
78
|
+
cleanup_old_processes
|
|
79
|
+
wind_down_runners
|
|
80
|
+
|
|
95
81
|
# Create a machine runner, if it doesn't exist
|
|
96
82
|
machine_runner = ::Naf::MachineRunner.
|
|
97
83
|
find_or_create_by_machine_id_and_runner_cwd(machine_id: machine.id,
|
|
98
84
|
runner_cwd: Dir.pwd)
|
|
99
|
-
|
|
100
|
-
begin
|
|
101
|
-
repository_name = (`git remote -v`).slice(/:\S+/).sub('.git','')[1..-1]
|
|
102
|
-
if repository_name.match(/fatal/)
|
|
103
|
-
repository_name = nil
|
|
104
|
-
end
|
|
105
|
-
rescue
|
|
106
|
-
repository_name = nil
|
|
107
|
-
end
|
|
108
|
-
branch_name = (`git rev-parse --abbrev-ref HEAD`).strip
|
|
109
|
-
if branch_name.match(/fatal/)
|
|
110
|
-
branch_name = nil
|
|
111
|
-
end
|
|
112
|
-
commit_information = (`git log --pretty="%H" -n 1`).strip
|
|
113
|
-
if commit_information.match(/fatal/)
|
|
114
|
-
commit_information = nil
|
|
115
|
-
end
|
|
116
|
-
deployment_tag = (`git describe --abbrev=0 --tag 2>&1`).strip
|
|
117
|
-
if deployment_tag.match(/fatal: No names found, cannot describe anything/)
|
|
118
|
-
deployment_tag = nil
|
|
119
|
-
end
|
|
120
85
|
# Create an invocation for this runner
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
commit_information: commit_information,
|
|
126
|
-
deployment_tag: deployment_tag)
|
|
86
|
+
@current_invocation = ::Naf::MachineRunnerInvocation.
|
|
87
|
+
create!({ machine_runner_id: machine_runner.id,
|
|
88
|
+
pid: Process.pid,
|
|
89
|
+
uuid: @invocation_uuid }.merge!(retrieve_invocation_information))
|
|
127
90
|
ensure
|
|
128
91
|
machine.unlock_for_runner_use
|
|
129
92
|
end
|
|
130
93
|
|
|
131
94
|
begin
|
|
132
|
-
work_machine
|
|
95
|
+
work_machine
|
|
133
96
|
ensure
|
|
134
|
-
|
|
135
|
-
|
|
97
|
+
@current_invocation.dead_at = Time.zone.now
|
|
98
|
+
@current_invocation.save!
|
|
99
|
+
cleanup_old_processes
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def check_gc_configurations
|
|
104
|
+
unless @disable_gc_modifications
|
|
105
|
+
# These configuration changes will help forked processes, not the runner
|
|
106
|
+
ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
|
|
107
|
+
ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
|
|
108
|
+
ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
|
|
109
|
+
ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def cleanup_old_processes
|
|
114
|
+
machine.machine_runners.each do |runner|
|
|
115
|
+
runner.machine_runner_invocations.recently_marked_dead(24.hours).each do |invocation|
|
|
116
|
+
terminate_old_processes(invocation)
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def wind_down_runners
|
|
122
|
+
machine.machine_runners.each do |runner|
|
|
123
|
+
runner.machine_runner_invocations.each do |invocation|
|
|
124
|
+
if invocation.dead_at.blank?
|
|
125
|
+
begin
|
|
126
|
+
retval = Process.kill(0, invocation.pid)
|
|
127
|
+
logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
|
|
128
|
+
invocation.wind_down_at = Time.zone.now
|
|
129
|
+
invocation.save!
|
|
130
|
+
rescue Errno::ESRCH
|
|
131
|
+
logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
|
|
132
|
+
invocation.dead_at = Time.zone.now
|
|
133
|
+
invocation.save!
|
|
134
|
+
terminate_old_processes(invocation)
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
def retrieve_invocation_information
|
|
142
|
+
begin
|
|
143
|
+
repository_name = (`git remote -v`).slice(/:\S+/).sub('.git','')[1..-1]
|
|
144
|
+
if repository_name.match(/fatal/)
|
|
145
|
+
repository_name = nil
|
|
146
|
+
end
|
|
147
|
+
rescue
|
|
148
|
+
repository_name = nil
|
|
149
|
+
end
|
|
150
|
+
branch_name = (`git rev-parse --abbrev-ref HEAD`).strip
|
|
151
|
+
if branch_name.match(/fatal/)
|
|
152
|
+
branch_name = nil
|
|
153
|
+
end
|
|
154
|
+
commit_information = (`git log --pretty="%H" -n 1`).strip
|
|
155
|
+
if commit_information.match(/fatal/)
|
|
156
|
+
commit_information = nil
|
|
136
157
|
end
|
|
158
|
+
deployment_tag = (`git describe --abbrev=0 --tag 2>&1`).strip
|
|
159
|
+
if deployment_tag.match(/fatal: No names found, cannot describe anything/)
|
|
160
|
+
deployment_tag = nil
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
{
|
|
164
|
+
repository_name: repository_name,
|
|
165
|
+
branch_name: branch_name,
|
|
166
|
+
commit_information: commit_information,
|
|
167
|
+
deployment_tag: deployment_tag
|
|
168
|
+
}
|
|
137
169
|
end
|
|
138
170
|
|
|
139
|
-
def work_machine
|
|
171
|
+
def work_machine
|
|
140
172
|
machine.mark_alive
|
|
141
173
|
machine.mark_up
|
|
142
174
|
|
|
143
175
|
# Make sure no processes are thought to be running on this machine
|
|
144
176
|
terminate_old_processes(machine) if @kill_all_runners
|
|
145
177
|
|
|
146
|
-
logger.info "working: #{machine}"
|
|
178
|
+
logger.info escape_html("working: #{machine}")
|
|
147
179
|
|
|
148
180
|
@children = {}
|
|
181
|
+
@threads = {}
|
|
149
182
|
|
|
150
183
|
at_exit {
|
|
151
184
|
::Af::Application.singleton.emergency_teardown
|
|
@@ -154,44 +187,92 @@ module Process::Naf
|
|
|
154
187
|
@job_fetcher = ::Logical::Naf::JobFetcher.new(machine)
|
|
155
188
|
|
|
156
189
|
while true
|
|
157
|
-
break unless work_machine_loop
|
|
190
|
+
break unless work_machine_loop
|
|
158
191
|
GC.start
|
|
159
192
|
end
|
|
160
193
|
|
|
161
194
|
logger.info "runner quitting"
|
|
162
195
|
end
|
|
163
196
|
|
|
164
|
-
def work_machine_loop
|
|
197
|
+
def work_machine_loop
|
|
165
198
|
machine.reload
|
|
166
199
|
|
|
167
200
|
# Check machine status
|
|
168
201
|
if !machine.enabled
|
|
169
|
-
logger.warn "this machine is disabled #{machine}"
|
|
202
|
+
logger.warn escape_html("this machine is disabled #{machine}")
|
|
170
203
|
return false
|
|
171
204
|
elsif machine.marked_down
|
|
172
|
-
logger.warn "this machine is marked down #{machine}"
|
|
205
|
+
logger.warn escape_html("this machine is marked down #{machine}")
|
|
173
206
|
return false
|
|
174
207
|
end
|
|
175
208
|
|
|
176
209
|
machine.mark_alive
|
|
177
210
|
|
|
211
|
+
check_log_level
|
|
212
|
+
|
|
213
|
+
@current_invocation.reload
|
|
214
|
+
if current_invocation.wind_down_at.present?
|
|
215
|
+
logger.warn "invocation asked to wind down"
|
|
216
|
+
if @children.length == 0
|
|
217
|
+
return false;
|
|
218
|
+
end
|
|
219
|
+
else
|
|
220
|
+
check_schedules
|
|
221
|
+
start_new_jobs
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
cleanup_dead_children
|
|
225
|
+
|
|
226
|
+
return true
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
def check_log_level
|
|
178
230
|
if machine.log_level != @last_machine_log_level
|
|
179
231
|
@last_machine_log_level = machine.log_level
|
|
180
232
|
unless @last_machine_log_level.blank?
|
|
181
233
|
logging_configurator.parse_and_set_logger_levels(@last_machine_log_level)
|
|
182
234
|
end
|
|
183
235
|
end
|
|
236
|
+
end
|
|
184
237
|
|
|
185
|
-
|
|
186
|
-
if
|
|
187
|
-
logger.
|
|
188
|
-
if
|
|
189
|
-
|
|
238
|
+
def check_schedules
|
|
239
|
+
if ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)
|
|
240
|
+
logger.debug "it's time to check schedules"
|
|
241
|
+
if ::Naf::ApplicationSchedule.try_lock_schedules
|
|
242
|
+
logger.debug_gross "checking schedules"
|
|
243
|
+
machine.mark_checked_schedule
|
|
244
|
+
::Naf::ApplicationSchedule.unlock_schedules
|
|
245
|
+
|
|
246
|
+
# check scheduled tasks
|
|
247
|
+
should_be_queued.each do |application_schedule|
|
|
248
|
+
logger.info escape_html("scheduled application: #{application_schedule}")
|
|
249
|
+
begin
|
|
250
|
+
naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
|
|
251
|
+
# this doesn't work very well for run_group_limits in the thousands
|
|
252
|
+
Range.new(0, application_schedule.application_run_group_limit || 1, true).each do
|
|
253
|
+
naf_boss.enqueue_application_schedule(application_schedule)
|
|
254
|
+
end
|
|
255
|
+
rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
|
|
256
|
+
logger.error escape_html("#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}")
|
|
257
|
+
logger.warn jpl
|
|
258
|
+
application_schedule.enabled = false
|
|
259
|
+
application_schedule.save!
|
|
260
|
+
logger.alarm escape_html("Application Schedule disabled due to loop: #{application_schedule}")
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# check the runner machines
|
|
265
|
+
::Naf::Machine.enabled.up.each do |runner_to_check|
|
|
266
|
+
if runner_to_check.is_stale?(@runner_stale_period.minutes)
|
|
267
|
+
logger.alarm escape_html("runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}")
|
|
268
|
+
runner_to_check.mark_machine_down(machine)
|
|
269
|
+
end
|
|
270
|
+
end
|
|
190
271
|
end
|
|
191
272
|
end
|
|
273
|
+
end
|
|
192
274
|
|
|
193
|
-
|
|
194
|
-
|
|
275
|
+
def cleanup_dead_children
|
|
195
276
|
# clean up children that have exited
|
|
196
277
|
logger.detail "cleaning up dead children: #{@children.length}"
|
|
197
278
|
|
|
@@ -204,25 +285,10 @@ module Process::Naf
|
|
|
204
285
|
pid, status = Process.waitpid2(-1)
|
|
205
286
|
end
|
|
206
287
|
rescue Timeout::Error
|
|
207
|
-
|
|
208
|
-
# XXX has not set pid or status yet and timeout fires?
|
|
209
|
-
# XXX i bet there is
|
|
210
|
-
# XXX so this code is here:
|
|
211
|
-
dead_children = []
|
|
212
|
-
@children.each do |pid, child|
|
|
213
|
-
unless is_job_process_alive?(child)
|
|
214
|
-
dead_children << child
|
|
215
|
-
end
|
|
216
|
-
end
|
|
217
|
-
|
|
218
|
-
unless dead_children.blank?
|
|
219
|
-
logger.error "#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}"
|
|
220
|
-
logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
|
|
221
|
-
end
|
|
222
|
-
|
|
288
|
+
check_dead_children_not_exited_properly
|
|
223
289
|
break
|
|
224
290
|
rescue Errno::ECHILD => e
|
|
225
|
-
logger.error "#{machine} No child when we thought we had children #{@children.inspect}"
|
|
291
|
+
logger.error escape_html("#{machine} No child when we thought we had children #{@children.inspect}")
|
|
226
292
|
logger.warn e
|
|
227
293
|
pid = @children.first.try(:first)
|
|
228
294
|
status = nil
|
|
@@ -231,27 +297,10 @@ module Process::Naf
|
|
|
231
297
|
|
|
232
298
|
if pid
|
|
233
299
|
begin
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
if child_job.present?
|
|
237
|
-
# Update job tags
|
|
238
|
-
child_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])
|
|
239
|
-
|
|
240
|
-
if status.nil? || status.exited? || status.signaled?
|
|
241
|
-
logger.info { "cleaning up dead child: #{child_job.reload}" }
|
|
242
|
-
finish_job(child_job,
|
|
243
|
-
{ exit_status: (status && status.exitstatus), termination_signal: (status && status.termsig) })
|
|
244
|
-
else
|
|
245
|
-
# this can happen if the child is sigstopped
|
|
246
|
-
logger.warn "child waited for did not exit: #{child_job}, status: #{status.inspect}"
|
|
247
|
-
end
|
|
248
|
-
else
|
|
249
|
-
# XXX ERROR no child for returned pid -- this can't happen
|
|
250
|
-
logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
|
|
251
|
-
end
|
|
300
|
+
cleanup_dead_child(pid, status)
|
|
252
301
|
rescue ActiveRecord::ActiveRecordError => are
|
|
253
|
-
logger.error "Failure during cleaning up of dead child with pid: #{pid}"
|
|
254
|
-
logger.error "#{are.message}"
|
|
302
|
+
logger.error escape_html("Failure during cleaning up of dead child with pid: #{pid}, status: #{status}")
|
|
303
|
+
logger.error escape_html("#{are.message}")
|
|
255
304
|
rescue StandardError => e
|
|
256
305
|
# XXX just incase a job control failure -- more code here
|
|
257
306
|
logger.error "some failure during child clean up"
|
|
@@ -263,12 +312,58 @@ module Process::Naf
|
|
|
263
312
|
logger.detail "sleeping in loop: #{@loop_sleep_time} seconds"
|
|
264
313
|
sleep(@loop_sleep_time)
|
|
265
314
|
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
# XXX is there a race condition where a child process exits
|
|
318
|
+
# XXX has not set pid or status yet and timeout fires?
|
|
319
|
+
# XXX i bet there is
|
|
320
|
+
# XXX so this code is here:
|
|
321
|
+
def check_dead_children_not_exited_properly
|
|
322
|
+
dead_children = []
|
|
323
|
+
@children.each do |pid, child|
|
|
324
|
+
unless is_job_process_alive?(child.reload)
|
|
325
|
+
dead_children << child
|
|
326
|
+
end
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
unless dead_children.blank?
|
|
330
|
+
logger.error escape_html("#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}")
|
|
331
|
+
logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
|
|
332
|
+
end
|
|
333
|
+
end
|
|
334
|
+
|
|
335
|
+
def cleanup_dead_child(pid, status)
|
|
336
|
+
child_job = @children.delete(pid)
|
|
337
|
+
|
|
338
|
+
if child_job.present?
|
|
339
|
+
# Update job tags
|
|
340
|
+
child_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])
|
|
341
|
+
|
|
342
|
+
if status.nil? || status.exited? || status.signaled?
|
|
343
|
+
logger.info { escape_html("cleaning up dead child: #{child_job.reload}") }
|
|
344
|
+
finish_job(child_job,
|
|
345
|
+
{ exit_status: (status && status.exitstatus), termination_signal: (status && status.termsig) })
|
|
346
|
+
|
|
347
|
+
thread = @threads.delete(pid)
|
|
348
|
+
logger.detail escape_html("cleaning up threads: #{thread.inspect}")
|
|
349
|
+
logger.detail escape_html("thread list: #{Thread.list}")
|
|
350
|
+
thread.join
|
|
351
|
+
else
|
|
352
|
+
# this can happen if the child is sigstopped
|
|
353
|
+
logger.warn escape_html("child waited for did not exit: #{child_job}, status: #{status.inspect}")
|
|
354
|
+
end
|
|
355
|
+
else
|
|
356
|
+
# XXX ERROR no child for returned pid -- this can't happen
|
|
357
|
+
logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
|
|
358
|
+
end
|
|
359
|
+
end
|
|
266
360
|
|
|
361
|
+
def start_new_jobs
|
|
267
362
|
# start new jobs
|
|
268
363
|
logger.detail "starting new jobs, num children: #{@children.length}/#{machine.thread_pool_size}"
|
|
269
|
-
# XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? &&
|
|
270
|
-
while ::Naf::RunningJob.where(:
|
|
271
|
-
memory_available_to_spawn? &&
|
|
364
|
+
# XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? && current_invocation.wind_down_at.blank?
|
|
365
|
+
while ::Naf::RunningJob.where(started_on_machine_id: machine.id).count < machine.thread_pool_size &&
|
|
366
|
+
memory_available_to_spawn? && current_invocation.wind_down_at.blank?
|
|
272
367
|
|
|
273
368
|
logger.debug_gross "fetching jobs because: children: #{@children.length} < #{machine.thread_pool_size} (poolsize)"
|
|
274
369
|
begin
|
|
@@ -279,21 +374,37 @@ module Process::Naf
|
|
|
279
374
|
break
|
|
280
375
|
end
|
|
281
376
|
|
|
282
|
-
logger.info "starting new job : #{running_job}"
|
|
377
|
+
logger.info escape_html("starting new job : #{running_job.inspect}")
|
|
378
|
+
|
|
379
|
+
# fork and run
|
|
380
|
+
pid, stdin, stdout, stderr = running_job.historical_job.spawn
|
|
381
|
+
stdin.close
|
|
283
382
|
|
|
284
|
-
|
|
383
|
+
# Reset NAF_JOB_ID
|
|
384
|
+
ENV.delete('NAF_JOB_ID')
|
|
285
385
|
if pid
|
|
286
386
|
@children[pid] = running_job
|
|
287
387
|
running_job.pid = pid
|
|
288
388
|
running_job.historical_job.pid = pid
|
|
289
389
|
running_job.historical_job.failed_to_start = false
|
|
290
|
-
running_job.historical_job.machine_runner_invocation_id =
|
|
291
|
-
logger.info "job started : #{running_job}"
|
|
390
|
+
running_job.historical_job.machine_runner_invocation_id = current_invocation.id
|
|
391
|
+
logger.info escape_html("job started : #{running_job}")
|
|
292
392
|
running_job.save!
|
|
293
393
|
running_job.historical_job.save!
|
|
394
|
+
|
|
395
|
+
# Spawn a thread to output the log of each job to files.
|
|
396
|
+
#
|
|
397
|
+
# Make sure not to execute any database calls inside this
|
|
398
|
+
# block, as it will start an ActiveRecord connection for each
|
|
399
|
+
# thread and eventually raise a ConnectionTimeoutError, causing
|
|
400
|
+
# the runner to exit.
|
|
401
|
+
thread = Thread.new do
|
|
402
|
+
log_output_until_job_finishes(running_job.id, stdout, stderr)
|
|
403
|
+
end
|
|
404
|
+
@threads[pid] = thread
|
|
294
405
|
else
|
|
295
406
|
# should never get here (well, hopefully)
|
|
296
|
-
logger.error "#{machine}: failed to execute #{running_job}"
|
|
407
|
+
logger.error escape_html("#{machine}: failed to execute #{running_job}")
|
|
297
408
|
|
|
298
409
|
finish_job(running_job, { failed_to_start: true })
|
|
299
410
|
end
|
|
@@ -301,51 +412,54 @@ module Process::Naf
|
|
|
301
412
|
raise
|
|
302
413
|
rescue StandardError => e
|
|
303
414
|
# XXX rescue for various issues
|
|
304
|
-
logger.error "#{machine}: failure during job start"
|
|
415
|
+
logger.error escape_html("#{machine}: failure during job start")
|
|
305
416
|
logger.warn e
|
|
306
417
|
end
|
|
307
418
|
end
|
|
308
419
|
logger.debug_gross "done starting jobs"
|
|
420
|
+
end
|
|
309
421
|
|
|
310
|
-
|
|
422
|
+
# Copies a job's output into its log file until both pipes reach end of file.
#
# job_id - id used to build the log path
#          "<PREFIX_PATH>/<schema_name>/jobs/<job_id>" and to label errors.
# stdout - readable IO for the job's standard output, or nil.
# stderr - readable IO for the job's standard error, or nil.
#
# Every chunk read from a pipe is split on newlines and each line is appended
# to the log file with `<<`; `log_file.write` is called after every batch
# (presumably flushing the appended lines -- confirm against LogFile).
#
# The log file is closed in an `ensure`, so it is released both on normal
# completion and when an unexpected error escapes the read loop. Pipes are
# closed as soon as they report EOF.
#
# NOTE(review): a line that straddles two 10240-byte reads is logged as two
# separate entries; buffering the trailing partial line would avoid that.
def log_output_until_job_finishes(job_id, stdout, stderr)
  log_file = ::Logical::Naf::LogFile.new("#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/jobs/#{job_id}")
  log_file.open

  begin
    # Pipes that have not reached EOF yet.
    open_pipes = [stdout, stderr].compact

    # Continue reading logs from stdout/stderr until both reach end of file
    until open_pipes.empty?
      # select returns nil on timeout, which leaves all three locals nil.
      readable, _writable, errored = Kernel.select(open_pipes, nil, open_pipes, 1)

      unless errored.nil? || errored.empty?
        logger.error escape_html("job(#{job_id}): select returned error for #{errored.inspect} (read_pipes: #{open_pipes.inspect})")
        # XXX we should probably close the errored FDs
      end

      next if readable.nil? || readable.empty?

      begin
        readable.each do |pipe|
          begin
            pipe.read_nonblock(10240).split("\n").each do |line|
              log_file << line.rstrip
            end
          rescue Errno::EAGAIN, Errno::EINTR
            # Nothing to read right now; try again on the next select.
          rescue EOFError
            # The job closed this stream: stop polling it and free the descriptor.
            open_pipes.delete(pipe)
            pipe.close unless pipe.closed?
          end
        end
      ensure
        log_file.write
      end
    end
  ensure
    log_file.close
  end
end
|
|
350
464
|
|
|
351
465
|
# XXX update_all doesn't support "from_partition" so we have this helper
|
|
@@ -364,15 +478,15 @@ module Process::Naf
|
|
|
364
478
|
end
|
|
365
479
|
|
|
366
480
|
# Marks a running job as finished and deletes its running-job record.
#
# running_job - the job record to finish. When it is nil (or false) a warning
#               is logged and nothing else happens.
# updates     - extra attributes merged into the historical job update;
#               callers in this file pass exit_status/termination_signal or
#               failed_to_start. `finished_at` is always added.
#
# Steps, in order: all tags are removed from the job and the :cleanup system
# tag is added; then, inside one HistoricalJob transaction, the historical job
# is updated via update_historical_job and the running job is deleted.
def finish_job(running_job, updates = {})
  # Every step below dereferences the job, so guard the whole method instead
  # of only the tag bookkeeping.
  unless running_job
    logger.warn "finish_job called without a running job, updates: #{updates.inspect}"
    return
  end

  running_job.remove_all_tags
  running_job.add_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]])

  ::Naf::HistoricalJob.transaction do
    update_historical_job(updates.merge(finished_at: Time.zone.now), running_job.id)
    running_job.delete
  end
end
|
|
377
491
|
|
|
378
492
|
# kill(0, pid) seems to fail during at_exit block
|
|
@@ -392,17 +506,17 @@ module Process::Naf
|
|
|
392
506
|
end
|
|
393
507
|
end
|
|
394
508
|
|
|
395
|
-
def terminate_old_processes(
|
|
509
|
+
def terminate_old_processes(record)
|
|
396
510
|
# check if any processes are hanging around and ask them
|
|
397
511
|
# politely if they will please terminate
|
|
398
|
-
jobs = assigned_jobs(
|
|
512
|
+
jobs = assigned_jobs(record)
|
|
399
513
|
if jobs.length == 0
|
|
400
514
|
logger.detail "no jobs to remove"
|
|
401
515
|
return
|
|
402
516
|
end
|
|
403
517
|
logger.info "number of old jobs to sift through: #{jobs.length}"
|
|
404
518
|
jobs.each do |job|
|
|
405
|
-
logger.detail "job still around: #{job}"
|
|
519
|
+
logger.detail escape_html("job still around: #{job}")
|
|
406
520
|
if job.request_to_terminate == false
|
|
407
521
|
logger.warn "politely asking process: #{job.pid} to terminate itself"
|
|
408
522
|
job.request_to_terminate = true
|
|
@@ -412,7 +526,7 @@ module Process::Naf
|
|
|
412
526
|
|
|
413
527
|
# wait
|
|
414
528
|
(1..@wait_time_for_processes_to_terminate).each do |i|
|
|
415
|
-
num_assigned_jobs = assigned_jobs(
|
|
529
|
+
num_assigned_jobs = assigned_jobs(record).length
|
|
416
530
|
return if num_assigned_jobs == 0
|
|
417
531
|
logger.debug_medium "#{i}/#{@wait_time_for_processes_to_terminate}: sleeping 1 second while we wait for " +
|
|
418
532
|
"#{num_assigned_jobs} assigned job(s) to terminate as requested"
|
|
@@ -420,27 +534,27 @@ module Process::Naf
|
|
|
420
534
|
end
|
|
421
535
|
|
|
422
536
|
# nudge them to terminate
|
|
423
|
-
jobs = assigned_jobs(
|
|
537
|
+
jobs = assigned_jobs(record)
|
|
424
538
|
if jobs.length == 0
|
|
425
539
|
logger.debug_gross "assigned jobs have exited after asking to terminate nicely"
|
|
426
540
|
return
|
|
427
541
|
end
|
|
428
542
|
jobs.each do |job|
|
|
429
|
-
logger.warn "sending SIG_TERM to process: #{job}"
|
|
543
|
+
logger.warn escape_html("sending SIG_TERM to process: #{job}")
|
|
430
544
|
send_signal_and_maybe_clean_up(job, "TERM")
|
|
431
545
|
end
|
|
432
546
|
|
|
433
547
|
# wait
|
|
434
548
|
(1..5).each do |i|
|
|
435
|
-
num_assigned_jobs = assigned_jobs(
|
|
549
|
+
num_assigned_jobs = assigned_jobs(record).length
|
|
436
550
|
return if num_assigned_jobs == 0
|
|
437
551
|
logger.debug_medium "#{i}/5: sleeping 1 second while we wait for #{num_assigned_jobs} assigned job(s) to terminate from SIG_TERM"
|
|
438
552
|
sleep(1)
|
|
439
553
|
end
|
|
440
554
|
|
|
441
555
|
# kill with fire
|
|
442
|
-
assigned_jobs(
|
|
443
|
-
logger.alarm "sending SIG_KILL to process: #{job}"
|
|
556
|
+
assigned_jobs(record).each do |job|
|
|
557
|
+
logger.alarm escape_html("sending SIG_KILL to process: #{job}")
|
|
444
558
|
send_signal_and_maybe_clean_up(job, "KILL")
|
|
445
559
|
|
|
446
560
|
# job force job down
|
|
@@ -473,13 +587,19 @@ module Process::Naf
|
|
|
473
587
|
return send_signal_and_maybe_clean_up(job, 0)
|
|
474
588
|
end
|
|
475
589
|
|
|
476
|
-
def assigned_jobs(
|
|
477
|
-
|
|
478
|
-
|
|
590
|
+
# Returns the running jobs tied to +record+ whose process is still alive.
#
# record - a ::Naf::MachineRunnerInvocation, in which case jobs are looked up
#          with RunningJob.started_on_invocation(record.id); any other object
#          is handed to RunningJob.assigned_jobs as-is.
#
# The candidates are filtered with is_job_process_alive?.
def assigned_jobs(record)
  candidates =
    if record.kind_of? ::Naf::MachineRunnerInvocation
      ::Naf::RunningJob.started_on_invocation(record.id)
    else
      ::Naf::RunningJob.assigned_jobs(record)
    end

  candidates.select { |running_job| is_job_process_alive?(running_job) }
end
|
|
481
601
|
|
|
482
|
-
def should_be_queued
|
|
602
|
+
def should_be_queued
|
|
483
603
|
not_finished_applications = ::Naf::HistoricalJob.
|
|
484
604
|
queued_between(Time.zone.now - Naf::HistoricalJob::JOB_STALE_TIME, Time.zone.now).
|
|
485
605
|
where("finished_at IS NULL AND request_to_terminate = false").
|
|
@@ -536,5 +656,9 @@ module Process::Naf
|
|
|
536
656
|
return false
|
|
537
657
|
end
|
|
538
658
|
|
|
659
|
+
# HTML-escapes a log message with CGI.escapeHTML.
#
# str - the text to escape. Non-String values (nil, symbols, numbers,
#       exceptions, records) are converted with to_s first, so the helper
#       never raises a TypeError in the middle of a logging call.
#
# Returns the escaped String.
def escape_html(str)
  CGI.escapeHTML(str.to_s)
end
|
|
662
|
+
|
|
539
663
|
end
|
|
540
664
|
end
|