naf 1.1.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. data/Gemfile +4 -2
  2. data/app/assets/images/{papertrail_job.png → job.png} +0 -0
  3. data/app/assets/images/{papertrail_machine.png → machine.png} +0 -0
  4. data/app/assets/images/{papertrail_machine_runner.png → machine_runner.png} +0 -0
  5. data/app/assets/javascripts/col_reorder_with_resize.js +1228 -0
  6. data/app/assets/javascripts/dataTablesTemplates/applications.js +2 -1
  7. data/app/assets/javascripts/dataTablesTemplates/jobs.js +2 -1
  8. data/app/assets/javascripts/dataTablesTemplates/machine_runner_invocations.js +2 -1
  9. data/app/assets/javascripts/dataTablesTemplates/machine_runners.js +2 -1
  10. data/app/assets/javascripts/dataTablesTemplates/machines.js +2 -1
  11. data/app/assets/javascripts/jquery.dataTables.js +10339 -5103
  12. data/app/assets/javascripts/naf.js +1 -0
  13. data/app/assets/stylesheets/jquery_ui/jquery-ui-1.8.5.custom.css.erb +6 -6
  14. data/app/assets/stylesheets/min_naf/layout.css.scss +94 -43
  15. data/app/assets/stylesheets/naf/layout.css.scss +94 -43
  16. data/app/controllers/naf/affinities_controller.rb +1 -1
  17. data/app/controllers/naf/applications_controller.rb +3 -0
  18. data/app/controllers/naf/historical_job_affinity_tabs_controller.rb +1 -1
  19. data/app/controllers/naf/historical_jobs_controller.rb +2 -5
  20. data/app/controllers/naf/log_parsers_controller.rb +16 -0
  21. data/app/controllers/naf/log_viewer_controller.rb +19 -0
  22. data/app/controllers/naf/machine_affinity_slots_controller.rb +1 -1
  23. data/app/controllers/naf/machine_runners_controller.rb +12 -0
  24. data/app/controllers/naf/machines_controller.rb +8 -10
  25. data/app/controllers/naf/status_controller.rb +12 -0
  26. data/app/helpers/naf/application_helper.rb +19 -38
  27. data/app/helpers/naf/time_helper.rb +37 -0
  28. data/app/models/logical/naf/application.rb +13 -19
  29. data/app/models/logical/naf/construction_zone/boss.rb +1 -1
  30. data/app/models/logical/naf/construction_zone/foreman.rb +1 -1
  31. data/app/models/logical/naf/job.rb +39 -34
  32. data/app/models/logical/naf/job_creator.rb +19 -23
  33. data/app/models/logical/naf/job_fetcher.rb +36 -6
  34. data/app/models/logical/naf/log_file.rb +70 -0
  35. data/app/models/logical/naf/log_parser/base.rb +272 -0
  36. data/app/models/logical/naf/log_parser/job.rb +65 -0
  37. data/app/models/logical/naf/log_parser/machine.rb +64 -0
  38. data/app/models/logical/naf/log_parser/runner.rb +72 -0
  39. data/app/models/logical/naf/log_reader.rb +85 -0
  40. data/app/models/logical/naf/machine.rb +39 -1
  41. data/app/models/naf/affinity.rb +18 -0
  42. data/app/models/naf/application_schedule_affinity_tab.rb +1 -0
  43. data/app/models/naf/application_type.rb +2 -1
  44. data/app/models/naf/historical_job.rb +9 -29
  45. data/app/models/naf/machine.rb +8 -0
  46. data/app/models/naf/machine_runner.rb +11 -2
  47. data/app/models/naf/machine_runner_invocation.rb +9 -1
  48. data/app/models/naf/running_job.rb +40 -1
  49. data/app/models/process/naf/application.rb +3 -3
  50. data/app/models/process/naf/log_archiver.rb +78 -0
  51. data/app/models/process/naf/machine_manager.rb +3 -1
  52. data/app/models/process/naf/runner.rb +286 -162
  53. data/app/models/process/naf/runner_log.rb +26 -0
  54. data/app/views/naf/application_schedule_affinity_tabs/_form.html.erb +1 -5
  55. data/app/views/naf/applications/show.html.erb +1 -1
  56. data/app/views/naf/historical_job_affinity_tabs/_form.html.erb +1 -5
  57. data/app/views/naf/historical_jobs/_form.html.erb +1 -1
  58. data/app/views/naf/historical_jobs/_runners.html.erb +21 -12
  59. data/app/views/naf/historical_jobs/_search_container.html.erb +1 -2
  60. data/app/views/naf/historical_jobs/index.html.erb +0 -1
  61. data/app/views/naf/historical_jobs/index.json.erb +4 -4
  62. data/app/views/naf/historical_jobs/show.html.erb +57 -51
  63. data/app/views/naf/log_viewer/_job_logs.html.erb +65 -0
  64. data/app/views/naf/log_viewer/_log_display.html.erb +259 -0
  65. data/app/views/naf/log_viewer/_log_layout.html.erb +59 -0
  66. data/app/views/naf/log_viewer/_machine_logs.html.erb +62 -0
  67. data/app/views/naf/log_viewer/_runner_logs.html.erb +62 -0
  68. data/app/views/naf/log_viewer/_search_options.html.erb +36 -0
  69. data/app/views/naf/log_viewer/_update_page_title.html.erb +9 -0
  70. data/app/views/naf/log_viewer/index.html.erb +1 -0
  71. data/app/views/naf/logger_names/_form.html.erb +1 -2
  72. data/app/views/naf/machine_affinity_slots/_form.html.erb +1 -5
  73. data/app/views/naf/machine_runner_invocations/show.html.erb +4 -0
  74. data/app/views/naf/machine_runners/show.html.erb +44 -34
  75. data/app/views/naf/machines/index.json.erb +14 -6
  76. data/app/views/naf/machines/show.html.erb +44 -40
  77. data/app/views/naf/shared/_auto_resize_width.html.erb +7 -0
  78. data/app/views/naf/shared/_date_select.html.erb +65 -0
  79. data/app/views/naf/shared/_select_per_page.html.erb +48 -13
  80. data/app/views/naf/status/index.html.erb +27 -0
  81. data/bin/naf +26 -0
  82. data/config/initializers/naf.rb +13 -1
  83. data/config/routes.rb +16 -2
  84. data/db/migrate/20131106162436_add_uuid_column_to_machine_runner_invocations.rb +15 -0
  85. data/db/migrate/20131121185222_move_tabs_column_from_historical_jobs_to_running_jobs.rb +15 -0
  86. data/lib/generators/templates/config/logging/naf.yml +0 -8
  87. data/lib/generators/templates/config/logging/nafjob.yml +0 -8
  88. data/lib/generators/templates/config/logging/nafrunner.yml +0 -8
  89. data/lib/generators/templates/naf.rb +0 -8
  90. data/lib/naf.rb +0 -8
  91. data/lib/naf/configuration.rb +0 -4
  92. data/lib/naf/version.rb +1 -1
  93. data/lib/tasks/naf_tasks.rake +18 -0
  94. data/naf.gemspec +3 -1
  95. data/spec/controllers/naf/affinities_controller_spec.rb +0 -1
  96. data/spec/controllers/naf/applications_controller_spec.rb +3 -2
  97. data/spec/controllers/naf/machine_affinity_slots_controller_spec.rb +0 -1
  98. data/spec/controllers/naf/machines_controller_spec.rb +1 -1
  99. data/spec/dummy/config/logging/naf.yml +0 -8
  100. data/spec/dummy/config/logging/nafjob.yml +0 -9
  101. data/spec/dummy/config/logging/nafrunner.yml +0 -10
  102. data/spec/factories/naf.rb +4 -0
  103. data/spec/models/logical/naf/application_spec.rb +3 -4
  104. data/spec/models/logical/naf/job_creator_spec.rb +91 -21
  105. data/spec/models/logical/naf/job_spec.rb +19 -6
  106. data/spec/models/logical/naf/log_file_spec.rb +105 -0
  107. data/spec/models/logical/naf/machine_runner_invocation_spec.rb +41 -0
  108. data/spec/models/logical/naf/machine_runner_spec.rb +42 -0
  109. data/spec/models/logical/naf/machine_spec.rb +98 -28
  110. data/spec/models/naf/affinity_classification_spec.rb +20 -0
  111. data/spec/models/naf/affinity_spec.rb +21 -0
  112. data/spec/models/naf/historical_job_spec.rb +2 -44
  113. data/spec/models/naf/machine_runner_invocation_spec.rb +17 -1
  114. data/spec/models/naf/running_job_spec.rb +64 -1
  115. metadata +40 -9
  116. data/app/models/log4r/papertrail_outputter.rb +0 -19
  117. data/app/views/naf/historical_jobs/edit.html.erb +0 -11
  118. data/app/views/naf/machines/_show.html.erb +0 -169
@@ -244,6 +244,14 @@ module Naf
244
244
  ])
245
245
  end
246
246
 
247
+ def hostname
248
+ if server_name.present?
249
+ server_name
250
+ else
251
+ server_address
252
+ end
253
+ end
254
+
247
255
  private
248
256
 
249
257
  def check_blank_values
@@ -42,9 +42,18 @@ module Naf
42
42
  #{::Naf.schema_name}.machine_runner_invocations.wind_down_at IS NOT NULL")
43
43
  end
44
44
 
45
- def self.dead
45
+ def self.dead_count
46
46
  (::Naf::MachineRunner.joins(:machine).where("#{::Naf.schema_name}.machines.enabled IS TRUE").pluck(:machine_id) -
47
- ::Naf::MachineRunner.running.pluck(:machine_id)).uniq
47
+ ::Naf::MachineRunner.running.pluck(:machine_id) -
48
+ ::Naf::MachineRunner.winding_down.pluck(:machine_id)).uniq.count
49
+ end
50
+
51
+ #-------------------------
52
+ # *** Instance Methods ***
53
+ #+++++++++++++++++++++++++
54
+
55
+ def current_invocation
56
+ machine_runner_invocations.where(dead_at: nil, wind_down_at: nil).order(:id).last
48
57
  end
49
58
 
50
59
  end
@@ -8,7 +8,8 @@ module Naf
8
8
  :commit_information,
9
9
  :branch_name,
10
10
  :repository_name,
11
- :deployment_tag
11
+ :deployment_tag,
12
+ :uuid
12
13
 
13
14
  #---------------------
14
15
  # *** Associations ***
@@ -49,6 +50,13 @@ module Naf
49
50
  end
50
51
  end
51
52
 
53
+ def self.recently_marked_dead(time)
54
+ where("
55
+ #{::Naf.schema_name}.machine_runner_invocations.dead_at IS NOT NULL AND
56
+ #{::Naf.schema_name}.machine_runner_invocations.dead_at > ?", Time.zone.now - time
57
+ )
58
+ end
59
+
52
60
  #-------------------------
53
61
  # *** Instance Methods ***
54
62
  #+++++++++++++++++++++++++
@@ -12,7 +12,8 @@ module Naf
12
12
  :request_to_terminate,
13
13
  :marked_dead_by_machine_id,
14
14
  :log_level,
15
- :started_at
15
+ :started_at,
16
+ :tags
16
17
 
17
18
  #---------------------
18
19
  # *** Associations ***
@@ -48,6 +49,11 @@ module Naf
48
49
  where(started_on_machine_id: machine.id)
49
50
  end
50
51
 
52
+ def self.started_on_invocation(invocation_id)
53
+ joins(:historical_job).
54
+ where("#{::Naf.schema_name}.historical_jobs.machine_runner_invocation_id = #{invocation_id}")
55
+ end
56
+
51
57
  def self.in_run_group(run_group_name)
52
58
  where(application_run_group_name: run_group_name)
53
59
  end
@@ -76,5 +82,38 @@ module Naf
76
82
  job_weights
77
83
  end
78
84
 
85
+ #-------------------------
86
+ # *** Instance Methods ***
87
+ #+++++++++++++++++++++++++
88
+
89
+ def add_tags(tags_to_add)
90
+ tags_array = nil
91
+ if self.tags.present?
92
+ tags_array = self.tags.gsub(/[{}]/,'').split(',')
93
+ new_tags = '{' + (tags_array | tags_to_add).join(',') + '}'
94
+ else
95
+ new_tags = '{' + tags_to_add.join(',') + '}'
96
+ end
97
+
98
+ self.tags = new_tags
99
+ self.save!
100
+ end
101
+
102
+ def remove_tags(tags_to_remove)
103
+ if self.tags.present?
104
+ tags_array = self.tags.gsub(/[{}]/,'').split(',')
105
+ new_tags = '{' + (tags_array - tags_to_remove).join(',') + '}'
106
+
107
+ self.tags = new_tags
108
+ self.save!
109
+ end
110
+ end
111
+
112
+ def remove_all_tags
113
+ self.tags = '{}'
114
+ self.save!
115
+ end
116
+
117
+
79
118
  end
80
119
  end
@@ -7,7 +7,7 @@ module Process::Naf
7
7
  def initialize(job, reason)
8
8
  @job = job
9
9
  @reason = reason
10
- super("Requested to terminate: #{reason}")
10
+ super("Requested to terminate by Naf: #{reason}")
11
11
  end
12
12
  end
13
13
 
@@ -93,7 +93,7 @@ module Process::Naf
93
93
  end
94
94
 
95
95
  def job_tag_block(*tags, &block)
96
- job = fetch_naf_job
96
+ job = fetch_naf_job.try(:running_job)
97
97
  begin
98
98
  if job
99
99
  add_job_tags(*tags)
@@ -107,7 +107,7 @@ module Process::Naf
107
107
  end
108
108
 
109
109
  def update_job_tags(old_tags, new_tags)
110
- job = fetch_naf_job
110
+ job = fetch_naf_job.try(:running_job)
111
111
  if job
112
112
  job.remove_tags(old_tags.map(&:to_s))
113
113
  job.add_tags(new_tags.map(&:to_s))
@@ -0,0 +1,78 @@
1
+ require 'aws'
2
+
3
+ module Process::Naf
4
+ class LogArchiver < ::Process::Naf::Application
5
+
6
+ NAF_JOBS_LOG_PATH = "#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/jobs/"
7
+ NAF_RUNNERS_LOG_PATH = "#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/runners/*/*"
8
+ DATE_REGEX = /\d{8}_\d{6}/
9
+ LOG_RETENTION = 1
10
+
11
+ def work
12
+ # Use AWS credentials to access S3
13
+ s3 = AWS::S3.new(access_key_id: AWS_ID,
14
+ secret_access_key: AWS_KEY,
15
+ ssl_verify_peer: false)
16
+
17
+ # Each project will have a specific bucket
18
+ bucket = s3.buckets[NAF_BUCKET]
19
+ files = log_files
20
+
21
+ logger.info 'Starting to save files to s3...'
22
+ files.each do |file|
23
+ # Upload the file only if it does not already exist in the bucket
24
+ object = bucket.objects["naf/#{project_name}/#{Rails.env}/#{creation_time}" + file[12..-1]]
25
+ if !object.exists?
26
+ # Write file to S3
27
+ result = object.write(File.open(file).read)
28
+ logger.info "File #{file} saved to S3"
29
+ end
30
+ end
31
+
32
+ logger.info 'Starting to archive files...'
33
+ archive_old_files(files)
34
+ end
35
+
36
+ private
37
+
38
+ def project_name
39
+ (`git remote -v`).slice(/\/\S+/).sub('.git','')[1..-1]
40
+ end
41
+
42
+ def log_files
43
+ files = Dir[NAF_JOBS_LOG_PATH + "*/*"]
44
+ files += Dir[NAF_RUNNERS_LOG_PATH + "*/*"]
45
+ # Sort log files based on time
46
+ files = files.sort { |x, y| Time.parse(y.scan(DATE_REGEX).first) <=> Time.parse(x.scan(DATE_REGEX).first) }
47
+
48
+ return files
49
+ end
50
+
51
+ def creation_time
52
+ ::Naf::ApplicationType.first.created_at.strftime("%Y%m%d_%H%M%S")
53
+ end
54
+
55
+ def archive_old_files(files)
56
+ copy_files
57
+ today = Time.zone.now.to_date
58
+ files.each do |file|
59
+ if (today - Time.parse(file.scan(DATE_REGEX).first).to_date).to_i > LOG_RETENTION
60
+ logger.info "Archived file: #{file}"
61
+ `rm #{file}`
62
+ end
63
+ end
64
+ end
65
+
66
+ def copy_files
67
+ if File.directory?(Naf::LOGGING_ROOT_DIRECTORY + "/naf")
68
+ # Each archive will have a unique path based on the time archived
69
+ time = Time.zone.now.to_s
70
+ FileUtils.mkdir_p(Naf::LOGGING_ROOT_DIRECTORY + Naf::LOGGING_ARCHIVE_DIRECTORY + "/#{time}")
71
+
72
+ # Copy the naf logs into the archive directory
73
+ `cp -r #{Naf::LOGGING_ROOT_DIRECTORY}/naf #{Naf::LOGGING_ROOT_DIRECTORY + Naf::LOGGING_ARCHIVE_DIRECTORY}/#{time.gsub(' ', '\ ')}`
74
+ end
75
+ end
76
+
77
+ end
78
+ end
@@ -27,7 +27,9 @@ module Process::Naf
27
27
  if @update_machine
28
28
  machine = ::Naf::Machine.find_by_server_address(@server_address)
29
29
  if machine.blank?
30
- machine = ::Naf::Machine.create(server_address: @server_address)
30
+ server_name = (`hostname`).strip
31
+ machine = ::Naf::Machine.create(server_address: @server_address,
32
+ server_name: server_name)
31
33
  add_default_affinities(machine)
32
34
  end
33
35
 
@@ -3,6 +3,9 @@ require 'timeout'
3
3
  module Process::Naf
4
4
  class Runner < ::Af::Application
5
5
 
6
+ attr_accessor :machine,
7
+ :current_invocation
8
+
6
9
  #----------------
7
10
  # *** Options ***
8
11
  #+++++++++++++++++
@@ -41,6 +44,9 @@ module Process::Naf
41
44
  opt :kill_all_runners,
42
45
  "don't wait for runners to wind down and finish running their jobs",
43
46
  default: false
47
+ opt :invocation_uuid,
48
+ "unique identifer used for runner logs",
49
+ default: `uuidgen`
44
50
 
45
51
  def initialize
46
52
  super
@@ -56,15 +62,9 @@ module Process::Naf
56
62
  end
57
63
 
58
64
  def work
59
- unless @disable_gc_modifications
60
- # These configuration changes will help forked processes, not the runner
61
- ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
62
- ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
63
- ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
64
- ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
65
- end
65
+ check_gc_configurations
66
66
 
67
- machine = ::Naf::Machine.find_by_server_address(@server_address)
67
+ @machine = ::Naf::Machine.find_by_server_address(@server_address)
68
68
 
69
69
  unless machine.present?
70
70
  logger.fatal "This machine is not configued correctly (ipaddress: #{@server_address})."
@@ -75,77 +75,110 @@ module Process::Naf
75
75
 
76
76
  machine.lock_for_runner_use
77
77
  begin
78
- # Wind down other runners
79
- machine.machine_runners.each do |machine_runner|
80
- machine_runner.machine_runner_invocations.each do |invocation|
81
- if invocation.dead_at.blank?
82
- begin
83
- retval = Process.kill(0, invocation.pid)
84
- logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
85
- invocation.wind_down_at = Time.zone.now
86
- invocation.save!
87
- rescue Errno::ESRCH
88
- logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
89
- invocation.dead_at = Time.zone.now
90
- invocation.save!
91
- end
92
- end
93
- end
94
- end
78
+ cleanup_old_processes
79
+ wind_down_runners
80
+
95
81
  # Create a machine runner, if it doesn't exist
96
82
  machine_runner = ::Naf::MachineRunner.
97
83
  find_or_create_by_machine_id_and_runner_cwd(machine_id: machine.id,
98
84
  runner_cwd: Dir.pwd)
99
-
100
- begin
101
- repository_name = (`git remote -v`).slice(/:\S+/).sub('.git','')[1..-1]
102
- if repository_name.match(/fatal/)
103
- repository_name = nil
104
- end
105
- rescue
106
- repository_name = nil
107
- end
108
- branch_name = (`git rev-parse --abbrev-ref HEAD`).strip
109
- if branch_name.match(/fatal/)
110
- branch_name = nil
111
- end
112
- commit_information = (`git log --pretty="%H" -n 1`).strip
113
- if commit_information.match(/fatal/)
114
- commit_information = nil
115
- end
116
- deployment_tag = (`git describe --abbrev=0 --tag 2>&1`).strip
117
- if deployment_tag.match(/fatal: No names found, cannot describe anything/)
118
- deployment_tag = nil
119
- end
120
85
  # Create an invocation for this runner
121
- invocation = ::Naf::MachineRunnerInvocation.create!(machine_runner_id: machine_runner.id,
122
- pid: Process.pid,
123
- repository_name: repository_name,
124
- branch_name: branch_name,
125
- commit_information: commit_information,
126
- deployment_tag: deployment_tag)
86
+ @current_invocation = ::Naf::MachineRunnerInvocation.
87
+ create!({ machine_runner_id: machine_runner.id,
88
+ pid: Process.pid,
89
+ uuid: @invocation_uuid }.merge!(retrieve_invocation_information))
127
90
  ensure
128
91
  machine.unlock_for_runner_use
129
92
  end
130
93
 
131
94
  begin
132
- work_machine(machine, invocation)
95
+ work_machine
133
96
  ensure
134
- invocation.dead_at = Time.zone.now
135
- invocation.save!
97
+ @current_invocation.dead_at = Time.zone.now
98
+ @current_invocation.save!
99
+ cleanup_old_processes
100
+ end
101
+ end
102
+
103
+ def check_gc_configurations
104
+ unless @disable_gc_modifications
105
+ # These configuration changes will help forked processes, not the runner
106
+ ENV['RUBY_HEAP_MIN_SLOTS'] = '500000'
107
+ ENV['RUBY_HEAP_SLOTS_INCREMENT'] = '250000'
108
+ ENV['RUBY_HEAP_SLOTS_GROWTH_FACTOR'] = '1'
109
+ ENV['RUBY_GC_MALLOC_LIMIT'] = '50000000'
110
+ end
111
+ end
112
+
113
+ def cleanup_old_processes
114
+ machine.machine_runners.each do |runner|
115
+ runner.machine_runner_invocations.recently_marked_dead(24.hours).each do |invocation|
116
+ terminate_old_processes(invocation)
117
+ end
118
+ end
119
+ end
120
+
121
+ def wind_down_runners
122
+ machine.machine_runners.each do |runner|
123
+ runner.machine_runner_invocations.each do |invocation|
124
+ if invocation.dead_at.blank?
125
+ begin
126
+ retval = Process.kill(0, invocation.pid)
127
+ logger.detail "#{retval} = kill(0, #{invocation.pid}) -- process alive, marking runner invocation as winding down"
128
+ invocation.wind_down_at = Time.zone.now
129
+ invocation.save!
130
+ rescue Errno::ESRCH
131
+ logger.detail "ESRCH = kill(0, #{invocation.pid}) -- marking runner invocation as not running"
132
+ invocation.dead_at = Time.zone.now
133
+ invocation.save!
134
+ terminate_old_processes(invocation)
135
+ end
136
+ end
137
+ end
138
+ end
139
+ end
140
+
141
+ def retrieve_invocation_information
142
+ begin
143
+ repository_name = (`git remote -v`).slice(/:\S+/).sub('.git','')[1..-1]
144
+ if repository_name.match(/fatal/)
145
+ repository_name = nil
146
+ end
147
+ rescue
148
+ repository_name = nil
149
+ end
150
+ branch_name = (`git rev-parse --abbrev-ref HEAD`).strip
151
+ if branch_name.match(/fatal/)
152
+ branch_name = nil
153
+ end
154
+ commit_information = (`git log --pretty="%H" -n 1`).strip
155
+ if commit_information.match(/fatal/)
156
+ commit_information = nil
136
157
  end
158
+ deployment_tag = (`git describe --abbrev=0 --tag 2>&1`).strip
159
+ if deployment_tag.match(/fatal: No names found, cannot describe anything/)
160
+ deployment_tag = nil
161
+ end
162
+
163
+ {
164
+ repository_name: repository_name,
165
+ branch_name: branch_name,
166
+ commit_information: commit_information,
167
+ deployment_tag: deployment_tag
168
+ }
137
169
  end
138
170
 
139
- def work_machine(machine, invocation)
171
+ def work_machine
140
172
  machine.mark_alive
141
173
  machine.mark_up
142
174
 
143
175
  # Make sure no processes are thought to be running on this machine
144
176
  terminate_old_processes(machine) if @kill_all_runners
145
177
 
146
- logger.info "working: #{machine}"
178
+ logger.info escape_html("working: #{machine}")
147
179
 
148
180
  @children = {}
181
+ @threads = {}
149
182
 
150
183
  at_exit {
151
184
  ::Af::Application.singleton.emergency_teardown
@@ -154,44 +187,92 @@ module Process::Naf
154
187
  @job_fetcher = ::Logical::Naf::JobFetcher.new(machine)
155
188
 
156
189
  while true
157
- break unless work_machine_loop(machine, invocation)
190
+ break unless work_machine_loop
158
191
  GC.start
159
192
  end
160
193
 
161
194
  logger.info "runner quitting"
162
195
  end
163
196
 
164
- def work_machine_loop(machine, invocation)
197
+ def work_machine_loop
165
198
  machine.reload
166
199
 
167
200
  # Check machine status
168
201
  if !machine.enabled
169
- logger.warn "this machine is disabled #{machine}"
202
+ logger.warn escape_html("this machine is disabled #{machine}")
170
203
  return false
171
204
  elsif machine.marked_down
172
- logger.warn "this machine is marked down #{machine}"
205
+ logger.warn escape_html("this machine is marked down #{machine}")
173
206
  return false
174
207
  end
175
208
 
176
209
  machine.mark_alive
177
210
 
211
+ check_log_level
212
+
213
+ @current_invocation.reload
214
+ if current_invocation.wind_down_at.present?
215
+ logger.warn "invocation asked to wind down"
216
+ if @children.length == 0
217
+ return false;
218
+ end
219
+ else
220
+ check_schedules
221
+ start_new_jobs
222
+ end
223
+
224
+ cleanup_dead_children
225
+
226
+ return true
227
+ end
228
+
229
+ def check_log_level
178
230
  if machine.log_level != @last_machine_log_level
179
231
  @last_machine_log_level = machine.log_level
180
232
  unless @last_machine_log_level.blank?
181
233
  logging_configurator.parse_and_set_logger_levels(@last_machine_log_level)
182
234
  end
183
235
  end
236
+ end
184
237
 
185
- invocation.reload
186
- if invocation.wind_down_at.present?
187
- logger.warn "invocation asked to wind down"
188
- if @children.length == 0
189
- return false;
238
+ def check_schedules
239
+ if ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)
240
+ logger.debug "it's time to check schedules"
241
+ if ::Naf::ApplicationSchedule.try_lock_schedules
242
+ logger.debug_gross "checking schedules"
243
+ machine.mark_checked_schedule
244
+ ::Naf::ApplicationSchedule.unlock_schedules
245
+
246
+ # check scheduled tasks
247
+ should_be_queued.each do |application_schedule|
248
+ logger.info escape_html("scheduled application: #{application_schedule}")
249
+ begin
250
+ naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
251
+ # this doesn't work very well for run_group_limits in the thousands
252
+ Range.new(0, application_schedule.application_run_group_limit || 1, true).each do
253
+ naf_boss.enqueue_application_schedule(application_schedule)
254
+ end
255
+ rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
256
+ logger.error escape_html("#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}")
257
+ logger.warn jpl
258
+ application_schedule.enabled = false
259
+ application_schedule.save!
260
+ logger.alarm escape_html("Application Schedule disabled due to loop: #{application_schedule}")
261
+ end
262
+ end
263
+
264
+ # check the runner machines
265
+ ::Naf::Machine.enabled.up.each do |runner_to_check|
266
+ if runner_to_check.is_stale?(@runner_stale_period.minutes)
267
+ logger.alarm escape_html("runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}")
268
+ runner_to_check.mark_machine_down(machine)
269
+ end
270
+ end
190
271
  end
191
272
  end
273
+ end
192
274
 
193
- check_schedules(machine) if invocation.wind_down_at.blank?
194
-
275
+ def cleanup_dead_children
195
276
  # clean up children that have exited
196
277
  logger.detail "cleaning up dead children: #{@children.length}"
197
278
 
@@ -204,25 +285,10 @@ module Process::Naf
204
285
  pid, status = Process.waitpid2(-1)
205
286
  end
206
287
  rescue Timeout::Error
207
- # XXX is there a race condition where a child process exits
208
- # XXX has not set pid or status yet and timeout fires?
209
- # XXX i bet there is
210
- # XXX so this code is here:
211
- dead_children = []
212
- @children.each do |pid, child|
213
- unless is_job_process_alive?(child)
214
- dead_children << child
215
- end
216
- end
217
-
218
- unless dead_children.blank?
219
- logger.error "#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}"
220
- logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
221
- end
222
-
288
+ check_dead_children_not_exited_properly
223
289
  break
224
290
  rescue Errno::ECHILD => e
225
- logger.error "#{machine} No child when we thought we had children #{@children.inspect}"
291
+ logger.error escape_html("#{machine} No child when we thought we had children #{@children.inspect}")
226
292
  logger.warn e
227
293
  pid = @children.first.try(:first)
228
294
  status = nil
@@ -231,27 +297,10 @@ module Process::Naf
231
297
 
232
298
  if pid
233
299
  begin
234
- child_job = @children.delete(pid)
235
-
236
- if child_job.present?
237
- # Update job tags
238
- child_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])
239
-
240
- if status.nil? || status.exited? || status.signaled?
241
- logger.info { "cleaning up dead child: #{child_job.reload}" }
242
- finish_job(child_job,
243
- { exit_status: (status && status.exitstatus), termination_signal: (status && status.termsig) })
244
- else
245
- # this can happen if the child is sigstopped
246
- logger.warn "child waited for did not exit: #{child_job}, status: #{status.inspect}"
247
- end
248
- else
249
- # XXX ERROR no child for returned pid -- this can't happen
250
- logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
251
- end
300
+ cleanup_dead_child(pid, status)
252
301
  rescue ActiveRecord::ActiveRecordError => are
253
- logger.error "Failure during cleaning up of dead child with pid: #{pid}"
254
- logger.error "#{are.message}"
302
+ logger.error escape_html("Failure during cleaning up of dead child with pid: #{pid}, status: #{status}")
303
+ logger.error escape_html("#{are.message}")
255
304
  rescue StandardError => e
256
305
  # XXX just incase a job control failure -- more code here
257
306
  logger.error "some failure during child clean up"
@@ -263,12 +312,58 @@ module Process::Naf
263
312
  logger.detail "sleeping in loop: #{@loop_sleep_time} seconds"
264
313
  sleep(@loop_sleep_time)
265
314
  end
315
+ end
316
+
317
+ # XXX is there a race condition where a child process exits
318
+ # XXX has not set pid or status yet and timeout fires?
319
+ # XXX i bet there is
320
+ # XXX so this code is here:
321
+ def check_dead_children_not_exited_properly
322
+ dead_children = []
323
+ @children.each do |pid, child|
324
+ unless is_job_process_alive?(child.reload)
325
+ dead_children << child
326
+ end
327
+ end
328
+
329
+ unless dead_children.blank?
330
+ logger.error escape_html("#{machine}: dead children even with timeout during waitpid2(): #{dead_children.inspect}")
331
+ logger.warn "this isn't necessarily incorrect -- look for the pids to be cleaned up next round, if not: call it a bug"
332
+ end
333
+ end
334
+
335
+ def cleanup_dead_child(pid, status)
336
+ child_job = @children.delete(pid)
337
+
338
+ if child_job.present?
339
+ # Update job tags
340
+ child_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:work]])
341
+
342
+ if status.nil? || status.exited? || status.signaled?
343
+ logger.info { escape_html("cleaning up dead child: #{child_job.reload}") }
344
+ finish_job(child_job,
345
+ { exit_status: (status && status.exitstatus), termination_signal: (status && status.termsig) })
346
+
347
+ thread = @threads.delete(pid)
348
+ logger.detail escape_html("cleaning up threads: #{thread.inspect}")
349
+ logger.detail escape_html("thread list: #{Thread.list}")
350
+ thread.join
351
+ else
352
+ # this can happen if the child is sigstopped
353
+ logger.warn escape_html("child waited for did not exit: #{child_job}, status: #{status.inspect}")
354
+ end
355
+ else
356
+ # XXX ERROR no child for returned pid -- this can't happen
357
+ logger.warn "child pid: #{pid}, status: #{status.inspect}, not managed by this runner"
358
+ end
359
+ end
266
360
 
361
+ def start_new_jobs
267
362
  # start new jobs
268
363
  logger.detail "starting new jobs, num children: #{@children.length}/#{machine.thread_pool_size}"
269
- # XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? && invocation.wind_down_at.blank?
270
- while ::Naf::RunningJob.where(:started_on_machine_id => machine.id).count < machine.thread_pool_size &&
271
- memory_available_to_spawn? && invocation.wind_down_at.blank?
364
+ # XXX while @children.length < machine.thread_pool_size && memory_available_to_spawn? && current_invocation.wind_down_at.blank?
365
+ while ::Naf::RunningJob.where(started_on_machine_id: machine.id).count < machine.thread_pool_size &&
366
+ memory_available_to_spawn? && current_invocation.wind_down_at.blank?
272
367
 
273
368
  logger.debug_gross "fetching jobs because: children: #{@children.length} < #{machine.thread_pool_size} (poolsize)"
274
369
  begin
@@ -279,21 +374,37 @@ module Process::Naf
279
374
  break
280
375
  end
281
376
 
282
- logger.info "starting new job : #{running_job}"
377
+ logger.info escape_html("starting new job : #{running_job.inspect}")
378
+
379
+ # fork and run
380
+ pid, stdin, stdout, stderr = running_job.historical_job.spawn
381
+ stdin.close
283
382
 
284
- pid = running_job.historical_job.spawn
383
+ # Reset NAF_JOB_ID
384
+ ENV.delete('NAF_JOB_ID')
285
385
  if pid
286
386
  @children[pid] = running_job
287
387
  running_job.pid = pid
288
388
  running_job.historical_job.pid = pid
289
389
  running_job.historical_job.failed_to_start = false
290
- running_job.historical_job.machine_runner_invocation_id = invocation.id
291
- logger.info "job started : #{running_job}"
390
+ running_job.historical_job.machine_runner_invocation_id = current_invocation.id
391
+ logger.info escape_html("job started : #{running_job}")
292
392
  running_job.save!
293
393
  running_job.historical_job.save!
394
+
395
+ # Spawn a thread to output the log of each job to files.
396
+ #
397
+ # Make sure not to execute any database calls inside this
398
+ # block, as it will start an ActiveRecord connection for each
399
+ # thread and eventually raise a ConnectionTimeoutError, causing
400
+ # the runner to exit.
401
+ thread = Thread.new do
402
+ log_output_until_job_finishes(running_job.id, stdout, stderr)
403
+ end
404
+ @threads[pid] = thread
294
405
  else
295
406
  # should never get here (well, hopefully)
296
- logger.error "#{machine}: failed to execute #{running_job}"
407
+ logger.error escape_html("#{machine}: failed to execute #{running_job}")
297
408
 
298
409
  finish_job(running_job, { failed_to_start: true })
299
410
  end
@@ -301,51 +412,54 @@ module Process::Naf
301
412
  raise
302
413
  rescue StandardError => e
303
414
  # XXX rescue for various issues
304
- logger.error "#{machine}: failure during job start"
415
+ logger.error escape_html("#{machine}: failure during job start")
305
416
  logger.warn e
306
417
  end
307
418
  end
308
419
  logger.debug_gross "done starting jobs"
420
+ end
309
421
 
310
- return true
422
+ def log_output_until_job_finishes(job_id, stdout, stderr)
423
+ log_file = ::Logical::Naf::LogFile.new("#{::Naf::PREFIX_PATH}/#{::Naf.schema_name}/jobs/#{job_id}")
424
+ log_file.open
311
425
 
312
- end
426
+ # Continue reading logs from stdout/stderr until both reach end of file
427
+ while true
428
+ read_pipes = []
429
+ read_pipes << stdout if stdout
430
+ read_pipes << stderr if stderr
431
+ return if (read_pipes.length == 0)
313
432
 
314
- def check_schedules(machine)
315
- if ::Naf::Machine.is_it_time_to_check_schedules?(@check_schedules_period.minutes)
316
- logger.debug "it's time to check schedules"
317
- if ::Naf::ApplicationSchedule.try_lock_schedules
318
- logger.debug_gross "checking schedules"
319
- machine.mark_checked_schedule
320
- ::Naf::ApplicationSchedule.unlock_schedules
433
+ error_pipes = read_pipes.clone
434
+ read_array, write_array, error_array = Kernel.select(read_pipes, nil, error_pipes, 1)
321
435
 
322
- # check scheduled tasks
323
- should_be_queued(machine).each do |application_schedule|
324
- logger.info "scheduled application: #{application_schedule}"
325
- begin
326
- naf_boss = ::Logical::Naf::ConstructionZone::Boss.new
327
- # this doesn't work very well for run_group_limits in the thousands
328
- Range.new(0, application_schedule.application_run_group_limit || 1, true).each do
329
- naf_boss.enqueue_application_schedule(application_schedule)
330
- end
331
- rescue ::Naf::HistoricalJob::JobPrerequisiteLoop => jpl
332
- logger.error "#{machine} couldn't queue schedule because of prerequisite loop: #{jpl.message}"
333
- logger.warn jpl
334
- application_schedule.enabled = false
335
- application_schedule.save!
336
- logger.alarm "Application Schedule disabled due to loop: #{application_schedule}"
337
- end
338
- end
436
+ unless error_array.blank?
437
+ logger.error escape_html("job(#{job_id}): select returned error for #{error_pipes.inspect} (read_pipes: #{read_pipes.inspect})")
438
+ # XXX we should probably close the errored FDs
439
+ end
339
440
 
340
- # check the runner machines
341
- ::Naf::Machine.enabled.up.each do |runner_to_check|
342
- if runner_to_check.is_stale?(@runner_stale_period.minutes)
343
- logger.alarm "runner is stale for #{@runner_stale_period} minutes, #{runner_to_check}"
344
- runner_to_check.mark_machine_down(machine)
441
+ unless read_array.blank?
442
+ begin
443
+ for r in read_array do
444
+ begin
445
+ # Parse each log line into JSON
446
+ r.read_nonblock(10240).split("\n").each do |log|
447
+ log_file << log.rstrip
448
+ end
449
+ rescue Errno::EAGAIN
450
+ rescue Errno::EINTR
451
+ rescue EOFError => eof
452
+ stdout = nil if r == stdout
453
+ stderr = nil if r == stderr
454
+ end
345
455
  end
456
+ ensure
457
+ log_file.write
346
458
  end
347
459
  end
348
460
  end
461
+
462
+ log_file.close
349
463
  end
350
464
 
351
465
  # XXX update_all doesn't support "from_partition" so we have this helper
@@ -364,15 +478,15 @@ module Process::Naf
364
478
  end
365
479
 
366
480
  def finish_job(running_job, updates = {})
367
- running_job.historical_job.remove_all_tags
368
- running_job.historical_job.add_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]])
481
+ if running_job.present?
482
+ running_job.remove_all_tags
483
+ running_job.add_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]])
484
+ end
369
485
 
370
486
  ::Naf::HistoricalJob.transaction do
371
487
  update_historical_job(updates.merge({ finished_at: Time.zone.now }), running_job.id)
372
488
  running_job.delete
373
489
  end
374
-
375
- running_job.historical_job.remove_tags([::Naf::HistoricalJob::SYSTEM_TAGS[:cleanup]])
376
490
  end
377
491
 
378
492
  # kill(0, pid) seems to fail during at_exit block
@@ -392,17 +506,17 @@ module Process::Naf
392
506
  end
393
507
  end
394
508
 
395
- def terminate_old_processes(machine)
509
+ def terminate_old_processes(record)
396
510
  # check if any processes are hanging around and ask them
397
511
  # politely if they will please terminate
398
- jobs = assigned_jobs(machine)
512
+ jobs = assigned_jobs(record)
399
513
  if jobs.length == 0
400
514
  logger.detail "no jobs to remove"
401
515
  return
402
516
  end
403
517
  logger.info "number of old jobs to sift through: #{jobs.length}"
404
518
  jobs.each do |job|
405
- logger.detail "job still around: #{job}"
519
+ logger.detail escape_html("job still around: #{job}")
406
520
  if job.request_to_terminate == false
407
521
  logger.warn "politely asking process: #{job.pid} to terminate itself"
408
522
  job.request_to_terminate = true
@@ -412,7 +526,7 @@ module Process::Naf
412
526
 
413
527
  # wait
414
528
  (1..@wait_time_for_processes_to_terminate).each do |i|
415
- num_assigned_jobs = assigned_jobs(machine).length
529
+ num_assigned_jobs = assigned_jobs(record).length
416
530
  return if num_assigned_jobs == 0
417
531
  logger.debug_medium "#{i}/#{@wait_time_for_processes_to_terminate}: sleeping 1 second while we wait for " +
418
532
  "#{num_assigned_jobs} assigned job(s) to terminate as requested"
@@ -420,27 +534,27 @@ module Process::Naf
420
534
  end
421
535
 
422
536
  # nudge them to terminate
423
- jobs = assigned_jobs(machine)
537
+ jobs = assigned_jobs(record)
424
538
  if jobs.length == 0
425
539
  logger.debug_gross "assigned jobs have exited after asking to terminate nicely"
426
540
  return
427
541
  end
428
542
  jobs.each do |job|
429
- logger.warn "sending SIG_TERM to process: #{job}"
543
+ logger.warn escape_html("sending SIG_TERM to process: #{job}")
430
544
  send_signal_and_maybe_clean_up(job, "TERM")
431
545
  end
432
546
 
433
547
  # wait
434
548
  (1..5).each do |i|
435
- num_assigned_jobs = assigned_jobs(machine).length
549
+ num_assigned_jobs = assigned_jobs(record).length
436
550
  return if num_assigned_jobs == 0
437
551
  logger.debug_medium "#{i}/5: sleeping 1 second while we wait for #{num_assigned_jobs} assigned job(s) to terminate from SIG_TERM"
438
552
  sleep(1)
439
553
  end
440
554
 
441
555
  # kill with fire
442
- assigned_jobs(machine).each do |job|
443
- logger.alarm "sending SIG_KILL to process: #{job}"
556
+ assigned_jobs(record).each do |job|
557
+ logger.alarm escape_html("sending SIG_KILL to process: #{job}")
444
558
  send_signal_and_maybe_clean_up(job, "KILL")
445
559
 
446
560
  # force the job down
@@ -473,13 +587,19 @@ module Process::Naf
473
587
  return send_signal_and_maybe_clean_up(job, 0)
474
588
  end
475
589
 
476
- def assigned_jobs(machine)
477
- return ::Naf::RunningJob.assigned_jobs(machine).select do |job|
478
- is_job_process_alive?(job)
590
+ def assigned_jobs(record)
591
+ if record.kind_of? ::Naf::MachineRunnerInvocation
592
+ return ::Naf::RunningJob.started_on_invocation(record.id).select do |job|
593
+ is_job_process_alive?(job)
594
+ end
595
+ else
596
+ return ::Naf::RunningJob.assigned_jobs(record).select do |job|
597
+ is_job_process_alive?(job)
598
+ end
479
599
  end
480
600
  end
481
601
 
482
- def should_be_queued(machine)
602
+ def should_be_queued
483
603
  not_finished_applications = ::Naf::HistoricalJob.
484
604
  queued_between(Time.zone.now - Naf::HistoricalJob::JOB_STALE_TIME, Time.zone.now).
485
605
  where("finished_at IS NULL AND request_to_terminate = false").
@@ -536,5 +656,9 @@ module Process::Naf
536
656
  return false
537
657
  end
538
658
 
659
+ def escape_html(str)
660
+ CGI::escapeHTML(str)
661
+ end
662
+
539
663
  end
540
664
  end