cnvrg 1.6.38 → 1.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/cnvrg.gemspec +1 -4
- data/lib/cnvrg/Images.rb +0 -148
- data/lib/cnvrg/api.rb +8 -8
- data/lib/cnvrg/api_v2.rb +14 -0
- data/lib/cnvrg/cli.rb +288 -781
- data/lib/cnvrg/connect_job_ssh.rb +31 -0
- data/lib/cnvrg/data.rb +65 -12
- data/lib/cnvrg/datafiles.rb +483 -201
- data/lib/cnvrg/dataset.rb +65 -29
- data/lib/cnvrg/experiment.rb +10 -4
- data/lib/cnvrg/files.rb +46 -14
- data/lib/cnvrg/helpers.rb +34 -26
- data/lib/cnvrg/helpers/agent.rb +188 -0
- data/lib/cnvrg/helpers/executer.rb +162 -258
- data/lib/cnvrg/job_cli.rb +28 -53
- data/lib/cnvrg/job_ssh.rb +47 -0
- data/lib/cnvrg/logger.rb +4 -0
- data/lib/cnvrg/project.rb +45 -16
- data/lib/cnvrg/ssh.rb +0 -1
- data/lib/cnvrg/version.rb +1 -1
- metadata +9 -33
@@ -0,0 +1,188 @@
|
|
1
|
+
class Cnvrg::Helpers::Agent
|
2
|
+
|
3
|
+
# Lifecycle states of a command as reported to the server.
module Status
  STARTED = :started
  RUNNING = :running
  FINISHED = :finished
  # Deliberately a String rather than a Symbol: the executer compares it with
  # the "status" value it reads from the command JSON. Frozen so the shared
  # constant cannot be mutated in place.
  ABORTED = "aborted".freeze
end
|
9
|
+
|
10
|
+
# Levels accepted by log_internal. PURE lines are printed as-is; every other
# level is wrapped in a JSON envelope.
module LogLevel
  INFO, PURE, ERROR = %i[info pure error]
end
|
15
|
+
|
16
|
+
#### This class represents a single command in the system.
|
17
|
+
#### it runs under an executer (machine_activity) so it should have all the executer
|
18
|
+
#### params
|
19
|
+
# Prepares a single command for execution under an executer.
#
# @param executer [Object] owning executer; this class calls +slave_id+,
#   +activity_url+ and +machine_activity+ on it
# @param slug [String] command identifier used in the API URL and in logs
# @param command [String] shell command line to run
# @param container_name [String, nil] "slave" (case-insensitive) wraps the
#   command in `docker exec` against the executer's slave container
# @param send_log_interval [Numeric, nil] seconds between periodic log uploads
# @param timeout [Numeric, nil] seconds before the command is timed out;
#   nil, negative or non-numeric values mean "no timeout"
# @param logs_regex [Array<String>, nil] patterns selecting which output lines
#   are uploaded when +send_logs+ is false
# @param async [Boolean] stored only; not read inside this class
# @param send_logs [Boolean] upload every output line, bypassing +logs_regex+
# @param files_exist [Array<String>] files that must exist for the command to run
# @param retries [Integer, String, nil] number of additional attempts after a failure
# @param sleep_before_retry [Numeric] seconds to wait before each retry
# @param single_quotes [Boolean] wrap the slave command in single instead of double quotes
# @param docker_user [String, nil] value for `docker exec --user`
# @param use_bash [Boolean] run through `bash -l` instead of `sh`
# @param kwargs [Hash] any other keys are accepted and ignored
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
  @executer = executer
  @slug = slug
  @files_exist = files_exist
  @container_name = container_name
  # container_name defaults to nil, so coerce it before downcasing.
  @run_in_slave = container_name.to_s.downcase == "slave"
  @log_interval = send_log_interval
  # https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
  @timeout = (timeout.is_a?(Numeric) && timeout.positive?) ? timeout : 0
  @logs_regex = logs_regex || []
  @async = async
  @command = command
  @send_logs = send_logs
  # How many times the user asked to try to execute the command again.
  # Always an Integer so retry_command can decrement it safely.
  @retries = retries.to_i
  @sleep_before_retry = sleep_before_retry
  @real_execution_retries = 0 ## How many times the command really executed until success
  @single_quotes = single_quotes
  @shell_type = use_bash ? "bash -l" : "sh"
  @docker_user = docker_user.to_s.strip.empty? ? "" : " --user #{docker_user}"
  if @run_in_slave
    # NOTE(review): the command is interpolated verbatim, so it must not
    # contain the quote character chosen here.
    quote = @single_quotes ? "'" : "\""
    @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c #{quote}#{@command}#{quote}"
  end
  @output = []
  @errors = []
  @exit_status = nil
  @is_running = true
  @pid = nil
end
|
58
|
+
|
59
|
+
# API path of this command: the executer's activity URL, then "commands",
# then the command slug.
def base_url
  "#{@executer.activity_url}/commands/#{@slug}"
end
|
62
|
+
|
63
|
+
# Tells whether the command may run: every path listed in @files_exist must
# exist on disk. The first missing path is logged.
#
# @return [Boolean] true when nothing is required or everything is present
def should_run?
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  missing_file = Array(@files_exist).find { |file| !File.exist?(file) }
  return true if missing_file.nil?
  log_internal("Can't find file #{missing_file}, stopping the job")
  false
end
|
74
|
+
|
75
|
+
|
76
|
+
# Entry point for running the command. Announces the start, runs the command
# (with periodic log uploads) when should_run? allows it, otherwise records
# exit status 127, and finally reports the FINISHED status with the exit code.
def exec!
  log_internal("Command: #{@command} with slug: #{@slug} started!")
  ready = should_run?
  if ready
    send_logs(status: Status::STARTED)
    periodic_thread
    execute_command
  end
  @exit_status = 127 unless ready
  summary = "Command: #{@command} with slug: #{@slug} finished"
  summary = "#{summary} after #{@real_execution_retries} retries" if @real_execution_retries > 0
  log_internal(summary)
  send_logs(exit_status: @exit_status, status: Status::FINISHED)
end
|
90
|
+
|
91
|
+
# Drains the buffered output and error lines.
#
# The buffers are appended to by the command thread while the periodic thread
# drains them, so entries are removed from the head: a line appended between
# reading the length and removing the entries stays queued for the next batch
# instead of pushing an older line out of the current one.
#
# @return [Array(Array<Hash>, Array<Hash>)] [output entries, error entries]
def get_logs_to_send
  new_logs = @output.shift(@output.length)
  new_errors = @errors.shift(@errors.length)
  [new_logs, new_errors]
end
|
96
|
+
|
97
|
+
|
98
|
+
# Starts a background thread that uploads buffered logs every @log_interval
# seconds for as long as no exit status has been recorded. The thread exits
# immediately when no interval is configured.
#
# @return [Thread] the started thread
def periodic_thread
  Thread.new do
    loop do
      break unless @exit_status.blank?
      Thread.exit if @log_interval.blank?
      sleep(@log_interval)
      send_logs
    end
  end
end
|
107
|
+
|
108
|
+
# Consumes one retry, waits @sleep_before_retry seconds, counts the extra
# execution and runs the command again.
def retry_command
  @retries = @retries - 1
  sleep(@sleep_before_retry)
  @real_execution_retries = @real_execution_retries + 1
  execute_command
end
|
114
|
+
|
115
|
+
# Runs @command inside a pseudo-terminal, buffering every output line in
# @output and storing the child's exit code in @exit_status.
#
# A command that exceeds @timeout seconds is terminated and reported with
# exit status 124. While @retries is non-zero and the last exit status is not
# 0, the command is executed again through retry_command.
def execute_command
  Timeout.timeout(@timeout) do
    # PTY.spawn yields only (reader, writer, pid); the child's stderr is
    # attached to the same pty, so all of its output arrives through reader.
    PTY.spawn(@command) do |reader, _writer, pid|
      @pid = pid
      begin
        reader.each do |line|
          log_internal(line, level: LogLevel::PURE)
          # Strip ANSI colour sequences before buffering the line.
          @output << {log: line.strip.gsub(/\e\[([;\d]+)?m/, ''), timestamp: Time.now}
        end
      rescue Errno::EIO
        # Raised on Linux once the child closes the pty: this is the normal
        # end of output, so fall through and reap the child below.
      rescue => e
        log_internal(e.message, level: LogLevel::ERROR)
        log_internal(e.backtrace.join("\n"), level: LogLevel::ERROR)
        @errors << {log: e.message, timestamp: Time.now}
      end
      # Always reap the child so that $? describes this command.
      ::Process.wait pid
    end
  end
  @exit_status = $?.exitstatus
rescue Timeout::Error
  begin
    # Signal 0 only checks that the process exists; TERM actually stops it.
    Process.kill("TERM", @pid) if @pid
  rescue Errno::ESRCH
    # The process already exited on its own.
  end
  @errors << {log: "Command timed out!", timestamp: Time.now}
  log_internal("Command timed out!", level: LogLevel::ERROR)
  @exit_status = 124
ensure
  retry_command if @retries != 0 and @exit_status != 0
end
|
155
|
+
|
156
|
+
private
|
157
|
+
# Uploads the buffered logs together with the command state.
#
# Output lines are filtered through filter_logs_by_regex unless the command
# failed (non-zero exit status). A call that has nothing new to report, no
# exit status and the RUNNING status is skipped, which lets the periodic
# thread call this on every interval.
#
# @param exit_status [Integer, nil] exit code, when the command has finished
# @param status [Symbol] one of the Status values
def send_logs(exit_status: nil, status: Status::RUNNING)
  logs, error_logs = get_logs_to_send
  not_failed = exit_status.blank? || exit_status == 0
  logs = filter_logs_by_regex(logs) if not_failed

  nothing_new = logs.blank? && error_logs.blank?
  return if nothing_new && exit_status.blank? && status == Status::RUNNING

  payload = {
    logs: logs,
    error_logs: error_logs,
    exit_status: exit_status,
    status: status,
    execution_retries: @real_execution_retries,
    pid: @pid
  }
  Cnvrg::API.request(base_url, 'PUT', payload)
end
|
170
|
+
|
171
|
+
# Prints a log line to standard output and flushes it.
#
# PURE lines are printed untouched; every other level is printed as a JSON
# object carrying the level, a timestamp, the command slug and the executer's
# machine activity.
#
# @param log [String] text to print
# @param level [Symbol] one of the LogLevel values
def log_internal(log, level: LogLevel::INFO)
  rendered = log
  unless level == LogLevel::PURE
    rendered = {log: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity}.to_json
  end
  puts(rendered)
  STDOUT.flush
end
|
179
|
+
|
180
|
+
# Selects the log entries that should be uploaded.
#
# When @send_logs is set every entry is kept; otherwise an entry is kept only
# if its :log text matches at least one pattern in @logs_regex.
#
# @param logs [Array<Hash>] entries with a :log key
# @return [Array<Hash>] a new array holding the selected entries
def filter_logs_by_regex(logs)
  return logs.dup if @send_logs

  patterns = Array(@logs_regex)
  # Compile each pattern at most once, and only when it is first needed,
  # instead of once per log line.
  compiled = Hash.new { |cache, source| cache[source] = Regexp.new(source) }
  logs.select do |entry|
    patterns.any? { |source| compiled[source].match?(entry[:log]) }
  end
end
|
188
|
+
end
|
@@ -1,309 +1,213 @@
|
|
1
|
+
require 'cnvrg/helpers/agent'
|
1
2
|
class Cnvrg::Helpers::Executer
|
2
|
-
|
3
|
-
ACK = :ack
|
4
|
-
STARTED = :started
|
5
|
-
FAILED = :failed
|
6
|
-
SUCCESS = :success
|
7
|
-
end
|
8
|
-
def initialize(project: nil, job_type: nil, job_id: nil, image: nil)
|
9
|
-
@image = image
|
10
|
-
@project = project || Cnvrg::Project.new(owner: ENV['CNVRG_OWNER'], slug: ENV['CNVRG_PROJECT'])
|
11
|
-
@job_type = job_type || ENV['CNVRG_JOB_TYPE']
|
12
|
-
@job_id = job_id || ENV['CNVRG_JOB_ID']
|
13
|
-
if @job_id.blank?
|
14
|
-
Cnvrg::CLI.log_message("Cant find job, exiting.", 'red')
|
15
|
-
exit(1)
|
16
|
-
end
|
17
|
-
end
|
3
|
+
attr_reader :machine_activity, :agent_id, :slave_id
|
18
4
|
|
19
5
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
6
|
+
### This class represents a machine_activity. It will poll the commands, communicate with the
|
7
|
+
# server (poll commands) and let the server know the status of this executer.
|
8
|
+
# Sets up an executer for one machine activity.
#
# @param owner [String, nil] owner segment used in the activity URL
# @param machine_activity [String, nil] machine activity identifier
# @param poll_every [Numeric] seconds to sleep between polls
# @param job_id [String, nil] job identifier used to locate the containers
def initialize(owner: nil, machine_activity: nil, poll_every: 30, job_id: nil)
  @owner, @job_id = owner, job_id
  @poll_every, @machine_activity = poll_every, machine_activity
  # Separate queues for pending commands and files.
  @commands_q, @files_q = Queue.new, Queue.new
  # Container ids are resolved later, in executer_stats.
  @agent_id = @slave_id = nil
end
|
32
18
|
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
decipher.decrypt
|
37
|
-
decipher.key = key
|
38
|
-
decipher.iv = iv
|
39
|
-
commands = decipher.update(text) + decipher.final
|
40
|
-
JSON.parse(commands)
|
41
|
-
end
|
42
|
-
|
43
|
-
def execute(cmd)
|
44
|
-
## execute the command for running
|
45
|
-
# cmd will have to following fields
|
46
|
-
#
|
47
|
-
# :command => the command to execute
|
48
|
-
# :type => the command type, 'notify' or nil
|
49
|
-
# :timeout => the timeout for the command in seconds (default is 60 hours)
|
50
|
-
# :retries => integer, default 1
|
51
|
-
#
|
52
|
-
# when type == 'file_exists'
|
53
|
-
# 'file' => string => file to check (fullpath)
|
54
|
-
# 'exists_commands' => list of commands in case file exists
|
55
|
-
# 'non_exists_commands' => list of commands in case file doesnt exists
|
56
|
-
# when type == 'notify'
|
57
|
-
# :before_execute_log => log to be logged before execution
|
58
|
-
# :logs => boolean => add the execution logs to the job logs
|
59
|
-
# :title => command title, can replace the on_error, on_success fields
|
60
|
-
# :on_error_log => log to be logged on exit_code != 0
|
61
|
-
# :on_success_log => log to be logged on exit_code == 0
|
62
|
-
# when type == 'notify_command'
|
63
|
-
# notify to commands api about command progress
|
64
|
-
# when type == 'spawn'
|
65
|
-
# run in another process and detach from it
|
66
|
-
#
|
67
|
-
retries = cmd[:retries] || 1
|
68
|
-
resp = []
|
69
|
-
retries.times.each do
|
70
|
-
resp = execute_helper(cmd)
|
71
|
-
exit_status, _, _, _, _ = resp
|
72
|
-
return resp if exit_status == 0
|
19
|
+
# Writes content to path, creating the parent directories first. A path that
# contains "~" is expanded with File.expand_path before use.
#
# @param path [String] destination file
# @param content [String] text to write (the file is truncated first)
# @return [Integer] number of bytes written
def create_file_cmd(path, content)
  target = path.include?("~") ? File.expand_path(path) : path
  FileUtils.mkdir_p(File.dirname(target))
  File.open(target, "w+") { |io| io.write(content) }
end
|
76
26
|
|
77
|
-
def
|
78
|
-
|
79
|
-
|
80
|
-
if command[:type] == "file_exists"
|
81
|
-
puts "Looking for file #{command[:file]}"
|
82
|
-
else
|
83
|
-
puts "Execute #{command[:command]}" unless command[:no_stdout]
|
84
|
-
end
|
85
|
-
execute(command)
|
27
|
+
# Creates every file described by the given path => content mapping.
# A nil mapping is treated as empty.
#
# @param files [Hash, nil] path => content pairs
# @return [Hash] the mapping that was iterated
def handle_files(files)
  entries = files || {}
  entries.each { |path, content| create_file_cmd(path, content) }
end
|
88
32
|
|
89
|
-
def
|
90
|
-
|
91
|
-
|
92
|
-
|
33
|
+
# API path of this machine activity:
# users/<owner>/machine_activities/<machine_activity>.
def activity_url
  "users/#{@owner}/machine_activities/#{@machine_activity}"
end
|
36
|
+
|
37
|
+
# Collects (once) a description of the environment this executer runs in:
# pod and node names, plus details of the agent and slave containers.
# Resolves @agent_id and @slave_id as a side effect. The result is memoized
# in @stats.
#
# @return [Hash] :pod_name, :node_name, :agent and :slave entries
def executer_stats
  return @stats if @stats.present?

  Cnvrg::Logger.log_info("getting containers")
  @agent_id, @slave_id = containers
  Cnvrg::Logger.log_info("got containers")
  pod_name, node_name = get_node_and_pod_names

  # Values probed in the current (agent) process.
  agent_stats = {
    container_id: @agent_id,
    workdir: %x(pwd).strip,
    homedir: current_homedir,
    user: %x(whoami).strip,
    user_id: %x(id -u).strip,
    group_id: %x(id -g).strip,
    cnvrg: Cnvrg::VERSION
  }

  # Values probed inside the slave container.
  slave_stats = {
    container_id: @slave_id,
    workdir: run_in_slave('pwd'),
    homedir: slave_homedir,
    spark_path: spark_path,
    user: run_in_slave('whoami'),
    cnvrg: run_in_slave('which cnvrg'),
    has_bash: run_in_slave('which bash'),
    user_id: run_in_slave('id -u'),
    group_id: run_in_slave('id -g'),
    python_version: run_in_slave('python --version'),
    python3_version: run_in_slave('python3 --version'),
    pip_version: run_in_slave('pip --version'),
    pip3_version: run_in_slave('pip3 --version')
  }

  @stats = {
    pod_name: pod_name,
    node_name: node_name,
    agent: agent_stats,
    slave: slave_stats
  }
end
|
73
|
+
|
74
|
+
# Waits until both the agent and the slave containers show up in `docker ps`
# and returns their ids.
#
# Containers are looked up by @job_id, or by the host name when
# KUBERNETES_PORT is set. The lookup is repeated every 5 seconds until both
# are found; it never gives up, so there is no failure return.
#
# @return [Array(String, String)] [agent container id, slave container id]
def containers
  agent_id = nil
  slave_id = nil
  loop do
    grep_by = ENV['KUBERNETES_PORT'].to_s.strip.empty? ? @job_id : "$(hostname)"
    rows = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map(&:strip)
    # Each row is "<id>,<name>"; a role without a matching row yields nil.
    id_of = ->(role) { rows.find { |row| row.include?(role) }.to_s.split(",").first }
    agent_id = id_of.call("agent")
    slave_id = id_of.call("slave")
    break unless agent_id.to_s.empty? || slave_id.to_s.empty?
    # Only wait when another lookup is actually needed.
    sleep(5)
  end
  [agent_id, slave_id]
end
|
94
90
|
|
95
|
-
def
|
96
|
-
|
97
|
-
commands = resp["commands"]
|
98
|
-
commands.map{|k| k.with_indifferent_access}
|
91
|
+
# Home directory of the current process, or nil when HOME is not set.
#
# Read straight from the environment: grepping `env` for "HOME" also matches
# variables such as JAVA_HOME, which made the parsed value depend on the
# order of the environment listing.
def current_homedir
  ENV['HOME']
end
|
100
94
|
|
101
|
-
def
|
102
|
-
|
103
|
-
command = resp["command"]
|
104
|
-
command.with_indifferent_access
|
95
|
+
# Value of SPARK_HOME inside the slave container, or nil when it is not set.
#
# The grep may return several variables (any name containing SPARK_HOME), so
# the exact "SPARK_HOME=" line is picked and split only on the first "=".
def spark_path
  listing = run_in_slave("env | grep SPARK_HOME")
  entry = listing.each_line.map(&:strip).find { |line| line.start_with?("SPARK_HOME=") }
  entry && entry.split("=", 2).last
end
|
106
98
|
|
107
|
-
def
|
108
|
-
|
99
|
+
# Home directory inside the slave container, or nil when HOME is not set.
#
# `env | grep HOME` also lists variables such as SPARK_HOME or JAVA_HOME, so
# the exact "HOME=" line is picked and split only on the first "=".
def slave_homedir()
  listing = run_in_slave("env | grep HOME")
  entry = listing.each_line.map(&:strip).find { |line| line.start_with?("HOME=") }
  entry && entry.split("=", 2).last
end
|
110
102
|
|
111
|
-
|
112
|
-
|
113
|
-
git_commit = `git rev-parse --verify HEAD`
|
114
|
-
return if git_commit.blank?
|
115
|
-
Cnvrg::API.request("#{base_url}/update_git_commit", "POST", {git_commit: git_commit.strip!})
|
103
|
+
# Environment of the slave container as [name, value] pairs.
#
# Each line is split on the first "=" only, so values that themselves contain
# "=" stay intact and an empty value yields "".
#
# @return [Array<Array(String, String)>]
def slave_env
  run_in_slave("env").split("\n").map { |line| line.split("=", 2) }
end
|
117
106
|
|
118
|
-
def
|
119
|
-
|
107
|
+
# Runs a shell command inside the slave container through `docker exec` and
# returns its output with surrounding whitespace removed.
#
# NOTE(review): the command is placed inside single quotes unescaped, so it
# must not contain a single quote.
#
# @param command [String] command line for `sh -c`
# @return [String] stripped output of the command
def run_in_slave(command)
  output = %x(docker exec -i #{@slave_id} sh -c '#{command}')
  output.strip
end
|
121
110
|
|
122
|
-
def monitor_command(command, command_slug)
|
123
|
-
monitor_single_command(command, command_slug)
|
124
|
-
end
|
125
111
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
112
|
+
# Asks the server for pending work: creates the files it sends back and
# queues every returned command on @commands_q.
#
# A response without "commands" is treated as "nothing to do", mirroring how
# handle_files treats missing files. Any error is logged, not raised.
def poll
  resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
  handle_files(resp["files"])
  (resp["commands"] || []).each { |cmd| @commands_q.push(cmd) }
rescue => e
  Cnvrg::Logger.log_error(e)
end
|
121
|
+
|
122
|
+
# Reports this executer's stats to the server. When the server answers with
# a different machine activity, that value is persisted to
# /conf/.machine_activity.yml and adopted. Any error is logged, not raised.
def init
  response = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
  reported = response["machine_activity"]
  Cnvrg::Logger.log_info("Got back machine activity #{reported}")
  return unless reported.present? && @machine_activity != reported

  Cnvrg::Logger.log_info("Changing to machine activity #{reported}")
  File.open("/conf/.machine_activity.yml", "w+") do |file|
    file.write({slug: reported}.to_yaml)
  end
  @machine_activity = reported
rescue => e
  Cnvrg::Logger.log_error(e)
end
|
147
135
|
|
148
|
-
def
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
output: output,
|
153
|
-
errors: errors,
|
154
|
-
start_time: start_time,
|
155
|
-
end_time: end_time
|
156
|
-
}
|
157
|
-
if exit_status == 0
|
158
|
-
update_command(CommandsStatus::SUCCESS, context, cmd[:command_slug])
|
159
|
-
else
|
160
|
-
update_command(CommandsStatus::FAILED, context, cmd[:command_slug])
|
136
|
+
# Polls the server forever, sleeping @poll_every seconds between polls.
# Never returns on its own.
def polling_thread
  loop do
    poll
    sleep(@poll_every)
  end
end
|
163
142
|
|
164
|
-
def
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
exit_status, output, errors, start_time, end_time = regular_command(cmd)
|
169
|
-
logs = []
|
170
|
-
if exit_status == 0
|
171
|
-
logs = output.map{|log| log[:logs]} if with_logs
|
172
|
-
job_log(logs + cmd[:on_success_log])
|
173
|
-
else
|
174
|
-
logs = output + errors
|
175
|
-
logs = logs.sort_by{|x| x[:timestamp]}.map{|x| x[:logs]} if with_logs
|
176
|
-
job_log(logs + cmd[:on_error_log], level: 'error')
|
143
|
+
def main_thread
|
144
|
+
init
|
145
|
+
Thread.new do
|
146
|
+
polling_thread
|
177
147
|
end
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
def merge_log_block(logs)
|
182
|
-
logs.group_by {|log| log[:timestamp].to_s}
|
183
|
-
.map {|ts, logz| {timestamp: ts, logs: logz.map {|l| l[:log]}.join("\n")}}
|
148
|
+
execute_cmds
|
184
149
|
end
|
185
150
|
|
151
|
+
def execute_cmds
|
152
|
+
pids = []
|
153
|
+
while true
|
154
|
+
if @commands_q.empty?
|
155
|
+
sleep(5)
|
156
|
+
next
|
157
|
+
end
|
158
|
+
cmd = @commands_q.pop.symbolize_keys
|
159
|
+
command_json = Cnvrg::API.request([activity_url, "commands", cmd[:slug]].join('/'), "GET")
|
186
160
|
|
187
|
-
|
188
|
-
pid = Process.spawn(cmd[:command])
|
189
|
-
Process.detach(pid)
|
190
|
-
end
|
161
|
+
cmd_status = command_json["status"] rescue ""
|
191
162
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
start_time = Time.now
|
196
|
-
timeout = cmd[:timeout] || 5*60
|
197
|
-
exit_status = nil
|
198
|
-
t = Thread.new do
|
199
|
-
PTY.spawn(cmd[:command]) do |stdout, stdin, pid, stderr|
|
200
|
-
begin
|
201
|
-
if stdout.present?
|
202
|
-
stdout.each do |line|
|
203
|
-
puts line
|
204
|
-
output << {log: line.strip, timestamp: Time.now}
|
205
|
-
end
|
206
|
-
end
|
207
|
-
if stderr.present?
|
208
|
-
stderr.each do |line|
|
209
|
-
errors << {log: line.strip, timestamp: Time.now}
|
210
|
-
end
|
211
|
-
end
|
212
|
-
rescue Errno::EIO
|
213
|
-
rescue => e
|
214
|
-
errors << {log: e.message, timestamp: Time.now}
|
215
|
-
end
|
216
|
-
::Process.wait pid
|
163
|
+
if cmd_status == Cnvrg::Helpers::Agent::Status::ABORTED
|
164
|
+
Cnvrg::Logger.log_info("stopping job because command #{cmd[:slug]} with status #{cmd_status}")
|
165
|
+
next
|
217
166
|
end
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
if
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
exit_status = 100 ##killed
|
167
|
+
pid = Process.fork do
|
168
|
+
Cnvrg::Helpers::Agent.new(executer: self, **cmd).exec!
|
169
|
+
end
|
170
|
+
if cmd[:async].blank?
|
171
|
+
Process.waitpid(pid)
|
172
|
+
else
|
173
|
+
Process.detach(pid)
|
226
174
|
end
|
227
|
-
|
175
|
+
pids << pid
|
176
|
+
######
|
228
177
|
end
|
229
|
-
|
230
|
-
[exit_status, merge_log_block(output), merge_log_block(errors), start_time, end_time]
|
178
|
+
pids
|
231
179
|
end
|
232
180
|
|
233
|
-
def
|
234
|
-
|
235
|
-
|
181
|
+
# Merges log entries that share the same timestamp (compared as strings)
# into one entry whose text is the individual lines joined by newlines.
#
# @param logs [Array<Hash>] entries with :timestamp and :log keys
# @return [Array<Hash>] entries with :timestamp (String) and :logs (String),
#   in order of first appearance
def merge_log_block(logs)
  grouped = Hash.new { |bucket, stamp| bucket[stamp] = [] }
  logs.each { |entry| grouped[entry[:timestamp].to_s] << entry[:log] }
  grouped.map { |stamp, lines| {timestamp: stamp, logs: lines.join("\n")} }
end
|
237
185
|
|
238
|
-
def
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
186
|
+
def get_node_and_pod_names
|
187
|
+
pod_name = `hostname`.strip rescue nil
|
188
|
+
node_name = nil
|
189
|
+
if pod_name.present?
|
190
|
+
pod_describe = `kubectl -n cnvrg get pod #{pod_name} -o json` rescue nil
|
191
|
+
pod_describe = JSON.parse(pod_describe) rescue {}
|
192
|
+
node_name = pod_describe["spec"]["nodeName"] rescue nil
|
243
193
|
end
|
244
|
-
|
245
|
-
cmd[:on_error_log] ||= []
|
246
|
-
[:before_execute_log, :on_success_log, :on_error_log].each{|x| cmd[x] = [cmd[x]].flatten}
|
247
|
-
cmd
|
194
|
+
[pod_name, node_name]
|
248
195
|
end
|
249
196
|
|
250
|
-
def
|
251
|
-
|
197
|
+
# Collects the events of the current pod and of its node and posts them to
# the activity's job_events endpoint.
def pre_pod_stop
  pod_name, node_name = get_node_and_pod_names
  payload = {
    pod_events: get_pod_events(pod_name),
    node_events: get_node_events(node_name)
  }
  Cnvrg::API.request("#{activity_url}/job_events", "POST", payload)
end
|
253
203
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
@image.job_log(logs, level: level, step: step)
|
258
|
-
else
|
259
|
-
@project.job_log(logs, level: level, step: step, job_type: @job_type, job_id: @job_id)
|
260
|
-
end
|
204
|
+
# Returns the raw JSON output of `kubectl get event` for the given pod in the
# cnvrg namespace, or nil when no pod name is given.
#
# @param pod_name [String, nil]
# @return [String, nil]
def get_pod_events(pod_name)
  unless pod_name.blank?
    %x(kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json)
  end
end
|
262
208
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
log = ""
|
267
|
-
PTY.spawn(cmd[:command]) do |stdout, stdin, pid, stderr|
|
268
|
-
begin
|
269
|
-
stdout.each do |line|
|
270
|
-
log += line + "\n"
|
271
|
-
if cmd[:success_log] and line.match(cmd[:success_log])
|
272
|
-
puts "Match found!"
|
273
|
-
context = {log: line, pid: Process.pid}
|
274
|
-
update_command(CommandsStatus::SUCCESS, context, command_slug)
|
275
|
-
end
|
276
|
-
end
|
277
|
-
|
278
|
-
if stderr
|
279
|
-
stderr.each do |err|
|
280
|
-
log += err + "\n"
|
281
|
-
#context = {error: err}
|
282
|
-
#update_command(CommandsStatus::FAILED, context, command_slug)
|
283
|
-
#log << {time: Time.now, message: err, type: "stderr"}
|
284
|
-
end
|
285
|
-
end
|
286
|
-
|
287
|
-
rescue Errno::EIO => e
|
288
|
-
Cnvrg::Logger.log_error(e)
|
289
|
-
context = {error: e.message}
|
290
|
-
update_command(CommandsStatus::FAILED, context, command_slug)
|
291
|
-
rescue Errno::ENOENT => e
|
292
|
-
exp_success = false
|
293
|
-
context = {error: e.message}
|
294
|
-
update_command(CommandsStatus::FAILED, context, command_slug)
|
295
|
-
Cnvrg::Logger.info("command \"#{cmd[:command]}\" couldn't be executed, verify command is valid")
|
296
|
-
Cnvrg::Logger.log_error(e)
|
297
|
-
rescue => e
|
298
|
-
#res = @exp.end(log, 1, start_commit, 0, 0)
|
299
|
-
context = {error: e.message}
|
300
|
-
update_command(CommandsStatus::FAILED, context, command_slug)
|
301
|
-
Cnvrg::Logger.info("Error occurred,aborting")
|
302
|
-
Cnvrg::Logger.log_error(e)
|
303
|
-
exit(0)
|
304
|
-
end
|
305
|
-
::Process.wait pid
|
306
|
-
end
|
209
|
+
# Returns the raw JSON output of `kubectl get event` for the given node
# across all namespaces, or nil when no node name is given.
#
# @param node_name [String, nil]
# @return [String, nil]
def get_node_events(node_name)
  unless node_name.blank?
    %x(kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json)
  end
end
|
308
|
-
|
309
213
|
end
|