cnvrg 1.11.28 → 2.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +42 -0
- data/cnvrg.gemspec +8 -6
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli.rb +172 -81
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +179 -37
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +26 -9
- data/lib/cnvrg/version.rb +2 -2
- metadata +43 -8
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/experiment.rb
CHANGED
@@ -133,23 +133,30 @@ module Cnvrg
|
|
133
133
|
return response
|
134
134
|
end
|
135
135
|
def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
|
136
|
-
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
136
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
|
137
|
+
dataset_commit: data_commit, image_slug:image,
|
138
|
+
datasets: datasets,
|
139
|
+
commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
|
140
|
+
dataset_query: data_query })
|
141
141
|
return response
|
142
142
|
end
|
143
143
|
|
144
144
|
def upload_temp_log(temp_log)
|
145
|
-
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
|
146
|
-
|
147
|
-
Cnvrg::CLI.is_response_success(response,false)
|
145
|
+
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
|
146
|
+
exp_slug: @slug })
|
147
|
+
Cnvrg::CLI.is_response_success(response, false)
|
148
148
|
end
|
149
149
|
|
150
150
|
def send_machine_stats(stats)
|
151
|
-
response = Cnvrg::API.request(
|
152
|
-
|
151
|
+
response = Cnvrg::API.request(
|
152
|
+
@base_resource + "experiment/upload_stats",
|
153
|
+
"POST",
|
154
|
+
{
|
155
|
+
exp_slug: @slug,
|
156
|
+
stats: stats.map { |s| s.merge!({ time: Time.now }) }
|
157
|
+
}
|
158
|
+
)
|
159
|
+
Cnvrg::CLI.is_response_success(response, false)
|
153
160
|
end
|
154
161
|
|
155
162
|
def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
|
@@ -158,7 +165,7 @@ module Cnvrg
|
|
158
165
|
success = false
|
159
166
|
end_time ||= Time.now
|
160
167
|
while tries < 10 and success.blank?
|
161
|
-
sleep (tries*rand) ** 2 ### exponential backoff
|
168
|
+
sleep (tries * rand) ** 2 ### exponential backoff
|
162
169
|
## this call is super important so we cant let it crash.
|
163
170
|
|
164
171
|
tries += 1
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
@@ -730,7 +730,11 @@ module Cnvrg
|
|
730
730
|
end
|
731
731
|
res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
|
732
732
|
unless Cnvrg::CLI.is_response_success(res, false)
|
733
|
-
|
733
|
+
begin
|
734
|
+
puts(res)
|
735
|
+
rescue
|
736
|
+
end
|
737
|
+
raise StandardError.new("Cant download files from the server.")
|
734
738
|
end
|
735
739
|
self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
|
736
740
|
end
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
def execute_command_on_slave
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
119
|
+
Timeout.timeout(@timeout) do
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
122
|
+
response = conn.post('command', data.to_json)
|
123
|
+
if response.to_hash[:status].to_i != 200
|
124
|
+
@exit_status = 129
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
126
|
+
end
|
127
|
+
t = FileWatch::Tail.new
|
128
|
+
filename = result_file
|
129
|
+
t.tail(filename)
|
130
|
+
t.subscribe do |path, line|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
133
|
+
break
|
134
|
+
end
|
135
|
+
if !@is_new_main
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
137
|
+
end
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
140
|
+
end
|
141
|
+
end
|
142
|
+
rescue Timeout::Error
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
145
|
+
@exit_status = 124
|
146
|
+
ensure
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
148
|
+
@exit_status
|
149
|
+
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
@@ -1,7 +1,9 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
-
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
6
|
+
HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
7
9
|
# server (poll commands) and let the server know the status of this executer.
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if (!@is_new_main || HAS_DOCKER)
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -201,7 +303,7 @@ class Cnvrg::Helpers::Executer
|
|
201
303
|
pod_name = `hostname`.strip rescue nil
|
202
304
|
node_name = nil
|
203
305
|
if pod_name.present?
|
204
|
-
pod_describe = `kubectl
|
306
|
+
pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
|
205
307
|
pod_describe = JSON.parse(pod_describe) rescue {}
|
206
308
|
node_name = pod_describe["spec"]["nodeName"] rescue nil
|
207
309
|
end
|
@@ -215,13 +317,53 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
|
-
`kubectl get event --
|
339
|
+
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
221
340
|
end
|
222
341
|
|
223
342
|
def get_node_events(node_name)
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|