cnvrg 1.11.31 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Readme.md +6 -1
- data/cnvrg.gemspec +2 -1
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli.rb +83 -71
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +1 -1
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +176 -34
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +4 -2
- data/lib/cnvrg/version.rb +2 -2
- metadata +17 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ab82882b2bb6c9093751cd560eaa4ccb7540fad9b9a34a81245721538dd37a5b
|
|
4
|
+
data.tar.gz: e35299be744d985a37794288a269ba2bb55cf64d3fcd702c6c6147bd4f5d740d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 52b51bb4942583e9ac3eceab24891d252ef7e31d7f1a2a87d5c8f4586f914f33f2d9cc7addc09ab5aa3652a9ba939ef5a37bdb9fc82d18e4e0e0a61126265d17
|
|
7
|
+
data.tar.gz: df735e631778b5d36d296c33903719a2ba7832ce2a9e218eea033ad246f8c4ce044a7b7d5562f68d901a1ee32d62a88559f886da2a0341d8bf7c51b50c25660c
|
data/Readme.md
CHANGED
data/cnvrg.gemspec
CHANGED
|
@@ -6,7 +6,7 @@ require 'cnvrg/version'
|
|
|
6
6
|
Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = 'cnvrg'
|
|
8
8
|
spec.version = Cnvrg::VERSION
|
|
9
|
-
spec.authors = ['Yochay Ettun', 'Leah Kolben']
|
|
9
|
+
spec.authors = ['Yochay Ettun', 'Leah Kolben', 'Omer Shacham']
|
|
10
10
|
spec.email = ['info@cnvrg.io']
|
|
11
11
|
spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
|
|
12
12
|
spec.description = %q{A CLI tool for interacting with cnvrg.io.}
|
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
|
39
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
|
40
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
|
41
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
|
42
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
|
43
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
|
44
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
|
@@ -72,11 +72,11 @@ module Cnvrg
|
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
|
74
74
|
end
|
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
|
77
77
|
success = false
|
|
78
78
|
sleep(5 * retries)
|
|
79
|
-
retries +=1
|
|
79
|
+
retries += 1
|
|
80
80
|
next
|
|
81
81
|
end
|
|
82
82
|
rescue => e
|
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
|
114
114
|
end
|
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
|
117
117
|
success = false
|
|
118
118
|
sleep(5 * retries)
|
|
119
|
-
retries +=1
|
|
119
|
+
retries += 1
|
|
120
120
|
next
|
|
121
121
|
end
|
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
|
@@ -173,7 +173,7 @@ module Cnvrg
|
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
|
174
174
|
subcommand "data", Data
|
|
175
175
|
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
|
177
177
|
subcommand "job", JobCli
|
|
178
178
|
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
|
415
415
|
end
|
|
416
416
|
end
|
|
417
417
|
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
|
420
420
|
|
|
421
421
|
def set_compression_path(*compression_path)
|
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
|
2314
2315
|
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
|
2316
2317
|
begin
|
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
|
2328
2329
|
exp_obj = nil
|
|
2329
2330
|
end
|
|
2330
2331
|
|
|
2332
|
+
local = options["local"]
|
|
2333
|
+
|
|
2331
2334
|
commit_msg = options["message"]
|
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
|
2333
2336
|
commit_msg = ""
|
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
|
2351
2354
|
end
|
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
|
2353
2356
|
end
|
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
|
2355
2358
|
spec_files_to_upload = list
|
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
|
2668
2671
|
end
|
|
2669
2672
|
end
|
|
2670
2673
|
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
|
2672
2675
|
def commit_before_termination()
|
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
|
2678
2681
|
log_error(e)
|
|
2679
2682
|
end
|
|
2680
2683
|
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
|
2682
2685
|
def update_job_commit()
|
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
|
2868
2871
|
|
|
2869
2872
|
|
|
2870
2873
|
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
|
2872
2875
|
def jump(commit_sha1)
|
|
2873
2876
|
begin
|
|
2874
2877
|
verify_logged_in()
|
|
@@ -3008,6 +3011,7 @@ module Cnvrg
|
|
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => false
|
|
3011
3015
|
|
|
3012
3016
|
def sync(direct = true)
|
|
3013
3017
|
verify_logged_in(true) if direct
|
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
|
3030
3034
|
if run_download or options['debug_mode']
|
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
|
3032
3036
|
end
|
|
3033
|
-
invoke :upload, [false, true,
|
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
|
3037
3041
|
|
|
3038
3042
|
end
|
|
3039
3043
|
|
|
@@ -3316,62 +3320,47 @@ module Cnvrg
|
|
|
3316
3320
|
end
|
|
3317
3321
|
end
|
|
3318
3322
|
start_time = Time.now
|
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
|
3320
3323
|
if @exp.get_cmd.present?
|
|
3321
3324
|
cmd = @exp.get_cmd
|
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
|
3324
|
-
end
|
|
3325
3325
|
end
|
|
3326
|
-
|
|
3327
|
-
|
|
3326
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
|
3327
|
+
result_file = "/conf/result-#{command_slug}"
|
|
3328
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
|
3329
|
+
|
|
3330
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
|
3331
|
+
response = conn.post('command', data.to_json)
|
|
3332
|
+
if response.to_hash[:status].to_i != 200
|
|
3333
|
+
exit_status = 129
|
|
3334
|
+
raise StandardError.new("Cant send command to slave")
|
|
3328
3335
|
end
|
|
3329
|
-
|
|
3336
|
+
t = FileWatch::Tail.new
|
|
3337
|
+
filename = result_file
|
|
3338
|
+
lines = []
|
|
3339
|
+
t.tail(filename)
|
|
3340
|
+
t.subscribe do |path, line|
|
|
3330
3341
|
begin
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
|
|
3335
|
-
|
|
3336
|
-
|
|
3337
|
-
|
|
3338
|
-
|
|
3339
|
-
|
|
3340
|
-
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
|
3341
|
-
end
|
|
3342
|
+
cur_log = JSON.parse(line)
|
|
3343
|
+
if cur_log["type"] == "endMessage"
|
|
3344
|
+
exit_status = cur_log["real"].to_i
|
|
3345
|
+
break
|
|
3346
|
+
else
|
|
3347
|
+
puts(cur_log.to_json)
|
|
3348
|
+
STDOUT.flush
|
|
3349
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
|
3350
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
|
3342
3351
|
log << cur_log
|
|
3343
|
-
if log.size >= 10
|
|
3344
|
-
@exp.upload_temp_log(log) unless log.empty?
|
|
3345
|
-
log = []
|
|
3346
|
-
elsif (start_time + 15.seconds) <= Time.now
|
|
3347
|
-
@exp.upload_temp_log(log) unless log.empty?
|
|
3348
|
-
log = []
|
|
3349
|
-
start_time = Time.now
|
|
3350
|
-
end
|
|
3351
3352
|
end
|
|
3352
|
-
if
|
|
3353
|
-
|
|
3354
|
-
|
|
3355
|
-
|
|
3356
|
-
|
|
3357
|
-
|
|
3358
|
-
|
|
3359
|
-
if !log.empty?
|
|
3360
|
-
temp_log = log
|
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
|
3362
|
-
log -= temp_log
|
|
3353
|
+
if log.size >= 10
|
|
3354
|
+
@exp.upload_temp_log(log)
|
|
3355
|
+
log = []
|
|
3356
|
+
elsif (start_time + 15.seconds) <= Time.now
|
|
3357
|
+
@exp.upload_temp_log(log) unless log.empty?
|
|
3358
|
+
log = []
|
|
3359
|
+
start_time = Time.now
|
|
3363
3360
|
end
|
|
3364
|
-
rescue Errno::ENOENT => e
|
|
3365
|
-
exp_success = false
|
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
|
3367
|
-
log_error(e)
|
|
3368
3361
|
rescue => e
|
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
|
3371
3362
|
log_error(e)
|
|
3372
|
-
exit(0)
|
|
3373
3363
|
end
|
|
3374
|
-
::Process.wait pid
|
|
3375
3364
|
end
|
|
3376
3365
|
end_time = Time.now
|
|
3377
3366
|
process_running = false
|
|
@@ -3379,14 +3368,13 @@ module Cnvrg
|
|
|
3379
3368
|
if !log.empty?
|
|
3380
3369
|
|
|
3381
3370
|
temp_log = log
|
|
3382
|
-
|
|
3371
|
+
@exp.upload_temp_log(temp_log)
|
|
3383
3372
|
log -= temp_log
|
|
3384
3373
|
end
|
|
3385
3374
|
|
|
3386
3375
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
|
3387
3376
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
|
3388
|
-
exit_status
|
|
3389
|
-
if $?.exitstatus != 0
|
|
3377
|
+
if exit_status != 0
|
|
3390
3378
|
exp_success = false
|
|
3391
3379
|
end
|
|
3392
3380
|
|
|
@@ -3430,7 +3418,6 @@ module Cnvrg
|
|
|
3430
3418
|
if @exp
|
|
3431
3419
|
# log_thread.join
|
|
3432
3420
|
Thread.kill(stats_thread) if docker_stats
|
|
3433
|
-
exit_status = $?.exitstatus
|
|
3434
3421
|
if exit_status.blank?
|
|
3435
3422
|
exit_status = "-1"
|
|
3436
3423
|
end
|
|
@@ -3443,8 +3430,6 @@ module Cnvrg
|
|
|
3443
3430
|
|
|
3444
3431
|
exit(1)
|
|
3445
3432
|
end
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
3433
|
end
|
|
3449
3434
|
|
|
3450
3435
|
end
|
|
@@ -3689,7 +3674,7 @@ module Cnvrg
|
|
|
3689
3674
|
end
|
|
3690
3675
|
end
|
|
3691
3676
|
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
|
3677
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
|
3693
3678
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
|
3694
3679
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
|
3695
3680
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
|
@@ -3778,7 +3763,7 @@ module Cnvrg
|
|
|
3778
3763
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
|
3779
3764
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
|
3780
3765
|
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
|
3766
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
|
3782
3767
|
|
|
3783
3768
|
def notebook
|
|
3784
3769
|
verify_logged_in(true)
|
|
@@ -3905,7 +3890,7 @@ module Cnvrg
|
|
|
3905
3890
|
end
|
|
3906
3891
|
end
|
|
3907
3892
|
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
|
3893
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
|
3909
3894
|
method_option :machine_type, :type => :string, :default => ""
|
|
3910
3895
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
|
3911
3896
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
|
@@ -4264,7 +4249,7 @@ module Cnvrg
|
|
|
4264
4249
|
|
|
4265
4250
|
end
|
|
4266
4251
|
|
|
4267
|
-
desc 'notebook_stop', '
|
|
4252
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
|
4268
4253
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
|
4269
4254
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
|
4270
4255
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
|
@@ -4651,10 +4636,16 @@ module Cnvrg
|
|
|
4651
4636
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
|
4652
4637
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
|
4653
4638
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
|
4639
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
|
4640
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
|
4641
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
|
4654
4642
|
|
|
4655
4643
|
def collect_metrics
|
|
4656
4644
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
|
4657
4645
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
|
4646
|
+
prom_user = options[:prom_user]
|
|
4647
|
+
prom_password = options[:prom_password]
|
|
4648
|
+
name = options[:name]
|
|
4658
4649
|
|
|
4659
4650
|
translate_result = Cnvrg::API_V2.request(
|
|
4660
4651
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
|
@@ -4679,9 +4670,16 @@ module Cnvrg
|
|
|
4679
4670
|
next
|
|
4680
4671
|
end
|
|
4681
4672
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
|
4682
|
-
|
|
4673
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
4674
|
+
http.use_ssl = uri.scheme == "https"
|
|
4675
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
4676
|
+
req = Net::HTTP::Get.new uri.request_uri
|
|
4677
|
+
if prom_user.present?
|
|
4678
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
|
4679
|
+
end
|
|
4680
|
+
resp = http.request(req)
|
|
4683
4681
|
begin
|
|
4684
|
-
result = JSON.parse(resp)
|
|
4682
|
+
result = JSON.parse(resp.body)
|
|
4685
4683
|
rescue JSON::ParserError => e
|
|
4686
4684
|
log_error(e)
|
|
4687
4685
|
next
|
|
@@ -4690,13 +4688,22 @@ module Cnvrg
|
|
|
4690
4688
|
next unless data_result
|
|
4691
4689
|
|
|
4692
4690
|
if data_result.size > 1
|
|
4693
|
-
stats[query_name] = {}
|
|
4691
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
|
4694
4692
|
data_result.each_with_index do |res, i|
|
|
4695
4693
|
timestamp, value = res["value"]
|
|
4696
4694
|
uuid = res["metric"]["UUID"].presence || i
|
|
4697
4695
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
|
4698
4696
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
|
4699
|
-
|
|
4697
|
+
if query_name.include? 'block'
|
|
4698
|
+
uuid = res["metric"]["interface"].presence || i
|
|
4699
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
|
4700
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
|
4701
|
+
io_type = query_name.split('_')[1]
|
|
4702
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
|
4703
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
|
4704
|
+
else
|
|
4705
|
+
stats[query_name][uuid] = stat_value
|
|
4706
|
+
end
|
|
4700
4707
|
end
|
|
4701
4708
|
else
|
|
4702
4709
|
timestamp, value = data_result&.first&.dig('value')
|
|
@@ -4705,9 +4712,14 @@ module Cnvrg
|
|
|
4705
4712
|
if query_name.include? 'block'
|
|
4706
4713
|
stats['block_io'] = {} if stats['block_io'].blank?
|
|
4707
4714
|
io_type = query_name.split('_')[1]
|
|
4708
|
-
|
|
4715
|
+
if name.present?
|
|
4716
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
|
4717
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
|
4718
|
+
else
|
|
4719
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
|
4720
|
+
end
|
|
4709
4721
|
else
|
|
4710
|
-
stats[query_name] = stat_value
|
|
4722
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
|
4711
4723
|
end
|
|
4712
4724
|
end
|
|
4713
4725
|
end
|
|
@@ -4751,7 +4763,7 @@ module Cnvrg
|
|
|
4751
4763
|
end
|
|
4752
4764
|
|
|
4753
4765
|
|
|
4754
|
-
desc '', ''
|
|
4766
|
+
desc '', '', :hide => true
|
|
4755
4767
|
|
|
4756
4768
|
def download_built_image(image_name, image_slug)
|
|
4757
4769
|
begin
|
|
@@ -4995,7 +5007,7 @@ module Cnvrg
|
|
|
4995
5007
|
end
|
|
4996
5008
|
end
|
|
4997
5009
|
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
|
5010
|
+
desc 'experiments', 'List project experiments', :hide => true
|
|
4999
5011
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
|
5000
5012
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
|
5001
5013
|
|
|
data/lib/cnvrg/connect_job_ssh.rb
CHANGED
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def start(username, password)
|
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
|
13
|
+
def start(username, password, no_auth, port: nil)
|
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
def status()
|
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
|
19
19
|
end
|
|
20
20
|
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
|
22
22
|
command = "kubectl"
|
|
23
23
|
if kubeconfig.present?
|
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
|
25
25
|
end
|
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
|
28
28
|
`#{bashCommand}`
|
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
|
@@ -106,7 +106,7 @@ module Cnvrg
|
|
|
106
106
|
commit: commit_sha1
|
|
107
107
|
})
|
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
|
109
|
-
raise
|
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
|
110
110
|
end
|
|
111
111
|
# resolve bucket
|
|
112
112
|
res = resp['result']
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
|
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
|
18
18
|
#### params
|
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
|
20
20
|
@executer = executer
|
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
|
21
22
|
@slug = slug
|
|
22
23
|
@files_exist = files_exist
|
|
23
24
|
@container_name = container_name
|
|
24
|
-
@
|
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
|
25
28
|
@log_interval = send_log_interval
|
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
|
27
30
|
if timeout.blank? or timeout.negative?
|
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
|
39
42
|
@single_quotes = single_quotes
|
|
40
|
-
@docker_user =
|
|
41
|
-
@
|
|
42
|
-
if docker_user.present?
|
|
43
|
-
@docker_user = " --user #{docker_user}"
|
|
44
|
-
end
|
|
45
|
-
if @run_in_slave
|
|
46
|
-
if @single_quotes
|
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
|
48
|
-
else
|
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
|
50
|
-
end
|
|
51
|
-
end
|
|
43
|
+
@docker_user = docker_user
|
|
44
|
+
@use_bash = use_bash
|
|
52
45
|
@output = []
|
|
53
46
|
@errors = []
|
|
54
47
|
@exit_status = nil
|
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
|
75
68
|
|
|
76
69
|
def exec!
|
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
|
71
|
+
command_status = Status::FINISHED
|
|
78
72
|
if @command.blank?
|
|
79
73
|
@exit_status = 0
|
|
74
|
+
command_status = Status::ABORTED
|
|
80
75
|
elsif should_run?
|
|
81
76
|
send_logs(status: Status::STARTED)
|
|
82
77
|
periodic_thread_handle = periodic_thread
|
|
83
78
|
execute_command
|
|
84
79
|
else
|
|
80
|
+
command_status = Status::ABORTED
|
|
85
81
|
@exit_status = 127
|
|
86
82
|
end
|
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
|
89
85
|
log_internal(finish_log)
|
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
|
91
87
|
if periodic_thread_handle.present?
|
|
92
88
|
periodic_thread_handle.join
|
|
93
89
|
end
|
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
|
117
113
|
execute_command
|
|
118
114
|
end
|
|
119
115
|
|
|
116
|
+
def execute_command_on_slave
|
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
|
119
|
+
Timeout.timeout(@timeout) do
|
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
|
122
|
+
response = conn.post('command', data.to_json)
|
|
123
|
+
if response.to_hash[:status].to_i != 200
|
|
124
|
+
@exit_status = 129
|
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
|
126
|
+
end
|
|
127
|
+
t = FileWatch::Tail.new
|
|
128
|
+
filename = result_file
|
|
129
|
+
t.tail(filename)
|
|
130
|
+
t.subscribe do |path, line|
|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
|
133
|
+
break
|
|
134
|
+
end
|
|
135
|
+
if !@is_new_main
|
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
|
137
|
+
end
|
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
rescue Timeout::Error
|
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
|
145
|
+
@exit_status = 124
|
|
146
|
+
ensure
|
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
|
148
|
+
@exit_status
|
|
149
|
+
end
|
|
150
|
+
|
|
120
151
|
def execute_command
|
|
152
|
+
return execute_command_on_slave if @run_in_main
|
|
121
153
|
Timeout.timeout(@timeout) do
|
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
|
123
155
|
@pid = pid
|
|
124
156
|
begin
|
|
125
157
|
if stdout.present?
|
|
126
158
|
stdout.each do |line|
|
|
127
|
-
log_internal(line, level: LogLevel::
|
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
|
130
162
|
end
|
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
|
177
209
|
if level == LogLevel::PURE
|
|
178
210
|
puts(log)
|
|
179
|
-
|
|
180
|
-
|
|
211
|
+
STDOUT.flush
|
|
212
|
+
return
|
|
213
|
+
end
|
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
|
216
|
+
log_json = JSON.parse(log)
|
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
|
181
218
|
end
|
|
219
|
+
puts(to_print.to_json)
|
|
182
220
|
STDOUT.flush
|
|
221
|
+
rescue => e
|
|
222
|
+
Cnvrg::Logger.log_error(e)
|
|
183
223
|
end
|
|
184
224
|
|
|
185
225
|
def filter_logs_by_regex(logs)
|
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
|
190
230
|
end
|
|
191
231
|
end
|
|
192
232
|
end
|
|
193
|
-
end
|
|
233
|
+
end
|
data/lib/cnvrg/helpers/executer.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
require "filewatch/tail"
|
|
1
2
|
require 'cnvrg/helpers/agent'
|
|
2
3
|
class Cnvrg::Helpers::Executer
|
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
|
9
11
|
@owner = owner
|
|
10
12
|
@job_id = job_id
|
|
11
13
|
@poll_every = poll_every
|
|
14
|
+
@check_main_every = 10
|
|
12
15
|
@machine_activity = machine_activity
|
|
13
16
|
@commands_q = Queue.new
|
|
14
17
|
@files_q = Queue.new
|
|
15
18
|
@agent_id = nil
|
|
16
|
-
@
|
|
19
|
+
@main_id = nil
|
|
20
|
+
@main_start_time = nil
|
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
|
17
23
|
end
|
|
18
24
|
|
|
19
25
|
def create_file_cmd(path, content)
|
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
|
37
43
|
def executer_stats
|
|
38
44
|
return @stats if @stats.present?
|
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
|
40
|
-
@agent_id, @
|
|
46
|
+
@agent_id, @main_id = containers
|
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
|
49
|
+
# For backwards compatibility we still call this slave stats
|
|
43
50
|
@stats = {
|
|
44
51
|
pod_name: pod_name,
|
|
45
52
|
node_name: node_name,
|
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
|
54
61
|
},
|
|
55
62
|
slave: {
|
|
56
|
-
container_id: @
|
|
57
|
-
|
|
58
|
-
|
|
63
|
+
container_id: @main_id,
|
|
64
|
+
container_name: @main_name,
|
|
65
|
+
workdir: run_in_main('pwd'),
|
|
66
|
+
homedir: main_homedir,
|
|
59
67
|
spark_path: spark_path,
|
|
60
|
-
user:
|
|
61
|
-
cnvrg:
|
|
62
|
-
has_bash:
|
|
63
|
-
user_id:
|
|
64
|
-
group_id:
|
|
65
|
-
python_version:
|
|
66
|
-
python3_version:
|
|
67
|
-
pip_version:
|
|
68
|
-
pip3_version:
|
|
68
|
+
user: run_in_main( 'whoami'),
|
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
|
71
|
+
user_id: run_in_main( 'id -u'),
|
|
72
|
+
group_id: run_in_main( 'id -g'),
|
|
73
|
+
python_version: run_in_main( 'python --version'),
|
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
|
69
77
|
},
|
|
70
78
|
}
|
|
79
|
+
|
|
71
80
|
@stats
|
|
72
81
|
end
|
|
73
82
|
|
|
74
83
|
def containers
|
|
75
84
|
agent_id = nil
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
85
|
+
main_id = nil
|
|
86
|
+
timeout = 2
|
|
87
|
+
timeout = nil if !@is_new_main
|
|
88
|
+
Timeout.timeout(timeout) do
|
|
89
|
+
while agent_id.blank? or main_id.blank?
|
|
90
|
+
grep_by = @job_id
|
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
|
95
|
+
sleep(2)
|
|
96
|
+
end
|
|
84
97
|
end
|
|
85
|
-
if
|
|
86
|
-
raise "Can't find
|
|
98
|
+
if main_id.blank?
|
|
99
|
+
raise "Can't find main id"
|
|
87
100
|
end
|
|
88
|
-
[agent_id,
|
|
101
|
+
[agent_id, main_id]
|
|
102
|
+
rescue => e
|
|
103
|
+
Cnvrg::Logger.log_error(e)
|
|
104
|
+
[agent_id, main_id]
|
|
89
105
|
end
|
|
90
106
|
|
|
91
107
|
def current_homedir
|
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
|
93
109
|
end
|
|
94
110
|
|
|
95
111
|
def spark_path
|
|
96
|
-
|
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
|
97
113
|
end
|
|
98
114
|
|
|
99
|
-
def
|
|
100
|
-
|
|
115
|
+
def main_homedir()
|
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
|
101
117
|
end
|
|
102
118
|
|
|
103
|
-
def
|
|
104
|
-
|
|
119
|
+
def main_env
|
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
|
105
121
|
end
|
|
106
122
|
|
|
107
|
-
def
|
|
108
|
-
|
|
109
|
-
end
|
|
123
|
+
def run_in_main(command)
|
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
|
110
125
|
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
|
127
|
+
response = conn.post('command', data.to_json)
|
|
128
|
+
if response.to_hash[:status].to_i != 200
|
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
|
130
|
+
return ""
|
|
131
|
+
end
|
|
132
|
+
resp = []
|
|
133
|
+
lines = response.body.split("\n")
|
|
134
|
+
lines.each do |line|
|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
|
136
|
+
if line.include?("cnvrg-exit-code")
|
|
137
|
+
exit_status = line.split("=")[1].to_i
|
|
138
|
+
if exit_status != 0
|
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
|
140
|
+
return ""
|
|
141
|
+
end
|
|
142
|
+
next
|
|
143
|
+
end
|
|
144
|
+
resp << line
|
|
145
|
+
end
|
|
146
|
+
return resp.join("\n")
|
|
147
|
+
rescue => e
|
|
148
|
+
Cnvrg::Logger.log_error(e)
|
|
149
|
+
return ""
|
|
150
|
+
end
|
|
111
151
|
|
|
112
152
|
def poll
|
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
|
124
164
|
success = false
|
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
|
126
166
|
STDOUT.flush
|
|
167
|
+
wait_for_main
|
|
127
168
|
while !success and retries < 100
|
|
128
169
|
begin
|
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
|
154
195
|
end
|
|
155
196
|
end
|
|
156
197
|
|
|
198
|
+
def check_main_is_working_thread
|
|
199
|
+
while true
|
|
200
|
+
check_main_alive
|
|
201
|
+
sleep(@check_main_every)
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
|
|
157
205
|
def main_thread
|
|
158
206
|
init
|
|
159
207
|
Thread.new do
|
|
160
208
|
polling_thread
|
|
161
209
|
end
|
|
210
|
+
Thread.new do
|
|
211
|
+
check_main_is_working_thread
|
|
212
|
+
end
|
|
162
213
|
execute_cmds
|
|
163
214
|
end
|
|
164
215
|
|
|
216
|
+
def wait_for_main
|
|
217
|
+
copy_file_to_main
|
|
218
|
+
start_tiny_if_missing
|
|
219
|
+
puts("Waiting for main container")
|
|
220
|
+
STDOUT.flush
|
|
221
|
+
got_response = false
|
|
222
|
+
while !got_response do
|
|
223
|
+
begin
|
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
|
225
|
+
response = conn.get('readiness')
|
|
226
|
+
if response.to_hash[:status].to_i != 200
|
|
227
|
+
sleep(0.1)
|
|
228
|
+
next
|
|
229
|
+
else
|
|
230
|
+
puts("Client container is ready")
|
|
231
|
+
STDOUT.flush
|
|
232
|
+
@main_start_time = response.body.to_i
|
|
233
|
+
got_response = true
|
|
234
|
+
end
|
|
235
|
+
rescue => e
|
|
236
|
+
puts("Failed to connect to main")
|
|
237
|
+
puts(e)
|
|
238
|
+
STDOUT.flush
|
|
239
|
+
sleep(0.1)
|
|
240
|
+
next
|
|
241
|
+
end
|
|
242
|
+
end
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
def copy_file_to_main
|
|
246
|
+
begin
|
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
|
250
|
+
rescue => e
|
|
251
|
+
Cnvrg::Logger.log_error(e)
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
def start_tiny_if_missing
|
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
|
258
|
+
@agent_id, @main_id = containers
|
|
259
|
+
pid = Process.fork do
|
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
|
262
|
+
end
|
|
263
|
+
Process.detach(pid)
|
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
|
265
|
+
end
|
|
266
|
+
|
|
165
267
|
def execute_cmds
|
|
166
268
|
pids = []
|
|
167
269
|
while true
|
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
|
216
318
|
end
|
|
217
319
|
|
|
320
|
+
def check_main_alive
|
|
321
|
+
# Dont check before we got first response
|
|
322
|
+
return if @main_start_time == nil
|
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
|
324
|
+
response = conn.get('readiness')
|
|
325
|
+
if response.to_hash[:status].to_i != 200
|
|
326
|
+
main_start_time = 0
|
|
327
|
+
else
|
|
328
|
+
main_start_time = response.body.to_i
|
|
329
|
+
end
|
|
330
|
+
if main_start_time != @main_start_time
|
|
331
|
+
puts("Found that main restarted, restarting agent")
|
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
|
333
|
+
exit(1)
|
|
334
|
+
end
|
|
335
|
+
end
|
|
336
|
+
|
|
218
337
|
def get_pod_events(pod_name)
|
|
219
338
|
return if pod_name.blank?
|
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
|
224
343
|
return if node_name.blank?
|
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
|
226
345
|
end
|
|
346
|
+
|
|
347
|
+
def self.main_container_url
|
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
|
350
|
+
host = "slave"
|
|
351
|
+
else
|
|
352
|
+
host = "main"
|
|
353
|
+
end
|
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
|
355
|
+
else
|
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
|
357
|
+
end
|
|
358
|
+
end
|
|
359
|
+
|
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
|
361
|
+
conn = Faraday.new(
|
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
|
364
|
+
)
|
|
365
|
+
conn.options.timeout = timeout
|
|
366
|
+
conn.options.open_timeout = open_timeout
|
|
367
|
+
conn
|
|
368
|
+
end
|
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
|
@@ -5,14 +5,18 @@ module Cnvrg
|
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
|
9
11
|
def start(job_id)
|
|
12
|
+
no_auth = options["no_auth"]
|
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
|
13
16
|
pod_name = nil
|
|
14
17
|
namespace = "cnvrg"
|
|
15
18
|
ssh_ready = false
|
|
19
|
+
internal_port = options['internal_port']
|
|
16
20
|
while not ssh_ready
|
|
17
21
|
resp = @job_ssh.status()
|
|
18
22
|
status = resp["ssh_status"]
|
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
|
26
30
|
username = resp["username"]
|
|
27
31
|
pod_name = resp["pod_name"]
|
|
28
32
|
namespace = resp["namespace"]
|
|
33
|
+
internal_port = resp["port"] || internal_port
|
|
29
34
|
ssh_ready = true
|
|
30
35
|
else
|
|
31
36
|
puts("Failed to start ssh")
|
|
32
37
|
break
|
|
33
38
|
end
|
|
34
39
|
end
|
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
|
36
41
|
puts("Failed to get required params")
|
|
37
42
|
return
|
|
38
43
|
end
|
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
|
41
46
|
puts("host: 127.0.0.1")
|
|
42
47
|
puts("port: #{options["port"]}")
|
|
43
48
|
puts("username: #{username}")
|
|
44
|
-
puts("password: #{password}")
|
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
|
49
|
+
puts("password: #{password}") unless no_auth
|
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
|
46
51
|
end
|
|
47
52
|
end
|
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
|
@@ -378,14 +378,16 @@ module Cnvrg
|
|
|
378
378
|
[]
|
|
379
379
|
end
|
|
380
380
|
|
|
381
|
-
def generate_output_dir(output_dir)
|
|
381
|
+
def generate_output_dir(output_dir, local: false)
|
|
382
382
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
|
383
383
|
upload_list = []
|
|
384
384
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
|
385
|
+
if local
|
|
386
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
|
387
|
+
end
|
|
385
388
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
|
386
389
|
next if e.end_with? "/."
|
|
387
390
|
if File.directory? e
|
|
388
|
-
|
|
389
391
|
upload_list << e + "/"
|
|
390
392
|
else
|
|
391
393
|
upload_list << e
|
data/lib/cnvrg/version.rb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
module Cnvrg
|
|
2
|
-
VERSION = '
|
|
3
|
-
end
|
|
2
|
+
VERSION = '2.0.1'
|
|
3
|
+
end
|
metadata
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cnvrg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 2.0.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yochay Ettun
|
|
8
8
|
- Leah Kolben
|
|
9
|
+
- Omer Shacham
|
|
9
10
|
autorequire:
|
|
10
11
|
bindir: bin
|
|
11
12
|
cert_chain: []
|
|
12
|
-
date: 2021-
|
|
13
|
+
date: 2021-06-16 00:00:00.000000000 Z
|
|
13
14
|
dependencies:
|
|
14
15
|
- !ruby/object:Gem::Dependency
|
|
15
16
|
name: bundler
|
|
@@ -321,6 +322,20 @@ dependencies:
|
|
|
321
322
|
- - "~>"
|
|
322
323
|
- !ruby/object:Gem::Version
|
|
323
324
|
version: 0.1.1
|
|
325
|
+
- !ruby/object:Gem::Dependency
|
|
326
|
+
name: filewatch
|
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
|
328
|
+
requirements:
|
|
329
|
+
- - "~>"
|
|
330
|
+
- !ruby/object:Gem::Version
|
|
331
|
+
version: 0.9.0
|
|
332
|
+
type: :runtime
|
|
333
|
+
prerelease: false
|
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
335
|
+
requirements:
|
|
336
|
+
- - "~>"
|
|
337
|
+
- !ruby/object:Gem::Version
|
|
338
|
+
version: 0.9.0
|
|
324
339
|
- !ruby/object:Gem::Dependency
|
|
325
340
|
name: parallel
|
|
326
341
|
requirement: !ruby/object:Gem::Requirement
|