cnvrg 1.11.31 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +6 -1
- data/cnvrg.gemspec +2 -1
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli.rb +83 -71
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +1 -1
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +176 -34
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +4 -2
- data/lib/cnvrg/version.rb +2 -2
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ab82882b2bb6c9093751cd560eaa4ccb7540fad9b9a34a81245721538dd37a5b
|
4
|
+
data.tar.gz: e35299be744d985a37794288a269ba2bb55cf64d3fcd702c6c6147bd4f5d740d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52b51bb4942583e9ac3eceab24891d252ef7e31d7f1a2a87d5c8f4586f914f33f2d9cc7addc09ab5aa3652a9ba939ef5a37bdb9fc82d18e4e0e0a61126265d17
|
7
|
+
data.tar.gz: df735e631778b5d36d296c33903719a2ba7832ce2a9e218eea033ad246f8c4ce044a7b7d5562f68d901a1ee32d62a88559f886da2a0341d8bf7c51b50c25660c
|
data/Readme.md
CHANGED
data/cnvrg.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'cnvrg/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = 'cnvrg'
|
8
8
|
spec.version = Cnvrg::VERSION
|
9
|
-
spec.authors = ['Yochay Ettun', 'Leah Kolben']
|
9
|
+
spec.authors = ['Yochay Ettun', 'Leah Kolben', 'Omer Shacham']
|
10
10
|
spec.email = ['info@cnvrg.io']
|
11
11
|
spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
|
12
12
|
spec.description = %q{A CLI tool for interacting with cnvrg.io.}
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
40
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
41
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
42
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
43
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
44
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
74
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
77
|
success = false
|
78
78
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
79
|
+
retries += 1
|
80
80
|
next
|
81
81
|
end
|
82
82
|
rescue => e
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
114
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
117
|
success = false
|
118
118
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
119
|
+
retries += 1
|
120
120
|
next
|
121
121
|
end
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2315
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2317
|
begin
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
2328
2329
|
exp_obj = nil
|
2329
2330
|
end
|
2330
2331
|
|
2332
|
+
local = options["local"]
|
2333
|
+
|
2331
2334
|
commit_msg = options["message"]
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2336
|
commit_msg = ""
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
2351
2354
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2356
|
end
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2358
|
spec_files_to_upload = list
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
2668
2671
|
end
|
2669
2672
|
end
|
2670
2673
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2675
|
def commit_before_termination()
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
2678
2681
|
log_error(e)
|
2679
2682
|
end
|
2680
2683
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2685
|
def update_job_commit()
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
2868
2871
|
|
2869
2872
|
|
2870
2873
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2875
|
def jump(commit_sha1)
|
2873
2876
|
begin
|
2874
2877
|
verify_logged_in()
|
@@ -3008,6 +3011,7 @@ module Cnvrg
|
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => false
|
3011
3015
|
|
3012
3016
|
def sync(direct = true)
|
3013
3017
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
3030
3034
|
if run_download or options['debug_mode']
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3036
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3041
|
|
3038
3042
|
end
|
3039
3043
|
|
@@ -3316,62 +3320,47 @@ module Cnvrg
|
|
3316
3320
|
end
|
3317
3321
|
end
|
3318
3322
|
start_time = Time.now
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3320
3323
|
if @exp.get_cmd.present?
|
3321
3324
|
cmd = @exp.get_cmd
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3324
|
-
end
|
3325
3325
|
end
|
3326
|
-
|
3327
|
-
|
3326
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3327
|
+
result_file = "/conf/result-#{command_slug}"
|
3328
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3329
|
+
|
3330
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3331
|
+
response = conn.post('command', data.to_json)
|
3332
|
+
if response.to_hash[:status].to_i != 200
|
3333
|
+
exit_status = 129
|
3334
|
+
raise StandardError.new("Cant send command to slave")
|
3328
3335
|
end
|
3329
|
-
|
3336
|
+
t = FileWatch::Tail.new
|
3337
|
+
filename = result_file
|
3338
|
+
lines = []
|
3339
|
+
t.tail(filename)
|
3340
|
+
t.subscribe do |path, line|
|
3330
3341
|
begin
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
|
3339
|
-
|
3340
|
-
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
3341
|
-
end
|
3342
|
+
cur_log = JSON.parse(line)
|
3343
|
+
if cur_log["type"] == "endMessage"
|
3344
|
+
exit_status = cur_log["real"].to_i
|
3345
|
+
break
|
3346
|
+
else
|
3347
|
+
puts(cur_log.to_json)
|
3348
|
+
STDOUT.flush
|
3349
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3350
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3342
3351
|
log << cur_log
|
3343
|
-
if log.size >= 10
|
3344
|
-
@exp.upload_temp_log(log) unless log.empty?
|
3345
|
-
log = []
|
3346
|
-
elsif (start_time + 15.seconds) <= Time.now
|
3347
|
-
@exp.upload_temp_log(log) unless log.empty?
|
3348
|
-
log = []
|
3349
|
-
start_time = Time.now
|
3350
|
-
end
|
3351
3352
|
end
|
3352
|
-
if
|
3353
|
-
|
3354
|
-
|
3355
|
-
|
3356
|
-
|
3357
|
-
|
3358
|
-
|
3359
|
-
if !log.empty?
|
3360
|
-
temp_log = log
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3362
|
-
log -= temp_log
|
3353
|
+
if log.size >= 10
|
3354
|
+
@exp.upload_temp_log(log)
|
3355
|
+
log = []
|
3356
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3357
|
+
@exp.upload_temp_log(log) unless log.empty?
|
3358
|
+
log = []
|
3359
|
+
start_time = Time.now
|
3363
3360
|
end
|
3364
|
-
rescue Errno::ENOENT => e
|
3365
|
-
exp_success = false
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3367
|
-
log_error(e)
|
3368
3361
|
rescue => e
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3371
3362
|
log_error(e)
|
3372
|
-
exit(0)
|
3373
3363
|
end
|
3374
|
-
::Process.wait pid
|
3375
3364
|
end
|
3376
3365
|
end_time = Time.now
|
3377
3366
|
process_running = false
|
@@ -3379,14 +3368,13 @@ module Cnvrg
|
|
3379
3368
|
if !log.empty?
|
3380
3369
|
|
3381
3370
|
temp_log = log
|
3382
|
-
|
3371
|
+
@exp.upload_temp_log(temp_log)
|
3383
3372
|
log -= temp_log
|
3384
3373
|
end
|
3385
3374
|
|
3386
3375
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3387
3376
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3388
|
-
exit_status
|
3389
|
-
if $?.exitstatus != 0
|
3377
|
+
if exit_status != 0
|
3390
3378
|
exp_success = false
|
3391
3379
|
end
|
3392
3380
|
|
@@ -3430,7 +3418,6 @@ module Cnvrg
|
|
3430
3418
|
if @exp
|
3431
3419
|
# log_thread.join
|
3432
3420
|
Thread.kill(stats_thread) if docker_stats
|
3433
|
-
exit_status = $?.exitstatus
|
3434
3421
|
if exit_status.blank?
|
3435
3422
|
exit_status = "-1"
|
3436
3423
|
end
|
@@ -3443,8 +3430,6 @@ module Cnvrg
|
|
3443
3430
|
|
3444
3431
|
exit(1)
|
3445
3432
|
end
|
3446
|
-
|
3447
|
-
|
3448
3433
|
end
|
3449
3434
|
|
3450
3435
|
end
|
@@ -3689,7 +3674,7 @@ module Cnvrg
|
|
3689
3674
|
end
|
3690
3675
|
end
|
3691
3676
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
3677
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3693
3678
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3694
3679
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3695
3680
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3778,7 +3763,7 @@ module Cnvrg
|
|
3778
3763
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3779
3764
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3780
3765
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3766
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3782
3767
|
|
3783
3768
|
def notebook
|
3784
3769
|
verify_logged_in(true)
|
@@ -3905,7 +3890,7 @@ module Cnvrg
|
|
3905
3890
|
end
|
3906
3891
|
end
|
3907
3892
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3893
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3909
3894
|
method_option :machine_type, :type => :string, :default => ""
|
3910
3895
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3911
3896
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4264,7 +4249,7 @@ module Cnvrg
|
|
4264
4249
|
|
4265
4250
|
end
|
4266
4251
|
|
4267
|
-
desc 'notebook_stop', '
|
4252
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4268
4253
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4269
4254
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4270
4255
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4651,10 +4636,16 @@ module Cnvrg
|
|
4651
4636
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
4637
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
4638
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4639
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4640
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4641
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4654
4642
|
|
4655
4643
|
def collect_metrics
|
4656
4644
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
4645
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4646
|
+
prom_user = options[:prom_user]
|
4647
|
+
prom_password = options[:prom_password]
|
4648
|
+
name = options[:name]
|
4658
4649
|
|
4659
4650
|
translate_result = Cnvrg::API_V2.request(
|
4660
4651
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
@@ -4679,9 +4670,16 @@ module Cnvrg
|
|
4679
4670
|
next
|
4680
4671
|
end
|
4681
4672
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
-
|
4673
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4674
|
+
http.use_ssl = uri.scheme == "https"
|
4675
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4676
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4677
|
+
if prom_user.present?
|
4678
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4679
|
+
end
|
4680
|
+
resp = http.request(req)
|
4683
4681
|
begin
|
4684
|
-
result = JSON.parse(resp)
|
4682
|
+
result = JSON.parse(resp.body)
|
4685
4683
|
rescue JSON::ParserError => e
|
4686
4684
|
log_error(e)
|
4687
4685
|
next
|
@@ -4690,13 +4688,22 @@ module Cnvrg
|
|
4690
4688
|
next unless data_result
|
4691
4689
|
|
4692
4690
|
if data_result.size > 1
|
4693
|
-
stats[query_name] = {}
|
4691
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4694
4692
|
data_result.each_with_index do |res, i|
|
4695
4693
|
timestamp, value = res["value"]
|
4696
4694
|
uuid = res["metric"]["UUID"].presence || i
|
4697
4695
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
4696
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
-
|
4697
|
+
if query_name.include? 'block'
|
4698
|
+
uuid = res["metric"]["interface"].presence || i
|
4699
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4700
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4701
|
+
io_type = query_name.split('_')[1]
|
4702
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4703
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4704
|
+
else
|
4705
|
+
stats[query_name][uuid] = stat_value
|
4706
|
+
end
|
4700
4707
|
end
|
4701
4708
|
else
|
4702
4709
|
timestamp, value = data_result&.first&.dig('value')
|
@@ -4705,9 +4712,14 @@ module Cnvrg
|
|
4705
4712
|
if query_name.include? 'block'
|
4706
4713
|
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
4714
|
io_type = query_name.split('_')[1]
|
4708
|
-
|
4715
|
+
if name.present?
|
4716
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4717
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4718
|
+
else
|
4719
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4720
|
+
end
|
4709
4721
|
else
|
4710
|
-
stats[query_name] = stat_value
|
4722
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4711
4723
|
end
|
4712
4724
|
end
|
4713
4725
|
end
|
@@ -4751,7 +4763,7 @@ module Cnvrg
|
|
4751
4763
|
end
|
4752
4764
|
|
4753
4765
|
|
4754
|
-
desc '', ''
|
4766
|
+
desc '', '', :hide => true
|
4755
4767
|
|
4756
4768
|
def download_built_image(image_name, image_slug)
|
4757
4769
|
begin
|
@@ -4995,7 +5007,7 @@ module Cnvrg
|
|
4995
5007
|
end
|
4996
5008
|
end
|
4997
5009
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
5010
|
+
desc 'experiments', 'List project experiments', :hide => true
|
4999
5011
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
5000
5012
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
5001
5013
|
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
def execute_command_on_slave
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
119
|
+
Timeout.timeout(@timeout) do
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
122
|
+
response = conn.post('command', data.to_json)
|
123
|
+
if response.to_hash[:status].to_i != 200
|
124
|
+
@exit_status = 129
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
126
|
+
end
|
127
|
+
t = FileWatch::Tail.new
|
128
|
+
filename = result_file
|
129
|
+
t.tail(filename)
|
130
|
+
t.subscribe do |path, line|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
133
|
+
break
|
134
|
+
end
|
135
|
+
if !@is_new_main
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
137
|
+
end
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
140
|
+
end
|
141
|
+
end
|
142
|
+
rescue Timeout::Error
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
145
|
+
@exit_status = 124
|
146
|
+
ensure
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
148
|
+
@exit_status
|
149
|
+
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
4
6
|
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if !@is_new_main
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
9
11
|
def start(job_id)
|
12
|
+
no_auth = options["no_auth"]
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
13
16
|
pod_name = nil
|
14
17
|
namespace = "cnvrg"
|
15
18
|
ssh_ready = false
|
19
|
+
internal_port = options['internal_port']
|
16
20
|
while not ssh_ready
|
17
21
|
resp = @job_ssh.status()
|
18
22
|
status = resp["ssh_status"]
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
26
30
|
username = resp["username"]
|
27
31
|
pod_name = resp["pod_name"]
|
28
32
|
namespace = resp["namespace"]
|
33
|
+
internal_port = resp["port"] || internal_port
|
29
34
|
ssh_ready = true
|
30
35
|
else
|
31
36
|
puts("Failed to start ssh")
|
32
37
|
break
|
33
38
|
end
|
34
39
|
end
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
36
41
|
puts("Failed to get required params")
|
37
42
|
return
|
38
43
|
end
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
41
46
|
puts("host: 127.0.0.1")
|
42
47
|
puts("port: #{options["port"]}")
|
43
48
|
puts("username: #{username}")
|
44
|
-
puts("password: #{password}")
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
49
|
+
puts("password: #{password}") unless no_auth
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
46
51
|
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -378,14 +378,16 @@ module Cnvrg
|
|
378
378
|
[]
|
379
379
|
end
|
380
380
|
|
381
|
-
def generate_output_dir(output_dir)
|
381
|
+
def generate_output_dir(output_dir, local: false)
|
382
382
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
383
|
upload_list = []
|
384
384
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
385
|
+
if local
|
386
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
387
|
+
end
|
385
388
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
389
|
next if e.end_with? "/."
|
387
390
|
if File.directory? e
|
388
|
-
|
389
391
|
upload_list << e + "/"
|
390
392
|
else
|
391
393
|
upload_list << e
|
data/lib/cnvrg/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Cnvrg
|
2
|
-
VERSION = '
|
3
|
-
end
|
2
|
+
VERSION = '2.0.1'
|
3
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
|
+
- Omer Shacham
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2021-
|
13
|
+
date: 2021-06-16 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: bundler
|
@@ -321,6 +322,20 @@ dependencies:
|
|
321
322
|
- - "~>"
|
322
323
|
- !ruby/object:Gem::Version
|
323
324
|
version: 0.1.1
|
325
|
+
- !ruby/object:Gem::Dependency
|
326
|
+
name: filewatch
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
328
|
+
requirements:
|
329
|
+
- - "~>"
|
330
|
+
- !ruby/object:Gem::Version
|
331
|
+
version: 0.9.0
|
332
|
+
type: :runtime
|
333
|
+
prerelease: false
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - "~>"
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: 0.9.0
|
324
339
|
- !ruby/object:Gem::Dependency
|
325
340
|
name: parallel
|
326
341
|
requirement: !ruby/object:Gem::Requirement
|