cnvrg 1.11.29 → 2.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +40 -1
- data/cnvrg.gemspec +2 -0
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli/library_cli.rb +2 -2
- data/lib/cnvrg/cli.rb +143 -73
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +177 -35
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +31 -10
- data/lib/cnvrg/version.rb +2 -2
- metadata +40 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86857e06a0d107172c161467e8ff8cb04a120d8b230c97843e91ce13c826ccce
|
4
|
+
data.tar.gz: 272fe88e1e390f2887c36c49915cc89f10a6cf9947bb98ab6fd503476ac03820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c28ab54953e47843897273f8038f5e0bfc92e101e161d1edae4477250e1334b5730b5020713572ad2482c5b0c305c5c0b2c5725b8796a44c06750eccf2924e9f
|
7
|
+
data.tar.gz: 69be0cd1fc89180ce7b878aaa8e0767b5be9a00cb0c9813c504e62677c1c839831aa6cf4fbf27aeb66aa50b756f7b28f46eb721182e588bcc4b7c65544e6a41b
|
data/Readme.md
CHANGED
@@ -14,4 +14,43 @@
|
|
14
14
|
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
15
15
|
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
16
16
|
* DEV-8539 - Bug: SDK - in Windows: e.sync doesn't perform sync
|
17
|
-
* DEV-8621 - Improvement: Add more metrics
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
18
|
+
## Version v1.11.30
|
19
|
+
2021-04-06
|
20
|
+
## Version v1.11.31
|
21
|
+
2021-04-22
|
22
|
+
## Version v1.11.32
|
23
|
+
2021-05-05
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
25
|
+
## Version v2.0.1
|
26
|
+
2021-06-13
|
27
|
+
## Version v2.0.2
|
28
|
+
2021-06-16
|
29
|
+
* DEV-9694 - Bug: Download artifacts fails on authorization error
|
30
|
+
## Version v2.0.3
|
31
|
+
2021-06-29
|
32
|
+
* DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
|
33
|
+
## Version v2.0.4
|
34
|
+
2021-07-08
|
35
|
+
* DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
|
36
|
+
## Version v2.0.5
|
37
|
+
2021-07-11
|
38
|
+
* DEV-10171 - Bug: experiment randomly fails with error - "Couldn't clone artifacts"
|
39
|
+
* DEV-10189 - Bug: CLI Sync - file/folder with a broken symlink causes sync to fail
|
40
|
+
## Version v2.0.6
|
41
|
+
2021-07-18
|
42
|
+
* DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
|
43
|
+
## Version v2.0.7
|
44
|
+
2021-07-27
|
45
|
+
* DEV-10186 - Bug: CLI - running an experiment with the --local flag gives a server error
|
46
|
+
## Version v2.0.8
|
47
|
+
2021-09-06
|
48
|
+
* DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
|
49
|
+
## Version v2.0.9
|
50
|
+
2021-09-12
|
51
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
52
|
+
## Version v2.0.10
|
53
|
+
2021-09-12
|
54
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
55
|
+
## Version v2.0.11
|
56
|
+
2021-10-21
|
data/cnvrg.gemspec
CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency 'aruba'
|
26
26
|
spec.add_development_dependency 'pry'
|
27
27
|
|
28
|
+
spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
|
28
29
|
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
29
30
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
30
31
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
38
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
39
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
40
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
41
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
42
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
43
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
74
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
77
|
success = false
|
78
78
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
79
|
+
retries += 1
|
80
80
|
next
|
81
81
|
end
|
82
82
|
rescue => e
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
114
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
117
|
success = false
|
118
118
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
119
|
+
retries += 1
|
120
120
|
next
|
121
121
|
end
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2315
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2317
|
begin
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
2328
2329
|
exp_obj = nil
|
2329
2330
|
end
|
2330
2331
|
|
2332
|
+
local = options["local"]
|
2333
|
+
|
2331
2334
|
commit_msg = options["message"]
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2336
|
commit_msg = ""
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
2351
2354
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2356
|
end
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2358
|
spec_files_to_upload = list
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
2668
2671
|
end
|
2669
2672
|
end
|
2670
2673
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2675
|
def commit_before_termination()
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
2678
2681
|
log_error(e)
|
2679
2682
|
end
|
2680
2683
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2685
|
def update_job_commit()
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
2868
2871
|
|
2869
2872
|
|
2870
2873
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2875
|
def jump(commit_sha1)
|
2873
2876
|
begin
|
2874
2877
|
verify_logged_in()
|
@@ -3003,11 +3006,12 @@ module Cnvrg
|
|
3003
3006
|
method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
|
3004
3007
|
method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
|
3005
3008
|
method_option :files, :type => :string, :aliases => ["--files"], :default => nil
|
3006
|
-
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default =>
|
3009
|
+
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
|
3007
3010
|
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
3011
3015
|
|
3012
3016
|
def sync(direct = true)
|
3013
3017
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
3030
3034
|
if run_download or options['debug_mode']
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3036
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3041
|
|
3038
3042
|
end
|
3039
3043
|
|
@@ -3143,7 +3147,7 @@ module Cnvrg
|
|
3143
3147
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3144
3148
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
3145
3149
|
:commit => commit, :image => image, :data => data, :data_commit => data_commit,
|
3146
|
-
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
|
3150
|
+
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
|
3147
3151
|
return
|
3148
3152
|
end
|
3149
3153
|
else
|
@@ -3200,6 +3204,7 @@ module Cnvrg
|
|
3200
3204
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3205
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
3206
|
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3207
|
+
method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
|
3203
3208
|
|
3204
3209
|
def exec(*cmd)
|
3205
3210
|
log = []
|
@@ -3224,6 +3229,7 @@ module Cnvrg
|
|
3224
3229
|
project_home = get_project_home
|
3225
3230
|
data_query = options["data_query"]
|
3226
3231
|
docker_stats = options["docker_stats"]
|
3232
|
+
local = options[:local] || false
|
3227
3233
|
@project = Project.new(project_home)
|
3228
3234
|
if @project.is_git
|
3229
3235
|
sync_before = false
|
@@ -3316,62 +3322,53 @@ module Cnvrg
|
|
3316
3322
|
end
|
3317
3323
|
end
|
3318
3324
|
start_time = Time.now
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3320
3325
|
if @exp.get_cmd.present?
|
3321
3326
|
cmd = @exp.get_cmd
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3324
|
-
end
|
3325
|
-
end
|
3326
|
-
if options["docker_id"].present?
|
3327
|
-
cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
|
3328
3327
|
end
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
|
3339
|
-
|
3340
|
-
|
3328
|
+
|
3329
|
+
if local
|
3330
|
+
exec_local(cmd)
|
3331
|
+
exit_status = $?.exitstatus
|
3332
|
+
|
3333
|
+
else
|
3334
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3335
|
+
result_file = "/conf/result-#{command_slug}"
|
3336
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3337
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3338
|
+
response = conn.post('command', data.to_json)
|
3339
|
+
if response.to_hash[:status].to_i != 200
|
3340
|
+
exit_status = 129
|
3341
|
+
raise StandardError.new("Cant send command to slave")
|
3342
|
+
end
|
3343
|
+
t = FileWatch::Tail.new
|
3344
|
+
filename = result_file
|
3345
|
+
lines = []
|
3346
|
+
t.tail(filename)
|
3347
|
+
t.subscribe do |path, line|
|
3348
|
+
begin
|
3349
|
+
cur_log = JSON.parse(line)
|
3350
|
+
if cur_log["type"] == "endMessage"
|
3351
|
+
exit_status = cur_log["real"].to_i
|
3352
|
+
break
|
3353
|
+
else
|
3354
|
+
puts(cur_log.to_json)
|
3355
|
+
STDOUT.flush
|
3356
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3357
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3358
|
+
log << cur_log
|
3341
3359
|
end
|
3342
|
-
log << cur_log
|
3343
3360
|
if log.size >= 10
|
3344
|
-
@exp.upload_temp_log(log)
|
3361
|
+
@exp.upload_temp_log(log)
|
3345
3362
|
log = []
|
3346
|
-
|
3363
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3347
3364
|
@exp.upload_temp_log(log) unless log.empty?
|
3348
3365
|
log = []
|
3349
3366
|
start_time = Time.now
|
3350
3367
|
end
|
3368
|
+
rescue => e
|
3369
|
+
log_error(e)
|
3351
3370
|
end
|
3352
|
-
if stderr
|
3353
|
-
stderr.each do |err|
|
3354
|
-
log << {time: Time.now, message: err, type: "stderr"}
|
3355
|
-
end
|
3356
|
-
end
|
3357
|
-
rescue Errno::EIO => e
|
3358
|
-
log_error(e)
|
3359
|
-
if !log.empty?
|
3360
|
-
temp_log = log
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3362
|
-
log -= temp_log
|
3363
|
-
end
|
3364
|
-
rescue Errno::ENOENT => e
|
3365
|
-
exp_success = false
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3367
|
-
log_error(e)
|
3368
|
-
rescue => e
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3371
|
-
log_error(e)
|
3372
|
-
exit(0)
|
3373
3371
|
end
|
3374
|
-
::Process.wait pid
|
3375
3372
|
end
|
3376
3373
|
end_time = Time.now
|
3377
3374
|
process_running = false
|
@@ -3379,14 +3376,13 @@ module Cnvrg
|
|
3379
3376
|
if !log.empty?
|
3380
3377
|
|
3381
3378
|
temp_log = log
|
3382
|
-
|
3379
|
+
@exp.upload_temp_log(temp_log)
|
3383
3380
|
log -= temp_log
|
3384
3381
|
end
|
3385
3382
|
|
3386
3383
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3387
3384
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3388
|
-
exit_status
|
3389
|
-
if $?.exitstatus != 0
|
3385
|
+
if exit_status != 0
|
3390
3386
|
exp_success = false
|
3391
3387
|
end
|
3392
3388
|
|
@@ -3430,7 +3426,6 @@ module Cnvrg
|
|
3430
3426
|
if @exp
|
3431
3427
|
# log_thread.join
|
3432
3428
|
Thread.kill(stats_thread) if docker_stats
|
3433
|
-
exit_status = $?.exitstatus
|
3434
3429
|
if exit_status.blank?
|
3435
3430
|
exit_status = "-1"
|
3436
3431
|
end
|
@@ -3443,8 +3438,6 @@ module Cnvrg
|
|
3443
3438
|
|
3444
3439
|
exit(1)
|
3445
3440
|
end
|
3446
|
-
|
3447
|
-
|
3448
3441
|
end
|
3449
3442
|
|
3450
3443
|
end
|
@@ -3689,7 +3682,7 @@ module Cnvrg
|
|
3689
3682
|
end
|
3690
3683
|
end
|
3691
3684
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
3685
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3693
3686
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3694
3687
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3695
3688
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3778,7 +3771,7 @@ module Cnvrg
|
|
3778
3771
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3779
3772
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3780
3773
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3774
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3782
3775
|
|
3783
3776
|
def notebook
|
3784
3777
|
verify_logged_in(true)
|
@@ -3905,7 +3898,7 @@ module Cnvrg
|
|
3905
3898
|
end
|
3906
3899
|
end
|
3907
3900
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3901
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3909
3902
|
method_option :machine_type, :type => :string, :default => ""
|
3910
3903
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3911
3904
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4264,7 +4257,7 @@ module Cnvrg
|
|
4264
4257
|
|
4265
4258
|
end
|
4266
4259
|
|
4267
|
-
desc 'notebook_stop', '
|
4260
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4268
4261
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4269
4262
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4270
4263
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4651,15 +4644,21 @@ module Cnvrg
|
|
4651
4644
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
4645
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
4646
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4647
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4648
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4649
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4654
4650
|
|
4655
4651
|
def collect_metrics
|
4656
4652
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
4653
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4654
|
+
prom_user = options[:prom_user]
|
4655
|
+
prom_password = options[:prom_password]
|
4656
|
+
name = options[:name]
|
4658
4657
|
|
4659
4658
|
translate_result = Cnvrg::API_V2.request(
|
4660
4659
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
4661
4660
|
'GET',
|
4662
|
-
{ gpu: options[:gpu] }
|
4661
|
+
{ gpu: options[:gpu], gaudi: options[:gaudi] }
|
4663
4662
|
)
|
4664
4663
|
|
4665
4664
|
is_machine = options[:machine]
|
@@ -4679,9 +4678,16 @@ module Cnvrg
|
|
4679
4678
|
next
|
4680
4679
|
end
|
4681
4680
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
-
|
4681
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4682
|
+
http.use_ssl = uri.scheme == "https"
|
4683
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4684
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4685
|
+
if prom_user.present?
|
4686
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4687
|
+
end
|
4688
|
+
resp = http.request(req)
|
4683
4689
|
begin
|
4684
|
-
result = JSON.parse(resp)
|
4690
|
+
result = JSON.parse(resp.body)
|
4685
4691
|
rescue JSON::ParserError => e
|
4686
4692
|
log_error(e)
|
4687
4693
|
next
|
@@ -4690,13 +4696,22 @@ module Cnvrg
|
|
4690
4696
|
next unless data_result
|
4691
4697
|
|
4692
4698
|
if data_result.size > 1
|
4693
|
-
stats[query_name] = {}
|
4699
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4694
4700
|
data_result.each_with_index do |res, i|
|
4695
4701
|
timestamp, value = res["value"]
|
4696
4702
|
uuid = res["metric"]["UUID"].presence || i
|
4697
4703
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
4704
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
-
|
4705
|
+
if query_name.include? 'block'
|
4706
|
+
uuid = res["metric"]["interface"].presence || i
|
4707
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4708
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4709
|
+
io_type = query_name.split('_')[1]
|
4710
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4711
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4712
|
+
else
|
4713
|
+
stats[query_name][uuid] = stat_value
|
4714
|
+
end
|
4700
4715
|
end
|
4701
4716
|
else
|
4702
4717
|
timestamp, value = data_result&.first&.dig('value')
|
@@ -4705,9 +4720,14 @@ module Cnvrg
|
|
4705
4720
|
if query_name.include? 'block'
|
4706
4721
|
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
4722
|
io_type = query_name.split('_')[1]
|
4708
|
-
|
4723
|
+
if name.present?
|
4724
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4725
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4726
|
+
else
|
4727
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4728
|
+
end
|
4709
4729
|
else
|
4710
|
-
stats[query_name] = stat_value
|
4730
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4711
4731
|
end
|
4712
4732
|
end
|
4713
4733
|
end
|
@@ -4751,7 +4771,7 @@ module Cnvrg
|
|
4751
4771
|
end
|
4752
4772
|
|
4753
4773
|
|
4754
|
-
desc '', ''
|
4774
|
+
desc '', '', :hide => true
|
4755
4775
|
|
4756
4776
|
def download_built_image(image_name, image_slug)
|
4757
4777
|
begin
|
@@ -4995,7 +5015,7 @@ module Cnvrg
|
|
4995
5015
|
end
|
4996
5016
|
end
|
4997
5017
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
5018
|
+
desc 'experiments', 'List project experiments', :hide => true
|
4999
5019
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
5000
5020
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
5001
5021
|
|
@@ -5864,6 +5884,56 @@ module Cnvrg
|
|
5864
5884
|
end
|
5865
5885
|
end
|
5866
5886
|
|
5887
|
+
def exec_local(cmd)
|
5888
|
+
PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
|
5889
|
+
begin
|
5890
|
+
stdout.each do |line|
|
5891
|
+
cur_time = Time.now
|
5892
|
+
real_time = Time.now - real
|
5893
|
+
cur_log = {time: cur_time,
|
5894
|
+
message: line,
|
5895
|
+
type: "stdout",
|
5896
|
+
real: real_time
|
5897
|
+
}
|
5898
|
+
if print_log
|
5899
|
+
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
5900
|
+
end
|
5901
|
+
log << cur_log
|
5902
|
+
if log.size >= 10
|
5903
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5904
|
+
log = []
|
5905
|
+
elsif (start_time + 15.seconds) <= Time.now
|
5906
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5907
|
+
log = []
|
5908
|
+
start_time = Time.now
|
5909
|
+
end
|
5910
|
+
end
|
5911
|
+
if stderr
|
5912
|
+
stderr.each do |err|
|
5913
|
+
log << {time: Time.now, message: err, type: "stderr"}
|
5914
|
+
end
|
5915
|
+
end
|
5916
|
+
rescue Errno::EIO => e
|
5917
|
+
log_error(e)
|
5918
|
+
if !log.empty?
|
5919
|
+
temp_log = log
|
5920
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
5921
|
+
log -= temp_log
|
5922
|
+
end
|
5923
|
+
rescue Errno::ENOENT => e
|
5924
|
+
exp_success = false
|
5925
|
+
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
5926
|
+
log_error(e)
|
5927
|
+
rescue => e
|
5928
|
+
res = @exp.end(log, 1, start_commit, 0, 0)
|
5929
|
+
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
5930
|
+
log_error(e)
|
5931
|
+
exit(0)
|
5932
|
+
end
|
5933
|
+
::Process.wait pid
|
5934
|
+
end
|
5935
|
+
end
|
5936
|
+
|
5867
5937
|
end
|
5868
5938
|
end
|
5869
5939
|
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
@@ -730,7 +730,11 @@ module Cnvrg
|
|
730
730
|
end
|
731
731
|
res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
|
732
732
|
unless Cnvrg::CLI.is_response_success(res, false)
|
733
|
-
|
733
|
+
begin
|
734
|
+
puts(res)
|
735
|
+
rescue
|
736
|
+
end
|
737
|
+
raise StandardError.new("Cant download files from the server.")
|
734
738
|
end
|
735
739
|
self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
|
736
740
|
end
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
def execute_command_on_slave
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
119
|
+
Timeout.timeout(@timeout) do
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
122
|
+
response = conn.post('command', data.to_json)
|
123
|
+
if response.to_hash[:status].to_i != 200
|
124
|
+
@exit_status = 129
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
126
|
+
end
|
127
|
+
t = FileWatch::Tail.new
|
128
|
+
filename = result_file
|
129
|
+
t.tail(filename)
|
130
|
+
t.subscribe do |path, line|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
133
|
+
break
|
134
|
+
end
|
135
|
+
if !@is_new_main
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
137
|
+
end
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
140
|
+
end
|
141
|
+
end
|
142
|
+
rescue Timeout::Error
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
145
|
+
@exit_status = 124
|
146
|
+
ensure
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
148
|
+
@exit_status
|
149
|
+
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
@@ -1,7 +1,9 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
-
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
6
|
+
HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
7
9
|
# server (poll commands) and let the server know the status of this executer.
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if (!@is_new_main || HAS_DOCKER)
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
9
11
|
def start(job_id)
|
12
|
+
no_auth = options["no_auth"]
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
13
16
|
pod_name = nil
|
14
17
|
namespace = "cnvrg"
|
15
18
|
ssh_ready = false
|
19
|
+
internal_port = options['internal_port']
|
16
20
|
while not ssh_ready
|
17
21
|
resp = @job_ssh.status()
|
18
22
|
status = resp["ssh_status"]
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
26
30
|
username = resp["username"]
|
27
31
|
pod_name = resp["pod_name"]
|
28
32
|
namespace = resp["namespace"]
|
33
|
+
internal_port = resp["port"] || internal_port
|
29
34
|
ssh_ready = true
|
30
35
|
else
|
31
36
|
puts("Failed to start ssh")
|
32
37
|
break
|
33
38
|
end
|
34
39
|
end
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
36
41
|
puts("Failed to get required params")
|
37
42
|
return
|
38
43
|
end
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
41
46
|
puts("host: 127.0.0.1")
|
42
47
|
puts("port: #{options["port"]}")
|
43
48
|
puts("username: #{username}")
|
44
|
-
puts("password: #{password}")
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
49
|
+
puts("password: #{password}") unless no_auth
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
46
51
|
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
|
|
328
328
|
end
|
329
329
|
|
330
330
|
def get_storage_client
|
331
|
-
|
332
|
-
|
333
|
-
|
331
|
+
client_params = nil
|
332
|
+
i = 0
|
333
|
+
begin
|
334
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
|
335
|
+
unless Cnvrg::CLI.is_response_success(response, false)
|
336
|
+
raise StandardError.new("Can't find project credentials")
|
337
|
+
end
|
334
338
|
client_params = response['client']
|
335
|
-
|
336
|
-
|
339
|
+
rescue StandardError
|
340
|
+
i += 1
|
341
|
+
sleep(5 * i)
|
342
|
+
retry if i < 10
|
337
343
|
client_params = get_storage_client_fallback
|
338
344
|
end
|
339
|
-
|
345
|
+
raise StandardError.new("Can't find project credentials") unless client_params
|
340
346
|
Cnvrg::Downloader::Client.factory(client_params)
|
341
347
|
end
|
342
348
|
|
@@ -378,14 +384,18 @@ module Cnvrg
|
|
378
384
|
[]
|
379
385
|
end
|
380
386
|
|
381
|
-
def generate_output_dir(output_dir)
|
387
|
+
def generate_output_dir(output_dir, local: false)
|
382
388
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
389
|
upload_list = []
|
390
|
+
list = []
|
384
391
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
392
|
+
if local
|
393
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
394
|
+
end
|
395
|
+
list.uniq!
|
385
396
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
397
|
next if e.end_with? "/."
|
387
398
|
if File.directory? e
|
388
|
-
|
389
399
|
upload_list << e + "/"
|
390
400
|
else
|
391
401
|
upload_list << e
|
@@ -447,6 +457,10 @@ module Cnvrg
|
|
447
457
|
if list_ignore_new.include? label
|
448
458
|
next
|
449
459
|
end
|
460
|
+
if File.symlink?(e)
|
461
|
+
Cnvrg::Logger.log_info("Skipping symlink #{e}")
|
462
|
+
next
|
463
|
+
end
|
450
464
|
if File.directory? e
|
451
465
|
dir_name = (label.ends_with? "/") ? label : (label + "/")
|
452
466
|
tree_idx[dir_name] = nil
|
@@ -647,7 +661,11 @@ module Cnvrg
|
|
647
661
|
|
648
662
|
def fetch_webapp_slugs(webapp_slug, slugs: nil)
|
649
663
|
response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
|
650
|
-
|
664
|
+
|
665
|
+
if response.key?("experiments")
|
666
|
+
return response["experiments"]
|
667
|
+
end
|
668
|
+
return response["data"]["attributes"]["experiments"]
|
651
669
|
rescue
|
652
670
|
slugs
|
653
671
|
end
|
@@ -699,8 +717,11 @@ module Cnvrg
|
|
699
717
|
res = JSON.parse(resp['result']) rescue nil
|
700
718
|
return if res.blank?
|
701
719
|
config = self.get_config
|
702
|
-
config[:is_git] = res['git']
|
703
720
|
config[:project_name] = res['title']
|
721
|
+
config[:project_slug] = @slug
|
722
|
+
config[:owner] = @owner
|
723
|
+
config[:git] = res['git'] || false
|
724
|
+
config[:is_git] = res['git'] || false
|
704
725
|
self.set_config(config)
|
705
726
|
end
|
706
727
|
|
data/lib/cnvrg/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Cnvrg
|
2
|
-
VERSION = '
|
3
|
-
end
|
2
|
+
VERSION = '2.0.11'
|
3
|
+
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
9
|
- Omer Shacham
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-
|
13
|
+
date: 2021-10-21 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -96,6 +96,26 @@ dependencies:
|
|
96
96
|
- - ">="
|
97
97
|
- !ruby/object:Gem::Version
|
98
98
|
version: '0'
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: ffi
|
101
|
+
requirement: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - "~>"
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '1.9'
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 1.9.10
|
109
|
+
type: :runtime
|
110
|
+
prerelease: false
|
111
|
+
version_requirements: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - "~>"
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '1.9'
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.9.10
|
99
119
|
- !ruby/object:Gem::Dependency
|
100
120
|
name: mimemagic
|
101
121
|
requirement: !ruby/object:Gem::Requirement
|
@@ -302,6 +322,20 @@ dependencies:
|
|
302
322
|
- - "~>"
|
303
323
|
- !ruby/object:Gem::Version
|
304
324
|
version: 0.1.1
|
325
|
+
- !ruby/object:Gem::Dependency
|
326
|
+
name: filewatch
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
328
|
+
requirements:
|
329
|
+
- - "~>"
|
330
|
+
- !ruby/object:Gem::Version
|
331
|
+
version: 0.9.0
|
332
|
+
type: :runtime
|
333
|
+
prerelease: false
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - "~>"
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: 0.9.0
|
305
339
|
- !ruby/object:Gem::Dependency
|
306
340
|
name: parallel
|
307
341
|
requirement: !ruby/object:Gem::Requirement
|
@@ -439,7 +473,7 @@ files:
|
|
439
473
|
homepage: https://cnvrg.io
|
440
474
|
licenses: []
|
441
475
|
metadata: {}
|
442
|
-
post_install_message:
|
476
|
+
post_install_message:
|
443
477
|
rdoc_options: []
|
444
478
|
require_paths:
|
445
479
|
- lib
|
@@ -454,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
454
488
|
- !ruby/object:Gem::Version
|
455
489
|
version: '0'
|
456
490
|
requirements: []
|
457
|
-
rubygems_version: 3.
|
458
|
-
signing_key:
|
491
|
+
rubygems_version: 3.2.22
|
492
|
+
signing_key:
|
459
493
|
specification_version: 4
|
460
494
|
summary: A CLI tool for interacting with cnvrg.io.
|
461
495
|
test_files: []
|