cnvrg 1.11.29 → 2.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Readme.md +40 -1
- data/cnvrg.gemspec +2 -0
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli/library_cli.rb +2 -2
- data/lib/cnvrg/cli.rb +143 -73
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +177 -35
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +31 -10
- data/lib/cnvrg/version.rb +2 -2
- metadata +40 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86857e06a0d107172c161467e8ff8cb04a120d8b230c97843e91ce13c826ccce
|
4
|
+
data.tar.gz: 272fe88e1e390f2887c36c49915cc89f10a6cf9947bb98ab6fd503476ac03820
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c28ab54953e47843897273f8038f5e0bfc92e101e161d1edae4477250e1334b5730b5020713572ad2482c5b0c305c5c0b2c5725b8796a44c06750eccf2924e9f
|
7
|
+
data.tar.gz: 69be0cd1fc89180ce7b878aaa8e0767b5be9a00cb0c9813c504e62677c1c839831aa6cf4fbf27aeb66aa50b756f7b28f46eb721182e588bcc4b7c65544e6a41b
|
data/Readme.md
CHANGED
@@ -14,4 +14,43 @@
|
|
14
14
|
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
15
15
|
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
16
16
|
* DEV-8539 - Bug: SDK - in windows: e.sync doesnt perform sync
|
17
|
-
* DEV-8621 - Improvement: Add more metrics
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
18
|
+
## Version v1.11.30
|
19
|
+
2021-04-06
|
20
|
+
## Version v1.11.31
|
21
|
+
2021-04-22
|
22
|
+
## Version v1.11.32
|
23
|
+
2021-05-05
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
25
|
+
## Version v2.0.1
|
26
|
+
2021-06-13
|
27
|
+
## Version v2.0.2
|
28
|
+
2021-06-16
|
29
|
+
* DEV-9694 - Bug: Download artifacts fails on authorization error
|
30
|
+
## Version v2.0.3
|
31
|
+
2021-06-29
|
32
|
+
* DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
|
33
|
+
## Version v2.0.4
|
34
|
+
2021-07-08
|
35
|
+
* DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
|
36
|
+
## Version v2.0.5
|
37
|
+
2021-07-11
|
38
|
+
* DEV-10171 - Bug: experiment randomly fails with error- "Couldn't clone artifacts"
|
39
|
+
* DEV-10189 - Bug: CLI Sync -file/folder with broken symlink will cause sync to fail
|
40
|
+
## Version v2.0.6
|
41
|
+
2021-07-18
|
42
|
+
* DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
|
43
|
+
## Version v2.0.7
|
44
|
+
2021-07-27
|
45
|
+
* DEV-10186 - Bug: CLI/run an experiment with --local tag giver server error
|
46
|
+
## Version v2.0.8
|
47
|
+
2021-09-06
|
48
|
+
* DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
|
49
|
+
## Version v2.0.9
|
50
|
+
2021-09-12
|
51
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
52
|
+
## Version v2.0.10
|
53
|
+
2021-09-12
|
54
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
55
|
+
## Version v2.0.11
|
56
|
+
2021-10-21
|
data/cnvrg.gemspec
CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency 'aruba'
|
26
26
|
spec.add_development_dependency 'pry'
|
27
27
|
|
28
|
+
spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
|
28
29
|
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
29
30
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
30
31
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
38
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
39
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
40
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
41
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
42
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
43
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
74
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
77
|
success = false
|
78
78
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
79
|
+
retries += 1
|
80
80
|
next
|
81
81
|
end
|
82
82
|
rescue => e
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
114
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
117
|
success = false
|
118
118
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
119
|
+
retries += 1
|
120
120
|
next
|
121
121
|
end
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2315
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2317
|
begin
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
2328
2329
|
exp_obj = nil
|
2329
2330
|
end
|
2330
2331
|
|
2332
|
+
local = options["local"]
|
2333
|
+
|
2331
2334
|
commit_msg = options["message"]
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2336
|
commit_msg = ""
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
2351
2354
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2356
|
end
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2358
|
spec_files_to_upload = list
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
2668
2671
|
end
|
2669
2672
|
end
|
2670
2673
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2675
|
def commit_before_termination()
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
2678
2681
|
log_error(e)
|
2679
2682
|
end
|
2680
2683
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2685
|
def update_job_commit()
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
2868
2871
|
|
2869
2872
|
|
2870
2873
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2875
|
def jump(commit_sha1)
|
2873
2876
|
begin
|
2874
2877
|
verify_logged_in()
|
@@ -3003,11 +3006,12 @@ module Cnvrg
|
|
3003
3006
|
method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
|
3004
3007
|
method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
|
3005
3008
|
method_option :files, :type => :string, :aliases => ["--files"], :default => nil
|
3006
|
-
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default =>
|
3009
|
+
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
|
3007
3010
|
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
3011
3015
|
|
3012
3016
|
def sync(direct = true)
|
3013
3017
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
3030
3034
|
if run_download or options['debug_mode']
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3036
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3041
|
|
3038
3042
|
end
|
3039
3043
|
|
@@ -3143,7 +3147,7 @@ module Cnvrg
|
|
3143
3147
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3144
3148
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
3145
3149
|
:commit => commit, :image => image, :data => data, :data_commit => data_commit,
|
3146
|
-
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
|
3150
|
+
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
|
3147
3151
|
return
|
3148
3152
|
end
|
3149
3153
|
else
|
@@ -3200,6 +3204,7 @@ module Cnvrg
|
|
3200
3204
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3205
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
3206
|
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3207
|
+
method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
|
3203
3208
|
|
3204
3209
|
def exec(*cmd)
|
3205
3210
|
log = []
|
@@ -3224,6 +3229,7 @@ module Cnvrg
|
|
3224
3229
|
project_home = get_project_home
|
3225
3230
|
data_query = options["data_query"]
|
3226
3231
|
docker_stats = options["docker_stats"]
|
3232
|
+
local = options[:local] || false
|
3227
3233
|
@project = Project.new(project_home)
|
3228
3234
|
if @project.is_git
|
3229
3235
|
sync_before = false
|
@@ -3316,62 +3322,53 @@ module Cnvrg
|
|
3316
3322
|
end
|
3317
3323
|
end
|
3318
3324
|
start_time = Time.now
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3320
3325
|
if @exp.get_cmd.present?
|
3321
3326
|
cmd = @exp.get_cmd
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3324
|
-
end
|
3325
|
-
end
|
3326
|
-
if options["docker_id"].present?
|
3327
|
-
cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
|
3328
3327
|
end
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
|
3339
|
-
|
3340
|
-
|
3328
|
+
|
3329
|
+
if local
|
3330
|
+
exec_local(cmd)
|
3331
|
+
exit_status = $?.exitstatus
|
3332
|
+
|
3333
|
+
else
|
3334
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3335
|
+
result_file = "/conf/result-#{command_slug}"
|
3336
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3337
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3338
|
+
response = conn.post('command', data.to_json)
|
3339
|
+
if response.to_hash[:status].to_i != 200
|
3340
|
+
exit_status = 129
|
3341
|
+
raise StandardError.new("Cant send command to slave")
|
3342
|
+
end
|
3343
|
+
t = FileWatch::Tail.new
|
3344
|
+
filename = result_file
|
3345
|
+
lines = []
|
3346
|
+
t.tail(filename)
|
3347
|
+
t.subscribe do |path, line|
|
3348
|
+
begin
|
3349
|
+
cur_log = JSON.parse(line)
|
3350
|
+
if cur_log["type"] == "endMessage"
|
3351
|
+
exit_status = cur_log["real"].to_i
|
3352
|
+
break
|
3353
|
+
else
|
3354
|
+
puts(cur_log.to_json)
|
3355
|
+
STDOUT.flush
|
3356
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3357
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3358
|
+
log << cur_log
|
3341
3359
|
end
|
3342
|
-
log << cur_log
|
3343
3360
|
if log.size >= 10
|
3344
|
-
@exp.upload_temp_log(log)
|
3361
|
+
@exp.upload_temp_log(log)
|
3345
3362
|
log = []
|
3346
|
-
|
3363
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3347
3364
|
@exp.upload_temp_log(log) unless log.empty?
|
3348
3365
|
log = []
|
3349
3366
|
start_time = Time.now
|
3350
3367
|
end
|
3368
|
+
rescue => e
|
3369
|
+
log_error(e)
|
3351
3370
|
end
|
3352
|
-
if stderr
|
3353
|
-
stderr.each do |err|
|
3354
|
-
log << {time: Time.now, message: err, type: "stderr"}
|
3355
|
-
end
|
3356
|
-
end
|
3357
|
-
rescue Errno::EIO => e
|
3358
|
-
log_error(e)
|
3359
|
-
if !log.empty?
|
3360
|
-
temp_log = log
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3362
|
-
log -= temp_log
|
3363
|
-
end
|
3364
|
-
rescue Errno::ENOENT => e
|
3365
|
-
exp_success = false
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3367
|
-
log_error(e)
|
3368
|
-
rescue => e
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3371
|
-
log_error(e)
|
3372
|
-
exit(0)
|
3373
3371
|
end
|
3374
|
-
::Process.wait pid
|
3375
3372
|
end
|
3376
3373
|
end_time = Time.now
|
3377
3374
|
process_running = false
|
@@ -3379,14 +3376,13 @@ module Cnvrg
|
|
3379
3376
|
if !log.empty?
|
3380
3377
|
|
3381
3378
|
temp_log = log
|
3382
|
-
|
3379
|
+
@exp.upload_temp_log(temp_log)
|
3383
3380
|
log -= temp_log
|
3384
3381
|
end
|
3385
3382
|
|
3386
3383
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3387
3384
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3388
|
-
exit_status
|
3389
|
-
if $?.exitstatus != 0
|
3385
|
+
if exit_status != 0
|
3390
3386
|
exp_success = false
|
3391
3387
|
end
|
3392
3388
|
|
@@ -3430,7 +3426,6 @@ module Cnvrg
|
|
3430
3426
|
if @exp
|
3431
3427
|
# log_thread.join
|
3432
3428
|
Thread.kill(stats_thread) if docker_stats
|
3433
|
-
exit_status = $?.exitstatus
|
3434
3429
|
if exit_status.blank?
|
3435
3430
|
exit_status = "-1"
|
3436
3431
|
end
|
@@ -3443,8 +3438,6 @@ module Cnvrg
|
|
3443
3438
|
|
3444
3439
|
exit(1)
|
3445
3440
|
end
|
3446
|
-
|
3447
|
-
|
3448
3441
|
end
|
3449
3442
|
|
3450
3443
|
end
|
@@ -3689,7 +3682,7 @@ module Cnvrg
|
|
3689
3682
|
end
|
3690
3683
|
end
|
3691
3684
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
3685
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3693
3686
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3694
3687
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3695
3688
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3778,7 +3771,7 @@ module Cnvrg
|
|
3778
3771
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3779
3772
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3780
3773
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3774
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3782
3775
|
|
3783
3776
|
def notebook
|
3784
3777
|
verify_logged_in(true)
|
@@ -3905,7 +3898,7 @@ module Cnvrg
|
|
3905
3898
|
end
|
3906
3899
|
end
|
3907
3900
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3901
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3909
3902
|
method_option :machine_type, :type => :string, :default => ""
|
3910
3903
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3911
3904
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4264,7 +4257,7 @@ module Cnvrg
|
|
4264
4257
|
|
4265
4258
|
end
|
4266
4259
|
|
4267
|
-
desc 'notebook_stop', '
|
4260
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4268
4261
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4269
4262
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4270
4263
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4651,15 +4644,21 @@ module Cnvrg
|
|
4651
4644
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
4645
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
4646
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4647
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4648
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4649
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4654
4650
|
|
4655
4651
|
def collect_metrics
|
4656
4652
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
4653
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4654
|
+
prom_user = options[:prom_user]
|
4655
|
+
prom_password = options[:prom_password]
|
4656
|
+
name = options[:name]
|
4658
4657
|
|
4659
4658
|
translate_result = Cnvrg::API_V2.request(
|
4660
4659
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
4661
4660
|
'GET',
|
4662
|
-
{ gpu: options[:gpu] }
|
4661
|
+
{ gpu: options[:gpu], gaudi: options[:gaudi] }
|
4663
4662
|
)
|
4664
4663
|
|
4665
4664
|
is_machine = options[:machine]
|
@@ -4679,9 +4678,16 @@ module Cnvrg
|
|
4679
4678
|
next
|
4680
4679
|
end
|
4681
4680
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
-
|
4681
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4682
|
+
http.use_ssl = uri.scheme == "https"
|
4683
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4684
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4685
|
+
if prom_user.present?
|
4686
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4687
|
+
end
|
4688
|
+
resp = http.request(req)
|
4683
4689
|
begin
|
4684
|
-
result = JSON.parse(resp)
|
4690
|
+
result = JSON.parse(resp.body)
|
4685
4691
|
rescue JSON::ParserError => e
|
4686
4692
|
log_error(e)
|
4687
4693
|
next
|
@@ -4690,13 +4696,22 @@ module Cnvrg
|
|
4690
4696
|
next unless data_result
|
4691
4697
|
|
4692
4698
|
if data_result.size > 1
|
4693
|
-
stats[query_name] = {}
|
4699
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4694
4700
|
data_result.each_with_index do |res, i|
|
4695
4701
|
timestamp, value = res["value"]
|
4696
4702
|
uuid = res["metric"]["UUID"].presence || i
|
4697
4703
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
4704
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
-
|
4705
|
+
if query_name.include? 'block'
|
4706
|
+
uuid = res["metric"]["interface"].presence || i
|
4707
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4708
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4709
|
+
io_type = query_name.split('_')[1]
|
4710
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4711
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4712
|
+
else
|
4713
|
+
stats[query_name][uuid] = stat_value
|
4714
|
+
end
|
4700
4715
|
end
|
4701
4716
|
else
|
4702
4717
|
timestamp, value = data_result&.first&.dig('value')
|
@@ -4705,9 +4720,14 @@ module Cnvrg
|
|
4705
4720
|
if query_name.include? 'block'
|
4706
4721
|
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
4722
|
io_type = query_name.split('_')[1]
|
4708
|
-
|
4723
|
+
if name.present?
|
4724
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4725
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4726
|
+
else
|
4727
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4728
|
+
end
|
4709
4729
|
else
|
4710
|
-
stats[query_name] = stat_value
|
4730
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4711
4731
|
end
|
4712
4732
|
end
|
4713
4733
|
end
|
@@ -4751,7 +4771,7 @@ module Cnvrg
|
|
4751
4771
|
end
|
4752
4772
|
|
4753
4773
|
|
4754
|
-
desc '', ''
|
4774
|
+
desc '', '', :hide => true
|
4755
4775
|
|
4756
4776
|
def download_built_image(image_name, image_slug)
|
4757
4777
|
begin
|
@@ -4995,7 +5015,7 @@ module Cnvrg
|
|
4995
5015
|
end
|
4996
5016
|
end
|
4997
5017
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
5018
|
+
desc 'experiments', 'List project experiments', :hide => true
|
4999
5019
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
5000
5020
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
5001
5021
|
|
@@ -5864,6 +5884,56 @@ module Cnvrg
|
|
5864
5884
|
end
|
5865
5885
|
end
|
5866
5886
|
|
5887
|
+
# Runs +cmd+ on this machine under a pseudo-terminal and streams its output
# to the current experiment (@exp).
#
# Each stdout line becomes a log entry; entries are uploaded through
# @exp.upload_temp_log in batches of 10, or once 15 seconds have passed
# since the last time-based flush. Whatever is still buffered when the
# command's output ends is uploaded before the child is reaped, so the
# caller can read the exit code from $? afterwards.
#
# All state is local to this method: `def` opens a fresh scope, so the log
# buffer and timers of the calling command are not visible here.
#
# @param cmd [String] command line handed to PTY.spawn
# @param print_log [Boolean] also echo every line to STDOUT as a JSON record
# @param start_commit [String, nil] commit passed to @exp.end if the run aborts
# @return [Object] the value of the PTY.spawn block
def exec_local(cmd, print_log: false, start_commit: nil)
  log = []
  # NOTE(review): "real" is measured from the moment of spawn here; confirm
  # whether the server expects it relative to the experiment start instead.
  spawned_at = Time.now
  last_flush = spawned_at

  PTY.spawn(@exp.as_env, cmd) do |stdout, _stdin, pid, stderr|
    begin
      stdout.each do |line|
        now = Time.now
        log << {time: now, message: line, type: "stdout", real: now - spawned_at}
        puts({log: line, timestamp: now, exp_logs: true}.to_json) if print_log

        if log.size >= 10
          @exp.upload_temp_log(log)
          log = []
        elsif (last_flush + 15) <= now
          @exp.upload_temp_log(log)
          log = []
          last_flush = now
        end
      end
      # PTY.spawn yields three values, so stderr is normally nil; kept for
      # parity with the previous implementation.
      if stderr
        stderr.each { |err| log << {time: Time.now, message: err, type: "stderr"} }
      end
    rescue Errno::EIO => e
      # On Linux, reading a pty whose child has exited raises EIO; this is the
      # usual end-of-output signal rather than a failure.
      log_error(e)
    rescue Errno::ENOENT => e
      log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
      log_error(e)
    rescue => e
      @exp.end(log, 1, start_commit, 0, 0)
      log_message("Error occurred,aborting", Thor::Shell::Color::RED)
      log_error(e)
      # NOTE(review): exits with status 0 after an error, as the previous
      # inline code did - confirm this is intended.
      exit(0)
    end

    # Flush the tail of the buffer on every non-aborting path, not only on EIO.
    @exp.upload_temp_log(log) unless log.empty?
    ::Process.wait pid
  end
end
|
5936
|
+
|
5867
5937
|
end
|
5868
5938
|
end
|
5869
5939
|
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
@@ -730,7 +730,11 @@ module Cnvrg
|
|
730
730
|
end
|
731
731
|
res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
|
732
732
|
unless Cnvrg::CLI.is_response_success(res, false)
|
733
|
-
|
733
|
+
begin
|
734
|
+
puts(res)
|
735
|
+
rescue
|
736
|
+
end
|
737
|
+
raise StandardError.new("Cant download files from the server.")
|
734
738
|
end
|
735
739
|
self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
|
736
740
|
end
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
# Hands @command to the main container's command endpoint and follows the
# result file until the command reports its exit code.
#
# The command is POSTed (async, script mode) over the connection returned by
# Cnvrg::Helpers::Executer.get_main_conn. The result file
# /conf/result-<slug>-<two random letters> is then tailed: a line containing
# "cnvrg-exit-code" ends the wait and the integer after "=" becomes
# @exit_status; every other line is appended to @output (and echoed verbatim
# first when @is_new_main is false).
#
# Exit codes assigned here: 129 when the POST is not answered with HTTP 200
# (a StandardError is raised as well), 124 when @timeout elapses.
#
# @return [Integer, nil] @exit_status once the run (and any retry) is over
# @raise [StandardError] when the command could not be handed over
def execute_command_on_slave
  begin
    suffix = Array.new(2) { rand(65..90).chr }.join
    result_file = "/conf/result-#{@slug}-#{suffix}"
    Timeout.timeout(@timeout) do
      data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
      conn = Cnvrg::Helpers::Executer.get_main_conn
      response = conn.post('command', data.to_json)
      unless response.to_hash[:status].to_i == 200
        @exit_status = 129
        raise StandardError.new("Cant send command to slave")
      end

      # NOTE(review): the tail is abandoned after `break` without being shut
      # down; check whether FileWatch::Tail needs an explicit quit here.
      tail = FileWatch::Tail.new
      tail.tail(result_file)
      tail.subscribe do |_path, line|
        if line.include?("cnvrg-exit-code")
          @exit_status = line.split("=")[1].to_i
          break
        end
        log_internal(line, level: LogLevel::PURE) unless @is_new_main
        line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
        @output << {log: line, timestamp: Time.now}
      end
    end
  rescue Timeout::Error
    @errors << {log: "Command timed out!", timestamp: Time.now}
    log_internal("Command timed out!", level: LogLevel::ERROR)
    @exit_status = 124
  ensure
    # Runs on success, timeout and while an error propagates alike.
    retry_command if @retries != 0 && @exit_status != 0
  end
  # Kept outside `ensure`: the value of an ensure clause is discarded, so this
  # is what makes the method actually return the exit status.
  @exit_status
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
@@ -1,7 +1,9 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
-
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
6
|
+
HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
7
9
|
# server (poll commands) and let the server know the status of this executer.
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if (!@is_new_main || HAS_DOCKER)
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
9
11
|
def start(job_id)
|
12
|
+
no_auth = options["no_auth"]
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
13
16
|
pod_name = nil
|
14
17
|
namespace = "cnvrg"
|
15
18
|
ssh_ready = false
|
19
|
+
internal_port = options['internal_port']
|
16
20
|
while not ssh_ready
|
17
21
|
resp = @job_ssh.status()
|
18
22
|
status = resp["ssh_status"]
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
26
30
|
username = resp["username"]
|
27
31
|
pod_name = resp["pod_name"]
|
28
32
|
namespace = resp["namespace"]
|
33
|
+
internal_port = resp["port"] || internal_port
|
29
34
|
ssh_ready = true
|
30
35
|
else
|
31
36
|
puts("Failed to start ssh")
|
32
37
|
break
|
33
38
|
end
|
34
39
|
end
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
36
41
|
puts("Failed to get required params")
|
37
42
|
return
|
38
43
|
end
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
41
46
|
puts("host: 127.0.0.1")
|
42
47
|
puts("port: #{options["port"]}")
|
43
48
|
puts("username: #{username}")
|
44
|
-
puts("password: #{password}")
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
49
|
+
puts("password: #{password}") unless no_auth
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
46
51
|
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
|
|
328
328
|
end
|
329
329
|
|
330
330
|
def get_storage_client
|
331
|
-
|
332
|
-
|
333
|
-
|
331
|
+
client_params = nil
|
332
|
+
i = 0
|
333
|
+
begin
|
334
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
|
335
|
+
unless Cnvrg::CLI.is_response_success(response, false)
|
336
|
+
raise StandardError.new("Can't find project credentials")
|
337
|
+
end
|
334
338
|
client_params = response['client']
|
335
|
-
|
336
|
-
|
339
|
+
rescue StandardError
|
340
|
+
i += 1
|
341
|
+
sleep(5 * i)
|
342
|
+
retry if i < 10
|
337
343
|
client_params = get_storage_client_fallback
|
338
344
|
end
|
339
|
-
|
345
|
+
raise StandardError.new("Can't find project credentials") unless client_params
|
340
346
|
Cnvrg::Downloader::Client.factory(client_params)
|
341
347
|
end
|
342
348
|
|
@@ -378,14 +384,18 @@ module Cnvrg
|
|
378
384
|
[]
|
379
385
|
end
|
380
386
|
|
381
|
-
def generate_output_dir(output_dir)
|
387
|
+
def generate_output_dir(output_dir, local: false)
|
382
388
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
389
|
upload_list = []
|
390
|
+
list = []
|
384
391
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
392
|
+
if local
|
393
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
394
|
+
end
|
395
|
+
list.uniq!
|
385
396
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
397
|
next if e.end_with? "/."
|
387
398
|
if File.directory? e
|
388
|
-
|
389
399
|
upload_list << e + "/"
|
390
400
|
else
|
391
401
|
upload_list << e
|
@@ -447,6 +457,10 @@ module Cnvrg
|
|
447
457
|
if list_ignore_new.include? label
|
448
458
|
next
|
449
459
|
end
|
460
|
+
if File.symlink?(e)
|
461
|
+
Cnvrg::Logger.log_info("Skipping symlink #{e}")
|
462
|
+
next
|
463
|
+
end
|
450
464
|
if File.directory? e
|
451
465
|
dir_name = (label.ends_with? "/") ? label : (label + "/")
|
452
466
|
tree_idx[dir_name] = nil
|
@@ -647,7 +661,11 @@ module Cnvrg
|
|
647
661
|
|
648
662
|
def fetch_webapp_slugs(webapp_slug, slugs: nil)
|
649
663
|
response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
|
650
|
-
|
664
|
+
|
665
|
+
if response.key?("experiments")
|
666
|
+
return response["experiments"]
|
667
|
+
end
|
668
|
+
return response["data"]["attributes"]["experiments"]
|
651
669
|
rescue
|
652
670
|
slugs
|
653
671
|
end
|
@@ -699,8 +717,11 @@ module Cnvrg
|
|
699
717
|
res = JSON.parse(resp['result']) rescue nil
|
700
718
|
return if res.blank?
|
701
719
|
config = self.get_config
|
702
|
-
config[:is_git] = res['git']
|
703
720
|
config[:project_name] = res['title']
|
721
|
+
config[:project_slug] = @slug
|
722
|
+
config[:owner] = @owner
|
723
|
+
config[:git] = res['git'] || false
|
724
|
+
config[:is_git] = res['git'] || false
|
704
725
|
self.set_config(config)
|
705
726
|
end
|
706
727
|
|
data/lib/cnvrg/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Cnvrg
|
2
|
-
VERSION = '
|
3
|
-
end
|
2
|
+
VERSION = '2.0.11'
|
3
|
+
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
9
|
- Omer Shacham
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-
|
13
|
+
date: 2021-10-21 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -96,6 +96,26 @@ dependencies:
|
|
96
96
|
- - ">="
|
97
97
|
- !ruby/object:Gem::Version
|
98
98
|
version: '0'
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: ffi
|
101
|
+
requirement: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - "~>"
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '1.9'
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 1.9.10
|
109
|
+
type: :runtime
|
110
|
+
prerelease: false
|
111
|
+
version_requirements: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - "~>"
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '1.9'
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.9.10
|
99
119
|
- !ruby/object:Gem::Dependency
|
100
120
|
name: mimemagic
|
101
121
|
requirement: !ruby/object:Gem::Requirement
|
@@ -302,6 +322,20 @@ dependencies:
|
|
302
322
|
- - "~>"
|
303
323
|
- !ruby/object:Gem::Version
|
304
324
|
version: 0.1.1
|
325
|
+
- !ruby/object:Gem::Dependency
|
326
|
+
name: filewatch
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
328
|
+
requirements:
|
329
|
+
- - "~>"
|
330
|
+
- !ruby/object:Gem::Version
|
331
|
+
version: 0.9.0
|
332
|
+
type: :runtime
|
333
|
+
prerelease: false
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - "~>"
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: 0.9.0
|
305
339
|
- !ruby/object:Gem::Dependency
|
306
340
|
name: parallel
|
307
341
|
requirement: !ruby/object:Gem::Requirement
|
@@ -439,7 +473,7 @@ files:
|
|
439
473
|
homepage: https://cnvrg.io
|
440
474
|
licenses: []
|
441
475
|
metadata: {}
|
442
|
-
post_install_message:
|
476
|
+
post_install_message:
|
443
477
|
rdoc_options: []
|
444
478
|
require_paths:
|
445
479
|
- lib
|
@@ -454,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
454
488
|
- !ruby/object:Gem::Version
|
455
489
|
version: '0'
|
456
490
|
requirements: []
|
457
|
-
rubygems_version: 3.
|
458
|
-
signing_key:
|
491
|
+
rubygems_version: 3.2.22
|
492
|
+
signing_key:
|
459
493
|
specification_version: 4
|
460
494
|
summary: A CLI tool for interacting with cnvrg.io.
|
461
495
|
test_files: []
|