cnvrg 1.11.28 → 2.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +42 -0
- data/cnvrg.gemspec +8 -6
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli.rb +172 -81
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +179 -37
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +26 -9
- data/lib/cnvrg/version.rb +2 -2
- metadata +43 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b238ce877c0fa8da80c63c6d3d165ce974c250ea9c3f3d21ff092098356aa5a5
|
4
|
+
data.tar.gz: 7c7b4aa637b3bcaf5f18a7eb973d6995f0f1e25147a16a2ccfb1ce3d6dea91e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb1b724199e92fddfa2be4c04b9eb1a8a94e1df65759482fb6c3173f7c3c7632d236c22836749e5f1d6578115555b838285b58d1a95a42ce2df67d3a354da043
|
7
|
+
data.tar.gz: c840826e01dcd8061fdb1bf065f416d97ca3fe35575649171814741c32332b458c7bac57fa248e9afd45f337b60b18003077cc72d5927527490689572608ad6b
|
data/Readme.md
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
|
2
|
+
## Version v1.11.15
|
3
|
+
2021-03-30
|
4
|
+
* DEV-208 - Task: Make sure the index name is constant over days
|
5
|
+
* DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from the template.
|
6
|
+
* DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
|
7
|
+
* DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
|
8
|
+
* DEV-7956 - Bug: CLI crashes from progressbar
|
9
|
+
* DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash at the end of the URL path causes a unique index error
|
10
|
+
* DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load STS, therefore the clone crashed
|
11
|
+
* DEV-8159 - New Feature: Oauth Proxy
|
12
|
+
* DEV-8179 - New Feature: Add auto cache and link files in cache clone
|
13
|
+
* DEV-8208 - Bug: Cli - cnvrg data put fails
|
14
|
+
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
15
|
+
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
16
|
+
* DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
18
|
+
## Version v1.11.30
|
19
|
+
2021-04-06
|
20
|
+
## Version v1.11.31
|
21
|
+
2021-04-22
|
22
|
+
## Version v1.11.32
|
23
|
+
2021-05-05
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
25
|
+
## Version v2.0.1
|
26
|
+
2021-06-13
|
27
|
+
## Version v2.0.2
|
28
|
+
2021-06-16
|
29
|
+
* DEV-9694 - Bug: Download artifacts fails on authorization error
|
30
|
+
## Version v2.0.3
|
31
|
+
2021-06-29
|
32
|
+
* DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
|
33
|
+
## Version v2.0.4
|
34
|
+
2021-07-08
|
35
|
+
* DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
|
36
|
+
## Version v2.0.5
|
37
|
+
2021-07-11
|
38
|
+
* DEV-10171 - Bug: experiment randomly fails with error- "Couldn't clone artifacts"
|
39
|
+
* DEV-10189 - Bug: CLI Sync -file/folder with broken symlink will cause sync to fail
|
40
|
+
## Version v2.0.6
|
41
|
+
2021-07-18
|
42
|
+
* DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
|
data/cnvrg.gemspec
CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
|
15
15
|
#spec.files = `git ls-files`.split($/)
|
16
16
|
spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
|
17
|
-
spec.executables
|
18
|
-
spec.executables
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.executables = ['cnvrg']
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler'
|
@@ -23,14 +23,15 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
24
24
|
spec.add_development_dependency 'vcr', '~> 3.0'
|
25
25
|
spec.add_development_dependency 'aruba'
|
26
|
-
spec.add_development_dependency 'pry'
|
27
|
-
|
28
|
-
spec.add_runtime_dependency '
|
26
|
+
spec.add_development_dependency 'pry'
|
27
|
+
|
28
|
+
spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
|
29
|
+
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
29
30
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
30
31
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
31
32
|
spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
|
32
33
|
spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
|
33
|
-
spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
|
34
|
+
spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
|
34
35
|
spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
|
35
36
|
spec.add_runtime_dependency 'signet', '~> 0.11.0'
|
36
37
|
spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
|
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
38
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
39
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
40
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
41
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
42
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
43
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
74
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
77
|
success = false
|
78
78
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
79
|
+
retries += 1
|
80
80
|
next
|
81
81
|
end
|
82
82
|
rescue => e
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
114
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
117
|
success = false
|
118
118
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
119
|
+
retries += 1
|
120
120
|
next
|
121
121
|
end
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2315
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2317
|
begin
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
2328
2329
|
exp_obj = nil
|
2329
2330
|
end
|
2330
2331
|
|
2332
|
+
local = options["local"]
|
2333
|
+
|
2331
2334
|
commit_msg = options["message"]
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2336
|
commit_msg = ""
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
2351
2354
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2356
|
end
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2358
|
spec_files_to_upload = list
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
2668
2671
|
end
|
2669
2672
|
end
|
2670
2673
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2675
|
def commit_before_termination()
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
2678
2681
|
log_error(e)
|
2679
2682
|
end
|
2680
2683
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2685
|
def update_job_commit()
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
2868
2871
|
|
2869
2872
|
|
2870
2873
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2875
|
def jump(commit_sha1)
|
2873
2876
|
begin
|
2874
2877
|
verify_logged_in()
|
@@ -3003,11 +3006,12 @@ module Cnvrg
|
|
3003
3006
|
method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
|
3004
3007
|
method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
|
3005
3008
|
method_option :files, :type => :string, :aliases => ["--files"], :default => nil
|
3006
|
-
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default =>
|
3009
|
+
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
|
3007
3010
|
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
3011
3015
|
|
3012
3016
|
def sync(direct = true)
|
3013
3017
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
3030
3034
|
if run_download or options['debug_mode']
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3036
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3041
|
|
3038
3042
|
end
|
3039
3043
|
|
@@ -3199,6 +3203,7 @@ module Cnvrg
|
|
3199
3203
|
method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
|
3200
3204
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3205
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3206
|
+
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3202
3207
|
|
3203
3208
|
def exec(*cmd)
|
3204
3209
|
log = []
|
@@ -3222,6 +3227,7 @@ module Cnvrg
|
|
3222
3227
|
output_dir = options['output_dir'] || "output"
|
3223
3228
|
project_home = get_project_home
|
3224
3229
|
data_query = options["data_query"]
|
3230
|
+
docker_stats = options["docker_stats"]
|
3225
3231
|
@project = Project.new(project_home)
|
3226
3232
|
if @project.is_git
|
3227
3233
|
sync_before = false
|
@@ -3294,80 +3300,67 @@ module Cnvrg
|
|
3294
3300
|
stdout, stderr = '', ''
|
3295
3301
|
begin
|
3296
3302
|
process_running = true
|
3297
|
-
|
3298
|
-
|
3299
|
-
|
3300
|
-
|
3301
|
-
|
3302
|
-
|
3303
|
-
|
3304
|
-
|
3305
|
-
|
3303
|
+
if docker_stats
|
3304
|
+
stats_thread = Thread.new do
|
3305
|
+
while process_running do
|
3306
|
+
sleep 30
|
3307
|
+
begin
|
3308
|
+
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
|
3309
|
+
if is_on_gpu
|
3310
|
+
gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
|
3311
|
+
stats['gpu_util'] = gu[0]
|
3312
|
+
stats['gpu'] = gu[1]
|
3313
|
+
end
|
3314
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
3315
|
+
rescue => e
|
3316
|
+
log_error(e)
|
3317
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3306
3318
|
end
|
3307
|
-
@exp.send_machine_stats [stats] unless stats.empty?
|
3308
|
-
rescue => e
|
3309
|
-
log_error(e)
|
3310
|
-
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3311
3319
|
end
|
3312
3320
|
end
|
3313
3321
|
end
|
3314
3322
|
start_time = Time.now
|
3315
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3316
3323
|
if @exp.get_cmd.present?
|
3317
3324
|
cmd = @exp.get_cmd
|
3318
|
-
if options["docker_id"].present? # Escape for docker exec
|
3319
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3320
|
-
end
|
3321
3325
|
end
|
3322
|
-
|
3323
|
-
|
3326
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3327
|
+
result_file = "/conf/result-#{command_slug}"
|
3328
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3329
|
+
|
3330
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3331
|
+
response = conn.post('command', data.to_json)
|
3332
|
+
if response.to_hash[:status].to_i != 200
|
3333
|
+
exit_status = 129
|
3334
|
+
raise StandardError.new("Cant send command to slave")
|
3324
3335
|
end
|
3325
|
-
|
3336
|
+
t = FileWatch::Tail.new
|
3337
|
+
filename = result_file
|
3338
|
+
lines = []
|
3339
|
+
t.tail(filename)
|
3340
|
+
t.subscribe do |path, line|
|
3326
3341
|
begin
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
3337
|
-
end
|
3342
|
+
cur_log = JSON.parse(line)
|
3343
|
+
if cur_log["type"] == "endMessage"
|
3344
|
+
exit_status = cur_log["real"].to_i
|
3345
|
+
break
|
3346
|
+
else
|
3347
|
+
puts(cur_log.to_json)
|
3348
|
+
STDOUT.flush
|
3349
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3350
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3338
3351
|
log << cur_log
|
3339
|
-
if log.size >= 10
|
3340
|
-
@exp.upload_temp_log(log) unless log.empty?
|
3341
|
-
log = []
|
3342
|
-
elsif (start_time + 15.seconds) <= Time.now
|
3343
|
-
@exp.upload_temp_log(log) unless log.empty?
|
3344
|
-
log = []
|
3345
|
-
start_time = Time.now
|
3346
|
-
end
|
3347
3352
|
end
|
3348
|
-
if
|
3349
|
-
|
3350
|
-
|
3351
|
-
|
3352
|
-
|
3353
|
-
|
3354
|
-
|
3355
|
-
if !log.empty?
|
3356
|
-
temp_log = log
|
3357
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3358
|
-
log -= temp_log
|
3353
|
+
if log.size >= 10
|
3354
|
+
@exp.upload_temp_log(log)
|
3355
|
+
log = []
|
3356
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3357
|
+
@exp.upload_temp_log(log) unless log.empty?
|
3358
|
+
log = []
|
3359
|
+
start_time = Time.now
|
3359
3360
|
end
|
3360
|
-
rescue Errno::ENOENT => e
|
3361
|
-
exp_success = false
|
3362
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3363
|
-
log_error(e)
|
3364
3361
|
rescue => e
|
3365
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3366
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3367
3362
|
log_error(e)
|
3368
|
-
exit(0)
|
3369
3363
|
end
|
3370
|
-
::Process.wait pid
|
3371
3364
|
end
|
3372
3365
|
end_time = Time.now
|
3373
3366
|
process_running = false
|
@@ -3375,14 +3368,13 @@ module Cnvrg
|
|
3375
3368
|
if !log.empty?
|
3376
3369
|
|
3377
3370
|
temp_log = log
|
3378
|
-
|
3371
|
+
@exp.upload_temp_log(temp_log)
|
3379
3372
|
log -= temp_log
|
3380
3373
|
end
|
3381
3374
|
|
3382
3375
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3383
3376
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3384
|
-
exit_status
|
3385
|
-
if $?.exitstatus != 0
|
3377
|
+
if exit_status != 0
|
3386
3378
|
exp_success = false
|
3387
3379
|
end
|
3388
3380
|
|
@@ -3405,7 +3397,7 @@ module Cnvrg
|
|
3405
3397
|
end
|
3406
3398
|
|
3407
3399
|
# log_thread.join
|
3408
|
-
stats_thread.join
|
3400
|
+
stats_thread.join if docker_stats
|
3409
3401
|
|
3410
3402
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
|
3411
3403
|
|
@@ -3425,8 +3417,7 @@ module Cnvrg
|
|
3425
3417
|
log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
|
3426
3418
|
if @exp
|
3427
3419
|
# log_thread.join
|
3428
|
-
Thread.kill(stats_thread)
|
3429
|
-
exit_status = $?.exitstatus
|
3420
|
+
Thread.kill(stats_thread) if docker_stats
|
3430
3421
|
if exit_status.blank?
|
3431
3422
|
exit_status = "-1"
|
3432
3423
|
end
|
@@ -3439,8 +3430,6 @@ module Cnvrg
|
|
3439
3430
|
|
3440
3431
|
exit(1)
|
3441
3432
|
end
|
3442
|
-
|
3443
|
-
|
3444
3433
|
end
|
3445
3434
|
|
3446
3435
|
end
|
@@ -3449,7 +3438,7 @@ module Cnvrg
|
|
3449
3438
|
end_commit = @project.last_local_commit
|
3450
3439
|
process_running = false
|
3451
3440
|
# log_thread.join
|
3452
|
-
stats_thread.join
|
3441
|
+
stats_thread.join if docker_stats
|
3453
3442
|
|
3454
3443
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3455
3444
|
if container
|
@@ -3685,7 +3674,7 @@ module Cnvrg
|
|
3685
3674
|
end
|
3686
3675
|
end
|
3687
3676
|
|
3688
|
-
desc 'deploy', 'Deploys model to production'
|
3677
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3689
3678
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3690
3679
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3691
3680
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3774,7 +3763,7 @@ module Cnvrg
|
|
3774
3763
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3775
3764
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3776
3765
|
|
3777
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3766
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3778
3767
|
|
3779
3768
|
def notebook
|
3780
3769
|
verify_logged_in(true)
|
@@ -3901,7 +3890,7 @@ module Cnvrg
|
|
3901
3890
|
end
|
3902
3891
|
end
|
3903
3892
|
|
3904
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3893
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3905
3894
|
method_option :machine_type, :type => :string, :default => ""
|
3906
3895
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3907
3896
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4260,7 +4249,7 @@ module Cnvrg
|
|
4260
4249
|
|
4261
4250
|
end
|
4262
4251
|
|
4263
|
-
desc 'notebook_stop', '
|
4252
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4264
4253
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4265
4254
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4266
4255
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4640,6 +4629,108 @@ module Cnvrg
|
|
4640
4629
|
end
|
4641
4630
|
end
|
4642
4631
|
|
4632
|
+
desc 'Collect and send job utilization', '', :hide => true
|
4633
|
+
method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
|
4634
|
+
method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
|
4635
|
+
method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
|
4636
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4637
|
+
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4638
|
+
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4639
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4640
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4641
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4642
|
+
|
4643
|
+
def collect_metrics
|
4644
|
+
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4645
|
+
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4646
|
+
prom_user = options[:prom_user]
|
4647
|
+
prom_password = options[:prom_password]
|
4648
|
+
name = options[:name]
|
4649
|
+
|
4650
|
+
translate_result = Cnvrg::API_V2.request(
|
4651
|
+
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
4652
|
+
'GET',
|
4653
|
+
{ gpu: options[:gpu], gaudi: options[:gaudi] }
|
4654
|
+
)
|
4655
|
+
|
4656
|
+
is_machine = options[:machine]
|
4657
|
+
while true do
|
4658
|
+
begin
|
4659
|
+
stats = {}
|
4660
|
+
translate_result.each do |query_name, metric|
|
4661
|
+
if is_machine
|
4662
|
+
metric_query = metric['machine_query'].presence || metric['query']
|
4663
|
+
query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
|
4664
|
+
else
|
4665
|
+
metric_query = metric['cluster_query'].presence || metric['query']
|
4666
|
+
pod_name = `hostname`.strip
|
4667
|
+
query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
|
4668
|
+
end
|
4669
|
+
if metric_query.blank? || query_content.blank?
|
4670
|
+
next
|
4671
|
+
end
|
4672
|
+
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4673
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4674
|
+
http.use_ssl = uri.scheme == "https"
|
4675
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4676
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4677
|
+
if prom_user.present?
|
4678
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4679
|
+
end
|
4680
|
+
resp = http.request(req)
|
4681
|
+
begin
|
4682
|
+
result = JSON.parse(resp.body)
|
4683
|
+
rescue JSON::ParserError => e
|
4684
|
+
log_error(e)
|
4685
|
+
next
|
4686
|
+
end
|
4687
|
+
data_result = result&.dig('data', 'result')
|
4688
|
+
next unless data_result
|
4689
|
+
|
4690
|
+
if data_result.size > 1
|
4691
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4692
|
+
data_result.each_with_index do |res, i|
|
4693
|
+
timestamp, value = res["value"]
|
4694
|
+
uuid = res["metric"]["UUID"].presence || i
|
4695
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4696
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4697
|
+
if query_name.include? 'block'
|
4698
|
+
uuid = res["metric"]["interface"].presence || i
|
4699
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4700
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4701
|
+
io_type = query_name.split('_')[1]
|
4702
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4703
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4704
|
+
else
|
4705
|
+
stats[query_name][uuid] = stat_value
|
4706
|
+
end
|
4707
|
+
end
|
4708
|
+
else
|
4709
|
+
timestamp, value = data_result&.first&.dig('value')
|
4710
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4711
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4712
|
+
if query_name.include? 'block'
|
4713
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4714
|
+
io_type = query_name.split('_')[1]
|
4715
|
+
if name.present?
|
4716
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4717
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4718
|
+
else
|
4719
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4720
|
+
end
|
4721
|
+
else
|
4722
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4723
|
+
end
|
4724
|
+
end
|
4725
|
+
end
|
4726
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
4727
|
+
rescue => e
|
4728
|
+
log_error(e)
|
4729
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
4730
|
+
end
|
4731
|
+
sleep options[:wait]
|
4732
|
+
end
|
4733
|
+
end
|
4643
4734
|
|
4644
4735
|
desc '', '', :hide => true
|
4645
4736
|
|
@@ -4672,7 +4763,7 @@ module Cnvrg
|
|
4672
4763
|
end
|
4673
4764
|
|
4674
4765
|
|
4675
|
-
desc '', ''
|
4766
|
+
desc '', '', :hide => true
|
4676
4767
|
|
4677
4768
|
def download_built_image(image_name, image_slug)
|
4678
4769
|
begin
|