cnvrg 1.11.26 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Readme.md +26 -0
- data/cnvrg.gemspec +6 -4
- data/lib/cnvrg/api.rb +4 -4
- data/lib/cnvrg/cli.rb +172 -81
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/datafiles.rb +56 -39
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/files.rb +1 -2
- data/lib/cnvrg/helpers.rb +1 -0
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +178 -36
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +5 -3
- data/lib/cnvrg/version.rb +2 -2
- metadata +40 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ab82882b2bb6c9093751cd560eaa4ccb7540fad9b9a34a81245721538dd37a5b
|
|
4
|
+
data.tar.gz: e35299be744d985a37794288a269ba2bb55cf64d3fcd702c6c6147bd4f5d740d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 52b51bb4942583e9ac3eceab24891d252ef7e31d7f1a2a87d5c8f4586f914f33f2d9cc7addc09ab5aa3652a9ba939ef5a37bdb9fc82d18e4e0e0a61126265d17
|
|
7
|
+
data.tar.gz: df735e631778b5d36d296c33903719a2ba7832ce2a9e218eea033ad246f8c4ce044a7b7d5562f68d901a1ee32d62a88559f886da2a0341d8bf7c51b50c25660c
|
data/Readme.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
|
|
2
|
+
## Version v1.11.15
|
|
3
|
+
2021-03-30
|
|
4
|
+
* DEV-208 - Task: Make sure the index name is constant over days
|
|
5
|
+
* DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from the Template.
|
|
6
|
+
* DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
|
|
7
|
+
* DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
|
|
8
|
+
* DEV-7956 - Bug: CLI crashes from progressbar
|
|
9
|
+
* DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash in the URL path causes a unique index error
|
|
10
|
+
* DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load sts, therefore the clone crashed
|
|
11
|
+
* DEV-8159 - New Feature: Oauth Proxy
|
|
12
|
+
* DEV-8179 - New Feature: Add auto cache and link files in cache clone
|
|
13
|
+
* DEV-8208 - Bug: Cli - cnvrg data put fails
|
|
14
|
+
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
|
15
|
+
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
|
16
|
+
* DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
|
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
|
18
|
+
## Version v1.11.30
|
|
19
|
+
2021-04-06
|
|
20
|
+
## Version v1.11.31
|
|
21
|
+
2021-04-22
|
|
22
|
+
## Version v1.11.32
|
|
23
|
+
2021-05-05
|
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
|
25
|
+
## Version v2.0.1
|
|
26
|
+
2021-06-13
|
data/cnvrg.gemspec
CHANGED
|
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
|
14
14
|
|
|
15
15
|
#spec.files = `git ls-files`.split($/)
|
|
16
16
|
spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
|
|
17
|
-
spec.executables
|
|
18
|
-
spec.executables
|
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
18
|
+
spec.executables = ['cnvrg']
|
|
19
19
|
spec.require_paths = ['lib']
|
|
20
20
|
|
|
21
21
|
spec.add_development_dependency 'bundler'
|
|
@@ -25,12 +25,13 @@ Gem::Specification.new do |spec|
|
|
|
25
25
|
spec.add_development_dependency 'aruba'
|
|
26
26
|
spec.add_development_dependency 'pry'
|
|
27
27
|
|
|
28
|
-
spec.add_runtime_dependency '
|
|
28
|
+
spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
|
|
29
|
+
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
|
29
30
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
|
30
31
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
|
31
32
|
spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
|
|
32
33
|
spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
|
|
33
|
-
spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
|
|
34
|
+
spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
|
|
34
35
|
spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
|
|
35
36
|
spec.add_runtime_dependency 'signet', '~> 0.11.0'
|
|
36
37
|
spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
|
|
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
|
38
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
|
39
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
|
40
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
|
41
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
|
42
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
|
43
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
|
@@ -72,11 +72,11 @@ module Cnvrg
|
|
|
72
72
|
if response.to_hash[:status].to_i != 200
|
|
73
73
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
|
74
74
|
end
|
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
|
75
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
|
76
76
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
|
77
77
|
success = false
|
|
78
78
|
sleep(5 * retries)
|
|
79
|
-
retries +=1
|
|
79
|
+
retries += 1
|
|
80
80
|
next
|
|
81
81
|
end
|
|
82
82
|
rescue => e
|
|
@@ -112,11 +112,11 @@ module Cnvrg
|
|
|
112
112
|
if response.to_hash[:status].to_i != 200
|
|
113
113
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
|
114
114
|
end
|
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
|
115
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
|
116
116
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
|
117
117
|
success = false
|
|
118
118
|
sleep(5 * retries)
|
|
119
|
-
retries +=1
|
|
119
|
+
retries += 1
|
|
120
120
|
next
|
|
121
121
|
end
|
|
122
122
|
rescue => e
|
data/lib/cnvrg/cli.rb
CHANGED
|
@@ -173,7 +173,7 @@ module Cnvrg
|
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
|
174
174
|
subcommand "data", Data
|
|
175
175
|
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
|
177
177
|
subcommand "job", JobCli
|
|
178
178
|
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
|
415
415
|
end
|
|
416
416
|
end
|
|
417
417
|
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
|
420
420
|
|
|
421
421
|
def set_compression_path(*compression_path)
|
|
@@ -2311,6 +2311,7 @@ module Cnvrg
|
|
|
2311
2311
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
|
2312
2312
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
|
2313
2313
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
|
2314
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
|
2314
2315
|
|
|
2315
2316
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
|
2316
2317
|
begin
|
|
@@ -2328,6 +2329,8 @@ module Cnvrg
|
|
|
2328
2329
|
exp_obj = nil
|
|
2329
2330
|
end
|
|
2330
2331
|
|
|
2332
|
+
local = options["local"]
|
|
2333
|
+
|
|
2331
2334
|
commit_msg = options["message"]
|
|
2332
2335
|
if commit_msg.nil? or commit_msg.empty?
|
|
2333
2336
|
commit_msg = ""
|
|
@@ -2349,7 +2352,7 @@ module Cnvrg
|
|
|
2349
2352
|
if git_output_dir.ends_with? "/"
|
|
2350
2353
|
git_output_dir = git_output_dir[0..-2]
|
|
2351
2354
|
end
|
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
|
2355
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
|
2353
2356
|
end
|
|
2354
2357
|
list += @project.generate_git_diff if options["git_diff"]
|
|
2355
2358
|
spec_files_to_upload = list
|
|
@@ -2668,7 +2671,7 @@ module Cnvrg
|
|
|
2668
2671
|
end
|
|
2669
2672
|
end
|
|
2670
2673
|
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
|
2674
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
|
2672
2675
|
def commit_before_termination()
|
|
2673
2676
|
job_type = ENV['CNVRG_JOB_TYPE']
|
|
2674
2677
|
job_id = ENV['CNVRG_JOB_ID']
|
|
@@ -2678,7 +2681,7 @@ module Cnvrg
|
|
|
2678
2681
|
log_error(e)
|
|
2679
2682
|
end
|
|
2680
2683
|
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
|
2684
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
|
2682
2685
|
def update_job_commit()
|
|
2683
2686
|
job_type = ENV['CNVRG_JOB_TYPE']
|
|
2684
2687
|
job_id = ENV['CNVRG_JOB_ID']
|
|
@@ -2868,7 +2871,7 @@ module Cnvrg
|
|
|
2868
2871
|
|
|
2869
2872
|
|
|
2870
2873
|
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
|
2874
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
|
2872
2875
|
def jump(commit_sha1)
|
|
2873
2876
|
begin
|
|
2874
2877
|
verify_logged_in()
|
|
@@ -3008,6 +3011,7 @@ module Cnvrg
|
|
|
3008
3011
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
|
3009
3012
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
|
3010
3013
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
|
3014
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => false
|
|
3011
3015
|
|
|
3012
3016
|
def sync(direct = true)
|
|
3013
3017
|
verify_logged_in(true) if direct
|
|
@@ -3030,10 +3034,10 @@ module Cnvrg
|
|
|
3030
3034
|
if run_download or options['debug_mode']
|
|
3031
3035
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
|
3032
3036
|
end
|
|
3033
|
-
invoke :upload, [false, true,
|
|
3037
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
|
3034
3038
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
|
3035
3039
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
|
3040
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
|
3037
3041
|
|
|
3038
3042
|
end
|
|
3039
3043
|
|
|
@@ -3199,6 +3203,7 @@ module Cnvrg
|
|
|
3199
3203
|
method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
|
|
3200
3204
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
|
3201
3205
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
|
3206
|
+
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
|
3202
3207
|
|
|
3203
3208
|
def exec(*cmd)
|
|
3204
3209
|
log = []
|
|
@@ -3222,6 +3227,7 @@ module Cnvrg
|
|
|
3222
3227
|
output_dir = options['output_dir'] || "output"
|
|
3223
3228
|
project_home = get_project_home
|
|
3224
3229
|
data_query = options["data_query"]
|
|
3230
|
+
docker_stats = options["docker_stats"]
|
|
3225
3231
|
@project = Project.new(project_home)
|
|
3226
3232
|
if @project.is_git
|
|
3227
3233
|
sync_before = false
|
|
@@ -3294,80 +3300,67 @@ module Cnvrg
|
|
|
3294
3300
|
stdout, stderr = '', ''
|
|
3295
3301
|
begin
|
|
3296
3302
|
process_running = true
|
|
3297
|
-
|
|
3298
|
-
|
|
3299
|
-
|
|
3300
|
-
|
|
3301
|
-
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
|
|
3305
|
-
|
|
3303
|
+
if docker_stats
|
|
3304
|
+
stats_thread = Thread.new do
|
|
3305
|
+
while process_running do
|
|
3306
|
+
sleep 30
|
|
3307
|
+
begin
|
|
3308
|
+
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
|
|
3309
|
+
if is_on_gpu
|
|
3310
|
+
gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
|
|
3311
|
+
stats['gpu_util'] = gu[0]
|
|
3312
|
+
stats['gpu'] = gu[1]
|
|
3313
|
+
end
|
|
3314
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
|
3315
|
+
rescue => e
|
|
3316
|
+
log_error(e)
|
|
3317
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
3306
3318
|
end
|
|
3307
|
-
@exp.send_machine_stats [stats] unless stats.empty?
|
|
3308
|
-
rescue => e
|
|
3309
|
-
log_error(e)
|
|
3310
|
-
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
3311
3319
|
end
|
|
3312
3320
|
end
|
|
3313
3321
|
end
|
|
3314
3322
|
start_time = Time.now
|
|
3315
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
|
3316
3323
|
if @exp.get_cmd.present?
|
|
3317
3324
|
cmd = @exp.get_cmd
|
|
3318
|
-
if options["docker_id"].present? # Escape for docker exec
|
|
3319
|
-
cmd = cmd.gsub("\"", "\\\"")
|
|
3320
|
-
end
|
|
3321
3325
|
end
|
|
3322
|
-
|
|
3323
|
-
|
|
3326
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
|
3327
|
+
result_file = "/conf/result-#{command_slug}"
|
|
3328
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
|
3329
|
+
|
|
3330
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
|
3331
|
+
response = conn.post('command', data.to_json)
|
|
3332
|
+
if response.to_hash[:status].to_i != 200
|
|
3333
|
+
exit_status = 129
|
|
3334
|
+
raise StandardError.new("Cant send command to slave")
|
|
3324
3335
|
end
|
|
3325
|
-
|
|
3336
|
+
t = FileWatch::Tail.new
|
|
3337
|
+
filename = result_file
|
|
3338
|
+
lines = []
|
|
3339
|
+
t.tail(filename)
|
|
3340
|
+
t.subscribe do |path, line|
|
|
3326
3341
|
begin
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
|
|
3330
|
-
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
|
|
3335
|
-
|
|
3336
|
-
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
|
3337
|
-
end
|
|
3342
|
+
cur_log = JSON.parse(line)
|
|
3343
|
+
if cur_log["type"] == "endMessage"
|
|
3344
|
+
exit_status = cur_log["real"].to_i
|
|
3345
|
+
break
|
|
3346
|
+
else
|
|
3347
|
+
puts(cur_log.to_json)
|
|
3348
|
+
STDOUT.flush
|
|
3349
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
|
3350
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
|
3338
3351
|
log << cur_log
|
|
3339
|
-
if log.size >= 10
|
|
3340
|
-
@exp.upload_temp_log(log) unless log.empty?
|
|
3341
|
-
log = []
|
|
3342
|
-
elsif (start_time + 15.seconds) <= Time.now
|
|
3343
|
-
@exp.upload_temp_log(log) unless log.empty?
|
|
3344
|
-
log = []
|
|
3345
|
-
start_time = Time.now
|
|
3346
|
-
end
|
|
3347
3352
|
end
|
|
3348
|
-
if
|
|
3349
|
-
|
|
3350
|
-
|
|
3351
|
-
|
|
3353
|
+
if log.size >= 10
|
|
3354
|
+
@exp.upload_temp_log(log)
|
|
3355
|
+
log = []
|
|
3356
|
+
elsif (start_time + 15.seconds) <= Time.now
|
|
3357
|
+
@exp.upload_temp_log(log) unless log.empty?
|
|
3358
|
+
log = []
|
|
3359
|
+
start_time = Time.now
|
|
3352
3360
|
end
|
|
3353
|
-
rescue Errno::EIO => e
|
|
3354
|
-
log_error(e)
|
|
3355
|
-
if !log.empty?
|
|
3356
|
-
temp_log = log
|
|
3357
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
|
3358
|
-
log -= temp_log
|
|
3359
|
-
end
|
|
3360
|
-
rescue Errno::ENOENT => e
|
|
3361
|
-
exp_success = false
|
|
3362
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
|
3363
|
-
log_error(e)
|
|
3364
3361
|
rescue => e
|
|
3365
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
|
3366
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
|
3367
3362
|
log_error(e)
|
|
3368
|
-
exit(0)
|
|
3369
3363
|
end
|
|
3370
|
-
::Process.wait pid
|
|
3371
3364
|
end
|
|
3372
3365
|
end_time = Time.now
|
|
3373
3366
|
process_running = false
|
|
@@ -3375,14 +3368,13 @@ module Cnvrg
|
|
|
3375
3368
|
if !log.empty?
|
|
3376
3369
|
|
|
3377
3370
|
temp_log = log
|
|
3378
|
-
|
|
3371
|
+
@exp.upload_temp_log(temp_log)
|
|
3379
3372
|
log -= temp_log
|
|
3380
3373
|
end
|
|
3381
3374
|
|
|
3382
3375
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
|
3383
3376
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
|
3384
|
-
exit_status
|
|
3385
|
-
if $?.exitstatus != 0
|
|
3377
|
+
if exit_status != 0
|
|
3386
3378
|
exp_success = false
|
|
3387
3379
|
end
|
|
3388
3380
|
|
|
@@ -3405,7 +3397,7 @@ module Cnvrg
|
|
|
3405
3397
|
end
|
|
3406
3398
|
|
|
3407
3399
|
# log_thread.join
|
|
3408
|
-
stats_thread.join
|
|
3400
|
+
stats_thread.join if docker_stats
|
|
3409
3401
|
|
|
3410
3402
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
|
|
3411
3403
|
|
|
@@ -3425,8 +3417,7 @@ module Cnvrg
|
|
|
3425
3417
|
log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
|
|
3426
3418
|
if @exp
|
|
3427
3419
|
# log_thread.join
|
|
3428
|
-
Thread.kill(stats_thread)
|
|
3429
|
-
exit_status = $?.exitstatus
|
|
3420
|
+
Thread.kill(stats_thread) if docker_stats
|
|
3430
3421
|
if exit_status.blank?
|
|
3431
3422
|
exit_status = "-1"
|
|
3432
3423
|
end
|
|
@@ -3439,8 +3430,6 @@ module Cnvrg
|
|
|
3439
3430
|
|
|
3440
3431
|
exit(1)
|
|
3441
3432
|
end
|
|
3442
|
-
|
|
3443
|
-
|
|
3444
3433
|
end
|
|
3445
3434
|
|
|
3446
3435
|
end
|
|
@@ -3449,7 +3438,7 @@ module Cnvrg
|
|
|
3449
3438
|
end_commit = @project.last_local_commit
|
|
3450
3439
|
process_running = false
|
|
3451
3440
|
# log_thread.join
|
|
3452
|
-
stats_thread.join
|
|
3441
|
+
stats_thread.join if docker_stats
|
|
3453
3442
|
|
|
3454
3443
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
|
3455
3444
|
if container
|
|
@@ -3685,7 +3674,7 @@ module Cnvrg
|
|
|
3685
3674
|
end
|
|
3686
3675
|
end
|
|
3687
3676
|
|
|
3688
|
-
desc 'deploy', 'Deploys model to production'
|
|
3677
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
|
3689
3678
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
|
3690
3679
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
|
3691
3680
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
|
@@ -3774,7 +3763,7 @@ module Cnvrg
|
|
|
3774
3763
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
|
3775
3764
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
|
3776
3765
|
|
|
3777
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
|
3766
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
|
3778
3767
|
|
|
3779
3768
|
def notebook
|
|
3780
3769
|
verify_logged_in(true)
|
|
@@ -3901,7 +3890,7 @@ module Cnvrg
|
|
|
3901
3890
|
end
|
|
3902
3891
|
end
|
|
3903
3892
|
|
|
3904
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
|
3893
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
|
3905
3894
|
method_option :machine_type, :type => :string, :default => ""
|
|
3906
3895
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
|
3907
3896
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
|
@@ -4260,7 +4249,7 @@ module Cnvrg
|
|
|
4260
4249
|
|
|
4261
4250
|
end
|
|
4262
4251
|
|
|
4263
|
-
desc 'notebook_stop', '
|
|
4252
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
|
4264
4253
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
|
4265
4254
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
|
4266
4255
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
|
@@ -4640,6 +4629,108 @@ module Cnvrg
|
|
|
4640
4629
|
end
|
|
4641
4630
|
end
|
|
4642
4631
|
|
|
4632
|
+
desc 'Collect and send job utilization', '', :hide => true
|
|
4633
|
+
method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
|
|
4634
|
+
method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
|
|
4635
|
+
method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
|
|
4636
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
|
4637
|
+
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
|
4638
|
+
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
|
4639
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
|
4640
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
|
4641
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
|
4642
|
+
|
|
4643
|
+
def collect_metrics
|
|
4644
|
+
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
|
4645
|
+
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
|
4646
|
+
prom_user = options[:prom_user]
|
|
4647
|
+
prom_password = options[:prom_password]
|
|
4648
|
+
name = options[:name]
|
|
4649
|
+
|
|
4650
|
+
translate_result = Cnvrg::API_V2.request(
|
|
4651
|
+
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
|
4652
|
+
'GET',
|
|
4653
|
+
{ gpu: options[:gpu], gaudi: options[:gaudi] }
|
|
4654
|
+
)
|
|
4655
|
+
|
|
4656
|
+
is_machine = options[:machine]
|
|
4657
|
+
while true do
|
|
4658
|
+
begin
|
|
4659
|
+
stats = {}
|
|
4660
|
+
translate_result.each do |query_name, metric|
|
|
4661
|
+
if is_machine
|
|
4662
|
+
metric_query = metric['machine_query'].presence || metric['query']
|
|
4663
|
+
query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
|
|
4664
|
+
else
|
|
4665
|
+
metric_query = metric['cluster_query'].presence || metric['query']
|
|
4666
|
+
pod_name = `hostname`.strip
|
|
4667
|
+
query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
|
|
4668
|
+
end
|
|
4669
|
+
if metric_query.blank? || query_content.blank?
|
|
4670
|
+
next
|
|
4671
|
+
end
|
|
4672
|
+
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
|
4673
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
4674
|
+
http.use_ssl = uri.scheme == "https"
|
|
4675
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
4676
|
+
req = Net::HTTP::Get.new uri.request_uri
|
|
4677
|
+
if prom_user.present?
|
|
4678
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
|
4679
|
+
end
|
|
4680
|
+
resp = http.request(req)
|
|
4681
|
+
begin
|
|
4682
|
+
result = JSON.parse(resp.body)
|
|
4683
|
+
rescue JSON::ParserError => e
|
|
4684
|
+
log_error(e)
|
|
4685
|
+
next
|
|
4686
|
+
end
|
|
4687
|
+
data_result = result&.dig('data', 'result')
|
|
4688
|
+
next unless data_result
|
|
4689
|
+
|
|
4690
|
+
if data_result.size > 1
|
|
4691
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
|
4692
|
+
data_result.each_with_index do |res, i|
|
|
4693
|
+
timestamp, value = res["value"]
|
|
4694
|
+
uuid = res["metric"]["UUID"].presence || i
|
|
4695
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
|
4696
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
|
4697
|
+
if query_name.include? 'block'
|
|
4698
|
+
uuid = res["metric"]["interface"].presence || i
|
|
4699
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
|
4700
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
|
4701
|
+
io_type = query_name.split('_')[1]
|
|
4702
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
|
4703
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
|
4704
|
+
else
|
|
4705
|
+
stats[query_name][uuid] = stat_value
|
|
4706
|
+
end
|
|
4707
|
+
end
|
|
4708
|
+
else
|
|
4709
|
+
timestamp, value = data_result&.first&.dig('value')
|
|
4710
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
|
4711
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
|
4712
|
+
if query_name.include? 'block'
|
|
4713
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
|
4714
|
+
io_type = query_name.split('_')[1]
|
|
4715
|
+
if name.present?
|
|
4716
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
|
4717
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
|
4718
|
+
else
|
|
4719
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
|
4720
|
+
end
|
|
4721
|
+
else
|
|
4722
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
|
4723
|
+
end
|
|
4724
|
+
end
|
|
4725
|
+
end
|
|
4726
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
|
4727
|
+
rescue => e
|
|
4728
|
+
log_error(e)
|
|
4729
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
4730
|
+
end
|
|
4731
|
+
sleep options[:wait]
|
|
4732
|
+
end
|
|
4733
|
+
end
|
|
4643
4734
|
|
|
4644
4735
|
desc '', '', :hide => true
|
|
4645
4736
|
|
|
@@ -4672,7 +4763,7 @@ module Cnvrg
|
|
|
4672
4763
|
end
|
|
4673
4764
|
|
|
4674
4765
|
|
|
4675
|
-
desc '', ''
|
|
4766
|
+
desc '', '', :hide => true
|
|
4676
4767
|
|
|
4677
4768
|
def download_built_image(image_name, image_slug)
|
|
4678
4769
|
begin
|
|
@@ -4916,7 +5007,7 @@ module Cnvrg
|
|
|
4916
5007
|
end
|
|
4917
5008
|
end
|
|
4918
5009
|
|
|
4919
|
-
desc 'experiments', 'List project experiments'
|
|
5010
|
+
desc 'experiments', 'List project experiments', :hide => true
|
|
4920
5011
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
|
4921
5012
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
|
4922
5013
|
|