cnvrg 1.11.28 → 2.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e617f50977eafa031b94bed3ba18753ecb4529924f74182525a5c6a53410fc5f
4
- data.tar.gz: 40b755a41a73611dbc8f48802352ca73cac613c2e6d3fa70bf823e9e24f7c3f5
3
+ metadata.gz: b238ce877c0fa8da80c63c6d3d165ce974c250ea9c3f3d21ff092098356aa5a5
4
+ data.tar.gz: 7c7b4aa637b3bcaf5f18a7eb973d6995f0f1e25147a16a2ccfb1ce3d6dea91e3
5
5
  SHA512:
6
- metadata.gz: 51abb620c15a66be237a4392ec7fad73e1a5f865f3932934b938028ee0b54558772b8dcc292c77379bbbe273c1771f2b3e85d250d492cafa0b0bc681cb7a52dc
7
- data.tar.gz: 06fe73c3d4246c41b43923736c2ece0680a4f04522a317acdc576b6191d5ca74057f55bc6c5120456ac8ef0c5cfe7f328317655799b0d3d3c910db1cde871ad7
6
+ metadata.gz: fb1b724199e92fddfa2be4c04b9eb1a8a94e1df65759482fb6c3173f7c3c7632d236c22836749e5f1d6578115555b838285b58d1a95a42ce2df67d3a354da043
7
+ data.tar.gz: c840826e01dcd8061fdb1bf065f416d97ca3fe35575649171814741c32332b458c7bac57fa248e9afd45f337b60b18003077cc72d5927527490689572608ad6b
data/Readme.md ADDED
@@ -0,0 +1,42 @@
1
+
2
+ ## Version v1.11.15
3
+ 2021-03-30
4
+ * DEV-208 - Task: Make sure the index name is constant over days
5
+ * DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from Template.
6
+ * DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
7
+ * DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
8
+ * DEV-7956 - Bug: CLI crashes from progressbar
9
+ * DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash in the URL path causes a unique index error
10
+ * DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load STS, therefore the clone crashed
11
+ * DEV-8159 - New Feature: Oauth Proxy
12
+ * DEV-8179 - New Feature: Add auto cache and link files in cache clone
13
+ * DEV-8208 - Bug: Cli - cnvrg data put fails
14
+ * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
+ * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
+ * DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
17
+ * DEV-8621 - Improvement: Add more metrics
18
+ ## Version v1.11.30
19
+ 2021-04-06
20
+ ## Version v1.11.31
21
+ 2021-04-22
22
+ ## Version v1.11.32
23
+ 2021-05-05
24
+ * DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
25
+ ## Version v2.0.1
26
+ 2021-06-13
27
+ ## Version v2.0.2
28
+ 2021-06-16
29
+ * DEV-9694 - Bug: Download artifacts fails on authorization error
30
+ ## Version v2.0.3
31
+ 2021-06-29
32
+ * DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
33
+ ## Version v2.0.4
34
+ 2021-07-08
35
+ * DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
36
+ ## Version v2.0.5
37
+ 2021-07-11
38
+ * DEV-10171 - Bug: experiment randomly fails with error- "Couldn't clone artifacts"
39
+ * DEV-10189 - Bug: CLI Sync - file/folder with a broken symlink causes sync to fail
40
+ ## Version v2.0.6
41
+ 2021-07-18
42
+ * DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
data/cnvrg.gemspec CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
 
15
15
  #spec.files = `git ls-files`.split($/)
16
16
  spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.executables = ['cnvrg']
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.executables = ['cnvrg']
19
19
  spec.require_paths = ['lib']
20
20
 
21
21
  spec.add_development_dependency 'bundler'
@@ -23,14 +23,15 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency 'rspec', '~> 3.0'
24
24
  spec.add_development_dependency 'vcr', '~> 3.0'
25
25
  spec.add_development_dependency 'aruba'
26
- spec.add_development_dependency 'pry'
27
-
28
- spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.2'
26
+ spec.add_development_dependency 'pry'
27
+
28
+ spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
29
+ spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
30
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
31
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
31
32
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
33
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
- spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
+ spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
34
35
  spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
36
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
37
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
38
39
  spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
39
40
  spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
40
41
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
42
+ spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
41
43
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
44
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
45
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
data/lib/cnvrg/api.rb CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
72
72
  if response.to_hash[:status].to_i != 200
73
73
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
74
74
  end
75
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
75
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
76
76
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
77
77
  success = false
78
78
  sleep(5 * retries)
79
- retries +=1
79
+ retries += 1
80
80
  next
81
81
  end
82
82
  rescue => e
@@ -112,11 +112,11 @@ module Cnvrg
112
112
  if response.to_hash[:status].to_i != 200
113
113
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
114
114
  end
115
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
115
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
116
116
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
117
117
  success = false
118
118
  sleep(5 * retries)
119
- retries +=1
119
+ retries += 1
120
120
  next
121
121
  end
122
122
  rescue => e
data/lib/cnvrg/cli.rb CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
173
173
  desc "data [COMMAND]", "Upload and manage datasets", :hide => false
174
174
  subcommand "data", Data
175
175
 
176
- desc "job", "manage running jobs", :hide => false
176
+ desc "job", "manage running jobs", :hide => true
177
177
  subcommand "job", JobCli
178
178
 
179
179
  desc "ssh", "ssh into running jobs", :hide => false
@@ -415,7 +415,7 @@ module Cnvrg
415
415
  end
416
416
  end
417
417
 
418
- desc 'set_compression_path', 'Set compression path'
418
+ desc 'set_compression_path', 'Set compression path', :hide => true
419
419
  method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
420
420
 
421
421
  def set_compression_path(*compression_path)
@@ -2311,6 +2311,7 @@ module Cnvrg
2311
2311
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2312
2312
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2313
2313
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
2314
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
2314
2315
 
2315
2316
  def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
2316
2317
  begin
@@ -2328,6 +2329,8 @@ module Cnvrg
2328
2329
  exp_obj = nil
2329
2330
  end
2330
2331
 
2332
+ local = options["local"]
2333
+
2331
2334
  commit_msg = options["message"]
2332
2335
  if commit_msg.nil? or commit_msg.empty?
2333
2336
  commit_msg = ""
@@ -2349,7 +2352,7 @@ module Cnvrg
2349
2352
  if git_output_dir.ends_with? "/"
2350
2353
  git_output_dir = git_output_dir[0..-2]
2351
2354
  end
2352
- list = @project.generate_output_dir(git_output_dir)
2355
+ list = @project.generate_output_dir(git_output_dir, local: local)
2353
2356
  end
2354
2357
  list += @project.generate_git_diff if options["git_diff"]
2355
2358
  spec_files_to_upload = list
@@ -2668,7 +2671,7 @@ module Cnvrg
2668
2671
  end
2669
2672
  end
2670
2673
 
2671
- desc 'commit before termination', 'Commit job code before termination'
2674
+ desc 'commit before termination', 'Commit job code before termination', :hide => true
2672
2675
  def commit_before_termination()
2673
2676
  job_type = ENV['CNVRG_JOB_TYPE']
2674
2677
  job_id = ENV['CNVRG_JOB_ID']
@@ -2678,7 +2681,7 @@ module Cnvrg
2678
2681
  log_error(e)
2679
2682
  end
2680
2683
 
2681
- desc 'update_job_commit', 'Update job with its last commit'
2684
+ desc 'update_job_commit', 'Update job with its last commit' , :hide => true
2682
2685
  def update_job_commit()
2683
2686
  job_type = ENV['CNVRG_JOB_TYPE']
2684
2687
  job_id = ENV['CNVRG_JOB_ID']
@@ -2868,7 +2871,7 @@ module Cnvrg
2868
2871
 
2869
2872
 
2870
2873
 
2871
- desc 'jump', 'Jump to specific commit'
2874
+ desc 'jump COMMIT_ID', 'Jump to specific commit'
2872
2875
  def jump(commit_sha1)
2873
2876
  begin
2874
2877
  verify_logged_in()
@@ -3003,11 +3006,12 @@ module Cnvrg
3003
3006
  method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
3004
3007
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
3005
3008
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
3006
- method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
3009
+ method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
3007
3010
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
3008
3011
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
3009
3012
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
3010
3013
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
3014
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
3011
3015
 
3012
3016
  def sync(direct = true)
3013
3017
  verify_logged_in(true) if direct
@@ -3030,10 +3034,10 @@ module Cnvrg
3030
3034
  if run_download or options['debug_mode']
3031
3035
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
3032
3036
  end
3033
- invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3037
+ invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3034
3038
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
3035
3039
  :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
3036
- :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
3040
+ :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
3037
3041
 
3038
3042
  end
3039
3043
 
@@ -3199,6 +3203,7 @@ module Cnvrg
3199
3203
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3200
3204
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3205
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3206
+ method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3202
3207
 
3203
3208
  def exec(*cmd)
3204
3209
  log = []
@@ -3222,6 +3227,7 @@ module Cnvrg
3222
3227
  output_dir = options['output_dir'] || "output"
3223
3228
  project_home = get_project_home
3224
3229
  data_query = options["data_query"]
3230
+ docker_stats = options["docker_stats"]
3225
3231
  @project = Project.new(project_home)
3226
3232
  if @project.is_git
3227
3233
  sync_before = false
@@ -3294,80 +3300,67 @@ module Cnvrg
3294
3300
  stdout, stderr = '', ''
3295
3301
  begin
3296
3302
  process_running = true
3297
- stats_thread = Thread.new do
3298
- while process_running do
3299
- sleep 30
3300
- begin
3301
- stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3302
- if is_on_gpu
3303
- gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3304
- stats['gpu_util'] = gu[0]
3305
- stats['gpu'] = gu[1]
3303
+ if docker_stats
3304
+ stats_thread = Thread.new do
3305
+ while process_running do
3306
+ sleep 30
3307
+ begin
3308
+ stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
3309
+ if is_on_gpu
3310
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3311
+ stats['gpu_util'] = gu[0]
3312
+ stats['gpu'] = gu[1]
3313
+ end
3314
+ @exp.send_machine_stats [stats] unless stats.empty?
3315
+ rescue => e
3316
+ log_error(e)
3317
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3306
3318
  end
3307
- @exp.send_machine_stats [stats] unless stats.empty?
3308
- rescue => e
3309
- log_error(e)
3310
- log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3311
3319
  end
3312
3320
  end
3313
3321
  end
3314
3322
  start_time = Time.now
3315
- shell_type = options["use_bash"] ? "bash -l" : "sh"
3316
3323
  if @exp.get_cmd.present?
3317
3324
  cmd = @exp.get_cmd
3318
- if options["docker_id"].present? # Escape for docker exec
3319
- cmd = cmd.gsub("\"", "\\\"")
3320
- end
3321
3325
  end
3322
- if options["docker_id"].present?
3323
- cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3326
+ command_slug = (0...18).map { (65 + rand(26)).chr }.join
3327
+ result_file = "/conf/result-#{command_slug}"
3328
+ data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
3329
+
3330
+ conn = Cnvrg::Helpers::Executer.get_main_conn
3331
+ response = conn.post('command', data.to_json)
3332
+ if response.to_hash[:status].to_i != 200
3333
+ exit_status = 129
3334
+ raise StandardError.new("Cant send command to slave")
3324
3335
  end
3325
- PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3336
+ t = FileWatch::Tail.new
3337
+ filename = result_file
3338
+ lines = []
3339
+ t.tail(filename)
3340
+ t.subscribe do |path, line|
3326
3341
  begin
3327
- stdout.each do |line|
3328
- cur_time = Time.now
3329
- real_time = Time.now - real
3330
- cur_log = {time: cur_time,
3331
- message: line,
3332
- type: "stdout",
3333
- real: real_time
3334
- }
3335
- if print_log
3336
- puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
3337
- end
3342
+ cur_log = JSON.parse(line)
3343
+ if cur_log["type"] == "endMessage"
3344
+ exit_status = cur_log["real"].to_i
3345
+ break
3346
+ else
3347
+ puts(cur_log.to_json)
3348
+ STDOUT.flush
3349
+ cur_log["time"] = Time.parse(cur_log["timestamp"])
3350
+ cur_log["message"] = cur_log["message"].to_s + "\r\n"
3338
3351
  log << cur_log
3339
- if log.size >= 10
3340
- @exp.upload_temp_log(log) unless log.empty?
3341
- log = []
3342
- elsif (start_time + 15.seconds) <= Time.now
3343
- @exp.upload_temp_log(log) unless log.empty?
3344
- log = []
3345
- start_time = Time.now
3346
- end
3347
3352
  end
3348
- if stderr
3349
- stderr.each do |err|
3350
- log << {time: Time.now, message: err, type: "stderr"}
3351
- end
3352
- end
3353
- rescue Errno::EIO => e
3354
- log_error(e)
3355
- if !log.empty?
3356
- temp_log = log
3357
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3358
- log -= temp_log
3353
+ if log.size >= 10
3354
+ @exp.upload_temp_log(log)
3355
+ log = []
3356
+ elsif (start_time + 15.seconds) <= Time.now
3357
+ @exp.upload_temp_log(log) unless log.empty?
3358
+ log = []
3359
+ start_time = Time.now
3359
3360
  end
3360
- rescue Errno::ENOENT => e
3361
- exp_success = false
3362
- log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
3363
- log_error(e)
3364
3361
  rescue => e
3365
- res = @exp.end(log, 1, start_commit, 0, 0)
3366
- log_message("Error occurred,aborting", Thor::Shell::Color::RED)
3367
3362
  log_error(e)
3368
- exit(0)
3369
3363
  end
3370
- ::Process.wait pid
3371
3364
  end
3372
3365
  end_time = Time.now
3373
3366
  process_running = false
@@ -3375,14 +3368,13 @@ module Cnvrg
3375
3368
  if !log.empty?
3376
3369
 
3377
3370
  temp_log = log
3378
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3371
+ @exp.upload_temp_log(temp_log)
3379
3372
  log -= temp_log
3380
3373
  end
3381
3374
 
3382
3375
  cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
3383
3376
  memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
3384
- exit_status = $?.exitstatus
3385
- if $?.exitstatus != 0
3377
+ if exit_status != 0
3386
3378
  exp_success = false
3387
3379
  end
3388
3380
 
@@ -3405,7 +3397,7 @@ module Cnvrg
3405
3397
  end
3406
3398
 
3407
3399
  # log_thread.join
3408
- stats_thread.join
3400
+ stats_thread.join if docker_stats
3409
3401
 
3410
3402
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3411
3403
 
@@ -3425,8 +3417,7 @@ module Cnvrg
3425
3417
  log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
3426
3418
  if @exp
3427
3419
  # log_thread.join
3428
- Thread.kill(stats_thread)
3429
- exit_status = $?.exitstatus
3420
+ Thread.kill(stats_thread) if docker_stats
3430
3421
  if exit_status.blank?
3431
3422
  exit_status = "-1"
3432
3423
  end
@@ -3439,8 +3430,6 @@ module Cnvrg
3439
3430
 
3440
3431
  exit(1)
3441
3432
  end
3442
-
3443
-
3444
3433
  end
3445
3434
 
3446
3435
  end
@@ -3449,7 +3438,7 @@ module Cnvrg
3449
3438
  end_commit = @project.last_local_commit
3450
3439
  process_running = false
3451
3440
  # log_thread.join
3452
- stats_thread.join
3441
+ stats_thread.join if docker_stats
3453
3442
 
3454
3443
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3455
3444
  if container
@@ -3685,7 +3674,7 @@ module Cnvrg
3685
3674
  end
3686
3675
  end
3687
3676
 
3688
- desc 'deploy', 'Deploys model to production'
3677
+ desc 'deploy', 'Deploys model to production', :hide => true
3689
3678
  method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
3690
3679
  method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
3691
3680
  method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
@@ -3774,7 +3763,7 @@ module Cnvrg
3774
3763
  method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
3775
3764
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
3776
3765
 
3777
- desc 'notebook', 'Starts a notebook session remotely or locally'
3766
+ desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
3778
3767
 
3779
3768
  def notebook
3780
3769
  verify_logged_in(true)
@@ -3901,7 +3890,7 @@ module Cnvrg
3901
3890
  end
3902
3891
  end
3903
3892
 
3904
- desc 'remote_notebook', 'Run notebook server on remote server'
3893
+ desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
3905
3894
  method_option :machine_type, :type => :string, :default => ""
3906
3895
  method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
3907
3896
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
@@ -4260,7 +4249,7 @@ module Cnvrg
4260
4249
 
4261
4250
  end
4262
4251
 
4263
- desc 'notebook_stop', 'Starts a new notebook environment'
4252
+ desc 'notebook_stop', 'Stop notebook', :hide => true
4264
4253
  method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
4265
4254
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
4266
4255
  method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
@@ -4640,6 +4629,108 @@ module Cnvrg
4640
4629
  end
4641
4630
  end
4642
4631
 
4632
+ desc 'Collect and send job utilization', '', :hide => true
4633
+ method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
4634
+ method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
4635
+ method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
4636
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4637
+ method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4638
+ method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4639
+ method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
4640
+ method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
4641
+ method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
4642
+
4643
+ def collect_metrics
4644
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4645
+ prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4646
+ prom_user = options[:prom_user]
4647
+ prom_password = options[:prom_password]
4648
+ name = options[:name]
4649
+
4650
+ translate_result = Cnvrg::API_V2.request(
4651
+ "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4652
+ 'GET',
4653
+ { gpu: options[:gpu], gaudi: options[:gaudi] }
4654
+ )
4655
+
4656
+ is_machine = options[:machine]
4657
+ while true do
4658
+ begin
4659
+ stats = {}
4660
+ translate_result.each do |query_name, metric|
4661
+ if is_machine
4662
+ metric_query = metric['machine_query'].presence || metric['query']
4663
+ query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
4664
+ else
4665
+ metric_query = metric['cluster_query'].presence || metric['query']
4666
+ pod_name = `hostname`.strip
4667
+ query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
4668
+ end
4669
+ if metric_query.blank? || query_content.blank?
4670
+ next
4671
+ end
4672
+ uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4673
+ http = Net::HTTP.new(uri.host, uri.port)
4674
+ http.use_ssl = uri.scheme == "https"
4675
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
4676
+ req = Net::HTTP::Get.new uri.request_uri
4677
+ if prom_user.present?
4678
+ req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
4679
+ end
4680
+ resp = http.request(req)
4681
+ begin
4682
+ result = JSON.parse(resp.body)
4683
+ rescue JSON::ParserError => e
4684
+ log_error(e)
4685
+ next
4686
+ end
4687
+ data_result = result&.dig('data', 'result')
4688
+ next unless data_result
4689
+
4690
+ if data_result.size > 1
4691
+ stats[query_name] = {} unless query_name.include? 'block'
4692
+ data_result.each_with_index do |res, i|
4693
+ timestamp, value = res["value"]
4694
+ uuid = res["metric"]["UUID"].presence || i
4695
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4696
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4697
+ if query_name.include? 'block'
4698
+ uuid = res["metric"]["interface"].presence || i
4699
+ uuid = "#{name}-#{uuid}" if name.present?
4700
+ stats['block_io'] = {} if stats['block_io'].blank?
4701
+ io_type = query_name.split('_')[1]
4702
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4703
+ stats['block_io'][io_type].merge!({ uuid => stat_value })
4704
+ else
4705
+ stats[query_name][uuid] = stat_value
4706
+ end
4707
+ end
4708
+ else
4709
+ timestamp, value = data_result&.first&.dig('value')
4710
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4711
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4712
+ if query_name.include? 'block'
4713
+ stats['block_io'] = {} if stats['block_io'].blank?
4714
+ io_type = query_name.split('_')[1]
4715
+ if name.present?
4716
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4717
+ stats['block_io'][io_type].merge!({ name => stat_value })
4718
+ else
4719
+ stats['block_io'].merge!({ io_type => stat_value })
4720
+ end
4721
+ else
4722
+ stats[query_name] = name.present? ? { name => stat_value } : stat_value
4723
+ end
4724
+ end
4725
+ end
4726
+ @exp.send_machine_stats [stats] unless stats.empty?
4727
+ rescue => e
4728
+ log_error(e)
4729
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4730
+ end
4731
+ sleep options[:wait]
4732
+ end
4733
+ end
4643
4734
 
4644
4735
  desc '', '', :hide => true
4645
4736
 
@@ -4672,7 +4763,7 @@ module Cnvrg
4672
4763
  end
4673
4764
 
4674
4765
 
4675
- desc '', ''
4766
+ desc '', '', :hide => true
4676
4767
 
4677
4768
  def download_built_image(image_name, image_slug)
4678
4769
  begin