cnvrg 1.11.26 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d5b1b849c4b05132d0dbdd9f23e767ecd0080b7ae01a96a9aad5bd5b0c7327e6
4
- data.tar.gz: e4d57db1c3dede3fe7907ade6be0787d950d5b17db380c7032b41a7df12de577
3
+ metadata.gz: ab82882b2bb6c9093751cd560eaa4ccb7540fad9b9a34a81245721538dd37a5b
4
+ data.tar.gz: e35299be744d985a37794288a269ba2bb55cf64d3fcd702c6c6147bd4f5d740d
5
5
  SHA512:
6
- metadata.gz: 5a8755391132d2f30c2a7c35ab4fb3c12bbb3a6f3e29df282ca884c4a8917c03944e45639b5b03f9b1a576d65c7d57c9f3323d969271cff15ce973a0f5b42b66
7
- data.tar.gz: 0e22fb319eee5086097ed6108c21e333048ac443b9a1b4813475eb1932aa0857aba97722cdef42a9c4fe0d26784c8a95efd628e5ac4def95f785458750851b95
6
+ metadata.gz: 52b51bb4942583e9ac3eceab24891d252ef7e31d7f1a2a87d5c8f4586f914f33f2d9cc7addc09ab5aa3652a9ba939ef5a37bdb9fc82d18e4e0e0a61126265d17
7
+ data.tar.gz: df735e631778b5d36d296c33903719a2ba7832ce2a9e218eea033ad246f8c4ce044a7b7d5562f68d901a1ee32d62a88559f886da2a0341d8bf7c51b50c25660c
data/Readme.md ADDED
@@ -0,0 +1,26 @@
1
+
2
+ ## Version v1.11.15
3
+ 2021-03-30
4
+ * DEV-208 - Task: Make sure the index name is constant over days
5
+ * DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from Template.
6
+ * DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
7
+ * DEV-7928 - Bug: CLI - cnvrg clone doesn't show log message when files are not found
8
+ * DEV-7956 - Bug: CLI crashes from progressbar
9
+ * DEV-8006 - Bug: CLI - cnvrg data put: a slash at the end of the URL path will cause a unique index error
10
+ * DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load STS, therefore the clone crashed
11
+ * DEV-8159 - New Feature: Oauth Proxy
12
+ * DEV-8179 - New Feature: Add auto cache and link files in cache clone
13
+ * DEV-8208 - Bug: Cli - cnvrg data put fails
14
+ * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
+ * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
+ * DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
17
+ * DEV-8621 - Improvement: Add more metrics
18
+ ## Version v1.11.30
19
+ 2021-04-06
20
+ ## Version v1.11.31
21
+ 2021-04-22
22
+ ## Version v1.11.32
23
+ 2021-05-05
24
+ * DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
25
+ ## Version v2.0.1
26
+ 2021-06-13
data/cnvrg.gemspec CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
 
15
15
  #spec.files = `git ls-files`.split($/)
16
16
  spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.executables = ['cnvrg']
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.executables = ['cnvrg']
19
19
  spec.require_paths = ['lib']
20
20
 
21
21
  spec.add_development_dependency 'bundler'
@@ -25,12 +25,13 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency 'aruba'
26
26
  spec.add_development_dependency 'pry'
27
27
 
28
- spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.2'
28
+ spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
29
+ spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
30
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
31
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
31
32
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
33
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
- spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
+ spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
34
35
  spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
36
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
37
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
38
39
  spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
39
40
  spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
40
41
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
42
+ spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
41
43
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
44
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
45
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
data/lib/cnvrg/api.rb CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
72
72
  if response.to_hash[:status].to_i != 200
73
73
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
74
74
  end
75
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
75
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
76
76
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
77
77
  success = false
78
78
  sleep(5 * retries)
79
- retries +=1
79
+ retries += 1
80
80
  next
81
81
  end
82
82
  rescue => e
@@ -112,11 +112,11 @@ module Cnvrg
112
112
  if response.to_hash[:status].to_i != 200
113
113
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
114
114
  end
115
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
115
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
116
116
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
117
117
  success = false
118
118
  sleep(5 * retries)
119
- retries +=1
119
+ retries += 1
120
120
  next
121
121
  end
122
122
  rescue => e
data/lib/cnvrg/cli.rb CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
173
173
  desc "data [COMMAND]", "Upload and manage datasets", :hide => false
174
174
  subcommand "data", Data
175
175
 
176
- desc "job", "manage running jobs", :hide => false
176
+ desc "job", "manage running jobs", :hide => true
177
177
  subcommand "job", JobCli
178
178
 
179
179
  desc "ssh", "ssh into running jobs", :hide => false
@@ -415,7 +415,7 @@ module Cnvrg
415
415
  end
416
416
  end
417
417
 
418
- desc 'set_compression_path', 'Set compression path'
418
+ desc 'set_compression_path', 'Set compression path', :hide => true
419
419
  method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
420
420
 
421
421
  def set_compression_path(*compression_path)
@@ -2311,6 +2311,7 @@ module Cnvrg
2311
2311
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2312
2312
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2313
2313
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
2314
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
2314
2315
 
2315
2316
  def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
2316
2317
  begin
@@ -2328,6 +2329,8 @@ module Cnvrg
2328
2329
  exp_obj = nil
2329
2330
  end
2330
2331
 
2332
+ local = options["local"]
2333
+
2331
2334
  commit_msg = options["message"]
2332
2335
  if commit_msg.nil? or commit_msg.empty?
2333
2336
  commit_msg = ""
@@ -2349,7 +2352,7 @@ module Cnvrg
2349
2352
  if git_output_dir.ends_with? "/"
2350
2353
  git_output_dir = git_output_dir[0..-2]
2351
2354
  end
2352
- list = @project.generate_output_dir(git_output_dir)
2355
+ list = @project.generate_output_dir(git_output_dir, local: local)
2353
2356
  end
2354
2357
  list += @project.generate_git_diff if options["git_diff"]
2355
2358
  spec_files_to_upload = list
@@ -2668,7 +2671,7 @@ module Cnvrg
2668
2671
  end
2669
2672
  end
2670
2673
 
2671
- desc 'commit before termination', 'Commit job code before termination'
2674
+ desc 'commit before termination', 'Commit job code before termination', :hide => true
2672
2675
  def commit_before_termination()
2673
2676
  job_type = ENV['CNVRG_JOB_TYPE']
2674
2677
  job_id = ENV['CNVRG_JOB_ID']
@@ -2678,7 +2681,7 @@ module Cnvrg
2678
2681
  log_error(e)
2679
2682
  end
2680
2683
 
2681
- desc 'update_job_commit', 'Update job with its last commit'
2684
+ desc 'update_job_commit', 'Update job with its last commit' , :hide => true
2682
2685
  def update_job_commit()
2683
2686
  job_type = ENV['CNVRG_JOB_TYPE']
2684
2687
  job_id = ENV['CNVRG_JOB_ID']
@@ -2868,7 +2871,7 @@ module Cnvrg
2868
2871
 
2869
2872
 
2870
2873
 
2871
- desc 'jump', 'Jump to specific commit'
2874
+ desc 'jump COMMIT_ID', 'Jump to specific commit'
2872
2875
  def jump(commit_sha1)
2873
2876
  begin
2874
2877
  verify_logged_in()
@@ -3008,6 +3011,7 @@ module Cnvrg
3008
3011
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
3009
3012
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
3010
3013
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
3014
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => false
3011
3015
 
3012
3016
  def sync(direct = true)
3013
3017
  verify_logged_in(true) if direct
@@ -3030,10 +3034,10 @@ module Cnvrg
3030
3034
  if run_download or options['debug_mode']
3031
3035
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
3032
3036
  end
3033
- invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3037
+ invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3034
3038
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
3035
3039
  :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
3036
- :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
3040
+ :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
3037
3041
 
3038
3042
  end
3039
3043
 
@@ -3199,6 +3203,7 @@ module Cnvrg
3199
3203
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3200
3204
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3205
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3206
+ method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3202
3207
 
3203
3208
  def exec(*cmd)
3204
3209
  log = []
@@ -3222,6 +3227,7 @@ module Cnvrg
3222
3227
  output_dir = options['output_dir'] || "output"
3223
3228
  project_home = get_project_home
3224
3229
  data_query = options["data_query"]
3230
+ docker_stats = options["docker_stats"]
3225
3231
  @project = Project.new(project_home)
3226
3232
  if @project.is_git
3227
3233
  sync_before = false
@@ -3294,80 +3300,67 @@ module Cnvrg
3294
3300
  stdout, stderr = '', ''
3295
3301
  begin
3296
3302
  process_running = true
3297
- stats_thread = Thread.new do
3298
- while process_running do
3299
- sleep 30
3300
- begin
3301
- stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3302
- if is_on_gpu
3303
- gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3304
- stats['gpu_util'] = gu[0]
3305
- stats['gpu'] = gu[1]
3303
+ if docker_stats
3304
+ stats_thread = Thread.new do
3305
+ while process_running do
3306
+ sleep 30
3307
+ begin
3308
+ stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
3309
+ if is_on_gpu
3310
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3311
+ stats['gpu_util'] = gu[0]
3312
+ stats['gpu'] = gu[1]
3313
+ end
3314
+ @exp.send_machine_stats [stats] unless stats.empty?
3315
+ rescue => e
3316
+ log_error(e)
3317
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3306
3318
  end
3307
- @exp.send_machine_stats [stats] unless stats.empty?
3308
- rescue => e
3309
- log_error(e)
3310
- log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3311
3319
  end
3312
3320
  end
3313
3321
  end
3314
3322
  start_time = Time.now
3315
- shell_type = options["use_bash"] ? "bash -l" : "sh"
3316
3323
  if @exp.get_cmd.present?
3317
3324
  cmd = @exp.get_cmd
3318
- if options["docker_id"].present? # Escape for docker exec
3319
- cmd = cmd.gsub("\"", "\\\"")
3320
- end
3321
3325
  end
3322
- if options["docker_id"].present?
3323
- cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3326
+ command_slug = (0...18).map { (65 + rand(26)).chr }.join
3327
+ result_file = "/conf/result-#{command_slug}"
3328
+ data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
3329
+
3330
+ conn = Cnvrg::Helpers::Executer.get_main_conn
3331
+ response = conn.post('command', data.to_json)
3332
+ if response.to_hash[:status].to_i != 200
3333
+ exit_status = 129
3334
+ raise StandardError.new("Cant send command to slave")
3324
3335
  end
3325
- PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3336
+ t = FileWatch::Tail.new
3337
+ filename = result_file
3338
+ lines = []
3339
+ t.tail(filename)
3340
+ t.subscribe do |path, line|
3326
3341
  begin
3327
- stdout.each do |line|
3328
- cur_time = Time.now
3329
- real_time = Time.now - real
3330
- cur_log = {time: cur_time,
3331
- message: line,
3332
- type: "stdout",
3333
- real: real_time
3334
- }
3335
- if print_log
3336
- puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
3337
- end
3342
+ cur_log = JSON.parse(line)
3343
+ if cur_log["type"] == "endMessage"
3344
+ exit_status = cur_log["real"].to_i
3345
+ break
3346
+ else
3347
+ puts(cur_log.to_json)
3348
+ STDOUT.flush
3349
+ cur_log["time"] = Time.parse(cur_log["timestamp"])
3350
+ cur_log["message"] = cur_log["message"].to_s + "\r\n"
3338
3351
  log << cur_log
3339
- if log.size >= 10
3340
- @exp.upload_temp_log(log) unless log.empty?
3341
- log = []
3342
- elsif (start_time + 15.seconds) <= Time.now
3343
- @exp.upload_temp_log(log) unless log.empty?
3344
- log = []
3345
- start_time = Time.now
3346
- end
3347
3352
  end
3348
- if stderr
3349
- stderr.each do |err|
3350
- log << {time: Time.now, message: err, type: "stderr"}
3351
- end
3353
+ if log.size >= 10
3354
+ @exp.upload_temp_log(log)
3355
+ log = []
3356
+ elsif (start_time + 15.seconds) <= Time.now
3357
+ @exp.upload_temp_log(log) unless log.empty?
3358
+ log = []
3359
+ start_time = Time.now
3352
3360
  end
3353
- rescue Errno::EIO => e
3354
- log_error(e)
3355
- if !log.empty?
3356
- temp_log = log
3357
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3358
- log -= temp_log
3359
- end
3360
- rescue Errno::ENOENT => e
3361
- exp_success = false
3362
- log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
3363
- log_error(e)
3364
3361
  rescue => e
3365
- res = @exp.end(log, 1, start_commit, 0, 0)
3366
- log_message("Error occurred,aborting", Thor::Shell::Color::RED)
3367
3362
  log_error(e)
3368
- exit(0)
3369
3363
  end
3370
- ::Process.wait pid
3371
3364
  end
3372
3365
  end_time = Time.now
3373
3366
  process_running = false
@@ -3375,14 +3368,13 @@ module Cnvrg
3375
3368
  if !log.empty?
3376
3369
 
3377
3370
  temp_log = log
3378
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3371
+ @exp.upload_temp_log(temp_log)
3379
3372
  log -= temp_log
3380
3373
  end
3381
3374
 
3382
3375
  cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
3383
3376
  memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
3384
- exit_status = $?.exitstatus
3385
- if $?.exitstatus != 0
3377
+ if exit_status != 0
3386
3378
  exp_success = false
3387
3379
  end
3388
3380
 
@@ -3405,7 +3397,7 @@ module Cnvrg
3405
3397
  end
3406
3398
 
3407
3399
  # log_thread.join
3408
- stats_thread.join
3400
+ stats_thread.join if docker_stats
3409
3401
 
3410
3402
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3411
3403
 
@@ -3425,8 +3417,7 @@ module Cnvrg
3425
3417
  log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
3426
3418
  if @exp
3427
3419
  # log_thread.join
3428
- Thread.kill(stats_thread)
3429
- exit_status = $?.exitstatus
3420
+ Thread.kill(stats_thread) if docker_stats
3430
3421
  if exit_status.blank?
3431
3422
  exit_status = "-1"
3432
3423
  end
@@ -3439,8 +3430,6 @@ module Cnvrg
3439
3430
 
3440
3431
  exit(1)
3441
3432
  end
3442
-
3443
-
3444
3433
  end
3445
3434
 
3446
3435
  end
@@ -3449,7 +3438,7 @@ module Cnvrg
3449
3438
  end_commit = @project.last_local_commit
3450
3439
  process_running = false
3451
3440
  # log_thread.join
3452
- stats_thread.join
3441
+ stats_thread.join if docker_stats
3453
3442
 
3454
3443
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3455
3444
  if container
@@ -3685,7 +3674,7 @@ module Cnvrg
3685
3674
  end
3686
3675
  end
3687
3676
 
3688
- desc 'deploy', 'Deploys model to production'
3677
+ desc 'deploy', 'Deploys model to production', :hide => true
3689
3678
  method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
3690
3679
  method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
3691
3680
  method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
@@ -3774,7 +3763,7 @@ module Cnvrg
3774
3763
  method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
3775
3764
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
3776
3765
 
3777
- desc 'notebook', 'Starts a notebook session remotely or locally'
3766
+ desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
3778
3767
 
3779
3768
  def notebook
3780
3769
  verify_logged_in(true)
@@ -3901,7 +3890,7 @@ module Cnvrg
3901
3890
  end
3902
3891
  end
3903
3892
 
3904
- desc 'remote_notebook', 'Run notebook server on remote server'
3893
+ desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
3905
3894
  method_option :machine_type, :type => :string, :default => ""
3906
3895
  method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
3907
3896
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
@@ -4260,7 +4249,7 @@ module Cnvrg
4260
4249
 
4261
4250
  end
4262
4251
 
4263
- desc 'notebook_stop', 'Starts a new notebook environment'
4252
+ desc 'notebook_stop', 'Stop notebook', :hide => true
4264
4253
  method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
4265
4254
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
4266
4255
  method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
@@ -4640,6 +4629,108 @@ module Cnvrg
4640
4629
  end
4641
4630
  end
4642
4631
 
4632
+ desc 'Collect and send job utilization', '', :hide => true
4633
+ method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
4634
+ method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
4635
+ method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
4636
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4637
+ method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4638
+ method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4639
+ method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
4640
+ method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
4641
+ method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
4642
+
4643
+ def collect_metrics
4644
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4645
+ prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4646
+ prom_user = options[:prom_user]
4647
+ prom_password = options[:prom_password]
4648
+ name = options[:name]
4649
+
4650
+ translate_result = Cnvrg::API_V2.request(
4651
+ "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4652
+ 'GET',
4653
+ { gpu: options[:gpu], gaudi: options[:gaudi] }
4654
+ )
4655
+
4656
+ is_machine = options[:machine]
4657
+ while true do
4658
+ begin
4659
+ stats = {}
4660
+ translate_result.each do |query_name, metric|
4661
+ if is_machine
4662
+ metric_query = metric['machine_query'].presence || metric['query']
4663
+ query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
4664
+ else
4665
+ metric_query = metric['cluster_query'].presence || metric['query']
4666
+ pod_name = `hostname`.strip
4667
+ query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
4668
+ end
4669
+ if metric_query.blank? || query_content.blank?
4670
+ next
4671
+ end
4672
+ uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4673
+ http = Net::HTTP.new(uri.host, uri.port)
4674
+ http.use_ssl = uri.scheme == "https"
4675
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
4676
+ req = Net::HTTP::Get.new uri.request_uri
4677
+ if prom_user.present?
4678
+ req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
4679
+ end
4680
+ resp = http.request(req)
4681
+ begin
4682
+ result = JSON.parse(resp.body)
4683
+ rescue JSON::ParserError => e
4684
+ log_error(e)
4685
+ next
4686
+ end
4687
+ data_result = result&.dig('data', 'result')
4688
+ next unless data_result
4689
+
4690
+ if data_result.size > 1
4691
+ stats[query_name] = {} unless query_name.include? 'block'
4692
+ data_result.each_with_index do |res, i|
4693
+ timestamp, value = res["value"]
4694
+ uuid = res["metric"]["UUID"].presence || i
4695
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4696
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4697
+ if query_name.include? 'block'
4698
+ uuid = res["metric"]["interface"].presence || i
4699
+ uuid = "#{name}-#{uuid}" if name.present?
4700
+ stats['block_io'] = {} if stats['block_io'].blank?
4701
+ io_type = query_name.split('_')[1]
4702
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4703
+ stats['block_io'][io_type].merge!({ uuid => stat_value })
4704
+ else
4705
+ stats[query_name][uuid] = stat_value
4706
+ end
4707
+ end
4708
+ else
4709
+ timestamp, value = data_result&.first&.dig('value')
4710
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4711
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4712
+ if query_name.include? 'block'
4713
+ stats['block_io'] = {} if stats['block_io'].blank?
4714
+ io_type = query_name.split('_')[1]
4715
+ if name.present?
4716
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4717
+ stats['block_io'][io_type].merge!({ name => stat_value })
4718
+ else
4719
+ stats['block_io'].merge!({ io_type => stat_value })
4720
+ end
4721
+ else
4722
+ stats[query_name] = name.present? ? { name => stat_value } : stat_value
4723
+ end
4724
+ end
4725
+ end
4726
+ @exp.send_machine_stats [stats] unless stats.empty?
4727
+ rescue => e
4728
+ log_error(e)
4729
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4730
+ end
4731
+ sleep options[:wait]
4732
+ end
4733
+ end
4643
4734
 
4644
4735
  desc '', '', :hide => true
4645
4736
 
@@ -4672,7 +4763,7 @@ module Cnvrg
4672
4763
  end
4673
4764
 
4674
4765
 
4675
- desc '', ''
4766
+ desc '', '', :hide => true
4676
4767
 
4677
4768
  def download_built_image(image_name, image_slug)
4678
4769
  begin
@@ -4916,7 +5007,7 @@ module Cnvrg
4916
5007
  end
4917
5008
  end
4918
5009
 
4919
- desc 'experiments', 'List project experiments'
5010
+ desc 'experiments', 'List project experiments', :hide => true
4920
5011
  method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
4921
5012
  method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
4922
5013