cnvrg 1.11.28 → 1.11.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e617f50977eafa031b94bed3ba18753ecb4529924f74182525a5c6a53410fc5f
4
- data.tar.gz: 40b755a41a73611dbc8f48802352ca73cac613c2e6d3fa70bf823e9e24f7c3f5
3
+ metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
4
+ data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
5
5
  SHA512:
6
- metadata.gz: 51abb620c15a66be237a4392ec7fad73e1a5f865f3932934b938028ee0b54558772b8dcc292c77379bbbe273c1771f2b3e85d250d492cafa0b0bc681cb7a52dc
7
- data.tar.gz: 06fe73c3d4246c41b43923736c2ece0680a4f04522a317acdc576b6191d5ca74057f55bc6c5120456ac8ef0c5cfe7f328317655799b0d3d3c910db1cde871ad7
6
+ metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
7
+ data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
data/Readme.md ADDED
@@ -0,0 +1,17 @@
1
+
2
+ ## Version v1.11.15
3
+ 2021-03-30
4
+ * DEV-208 - Task: Make sure the index name is constant over days
5
+ * DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from Template.
6
+ * DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
7
+ * DEV-7928 - Bug: CLI - cnvrg clone doesn't show log message when files not found
8
+ * DEV-7956 - Bug: CLI crashes from progressbar
9
+ * DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash at the end of the URL path causes a unique index error
10
+ * DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load sts, therefore clone crashed
11
+ * DEV-8159 - New Feature: Oauth Proxy
12
+ * DEV-8179 - New Feature: Add auto cache and link files in cache clone
13
+ * DEV-8208 - Bug: Cli - cnvrg data put fails
14
+ * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
+ * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
+ * DEV-8539 - Bug: SDK - in Windows: e.sync doesn't perform sync
17
+ * DEV-8621 - Improvement: Add more metrics
data/cnvrg.gemspec CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
 
15
15
  #spec.files = `git ls-files`.split($/)
16
16
  spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.executables = ['cnvrg']
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.executables = ['cnvrg']
19
19
  spec.require_paths = ['lib']
20
20
 
21
21
  spec.add_development_dependency 'bundler'
@@ -23,14 +23,14 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency 'rspec', '~> 3.0'
24
24
  spec.add_development_dependency 'vcr', '~> 3.0'
25
25
  spec.add_development_dependency 'aruba'
26
- spec.add_development_dependency 'pry'
27
-
28
- spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.2'
26
+ spec.add_development_dependency 'pry'
27
+
28
+ spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
29
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
30
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
- spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
33
+ spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
34
34
  spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
data/lib/cnvrg/cli.rb CHANGED
@@ -3199,6 +3199,7 @@ module Cnvrg
3199
3199
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3200
3200
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3201
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3202
+ method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3202
3203
 
3203
3204
  def exec(*cmd)
3204
3205
  log = []
@@ -3222,6 +3223,7 @@ module Cnvrg
3222
3223
  output_dir = options['output_dir'] || "output"
3223
3224
  project_home = get_project_home
3224
3225
  data_query = options["data_query"]
3226
+ docker_stats = options["docker_stats"]
3225
3227
  @project = Project.new(project_home)
3226
3228
  if @project.is_git
3227
3229
  sync_before = false
@@ -3294,20 +3296,22 @@ module Cnvrg
3294
3296
  stdout, stderr = '', ''
3295
3297
  begin
3296
3298
  process_running = true
3297
- stats_thread = Thread.new do
3298
- while process_running do
3299
- sleep 30
3300
- begin
3301
- stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3302
- if is_on_gpu
3303
- gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3304
- stats['gpu_util'] = gu[0]
3305
- stats['gpu'] = gu[1]
3299
+ if docker_stats
3300
+ stats_thread = Thread.new do
3301
+ while process_running do
3302
+ sleep 30
3303
+ begin
3304
+ stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
3305
+ if is_on_gpu
3306
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3307
+ stats['gpu_util'] = gu[0]
3308
+ stats['gpu'] = gu[1]
3309
+ end
3310
+ @exp.send_machine_stats [stats] unless stats.empty?
3311
+ rescue => e
3312
+ log_error(e)
3313
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3306
3314
  end
3307
- @exp.send_machine_stats [stats] unless stats.empty?
3308
- rescue => e
3309
- log_error(e)
3310
- log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3311
3315
  end
3312
3316
  end
3313
3317
  end
@@ -3405,7 +3409,7 @@ module Cnvrg
3405
3409
  end
3406
3410
 
3407
3411
  # log_thread.join
3408
- stats_thread.join
3412
+ stats_thread.join if docker_stats
3409
3413
 
3410
3414
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3411
3415
 
@@ -3425,7 +3429,7 @@ module Cnvrg
3425
3429
  log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
3426
3430
  if @exp
3427
3431
  # log_thread.join
3428
- Thread.kill(stats_thread)
3432
+ Thread.kill(stats_thread) if docker_stats
3429
3433
  exit_status = $?.exitstatus
3430
3434
  if exit_status.blank?
3431
3435
  exit_status = "-1"
@@ -3449,7 +3453,7 @@ module Cnvrg
3449
3453
  end_commit = @project.last_local_commit
3450
3454
  process_running = false
3451
3455
  # log_thread.join
3452
- stats_thread.join
3456
+ stats_thread.join if docker_stats
3453
3457
 
3454
3458
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3455
3459
  if container
@@ -4640,6 +4644,81 @@ module Cnvrg
4640
4644
  end
4641
4645
  end
4642
4646
 
4647
+ desc 'Collect and send job utilization', '', :hide => true
4648
+ method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
4649
+ method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
4650
+ method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
4651
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4652
+ method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4653
+ method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4654
+
4655
+ def collect_metrics
4656
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4657
+ prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4658
+
4659
+ translate_result = Cnvrg::API_V2.request(
4660
+ "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4661
+ 'GET',
4662
+ { gpu: options[:gpu] }
4663
+ )
4664
+
4665
+ is_machine = options[:machine]
4666
+ while true do
4667
+ begin
4668
+ stats = {}
4669
+ translate_result.each do |query_name, metric|
4670
+ if is_machine
4671
+ metric_query = metric['machine_query'].presence || metric['query']
4672
+ query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
4673
+ else
4674
+ metric_query = metric['cluster_query'].presence || metric['query']
4675
+ pod_name = `hostname`.strip
4676
+ query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
4677
+ end
4678
+ if metric_query.blank? || query_content.blank?
4679
+ next
4680
+ end
4681
+ uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4682
+ resp = Net::HTTP.get(uri)
4683
+ begin
4684
+ result = JSON.parse(resp)
4685
+ rescue JSON::ParserError => e
4686
+ log_error(e)
4687
+ next
4688
+ end
4689
+ data_result = result&.dig('data', 'result')
4690
+ next unless data_result
4691
+
4692
+ if data_result.size > 1
4693
+ stats[query_name] = {}
4694
+ data_result.each_with_index do |res, i|
4695
+ timestamp, value = res["value"]
4696
+ uuid = res["metric"]["UUID"].presence || i
4697
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4698
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4699
+ stats[query_name][uuid] = stat_value
4700
+ end
4701
+ else
4702
+ timestamp, value = data_result&.first&.dig('value')
4703
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4704
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4705
+ if query_name.include? 'block'
4706
+ stats['block_io'] = {} if stats['block_io'].blank?
4707
+ io_type = query_name.split('_')[1]
4708
+ stats['block_io'].merge!({ io_type => stat_value })
4709
+ else
4710
+ stats[query_name] = stat_value
4711
+ end
4712
+ end
4713
+ end
4714
+ @exp.send_machine_stats [stats] unless stats.empty?
4715
+ rescue => e
4716
+ log_error(e)
4717
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4718
+ end
4719
+ sleep options[:wait]
4720
+ end
4721
+ end
4643
4722
 
4644
4723
  desc '', '', :hide => true
4645
4724
 
@@ -133,23 +133,30 @@ module Cnvrg
133
133
  return response
134
134
  end
135
135
  def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
136
- response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
137
- dataset_commit: data_commit,image_slug:image,
138
- datasets: datasets,
139
- commit:commit,notebook_type:notebook_type,dataset_sync_options:ds_sync_options,
140
- dataset_query:data_query})
136
+ response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
137
+ dataset_commit: data_commit, image_slug:image,
138
+ datasets: datasets,
139
+ commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
140
+ dataset_query: data_query })
141
141
  return response
142
142
  end
143
143
 
144
144
  def upload_temp_log(temp_log)
145
- response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
146
- exp_slug: @slug})
147
- Cnvrg::CLI.is_response_success(response,false)
145
+ response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
146
+ exp_slug: @slug })
147
+ Cnvrg::CLI.is_response_success(response, false)
148
148
  end
149
149
 
150
150
  def send_machine_stats(stats)
151
- response = Cnvrg::API.request(@base_resource + "experiment/upload_stats", "POST", {exp_slug: @slug, stats: stats.map{|s| s.merge!({time: Time.now})}})
152
- Cnvrg::CLI.is_response_success(response,false)
151
+ response = Cnvrg::API.request(
152
+ @base_resource + "experiment/upload_stats",
153
+ "POST",
154
+ {
155
+ exp_slug: @slug,
156
+ stats: stats.map { |s| s.merge!({ time: Time.now }) }
157
+ }
158
+ )
159
+ Cnvrg::CLI.is_response_success(response, false)
153
160
  end
154
161
 
155
162
  def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
@@ -158,7 +165,7 @@ module Cnvrg
158
165
  success = false
159
166
  end_time ||= Time.now
160
167
  while tries < 10 and success.blank?
161
- sleep (tries*rand) ** 2 ### exponential backoff
168
+ sleep (tries * rand) ** 2 ### exponential backoff
162
169
  ## this call is super important so we cant let it crash.
163
170
 
164
171
  tries += 1
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
201
201
  pod_name = `hostname`.strip rescue nil
202
202
  node_name = nil
203
203
  if pod_name.present?
204
- pod_describe = `kubectl -n cnvrg get pod #{pod_name} -o json` rescue nil
204
+ pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
205
205
  pod_describe = JSON.parse(pod_describe) rescue {}
206
206
  node_name = pod_describe["spec"]["nodeName"] rescue nil
207
207
  end
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
217
217
 
218
218
  def get_pod_events(pod_name)
219
219
  return if pod_name.blank?
220
- `kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json`
220
+ `kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
221
221
  end
222
222
 
223
223
  def get_node_events(node_name)
data/lib/cnvrg/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.28'
2
+ VERSION = '1.11.29'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.28
4
+ version: 1.11.29
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-03-25 00:00:00.000000000 Z
13
+ date: 2021-03-30 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
@@ -105,7 +105,7 @@ dependencies:
105
105
  version: 0.3.1
106
106
  - - ">="
107
107
  - !ruby/object:Gem::Version
108
- version: 0.3.2
108
+ version: 0.3.7
109
109
  type: :runtime
110
110
  prerelease: false
111
111
  version_requirements: !ruby/object:Gem::Requirement
@@ -115,7 +115,7 @@ dependencies:
115
115
  version: 0.3.1
116
116
  - - ">="
117
117
  - !ruby/object:Gem::Version
118
- version: 0.3.2
118
+ version: 0.3.7
119
119
  - !ruby/object:Gem::Dependency
120
120
  name: faraday
121
121
  requirement: !ruby/object:Gem::Requirement
@@ -394,6 +394,7 @@ executables:
394
394
  extensions: []
395
395
  extra_rdoc_files: []
396
396
  files:
397
+ - Readme.md
397
398
  - bin/cnvrg
398
399
  - cnvrg.gemspec
399
400
  - lib/cnvrg.rb