cnvrg 1.11.28 → 1.11.29

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e617f50977eafa031b94bed3ba18753ecb4529924f74182525a5c6a53410fc5f
4
- data.tar.gz: 40b755a41a73611dbc8f48802352ca73cac613c2e6d3fa70bf823e9e24f7c3f5
3
+ metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
4
+ data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
5
5
  SHA512:
6
- metadata.gz: 51abb620c15a66be237a4392ec7fad73e1a5f865f3932934b938028ee0b54558772b8dcc292c77379bbbe273c1771f2b3e85d250d492cafa0b0bc681cb7a52dc
7
- data.tar.gz: 06fe73c3d4246c41b43923736c2ece0680a4f04522a317acdc576b6191d5ca74057f55bc6c5120456ac8ef0c5cfe7f328317655799b0d3d3c910db1cde871ad7
6
+ metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
7
+ data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
data/Readme.md ADDED
@@ -0,0 +1,17 @@
1
+
2
+ ## Version v1.11.15
3
+ 2021-03-30
4
+ * DEV-208 - Task: Make sure the index name is constant over days
5
+ * DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from the Template.
6
+ * DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
7
+ * DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
8
+ * DEV-7956 - Bug: CLI crashes from progressbar
9
+ * DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash in the URL path causes a unique index error
10
+ * DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load STS, therefore the clone crashed
11
+ * DEV-8159 - New Feature: Oauth Proxy
12
+ * DEV-8179 - New Feature: Add auto cache and link files in cache clone
13
+ * DEV-8208 - Bug: Cli - cnvrg data put fails
14
+ * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
+ * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
+ * DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
17
+ * DEV-8621 - Improvement: Add more metrics
data/cnvrg.gemspec CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
 
15
15
  #spec.files = `git ls-files`.split($/)
16
16
  spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.executables = ['cnvrg']
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.executables = ['cnvrg']
19
19
  spec.require_paths = ['lib']
20
20
 
21
21
  spec.add_development_dependency 'bundler'
@@ -23,14 +23,14 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency 'rspec', '~> 3.0'
24
24
  spec.add_development_dependency 'vcr', '~> 3.0'
25
25
  spec.add_development_dependency 'aruba'
26
- spec.add_development_dependency 'pry'
27
-
28
- spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.2'
26
+ spec.add_development_dependency 'pry'
27
+
28
+ spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
29
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
30
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
- spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
33
+ spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
34
34
  spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
data/lib/cnvrg/cli.rb CHANGED
@@ -3199,6 +3199,7 @@ module Cnvrg
3199
3199
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3200
3200
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3201
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3202
+ method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3202
3203
 
3203
3204
  def exec(*cmd)
3204
3205
  log = []
@@ -3222,6 +3223,7 @@ module Cnvrg
3222
3223
  output_dir = options['output_dir'] || "output"
3223
3224
  project_home = get_project_home
3224
3225
  data_query = options["data_query"]
3226
+ docker_stats = options["docker_stats"]
3225
3227
  @project = Project.new(project_home)
3226
3228
  if @project.is_git
3227
3229
  sync_before = false
@@ -3294,20 +3296,22 @@ module Cnvrg
3294
3296
  stdout, stderr = '', ''
3295
3297
  begin
3296
3298
  process_running = true
3297
- stats_thread = Thread.new do
3298
- while process_running do
3299
- sleep 30
3300
- begin
3301
- stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3302
- if is_on_gpu
3303
- gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3304
- stats['gpu_util'] = gu[0]
3305
- stats['gpu'] = gu[1]
3299
+ if docker_stats
3300
+ stats_thread = Thread.new do
3301
+ while process_running do
3302
+ sleep 30
3303
+ begin
3304
+ stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
3305
+ if is_on_gpu
3306
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3307
+ stats['gpu_util'] = gu[0]
3308
+ stats['gpu'] = gu[1]
3309
+ end
3310
+ @exp.send_machine_stats [stats] unless stats.empty?
3311
+ rescue => e
3312
+ log_error(e)
3313
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3306
3314
  end
3307
- @exp.send_machine_stats [stats] unless stats.empty?
3308
- rescue => e
3309
- log_error(e)
3310
- log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3311
3315
  end
3312
3316
  end
3313
3317
  end
@@ -3405,7 +3409,7 @@ module Cnvrg
3405
3409
  end
3406
3410
 
3407
3411
  # log_thread.join
3408
- stats_thread.join
3412
+ stats_thread.join if docker_stats
3409
3413
 
3410
3414
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3411
3415
 
@@ -3425,7 +3429,7 @@ module Cnvrg
3425
3429
  log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
3426
3430
  if @exp
3427
3431
  # log_thread.join
3428
- Thread.kill(stats_thread)
3432
+ Thread.kill(stats_thread) if docker_stats
3429
3433
  exit_status = $?.exitstatus
3430
3434
  if exit_status.blank?
3431
3435
  exit_status = "-1"
@@ -3449,7 +3453,7 @@ module Cnvrg
3449
3453
  end_commit = @project.last_local_commit
3450
3454
  process_running = false
3451
3455
  # log_thread.join
3452
- stats_thread.join
3456
+ stats_thread.join if docker_stats
3453
3457
 
3454
3458
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3455
3459
  if container
@@ -4640,6 +4644,81 @@ module Cnvrg
4640
4644
  end
4641
4645
  end
4642
4646
 
4647
+ desc 'Collect and send job utilization', '', :hide => true
4648
+ method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
4649
+ method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
4650
+ method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
4651
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4652
+ method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4653
+ method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4654
+
4655
+ def collect_metrics
4656
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4657
+ prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4658
+
4659
+ translate_result = Cnvrg::API_V2.request(
4660
+ "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4661
+ 'GET',
4662
+ { gpu: options[:gpu] }
4663
+ )
4664
+
4665
+ is_machine = options[:machine]
4666
+ while true do
4667
+ begin
4668
+ stats = {}
4669
+ translate_result.each do |query_name, metric|
4670
+ if is_machine
4671
+ metric_query = metric['machine_query'].presence || metric['query']
4672
+ query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
4673
+ else
4674
+ metric_query = metric['cluster_query'].presence || metric['query']
4675
+ pod_name = `hostname`.strip
4676
+ query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
4677
+ end
4678
+ if metric_query.blank? || query_content.blank?
4679
+ next
4680
+ end
4681
+ uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4682
+ resp = Net::HTTP.get(uri)
4683
+ begin
4684
+ result = JSON.parse(resp)
4685
+ rescue JSON::ParserError => e
4686
+ log_error(e)
4687
+ next
4688
+ end
4689
+ data_result = result&.dig('data', 'result')
4690
+ next unless data_result
4691
+
4692
+ if data_result.size > 1
4693
+ stats[query_name] = {}
4694
+ data_result.each_with_index do |res, i|
4695
+ timestamp, value = res["value"]
4696
+ uuid = res["metric"]["UUID"].presence || i
4697
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4698
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4699
+ stats[query_name][uuid] = stat_value
4700
+ end
4701
+ else
4702
+ timestamp, value = data_result&.first&.dig('value')
4703
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4704
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4705
+ if query_name.include? 'block'
4706
+ stats['block_io'] = {} if stats['block_io'].blank?
4707
+ io_type = query_name.split('_')[1]
4708
+ stats['block_io'].merge!({ io_type => stat_value })
4709
+ else
4710
+ stats[query_name] = stat_value
4711
+ end
4712
+ end
4713
+ end
4714
+ @exp.send_machine_stats [stats] unless stats.empty?
4715
+ rescue => e
4716
+ log_error(e)
4717
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4718
+ end
4719
+ sleep options[:wait]
4720
+ end
4721
+ end
4643
4722
 
4644
4723
  desc '', '', :hide => true
4645
4724
 
@@ -133,23 +133,30 @@ module Cnvrg
133
133
  return response
134
134
  end
135
135
  def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
136
- response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
137
- dataset_commit: data_commit,image_slug:image,
138
- datasets: datasets,
139
- commit:commit,notebook_type:notebook_type,dataset_sync_options:ds_sync_options,
140
- dataset_query:data_query})
136
+ response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
137
+ dataset_commit: data_commit, image_slug:image,
138
+ datasets: datasets,
139
+ commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
140
+ dataset_query: data_query })
141
141
  return response
142
142
  end
143
143
 
144
144
  def upload_temp_log(temp_log)
145
- response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
146
- exp_slug: @slug})
147
- Cnvrg::CLI.is_response_success(response,false)
145
+ response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
146
+ exp_slug: @slug })
147
+ Cnvrg::CLI.is_response_success(response, false)
148
148
  end
149
149
 
150
150
  def send_machine_stats(stats)
151
- response = Cnvrg::API.request(@base_resource + "experiment/upload_stats", "POST", {exp_slug: @slug, stats: stats.map{|s| s.merge!({time: Time.now})}})
152
- Cnvrg::CLI.is_response_success(response,false)
151
+ response = Cnvrg::API.request(
152
+ @base_resource + "experiment/upload_stats",
153
+ "POST",
154
+ {
155
+ exp_slug: @slug,
156
+ stats: stats.map { |s| s.merge!({ time: Time.now }) }
157
+ }
158
+ )
159
+ Cnvrg::CLI.is_response_success(response, false)
153
160
  end
154
161
 
155
162
  def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
@@ -158,7 +165,7 @@ module Cnvrg
158
165
  success = false
159
166
  end_time ||= Time.now
160
167
  while tries < 10 and success.blank?
161
- sleep (tries*rand) ** 2 ### exponential backoff
168
+ sleep (tries * rand) ** 2 ### exponential backoff
162
169
  ## this call is super important so we cant let it crash.
163
170
 
164
171
  tries += 1
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
201
201
  pod_name = `hostname`.strip rescue nil
202
202
  node_name = nil
203
203
  if pod_name.present?
204
- pod_describe = `kubectl -n cnvrg get pod #{pod_name} -o json` rescue nil
204
+ pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
205
205
  pod_describe = JSON.parse(pod_describe) rescue {}
206
206
  node_name = pod_describe["spec"]["nodeName"] rescue nil
207
207
  end
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
217
217
 
218
218
  def get_pod_events(pod_name)
219
219
  return if pod_name.blank?
220
- `kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json`
220
+ `kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
221
221
  end
222
222
 
223
223
  def get_node_events(node_name)
data/lib/cnvrg/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.28'
2
+ VERSION = '1.11.29'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.28
4
+ version: 1.11.29
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-03-25 00:00:00.000000000 Z
13
+ date: 2021-03-30 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
@@ -105,7 +105,7 @@ dependencies:
105
105
  version: 0.3.1
106
106
  - - ">="
107
107
  - !ruby/object:Gem::Version
108
- version: 0.3.2
108
+ version: 0.3.7
109
109
  type: :runtime
110
110
  prerelease: false
111
111
  version_requirements: !ruby/object:Gem::Requirement
@@ -115,7 +115,7 @@ dependencies:
115
115
  version: 0.3.1
116
116
  - - ">="
117
117
  - !ruby/object:Gem::Version
118
- version: 0.3.2
118
+ version: 0.3.7
119
119
  - !ruby/object:Gem::Dependency
120
120
  name: faraday
121
121
  requirement: !ruby/object:Gem::Requirement
@@ -394,6 +394,7 @@ executables:
394
394
  extensions: []
395
395
  extra_rdoc_files: []
396
396
  files:
397
+ - Readme.md
397
398
  - bin/cnvrg
398
399
  - cnvrg.gemspec
399
400
  - lib/cnvrg.rb