cnvrg 1.11.28 → 1.11.29
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +17 -0
- data/cnvrg.gemspec +6 -6
- data/lib/cnvrg/cli.rb +95 -16
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/helpers/executer.rb +2 -2
- data/lib/cnvrg/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
|
4
|
+
data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
|
7
|
+
data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
|
data/Readme.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
## Version v1.11.15
|
3
|
+
2021-03-30
|
4
|
+
* DEV-208 - Task: Make sure the index name is constant over days
|
5
|
+
* DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from Template.
|
6
|
+
* DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
|
7
|
+
* DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
|
8
|
+
* DEV-7956 - Bug: CLI crashes from progressbar
|
9
|
+
* DEV-8006 - Bug: CLI - cnvrg data put: a slash at the end of the URL path causes a unique index error
|
10
|
+
* DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load STS, therefore the clone crashed
|
11
|
+
* DEV-8159 - New Feature: Oauth Proxy
|
12
|
+
* DEV-8179 - New Feature: Add auto cache and link files in cache clone
|
13
|
+
* DEV-8208 - Bug: Cli - cnvrg data put fails
|
14
|
+
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
15
|
+
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
16
|
+
* DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
data/cnvrg.gemspec
CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
|
15
15
|
#spec.files = `git ls-files`.split($/)
|
16
16
|
spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
|
17
|
-
spec.executables
|
18
|
-
spec.executables
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.executables = ['cnvrg']
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler'
|
@@ -23,14 +23,14 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
24
24
|
spec.add_development_dependency 'vcr', '~> 3.0'
|
25
25
|
spec.add_development_dependency 'aruba'
|
26
|
-
spec.add_development_dependency 'pry'
|
27
|
-
|
28
|
-
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.
|
26
|
+
spec.add_development_dependency 'pry'
|
27
|
+
|
28
|
+
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
29
29
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
30
30
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
31
31
|
spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
|
32
32
|
spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
|
33
|
-
spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
|
33
|
+
spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
|
34
34
|
spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
|
35
35
|
spec.add_runtime_dependency 'signet', '~> 0.11.0'
|
36
36
|
spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -3199,6 +3199,7 @@ module Cnvrg
|
|
3199
3199
|
method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
|
3200
3200
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3201
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
|
+
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3202
3203
|
|
3203
3204
|
def exec(*cmd)
|
3204
3205
|
log = []
|
@@ -3222,6 +3223,7 @@ module Cnvrg
|
|
3222
3223
|
output_dir = options['output_dir'] || "output"
|
3223
3224
|
project_home = get_project_home
|
3224
3225
|
data_query = options["data_query"]
|
3226
|
+
docker_stats = options["docker_stats"]
|
3225
3227
|
@project = Project.new(project_home)
|
3226
3228
|
if @project.is_git
|
3227
3229
|
sync_before = false
|
@@ -3294,20 +3296,22 @@ module Cnvrg
|
|
3294
3296
|
stdout, stderr = '', ''
|
3295
3297
|
begin
|
3296
3298
|
process_running = true
|
3297
|
-
|
3298
|
-
|
3299
|
-
|
3300
|
-
|
3301
|
-
|
3302
|
-
|
3303
|
-
|
3304
|
-
|
3305
|
-
|
3299
|
+
if docker_stats
|
3300
|
+
stats_thread = Thread.new do
|
3301
|
+
while process_running do
|
3302
|
+
sleep 30
|
3303
|
+
begin
|
3304
|
+
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
|
3305
|
+
if is_on_gpu
|
3306
|
+
gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
|
3307
|
+
stats['gpu_util'] = gu[0]
|
3308
|
+
stats['gpu'] = gu[1]
|
3309
|
+
end
|
3310
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
3311
|
+
rescue => e
|
3312
|
+
log_error(e)
|
3313
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3306
3314
|
end
|
3307
|
-
@exp.send_machine_stats [stats] unless stats.empty?
|
3308
|
-
rescue => e
|
3309
|
-
log_error(e)
|
3310
|
-
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3311
3315
|
end
|
3312
3316
|
end
|
3313
3317
|
end
|
@@ -3405,7 +3409,7 @@ module Cnvrg
|
|
3405
3409
|
end
|
3406
3410
|
|
3407
3411
|
# log_thread.join
|
3408
|
-
stats_thread.join
|
3412
|
+
stats_thread.join if docker_stats
|
3409
3413
|
|
3410
3414
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
|
3411
3415
|
|
@@ -3425,7 +3429,7 @@ module Cnvrg
|
|
3425
3429
|
log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
|
3426
3430
|
if @exp
|
3427
3431
|
# log_thread.join
|
3428
|
-
Thread.kill(stats_thread)
|
3432
|
+
Thread.kill(stats_thread) if docker_stats
|
3429
3433
|
exit_status = $?.exitstatus
|
3430
3434
|
if exit_status.blank?
|
3431
3435
|
exit_status = "-1"
|
@@ -3449,7 +3453,7 @@ module Cnvrg
|
|
3449
3453
|
end_commit = @project.last_local_commit
|
3450
3454
|
process_running = false
|
3451
3455
|
# log_thread.join
|
3452
|
-
stats_thread.join
|
3456
|
+
stats_thread.join if docker_stats
|
3453
3457
|
|
3454
3458
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3455
3459
|
if container
|
@@ -4640,6 +4644,81 @@ module Cnvrg
|
|
4640
4644
|
end
|
4641
4645
|
end
|
4642
4646
|
|
4647
|
+
desc 'Collect and send job utilization', '', :hide => true
|
4648
|
+
method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
|
4649
|
+
method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
|
4650
|
+
method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
|
4651
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
|
+
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
|
+
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4654
|
+
|
4655
|
+
def collect_metrics
|
4656
|
+
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
|
+
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4658
|
+
|
4659
|
+
translate_result = Cnvrg::API_V2.request(
|
4660
|
+
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
4661
|
+
'GET',
|
4662
|
+
{ gpu: options[:gpu] }
|
4663
|
+
)
|
4664
|
+
|
4665
|
+
is_machine = options[:machine]
|
4666
|
+
while true do
|
4667
|
+
begin
|
4668
|
+
stats = {}
|
4669
|
+
translate_result.each do |query_name, metric|
|
4670
|
+
if is_machine
|
4671
|
+
metric_query = metric['machine_query'].presence || metric['query']
|
4672
|
+
query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
|
4673
|
+
else
|
4674
|
+
metric_query = metric['cluster_query'].presence || metric['query']
|
4675
|
+
pod_name = `hostname`.strip
|
4676
|
+
query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
|
4677
|
+
end
|
4678
|
+
if metric_query.blank? || query_content.blank?
|
4679
|
+
next
|
4680
|
+
end
|
4681
|
+
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
+
resp = Net::HTTP.get(uri)
|
4683
|
+
begin
|
4684
|
+
result = JSON.parse(resp)
|
4685
|
+
rescue JSON::ParserError => e
|
4686
|
+
log_error(e)
|
4687
|
+
next
|
4688
|
+
end
|
4689
|
+
data_result = result&.dig('data', 'result')
|
4690
|
+
next unless data_result
|
4691
|
+
|
4692
|
+
if data_result.size > 1
|
4693
|
+
stats[query_name] = {}
|
4694
|
+
data_result.each_with_index do |res, i|
|
4695
|
+
timestamp, value = res["value"]
|
4696
|
+
uuid = res["metric"]["UUID"].presence || i
|
4697
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
+
stats[query_name][uuid] = stat_value
|
4700
|
+
end
|
4701
|
+
else
|
4702
|
+
timestamp, value = data_result&.first&.dig('value')
|
4703
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4704
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4705
|
+
if query_name.include? 'block'
|
4706
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
|
+
io_type = query_name.split('_')[1]
|
4708
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4709
|
+
else
|
4710
|
+
stats[query_name] = stat_value
|
4711
|
+
end
|
4712
|
+
end
|
4713
|
+
end
|
4714
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
4715
|
+
rescue => e
|
4716
|
+
log_error(e)
|
4717
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
4718
|
+
end
|
4719
|
+
sleep options[:wait]
|
4720
|
+
end
|
4721
|
+
end
|
4643
4722
|
|
4644
4723
|
desc '', '', :hide => true
|
4645
4724
|
|
data/lib/cnvrg/experiment.rb
CHANGED
@@ -133,23 +133,30 @@ module Cnvrg
|
|
133
133
|
return response
|
134
134
|
end
|
135
135
|
def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
|
136
|
-
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
136
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
|
137
|
+
dataset_commit: data_commit, image_slug:image,
|
138
|
+
datasets: datasets,
|
139
|
+
commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
|
140
|
+
dataset_query: data_query })
|
141
141
|
return response
|
142
142
|
end
|
143
143
|
|
144
144
|
def upload_temp_log(temp_log)
|
145
|
-
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
|
146
|
-
|
147
|
-
Cnvrg::CLI.is_response_success(response,false)
|
145
|
+
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
|
146
|
+
exp_slug: @slug })
|
147
|
+
Cnvrg::CLI.is_response_success(response, false)
|
148
148
|
end
|
149
149
|
|
150
150
|
def send_machine_stats(stats)
|
151
|
-
response = Cnvrg::API.request(
|
152
|
-
|
151
|
+
response = Cnvrg::API.request(
|
152
|
+
@base_resource + "experiment/upload_stats",
|
153
|
+
"POST",
|
154
|
+
{
|
155
|
+
exp_slug: @slug,
|
156
|
+
stats: stats.map { |s| s.merge!({ time: Time.now }) }
|
157
|
+
}
|
158
|
+
)
|
159
|
+
Cnvrg::CLI.is_response_success(response, false)
|
153
160
|
end
|
154
161
|
|
155
162
|
def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
|
@@ -158,7 +165,7 @@ module Cnvrg
|
|
158
165
|
success = false
|
159
166
|
end_time ||= Time.now
|
160
167
|
while tries < 10 and success.blank?
|
161
|
-
sleep (tries*rand) ** 2 ### exponential backoff
|
168
|
+
sleep (tries * rand) ** 2 ### exponential backoff
|
162
169
|
## this call is super important so we cant let it crash.
|
163
170
|
|
164
171
|
tries += 1
|
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
|
|
201
201
|
pod_name = `hostname`.strip rescue nil
|
202
202
|
node_name = nil
|
203
203
|
if pod_name.present?
|
204
|
-
pod_describe = `kubectl
|
204
|
+
pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
|
205
205
|
pod_describe = JSON.parse(pod_describe) rescue {}
|
206
206
|
node_name = pod_describe["spec"]["nodeName"] rescue nil
|
207
207
|
end
|
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
|
|
217
217
|
|
218
218
|
def get_pod_events(pod_name)
|
219
219
|
return if pod_name.blank?
|
220
|
-
`kubectl get event --
|
220
|
+
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
221
221
|
end
|
222
222
|
|
223
223
|
def get_node_events(node_name)
|
data/lib/cnvrg/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.28
|
4
|
+
version: 1.11.29
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-03-
|
13
|
+
date: 2021-03-30 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: bundler
|
@@ -105,7 +105,7 @@ dependencies:
|
|
105
105
|
version: 0.3.1
|
106
106
|
- - ">="
|
107
107
|
- !ruby/object:Gem::Version
|
108
|
-
version: 0.3.
|
108
|
+
version: 0.3.7
|
109
109
|
type: :runtime
|
110
110
|
prerelease: false
|
111
111
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -115,7 +115,7 @@ dependencies:
|
|
115
115
|
version: 0.3.1
|
116
116
|
- - ">="
|
117
117
|
- !ruby/object:Gem::Version
|
118
|
-
version: 0.3.
|
118
|
+
version: 0.3.7
|
119
119
|
- !ruby/object:Gem::Dependency
|
120
120
|
name: faraday
|
121
121
|
requirement: !ruby/object:Gem::Requirement
|
@@ -394,6 +394,7 @@ executables:
|
|
394
394
|
extensions: []
|
395
395
|
extra_rdoc_files: []
|
396
396
|
files:
|
397
|
+
- Readme.md
|
397
398
|
- bin/cnvrg
|
398
399
|
- cnvrg.gemspec
|
399
400
|
- lib/cnvrg.rb
|