cnvrg 1.11.28 → 1.11.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Readme.md +17 -0
- data/cnvrg.gemspec +6 -6
- data/lib/cnvrg/cli.rb +95 -16
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/helpers/executer.rb +2 -2
- data/lib/cnvrg/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
|
|
4
|
+
data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
|
|
7
|
+
data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
|
data/Readme.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
|
|
2
|
+
## Version v1.11.15
|
|
3
|
+
2021-03-30
|
|
4
|
+
* DEV-208 - Task: Make sure the index name is constant over days
|
|
5
|
+
* DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from Template.
|
|
6
|
+
* DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
|
|
7
|
+
* DEV-7928 - Bug: CLI - cnvrg clone doesn't show log message when files are not found
|
|
8
|
+
* DEV-7956 - Bug: CLI crashes from progressbar
|
|
9
|
+
* DEV-8006 - Bug: CLI - cnvrg data put: a slash at the end of the URL path causes a unique index error
|
|
10
|
+
* DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load sts, therefore the clone crashed
|
|
11
|
+
* DEV-8159 - New Feature: Oauth Proxy
|
|
12
|
+
* DEV-8179 - New Feature: Add auto cache and link files in cache clone
|
|
13
|
+
* DEV-8208 - Bug: Cli - cnvrg data put fails
|
|
14
|
+
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
|
15
|
+
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
|
16
|
+
* DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
|
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
data/cnvrg.gemspec
CHANGED
|
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
|
14
14
|
|
|
15
15
|
#spec.files = `git ls-files`.split($/)
|
|
16
16
|
spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
|
|
17
|
-
spec.executables
|
|
18
|
-
spec.executables
|
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
18
|
+
spec.executables = ['cnvrg']
|
|
19
19
|
spec.require_paths = ['lib']
|
|
20
20
|
|
|
21
21
|
spec.add_development_dependency 'bundler'
|
|
@@ -23,14 +23,14 @@ Gem::Specification.new do |spec|
|
|
|
23
23
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
|
24
24
|
spec.add_development_dependency 'vcr', '~> 3.0'
|
|
25
25
|
spec.add_development_dependency 'aruba'
|
|
26
|
-
spec.add_development_dependency 'pry'
|
|
27
|
-
|
|
28
|
-
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.
|
|
26
|
+
spec.add_development_dependency 'pry'
|
|
27
|
+
|
|
28
|
+
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
|
29
29
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
|
30
30
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
|
31
31
|
spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
|
|
32
32
|
spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
|
|
33
|
-
spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
|
|
33
|
+
spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
|
|
34
34
|
spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
|
|
35
35
|
spec.add_runtime_dependency 'signet', '~> 0.11.0'
|
|
36
36
|
spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
|
data/lib/cnvrg/cli.rb
CHANGED
|
@@ -3199,6 +3199,7 @@ module Cnvrg
|
|
|
3199
3199
|
method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
|
|
3200
3200
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
|
3201
3201
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
|
3202
|
+
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
|
3202
3203
|
|
|
3203
3204
|
def exec(*cmd)
|
|
3204
3205
|
log = []
|
|
@@ -3222,6 +3223,7 @@ module Cnvrg
|
|
|
3222
3223
|
output_dir = options['output_dir'] || "output"
|
|
3223
3224
|
project_home = get_project_home
|
|
3224
3225
|
data_query = options["data_query"]
|
|
3226
|
+
docker_stats = options["docker_stats"]
|
|
3225
3227
|
@project = Project.new(project_home)
|
|
3226
3228
|
if @project.is_git
|
|
3227
3229
|
sync_before = false
|
|
@@ -3294,20 +3296,22 @@ module Cnvrg
|
|
|
3294
3296
|
stdout, stderr = '', ''
|
|
3295
3297
|
begin
|
|
3296
3298
|
process_running = true
|
|
3297
|
-
|
|
3298
|
-
|
|
3299
|
-
|
|
3300
|
-
|
|
3301
|
-
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
|
|
3305
|
-
|
|
3299
|
+
if docker_stats
|
|
3300
|
+
stats_thread = Thread.new do
|
|
3301
|
+
while process_running do
|
|
3302
|
+
sleep 30
|
|
3303
|
+
begin
|
|
3304
|
+
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
|
|
3305
|
+
if is_on_gpu
|
|
3306
|
+
gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
|
|
3307
|
+
stats['gpu_util'] = gu[0]
|
|
3308
|
+
stats['gpu'] = gu[1]
|
|
3309
|
+
end
|
|
3310
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
|
3311
|
+
rescue => e
|
|
3312
|
+
log_error(e)
|
|
3313
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
3306
3314
|
end
|
|
3307
|
-
@exp.send_machine_stats [stats] unless stats.empty?
|
|
3308
|
-
rescue => e
|
|
3309
|
-
log_error(e)
|
|
3310
|
-
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
3311
3315
|
end
|
|
3312
3316
|
end
|
|
3313
3317
|
end
|
|
@@ -3405,7 +3409,7 @@ module Cnvrg
|
|
|
3405
3409
|
end
|
|
3406
3410
|
|
|
3407
3411
|
# log_thread.join
|
|
3408
|
-
stats_thread.join
|
|
3412
|
+
stats_thread.join if docker_stats
|
|
3409
3413
|
|
|
3410
3414
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
|
|
3411
3415
|
|
|
@@ -3425,7 +3429,7 @@ module Cnvrg
|
|
|
3425
3429
|
log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
|
|
3426
3430
|
if @exp
|
|
3427
3431
|
# log_thread.join
|
|
3428
|
-
Thread.kill(stats_thread)
|
|
3432
|
+
Thread.kill(stats_thread) if docker_stats
|
|
3429
3433
|
exit_status = $?.exitstatus
|
|
3430
3434
|
if exit_status.blank?
|
|
3431
3435
|
exit_status = "-1"
|
|
@@ -3449,7 +3453,7 @@ module Cnvrg
|
|
|
3449
3453
|
end_commit = @project.last_local_commit
|
|
3450
3454
|
process_running = false
|
|
3451
3455
|
# log_thread.join
|
|
3452
|
-
stats_thread.join
|
|
3456
|
+
stats_thread.join if docker_stats
|
|
3453
3457
|
|
|
3454
3458
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
|
3455
3459
|
if container
|
|
@@ -4640,6 +4644,81 @@ module Cnvrg
|
|
|
4640
4644
|
end
|
|
4641
4645
|
end
|
|
4642
4646
|
|
|
4647
|
+
desc 'Collect and send job utilization', '', :hide => true
|
|
4648
|
+
method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
|
|
4649
|
+
method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
|
|
4650
|
+
method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
|
|
4651
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
|
4652
|
+
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
|
4653
|
+
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
|
4654
|
+
|
|
4655
|
+
def collect_metrics
|
|
4656
|
+
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
|
4657
|
+
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
|
4658
|
+
|
|
4659
|
+
translate_result = Cnvrg::API_V2.request(
|
|
4660
|
+
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
|
4661
|
+
'GET',
|
|
4662
|
+
{ gpu: options[:gpu] }
|
|
4663
|
+
)
|
|
4664
|
+
|
|
4665
|
+
is_machine = options[:machine]
|
|
4666
|
+
while true do
|
|
4667
|
+
begin
|
|
4668
|
+
stats = {}
|
|
4669
|
+
translate_result.each do |query_name, metric|
|
|
4670
|
+
if is_machine
|
|
4671
|
+
metric_query = metric['machine_query'].presence || metric['query']
|
|
4672
|
+
query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
|
|
4673
|
+
else
|
|
4674
|
+
metric_query = metric['cluster_query'].presence || metric['query']
|
|
4675
|
+
pod_name = `hostname`.strip
|
|
4676
|
+
query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
|
|
4677
|
+
end
|
|
4678
|
+
if metric_query.blank? || query_content.blank?
|
|
4679
|
+
next
|
|
4680
|
+
end
|
|
4681
|
+
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
|
4682
|
+
resp = Net::HTTP.get(uri)
|
|
4683
|
+
begin
|
|
4684
|
+
result = JSON.parse(resp)
|
|
4685
|
+
rescue JSON::ParserError => e
|
|
4686
|
+
log_error(e)
|
|
4687
|
+
next
|
|
4688
|
+
end
|
|
4689
|
+
data_result = result&.dig('data', 'result')
|
|
4690
|
+
next unless data_result
|
|
4691
|
+
|
|
4692
|
+
if data_result.size > 1
|
|
4693
|
+
stats[query_name] = {}
|
|
4694
|
+
data_result.each_with_index do |res, i|
|
|
4695
|
+
timestamp, value = res["value"]
|
|
4696
|
+
uuid = res["metric"]["UUID"].presence || i
|
|
4697
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
|
4698
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
|
4699
|
+
stats[query_name][uuid] = stat_value
|
|
4700
|
+
end
|
|
4701
|
+
else
|
|
4702
|
+
timestamp, value = data_result&.first&.dig('value')
|
|
4703
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
|
4704
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
|
4705
|
+
if query_name.include? 'block'
|
|
4706
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
|
4707
|
+
io_type = query_name.split('_')[1]
|
|
4708
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
|
4709
|
+
else
|
|
4710
|
+
stats[query_name] = stat_value
|
|
4711
|
+
end
|
|
4712
|
+
end
|
|
4713
|
+
end
|
|
4714
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
|
4715
|
+
rescue => e
|
|
4716
|
+
log_error(e)
|
|
4717
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
|
4718
|
+
end
|
|
4719
|
+
sleep options[:wait]
|
|
4720
|
+
end
|
|
4721
|
+
end
|
|
4643
4722
|
|
|
4644
4723
|
desc '', '', :hide => true
|
|
4645
4724
|
|
data/lib/cnvrg/experiment.rb
CHANGED
|
@@ -133,23 +133,30 @@ module Cnvrg
|
|
|
133
133
|
return response
|
|
134
134
|
end
|
|
135
135
|
def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
|
|
136
|
-
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
|
|
137
|
+
dataset_commit: data_commit, image_slug:image,
|
|
138
|
+
datasets: datasets,
|
|
139
|
+
commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
|
|
140
|
+
dataset_query: data_query })
|
|
141
141
|
return response
|
|
142
142
|
end
|
|
143
143
|
|
|
144
144
|
def upload_temp_log(temp_log)
|
|
145
|
-
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
|
|
146
|
-
|
|
147
|
-
Cnvrg::CLI.is_response_success(response,false)
|
|
145
|
+
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
|
|
146
|
+
exp_slug: @slug })
|
|
147
|
+
Cnvrg::CLI.is_response_success(response, false)
|
|
148
148
|
end
|
|
149
149
|
|
|
150
150
|
def send_machine_stats(stats)
|
|
151
|
-
response = Cnvrg::API.request(
|
|
152
|
-
|
|
151
|
+
response = Cnvrg::API.request(
|
|
152
|
+
@base_resource + "experiment/upload_stats",
|
|
153
|
+
"POST",
|
|
154
|
+
{
|
|
155
|
+
exp_slug: @slug,
|
|
156
|
+
stats: stats.map { |s| s.merge!({ time: Time.now }) }
|
|
157
|
+
}
|
|
158
|
+
)
|
|
159
|
+
Cnvrg::CLI.is_response_success(response, false)
|
|
153
160
|
end
|
|
154
161
|
|
|
155
162
|
def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
|
|
@@ -158,7 +165,7 @@ module Cnvrg
|
|
|
158
165
|
success = false
|
|
159
166
|
end_time ||= Time.now
|
|
160
167
|
while tries < 10 and success.blank?
|
|
161
|
-
sleep (tries*rand) ** 2 ### exponential backoff
|
|
168
|
+
sleep (tries * rand) ** 2 ### exponential backoff
|
|
162
169
|
## this call is super important so we cant let it crash.
|
|
163
170
|
|
|
164
171
|
tries += 1
|
|
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
|
|
|
201
201
|
pod_name = `hostname`.strip rescue nil
|
|
202
202
|
node_name = nil
|
|
203
203
|
if pod_name.present?
|
|
204
|
-
pod_describe = `kubectl
|
|
204
|
+
pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
|
|
205
205
|
pod_describe = JSON.parse(pod_describe) rescue {}
|
|
206
206
|
node_name = pod_describe["spec"]["nodeName"] rescue nil
|
|
207
207
|
end
|
|
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
|
|
|
217
217
|
|
|
218
218
|
def get_pod_events(pod_name)
|
|
219
219
|
return if pod_name.blank?
|
|
220
|
-
`kubectl get event --
|
|
220
|
+
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
|
221
221
|
end
|
|
222
222
|
|
|
223
223
|
def get_node_events(node_name)
|
data/lib/cnvrg/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cnvrg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.11.28
|
|
4
|
+
version: 1.11.29
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yochay Ettun
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: bin
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2021-03-
|
|
13
|
+
date: 2021-03-30 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: bundler
|
|
@@ -105,7 +105,7 @@ dependencies:
|
|
|
105
105
|
version: 0.3.1
|
|
106
106
|
- - ">="
|
|
107
107
|
- !ruby/object:Gem::Version
|
|
108
|
-
version: 0.3.
|
|
108
|
+
version: 0.3.7
|
|
109
109
|
type: :runtime
|
|
110
110
|
prerelease: false
|
|
111
111
|
version_requirements: !ruby/object:Gem::Requirement
|
|
@@ -115,7 +115,7 @@ dependencies:
|
|
|
115
115
|
version: 0.3.1
|
|
116
116
|
- - ">="
|
|
117
117
|
- !ruby/object:Gem::Version
|
|
118
|
-
version: 0.3.
|
|
118
|
+
version: 0.3.7
|
|
119
119
|
- !ruby/object:Gem::Dependency
|
|
120
120
|
name: faraday
|
|
121
121
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -394,6 +394,7 @@ executables:
|
|
|
394
394
|
extensions: []
|
|
395
395
|
extra_rdoc_files: []
|
|
396
396
|
files:
|
|
397
|
+
- Readme.md
|
|
397
398
|
- bin/cnvrg
|
|
398
399
|
- cnvrg.gemspec
|
|
399
400
|
- lib/cnvrg.rb
|