cnvrg 1.11.25 → 1.11.31
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +21 -0
- data/cnvrg.gemspec +6 -5
- data/lib/cnvrg/cli.rb +95 -17
- data/lib/cnvrg/datafiles.rb +56 -39
- data/lib/cnvrg/experiment.rb +18 -11
- data/lib/cnvrg/files.rb +0 -1
- data/lib/cnvrg/helpers.rb +1 -0
- data/lib/cnvrg/helpers/executer.rb +2 -2
- data/lib/cnvrg/project.rb +9 -8
- data/lib/cnvrg/version.rb +1 -1
- metadata +25 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a31d199ccd8981e455384ced40cc63a0a9a687982338cbc85e3a9413c2968a2c
|
4
|
+
data.tar.gz: 10d4da464f6ef7b507d563e2976cf17f5d6cd8bcfeab742c83ec5bfa1ab575e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2495cc0d77ada40ae2c6da54ada72710f28facba737b0a37c0dd2cb904d37d32dd23d2cba476fb3854ec47e19eaa11fb34913c2038d6ea059378ad0d460e905a
|
7
|
+
data.tar.gz: dbe50928a5d25259f78fc1dcdab474545b71308b7e1d798678fb2cd8786f0b0b2eda78062bcef6bcfdb16c9481bc47349c6f571de9cfe89f547c57f583bea403
|
data/Readme.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
## Version v1.11.15
|
3
|
+
2021-03-30
|
4
|
+
* DEV-208 - Task: Make sure the index name is constant over days
|
5
|
+
* DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from the Template.
|
6
|
+
* DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
|
7
|
+
* DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
|
8
|
+
* DEV-7956 - Bug: CLI crashes from progressbar
|
9
|
+
* DEV-8006 - Bug: CLI - cnvrg data put: a trailing slash at the end of the URL path causes a unique index error
|
10
|
+
* DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load sts, and therefore the clone crashed
|
11
|
+
* DEV-8159 - New Feature: Oauth Proxy
|
12
|
+
* DEV-8179 - New Feature: Add auto cache and link files in cache clone
|
13
|
+
* DEV-8208 - Bug: Cli - cnvrg data put fails
|
14
|
+
* DEV-8284 - Improvement: Use server instead of docker for agent communication
|
15
|
+
* DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
|
16
|
+
* DEV-8539 - Bug: SDK - on Windows: e.sync doesn't perform sync
|
17
|
+
* DEV-8621 - Improvement: Add more metrics
|
18
|
+
## Version v1.11.30
|
19
|
+
2021-04-06
|
20
|
+
## Version v1.11.31
|
21
|
+
2021-04-22
|
data/cnvrg.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'cnvrg/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = 'cnvrg'
|
8
8
|
spec.version = Cnvrg::VERSION
|
9
|
-
spec.authors = ['Yochay Ettun', 'Leah Kolben'
|
9
|
+
spec.authors = ['Yochay Ettun', 'Leah Kolben']
|
10
10
|
spec.email = ['info@cnvrg.io']
|
11
11
|
spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
|
12
12
|
spec.description = %q{A CLI tool for interacting with cnvrg.io.}
|
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
|
|
14
14
|
|
15
15
|
#spec.files = `git ls-files`.split($/)
|
16
16
|
spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
|
17
|
-
spec.executables
|
18
|
-
spec.executables
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.executables = ['cnvrg']
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
21
|
spec.add_development_dependency 'bundler'
|
@@ -25,12 +25,13 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_development_dependency 'aruba'
|
26
26
|
spec.add_development_dependency 'pry'
|
27
27
|
|
28
|
-
spec.add_runtime_dependency '
|
28
|
+
spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
|
29
|
+
spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
|
29
30
|
spec.add_runtime_dependency 'faraday', '~> 0.15.2'
|
30
31
|
spec.add_runtime_dependency 'netrc', '~> 0.11.0'
|
31
32
|
spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
|
32
33
|
spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
|
33
|
-
spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
|
34
|
+
spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
|
34
35
|
spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
|
35
36
|
spec.add_runtime_dependency 'signet', '~> 0.11.0'
|
36
37
|
spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -2320,7 +2320,6 @@ module Cnvrg
|
|
2320
2320
|
@project = Project.new(get_project_home)
|
2321
2321
|
chunk_size = chunk_size ? chunk_size : options["chunk_size"]
|
2322
2322
|
|
2323
|
-
|
2324
2323
|
# Enable local/experiment exception logging
|
2325
2324
|
suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
|
2326
2325
|
if in_exp
|
@@ -3200,6 +3199,7 @@ module Cnvrg
|
|
3200
3199
|
method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
|
3201
3200
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3202
3201
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
|
+
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3203
3203
|
|
3204
3204
|
def exec(*cmd)
|
3205
3205
|
log = []
|
@@ -3223,6 +3223,7 @@ module Cnvrg
|
|
3223
3223
|
output_dir = options['output_dir'] || "output"
|
3224
3224
|
project_home = get_project_home
|
3225
3225
|
data_query = options["data_query"]
|
3226
|
+
docker_stats = options["docker_stats"]
|
3226
3227
|
@project = Project.new(project_home)
|
3227
3228
|
if @project.is_git
|
3228
3229
|
sync_before = false
|
@@ -3295,20 +3296,22 @@ module Cnvrg
|
|
3295
3296
|
stdout, stderr = '', ''
|
3296
3297
|
begin
|
3297
3298
|
process_running = true
|
3298
|
-
|
3299
|
-
|
3300
|
-
|
3301
|
-
|
3302
|
-
|
3303
|
-
|
3304
|
-
|
3305
|
-
|
3306
|
-
|
3299
|
+
if docker_stats
|
3300
|
+
stats_thread = Thread.new do
|
3301
|
+
while process_running do
|
3302
|
+
sleep 30
|
3303
|
+
begin
|
3304
|
+
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
|
3305
|
+
if is_on_gpu
|
3306
|
+
gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
|
3307
|
+
stats['gpu_util'] = gu[0]
|
3308
|
+
stats['gpu'] = gu[1]
|
3309
|
+
end
|
3310
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
3311
|
+
rescue => e
|
3312
|
+
log_error(e)
|
3313
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3307
3314
|
end
|
3308
|
-
@exp.send_machine_stats [stats] unless stats.empty?
|
3309
|
-
rescue => e
|
3310
|
-
log_error(e)
|
3311
|
-
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3312
3315
|
end
|
3313
3316
|
end
|
3314
3317
|
end
|
@@ -3406,7 +3409,7 @@ module Cnvrg
|
|
3406
3409
|
end
|
3407
3410
|
|
3408
3411
|
# log_thread.join
|
3409
|
-
stats_thread.join
|
3412
|
+
stats_thread.join if docker_stats
|
3410
3413
|
|
3411
3414
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
|
3412
3415
|
|
@@ -3426,7 +3429,7 @@ module Cnvrg
|
|
3426
3429
|
log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
|
3427
3430
|
if @exp
|
3428
3431
|
# log_thread.join
|
3429
|
-
Thread.kill(stats_thread)
|
3432
|
+
Thread.kill(stats_thread) if docker_stats
|
3430
3433
|
exit_status = $?.exitstatus
|
3431
3434
|
if exit_status.blank?
|
3432
3435
|
exit_status = "-1"
|
@@ -3450,7 +3453,7 @@ module Cnvrg
|
|
3450
3453
|
end_commit = @project.last_local_commit
|
3451
3454
|
process_running = false
|
3452
3455
|
# log_thread.join
|
3453
|
-
stats_thread.join
|
3456
|
+
stats_thread.join if docker_stats
|
3454
3457
|
|
3455
3458
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3456
3459
|
if container
|
@@ -4641,6 +4644,81 @@ module Cnvrg
|
|
4641
4644
|
end
|
4642
4645
|
end
|
4643
4646
|
|
4647
|
+
desc 'Collect and send job utilization', '', :hide => true
|
4648
|
+
method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
|
4649
|
+
method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machie activity node name"
|
4650
|
+
method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
|
4651
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
|
+
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
|
+
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4654
|
+
|
4655
|
+
def collect_metrics
|
4656
|
+
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
|
+
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4658
|
+
|
4659
|
+
translate_result = Cnvrg::API_V2.request(
|
4660
|
+
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
4661
|
+
'GET',
|
4662
|
+
{ gpu: options[:gpu], gaudi: options[:gaudi] }
|
4663
|
+
)
|
4664
|
+
|
4665
|
+
is_machine = options[:machine]
|
4666
|
+
while true do
|
4667
|
+
begin
|
4668
|
+
stats = {}
|
4669
|
+
translate_result.each do |query_name, metric|
|
4670
|
+
if is_machine
|
4671
|
+
metric_query = metric['machine_query'].presence || metric['query']
|
4672
|
+
query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
|
4673
|
+
else
|
4674
|
+
metric_query = metric['cluster_query'].presence || metric['query']
|
4675
|
+
pod_name = `hostname`.strip
|
4676
|
+
query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
|
4677
|
+
end
|
4678
|
+
if metric_query.blank? || query_content.blank?
|
4679
|
+
next
|
4680
|
+
end
|
4681
|
+
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
+
resp = Net::HTTP.get(uri)
|
4683
|
+
begin
|
4684
|
+
result = JSON.parse(resp)
|
4685
|
+
rescue JSON::ParserError => e
|
4686
|
+
log_error(e)
|
4687
|
+
next
|
4688
|
+
end
|
4689
|
+
data_result = result&.dig('data', 'result')
|
4690
|
+
next unless data_result
|
4691
|
+
|
4692
|
+
if data_result.size > 1
|
4693
|
+
stats[query_name] = {}
|
4694
|
+
data_result.each_with_index do |res, i|
|
4695
|
+
timestamp, value = res["value"]
|
4696
|
+
uuid = res["metric"]["UUID"].presence || i
|
4697
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
+
stats[query_name][uuid] = stat_value
|
4700
|
+
end
|
4701
|
+
else
|
4702
|
+
timestamp, value = data_result&.first&.dig('value')
|
4703
|
+
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4704
|
+
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4705
|
+
if query_name.include? 'block'
|
4706
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
|
+
io_type = query_name.split('_')[1]
|
4708
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4709
|
+
else
|
4710
|
+
stats[query_name] = stat_value
|
4711
|
+
end
|
4712
|
+
end
|
4713
|
+
end
|
4714
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
4715
|
+
rescue => e
|
4716
|
+
log_error(e)
|
4717
|
+
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
4718
|
+
end
|
4719
|
+
sleep options[:wait]
|
4720
|
+
end
|
4721
|
+
end
|
4644
4722
|
|
4645
4723
|
desc '', '', :hide => true
|
4646
4724
|
|
data/lib/cnvrg/datafiles.rb
CHANGED
@@ -345,7 +345,8 @@ module Cnvrg
|
|
345
345
|
cli = CLI.new
|
346
346
|
cli.log_message("Using #{threads} threads with chunk size of #{chunk_size}.", Thor::Shell::Color::GREEN)
|
347
347
|
|
348
|
-
|
348
|
+
num_files = files.size
|
349
|
+
progressbar = create_progressbar("Upload Progress", num_files)
|
349
350
|
cli = CLI.new
|
350
351
|
|
351
352
|
# Vars to handle the parallelism
|
@@ -355,6 +356,7 @@ module Cnvrg
|
|
355
356
|
dirs_queue = Queue.new
|
356
357
|
worker_threads = []
|
357
358
|
progress_threads = []
|
359
|
+
old_api = false
|
358
360
|
|
359
361
|
# Vars to keep track of uploaded files and directories
|
360
362
|
uploaded_files = []
|
@@ -382,23 +384,30 @@ module Cnvrg
|
|
382
384
|
dir_thread = Thread.new do
|
383
385
|
dirs_to_create = []
|
384
386
|
loop do
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
387
|
+
dir = dirs_queue.deq(non_block: true) rescue nil
|
388
|
+
if dir.nil? && !progressbar.finished?
|
389
|
+
sleep 0.2
|
390
|
+
Cnvrg::Logger.info("directories thread status: progressbar.finished? #{progressbar.finished?} || dirs_queue.empty? #{dirs_queue.empty?} #{dirs_queue.size} || dirs_to_create.empty? #{dirs_to_create.empty?} #{dirs_to_create.size}")
|
391
|
+
else
|
392
|
+
dirs_to_create << dir
|
393
|
+
|
394
|
+
if dirs_to_create.size >= 1000 || progressbar.finished?
|
395
|
+
resp = Cnvrg::API.request(@base_resource + "create_dirs", "POST", { dirs: dirs_to_create, commit_sha1: commit_sha1 })
|
396
|
+
Cnvrg::Logger.info("uploaded directories chunk, finished with #{resp}")
|
397
|
+
if resp == false # if resp is false it means 404 which is old server
|
398
|
+
old_api = true
|
399
|
+
break
|
400
|
+
end
|
401
|
+
unless Cnvrg::CLI.is_response_success(resp, false)
|
402
|
+
dirs_to_create = []
|
403
|
+
time = Time.current
|
404
|
+
Cnvrg::Logger.log_error_message("Failed to create dirs: #{time}, #{resp.try(:fetch, "message")}")
|
405
|
+
next
|
406
|
+
end
|
396
407
|
dirs_to_create = []
|
397
|
-
next
|
398
408
|
end
|
399
|
-
|
409
|
+
break if progressbar.finished? && dirs_queue.empty? && dirs_to_create.empty?
|
400
410
|
end
|
401
|
-
break if progressbar.finished? && dirs_queue.empty? && dirs_to_create.empty?
|
402
411
|
end
|
403
412
|
end
|
404
413
|
|
@@ -409,7 +418,6 @@ module Cnvrg
|
|
409
418
|
file = progress_queue.deq(non_block: true) rescue nil # to prevent deadlocks
|
410
419
|
unless file.nil?
|
411
420
|
blob_ids = []
|
412
|
-
|
413
421
|
progress_mutex.synchronize {
|
414
422
|
progressbar.progress += 1
|
415
423
|
uploaded_files.append(file) if file[:success]
|
@@ -421,32 +429,31 @@ module Cnvrg
|
|
421
429
|
}
|
422
430
|
|
423
431
|
if blob_ids.present?
|
432
|
+
random_id = (0...10).map { ('a'..'z').to_a[rand(26)] }.join
|
424
433
|
refresh_storage_token
|
425
|
-
Cnvrg::Logger.info("Finished
|
426
|
-
|
434
|
+
Cnvrg::Logger.info("chunk #{random_id}: Finished uploading chunk of #{chunk_size} files, Sending Upload files save")
|
427
435
|
retry_count = 0
|
428
436
|
loop do
|
429
437
|
upload_resp = Cnvrg::API.request(@base_resource + "upload_files_save", "POST", {commit: commit_sha1, blob_ids: blob_ids})
|
430
438
|
|
431
439
|
if not (Cnvrg::CLI.is_response_success(upload_resp, false))
|
432
440
|
retry_count += 1
|
433
|
-
Cnvrg::Logger.log_error_message("Failed request save files: #{Time.current}, retry: #{retry_count}")
|
434
|
-
Cnvrg::Logger.info("Got an error message from server, #{upload_resp.try(:fetch, "message")}")
|
441
|
+
Cnvrg::Logger.log_error_message("chunk #{random_id}: Failed request save files: #{Time.current}, retry: #{retry_count}")
|
435
442
|
if retry_count > 20
|
436
|
-
puts "Failed to save files: #{Time.current}, trying next chunk"
|
443
|
+
puts "chunk #{random_id}: Failed to save files: #{Time.current}, trying next chunk"
|
437
444
|
break
|
438
445
|
end
|
439
446
|
sleep 5
|
440
447
|
next
|
441
448
|
end
|
442
|
-
Cnvrg::Logger.info("Chunk saved on server")
|
449
|
+
Cnvrg::Logger.info("chunk #{random_id}: Chunk saved on server")
|
443
450
|
break
|
444
451
|
end
|
445
452
|
end
|
446
453
|
else
|
447
454
|
sleep(0.1)
|
448
455
|
end
|
449
|
-
|
456
|
+
Cnvrg::Logger.info("progress_threads: progressbar.finished? #{progressbar.finished?}")
|
450
457
|
if progressbar.finished?
|
451
458
|
Cnvrg::Logger.info("Progress bar finished closing queues")
|
452
459
|
file_queue.close
|
@@ -459,35 +466,43 @@ module Cnvrg
|
|
459
466
|
|
460
467
|
file_chunks = files.each_slice(chunk_size).to_a
|
461
468
|
# Fetch the required files from the server:
|
469
|
+
num_chunks = (num_files / 1000.0).ceil
|
470
|
+
chunk_index = 0
|
462
471
|
Parallel.map((file_chunks), in_threads: threads) do |chunk|
|
463
|
-
|
464
|
-
|
472
|
+
chunk_index += 1
|
473
|
+
self_chunk_index = chunk_index
|
474
|
+
files_chunk = chunk.map { |p| p.gsub(/^\.\//, '') }
|
475
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Generating chunk idx")
|
465
476
|
tree = @dataset.generate_chunked_idx(files_chunk, prefix: prefix, threads: threads, cli: cli)
|
477
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Finished Generating chunk idx")
|
466
478
|
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
if new_dirs.blank?
|
472
|
-
## we need to send 1 file so we will inflated dirs from in case when we dont have folders in the tree
|
473
|
-
file = tree.keys.find { |k| tree[k] != nil }
|
474
|
-
dirs_queue.push file
|
479
|
+
# Handle directories:
|
480
|
+
unless old_api
|
481
|
+
while dirs_queue.size > 5000
|
482
|
+
sleep(0.1)
|
475
483
|
end
|
484
|
+
end
|
485
|
+
new_dirs = tree.keys.select { |k| tree[k].nil? }
|
486
|
+
if new_dirs.blank?
|
487
|
+
## we need to send 1 file so we will inflated dirs from in case when we dont have folders in the tree
|
488
|
+
file = tree.keys.find { |k| tree[k] != nil }
|
489
|
+
dirs_queue.push(file) unless old_api
|
490
|
+
end
|
491
|
+
new_dirs.each { |dir| dirs_queue.push dir }
|
476
492
|
|
477
|
-
|
478
|
-
}
|
479
|
-
Cnvrg::Logger.info("Getting files info from server")
|
480
|
-
|
493
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Getting files info from server")
|
481
494
|
results = request_upload_files(commit_sha1, tree, override, new_branch, partial_commit)
|
495
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Finished Getting files info from server")
|
482
496
|
next unless results
|
483
497
|
|
484
498
|
if results['files'].blank?
|
499
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: no files to upload skipping chunk")
|
485
500
|
progress_mutex.synchronize { progressbar.progress += tree.keys.length }
|
486
501
|
next
|
487
502
|
end
|
488
503
|
|
489
504
|
files_to_upload = results['files']
|
490
|
-
|
505
|
+
Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: number of files to upload in this chunk: #{tree.keys.length - files_to_upload.length}")
|
491
506
|
progress_mutex.synchronize {
|
492
507
|
progressbar.progress += tree.keys.length - files_to_upload.length
|
493
508
|
}
|
@@ -500,10 +515,12 @@ module Cnvrg
|
|
500
515
|
end
|
501
516
|
end
|
502
517
|
|
503
|
-
Cnvrg::Logger.info("Waiting
|
518
|
+
Cnvrg::Logger.info("Waiting dir_thread to finish")
|
504
519
|
dir_thread.join
|
505
520
|
dirs_queue.close
|
521
|
+
Cnvrg::Logger.info("Waiting progress_thread to finish")
|
506
522
|
progress_threads.each(&:join)
|
523
|
+
Cnvrg::Logger.info("Waiting workers to finish")
|
507
524
|
worker_threads.each(&:join)
|
508
525
|
Thread.report_on_exception = true
|
509
526
|
rescue => e
|
data/lib/cnvrg/experiment.rb
CHANGED
@@ -133,23 +133,30 @@ module Cnvrg
|
|
133
133
|
return response
|
134
134
|
end
|
135
135
|
def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
|
136
|
-
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
136
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
|
137
|
+
dataset_commit: data_commit, image_slug:image,
|
138
|
+
datasets: datasets,
|
139
|
+
commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
|
140
|
+
dataset_query: data_query })
|
141
141
|
return response
|
142
142
|
end
|
143
143
|
|
144
144
|
def upload_temp_log(temp_log)
|
145
|
-
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
|
146
|
-
|
147
|
-
Cnvrg::CLI.is_response_success(response,false)
|
145
|
+
response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
|
146
|
+
exp_slug: @slug })
|
147
|
+
Cnvrg::CLI.is_response_success(response, false)
|
148
148
|
end
|
149
149
|
|
150
150
|
def send_machine_stats(stats)
|
151
|
-
response = Cnvrg::API.request(
|
152
|
-
|
151
|
+
response = Cnvrg::API.request(
|
152
|
+
@base_resource + "experiment/upload_stats",
|
153
|
+
"POST",
|
154
|
+
{
|
155
|
+
exp_slug: @slug,
|
156
|
+
stats: stats.map { |s| s.merge!({ time: Time.now }) }
|
157
|
+
}
|
158
|
+
)
|
159
|
+
Cnvrg::CLI.is_response_success(response, false)
|
153
160
|
end
|
154
161
|
|
155
162
|
def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
|
@@ -158,7 +165,7 @@ module Cnvrg
|
|
158
165
|
success = false
|
159
166
|
end_time ||= Time.now
|
160
167
|
while tries < 10 and success.blank?
|
161
|
-
sleep (tries*rand) ** 2 ### exponential backoff
|
168
|
+
sleep (tries * rand) ** 2 ### exponential backoff
|
162
169
|
## this call is super important so we cant let it crash.
|
163
170
|
|
164
171
|
tries += 1
|
data/lib/cnvrg/files.rb
CHANGED
data/lib/cnvrg/helpers.rb
CHANGED
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
|
|
201
201
|
pod_name = `hostname`.strip rescue nil
|
202
202
|
node_name = nil
|
203
203
|
if pod_name.present?
|
204
|
-
pod_describe = `kubectl
|
204
|
+
pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
|
205
205
|
pod_describe = JSON.parse(pod_describe) rescue {}
|
206
206
|
node_name = pod_describe["spec"]["nodeName"] rescue nil
|
207
207
|
end
|
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
|
|
217
217
|
|
218
218
|
def get_pod_events(pod_name)
|
219
219
|
return if pod_name.blank?
|
220
|
-
`kubectl get event --
|
220
|
+
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
221
221
|
end
|
222
222
|
|
223
223
|
def get_node_events(node_name)
|
data/lib/cnvrg/project.rb
CHANGED
@@ -381,7 +381,7 @@ module Cnvrg
|
|
381
381
|
def generate_output_dir(output_dir)
|
382
382
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
383
|
upload_list = []
|
384
|
-
list = Dir.glob("
|
384
|
+
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
385
385
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
386
|
next if e.end_with? "/."
|
387
387
|
if File.directory? e
|
@@ -517,17 +517,17 @@ module Cnvrg
|
|
517
517
|
commit = local_idx[:commit]
|
518
518
|
tree = local_idx[:tree]
|
519
519
|
ignore_list = self.send_ignore_list()
|
520
|
-
if force
|
520
|
+
if force or specific_files.present?
|
521
521
|
added = []
|
522
522
|
if tree.present?
|
523
523
|
added += local_idx[:tree].keys
|
524
524
|
end
|
525
|
-
response = {"result" => {"commit" => nil, "tree" => {"added" => added,
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
525
|
+
response = { "result" => { "commit" => nil, "tree" => { "added" => added,
|
526
|
+
"updated_on_server" => [],
|
527
|
+
"updated_on_local" => [],
|
528
|
+
"update_local" => [],
|
529
|
+
"deleted" => [],
|
530
|
+
"conflicts" => [] } } }
|
531
531
|
return response
|
532
532
|
end
|
533
533
|
#we dont want to send it on download - we only compare between commits sha1 in download.
|
@@ -535,6 +535,7 @@ module Cnvrg
|
|
535
535
|
#the new server doesnt need the tree, but the old probably needs :X
|
536
536
|
local_idx[:tree] = {} if Cnvrg::Helpers.server_version > 0
|
537
537
|
end
|
538
|
+
|
538
539
|
response = Cnvrg::API.request(@base_resource + "status", 'POST', {idx: local_idx, new_branch: new_branch,
|
539
540
|
current_commit: commit, ignore: ignore_list, force: force, in_exp: in_exp, download: download})
|
540
541
|
|
data/lib/cnvrg/version.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.
|
4
|
+
version: 1.11.31
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
|
-
- Omer Shacham
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date: 2021-
|
12
|
+
date: 2021-05-04 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
15
|
name: bundler
|
@@ -96,6 +95,26 @@ dependencies:
|
|
96
95
|
- - ">="
|
97
96
|
- !ruby/object:Gem::Version
|
98
97
|
version: '0'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: ffi
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '1.9'
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: 1.9.10
|
108
|
+
type: :runtime
|
109
|
+
prerelease: false
|
110
|
+
version_requirements: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - "~>"
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '1.9'
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 1.9.10
|
99
118
|
- !ruby/object:Gem::Dependency
|
100
119
|
name: mimemagic
|
101
120
|
requirement: !ruby/object:Gem::Requirement
|
@@ -105,7 +124,7 @@ dependencies:
|
|
105
124
|
version: 0.3.1
|
106
125
|
- - ">="
|
107
126
|
- !ruby/object:Gem::Version
|
108
|
-
version: 0.3.
|
127
|
+
version: 0.3.7
|
109
128
|
type: :runtime
|
110
129
|
prerelease: false
|
111
130
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -115,7 +134,7 @@ dependencies:
|
|
115
134
|
version: 0.3.1
|
116
135
|
- - ">="
|
117
136
|
- !ruby/object:Gem::Version
|
118
|
-
version: 0.3.
|
137
|
+
version: 0.3.7
|
119
138
|
- !ruby/object:Gem::Dependency
|
120
139
|
name: faraday
|
121
140
|
requirement: !ruby/object:Gem::Requirement
|
@@ -394,6 +413,7 @@ executables:
|
|
394
413
|
extensions: []
|
395
414
|
extra_rdoc_files: []
|
396
415
|
files:
|
416
|
+
- Readme.md
|
397
417
|
- bin/cnvrg
|
398
418
|
- cnvrg.gemspec
|
399
419
|
- lib/cnvrg.rb
|