cnvrg 1.11.24 → 1.11.29

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e8a5d09f38ee4ce25caecdaf73b77e37fadff3b745454030cb15863210c1b319
4
- data.tar.gz: 1f3e27bc4fa778ff048e7257f2ce753009ceaf0801c4ca3f4facfdc3011ad940
3
+ metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
4
+ data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
5
5
  SHA512:
6
- metadata.gz: 14d79f14723eff1d8df81907183fabbe97270fd49610d4aa456fa820e4c19994c3d2bd1b7c64160f4d66a0636ff385a32b86f6fed2bedf9c0ed98c7968267c23
7
- data.tar.gz: 80b4644fc6f560e54280ab726983f4e5d89978255991b29c510452b5514019025a3dcae45e4ad5f9a36e97823909eb6d1f2ee892cc63ef5090fa78ecacf595fe
6
+ metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
7
+ data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
data/Readme.md ADDED
@@ -0,0 +1,17 @@
1
+
2
+ ## Version v1.11.15
3
+ 2021-03-30
4
+ * DEV-208 - Task: Make sure the index name is constant over days
5
+ * DEV-7555 - Bug: CLI: Error message is not correct when running a flow after removing the permission from a Template.
6
+ * DEV-7800 - New Feature: FR - add stdout to CLI logs (for logging in kibana etc.)
7
+ * DEV-7928 - Bug: CLI - cnvrg clone doesn't show a log message when files are not found
8
+ * DEV-7956 - Bug: CLI crashes from progressbar
9
+ * DEV-8006 - Bug: CLI - cnvrg data put: a slash at the end of the url path will cause a unique index error
10
+ * DEV-8007 - Bug: CLI - cnvrg data clone sometimes failed to load sts, therefore clone crashed
11
+ * DEV-8159 - New Feature: Oauth Proxy
12
+ * DEV-8179 - New Feature: Add auto cache and link files in cache clone
13
+ * DEV-8208 - Bug: Cli - cnvrg data put fails
14
+ * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
+ * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
+ * DEV-8539 - Bug: SDK - in windows: e.sync doesn't perform sync
17
+ * DEV-8621 - Improvement: Add more metrics
data/cnvrg.gemspec CHANGED
@@ -14,8 +14,8 @@ Gem::Specification.new do |spec|
14
14
 
15
15
  #spec.files = `git ls-files`.split($/)
16
16
  spec.files = %w[cnvrg.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.executables = ['cnvrg']
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.executables = ['cnvrg']
19
19
  spec.require_paths = ['lib']
20
20
 
21
21
  spec.add_development_dependency 'bundler'
@@ -25,12 +25,12 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency 'aruba'
26
26
  spec.add_development_dependency 'pry'
27
27
 
28
- spec.add_runtime_dependency 'mimemagic', '~> 0.3.1','>=0.3.2'
28
+ spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
29
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
30
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
- spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
33
+ spec.add_runtime_dependency 'thor', '~> 0.19.0', '>=0.19.1'
34
34
  spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
data/lib/cnvrg/cli.rb CHANGED
@@ -858,7 +858,7 @@ module Cnvrg
858
858
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
859
859
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
860
860
  method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
861
- def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false, threads: 15)
861
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false, threads: 15, cache_link: false)
862
862
  begin
863
863
  verify_logged_in(false)
864
864
  log_start(__method__, args, options)
@@ -904,7 +904,7 @@ module Cnvrg
904
904
 
905
905
  commit = response["result"]["commit"]
906
906
  files_count = response["result"]["file_count"]
907
- files = @files.get_clone_chunk(commit: commit)
907
+ files = @files.get_clone_chunk(commit: commit, cache_link: cache_link)
908
908
  downloaded_files = 0
909
909
  progressbar = ProgressBar.create(:title => "Download Progress",
910
910
  :progress_mark => '=',
@@ -917,7 +917,7 @@ module Cnvrg
917
917
 
918
918
  while files['keys'].length > 0
919
919
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
920
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten, threads: threads)
920
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten, threads: threads, cache_link: cache_link)
921
921
 
922
922
  downloaded_files += files['keys'].length
923
923
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -1201,11 +1201,13 @@ module Cnvrg
1201
1201
  end
1202
1202
 
1203
1203
  desc '', '', :hide => true
1204
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, override: false, threads: 15, message: nil)
1204
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, override: false, threads: 15, message: nil, auto_cache: false, external_disk: nil)
1205
1205
  begin
1206
1206
  verify_logged_in(false)
1207
1207
  log_start(__method__, args, options)
1208
-
1208
+ if auto_cache && external_disk.blank?
1209
+ raise SignalException.new(1, "for auto caching external disk is required")
1210
+ end
1209
1211
  owner, slug = get_owner_slug(dataset_url)
1210
1212
  @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1211
1213
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
@@ -1228,7 +1230,7 @@ module Cnvrg
1228
1230
  Cnvrg::Logger.info("Put files in latest commit")
1229
1231
  response = @datafiles.last_valid_commit()
1230
1232
  unless response #means we failed in the start commit.
1231
- raise SignalException.new(1, "Cant put files into commit:#{commit}, check the dataset id and commitc")
1233
+ raise SignalException.new(1, "Cant put files into commit:#{commit}, check the dataset id and commit")
1232
1234
  end
1233
1235
  @commit = response['result']['sha1']
1234
1236
  else
@@ -1254,7 +1256,7 @@ module Cnvrg
1254
1256
  raise SignalException.new(1, res.msg)
1255
1257
  end
1256
1258
  Cnvrg::Logger.info("Saving commit on server")
1257
- res = @datafiles.end_commit(@commit,force, success: true, commit_type: "put")
1259
+ res = @datafiles.end_commit(@commit,force, success: true, commit_type: "put", auto_cache: auto_cache, external_disk: external_disk)
1258
1260
  msg = res['result']
1259
1261
  response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1260
1262
  unless response.is_success?
@@ -1262,19 +1264,25 @@ module Cnvrg
1262
1264
  end
1263
1265
 
1264
1266
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1267
+ if msg['cache_error'].present?
1268
+ log_message("Couldn't cache commit: #{msg['cache_error']}", Thor::Shell::Color::YELLOW)
1269
+ end
1265
1270
  rescue SignalException => e
1266
1271
  log_message(e.message, Thor::Shell::Color::RED)
1267
1272
  return false
1268
1273
  end
1269
1274
  end
1270
1275
 
1271
-
1272
1276
  desc '', '', :hide => true
1273
- def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1277
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil, auto_cache: false, external_disk: nil)
1274
1278
  begin
1275
1279
  verify_logged_in(false)
1276
1280
  log_start(__method__, args, options)
1277
1281
 
1282
+ if auto_cache && external_disk.blank?
1283
+ raise SignalException.new(1, "for auto caching external disk is required")
1284
+ end
1285
+
1278
1286
  owner, slug = get_owner_slug(dataset_url)
1279
1287
  @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1280
1288
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
@@ -1310,7 +1318,7 @@ module Cnvrg
1310
1318
  offset += chunk_size
1311
1319
  end
1312
1320
 
1313
- res = @datafiles.end_commit(@commit,false, success: true)
1321
+ res = @datafiles.end_commit(@commit,false, success: true, auto_cache: auto_cache, external_disk: external_disk)
1314
1322
  msg = res['result']
1315
1323
  response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1316
1324
  unless response.is_success?
@@ -1318,6 +1326,9 @@ module Cnvrg
1318
1326
  end
1319
1327
 
1320
1328
  log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1329
+ if msg['cache_error'].present?
1330
+ log_message("Couldn't cache commit: #{msg['cache_error']}", Thor::Shell::Color::YELLOW)
1331
+ end
1321
1332
  rescue SignalException => e
1322
1333
  log_message(e.message, Thor::Shell::Color::RED)
1323
1334
  return false
@@ -2309,7 +2320,6 @@ module Cnvrg
2309
2320
  @project = Project.new(get_project_home)
2310
2321
  chunk_size = chunk_size ? chunk_size : options["chunk_size"]
2311
2322
 
2312
-
2313
2323
  # Enable local/experiment exception logging
2314
2324
  suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2315
2325
  if in_exp
@@ -3189,6 +3199,7 @@ module Cnvrg
3189
3199
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3190
3200
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3191
3201
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3202
+ method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3192
3203
 
3193
3204
  def exec(*cmd)
3194
3205
  log = []
@@ -3212,6 +3223,7 @@ module Cnvrg
3212
3223
  output_dir = options['output_dir'] || "output"
3213
3224
  project_home = get_project_home
3214
3225
  data_query = options["data_query"]
3226
+ docker_stats = options["docker_stats"]
3215
3227
  @project = Project.new(project_home)
3216
3228
  if @project.is_git
3217
3229
  sync_before = false
@@ -3284,20 +3296,22 @@ module Cnvrg
3284
3296
  stdout, stderr = '', ''
3285
3297
  begin
3286
3298
  process_running = true
3287
- stats_thread = Thread.new do
3288
- while process_running do
3289
- sleep 30
3290
- begin
3291
- stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3292
- if is_on_gpu
3293
- gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3294
- stats['gpu_util'] = gu[0]
3295
- stats['gpu'] = gu[1]
3299
+ if docker_stats
3300
+ stats_thread = Thread.new do
3301
+ while process_running do
3302
+ sleep 30
3303
+ begin
3304
+ stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? { memory: memory_usage, cpu: cpu_usage } : {}
3305
+ if is_on_gpu
3306
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3307
+ stats['gpu_util'] = gu[0]
3308
+ stats['gpu'] = gu[1]
3309
+ end
3310
+ @exp.send_machine_stats [stats] unless stats.empty?
3311
+ rescue => e
3312
+ log_error(e)
3313
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3296
3314
  end
3297
- @exp.send_machine_stats [stats] unless stats.empty?
3298
- rescue => e
3299
- log_error(e)
3300
- log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3301
3315
  end
3302
3316
  end
3303
3317
  end
@@ -3395,7 +3409,7 @@ module Cnvrg
3395
3409
  end
3396
3410
 
3397
3411
  # log_thread.join
3398
- stats_thread.join
3412
+ stats_thread.join if docker_stats
3399
3413
 
3400
3414
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3401
3415
 
@@ -3415,7 +3429,7 @@ module Cnvrg
3415
3429
  log_message("Couldn't run #{cmd}, check your input parameters", Thor::Shell::Color::RED)
3416
3430
  if @exp
3417
3431
  # log_thread.join
3418
- Thread.kill(stats_thread)
3432
+ Thread.kill(stats_thread) if docker_stats
3419
3433
  exit_status = $?.exitstatus
3420
3434
  if exit_status.blank?
3421
3435
  exit_status = "-1"
@@ -3439,7 +3453,7 @@ module Cnvrg
3439
3453
  end_commit = @project.last_local_commit
3440
3454
  process_running = false
3441
3455
  # log_thread.join
3442
- stats_thread.join
3456
+ stats_thread.join if docker_stats
3443
3457
 
3444
3458
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3445
3459
  if container
@@ -4630,6 +4644,81 @@ module Cnvrg
4630
4644
  end
4631
4645
  end
4632
4646
 
4647
+ desc 'Collect and send job utilization', '', :hide => true
4648
+ method_option :prometheus_url, :type => :string, :aliases => ["--prometheus_url"], :desc => "prometheus url to collect metrics from"
4649
+ method_option :node_name, :type => :string, :aliases => ["--node_name"], :desc => "machine activity node name"
4650
+ method_option :machine, :type => :boolean, :aliases => ["--machine"], :desc => "get machine_query or cluster_query"
4651
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4652
+ method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4653
+ method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "time to wait between querying", :default => 30
4654
+
4655
+ def collect_metrics
4656
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4657
+ prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4658
+
4659
+ translate_result = Cnvrg::API_V2.request(
4660
+ "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4661
+ 'GET',
4662
+ { gpu: options[:gpu] }
4663
+ )
4664
+
4665
+ is_machine = options[:machine]
4666
+ while true do
4667
+ begin
4668
+ stats = {}
4669
+ translate_result.each do |query_name, metric|
4670
+ if is_machine
4671
+ metric_query = metric['machine_query'].presence || metric['query']
4672
+ query_content = metric_query.gsub('#JOB_SLUG#', ENV['CNVRG_JOB_ID']).gsub('#NODE_NAME#', options[:node_name])
4673
+ else
4674
+ metric_query = metric['cluster_query'].presence || metric['query']
4675
+ pod_name = `hostname`.strip
4676
+ query_content = metric_query.gsub('#JOB_SLUG#', pod_name).gsub('#NODE_NAME#', options[:node_name])
4677
+ end
4678
+ if metric_query.blank? || query_content.blank?
4679
+ next
4680
+ end
4681
+ uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4682
+ resp = Net::HTTP.get(uri)
4683
+ begin
4684
+ result = JSON.parse(resp)
4685
+ rescue JSON::ParserError => e
4686
+ log_error(e)
4687
+ next
4688
+ end
4689
+ data_result = result&.dig('data', 'result')
4690
+ next unless data_result
4691
+
4692
+ if data_result.size > 1
4693
+ stats[query_name] = {}
4694
+ data_result.each_with_index do |res, i|
4695
+ timestamp, value = res["value"]
4696
+ uuid = res["metric"]["UUID"].presence || i
4697
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4698
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4699
+ stats[query_name][uuid] = stat_value
4700
+ end
4701
+ else
4702
+ timestamp, value = data_result&.first&.dig('value')
4703
+ stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4704
+ stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4705
+ if query_name.include? 'block'
4706
+ stats['block_io'] = {} if stats['block_io'].blank?
4707
+ io_type = query_name.split('_')[1]
4708
+ stats['block_io'].merge!({ io_type => stat_value })
4709
+ else
4710
+ stats[query_name] = stat_value
4711
+ end
4712
+ end
4713
+ end
4714
+ @exp.send_machine_stats [stats] unless stats.empty?
4715
+ rescue => e
4716
+ log_error(e)
4717
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4718
+ end
4719
+ sleep options[:wait]
4720
+ end
4721
+ end
4633
4722
 
4634
4723
  desc '', '', :hide => true
4635
4724
 
data/lib/cnvrg/data.rb CHANGED
@@ -81,7 +81,6 @@ module Cnvrg
81
81
  end
82
82
  end
83
83
 
84
-
85
84
  desc "data upload", "Upload files from local dataset directory to remote server"
86
85
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
87
86
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -155,6 +154,7 @@ module Cnvrg
155
154
  method_option :flatten, :type => :boolean, :aliases => ["-f", "--flatten"], :default => false
156
155
  method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
157
156
  method_option :threads, :type => :numeric, :aliases => ["--threads"], :default => 15
157
+ method_option :cache_link, :type => :boolean, :aliases => ["--cache_link"], :default => false, :hide => true
158
158
  def clone(dataset_url)
159
159
  cli = Cnvrg::CLI.new()
160
160
  only_tree =options[:only_tree]
@@ -165,6 +165,7 @@ module Cnvrg
165
165
  soft = options[:soft]
166
166
  flatten = options[:flatten]
167
167
  threads = options[:threads]
168
+ cache_link = options[:cache_link]
168
169
  cli.clone_data(
169
170
  dataset_url,
170
171
  only_tree=only_tree,
@@ -175,7 +176,8 @@ module Cnvrg
175
176
  flatten: flatten,
176
177
  relative: options[:relative],
177
178
  soft: soft,
178
- threads: threads
179
+ threads: threads,
180
+ cache_link: cache_link
179
181
  )
180
182
  end
181
183
 
@@ -220,6 +222,8 @@ module Cnvrg
220
222
  method_option :threads, :type => :numeric, :aliases => ["-t","--threads"], :default => 15
221
223
  method_option :chunk_size, :type => :numeric, :aliases => ["-cs","--chunk"], :default => 1000
222
224
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
225
+ method_option :auto_cache, :type => :boolean, :aliases => ["--auto_cache"], :desc => "auto_cache", :default => false
226
+ method_option :external_disk, :type => :string, :aliases => ["--external_disk"], :desc => "external_disk_title", :default => nil
223
227
 
224
228
  def put(dataset_url, *files)
225
229
  cli = Cnvrg::CLI.new()
@@ -231,6 +235,8 @@ module Cnvrg
231
235
  message = options[:message]
232
236
  threads = options[:threads]
233
237
  chunk_size = options[:chunk_size]
238
+ auto_cache = options[:auto_cache]
239
+ external_disk = options[:external_disk]
234
240
  cli.data_put(
235
241
  dataset_url,
236
242
  files: files,
@@ -240,16 +246,28 @@ module Cnvrg
240
246
  override: override,
241
247
  threads: threads,
242
248
  chunk_size: chunk_size,
243
- message: message
249
+ message: message,
250
+ auto_cache: auto_cache,
251
+ external_disk: external_disk
244
252
  )
245
253
  end
246
254
 
247
255
  desc 'data rm DATASET_URL FILES_PREFIX', 'Delete selected files from remote server'
248
256
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
257
+ method_option :auto_cache, :type => :boolean, :aliases => ["--auto_cache"], :desc => "auto_cache", :default => false
258
+ method_option :external_disk, :type => :string, :aliases => ["--external_disk"], :desc => "external_disk_title", :default => nil
249
259
  def rm(dataset_url, *regex_list)
250
260
  cli = Cnvrg::CLI.new()
251
261
  message = options[:message]
252
- cli.data_rm(dataset_url, regex_list: regex_list, message: message)
262
+ auto_cache = options[:auto_cache]
263
+ external_disk = options[:external_disk]
264
+ cli.data_rm(
265
+ dataset_url,
266
+ regex_list: regex_list,
267
+ message: message,
268
+ auto_cache: auto_cache,
269
+ external_disk: external_disk
270
+ )
253
271
  end
254
272
 
255
273
  desc 'data clone_query --query=QUERY_SLUG DATASET_URL', 'Clone dataset with specific query'
@@ -345,7 +345,8 @@ module Cnvrg
345
345
  cli = CLI.new
346
346
  cli.log_message("Using #{threads} threads with chunk size of #{chunk_size}.", Thor::Shell::Color::GREEN)
347
347
 
348
- progressbar = create_progressbar("Upload Progress", files.size)
348
+ num_files = files.size
349
+ progressbar = create_progressbar("Upload Progress", num_files)
349
350
  cli = CLI.new
350
351
 
351
352
  # Vars to handle the parallelism
@@ -355,6 +356,7 @@ module Cnvrg
355
356
  dirs_queue = Queue.new
356
357
  worker_threads = []
357
358
  progress_threads = []
359
+ old_api = false
358
360
 
359
361
  # Vars to keep track of uploaded files and directories
360
362
  uploaded_files = []
@@ -382,23 +384,30 @@ module Cnvrg
382
384
  dir_thread = Thread.new do
383
385
  dirs_to_create = []
384
386
  loop do
385
- progress_mutex.synchronize {
386
- dir = dirs_queue.deq(non_block: true) rescue nil
387
- dirs_to_create << dir unless dir.nil?
388
- }
389
- if dirs_to_create.size >= 1000 || progressbar.finished?
390
- resp = Cnvrg::API.request(@base_resource + "create_dirs", "POST", {dirs: dirs_to_create, commit_sha1: commit_sha1})
391
-
392
- break if resp == false # if resp is false it means 404 which is old server
393
- unless Cnvrg::CLI.is_response_success(resp, false)
394
- time = Time.current
395
- Cnvrg::Logger.log_error_message("Failed to create dirs: #{time}, #{resp.try(:fetch, "message")}")
387
+ dir = dirs_queue.deq(non_block: true) rescue nil
388
+ if dir.nil? && !progressbar.finished?
389
+ sleep 0.2
390
+ Cnvrg::Logger.info("directories thread status: progressbar.finished? #{progressbar.finished?} || dirs_queue.empty? #{dirs_queue.empty?} #{dirs_queue.size} || dirs_to_create.empty? #{dirs_to_create.empty?} #{dirs_to_create.size}")
391
+ else
392
+ dirs_to_create << dir
393
+
394
+ if dirs_to_create.size >= 1000 || progressbar.finished?
395
+ resp = Cnvrg::API.request(@base_resource + "create_dirs", "POST", { dirs: dirs_to_create, commit_sha1: commit_sha1 })
396
+ Cnvrg::Logger.info("uploaded directories chunk, finished with #{resp}")
397
+ if resp == false # if resp is false it means 404 which is old server
398
+ old_api = true
399
+ break
400
+ end
401
+ unless Cnvrg::CLI.is_response_success(resp, false)
402
+ dirs_to_create = []
403
+ time = Time.current
404
+ Cnvrg::Logger.log_error_message("Failed to create dirs: #{time}, #{resp.try(:fetch, "message")}")
405
+ next
406
+ end
396
407
  dirs_to_create = []
397
- next
398
408
  end
399
- dirs_to_create = []
409
+ break if progressbar.finished? && dirs_queue.empty? && dirs_to_create.empty?
400
410
  end
401
- break if progressbar.finished? && dirs_queue.empty? && dirs_to_create.empty?
402
411
  end
403
412
  end
404
413
 
@@ -409,7 +418,6 @@ module Cnvrg
409
418
  file = progress_queue.deq(non_block: true) rescue nil # to prevent deadlocks
410
419
  unless file.nil?
411
420
  blob_ids = []
412
-
413
421
  progress_mutex.synchronize {
414
422
  progressbar.progress += 1
415
423
  uploaded_files.append(file) if file[:success]
@@ -421,32 +429,31 @@ module Cnvrg
421
429
  }
422
430
 
423
431
  if blob_ids.present?
432
+ random_id = (0...10).map { ('a'..'z').to_a[rand(26)] }.join
424
433
  refresh_storage_token
425
- Cnvrg::Logger.info("Finished upload chunk of #{chunk_size} files, Sending Upload files save")
426
-
434
+ Cnvrg::Logger.info("chunk #{random_id}: Finished uploading chunk of #{chunk_size} files, Sending Upload files save")
427
435
  retry_count = 0
428
436
  loop do
429
437
  upload_resp = Cnvrg::API.request(@base_resource + "upload_files_save", "POST", {commit: commit_sha1, blob_ids: blob_ids})
430
438
 
431
439
  if not (Cnvrg::CLI.is_response_success(upload_resp, false))
432
440
  retry_count += 1
433
- Cnvrg::Logger.log_error_message("Failed request save files: #{Time.current}, retry: #{retry_count}")
434
- Cnvrg::Logger.info("Got an error message from server, #{upload_resp.try(:fetch, "message")}")
441
+ Cnvrg::Logger.log_error_message("chunk #{random_id}: Failed request save files: #{Time.current}, retry: #{retry_count}")
435
442
  if retry_count > 20
436
- puts "Failed to save files: #{Time.current}, trying next chunk"
443
+ puts "chunk #{random_id}: Failed to save files: #{Time.current}, trying next chunk"
437
444
  break
438
445
  end
439
446
  sleep 5
440
447
  next
441
448
  end
442
- Cnvrg::Logger.info("Chunk saved on server")
449
+ Cnvrg::Logger.info("chunk #{random_id}: Chunk saved on server")
443
450
  break
444
451
  end
445
452
  end
446
453
  else
447
454
  sleep(0.1)
448
455
  end
449
-
456
+ Cnvrg::Logger.info("progress_threads: progressbar.finished? #{progressbar.finished?}")
450
457
  if progressbar.finished?
451
458
  Cnvrg::Logger.info("Progress bar finished closing queues")
452
459
  file_queue.close
@@ -459,35 +466,43 @@ module Cnvrg
459
466
 
460
467
  file_chunks = files.each_slice(chunk_size).to_a
461
468
  # Fetch the required files from the server:
469
+ num_chunks = (num_files / 1000.0).ceil
470
+ chunk_index = 0
462
471
  Parallel.map((file_chunks), in_threads: threads) do |chunk|
463
- files_chunk = chunk.map{|p| p.gsub(/^\.\//, '')}
464
- Cnvrg::Logger.info("Generating chunk idx")
472
+ chunk_index += 1
473
+ self_chunk_index = chunk_index
474
+ files_chunk = chunk.map { |p| p.gsub(/^\.\//, '') }
475
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Generating chunk idx")
465
476
  tree = @dataset.generate_chunked_idx(files_chunk, prefix: prefix, threads: threads, cli: cli)
477
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Finished Generating chunk idx")
466
478
 
467
- progress_mutex.synchronize {
468
- # Handle directories:
469
- new_dirs = tree.keys.select { |k| tree[k].nil? }
470
-
471
- if new_dirs.blank?
472
- ## we need to send 1 file so we will inflated dirs from in case when we dont have folders in the tree
473
- file = tree.keys.find { |k| tree[k] != nil }
474
- dirs_queue.push file
479
+ # Handle directories:
480
+ unless old_api
481
+ while dirs_queue.size > 5000
482
+ sleep(0.1)
475
483
  end
484
+ end
485
+ new_dirs = tree.keys.select { |k| tree[k].nil? }
486
+ if new_dirs.blank?
487
+ ## we need to send 1 file so we will inflated dirs from in case when we dont have folders in the tree
488
+ file = tree.keys.find { |k| tree[k] != nil }
489
+ dirs_queue.push(file) unless old_api
490
+ end
491
+ new_dirs.each { |dir| dirs_queue.push dir }
476
492
 
477
- new_dirs.each { |dir| dirs_queue.push dir }
478
- }
479
- Cnvrg::Logger.info("Getting files info from server")
480
-
493
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Getting files info from server")
481
494
  results = request_upload_files(commit_sha1, tree, override, new_branch, partial_commit)
495
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: Finished Getting files info from server")
482
496
  next unless results
483
497
 
484
498
  if results['files'].blank?
499
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: no files to upload skipping chunk")
485
500
  progress_mutex.synchronize { progressbar.progress += tree.keys.length }
486
501
  next
487
502
  end
488
503
 
489
504
  files_to_upload = results['files']
490
-
505
+ Cnvrg::Logger.info("chunk #{self_chunk_index} / #{num_chunks}: number of files to upload in this chunk: #{tree.keys.length - files_to_upload.length}")
491
506
  progress_mutex.synchronize {
492
507
  progressbar.progress += tree.keys.length - files_to_upload.length
493
508
  }
@@ -500,10 +515,12 @@ module Cnvrg
500
515
  end
501
516
  end
502
517
 
503
- Cnvrg::Logger.info("Waiting to progress and workers to finish")
518
+ Cnvrg::Logger.info("Waiting dir_thread to finish")
504
519
  dir_thread.join
505
520
  dirs_queue.close
521
+ Cnvrg::Logger.info("Waiting progress_thread to finish")
506
522
  progress_threads.each(&:join)
523
+ Cnvrg::Logger.info("Waiting workers to finish")
507
524
  worker_threads.each(&:join)
508
525
  Thread.report_on_exception = true
509
526
  rescue => e
@@ -1243,7 +1260,7 @@ module Cnvrg
1243
1260
  false
1244
1261
  end
1245
1262
 
1246
- def end_commit(commit_sha1, force, success: true, uploaded_files: 0, commit_type: nil)
1263
+ def end_commit(commit_sha1, force, success: true, uploaded_files: 0, commit_type: nil, auto_cache: false, external_disk: nil)
1247
1264
  counter = 0
1248
1265
  begin
1249
1266
  counter += 1
@@ -1255,7 +1272,9 @@ module Cnvrg
1255
1272
  force:force,
1256
1273
  success: success,
1257
1274
  uploaded_files: uploaded_files,
1258
- commit_type: commit_type
1275
+ commit_type: commit_type,
1276
+ auto_cache: auto_cache,
1277
+ external_disk: external_disk
1259
1278
  }
1260
1279
  )
1261
1280
  is_success = Cnvrg::CLI.is_response_success(response, false)
@@ -1289,8 +1308,8 @@ module Cnvrg
1289
1308
  response['result']['files']
1290
1309
  end
1291
1310
 
1292
- def get_clone_chunk(latest_id: nil, chunk_size: 1000, commit: 'latest')
1293
- response = Cnvrg::API.request("#{@base_resource}/clone_chunk", 'POST',{commit: commit, chunk_size: chunk_size, latest_id: latest_id})
1311
+ def get_clone_chunk(latest_id: nil, chunk_size: 1000, commit: 'latest', cache_link: false)
1312
+ response = Cnvrg::API.request("#{@base_resource}/clone_chunk", 'POST',{commit: commit, chunk_size: chunk_size, latest_id: latest_id, cache_link: cache_link})
1294
1313
  unless Cnvrg::CLI.is_response_success(response, false)
1295
1314
  Cnvrg::Logger.log_info("#{{commit: commit, chunk_size: chunk_size, latest_id: latest_id}}")
1296
1315
  return nil
@@ -1357,7 +1376,7 @@ module Cnvrg
1357
1376
  end
1358
1377
  end
1359
1378
 
1360
- def download_multiple_files_s3(files, project_home, conflict: false, progressbar: nil, read_only:false, flatten: false, threads: 15)
1379
+ def download_multiple_files_s3(files, project_home, conflict: false, progressbar: nil, read_only:false, flatten: false, threads: 15, cache_link: false)
1361
1380
  begin
1362
1381
  refresh_storage_token
1363
1382
  parallel_options = {
@@ -1378,10 +1397,18 @@ module Cnvrg
1378
1397
  # blob
1379
1398
  local_path = "#{local_path}.conflict" if conflict
1380
1399
  storage_path = f["path"]
1381
- # if File.exists? local_path
1382
- # Cnvrg::Logger.log_info("Trying to download #{local_path} but its already exists, skipping..")
1383
- # next
1384
- # end
1400
+ # if File.exists? local_path
1401
+ # Cnvrg::Logger.log_info("Trying to download #{local_path} but its already exists, skipping..")
1402
+ # next
1403
+ # end
1404
+ if cache_link
1405
+ cached_commits = f['cached_commits']
1406
+
1407
+ if cached_commits.present?
1408
+ next if @downloader.link_file(cached_commits, local_path, @dataset.title, f['name'])
1409
+ end
1410
+ end
1411
+
1385
1412
  resp = @downloader.safe_download(storage_path, local_path)
1386
1413
  Cnvrg::Logger.log_info("Download #{local_path} success resp: #{resp}")
1387
1414
  rescue => e
@@ -37,6 +37,21 @@ module Cnvrg
37
37
  ### need to be implemented..
38
38
  end
39
39
 
40
+ def link_file(cached_commits, local_path, dataset_title, file_name)
41
+ prepare_download(local_path)
42
+ cached_commits.each do |cached_commit|
43
+ nfs_path = "/nfs-disk/#{cached_commit}/#{dataset_title}/#{file_name}"
44
+ if File.exist? nfs_path
45
+ FileUtils.ln(nfs_path, local_path)
46
+ return true
47
+ end
48
+ end
49
+ false
50
+ rescue => e
51
+ Cnvrg::Logger.log_error(e)
52
+ false
53
+ end
54
+
40
55
  def safe_download(storage_path, local_path, decrypt: true)
41
56
  safe_operation(local_path) { self.download(storage_path, local_path, decrypt: decrypt) }
42
57
  end
@@ -133,23 +133,30 @@ module Cnvrg
133
133
  return response
134
134
  end
135
135
  def remote_notebook(instance_type, commit, data, data_commit, notebook_type,ds_sync_options=0,data_query=nil, image = nil, datasets = nil)
136
- response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', {instance_type: instance_type,dataset_slug:data,
137
- dataset_commit: data_commit,image_slug:image,
138
- datasets: datasets,
139
- commit:commit,notebook_type:notebook_type,dataset_sync_options:ds_sync_options,
140
- dataset_query:data_query})
136
+ response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/notebook/remote", 'POST', { instance_type: instance_type, dataset_slug:data,
137
+ dataset_commit: data_commit, image_slug:image,
138
+ datasets: datasets,
139
+ commit:commit, notebook_type:notebook_type, dataset_sync_options:ds_sync_options,
140
+ dataset_query: data_query })
141
141
  return response
142
142
  end
143
143
 
144
144
  def upload_temp_log(temp_log)
145
- response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', {output: temp_log,
146
- exp_slug: @slug})
147
- Cnvrg::CLI.is_response_success(response,false)
145
+ response = Cnvrg::API.request(@base_resource + "experiment/upload_temp_log", 'POST', { output: temp_log,
146
+ exp_slug: @slug })
147
+ Cnvrg::CLI.is_response_success(response, false)
148
148
  end
149
149
 
150
150
  def send_machine_stats(stats)
151
- response = Cnvrg::API.request(@base_resource + "experiment/upload_stats", "POST", {exp_slug: @slug, stats: stats.map{|s| s.merge!({time: Time.now})}})
152
- Cnvrg::CLI.is_response_success(response,false)
151
+ response = Cnvrg::API.request(
152
+ @base_resource + "experiment/upload_stats",
153
+ "POST",
154
+ {
155
+ exp_slug: @slug,
156
+ stats: stats.map { |s| s.merge!({ time: Time.now }) }
157
+ }
158
+ )
159
+ Cnvrg::CLI.is_response_success(response, false)
153
160
  end
154
161
 
155
162
  def end(output, exit_status, end_commit, cpu_average, memory_average, end_time: nil)
@@ -158,7 +165,7 @@ module Cnvrg
158
165
  success = false
159
166
  end_time ||= Time.now
160
167
  while tries < 10 and success.blank?
161
- sleep (tries*rand) ** 2 ### exponential backoff
168
+ sleep (tries * rand) ** 2 ### exponential backoff
162
169
  ## this call is super important so we cant let it crash.
163
170
 
164
171
  tries += 1
data/lib/cnvrg/files.rb CHANGED
@@ -134,7 +134,6 @@ module Cnvrg
134
134
  end
135
135
  end
136
136
 
137
-
138
137
  blob_ids.concat blob_id_chunk
139
138
  end
140
139
 
data/lib/cnvrg/helpers.rb CHANGED
@@ -14,6 +14,7 @@ module Cnvrg
14
14
  }
15
15
  end
16
16
  def checkmark
17
+ return "" if Cnvrg::Helpers.windows?
17
18
  checkmark = "\u2713"
18
19
  return checkmark.encode('utf-8')
19
20
  end
@@ -201,7 +201,7 @@ class Cnvrg::Helpers::Executer
201
201
  pod_name = `hostname`.strip rescue nil
202
202
  node_name = nil
203
203
  if pod_name.present?
204
- pod_describe = `kubectl -n cnvrg get pod #{pod_name} -o json` rescue nil
204
+ pod_describe = `kubectl get pod #{pod_name} -o json` rescue nil
205
205
  pod_describe = JSON.parse(pod_describe) rescue {}
206
206
  node_name = pod_describe["spec"]["nodeName"] rescue nil
207
207
  end
@@ -217,7 +217,7 @@ class Cnvrg::Helpers::Executer
217
217
 
218
218
  def get_pod_events(pod_name)
219
219
  return if pod_name.blank?
220
- `kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json`
220
+ `kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
221
221
  end
222
222
 
223
223
  def get_node_events(node_name)
data/lib/cnvrg/project.rb CHANGED
@@ -381,7 +381,7 @@ module Cnvrg
381
381
  def generate_output_dir(output_dir)
382
382
  Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
383
383
  upload_list = []
384
- list = Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
384
+ list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
385
385
  Parallel.map(list, in_threads: IDXParallelThreads) do |e|
386
386
  next if e.end_with? "/."
387
387
  if File.directory? e
@@ -517,17 +517,17 @@ module Cnvrg
517
517
  commit = local_idx[:commit]
518
518
  tree = local_idx[:tree]
519
519
  ignore_list = self.send_ignore_list()
520
- if force
520
+ if force or specific_files.present?
521
521
  added = []
522
522
  if tree.present?
523
523
  added += local_idx[:tree].keys
524
524
  end
525
- response = {"result" => {"commit" => nil, "tree" => {"added" => added,
526
- "updated_on_server" => [],
527
- "updated_on_local" => [],
528
- "update_local" => [],
529
- "deleted" => [],
530
- "conflicts" => []}}}
525
+ response = { "result" => { "commit" => nil, "tree" => { "added" => added,
526
+ "updated_on_server" => [],
527
+ "updated_on_local" => [],
528
+ "update_local" => [],
529
+ "deleted" => [],
530
+ "conflicts" => [] } } }
531
531
  return response
532
532
  end
533
533
  #we dont want to send it on download - we only compare between commits sha1 in download.
@@ -535,6 +535,7 @@ module Cnvrg
535
535
  #the new server doesnt need the tree, but the old probably needs :X
536
536
  local_idx[:tree] = {} if Cnvrg::Helpers.server_version > 0
537
537
  end
538
+
538
539
  response = Cnvrg::API.request(@base_resource + "status", 'POST', {idx: local_idx, new_branch: new_branch,
539
540
  current_commit: commit, ignore: ignore_list, force: force, in_exp: in_exp, download: download})
540
541
 
data/lib/cnvrg/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.24'
2
+ VERSION = '1.11.29'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.24
4
+ version: 1.11.29
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-02-25 00:00:00.000000000 Z
13
+ date: 2021-03-30 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
@@ -105,7 +105,7 @@ dependencies:
105
105
  version: 0.3.1
106
106
  - - ">="
107
107
  - !ruby/object:Gem::Version
108
- version: 0.3.2
108
+ version: 0.3.7
109
109
  type: :runtime
110
110
  prerelease: false
111
111
  version_requirements: !ruby/object:Gem::Requirement
@@ -115,7 +115,7 @@ dependencies:
115
115
  version: 0.3.1
116
116
  - - ">="
117
117
  - !ruby/object:Gem::Version
118
- version: 0.3.2
118
+ version: 0.3.7
119
119
  - !ruby/object:Gem::Dependency
120
120
  name: faraday
121
121
  requirement: !ruby/object:Gem::Requirement
@@ -394,6 +394,7 @@ executables:
394
394
  extensions: []
395
395
  extra_rdoc_files: []
396
396
  files:
397
+ - Readme.md
397
398
  - bin/cnvrg
398
399
  - cnvrg.gemspec
399
400
  - lib/cnvrg.rb