cnvrg 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6533e6de1028236f163d504243a2afef14da2f2b
4
- data.tar.gz: 5327a071845d376bff81c76197a0b6844377d36e
3
+ metadata.gz: e44751ed52140fdaee81469a183a0f06f86968e1
4
+ data.tar.gz: 7b95fb6b3720a17d2edfe7a12da9c21ef0d3034f
5
5
  SHA512:
6
- metadata.gz: d959e3931999dfbf64b45105b9f5e1e048219ee0b6e4f695d87193540229e998093347d3aa81896faa6f6e4d72e6cdfb629aca448558c3e19b7b88d8c2b397ce
7
- data.tar.gz: b0ba98342dd6675c19e34376f6bd4f3157431e562c9d3850f988ec3b74a5d60008592b9a21cd3498c3a53de389e1f82460bcd930b09d736edd070ca8c3dba16d
6
+ metadata.gz: 2766aba6a508f5b073b4b12e987ee1d4568f84c47c632e6fefa6bf6fc0416a3737727cae88dfa7b97873c3f6461c5821d15a4ca9c7c54e32e9ca763b3e8b79d2
7
+ data.tar.gz: 8d2985f8c0ff5ea5d70131e993d896ce3ddd20a1f855dcf7ba7cad4a862a09fa13424bcdb3c5ebccb2a4e2ec9746f00eff2d7be49b7ee810826dc94d41a05a35
@@ -3095,9 +3095,10 @@ module Cnvrg
3095
3095
  method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
3096
3096
  method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
3097
3097
  method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
3098
- method_option :gpu_1, :type => :boolean, :aliases => ["--gpu"], :default => false
3099
- method_option :gpu_2, :type => :boolean, :aliases => ["--gpuxl"], :default => false
3100
- method_option :gpu_3, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
3098
+ method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3099
+ method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
3100
+ method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
3101
+ method_option :machine, :type => :string, :aliases => ["-m", "--machine"], :default => nil
3101
3102
  method_option :sync_before, :type => :boolean, :aliases => ["-sb", "--sync_before"], :default => true
3102
3103
  method_option :sync_after, :type => :boolean, :aliases => ["-sa", "--sync_after"], :default => true
3103
3104
  method_option :title, :type => :string, :aliases => ["-t", "--title"], :default => ""
@@ -3139,12 +3140,9 @@ module Cnvrg
3139
3140
  force = options["force"]
3140
3141
  max_time = options["max_time"]
3141
3142
  dataset_only_tree = options["dataset_only_tree"]
3143
+ custom_machine = options["machine"]
3142
3144
 
3143
3145
  options_hash = Hash[options]
3144
- real_options = []
3145
- options_hash.each do |o|
3146
- real_options << o if (!o[1].eql? "" and !["small", "medium", "large", "gpu_1", "gpu_2", "gpu_3"].include? o[0])
3147
- end
3148
3146
  if local
3149
3147
  invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
3150
3148
  :log => log, :email_notification => email_notification, :upload_output => upload_output,
@@ -3163,15 +3161,10 @@ module Cnvrg
3163
3161
 
3164
3162
  end
3165
3163
  end
3166
- real_options.delete(["local", false])
3167
3164
  instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
3168
- "gpu_1" => options["gpu_2"], "gpu_3" => options["gpuxxl"]}
3165
+ "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"],
3166
+ options["machine"] => !options["machine"].blank?}
3169
3167
  instance_type = get_instance_type(instances)
3170
- if !instance_type.nil? and !instance_type.empty?
3171
- real_options << ["machine_type", instance_type]
3172
- end
3173
- exec_options = real_options.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3174
- cmd_to_exec = "#{exec_options} #{cmd.join(" ")}"
3175
3168
  invoke :exec_remote, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title, :machine_type => instance_type,
3176
3169
  :schedule => schedule, :log => log, :email_notification => email_notification, :upload_output => upload_output, :commit => commit,
3177
3170
  :image => image, :grid => grid, :data => data, :data_commit => data_commit, :ignore => ignore, :force => force, :sync_before_terminate => sync_before_terminate,
@@ -3227,7 +3220,6 @@ module Cnvrg
3227
3220
  time_to_upload = calc_output_time(upload_output)
3228
3221
  project_home = get_project_home
3229
3222
  @project = Project.new(project_home)
3230
-
3231
3223
  is_new_branch = @project.compare_commit(commit)
3232
3224
  begin
3233
3225
  if !commit.nil? and !commit.empty?
@@ -3244,8 +3236,6 @@ module Cnvrg
3244
3236
  end
3245
3237
  if !indocker
3246
3238
  image_proj = is_project_with_docker(working_dir)
3247
-
3248
-
3249
3239
  if image_proj and image_proj.is_docker
3250
3240
  container = image_proj.get_container
3251
3241
  if !container
@@ -3292,7 +3282,7 @@ module Cnvrg
3292
3282
  stdout, stderr = '', ''
3293
3283
  begin
3294
3284
 
3295
- # if remote
3285
+ if remote
3296
3286
  if @exp.sync_before_terminate
3297
3287
  spot_status_thread = Thread.new do
3298
3288
  begin
@@ -3312,24 +3302,11 @@ module Cnvrg
3312
3302
  end
3313
3303
  end
3314
3304
  end
3315
- if @exp.sync_delay_time
3316
- regular_sync = Thread.new do
3317
- begin
3318
- loop do
3319
- # sync
3320
- sleep(@exp.sync_delay_time)
3321
-
3322
- download(sync = true, ignore_list = [])
3323
- upload(link = false, sync = true, direct = false, ignore_list = [])
3324
- end
3325
- rescue => e
3326
- log_error(e)
3327
- end
3328
- end
3329
- end
3330
- # end
3305
+ end
3306
+ process_running = true
3331
3307
  stats_thread = Thread.new do
3332
- loop do
3308
+ while process_running do
3309
+ sleep 30
3333
3310
  begin
3334
3311
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.linux? ? {memory: memory_usage, cpu: cpu_usage} : {}
3335
3312
  if is_on_gpu
@@ -3337,85 +3314,90 @@ module Cnvrg
3337
3314
  stats['gpu'] = gu[0]
3338
3315
  stats['gpu_util'] = gu[1]
3339
3316
  end
3340
- @exp.send_machine_stats [stats]
3317
+ @exp.send_machine_stats [stats] unless stats.empty?
3341
3318
  rescue => e
3342
3319
  log_error(e)
3343
3320
  log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
3344
- ensure
3345
- sleep(30)
3346
3321
  end
3347
3322
  end
3348
3323
  end
3349
-
3350
-
3351
3324
  log_thread = Thread.new do
3352
- loop do
3353
- log_count = log.count
3354
- time_to_upload = calc_output_log_time(log_count)
3325
+ while process_running or !log.empty? do
3355
3326
  begin
3356
- if log_count != 0
3357
- if time_to_upload <= Time.now - start_loop
3358
- @exp.upload_temp_log(log)
3359
- log = []
3360
- start_loop = Time.now
3361
- end
3327
+ temp_log = log
3328
+ if !temp_log.empty?
3329
+ @exp.upload_temp_log(temp_log) unless temp_log.empty?
3330
+
3362
3331
  end
3332
+ log -= temp_log
3363
3333
  rescue => e
3364
3334
  log_message("Failed to upload ongoing results, continuing with experiment", Thor::Shell::Color::YELLOW)
3365
3335
  log_error(e)
3336
+ ensure
3337
+ sleep 10
3366
3338
  end
3367
- sleep(1)
3368
3339
  end
3369
3340
  end
3370
-
3371
3341
  PTY.spawn(cmd) do |stdout, stdin, pid, stderr|
3372
3342
  begin
3373
3343
  stdout.each do |line|
3374
3344
  cur_time = Time.now
3375
3345
  real_time = Time.now - real
3376
-
3377
3346
  cur_log = {time: cur_time,
3378
3347
  message: line,
3379
3348
  type: "stdout",
3380
3349
  real: real_time
3381
-
3382
3350
  }
3383
-
3384
-
3351
+ $LOG.info(cur_log)
3385
3352
  if print_log
3386
3353
  puts cur_log
3387
3354
  end
3388
3355
  log << cur_log
3356
+ # if log.size >= 10
3357
+ # @exp.upload_temp_log(log)
3358
+ # log = []
3359
+ # end
3389
3360
  end
3390
-
3391
-
3392
3361
  if stderr
3393
-
3394
3362
  stderr.each do |err|
3395
-
3396
3363
  log << {time: Time.now, message: err, type: "stderr"}
3397
3364
  end
3398
3365
  end
3399
-
3400
3366
  rescue Errno::EIO => e
3401
3367
  log_error(e)
3402
- rescue Errno::ENOENT => e
3368
+ if !log.empty?
3403
3369
 
3370
+ temp_log = log
3371
+ @exp.upload_temp_log(temp_log) unless temp_log.empty?
3372
+
3373
+ log -= temp_log
3374
+ end
3375
+ rescue Errno::ENOENT => e
3404
3376
  exp_success = false
3405
3377
  log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
3406
3378
  log_error(e)
3407
- rescue Open4::ChildExited
3408
- exp_success = false
3409
- log_message("The process exited!", Thor::Shell::Color::RED)
3379
+ # rescue Open4::ChildExited
3380
+ # exp_success = false
3381
+ # log_message("The process exited!", Thor::Shell::Color::RED)
3410
3382
  rescue => e
3411
3383
  sleep(20) # end cycle
3412
- res = @exp.end(log, 1, start_commit, cpu_average, memory_average)
3384
+ res = @exp.end(log, 1, start_commit, 0, 0)
3413
3385
  log_message("Error occurred,aborting", Thor::Shell::Color::RED)
3414
3386
  log_error(e)
3415
3387
  exit(0)
3416
3388
  end
3417
3389
  ::Process.wait pid
3418
3390
  end
3391
+ process_running = false
3392
+
3393
+ if !log.empty?
3394
+
3395
+ temp_log = log
3396
+ @exp.upload_temp_log(temp_log) unless temp_log.empty?
3397
+
3398
+ log -= temp_log
3399
+ end
3400
+
3419
3401
  cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
3420
3402
  memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
3421
3403
  exit_status = $?.exitstatus
@@ -3438,8 +3420,10 @@ module Cnvrg
3438
3420
  end
3439
3421
  end_commit = @project.last_local_commit
3440
3422
 
3441
- sleep(30) # end cycle
3442
- res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3423
+ log_thread.join
3424
+ stats_thread.join
3425
+
3426
+ res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3443
3427
 
3444
3428
 
3445
3429
  if !exp_success
@@ -3465,13 +3449,15 @@ module Cnvrg
3465
3449
 
3466
3450
  }
3467
3451
  log << cur_log
3468
- sleep(20) # end cycle
3452
+ process_running = false
3453
+ log_thread.join
3454
+ stats_thread.join
3469
3455
  res = @exp.end(log, "-1", end_commit, cpu_average, memory_average)
3470
3456
 
3471
3457
  end
3472
3458
  log_error(e)
3473
- Thread.kill(log_thread)
3474
- Thread.kill(stats_thread)
3459
+ # Thread.kill(log_thread)
3460
+ # Thread.kill(stats_thread)
3475
3461
 
3476
3462
  exit(1)
3477
3463
  end
@@ -3483,7 +3469,9 @@ module Cnvrg
3483
3469
  rescue SignalException
3484
3470
  exit_status = -1
3485
3471
  end_commit = @project.last_local_commit
3486
- sleep(20) # end cycle
3472
+ process_running = false
3473
+ log_thread.join
3474
+ stats_thread.join
3487
3475
 
3488
3476
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
3489
3477
  if container
@@ -5519,7 +5507,12 @@ module Cnvrg
5519
5507
  if config and !config.nil? and !config.empty? and !config.to_h[:compression_path].nil?
5520
5508
  compression_path = config.to_h[:compression_path]
5521
5509
  end
5522
- config = {owner: owner, username: username, version_last_check: get_start_day(), api: url, compression_path: compression_path}
5510
+ verify_ssl = false
5511
+
5512
+ if config and !config.nil? and !config.empty? and !config.to_h[:verfiy_ssl].nil?
5513
+ verify_ssl = config.to_h[:verify_ssl]
5514
+ end
5515
+ config = {owner: owner, username: username, version_last_check: get_start_day(), api: url, compression_path: compression_path, verfiy_ssl:verify_ssl}
5523
5516
 
5524
5517
  File.open(home_dir + "/.cnvrg/config.yml", "w+") {|f| f.write config.to_yaml}
5525
5518
  return true
@@ -6145,27 +6138,30 @@ module Cnvrg
6145
6138
  end
6146
6139
 
6147
6140
  def gpu_util
6148
- gpu = [0.0, 0.0]
6141
+ stats = [[],[]]
6149
6142
  begin
6150
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv |tail -1`
6143
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
6151
6144
 
6152
6145
  if !gpu_stats.nil?
6153
- gpu = gpu_stats.strip
6154
- gpu = gpu_stats.gsub!("%", "").split(",")
6155
- gpu[0] = gpu[0].to_f
6156
- gpu[1] = gpu[1].to_f
6157
- return gpu
6146
+ gpu_stats = gpu_stats.split("\n")[1..-1]
6147
+ stats = [[],[]]
6148
+ gpu_stats.each do |stat|
6149
+ gpu = stat.strip.gsub!("%", "").split(",")
6150
+ stats[0] << gpu[0].to_f
6151
+ stats[1] << gpu[1].to_f
6152
+ end
6153
+ return stats
6158
6154
  end
6159
6155
 
6160
6156
  rescue
6161
- return gpu
6157
+ return stats
6162
6158
  end
6163
6159
 
6164
6160
 
6165
6161
  end
6166
6162
 
6167
6163
  def usage_metrics_in_docker(docker_id)
6168
- res = {cpu: 0.0, memory: 0.0}
6164
+ res = {cpu: 0.0, memory: 0.0, block_io: {input: 0, output: 0.0}}
6169
6165
  begin
6170
6166
  if docker_id.nil?
6171
6167
  docker_id = `cat /etc/hostname`
@@ -6174,15 +6170,46 @@ module Cnvrg
6174
6170
  if !stats.nil?
6175
6171
  conv = stats.split(",")
6176
6172
  cpu = conv[0].gsub!("%", "").to_f
6173
+ res[:cpu] = cpu
6177
6174
  memory = conv[1].gsub!("%", "").to_f
6178
- res = {cpu: cpu, memory: memory}
6175
+ res[:memory] = memory
6176
+ block_io = parse_io conv[2]
6177
+ res = {cpu: cpu, memory: memory, block_io: block_io}
6179
6178
  return res
6180
6179
  end
6181
6180
  rescue
6182
6181
  return res
6183
6182
  end
6183
+ end
6184
+
6184
6185
 
6185
6186
 
6187
+ def parse_io(block_io_str)
6188
+ block_io = block_io_str.gsub!(" ", "").split('/')
6189
+ input = block_io[0]
6190
+ output = block_io[1]
6191
+ r = Regexp.new('(\d+(\.\d+)?)([A-Za-z]+)')
6192
+ input_match = r.match(input)
6193
+ input = input_match[1].to_f * size_to_bytes(input_match[3])
6194
+ output_match = r.match(output)
6195
+ output = output_match[1].to_f * size_to_bytes(output_match[3])
6196
+ {input: input, output: output}
6197
+ end
6198
+
6199
+
6200
+ def size_to_bytes size
6201
+ case size.try(:downcase)
6202
+ when 'b'
6203
+ 1
6204
+ when 'kb'
6205
+ 2**10
6206
+ when 'mb'
6207
+ 2**20
6208
+ when 'gb'
6209
+ 2**30
6210
+ else
6211
+ 1
6212
+ end
6186
6213
  end
6187
6214
 
6188
6215
  end
@@ -381,7 +381,7 @@ module Cnvrg
381
381
  end
382
382
  response = Cnvrg::API.request("users/#{self.owner}/projects/#{self.slug}/status", 'POST', {idx: local_idx, new_branch: new_branch,
383
383
  current_commit: commit,ignore:ignore_list, force:force,in_exp:in_exp})
384
- CLI.is_response_success(response,false)
384
+ CLI.is_response_success(response,true)
385
385
  return response
386
386
  end
387
387
  def jump_idx(new_branch, commit=last_local_commit)
@@ -1,4 +1,4 @@
1
1
  module Cnvrg
2
- VERSION = '0.4.4'
2
+ VERSION = '0.5.0'
3
3
  end
4
4
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-05-17 00:00:00.000000000 Z
12
+ date: 2018-06-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler