cnvrg 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cnvrg/cli.rb +109 -82
- data/lib/cnvrg/project.rb +1 -1
- data/lib/cnvrg/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e44751ed52140fdaee81469a183a0f06f86968e1
|
4
|
+
data.tar.gz: 7b95fb6b3720a17d2edfe7a12da9c21ef0d3034f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2766aba6a508f5b073b4b12e987ee1d4568f84c47c632e6fefa6bf6fc0416a3737727cae88dfa7b97873c3f6461c5821d15a4ca9c7c54e32e9ca763b3e8b79d2
|
7
|
+
data.tar.gz: 8d2985f8c0ff5ea5d70131e993d896ce3ddd20a1f855dcf7ba7cad4a862a09fa13424bcdb3c5ebccb2a4e2ec9746f00eff2d7be49b7ee810826dc94d41a05a35
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -3095,9 +3095,10 @@ module Cnvrg
|
|
3095
3095
|
method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
|
3096
3096
|
method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
|
3097
3097
|
method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
|
3098
|
-
method_option :
|
3099
|
-
method_option :
|
3100
|
-
method_option :
|
3098
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
|
3099
|
+
method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
|
3100
|
+
method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
|
3101
|
+
method_option :machine, :type => :string, :aliases => ["-m", "--machine"], :default => nil
|
3101
3102
|
method_option :sync_before, :type => :boolean, :aliases => ["-sb", "--sync_before"], :default => true
|
3102
3103
|
method_option :sync_after, :type => :boolean, :aliases => ["-sa", "--sync_after"], :default => true
|
3103
3104
|
method_option :title, :type => :string, :aliases => ["-t", "--title"], :default => ""
|
@@ -3139,12 +3140,9 @@ module Cnvrg
|
|
3139
3140
|
force = options["force"]
|
3140
3141
|
max_time = options["max_time"]
|
3141
3142
|
dataset_only_tree = options["dataset_only_tree"]
|
3143
|
+
custom_machine = options["machine"]
|
3142
3144
|
|
3143
3145
|
options_hash = Hash[options]
|
3144
|
-
real_options = []
|
3145
|
-
options_hash.each do |o|
|
3146
|
-
real_options << o if (!o[1].eql? "" and !["small", "medium", "large", "gpu_1", "gpu_2", "gpu_3"].include? o[0])
|
3147
|
-
end
|
3148
3146
|
if local
|
3149
3147
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3150
3148
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
@@ -3163,15 +3161,10 @@ module Cnvrg
|
|
3163
3161
|
|
3164
3162
|
end
|
3165
3163
|
end
|
3166
|
-
real_options.delete(["local", false])
|
3167
3164
|
instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
|
3168
|
-
"
|
3165
|
+
"gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"],
|
3166
|
+
options["machine"] => !options["machine"].blank?}
|
3169
3167
|
instance_type = get_instance_type(instances)
|
3170
|
-
if !instance_type.nil? and !instance_type.empty?
|
3171
|
-
real_options << ["machine_type", instance_type]
|
3172
|
-
end
|
3173
|
-
exec_options = real_options.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
|
3174
|
-
cmd_to_exec = "#{exec_options} #{cmd.join(" ")}"
|
3175
3168
|
invoke :exec_remote, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title, :machine_type => instance_type,
|
3176
3169
|
:schedule => schedule, :log => log, :email_notification => email_notification, :upload_output => upload_output, :commit => commit,
|
3177
3170
|
:image => image, :grid => grid, :data => data, :data_commit => data_commit, :ignore => ignore, :force => force, :sync_before_terminate => sync_before_terminate,
|
@@ -3227,7 +3220,6 @@ module Cnvrg
|
|
3227
3220
|
time_to_upload = calc_output_time(upload_output)
|
3228
3221
|
project_home = get_project_home
|
3229
3222
|
@project = Project.new(project_home)
|
3230
|
-
|
3231
3223
|
is_new_branch = @project.compare_commit(commit)
|
3232
3224
|
begin
|
3233
3225
|
if !commit.nil? and !commit.empty?
|
@@ -3244,8 +3236,6 @@ module Cnvrg
|
|
3244
3236
|
end
|
3245
3237
|
if !indocker
|
3246
3238
|
image_proj = is_project_with_docker(working_dir)
|
3247
|
-
|
3248
|
-
|
3249
3239
|
if image_proj and image_proj.is_docker
|
3250
3240
|
container = image_proj.get_container
|
3251
3241
|
if !container
|
@@ -3292,7 +3282,7 @@ module Cnvrg
|
|
3292
3282
|
stdout, stderr = '', ''
|
3293
3283
|
begin
|
3294
3284
|
|
3295
|
-
|
3285
|
+
if remote
|
3296
3286
|
if @exp.sync_before_terminate
|
3297
3287
|
spot_status_thread = Thread.new do
|
3298
3288
|
begin
|
@@ -3312,24 +3302,11 @@ module Cnvrg
|
|
3312
3302
|
end
|
3313
3303
|
end
|
3314
3304
|
end
|
3315
|
-
|
3316
|
-
|
3317
|
-
begin
|
3318
|
-
loop do
|
3319
|
-
# sync
|
3320
|
-
sleep(@exp.sync_delay_time)
|
3321
|
-
|
3322
|
-
download(sync = true, ignore_list = [])
|
3323
|
-
upload(link = false, sync = true, direct = false, ignore_list = [])
|
3324
|
-
end
|
3325
|
-
rescue => e
|
3326
|
-
log_error(e)
|
3327
|
-
end
|
3328
|
-
end
|
3329
|
-
end
|
3330
|
-
# end
|
3305
|
+
end
|
3306
|
+
process_running = true
|
3331
3307
|
stats_thread = Thread.new do
|
3332
|
-
|
3308
|
+
while process_running do
|
3309
|
+
sleep 30
|
3333
3310
|
begin
|
3334
3311
|
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.linux? ? {memory: memory_usage, cpu: cpu_usage} : {}
|
3335
3312
|
if is_on_gpu
|
@@ -3337,85 +3314,90 @@ module Cnvrg
|
|
3337
3314
|
stats['gpu'] = gu[0]
|
3338
3315
|
stats['gpu_util'] = gu[1]
|
3339
3316
|
end
|
3340
|
-
@exp.send_machine_stats [stats]
|
3317
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
3341
3318
|
rescue => e
|
3342
3319
|
log_error(e)
|
3343
3320
|
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3344
|
-
ensure
|
3345
|
-
sleep(30)
|
3346
3321
|
end
|
3347
3322
|
end
|
3348
3323
|
end
|
3349
|
-
|
3350
|
-
|
3351
3324
|
log_thread = Thread.new do
|
3352
|
-
|
3353
|
-
log_count = log.count
|
3354
|
-
time_to_upload = calc_output_log_time(log_count)
|
3325
|
+
while process_running or !log.empty? do
|
3355
3326
|
begin
|
3356
|
-
|
3357
|
-
|
3358
|
-
|
3359
|
-
|
3360
|
-
start_loop = Time.now
|
3361
|
-
end
|
3327
|
+
temp_log = log
|
3328
|
+
if !temp_log.empty?
|
3329
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3330
|
+
|
3362
3331
|
end
|
3332
|
+
log -= temp_log
|
3363
3333
|
rescue => e
|
3364
3334
|
log_message("Failed to upload ongoing results, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3365
3335
|
log_error(e)
|
3336
|
+
ensure
|
3337
|
+
sleep 10
|
3366
3338
|
end
|
3367
|
-
sleep(1)
|
3368
3339
|
end
|
3369
3340
|
end
|
3370
|
-
|
3371
3341
|
PTY.spawn(cmd) do |stdout, stdin, pid, stderr|
|
3372
3342
|
begin
|
3373
3343
|
stdout.each do |line|
|
3374
3344
|
cur_time = Time.now
|
3375
3345
|
real_time = Time.now - real
|
3376
|
-
|
3377
3346
|
cur_log = {time: cur_time,
|
3378
3347
|
message: line,
|
3379
3348
|
type: "stdout",
|
3380
3349
|
real: real_time
|
3381
|
-
|
3382
3350
|
}
|
3383
|
-
|
3384
|
-
|
3351
|
+
$LOG.info(cur_log)
|
3385
3352
|
if print_log
|
3386
3353
|
puts cur_log
|
3387
3354
|
end
|
3388
3355
|
log << cur_log
|
3356
|
+
# if log.size >= 10
|
3357
|
+
# @exp.upload_temp_log(log)
|
3358
|
+
# log = []
|
3359
|
+
# end
|
3389
3360
|
end
|
3390
|
-
|
3391
|
-
|
3392
3361
|
if stderr
|
3393
|
-
|
3394
3362
|
stderr.each do |err|
|
3395
|
-
|
3396
3363
|
log << {time: Time.now, message: err, type: "stderr"}
|
3397
3364
|
end
|
3398
3365
|
end
|
3399
|
-
|
3400
3366
|
rescue Errno::EIO => e
|
3401
3367
|
log_error(e)
|
3402
|
-
|
3368
|
+
if !log.empty?
|
3403
3369
|
|
3370
|
+
temp_log = log
|
3371
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3372
|
+
|
3373
|
+
log -= temp_log
|
3374
|
+
end
|
3375
|
+
rescue Errno::ENOENT => e
|
3404
3376
|
exp_success = false
|
3405
3377
|
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3406
3378
|
log_error(e)
|
3407
|
-
rescue Open4::ChildExited
|
3408
|
-
|
3409
|
-
|
3379
|
+
# rescue Open4::ChildExited
|
3380
|
+
# exp_success = false
|
3381
|
+
# log_message("The process exited!", Thor::Shell::Color::RED)
|
3410
3382
|
rescue => e
|
3411
3383
|
sleep(20) # end cycle
|
3412
|
-
res = @exp.end(log, 1, start_commit,
|
3384
|
+
res = @exp.end(log, 1, start_commit, 0, 0)
|
3413
3385
|
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3414
3386
|
log_error(e)
|
3415
3387
|
exit(0)
|
3416
3388
|
end
|
3417
3389
|
::Process.wait pid
|
3418
3390
|
end
|
3391
|
+
process_running = false
|
3392
|
+
|
3393
|
+
if !log.empty?
|
3394
|
+
|
3395
|
+
temp_log = log
|
3396
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3397
|
+
|
3398
|
+
log -= temp_log
|
3399
|
+
end
|
3400
|
+
|
3419
3401
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3420
3402
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3421
3403
|
exit_status = $?.exitstatus
|
@@ -3438,8 +3420,10 @@ module Cnvrg
|
|
3438
3420
|
end
|
3439
3421
|
end_commit = @project.last_local_commit
|
3440
3422
|
|
3441
|
-
|
3442
|
-
|
3423
|
+
log_thread.join
|
3424
|
+
stats_thread.join
|
3425
|
+
|
3426
|
+
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3443
3427
|
|
3444
3428
|
|
3445
3429
|
if !exp_success
|
@@ -3465,13 +3449,15 @@ module Cnvrg
|
|
3465
3449
|
|
3466
3450
|
}
|
3467
3451
|
log << cur_log
|
3468
|
-
|
3452
|
+
process_running = false
|
3453
|
+
log_thread.join
|
3454
|
+
stats_thread.join
|
3469
3455
|
res = @exp.end(log, "-1", end_commit, cpu_average, memory_average)
|
3470
3456
|
|
3471
3457
|
end
|
3472
3458
|
log_error(e)
|
3473
|
-
Thread.kill(log_thread)
|
3474
|
-
Thread.kill(stats_thread)
|
3459
|
+
# Thread.kill(log_thread)
|
3460
|
+
# Thread.kill(stats_thread)
|
3475
3461
|
|
3476
3462
|
exit(1)
|
3477
3463
|
end
|
@@ -3483,7 +3469,9 @@ module Cnvrg
|
|
3483
3469
|
rescue SignalException
|
3484
3470
|
exit_status = -1
|
3485
3471
|
end_commit = @project.last_local_commit
|
3486
|
-
|
3472
|
+
process_running = false
|
3473
|
+
log_thread.join
|
3474
|
+
stats_thread.join
|
3487
3475
|
|
3488
3476
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3489
3477
|
if container
|
@@ -5519,7 +5507,12 @@ module Cnvrg
|
|
5519
5507
|
if config and !config.nil? and !config.empty? and !config.to_h[:compression_path].nil?
|
5520
5508
|
compression_path = config.to_h[:compression_path]
|
5521
5509
|
end
|
5522
|
-
|
5510
|
+
verify_ssl = false
|
5511
|
+
|
5512
|
+
if config and !config.nil? and !config.empty? and !config.to_h[:verfiy_ssl].nil?
|
5513
|
+
verify_ssl = config.to_h[:verify_ssl]
|
5514
|
+
end
|
5515
|
+
config = {owner: owner, username: username, version_last_check: get_start_day(), api: url, compression_path: compression_path, verfiy_ssl:verify_ssl}
|
5523
5516
|
|
5524
5517
|
File.open(home_dir + "/.cnvrg/config.yml", "w+") {|f| f.write config.to_yaml}
|
5525
5518
|
return true
|
@@ -6145,27 +6138,30 @@ module Cnvrg
|
|
6145
6138
|
end
|
6146
6139
|
|
6147
6140
|
def gpu_util
|
6148
|
-
|
6141
|
+
stats = [[],[]]
|
6149
6142
|
begin
|
6150
|
-
gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv
|
6143
|
+
gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
|
6151
6144
|
|
6152
6145
|
if !gpu_stats.nil?
|
6153
|
-
|
6154
|
-
|
6155
|
-
|
6156
|
-
|
6157
|
-
|
6146
|
+
gpu_stats = gpu_stats.split("\n")[1..-1]
|
6147
|
+
stats = [[],[]]
|
6148
|
+
gpu_stats.each do |stat|
|
6149
|
+
gpu = stat.strip.gsub!("%", "").split(",")
|
6150
|
+
stats[0] << gpu[0].to_f
|
6151
|
+
stats[1] << gpu[1].to_f
|
6152
|
+
end
|
6153
|
+
return stats
|
6158
6154
|
end
|
6159
6155
|
|
6160
6156
|
rescue
|
6161
|
-
return
|
6157
|
+
return stats
|
6162
6158
|
end
|
6163
6159
|
|
6164
6160
|
|
6165
6161
|
end
|
6166
6162
|
|
6167
6163
|
def usage_metrics_in_docker(docker_id)
|
6168
|
-
res = {cpu: 0.0, memory: 0.0}
|
6164
|
+
res = {cpu: 0.0, memory: 0.0, block_io: {input: 0, output: 0.0}}
|
6169
6165
|
begin
|
6170
6166
|
if docker_id.nil?
|
6171
6167
|
docker_id = `cat /etc/hostname`
|
@@ -6174,15 +6170,46 @@ module Cnvrg
|
|
6174
6170
|
if !stats.nil?
|
6175
6171
|
conv = stats.split(",")
|
6176
6172
|
cpu = conv[0].gsub!("%", "").to_f
|
6173
|
+
res[:cpu] = cpu
|
6177
6174
|
memory = conv[1].gsub!("%", "").to_f
|
6178
|
-
res =
|
6175
|
+
res[:memory] = memory
|
6176
|
+
block_io = parse_io conv[2]
|
6177
|
+
res = {cpu: cpu, memory: memory, block_io: block_io}
|
6179
6178
|
return res
|
6180
6179
|
end
|
6181
6180
|
rescue
|
6182
6181
|
return res
|
6183
6182
|
end
|
6183
|
+
end
|
6184
|
+
|
6184
6185
|
|
6185
6186
|
|
6187
|
+
def parse_io(block_io_str)
|
6188
|
+
block_io = block_io_str.gsub!(" ", "").split('/')
|
6189
|
+
input = block_io[0]
|
6190
|
+
output = block_io[1]
|
6191
|
+
r = Regexp.new('(\d+(\.\d+)?)([A-Za-z]+)')
|
6192
|
+
input_match = r.match(input)
|
6193
|
+
input = input_match[1].to_f * size_to_bytes(input_match[3])
|
6194
|
+
output_match = r.match(output)
|
6195
|
+
output = output_match[1].to_f * size_to_bytes(output_match[3])
|
6196
|
+
{input: input, output: output}
|
6197
|
+
end
|
6198
|
+
|
6199
|
+
|
6200
|
+
def size_to_bytes size
|
6201
|
+
case size.try(:downcase)
|
6202
|
+
when 'b'
|
6203
|
+
1
|
6204
|
+
when 'kb'
|
6205
|
+
2**10
|
6206
|
+
when 'mb'
|
6207
|
+
2**20
|
6208
|
+
when 'gb'
|
6209
|
+
2**30
|
6210
|
+
else
|
6211
|
+
1
|
6212
|
+
end
|
6186
6213
|
end
|
6187
6214
|
|
6188
6215
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -381,7 +381,7 @@ module Cnvrg
|
|
381
381
|
end
|
382
382
|
response = Cnvrg::API.request("users/#{self.owner}/projects/#{self.slug}/status", 'POST', {idx: local_idx, new_branch: new_branch,
|
383
383
|
current_commit: commit,ignore:ignore_list, force:force,in_exp:in_exp})
|
384
|
-
CLI.is_response_success(response,
|
384
|
+
CLI.is_response_success(response,true)
|
385
385
|
return response
|
386
386
|
end
|
387
387
|
def jump_idx(new_branch, commit=last_local_commit)
|
data/lib/cnvrg/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.4
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-06-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|