cnvrg 0.4.4 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cnvrg/cli.rb +109 -82
- data/lib/cnvrg/project.rb +1 -1
- data/lib/cnvrg/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e44751ed52140fdaee81469a183a0f06f86968e1
|
4
|
+
data.tar.gz: 7b95fb6b3720a17d2edfe7a12da9c21ef0d3034f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2766aba6a508f5b073b4b12e987ee1d4568f84c47c632e6fefa6bf6fc0416a3737727cae88dfa7b97873c3f6461c5821d15a4ca9c7c54e32e9ca763b3e8b79d2
|
7
|
+
data.tar.gz: 8d2985f8c0ff5ea5d70131e993d896ce3ddd20a1f855dcf7ba7cad4a862a09fa13424bcdb3c5ebccb2a4e2ec9746f00eff2d7be49b7ee810826dc94d41a05a35
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -3095,9 +3095,10 @@ module Cnvrg
|
|
3095
3095
|
method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
|
3096
3096
|
method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
|
3097
3097
|
method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
|
3098
|
-
method_option :
|
3099
|
-
method_option :
|
3100
|
-
method_option :
|
3098
|
+
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
|
3099
|
+
method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
|
3100
|
+
method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
|
3101
|
+
method_option :machine, :type => :string, :aliases => ["-m", "--machine"], :default => nil
|
3101
3102
|
method_option :sync_before, :type => :boolean, :aliases => ["-sb", "--sync_before"], :default => true
|
3102
3103
|
method_option :sync_after, :type => :boolean, :aliases => ["-sa", "--sync_after"], :default => true
|
3103
3104
|
method_option :title, :type => :string, :aliases => ["-t", "--title"], :default => ""
|
@@ -3139,12 +3140,9 @@ module Cnvrg
|
|
3139
3140
|
force = options["force"]
|
3140
3141
|
max_time = options["max_time"]
|
3141
3142
|
dataset_only_tree = options["dataset_only_tree"]
|
3143
|
+
custom_machine = options["machine"]
|
3142
3144
|
|
3143
3145
|
options_hash = Hash[options]
|
3144
|
-
real_options = []
|
3145
|
-
options_hash.each do |o|
|
3146
|
-
real_options << o if (!o[1].eql? "" and !["small", "medium", "large", "gpu_1", "gpu_2", "gpu_3"].include? o[0])
|
3147
|
-
end
|
3148
3146
|
if local
|
3149
3147
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3150
3148
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
@@ -3163,15 +3161,10 @@ module Cnvrg
|
|
3163
3161
|
|
3164
3162
|
end
|
3165
3163
|
end
|
3166
|
-
real_options.delete(["local", false])
|
3167
3164
|
instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
|
3168
|
-
"
|
3165
|
+
"gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"],
|
3166
|
+
options["machine"] => !options["machine"].blank?}
|
3169
3167
|
instance_type = get_instance_type(instances)
|
3170
|
-
if !instance_type.nil? and !instance_type.empty?
|
3171
|
-
real_options << ["machine_type", instance_type]
|
3172
|
-
end
|
3173
|
-
exec_options = real_options.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
|
3174
|
-
cmd_to_exec = "#{exec_options} #{cmd.join(" ")}"
|
3175
3168
|
invoke :exec_remote, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title, :machine_type => instance_type,
|
3176
3169
|
:schedule => schedule, :log => log, :email_notification => email_notification, :upload_output => upload_output, :commit => commit,
|
3177
3170
|
:image => image, :grid => grid, :data => data, :data_commit => data_commit, :ignore => ignore, :force => force, :sync_before_terminate => sync_before_terminate,
|
@@ -3227,7 +3220,6 @@ module Cnvrg
|
|
3227
3220
|
time_to_upload = calc_output_time(upload_output)
|
3228
3221
|
project_home = get_project_home
|
3229
3222
|
@project = Project.new(project_home)
|
3230
|
-
|
3231
3223
|
is_new_branch = @project.compare_commit(commit)
|
3232
3224
|
begin
|
3233
3225
|
if !commit.nil? and !commit.empty?
|
@@ -3244,8 +3236,6 @@ module Cnvrg
|
|
3244
3236
|
end
|
3245
3237
|
if !indocker
|
3246
3238
|
image_proj = is_project_with_docker(working_dir)
|
3247
|
-
|
3248
|
-
|
3249
3239
|
if image_proj and image_proj.is_docker
|
3250
3240
|
container = image_proj.get_container
|
3251
3241
|
if !container
|
@@ -3292,7 +3282,7 @@ module Cnvrg
|
|
3292
3282
|
stdout, stderr = '', ''
|
3293
3283
|
begin
|
3294
3284
|
|
3295
|
-
|
3285
|
+
if remote
|
3296
3286
|
if @exp.sync_before_terminate
|
3297
3287
|
spot_status_thread = Thread.new do
|
3298
3288
|
begin
|
@@ -3312,24 +3302,11 @@ module Cnvrg
|
|
3312
3302
|
end
|
3313
3303
|
end
|
3314
3304
|
end
|
3315
|
-
|
3316
|
-
|
3317
|
-
begin
|
3318
|
-
loop do
|
3319
|
-
# sync
|
3320
|
-
sleep(@exp.sync_delay_time)
|
3321
|
-
|
3322
|
-
download(sync = true, ignore_list = [])
|
3323
|
-
upload(link = false, sync = true, direct = false, ignore_list = [])
|
3324
|
-
end
|
3325
|
-
rescue => e
|
3326
|
-
log_error(e)
|
3327
|
-
end
|
3328
|
-
end
|
3329
|
-
end
|
3330
|
-
# end
|
3305
|
+
end
|
3306
|
+
process_running = true
|
3331
3307
|
stats_thread = Thread.new do
|
3332
|
-
|
3308
|
+
while process_running do
|
3309
|
+
sleep 30
|
3333
3310
|
begin
|
3334
3311
|
stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.linux? ? {memory: memory_usage, cpu: cpu_usage} : {}
|
3335
3312
|
if is_on_gpu
|
@@ -3337,85 +3314,90 @@ module Cnvrg
|
|
3337
3314
|
stats['gpu'] = gu[0]
|
3338
3315
|
stats['gpu_util'] = gu[1]
|
3339
3316
|
end
|
3340
|
-
@exp.send_machine_stats [stats]
|
3317
|
+
@exp.send_machine_stats [stats] unless stats.empty?
|
3341
3318
|
rescue => e
|
3342
3319
|
log_error(e)
|
3343
3320
|
log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3344
|
-
ensure
|
3345
|
-
sleep(30)
|
3346
3321
|
end
|
3347
3322
|
end
|
3348
3323
|
end
|
3349
|
-
|
3350
|
-
|
3351
3324
|
log_thread = Thread.new do
|
3352
|
-
|
3353
|
-
log_count = log.count
|
3354
|
-
time_to_upload = calc_output_log_time(log_count)
|
3325
|
+
while process_running or !log.empty? do
|
3355
3326
|
begin
|
3356
|
-
|
3357
|
-
|
3358
|
-
|
3359
|
-
|
3360
|
-
start_loop = Time.now
|
3361
|
-
end
|
3327
|
+
temp_log = log
|
3328
|
+
if !temp_log.empty?
|
3329
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3330
|
+
|
3362
3331
|
end
|
3332
|
+
log -= temp_log
|
3363
3333
|
rescue => e
|
3364
3334
|
log_message("Failed to upload ongoing results, continuing with experiment", Thor::Shell::Color::YELLOW)
|
3365
3335
|
log_error(e)
|
3336
|
+
ensure
|
3337
|
+
sleep 10
|
3366
3338
|
end
|
3367
|
-
sleep(1)
|
3368
3339
|
end
|
3369
3340
|
end
|
3370
|
-
|
3371
3341
|
PTY.spawn(cmd) do |stdout, stdin, pid, stderr|
|
3372
3342
|
begin
|
3373
3343
|
stdout.each do |line|
|
3374
3344
|
cur_time = Time.now
|
3375
3345
|
real_time = Time.now - real
|
3376
|
-
|
3377
3346
|
cur_log = {time: cur_time,
|
3378
3347
|
message: line,
|
3379
3348
|
type: "stdout",
|
3380
3349
|
real: real_time
|
3381
|
-
|
3382
3350
|
}
|
3383
|
-
|
3384
|
-
|
3351
|
+
$LOG.info(cur_log)
|
3385
3352
|
if print_log
|
3386
3353
|
puts cur_log
|
3387
3354
|
end
|
3388
3355
|
log << cur_log
|
3356
|
+
# if log.size >= 10
|
3357
|
+
# @exp.upload_temp_log(log)
|
3358
|
+
# log = []
|
3359
|
+
# end
|
3389
3360
|
end
|
3390
|
-
|
3391
|
-
|
3392
3361
|
if stderr
|
3393
|
-
|
3394
3362
|
stderr.each do |err|
|
3395
|
-
|
3396
3363
|
log << {time: Time.now, message: err, type: "stderr"}
|
3397
3364
|
end
|
3398
3365
|
end
|
3399
|
-
|
3400
3366
|
rescue Errno::EIO => e
|
3401
3367
|
log_error(e)
|
3402
|
-
|
3368
|
+
if !log.empty?
|
3403
3369
|
|
3370
|
+
temp_log = log
|
3371
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3372
|
+
|
3373
|
+
log -= temp_log
|
3374
|
+
end
|
3375
|
+
rescue Errno::ENOENT => e
|
3404
3376
|
exp_success = false
|
3405
3377
|
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3406
3378
|
log_error(e)
|
3407
|
-
rescue Open4::ChildExited
|
3408
|
-
|
3409
|
-
|
3379
|
+
# rescue Open4::ChildExited
|
3380
|
+
# exp_success = false
|
3381
|
+
# log_message("The process exited!", Thor::Shell::Color::RED)
|
3410
3382
|
rescue => e
|
3411
3383
|
sleep(20) # end cycle
|
3412
|
-
res = @exp.end(log, 1, start_commit,
|
3384
|
+
res = @exp.end(log, 1, start_commit, 0, 0)
|
3413
3385
|
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3414
3386
|
log_error(e)
|
3415
3387
|
exit(0)
|
3416
3388
|
end
|
3417
3389
|
::Process.wait pid
|
3418
3390
|
end
|
3391
|
+
process_running = false
|
3392
|
+
|
3393
|
+
if !log.empty?
|
3394
|
+
|
3395
|
+
temp_log = log
|
3396
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3397
|
+
|
3398
|
+
log -= temp_log
|
3399
|
+
end
|
3400
|
+
|
3419
3401
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3420
3402
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3421
3403
|
exit_status = $?.exitstatus
|
@@ -3438,8 +3420,10 @@ module Cnvrg
|
|
3438
3420
|
end
|
3439
3421
|
end_commit = @project.last_local_commit
|
3440
3422
|
|
3441
|
-
|
3442
|
-
|
3423
|
+
log_thread.join
|
3424
|
+
stats_thread.join
|
3425
|
+
|
3426
|
+
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3443
3427
|
|
3444
3428
|
|
3445
3429
|
if !exp_success
|
@@ -3465,13 +3449,15 @@ module Cnvrg
|
|
3465
3449
|
|
3466
3450
|
}
|
3467
3451
|
log << cur_log
|
3468
|
-
|
3452
|
+
process_running = false
|
3453
|
+
log_thread.join
|
3454
|
+
stats_thread.join
|
3469
3455
|
res = @exp.end(log, "-1", end_commit, cpu_average, memory_average)
|
3470
3456
|
|
3471
3457
|
end
|
3472
3458
|
log_error(e)
|
3473
|
-
Thread.kill(log_thread)
|
3474
|
-
Thread.kill(stats_thread)
|
3459
|
+
# Thread.kill(log_thread)
|
3460
|
+
# Thread.kill(stats_thread)
|
3475
3461
|
|
3476
3462
|
exit(1)
|
3477
3463
|
end
|
@@ -3483,7 +3469,9 @@ module Cnvrg
|
|
3483
3469
|
rescue SignalException
|
3484
3470
|
exit_status = -1
|
3485
3471
|
end_commit = @project.last_local_commit
|
3486
|
-
|
3472
|
+
process_running = false
|
3473
|
+
log_thread.join
|
3474
|
+
stats_thread.join
|
3487
3475
|
|
3488
3476
|
res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average)
|
3489
3477
|
if container
|
@@ -5519,7 +5507,12 @@ module Cnvrg
|
|
5519
5507
|
if config and !config.nil? and !config.empty? and !config.to_h[:compression_path].nil?
|
5520
5508
|
compression_path = config.to_h[:compression_path]
|
5521
5509
|
end
|
5522
|
-
|
5510
|
+
verify_ssl = false
|
5511
|
+
|
5512
|
+
if config and !config.nil? and !config.empty? and !config.to_h[:verfiy_ssl].nil?
|
5513
|
+
verify_ssl = config.to_h[:verify_ssl]
|
5514
|
+
end
|
5515
|
+
config = {owner: owner, username: username, version_last_check: get_start_day(), api: url, compression_path: compression_path, verfiy_ssl:verify_ssl}
|
5523
5516
|
|
5524
5517
|
File.open(home_dir + "/.cnvrg/config.yml", "w+") {|f| f.write config.to_yaml}
|
5525
5518
|
return true
|
@@ -6145,27 +6138,30 @@ module Cnvrg
|
|
6145
6138
|
end
|
6146
6139
|
|
6147
6140
|
def gpu_util
|
6148
|
-
|
6141
|
+
stats = [[],[]]
|
6149
6142
|
begin
|
6150
|
-
gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv
|
6143
|
+
gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
|
6151
6144
|
|
6152
6145
|
if !gpu_stats.nil?
|
6153
|
-
|
6154
|
-
|
6155
|
-
|
6156
|
-
|
6157
|
-
|
6146
|
+
gpu_stats = gpu_stats.split("\n")[1..-1]
|
6147
|
+
stats = [[],[]]
|
6148
|
+
gpu_stats.each do |stat|
|
6149
|
+
gpu = stat.strip.gsub!("%", "").split(",")
|
6150
|
+
stats[0] << gpu[0].to_f
|
6151
|
+
stats[1] << gpu[1].to_f
|
6152
|
+
end
|
6153
|
+
return stats
|
6158
6154
|
end
|
6159
6155
|
|
6160
6156
|
rescue
|
6161
|
-
return
|
6157
|
+
return stats
|
6162
6158
|
end
|
6163
6159
|
|
6164
6160
|
|
6165
6161
|
end
|
6166
6162
|
|
6167
6163
|
def usage_metrics_in_docker(docker_id)
|
6168
|
-
res = {cpu: 0.0, memory: 0.0}
|
6164
|
+
res = {cpu: 0.0, memory: 0.0, block_io: {input: 0, output: 0.0}}
|
6169
6165
|
begin
|
6170
6166
|
if docker_id.nil?
|
6171
6167
|
docker_id = `cat /etc/hostname`
|
@@ -6174,15 +6170,46 @@ module Cnvrg
|
|
6174
6170
|
if !stats.nil?
|
6175
6171
|
conv = stats.split(",")
|
6176
6172
|
cpu = conv[0].gsub!("%", "").to_f
|
6173
|
+
res[:cpu] = cpu
|
6177
6174
|
memory = conv[1].gsub!("%", "").to_f
|
6178
|
-
res =
|
6175
|
+
res[:memory] = memory
|
6176
|
+
block_io = parse_io conv[2]
|
6177
|
+
res = {cpu: cpu, memory: memory, block_io: block_io}
|
6179
6178
|
return res
|
6180
6179
|
end
|
6181
6180
|
rescue
|
6182
6181
|
return res
|
6183
6182
|
end
|
6183
|
+
end
|
6184
|
+
|
6184
6185
|
|
6185
6186
|
|
6187
|
+
def parse_io(block_io_str)
|
6188
|
+
block_io = block_io_str.gsub!(" ", "").split('/')
|
6189
|
+
input = block_io[0]
|
6190
|
+
output = block_io[1]
|
6191
|
+
r = Regexp.new('(\d+(\.\d+)?)([A-Za-z]+)')
|
6192
|
+
input_match = r.match(input)
|
6193
|
+
input = input_match[1].to_f * size_to_bytes(input_match[3])
|
6194
|
+
output_match = r.match(output)
|
6195
|
+
output = output_match[1].to_f * size_to_bytes(output_match[3])
|
6196
|
+
{input: input, output: output}
|
6197
|
+
end
|
6198
|
+
|
6199
|
+
|
6200
|
+
def size_to_bytes size
|
6201
|
+
case size.try(:downcase)
|
6202
|
+
when 'b'
|
6203
|
+
1
|
6204
|
+
when 'kb'
|
6205
|
+
2**10
|
6206
|
+
when 'mb'
|
6207
|
+
2**20
|
6208
|
+
when 'gb'
|
6209
|
+
2**30
|
6210
|
+
else
|
6211
|
+
1
|
6212
|
+
end
|
6186
6213
|
end
|
6187
6214
|
|
6188
6215
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -381,7 +381,7 @@ module Cnvrg
|
|
381
381
|
end
|
382
382
|
response = Cnvrg::API.request("users/#{self.owner}/projects/#{self.slug}/status", 'POST', {idx: local_idx, new_branch: new_branch,
|
383
383
|
current_commit: commit,ignore:ignore_list, force:force,in_exp:in_exp})
|
384
|
-
CLI.is_response_success(response,
|
384
|
+
CLI.is_response_success(response,true)
|
385
385
|
return response
|
386
386
|
end
|
387
387
|
def jump_idx(new_branch, commit=last_local_commit)
|
data/lib/cnvrg/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.4
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-06-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|