cnvrg 1.11.31 → 2.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.md +42 -1
- data/cnvrg.gemspec +2 -1
- data/lib/cnvrg/api.rb +6 -4
- data/lib/cnvrg/api_v2.rb +1 -0
- data/lib/cnvrg/auth.rb +4 -1
- data/lib/cnvrg/cli/library_cli.rb +2 -2
- data/lib/cnvrg/cli.rb +158 -76
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +177 -35
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +31 -10
- data/lib/cnvrg/version.rb +2 -2
- metadata +21 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e2e22e961bc723442ec80d457f0dde2b982f1c75beb2e859bb5523e83682fbe
|
4
|
+
data.tar.gz: 645a4add593bcf3a63be4d64b4ccb45000e39447e7e11b9fd674347856e2bf97
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af7a28b7b428f8b4066a1b3f01bada7273eaa93ea847f87badc253aea2123244ad27ab3804cda5f4cdcee91cc2cc0ed7dd168b347839f59a6c183a702438167
|
7
|
+
data.tar.gz: dc151c1ab54d54a86648b9f1341d402d0eb6fb8d578cc0a01f2594a28a34bfb8d97676d07aa389a8b1155fa466845f87be9b1dffbfcc2bcda33376937ecef418
|
data/Readme.md
CHANGED
@@ -18,4 +18,45 @@
|
|
18
18
|
## Version v1.11.30
|
19
19
|
2021-04-06
|
20
20
|
## Version v1.11.31
|
21
|
-
2021-04-22
|
21
|
+
2021-04-22
|
22
|
+
## Version v1.11.32
|
23
|
+
2021-05-05
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
25
|
+
## Version v2.0.1
|
26
|
+
2021-06-13
|
27
|
+
## Version v2.0.2
|
28
|
+
2021-06-16
|
29
|
+
* DEV-9694 - Bug: Download artifacts fails on authorization error
|
30
|
+
## Version v2.0.3
|
31
|
+
2021-06-29
|
32
|
+
* DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
|
33
|
+
## Version v2.0.4
|
34
|
+
2021-07-08
|
35
|
+
* DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
|
36
|
+
## Version v2.0.5
|
37
|
+
2021-07-11
|
38
|
+
* DEV-10171 - Bug: experiment randomly fails with error - "Couldn't clone artifacts"
|
39
|
+
* DEV-10189 - Bug: CLI Sync -file/folder with broken symlink will cause sync to fail
|
40
|
+
## Version v2.0.6
|
41
|
+
2021-07-18
|
42
|
+
* DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
|
43
|
+
## Version v2.0.7
|
44
|
+
2021-07-27
|
45
|
+
* DEV-10186 - Bug: CLI/run an experiment with --local tag gives server error
|
46
|
+
## Version v2.0.8
|
47
|
+
2021-09-06
|
48
|
+
* DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
|
49
|
+
## Version v2.0.9
|
50
|
+
2021-09-12
|
51
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
52
|
+
## Version v2.0.10
|
53
|
+
2021-09-12
|
54
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
55
|
+
## Version v2.0.11
|
56
|
+
2021-10-21
|
57
|
+
## Version v2.0.12
|
58
|
+
2021-10-25
|
59
|
+
* DEV-11544 - Sub-bug: local experiment is failing to run
|
60
|
+
## Version v2.0.13
|
61
|
+
2021-10-27
|
62
|
+
* DEV-11054 - Task: Create organization and user by default
|
data/cnvrg.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'cnvrg/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = 'cnvrg'
|
8
8
|
spec.version = Cnvrg::VERSION
|
9
|
-
spec.authors = ['Yochay Ettun', 'Leah Kolben']
|
9
|
+
spec.authors = ['Yochay Ettun', 'Leah Kolben', 'Omer Shacham']
|
10
10
|
spec.email = ['info@cnvrg.io']
|
11
11
|
spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
|
12
12
|
spec.description = %q{A CLI tool for interacting with cnvrg.io.}
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
40
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
41
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
42
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
43
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
44
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -57,6 +57,7 @@ module Cnvrg
|
|
57
57
|
conn = Faraday.new "#{endpoint_uri}"
|
58
58
|
end
|
59
59
|
conn.headers['Auth-Token'] = @pass
|
60
|
+
conn.headers['Authorization'] = "CAPI #{@pass}"
|
60
61
|
conn.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
|
61
62
|
conn.options.timeout = 420
|
62
63
|
conn.options.open_timeout=180
|
@@ -72,11 +73,11 @@ module Cnvrg
|
|
72
73
|
if response.to_hash[:status].to_i != 200
|
73
74
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
75
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
76
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
77
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
78
|
success = false
|
78
79
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
80
|
+
retries += 1
|
80
81
|
next
|
81
82
|
end
|
82
83
|
rescue => e
|
@@ -112,11 +113,11 @@ module Cnvrg
|
|
112
113
|
if response.to_hash[:status].to_i != 200
|
113
114
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
115
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
116
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
117
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
118
|
success = false
|
118
119
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
120
|
+
retries += 1
|
120
121
|
next
|
121
122
|
end
|
122
123
|
rescue => e
|
@@ -169,6 +170,7 @@ module Cnvrg
|
|
169
170
|
when 'POST_FILE'
|
170
171
|
conn = Faraday.new do |fr|
|
171
172
|
fr.headers['Auth-Token'] = @pass
|
173
|
+
fr.headers['Authorization'] = "CAPI #{@pass}"
|
172
174
|
fr.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
|
173
175
|
fr.headers["Content-Type"] = "multipart/form-data"
|
174
176
|
if !Helpers.is_verify_ssl
|
data/lib/cnvrg/api_v2.rb
CHANGED
@@ -22,6 +22,7 @@ module Cnvrg
|
|
22
22
|
|
23
23
|
conn = Faraday.new endpoint_uri, :ssl => {:verify => !!Helpers.is_verify_ssl}
|
24
24
|
conn.headers['Auth-Token'] = pass
|
25
|
+
conn.headers['Authorization'] = "CAPI #{pass}"
|
25
26
|
conn.headers['User-Agent'] = Cnvrg::API::USER_AGENT
|
26
27
|
conn.headers['Content-Type'] = "application/json"
|
27
28
|
conn.options.timeout = 420
|
data/lib/cnvrg/auth.rb
CHANGED
@@ -44,7 +44,7 @@ module Cnvrg
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
|
47
|
-
def sign_in(email, password)
|
47
|
+
def sign_in(email, password, token: nil)
|
48
48
|
url = Cnvrg::API.endpoint_uri()
|
49
49
|
url = URI.parse(url+ "/users/sign_in")
|
50
50
|
http = Net::HTTP.new(url.host, url.port)
|
@@ -61,6 +61,9 @@ module Cnvrg
|
|
61
61
|
|
62
62
|
req.add_field("EMAIL", email)
|
63
63
|
req.add_field("PASSWORD", password)
|
64
|
+
if token.present?
|
65
|
+
req.add_field("Authorization", "CAPI #{token}")
|
66
|
+
end
|
64
67
|
|
65
68
|
response = http.request(req)
|
66
69
|
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -496,8 +496,10 @@ module Cnvrg
|
|
496
496
|
|
497
497
|
|
498
498
|
desc 'login', 'Authenticate with cnvrg.io platform'
|
499
|
+
method_option :sso, :type => :boolean, :aliases => ["-s", "--sso"], :default => false
|
499
500
|
|
500
501
|
def login
|
502
|
+
use_token = options["sso"]
|
501
503
|
begin
|
502
504
|
log_handler()
|
503
505
|
log_start(__method__, args, options)
|
@@ -515,12 +517,21 @@ module Cnvrg
|
|
515
517
|
exit(0)
|
516
518
|
end
|
517
519
|
@email = ask("Enter your email:")
|
518
|
-
|
519
|
-
|
520
|
+
if use_token
|
521
|
+
@token = cmd.ask("Enter your token (hidden):") {|q| q.echo = "*"}
|
522
|
+
netrc[Cnvrg::Helpers.netrc_domain] = @email, @token
|
523
|
+
netrc.save
|
524
|
+
password = ""
|
525
|
+
else
|
526
|
+
password = cmd.ask("Enter your password (hidden):") {|q| q.echo = "*"}
|
527
|
+
end
|
528
|
+
result = @auth.sign_in(@email, password, token: @token)
|
520
529
|
|
521
530
|
if !result["token"].nil?
|
522
|
-
|
523
|
-
|
531
|
+
unless use_token
|
532
|
+
netrc[Cnvrg::Helpers.netrc_domain] = @email, result["token"]
|
533
|
+
netrc.save
|
534
|
+
end
|
524
535
|
|
525
536
|
log_message("Authenticated successfully as #{@email}", Thor::Shell::Color::GREEN)
|
526
537
|
|
@@ -2311,6 +2322,7 @@ module Cnvrg
|
|
2311
2322
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2323
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2324
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2325
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2326
|
|
2315
2327
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2328
|
begin
|
@@ -2328,6 +2340,8 @@ module Cnvrg
|
|
2328
2340
|
exp_obj = nil
|
2329
2341
|
end
|
2330
2342
|
|
2343
|
+
local = options["local"]
|
2344
|
+
|
2331
2345
|
commit_msg = options["message"]
|
2332
2346
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2347
|
commit_msg = ""
|
@@ -2349,7 +2363,7 @@ module Cnvrg
|
|
2349
2363
|
if git_output_dir.ends_with? "/"
|
2350
2364
|
git_output_dir = git_output_dir[0..-2]
|
2351
2365
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2366
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2367
|
end
|
2354
2368
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2369
|
spec_files_to_upload = list
|
@@ -2668,7 +2682,7 @@ module Cnvrg
|
|
2668
2682
|
end
|
2669
2683
|
end
|
2670
2684
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2685
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2686
|
def commit_before_termination()
|
2673
2687
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2688
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2692,7 @@ module Cnvrg
|
|
2678
2692
|
log_error(e)
|
2679
2693
|
end
|
2680
2694
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2695
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2696
|
def update_job_commit()
|
2683
2697
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2698
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2882,7 @@ module Cnvrg
|
|
2868
2882
|
|
2869
2883
|
|
2870
2884
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2885
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2886
|
def jump(commit_sha1)
|
2873
2887
|
begin
|
2874
2888
|
verify_logged_in()
|
@@ -3003,11 +3017,12 @@ module Cnvrg
|
|
3003
3017
|
method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
|
3004
3018
|
method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
|
3005
3019
|
method_option :files, :type => :string, :aliases => ["--files"], :default => nil
|
3006
|
-
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default =>
|
3020
|
+
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
|
3007
3021
|
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
|
3008
3022
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3023
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3024
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3025
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
3011
3026
|
|
3012
3027
|
def sync(direct = true)
|
3013
3028
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3045,10 @@ module Cnvrg
|
|
3030
3045
|
if run_download or options['debug_mode']
|
3031
3046
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3047
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3048
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3049
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3050
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3051
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3052
|
|
3038
3053
|
end
|
3039
3054
|
|
@@ -3143,7 +3158,7 @@ module Cnvrg
|
|
3143
3158
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3144
3159
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
3145
3160
|
:commit => commit, :image => image, :data => data, :data_commit => data_commit,
|
3146
|
-
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
|
3161
|
+
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
|
3147
3162
|
return
|
3148
3163
|
end
|
3149
3164
|
else
|
@@ -3200,6 +3215,7 @@ module Cnvrg
|
|
3200
3215
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3216
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
3217
|
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3218
|
+
method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
|
3203
3219
|
|
3204
3220
|
def exec(*cmd)
|
3205
3221
|
log = []
|
@@ -3224,6 +3240,7 @@ module Cnvrg
|
|
3224
3240
|
project_home = get_project_home
|
3225
3241
|
data_query = options["data_query"]
|
3226
3242
|
docker_stats = options["docker_stats"]
|
3243
|
+
local = options[:local] || false
|
3227
3244
|
@project = Project.new(project_home)
|
3228
3245
|
if @project.is_git
|
3229
3246
|
sync_before = false
|
@@ -3316,62 +3333,53 @@ module Cnvrg
|
|
3316
3333
|
end
|
3317
3334
|
end
|
3318
3335
|
start_time = Time.now
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3320
3336
|
if @exp.get_cmd.present?
|
3321
3337
|
cmd = @exp.get_cmd
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3324
|
-
end
|
3325
3338
|
end
|
3326
|
-
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
|
3339
|
-
|
3340
|
-
|
3339
|
+
|
3340
|
+
if local
|
3341
|
+
exec_local(cmd, print_log, start_commit, real, start_time)
|
3342
|
+
exit_status = $?.exitstatus
|
3343
|
+
|
3344
|
+
else
|
3345
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3346
|
+
result_file = "/conf/result-#{command_slug}"
|
3347
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3348
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3349
|
+
response = conn.post('command', data.to_json)
|
3350
|
+
if response.to_hash[:status].to_i != 200
|
3351
|
+
exit_status = 129
|
3352
|
+
raise StandardError.new("Cant send command to slave")
|
3353
|
+
end
|
3354
|
+
t = FileWatch::Tail.new
|
3355
|
+
filename = result_file
|
3356
|
+
lines = []
|
3357
|
+
t.tail(filename)
|
3358
|
+
t.subscribe do |path, line|
|
3359
|
+
begin
|
3360
|
+
cur_log = JSON.parse(line)
|
3361
|
+
if cur_log["type"] == "endMessage"
|
3362
|
+
exit_status = cur_log["real"].to_i
|
3363
|
+
break
|
3364
|
+
else
|
3365
|
+
puts(cur_log.to_json)
|
3366
|
+
STDOUT.flush
|
3367
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3368
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3369
|
+
log << cur_log
|
3341
3370
|
end
|
3342
|
-
log << cur_log
|
3343
3371
|
if log.size >= 10
|
3344
|
-
@exp.upload_temp_log(log)
|
3372
|
+
@exp.upload_temp_log(log)
|
3345
3373
|
log = []
|
3346
|
-
|
3374
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3347
3375
|
@exp.upload_temp_log(log) unless log.empty?
|
3348
3376
|
log = []
|
3349
3377
|
start_time = Time.now
|
3350
3378
|
end
|
3379
|
+
rescue => e
|
3380
|
+
log_error(e)
|
3351
3381
|
end
|
3352
|
-
if stderr
|
3353
|
-
stderr.each do |err|
|
3354
|
-
log << {time: Time.now, message: err, type: "stderr"}
|
3355
|
-
end
|
3356
|
-
end
|
3357
|
-
rescue Errno::EIO => e
|
3358
|
-
log_error(e)
|
3359
|
-
if !log.empty?
|
3360
|
-
temp_log = log
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3362
|
-
log -= temp_log
|
3363
|
-
end
|
3364
|
-
rescue Errno::ENOENT => e
|
3365
|
-
exp_success = false
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3367
|
-
log_error(e)
|
3368
|
-
rescue => e
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3371
|
-
log_error(e)
|
3372
|
-
exit(0)
|
3373
3382
|
end
|
3374
|
-
::Process.wait pid
|
3375
3383
|
end
|
3376
3384
|
end_time = Time.now
|
3377
3385
|
process_running = false
|
@@ -3379,14 +3387,13 @@ module Cnvrg
|
|
3379
3387
|
if !log.empty?
|
3380
3388
|
|
3381
3389
|
temp_log = log
|
3382
|
-
|
3390
|
+
@exp.upload_temp_log(temp_log)
|
3383
3391
|
log -= temp_log
|
3384
3392
|
end
|
3385
3393
|
|
3386
3394
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3387
3395
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3388
|
-
exit_status
|
3389
|
-
if $?.exitstatus != 0
|
3396
|
+
if exit_status != 0
|
3390
3397
|
exp_success = false
|
3391
3398
|
end
|
3392
3399
|
|
@@ -3430,7 +3437,6 @@ module Cnvrg
|
|
3430
3437
|
if @exp
|
3431
3438
|
# log_thread.join
|
3432
3439
|
Thread.kill(stats_thread) if docker_stats
|
3433
|
-
exit_status = $?.exitstatus
|
3434
3440
|
if exit_status.blank?
|
3435
3441
|
exit_status = "-1"
|
3436
3442
|
end
|
@@ -3443,8 +3449,6 @@ module Cnvrg
|
|
3443
3449
|
|
3444
3450
|
exit(1)
|
3445
3451
|
end
|
3446
|
-
|
3447
|
-
|
3448
3452
|
end
|
3449
3453
|
|
3450
3454
|
end
|
@@ -3689,7 +3693,7 @@ module Cnvrg
|
|
3689
3693
|
end
|
3690
3694
|
end
|
3691
3695
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
3696
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3693
3697
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3694
3698
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3695
3699
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3778,7 +3782,7 @@ module Cnvrg
|
|
3778
3782
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3779
3783
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3780
3784
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3785
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3782
3786
|
|
3783
3787
|
def notebook
|
3784
3788
|
verify_logged_in(true)
|
@@ -3905,7 +3909,7 @@ module Cnvrg
|
|
3905
3909
|
end
|
3906
3910
|
end
|
3907
3911
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3912
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3909
3913
|
method_option :machine_type, :type => :string, :default => ""
|
3910
3914
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3911
3915
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4264,7 +4268,7 @@ module Cnvrg
|
|
4264
4268
|
|
4265
4269
|
end
|
4266
4270
|
|
4267
|
-
desc 'notebook_stop', '
|
4271
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4268
4272
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4269
4273
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4270
4274
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4651,10 +4655,16 @@ module Cnvrg
|
|
4651
4655
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
4656
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
4657
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4658
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4659
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4660
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4654
4661
|
|
4655
4662
|
def collect_metrics
|
4656
4663
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
4664
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4665
|
+
prom_user = options[:prom_user]
|
4666
|
+
prom_password = options[:prom_password]
|
4667
|
+
name = options[:name]
|
4658
4668
|
|
4659
4669
|
translate_result = Cnvrg::API_V2.request(
|
4660
4670
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
@@ -4679,9 +4689,16 @@ module Cnvrg
|
|
4679
4689
|
next
|
4680
4690
|
end
|
4681
4691
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
-
|
4692
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4693
|
+
http.use_ssl = uri.scheme == "https"
|
4694
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4695
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4696
|
+
if prom_user.present?
|
4697
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4698
|
+
end
|
4699
|
+
resp = http.request(req)
|
4683
4700
|
begin
|
4684
|
-
result = JSON.parse(resp)
|
4701
|
+
result = JSON.parse(resp.body)
|
4685
4702
|
rescue JSON::ParserError => e
|
4686
4703
|
log_error(e)
|
4687
4704
|
next
|
@@ -4690,13 +4707,22 @@ module Cnvrg
|
|
4690
4707
|
next unless data_result
|
4691
4708
|
|
4692
4709
|
if data_result.size > 1
|
4693
|
-
stats[query_name] = {}
|
4710
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4694
4711
|
data_result.each_with_index do |res, i|
|
4695
4712
|
timestamp, value = res["value"]
|
4696
4713
|
uuid = res["metric"]["UUID"].presence || i
|
4697
4714
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
4715
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
-
|
4716
|
+
if query_name.include? 'block'
|
4717
|
+
uuid = res["metric"]["interface"].presence || i
|
4718
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4719
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4720
|
+
io_type = query_name.split('_')[1]
|
4721
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4722
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4723
|
+
else
|
4724
|
+
stats[query_name][uuid] = stat_value
|
4725
|
+
end
|
4700
4726
|
end
|
4701
4727
|
else
|
4702
4728
|
timestamp, value = data_result&.first&.dig('value')
|
@@ -4705,9 +4731,14 @@ module Cnvrg
|
|
4705
4731
|
if query_name.include? 'block'
|
4706
4732
|
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
4733
|
io_type = query_name.split('_')[1]
|
4708
|
-
|
4734
|
+
if name.present?
|
4735
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4736
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4737
|
+
else
|
4738
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4739
|
+
end
|
4709
4740
|
else
|
4710
|
-
stats[query_name] = stat_value
|
4741
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4711
4742
|
end
|
4712
4743
|
end
|
4713
4744
|
end
|
@@ -4751,7 +4782,7 @@ module Cnvrg
|
|
4751
4782
|
end
|
4752
4783
|
|
4753
4784
|
|
4754
|
-
desc '', ''
|
4785
|
+
desc '', '', :hide => true
|
4755
4786
|
|
4756
4787
|
def download_built_image(image_name, image_slug)
|
4757
4788
|
begin
|
@@ -4995,7 +5026,7 @@ module Cnvrg
|
|
4995
5026
|
end
|
4996
5027
|
end
|
4997
5028
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
5029
|
+
desc 'experiments', 'List project experiments', :hide => true
|
4999
5030
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
5000
5031
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
5001
5032
|
|
@@ -5864,6 +5895,57 @@ module Cnvrg
|
|
5864
5895
|
end
|
5865
5896
|
end
|
5866
5897
|
|
5898
|
+
def exec_local(cmd , print_log, start_commit, real, start_time)
|
5899
|
+
log = []
|
5900
|
+
PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
|
5901
|
+
begin
|
5902
|
+
stdout.each do |line|
|
5903
|
+
cur_time = Time.now
|
5904
|
+
real_time = Time.now - real
|
5905
|
+
cur_log = {time: cur_time,
|
5906
|
+
message: line,
|
5907
|
+
type: "stdout",
|
5908
|
+
real: real_time
|
5909
|
+
}
|
5910
|
+
if print_log
|
5911
|
+
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
5912
|
+
end
|
5913
|
+
log << cur_log
|
5914
|
+
if log.size >= 10
|
5915
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5916
|
+
log = []
|
5917
|
+
elsif (start_time + 15.seconds) <= Time.now
|
5918
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5919
|
+
log = []
|
5920
|
+
start_time = Time.now
|
5921
|
+
end
|
5922
|
+
end
|
5923
|
+
if stderr
|
5924
|
+
stderr.each do |err|
|
5925
|
+
log << {time: Time.now, message: err, type: "stderr"}
|
5926
|
+
end
|
5927
|
+
end
|
5928
|
+
rescue Errno::EIO => e
|
5929
|
+
log_error(e)
|
5930
|
+
if !log.empty?
|
5931
|
+
temp_log = log
|
5932
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
5933
|
+
log -= temp_log
|
5934
|
+
end
|
5935
|
+
rescue Errno::ENOENT => e
|
5936
|
+
exp_success = false
|
5937
|
+
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
5938
|
+
log_error(e)
|
5939
|
+
rescue => e
|
5940
|
+
res = @exp.end(log, 1, start_commit, 0, 0)
|
5941
|
+
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
5942
|
+
log_error(e)
|
5943
|
+
exit(0)
|
5944
|
+
end
|
5945
|
+
::Process.wait pid
|
5946
|
+
end
|
5947
|
+
end
|
5948
|
+
|
5867
5949
|
end
|
5868
5950
|
end
|
5869
5951
|
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
@@ -730,7 +730,11 @@ module Cnvrg
|
|
730
730
|
end
|
731
731
|
res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
|
732
732
|
unless Cnvrg::CLI.is_response_success(res, false)
|
733
|
-
|
733
|
+
begin
|
734
|
+
puts(res)
|
735
|
+
rescue
|
736
|
+
end
|
737
|
+
raise StandardError.new("Cant download files from the server.")
|
734
738
|
end
|
735
739
|
self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
|
736
740
|
end
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
def execute_command_on_slave
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
119
|
+
Timeout.timeout(@timeout) do
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
122
|
+
response = conn.post('command', data.to_json)
|
123
|
+
if response.to_hash[:status].to_i != 200
|
124
|
+
@exit_status = 129
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
126
|
+
end
|
127
|
+
t = FileWatch::Tail.new
|
128
|
+
filename = result_file
|
129
|
+
t.tail(filename)
|
130
|
+
t.subscribe do |path, line|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
133
|
+
break
|
134
|
+
end
|
135
|
+
if !@is_new_main
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
137
|
+
end
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
140
|
+
end
|
141
|
+
end
|
142
|
+
rescue Timeout::Error
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
145
|
+
@exit_status = 124
|
146
|
+
ensure
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
148
|
+
@exit_status
|
149
|
+
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
data/lib/cnvrg/helpers/executer.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
-
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
6
|
+
HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
7
9
|
# server (poll commands) and let the server know the status of this executer.
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if (!@is_new_main || HAS_DOCKER)
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
9
11
|
def start(job_id)
|
12
|
+
no_auth = options["no_auth"]
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
13
16
|
pod_name = nil
|
14
17
|
namespace = "cnvrg"
|
15
18
|
ssh_ready = false
|
19
|
+
internal_port = options['internal_port']
|
16
20
|
while not ssh_ready
|
17
21
|
resp = @job_ssh.status()
|
18
22
|
status = resp["ssh_status"]
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
26
30
|
username = resp["username"]
|
27
31
|
pod_name = resp["pod_name"]
|
28
32
|
namespace = resp["namespace"]
|
33
|
+
internal_port = resp["port"] || internal_port
|
29
34
|
ssh_ready = true
|
30
35
|
else
|
31
36
|
puts("Failed to start ssh")
|
32
37
|
break
|
33
38
|
end
|
34
39
|
end
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
36
41
|
puts("Failed to get required params")
|
37
42
|
return
|
38
43
|
end
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
41
46
|
puts("host: 127.0.0.1")
|
42
47
|
puts("port: #{options["port"]}")
|
43
48
|
puts("username: #{username}")
|
44
|
-
puts("password: #{password}")
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
49
|
+
puts("password: #{password}") unless no_auth
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
46
51
|
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
|
|
328
328
|
end
|
329
329
|
|
330
330
|
def get_storage_client
|
331
|
-
|
332
|
-
|
333
|
-
|
331
|
+
client_params = nil
|
332
|
+
i = 0
|
333
|
+
begin
|
334
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
|
335
|
+
unless Cnvrg::CLI.is_response_success(response, false)
|
336
|
+
raise StandardError.new("Can't find project credentials")
|
337
|
+
end
|
334
338
|
client_params = response['client']
|
335
|
-
|
336
|
-
|
339
|
+
rescue StandardError
|
340
|
+
i += 1
|
341
|
+
sleep(5 * i)
|
342
|
+
retry if i < 10
|
337
343
|
client_params = get_storage_client_fallback
|
338
344
|
end
|
339
|
-
|
345
|
+
raise StandardError.new("Can't find project credentials") unless client_params
|
340
346
|
Cnvrg::Downloader::Client.factory(client_params)
|
341
347
|
end
|
342
348
|
|
@@ -378,14 +384,18 @@ module Cnvrg
|
|
378
384
|
[]
|
379
385
|
end
|
380
386
|
|
381
|
-
def generate_output_dir(output_dir)
|
387
|
+
def generate_output_dir(output_dir, local: false)
|
382
388
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
389
|
upload_list = []
|
390
|
+
list = []
|
384
391
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
392
|
+
if local
|
393
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
394
|
+
end
|
395
|
+
list.uniq!
|
385
396
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
397
|
next if e.end_with? "/."
|
387
398
|
if File.directory? e
|
388
|
-
|
389
399
|
upload_list << e + "/"
|
390
400
|
else
|
391
401
|
upload_list << e
|
@@ -447,6 +457,10 @@ module Cnvrg
|
|
447
457
|
if list_ignore_new.include? label
|
448
458
|
next
|
449
459
|
end
|
460
|
+
if File.symlink?(e)
|
461
|
+
Cnvrg::Logger.log_info("Skipping symlink #{e}")
|
462
|
+
next
|
463
|
+
end
|
450
464
|
if File.directory? e
|
451
465
|
dir_name = (label.ends_with? "/") ? label : (label + "/")
|
452
466
|
tree_idx[dir_name] = nil
|
@@ -647,7 +661,11 @@ module Cnvrg
|
|
647
661
|
|
648
662
|
def fetch_webapp_slugs(webapp_slug, slugs: nil)
|
649
663
|
response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
|
650
|
-
|
664
|
+
|
665
|
+
if response.key?("experiments")
|
666
|
+
return response["experiments"]
|
667
|
+
end
|
668
|
+
return response["data"]["attributes"]["experiments"]
|
651
669
|
rescue
|
652
670
|
slugs
|
653
671
|
end
|
@@ -699,8 +717,11 @@ module Cnvrg
|
|
699
717
|
res = JSON.parse(resp['result']) rescue nil
|
700
718
|
return if res.blank?
|
701
719
|
config = self.get_config
|
702
|
-
config[:is_git] = res['git']
|
703
720
|
config[:project_name] = res['title']
|
721
|
+
config[:project_slug] = @slug
|
722
|
+
config[:owner] = @owner
|
723
|
+
config[:git] = res['git'] || false
|
724
|
+
config[:is_git] = res['git'] || false
|
704
725
|
self.set_config(config)
|
705
726
|
end
|
706
727
|
|
data/lib/cnvrg/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Cnvrg
|
2
|
-
VERSION = '
|
3
|
-
end
|
2
|
+
VERSION = '2.0.13'
|
3
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
|
-
|
9
|
+
- Omer Shacham
|
10
|
+
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2021-
|
13
|
+
date: 2021-10-27 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: bundler
|
@@ -321,6 +322,20 @@ dependencies:
|
|
321
322
|
- - "~>"
|
322
323
|
- !ruby/object:Gem::Version
|
323
324
|
version: 0.1.1
|
325
|
+
- !ruby/object:Gem::Dependency
|
326
|
+
name: filewatch
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
328
|
+
requirements:
|
329
|
+
- - "~>"
|
330
|
+
- !ruby/object:Gem::Version
|
331
|
+
version: 0.9.0
|
332
|
+
type: :runtime
|
333
|
+
prerelease: false
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - "~>"
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: 0.9.0
|
324
339
|
- !ruby/object:Gem::Dependency
|
325
340
|
name: parallel
|
326
341
|
requirement: !ruby/object:Gem::Requirement
|
@@ -458,7 +473,7 @@ files:
|
|
458
473
|
homepage: https://cnvrg.io
|
459
474
|
licenses: []
|
460
475
|
metadata: {}
|
461
|
-
post_install_message:
|
476
|
+
post_install_message:
|
462
477
|
rdoc_options: []
|
463
478
|
require_paths:
|
464
479
|
- lib
|
@@ -473,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
473
488
|
- !ruby/object:Gem::Version
|
474
489
|
version: '0'
|
475
490
|
requirements: []
|
476
|
-
rubygems_version: 3.
|
477
|
-
signing_key:
|
491
|
+
rubygems_version: 3.2.22
|
492
|
+
signing_key:
|
478
493
|
specification_version: 4
|
479
494
|
summary: A CLI tool for interacting with cnvrg.io.
|
480
495
|
test_files: []
|