cnvrg 1.11.31 → 2.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Readme.md +42 -1
- data/cnvrg.gemspec +2 -1
- data/lib/cnvrg/api.rb +6 -4
- data/lib/cnvrg/api_v2.rb +1 -0
- data/lib/cnvrg/auth.rb +4 -1
- data/lib/cnvrg/cli/library_cli.rb +2 -2
- data/lib/cnvrg/cli.rb +158 -76
- data/lib/cnvrg/connect_job_ssh.rb +4 -4
- data/lib/cnvrg/files.rb +6 -2
- data/lib/cnvrg/helpers/agent.rb +58 -18
- data/lib/cnvrg/helpers/executer.rb +177 -35
- data/lib/cnvrg/job_ssh.rb +9 -4
- data/lib/cnvrg/project.rb +31 -10
- data/lib/cnvrg/version.rb +2 -2
- metadata +21 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e2e22e961bc723442ec80d457f0dde2b982f1c75beb2e859bb5523e83682fbe
|
4
|
+
data.tar.gz: 645a4add593bcf3a63be4d64b4ccb45000e39447e7e11b9fd674347856e2bf97
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af7a28b7b428f8b4066a1b3f01bada7273eaa93ea847f87badc253aea2123244ad27ab3804cda5f4cdcee91cc2cc0ed7dd168b347839f59a6c183a702438167
|
7
|
+
data.tar.gz: dc151c1ab54d54a86648b9f1341d402d0eb6fb8d578cc0a01f2594a28a34bfb8d97676d07aa389a8b1155fa466845f87be9b1dffbfcc2bcda33376937ecef418
|
data/Readme.md
CHANGED
@@ -18,4 +18,45 @@
|
|
18
18
|
## Version v1.11.30
|
19
19
|
2021-04-06
|
20
20
|
## Version v1.11.31
|
21
|
-
2021-04-22
|
21
|
+
2021-04-22
|
22
|
+
## Version v1.11.32
|
23
|
+
2021-05-05
|
24
|
+
* DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
|
25
|
+
## Version v2.0.1
|
26
|
+
2021-06-13
|
27
|
+
## Version v2.0.2
|
28
|
+
2021-06-16
|
29
|
+
* DEV-9694 - Bug: Download artifacts fails on authorization error
|
30
|
+
## Version v2.0.3
|
31
|
+
2021-06-29
|
32
|
+
* DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
|
33
|
+
## Version v2.0.4
|
34
|
+
2021-07-08
|
35
|
+
* DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
|
36
|
+
## Version v2.0.5
|
37
|
+
2021-07-11
|
38
|
+
* DEV-10171 - Bug: experiment randomly fails with error- "Couldn't clone artifacts"
|
39
|
+
* DEV-10189 - Bug: CLI Sync -file/folder with broken symlink will cause sync to fail
|
40
|
+
## Version v2.0.6
|
41
|
+
2021-07-18
|
42
|
+
* DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
|
43
|
+
## Version v2.0.7
|
44
|
+
2021-07-27
|
45
|
+
* DEV-10186 - Bug: CLI/run an experiment with --local tag giver server error
|
46
|
+
## Version v2.0.8
|
47
|
+
2021-09-06
|
48
|
+
* DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
|
49
|
+
## Version v2.0.9
|
50
|
+
2021-09-12
|
51
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
52
|
+
## Version v2.0.10
|
53
|
+
2021-09-12
|
54
|
+
* DEV-10502 - Bug: Periodic sync stuck
|
55
|
+
## Version v2.0.11
|
56
|
+
2021-10-21
|
57
|
+
## Version v2.0.12
|
58
|
+
2021-10-25
|
59
|
+
* DEV-11544 - Sub-bug: local experiment is failing to run
|
60
|
+
## Version v2.0.13
|
61
|
+
2021-10-27
|
62
|
+
* DEV-11054 - Task: Create organization and user by default
|
data/cnvrg.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'cnvrg/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = 'cnvrg'
|
8
8
|
spec.version = Cnvrg::VERSION
|
9
|
-
spec.authors = ['Yochay Ettun', 'Leah Kolben']
|
9
|
+
spec.authors = ['Yochay Ettun', 'Leah Kolben', 'Omer Shacham']
|
10
10
|
spec.email = ['info@cnvrg.io']
|
11
11
|
spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
|
12
12
|
spec.description = %q{A CLI tool for interacting with cnvrg.io.}
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
|
40
40
|
spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
|
41
41
|
spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
|
42
|
+
spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
|
42
43
|
spec.add_runtime_dependency 'parallel', '~> 1.12.0'
|
43
44
|
spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
|
44
45
|
spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
|
data/lib/cnvrg/api.rb
CHANGED
@@ -57,6 +57,7 @@ module Cnvrg
|
|
57
57
|
conn = Faraday.new "#{endpoint_uri}"
|
58
58
|
end
|
59
59
|
conn.headers['Auth-Token'] = @pass
|
60
|
+
conn.headers['Authorization'] = "CAPI #{@pass}"
|
60
61
|
conn.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
|
61
62
|
conn.options.timeout = 420
|
62
63
|
conn.options.open_timeout=180
|
@@ -72,11 +73,11 @@ module Cnvrg
|
|
72
73
|
if response.to_hash[:status].to_i != 200
|
73
74
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
74
75
|
end
|
75
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
76
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
76
77
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
77
78
|
success = false
|
78
79
|
sleep(5 * retries)
|
79
|
-
retries +=1
|
80
|
+
retries += 1
|
80
81
|
next
|
81
82
|
end
|
82
83
|
rescue => e
|
@@ -112,11 +113,11 @@ module Cnvrg
|
|
112
113
|
if response.to_hash[:status].to_i != 200
|
113
114
|
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
114
115
|
end
|
115
|
-
if [503, 502, 429].include?(response.to_hash[:status].to_i)
|
116
|
+
if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
|
116
117
|
Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
|
117
118
|
success = false
|
118
119
|
sleep(5 * retries)
|
119
|
-
retries +=1
|
120
|
+
retries += 1
|
120
121
|
next
|
121
122
|
end
|
122
123
|
rescue => e
|
@@ -169,6 +170,7 @@ module Cnvrg
|
|
169
170
|
when 'POST_FILE'
|
170
171
|
conn = Faraday.new do |fr|
|
171
172
|
fr.headers['Auth-Token'] = @pass
|
173
|
+
fr.headers['Authorization'] = "CAPI #{@pass}"
|
172
174
|
fr.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
|
173
175
|
fr.headers["Content-Type"] = "multipart/form-data"
|
174
176
|
if !Helpers.is_verify_ssl
|
data/lib/cnvrg/api_v2.rb
CHANGED
@@ -22,6 +22,7 @@ module Cnvrg
|
|
22
22
|
|
23
23
|
conn = Faraday.new endpoint_uri, :ssl => {:verify => !!Helpers.is_verify_ssl}
|
24
24
|
conn.headers['Auth-Token'] = pass
|
25
|
+
conn.headers['Authorization'] = "CAPI #{pass}"
|
25
26
|
conn.headers['User-Agent'] = Cnvrg::API::USER_AGENT
|
26
27
|
conn.headers['Content-Type'] = "application/json"
|
27
28
|
conn.options.timeout = 420
|
data/lib/cnvrg/auth.rb
CHANGED
@@ -44,7 +44,7 @@ module Cnvrg
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
|
47
|
-
def sign_in(email, password)
|
47
|
+
def sign_in(email, password, token: nil)
|
48
48
|
url = Cnvrg::API.endpoint_uri()
|
49
49
|
url = URI.parse(url+ "/users/sign_in")
|
50
50
|
http = Net::HTTP.new(url.host, url.port)
|
@@ -61,6 +61,9 @@ module Cnvrg
|
|
61
61
|
|
62
62
|
req.add_field("EMAIL", email)
|
63
63
|
req.add_field("PASSWORD", password)
|
64
|
+
if token.present?
|
65
|
+
req.add_field("Authorization", "CAPI #{token}")
|
66
|
+
end
|
64
67
|
|
65
68
|
response = http.request(req)
|
66
69
|
|
data/lib/cnvrg/cli.rb
CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
|
|
173
173
|
desc "data [COMMAND]", "Upload and manage datasets", :hide => false
|
174
174
|
subcommand "data", Data
|
175
175
|
|
176
|
-
desc "job", "manage running jobs", :hide =>
|
176
|
+
desc "job", "manage running jobs", :hide => true
|
177
177
|
subcommand "job", JobCli
|
178
178
|
|
179
179
|
desc "ssh", "ssh into running jobs", :hide => false
|
@@ -415,7 +415,7 @@ module Cnvrg
|
|
415
415
|
end
|
416
416
|
end
|
417
417
|
|
418
|
-
desc 'set_compression_path', 'Set compression path'
|
418
|
+
desc 'set_compression_path', 'Set compression path', :hide => true
|
419
419
|
method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
|
420
420
|
|
421
421
|
def set_compression_path(*compression_path)
|
@@ -496,8 +496,10 @@ module Cnvrg
|
|
496
496
|
|
497
497
|
|
498
498
|
desc 'login', 'Authenticate with cnvrg.io platform'
|
499
|
+
method_option :sso, :type => :boolean, :aliases => ["-s", "--sso"], :default => false
|
499
500
|
|
500
501
|
def login
|
502
|
+
use_token = options["sso"]
|
501
503
|
begin
|
502
504
|
log_handler()
|
503
505
|
log_start(__method__, args, options)
|
@@ -515,12 +517,21 @@ module Cnvrg
|
|
515
517
|
exit(0)
|
516
518
|
end
|
517
519
|
@email = ask("Enter your email:")
|
518
|
-
|
519
|
-
|
520
|
+
if use_token
|
521
|
+
@token = cmd.ask("Enter your token (hidden):") {|q| q.echo = "*"}
|
522
|
+
netrc[Cnvrg::Helpers.netrc_domain] = @email, @token
|
523
|
+
netrc.save
|
524
|
+
password = ""
|
525
|
+
else
|
526
|
+
password = cmd.ask("Enter your password (hidden):") {|q| q.echo = "*"}
|
527
|
+
end
|
528
|
+
result = @auth.sign_in(@email, password, token: @token)
|
520
529
|
|
521
530
|
if !result["token"].nil?
|
522
|
-
|
523
|
-
|
531
|
+
unless use_token
|
532
|
+
netrc[Cnvrg::Helpers.netrc_domain] = @email, result["token"]
|
533
|
+
netrc.save
|
534
|
+
end
|
524
535
|
|
525
536
|
log_message("Authenticated successfully as #{@email}", Thor::Shell::Color::GREEN)
|
526
537
|
|
@@ -2311,6 +2322,7 @@ module Cnvrg
|
|
2311
2322
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
2312
2323
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
2313
2324
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
2325
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
2314
2326
|
|
2315
2327
|
def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
|
2316
2328
|
begin
|
@@ -2328,6 +2340,8 @@ module Cnvrg
|
|
2328
2340
|
exp_obj = nil
|
2329
2341
|
end
|
2330
2342
|
|
2343
|
+
local = options["local"]
|
2344
|
+
|
2331
2345
|
commit_msg = options["message"]
|
2332
2346
|
if commit_msg.nil? or commit_msg.empty?
|
2333
2347
|
commit_msg = ""
|
@@ -2349,7 +2363,7 @@ module Cnvrg
|
|
2349
2363
|
if git_output_dir.ends_with? "/"
|
2350
2364
|
git_output_dir = git_output_dir[0..-2]
|
2351
2365
|
end
|
2352
|
-
list = @project.generate_output_dir(git_output_dir)
|
2366
|
+
list = @project.generate_output_dir(git_output_dir, local: local)
|
2353
2367
|
end
|
2354
2368
|
list += @project.generate_git_diff if options["git_diff"]
|
2355
2369
|
spec_files_to_upload = list
|
@@ -2668,7 +2682,7 @@ module Cnvrg
|
|
2668
2682
|
end
|
2669
2683
|
end
|
2670
2684
|
|
2671
|
-
desc 'commit before termination', 'Commit job code before termination'
|
2685
|
+
desc 'commit before termination', 'Commit job code before termination', :hide => true
|
2672
2686
|
def commit_before_termination()
|
2673
2687
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2674
2688
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2678,7 +2692,7 @@ module Cnvrg
|
|
2678
2692
|
log_error(e)
|
2679
2693
|
end
|
2680
2694
|
|
2681
|
-
desc 'update_job_commit', 'Update job with its last commit'
|
2695
|
+
desc 'update_job_commit', 'Update job with its last commit' , :hide => true
|
2682
2696
|
def update_job_commit()
|
2683
2697
|
job_type = ENV['CNVRG_JOB_TYPE']
|
2684
2698
|
job_id = ENV['CNVRG_JOB_ID']
|
@@ -2868,7 +2882,7 @@ module Cnvrg
|
|
2868
2882
|
|
2869
2883
|
|
2870
2884
|
|
2871
|
-
desc 'jump', 'Jump to specific commit'
|
2885
|
+
desc 'jump COMMIT_ID', 'Jump to specific commit'
|
2872
2886
|
def jump(commit_sha1)
|
2873
2887
|
begin
|
2874
2888
|
verify_logged_in()
|
@@ -3003,11 +3017,12 @@ module Cnvrg
|
|
3003
3017
|
method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
|
3004
3018
|
method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
|
3005
3019
|
method_option :files, :type => :string, :aliases => ["--files"], :default => nil
|
3006
|
-
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default =>
|
3020
|
+
method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
|
3007
3021
|
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
|
3008
3022
|
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
|
3009
3023
|
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
|
3010
3024
|
method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
|
3025
|
+
method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
|
3011
3026
|
|
3012
3027
|
def sync(direct = true)
|
3013
3028
|
verify_logged_in(true) if direct
|
@@ -3030,10 +3045,10 @@ module Cnvrg
|
|
3030
3045
|
if run_download or options['debug_mode']
|
3031
3046
|
invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
|
3032
3047
|
end
|
3033
|
-
invoke :upload, [false, true,
|
3048
|
+
invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
|
3034
3049
|
:ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
|
3035
3050
|
:files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
|
3036
|
-
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
|
3051
|
+
:debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
|
3037
3052
|
|
3038
3053
|
end
|
3039
3054
|
|
@@ -3143,7 +3158,7 @@ module Cnvrg
|
|
3143
3158
|
invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
|
3144
3159
|
:log => log, :email_notification => email_notification, :upload_output => upload_output,
|
3145
3160
|
:commit => commit, :image => image, :data => data, :data_commit => data_commit,
|
3146
|
-
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
|
3161
|
+
:ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
|
3147
3162
|
return
|
3148
3163
|
end
|
3149
3164
|
else
|
@@ -3200,6 +3215,7 @@ module Cnvrg
|
|
3200
3215
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
|
3201
3216
|
method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
|
3202
3217
|
method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
|
3218
|
+
method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
|
3203
3219
|
|
3204
3220
|
def exec(*cmd)
|
3205
3221
|
log = []
|
@@ -3224,6 +3240,7 @@ module Cnvrg
|
|
3224
3240
|
project_home = get_project_home
|
3225
3241
|
data_query = options["data_query"]
|
3226
3242
|
docker_stats = options["docker_stats"]
|
3243
|
+
local = options[:local] || false
|
3227
3244
|
@project = Project.new(project_home)
|
3228
3245
|
if @project.is_git
|
3229
3246
|
sync_before = false
|
@@ -3316,62 +3333,53 @@ module Cnvrg
|
|
3316
3333
|
end
|
3317
3334
|
end
|
3318
3335
|
start_time = Time.now
|
3319
|
-
shell_type = options["use_bash"] ? "bash -l" : "sh"
|
3320
3336
|
if @exp.get_cmd.present?
|
3321
3337
|
cmd = @exp.get_cmd
|
3322
|
-
if options["docker_id"].present? # Escape for docker exec
|
3323
|
-
cmd = cmd.gsub("\"", "\\\"")
|
3324
|
-
end
|
3325
3338
|
end
|
3326
|
-
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3336
|
-
|
3337
|
-
|
3338
|
-
|
3339
|
-
|
3340
|
-
|
3339
|
+
|
3340
|
+
if local
|
3341
|
+
exec_local(cmd, print_log, start_commit, real, start_time)
|
3342
|
+
exit_status = $?.exitstatus
|
3343
|
+
|
3344
|
+
else
|
3345
|
+
command_slug = (0...18).map { (65 + rand(26)).chr }.join
|
3346
|
+
result_file = "/conf/result-#{command_slug}"
|
3347
|
+
data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
|
3348
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
3349
|
+
response = conn.post('command', data.to_json)
|
3350
|
+
if response.to_hash[:status].to_i != 200
|
3351
|
+
exit_status = 129
|
3352
|
+
raise StandardError.new("Cant send command to slave")
|
3353
|
+
end
|
3354
|
+
t = FileWatch::Tail.new
|
3355
|
+
filename = result_file
|
3356
|
+
lines = []
|
3357
|
+
t.tail(filename)
|
3358
|
+
t.subscribe do |path, line|
|
3359
|
+
begin
|
3360
|
+
cur_log = JSON.parse(line)
|
3361
|
+
if cur_log["type"] == "endMessage"
|
3362
|
+
exit_status = cur_log["real"].to_i
|
3363
|
+
break
|
3364
|
+
else
|
3365
|
+
puts(cur_log.to_json)
|
3366
|
+
STDOUT.flush
|
3367
|
+
cur_log["time"] = Time.parse(cur_log["timestamp"])
|
3368
|
+
cur_log["message"] = cur_log["message"].to_s + "\r\n"
|
3369
|
+
log << cur_log
|
3341
3370
|
end
|
3342
|
-
log << cur_log
|
3343
3371
|
if log.size >= 10
|
3344
|
-
@exp.upload_temp_log(log)
|
3372
|
+
@exp.upload_temp_log(log)
|
3345
3373
|
log = []
|
3346
|
-
|
3374
|
+
elsif (start_time + 15.seconds) <= Time.now
|
3347
3375
|
@exp.upload_temp_log(log) unless log.empty?
|
3348
3376
|
log = []
|
3349
3377
|
start_time = Time.now
|
3350
3378
|
end
|
3379
|
+
rescue => e
|
3380
|
+
log_error(e)
|
3351
3381
|
end
|
3352
|
-
if stderr
|
3353
|
-
stderr.each do |err|
|
3354
|
-
log << {time: Time.now, message: err, type: "stderr"}
|
3355
|
-
end
|
3356
|
-
end
|
3357
|
-
rescue Errno::EIO => e
|
3358
|
-
log_error(e)
|
3359
|
-
if !log.empty?
|
3360
|
-
temp_log = log
|
3361
|
-
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
3362
|
-
log -= temp_log
|
3363
|
-
end
|
3364
|
-
rescue Errno::ENOENT => e
|
3365
|
-
exp_success = false
|
3366
|
-
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
3367
|
-
log_error(e)
|
3368
|
-
rescue => e
|
3369
|
-
res = @exp.end(log, 1, start_commit, 0, 0)
|
3370
|
-
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
3371
|
-
log_error(e)
|
3372
|
-
exit(0)
|
3373
3382
|
end
|
3374
|
-
::Process.wait pid
|
3375
3383
|
end
|
3376
3384
|
end_time = Time.now
|
3377
3385
|
process_running = false
|
@@ -3379,14 +3387,13 @@ module Cnvrg
|
|
3379
3387
|
if !log.empty?
|
3380
3388
|
|
3381
3389
|
temp_log = log
|
3382
|
-
|
3390
|
+
@exp.upload_temp_log(temp_log)
|
3383
3391
|
log -= temp_log
|
3384
3392
|
end
|
3385
3393
|
|
3386
3394
|
cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
|
3387
3395
|
memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
|
3388
|
-
exit_status
|
3389
|
-
if $?.exitstatus != 0
|
3396
|
+
if exit_status != 0
|
3390
3397
|
exp_success = false
|
3391
3398
|
end
|
3392
3399
|
|
@@ -3430,7 +3437,6 @@ module Cnvrg
|
|
3430
3437
|
if @exp
|
3431
3438
|
# log_thread.join
|
3432
3439
|
Thread.kill(stats_thread) if docker_stats
|
3433
|
-
exit_status = $?.exitstatus
|
3434
3440
|
if exit_status.blank?
|
3435
3441
|
exit_status = "-1"
|
3436
3442
|
end
|
@@ -3443,8 +3449,6 @@ module Cnvrg
|
|
3443
3449
|
|
3444
3450
|
exit(1)
|
3445
3451
|
end
|
3446
|
-
|
3447
|
-
|
3448
3452
|
end
|
3449
3453
|
|
3450
3454
|
end
|
@@ -3689,7 +3693,7 @@ module Cnvrg
|
|
3689
3693
|
end
|
3690
3694
|
end
|
3691
3695
|
|
3692
|
-
desc 'deploy', 'Deploys model to production'
|
3696
|
+
desc 'deploy', 'Deploys model to production', :hide => true
|
3693
3697
|
method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
|
3694
3698
|
method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
|
3695
3699
|
method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
|
@@ -3778,7 +3782,7 @@ module Cnvrg
|
|
3778
3782
|
method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
|
3779
3783
|
method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
|
3780
3784
|
|
3781
|
-
desc 'notebook', 'Starts a notebook session remotely or locally'
|
3785
|
+
desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
|
3782
3786
|
|
3783
3787
|
def notebook
|
3784
3788
|
verify_logged_in(true)
|
@@ -3905,7 +3909,7 @@ module Cnvrg
|
|
3905
3909
|
end
|
3906
3910
|
end
|
3907
3911
|
|
3908
|
-
desc 'remote_notebook', 'Run notebook server on remote server'
|
3912
|
+
desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
|
3909
3913
|
method_option :machine_type, :type => :string, :default => ""
|
3910
3914
|
method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
|
3911
3915
|
method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
|
@@ -4264,7 +4268,7 @@ module Cnvrg
|
|
4264
4268
|
|
4265
4269
|
end
|
4266
4270
|
|
4267
|
-
desc 'notebook_stop', '
|
4271
|
+
desc 'notebook_stop', 'Stop notebook', :hide => true
|
4268
4272
|
method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
|
4269
4273
|
method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
|
4270
4274
|
method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
|
@@ -4651,10 +4655,16 @@ module Cnvrg
|
|
4651
4655
|
method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
|
4652
4656
|
method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
|
4653
4657
|
method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
|
4658
|
+
method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
|
4659
|
+
method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
|
4660
|
+
method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
|
4654
4661
|
|
4655
4662
|
def collect_metrics
|
4656
4663
|
@exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
|
4657
4664
|
prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
|
4665
|
+
prom_user = options[:prom_user]
|
4666
|
+
prom_password = options[:prom_password]
|
4667
|
+
name = options[:name]
|
4658
4668
|
|
4659
4669
|
translate_result = Cnvrg::API_V2.request(
|
4660
4670
|
"#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
|
@@ -4679,9 +4689,16 @@ module Cnvrg
|
|
4679
4689
|
next
|
4680
4690
|
end
|
4681
4691
|
uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
|
4682
|
-
|
4692
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
4693
|
+
http.use_ssl = uri.scheme == "https"
|
4694
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
4695
|
+
req = Net::HTTP::Get.new uri.request_uri
|
4696
|
+
if prom_user.present?
|
4697
|
+
req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
|
4698
|
+
end
|
4699
|
+
resp = http.request(req)
|
4683
4700
|
begin
|
4684
|
-
result = JSON.parse(resp)
|
4701
|
+
result = JSON.parse(resp.body)
|
4685
4702
|
rescue JSON::ParserError => e
|
4686
4703
|
log_error(e)
|
4687
4704
|
next
|
@@ -4690,13 +4707,22 @@ module Cnvrg
|
|
4690
4707
|
next unless data_result
|
4691
4708
|
|
4692
4709
|
if data_result.size > 1
|
4693
|
-
stats[query_name] = {}
|
4710
|
+
stats[query_name] = {} unless query_name.include? 'block'
|
4694
4711
|
data_result.each_with_index do |res, i|
|
4695
4712
|
timestamp, value = res["value"]
|
4696
4713
|
uuid = res["metric"]["UUID"].presence || i
|
4697
4714
|
stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
|
4698
4715
|
stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
|
4699
|
-
|
4716
|
+
if query_name.include? 'block'
|
4717
|
+
uuid = res["metric"]["interface"].presence || i
|
4718
|
+
uuid = "#{name}-#{uuid}" if name.present?
|
4719
|
+
stats['block_io'] = {} if stats['block_io'].blank?
|
4720
|
+
io_type = query_name.split('_')[1]
|
4721
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4722
|
+
stats['block_io'][io_type].merge!({ uuid => stat_value })
|
4723
|
+
else
|
4724
|
+
stats[query_name][uuid] = stat_value
|
4725
|
+
end
|
4700
4726
|
end
|
4701
4727
|
else
|
4702
4728
|
timestamp, value = data_result&.first&.dig('value')
|
@@ -4705,9 +4731,14 @@ module Cnvrg
|
|
4705
4731
|
if query_name.include? 'block'
|
4706
4732
|
stats['block_io'] = {} if stats['block_io'].blank?
|
4707
4733
|
io_type = query_name.split('_')[1]
|
4708
|
-
|
4734
|
+
if name.present?
|
4735
|
+
stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
|
4736
|
+
stats['block_io'][io_type].merge!({ name => stat_value })
|
4737
|
+
else
|
4738
|
+
stats['block_io'].merge!({ io_type => stat_value })
|
4739
|
+
end
|
4709
4740
|
else
|
4710
|
-
stats[query_name] = stat_value
|
4741
|
+
stats[query_name] = name.present? ? { name => stat_value } : stat_value
|
4711
4742
|
end
|
4712
4743
|
end
|
4713
4744
|
end
|
@@ -4751,7 +4782,7 @@ module Cnvrg
|
|
4751
4782
|
end
|
4752
4783
|
|
4753
4784
|
|
4754
|
-
desc '', ''
|
4785
|
+
desc '', '', :hide => true
|
4755
4786
|
|
4756
4787
|
def download_built_image(image_name, image_slug)
|
4757
4788
|
begin
|
@@ -4995,7 +5026,7 @@ module Cnvrg
|
|
4995
5026
|
end
|
4996
5027
|
end
|
4997
5028
|
|
4998
|
-
desc 'experiments', 'List project experiments'
|
5029
|
+
desc 'experiments', 'List project experiments', :hide => true
|
4999
5030
|
method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
|
5000
5031
|
method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
|
5001
5032
|
|
@@ -5864,6 +5895,57 @@ module Cnvrg
|
|
5864
5895
|
end
|
5865
5896
|
end
|
5866
5897
|
|
5898
|
+
def exec_local(cmd , print_log, start_commit, real, start_time)
|
5899
|
+
log = []
|
5900
|
+
PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
|
5901
|
+
begin
|
5902
|
+
stdout.each do |line|
|
5903
|
+
cur_time = Time.now
|
5904
|
+
real_time = Time.now - real
|
5905
|
+
cur_log = {time: cur_time,
|
5906
|
+
message: line,
|
5907
|
+
type: "stdout",
|
5908
|
+
real: real_time
|
5909
|
+
}
|
5910
|
+
if print_log
|
5911
|
+
puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
|
5912
|
+
end
|
5913
|
+
log << cur_log
|
5914
|
+
if log.size >= 10
|
5915
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5916
|
+
log = []
|
5917
|
+
elsif (start_time + 15.seconds) <= Time.now
|
5918
|
+
@exp.upload_temp_log(log) unless log.empty?
|
5919
|
+
log = []
|
5920
|
+
start_time = Time.now
|
5921
|
+
end
|
5922
|
+
end
|
5923
|
+
if stderr
|
5924
|
+
stderr.each do |err|
|
5925
|
+
log << {time: Time.now, message: err, type: "stderr"}
|
5926
|
+
end
|
5927
|
+
end
|
5928
|
+
rescue Errno::EIO => e
|
5929
|
+
log_error(e)
|
5930
|
+
if !log.empty?
|
5931
|
+
temp_log = log
|
5932
|
+
@exp.upload_temp_log(temp_log) unless temp_log.empty?
|
5933
|
+
log -= temp_log
|
5934
|
+
end
|
5935
|
+
rescue Errno::ENOENT => e
|
5936
|
+
exp_success = false
|
5937
|
+
log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
|
5938
|
+
log_error(e)
|
5939
|
+
rescue => e
|
5940
|
+
res = @exp.end(log, 1, start_commit, 0, 0)
|
5941
|
+
log_message("Error occurred,aborting", Thor::Shell::Color::RED)
|
5942
|
+
log_error(e)
|
5943
|
+
exit(0)
|
5944
|
+
end
|
5945
|
+
::Process.wait pid
|
5946
|
+
end
|
5947
|
+
end
|
5948
|
+
|
5867
5949
|
end
|
5868
5950
|
end
|
5869
5951
|
|
@@ -10,20 +10,20 @@ module Cnvrg
|
|
10
10
|
Cnvrg::Logger.log_info("cnvrg is not configured")
|
11
11
|
end
|
12
12
|
|
13
|
-
def start(username, password)
|
14
|
-
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
|
13
|
+
def start(username, password, no_auth, port: nil)
|
14
|
+
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
|
15
15
|
end
|
16
16
|
|
17
17
|
def status()
|
18
18
|
Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
|
19
19
|
end
|
20
20
|
|
21
|
-
def run_portforward_command(pod_name, port, kubeconfig, namespace)
|
21
|
+
def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
|
22
22
|
command = "kubectl"
|
23
23
|
if kubeconfig.present?
|
24
24
|
command = "kubectl --kubeconfig=#{kubeconfig}"
|
25
25
|
end
|
26
|
-
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}
|
26
|
+
bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
|
27
27
|
puts("\nrunning command #{bashCommand}")
|
28
28
|
`#{bashCommand}`
|
29
29
|
end
|
data/lib/cnvrg/files.rb
CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
|
|
106
106
|
commit: commit_sha1
|
107
107
|
})
|
108
108
|
unless Cnvrg::CLI.is_response_success(resp, false)
|
109
|
-
raise
|
109
|
+
raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
|
110
110
|
end
|
111
111
|
# resolve bucket
|
112
112
|
res = resp['result']
|
@@ -730,7 +730,11 @@ module Cnvrg
|
|
730
730
|
end
|
731
731
|
res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
|
732
732
|
unless Cnvrg::CLI.is_response_success(res, false)
|
733
|
-
|
733
|
+
begin
|
734
|
+
puts(res)
|
735
|
+
rescue
|
736
|
+
end
|
737
|
+
raise StandardError.new("Cant download files from the server.")
|
734
738
|
end
|
735
739
|
self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
|
736
740
|
end
|
data/lib/cnvrg/helpers/agent.rb
CHANGED
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
|
|
18
18
|
#### params
|
19
19
|
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
|
20
20
|
@executer = executer
|
21
|
+
@job_id = ENV["CNVRG_JOB_ID"]
|
21
22
|
@slug = slug
|
22
23
|
@files_exist = files_exist
|
23
24
|
@container_name = container_name
|
24
|
-
@
|
25
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
26
|
+
@main_name = @is_new_main ? "main" : "slave"
|
27
|
+
@run_in_main = @container_name.downcase == @main_name
|
25
28
|
@log_interval = send_log_interval
|
26
29
|
# https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
|
27
30
|
if timeout.blank? or timeout.negative?
|
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
|
|
37
40
|
@sleep_before_retry = sleep_before_retry
|
38
41
|
@real_execution_retries = 0 ## How many times the command really executed until success
|
39
42
|
@single_quotes = single_quotes
|
40
|
-
@docker_user =
|
41
|
-
@
|
42
|
-
if docker_user.present?
|
43
|
-
@docker_user = " --user #{docker_user}"
|
44
|
-
end
|
45
|
-
if @run_in_slave
|
46
|
-
if @single_quotes
|
47
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
|
48
|
-
else
|
49
|
-
@command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
|
50
|
-
end
|
51
|
-
end
|
43
|
+
@docker_user = docker_user
|
44
|
+
@use_bash = use_bash
|
52
45
|
@output = []
|
53
46
|
@errors = []
|
54
47
|
@exit_status = nil
|
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
|
|
75
68
|
|
76
69
|
def exec!
|
77
70
|
log_internal("Command: #{@command} with slug: #{@slug} started!")
|
71
|
+
command_status = Status::FINISHED
|
78
72
|
if @command.blank?
|
79
73
|
@exit_status = 0
|
74
|
+
command_status = Status::ABORTED
|
80
75
|
elsif should_run?
|
81
76
|
send_logs(status: Status::STARTED)
|
82
77
|
periodic_thread_handle = periodic_thread
|
83
78
|
execute_command
|
84
79
|
else
|
80
|
+
command_status = Status::ABORTED
|
85
81
|
@exit_status = 127
|
86
82
|
end
|
87
83
|
finish_log = "Command: #{@command} with slug: #{@slug} finished"
|
88
84
|
finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
|
89
85
|
log_internal(finish_log)
|
90
|
-
send_logs(exit_status: @exit_status, status:
|
86
|
+
send_logs(exit_status: @exit_status, status: command_status)
|
91
87
|
if periodic_thread_handle.present?
|
92
88
|
periodic_thread_handle.join
|
93
89
|
end
|
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
|
|
117
113
|
execute_command
|
118
114
|
end
|
119
115
|
|
116
|
+
def execute_command_on_slave
|
117
|
+
extra_slug = (0...2).map { (65 + rand(26)).chr }.join
|
118
|
+
result_file = "/conf/result-#{@slug}-#{extra_slug}"
|
119
|
+
Timeout.timeout(@timeout) do
|
120
|
+
data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
|
121
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
122
|
+
response = conn.post('command', data.to_json)
|
123
|
+
if response.to_hash[:status].to_i != 200
|
124
|
+
@exit_status = 129
|
125
|
+
raise StandardError.new("Cant send command to slave")
|
126
|
+
end
|
127
|
+
t = FileWatch::Tail.new
|
128
|
+
filename = result_file
|
129
|
+
t.tail(filename)
|
130
|
+
t.subscribe do |path, line|
|
131
|
+
if line.include?("cnvrg-exit-code")
|
132
|
+
@exit_status = line.split("=")[1].to_i
|
133
|
+
break
|
134
|
+
end
|
135
|
+
if !@is_new_main
|
136
|
+
log_internal(line, level: LogLevel::PURE)
|
137
|
+
end
|
138
|
+
line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
|
139
|
+
@output << {log: line, timestamp: Time.now}
|
140
|
+
end
|
141
|
+
end
|
142
|
+
rescue Timeout::Error
|
143
|
+
@errors << {log: "Command timed out!", timestamp: Time.now}
|
144
|
+
log_internal("Command timed out!", level: LogLevel::ERROR)
|
145
|
+
@exit_status = 124
|
146
|
+
ensure
|
147
|
+
retry_command if @retries != 0 and @exit_status !=0
|
148
|
+
@exit_status
|
149
|
+
end
|
150
|
+
|
120
151
|
def execute_command
|
152
|
+
return execute_command_on_slave if @run_in_main
|
121
153
|
Timeout.timeout(@timeout) do
|
122
154
|
PTY.spawn(@command) do |stdout, stdin, pid, stderr|
|
123
155
|
@pid = pid
|
124
156
|
begin
|
125
157
|
if stdout.present?
|
126
158
|
stdout.each do |line|
|
127
|
-
log_internal(line, level: LogLevel::
|
159
|
+
log_internal(line, level: LogLevel::INFO)
|
128
160
|
line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
|
129
161
|
@output << {log: line, timestamp: Time.now}
|
130
162
|
end
|
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
|
|
176
208
|
def log_internal(log, level: LogLevel::INFO)
|
177
209
|
if level == LogLevel::PURE
|
178
210
|
puts(log)
|
179
|
-
|
180
|
-
|
211
|
+
STDOUT.flush
|
212
|
+
return
|
213
|
+
end
|
214
|
+
to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
|
215
|
+
if log.start_with?("{") and log.include?("timestamp")
|
216
|
+
log_json = JSON.parse(log)
|
217
|
+
to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
|
181
218
|
end
|
219
|
+
puts(to_print.to_json)
|
182
220
|
STDOUT.flush
|
221
|
+
rescue => e
|
222
|
+
Cnvrg::Logger.log_error(e)
|
183
223
|
end
|
184
224
|
|
185
225
|
def filter_logs_by_regex(logs)
|
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
|
|
190
230
|
end
|
191
231
|
end
|
192
232
|
end
|
193
|
-
end
|
233
|
+
end
|
@@ -1,7 +1,9 @@
|
|
1
|
+
require "filewatch/tail"
|
1
2
|
require 'cnvrg/helpers/agent'
|
2
3
|
class Cnvrg::Helpers::Executer
|
3
|
-
attr_reader :machine_activity, :agent_id, :
|
4
|
-
|
4
|
+
attr_reader :machine_activity, :agent_id, :main_id
|
5
|
+
MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
|
6
|
+
HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
|
5
7
|
|
6
8
|
### this class represent a machine_activity. it will poll the commands, communicate with the
|
7
9
|
# server (poll commands) and let the server know the status of this executer.
|
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
|
|
9
11
|
@owner = owner
|
10
12
|
@job_id = job_id
|
11
13
|
@poll_every = poll_every
|
14
|
+
@check_main_every = 10
|
12
15
|
@machine_activity = machine_activity
|
13
16
|
@commands_q = Queue.new
|
14
17
|
@files_q = Queue.new
|
15
18
|
@agent_id = nil
|
16
|
-
@
|
19
|
+
@main_id = nil
|
20
|
+
@main_start_time = nil
|
21
|
+
@is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
|
22
|
+
@main_name = @is_new_main ? "main" : "slave"
|
17
23
|
end
|
18
24
|
|
19
25
|
def create_file_cmd(path, content)
|
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
|
|
37
43
|
def executer_stats
|
38
44
|
return @stats if @stats.present?
|
39
45
|
Cnvrg::Logger.log_info("getting containers")
|
40
|
-
@agent_id, @
|
46
|
+
@agent_id, @main_id = containers
|
41
47
|
Cnvrg::Logger.log_info("got containers")
|
42
48
|
pod_name, node_name = get_node_and_pod_names
|
49
|
+
# For backwards compatibility we still call this slave stats
|
43
50
|
@stats = {
|
44
51
|
pod_name: pod_name,
|
45
52
|
node_name: node_name,
|
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
|
|
53
60
|
cnvrg: Cnvrg::VERSION
|
54
61
|
},
|
55
62
|
slave: {
|
56
|
-
container_id: @
|
57
|
-
|
58
|
-
|
63
|
+
container_id: @main_id,
|
64
|
+
container_name: @main_name,
|
65
|
+
workdir: run_in_main('pwd'),
|
66
|
+
homedir: main_homedir,
|
59
67
|
spark_path: spark_path,
|
60
|
-
user:
|
61
|
-
cnvrg:
|
62
|
-
has_bash:
|
63
|
-
user_id:
|
64
|
-
group_id:
|
65
|
-
python_version:
|
66
|
-
python3_version:
|
67
|
-
pip_version:
|
68
|
-
pip3_version:
|
68
|
+
user: run_in_main( 'whoami'),
|
69
|
+
cnvrg: run_in_main( 'which cnvrg'),
|
70
|
+
has_bash: run_in_main( 'which bash'),
|
71
|
+
user_id: run_in_main( 'id -u'),
|
72
|
+
group_id: run_in_main( 'id -g'),
|
73
|
+
python_version: run_in_main( 'python --version'),
|
74
|
+
python3_version: run_in_main( 'python3 --version'),
|
75
|
+
pip_version: run_in_main( 'pip --version'),
|
76
|
+
pip3_version: run_in_main( 'pip3 --version')
|
69
77
|
},
|
70
78
|
}
|
79
|
+
|
71
80
|
@stats
|
72
81
|
end
|
73
82
|
|
74
83
|
def containers
|
75
84
|
agent_id = nil
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
85
|
+
main_id = nil
|
86
|
+
timeout = 2
|
87
|
+
timeout = nil if (!@is_new_main || HAS_DOCKER)
|
88
|
+
Timeout.timeout(timeout) do
|
89
|
+
while agent_id.blank? or main_id.blank?
|
90
|
+
grep_by = @job_id
|
91
|
+
grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
|
92
|
+
cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
|
93
|
+
agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
|
94
|
+
main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
|
95
|
+
sleep(2)
|
96
|
+
end
|
84
97
|
end
|
85
|
-
if
|
86
|
-
raise "Can't find
|
98
|
+
if main_id.blank?
|
99
|
+
raise "Can't find main id"
|
87
100
|
end
|
88
|
-
[agent_id,
|
101
|
+
[agent_id, main_id]
|
102
|
+
rescue => e
|
103
|
+
Cnvrg::Logger.log_error(e)
|
104
|
+
[agent_id, main_id]
|
89
105
|
end
|
90
106
|
|
91
107
|
def current_homedir
|
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
|
|
93
109
|
end
|
94
110
|
|
95
111
|
def spark_path
|
96
|
-
|
112
|
+
run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
|
97
113
|
end
|
98
114
|
|
99
|
-
def
|
100
|
-
|
115
|
+
def main_homedir()
|
116
|
+
run_in_main("env | grep -w HOME").split("=").try(:last)
|
101
117
|
end
|
102
118
|
|
103
|
-
def
|
104
|
-
|
119
|
+
def main_env
|
120
|
+
run_in_main("env").split("\n").map{|x| x.split("=")}
|
105
121
|
end
|
106
122
|
|
107
|
-
def
|
108
|
-
|
109
|
-
end
|
123
|
+
def run_in_main(command)
|
124
|
+
data = {cmd: command, async: false, use_sh: true}
|
110
125
|
|
126
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
|
127
|
+
response = conn.post('command', data.to_json)
|
128
|
+
if response.to_hash[:status].to_i != 200
|
129
|
+
Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
|
130
|
+
return ""
|
131
|
+
end
|
132
|
+
resp = []
|
133
|
+
lines = response.body.split("\n")
|
134
|
+
lines.each do |line|
|
135
|
+
next if line.strip == nil or line.strip == ""
|
136
|
+
if line.include?("cnvrg-exit-code")
|
137
|
+
exit_status = line.split("=")[1].to_i
|
138
|
+
if exit_status != 0
|
139
|
+
Cnvrg::Logger.log_info("failed to run find command #{command} on main")
|
140
|
+
return ""
|
141
|
+
end
|
142
|
+
next
|
143
|
+
end
|
144
|
+
resp << line
|
145
|
+
end
|
146
|
+
return resp.join("\n")
|
147
|
+
rescue => e
|
148
|
+
Cnvrg::Logger.log_error(e)
|
149
|
+
return ""
|
150
|
+
end
|
111
151
|
|
112
152
|
def poll
|
113
153
|
resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
|
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
|
|
124
164
|
success = false
|
125
165
|
puts("Agent started, connecting to #{Cnvrg::API.get_api}")
|
126
166
|
STDOUT.flush
|
167
|
+
wait_for_main
|
127
168
|
while !success and retries < 100
|
128
169
|
begin
|
129
170
|
resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
|
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
|
|
154
195
|
end
|
155
196
|
end
|
156
197
|
|
198
|
+
def check_main_is_working_thread
|
199
|
+
while true
|
200
|
+
check_main_alive
|
201
|
+
sleep(@check_main_every)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
157
205
|
def main_thread
|
158
206
|
init
|
159
207
|
Thread.new do
|
160
208
|
polling_thread
|
161
209
|
end
|
210
|
+
Thread.new do
|
211
|
+
check_main_is_working_thread
|
212
|
+
end
|
162
213
|
execute_cmds
|
163
214
|
end
|
164
215
|
|
216
|
+
def wait_for_main
|
217
|
+
copy_file_to_main
|
218
|
+
start_tiny_if_missing
|
219
|
+
puts("Waiting for main container")
|
220
|
+
STDOUT.flush
|
221
|
+
got_response = false
|
222
|
+
while !got_response do
|
223
|
+
begin
|
224
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
225
|
+
response = conn.get('readiness')
|
226
|
+
if response.to_hash[:status].to_i != 200
|
227
|
+
sleep(0.1)
|
228
|
+
next
|
229
|
+
else
|
230
|
+
puts("Client container is ready")
|
231
|
+
STDOUT.flush
|
232
|
+
@main_start_time = response.body.to_i
|
233
|
+
got_response = true
|
234
|
+
end
|
235
|
+
rescue => e
|
236
|
+
puts("Failed to connect to main")
|
237
|
+
puts(e)
|
238
|
+
STDOUT.flush
|
239
|
+
sleep(0.1)
|
240
|
+
next
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def copy_file_to_main
|
246
|
+
begin
|
247
|
+
FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
|
248
|
+
FileUtils.cp_r("/scripts", "/conf/scripts-bin")
|
249
|
+
FileUtils.touch("/conf/tiny-ready")
|
250
|
+
rescue => e
|
251
|
+
Cnvrg::Logger.log_error(e)
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def start_tiny_if_missing
|
256
|
+
return unless ENV['MAIN_CONTAINER_PORT'].blank?
|
257
|
+
Cnvrg::Logger.log_info("Tiny not found, starting it")
|
258
|
+
@agent_id, @main_id = containers
|
259
|
+
pid = Process.fork do
|
260
|
+
Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
|
261
|
+
`docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
|
262
|
+
end
|
263
|
+
Process.detach(pid)
|
264
|
+
Cnvrg::Logger.log_info("Tiny started and detached")
|
265
|
+
end
|
266
|
+
|
165
267
|
def execute_cmds
|
166
268
|
pids = []
|
167
269
|
while true
|
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
|
|
215
317
|
Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
|
216
318
|
end
|
217
319
|
|
320
|
+
def check_main_alive
|
321
|
+
# Dont check before we got first response
|
322
|
+
return if @main_start_time == nil
|
323
|
+
conn = Cnvrg::Helpers::Executer.get_main_conn
|
324
|
+
response = conn.get('readiness')
|
325
|
+
if response.to_hash[:status].to_i != 200
|
326
|
+
main_start_time = 0
|
327
|
+
else
|
328
|
+
main_start_time = response.body.to_i
|
329
|
+
end
|
330
|
+
if main_start_time != @main_start_time
|
331
|
+
puts("Found that main restarted, restarting agent")
|
332
|
+
Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
|
333
|
+
exit(1)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
218
337
|
def get_pod_events(pod_name)
|
219
338
|
return if pod_name.blank?
|
220
339
|
`kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
|
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
|
|
224
343
|
return if node_name.blank?
|
225
344
|
`kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
|
226
345
|
end
|
346
|
+
|
347
|
+
def self.main_container_url
|
348
|
+
if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
|
349
|
+
if ENV["MAIN_CONTAINER_PORT"].blank?
|
350
|
+
host = "slave"
|
351
|
+
else
|
352
|
+
host = "main"
|
353
|
+
end
|
354
|
+
"http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
355
|
+
else
|
356
|
+
"http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
def self.get_main_conn(timeout: 4, open_timeout: 1)
|
361
|
+
conn = Faraday.new(
|
362
|
+
url: Cnvrg::Helpers::Executer.main_container_url,
|
363
|
+
headers: {'Content-Type' => 'application/json'}
|
364
|
+
)
|
365
|
+
conn.options.timeout = timeout
|
366
|
+
conn.options.open_timeout = open_timeout
|
367
|
+
conn
|
368
|
+
end
|
227
369
|
end
|
data/lib/cnvrg/job_ssh.rb
CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
|
|
5
5
|
method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
|
6
6
|
method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
|
7
7
|
method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
|
8
|
+
method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
|
9
|
+
method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
|
8
10
|
method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
|
9
11
|
def start(job_id)
|
12
|
+
no_auth = options["no_auth"]
|
10
13
|
Cnvrg::CLI.new.log_start(__method__, args, options)
|
11
14
|
@job_ssh = ConnectJobSsh.new(job_id)
|
12
|
-
@job_ssh.start(options['username'], options['password'])
|
15
|
+
@job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
|
13
16
|
pod_name = nil
|
14
17
|
namespace = "cnvrg"
|
15
18
|
ssh_ready = false
|
19
|
+
internal_port = options['internal_port']
|
16
20
|
while not ssh_ready
|
17
21
|
resp = @job_ssh.status()
|
18
22
|
status = resp["ssh_status"]
|
@@ -26,13 +30,14 @@ module Cnvrg
|
|
26
30
|
username = resp["username"]
|
27
31
|
pod_name = resp["pod_name"]
|
28
32
|
namespace = resp["namespace"]
|
33
|
+
internal_port = resp["port"] || internal_port
|
29
34
|
ssh_ready = true
|
30
35
|
else
|
31
36
|
puts("Failed to start ssh")
|
32
37
|
break
|
33
38
|
end
|
34
39
|
end
|
35
|
-
if pod_name.blank? or password.blank? or username.blank?
|
40
|
+
if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
|
36
41
|
puts("Failed to get required params")
|
37
42
|
return
|
38
43
|
end
|
@@ -41,8 +46,8 @@ module Cnvrg
|
|
41
46
|
puts("host: 127.0.0.1")
|
42
47
|
puts("port: #{options["port"]}")
|
43
48
|
puts("username: #{username}")
|
44
|
-
puts("password: #{password}")
|
45
|
-
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
|
49
|
+
puts("password: #{password}") unless no_auth
|
50
|
+
@job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
|
46
51
|
end
|
47
52
|
end
|
48
53
|
end
|
data/lib/cnvrg/project.rb
CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
|
|
328
328
|
end
|
329
329
|
|
330
330
|
def get_storage_client
|
331
|
-
|
332
|
-
|
333
|
-
|
331
|
+
client_params = nil
|
332
|
+
i = 0
|
333
|
+
begin
|
334
|
+
response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
|
335
|
+
unless Cnvrg::CLI.is_response_success(response, false)
|
336
|
+
raise StandardError.new("Can't find project credentials")
|
337
|
+
end
|
334
338
|
client_params = response['client']
|
335
|
-
|
336
|
-
|
339
|
+
rescue StandardError
|
340
|
+
i += 1
|
341
|
+
sleep(5 * i)
|
342
|
+
retry if i < 10
|
337
343
|
client_params = get_storage_client_fallback
|
338
344
|
end
|
339
|
-
|
345
|
+
raise StandardError.new("Can't find project credentials") unless client_params
|
340
346
|
Cnvrg::Downloader::Client.factory(client_params)
|
341
347
|
end
|
342
348
|
|
@@ -378,14 +384,18 @@ module Cnvrg
|
|
378
384
|
[]
|
379
385
|
end
|
380
386
|
|
381
|
-
def generate_output_dir(output_dir)
|
387
|
+
def generate_output_dir(output_dir, local: false)
|
382
388
|
Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
|
383
389
|
upload_list = []
|
390
|
+
list = []
|
384
391
|
list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
|
392
|
+
if local
|
393
|
+
list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
|
394
|
+
end
|
395
|
+
list.uniq!
|
385
396
|
Parallel.map(list, in_threads: IDXParallelThreads) do |e|
|
386
397
|
next if e.end_with? "/."
|
387
398
|
if File.directory? e
|
388
|
-
|
389
399
|
upload_list << e + "/"
|
390
400
|
else
|
391
401
|
upload_list << e
|
@@ -447,6 +457,10 @@ module Cnvrg
|
|
447
457
|
if list_ignore_new.include? label
|
448
458
|
next
|
449
459
|
end
|
460
|
+
if File.symlink?(e)
|
461
|
+
Cnvrg::Logger.log_info("Skipping symlink #{e}")
|
462
|
+
next
|
463
|
+
end
|
450
464
|
if File.directory? e
|
451
465
|
dir_name = (label.ends_with? "/") ? label : (label + "/")
|
452
466
|
tree_idx[dir_name] = nil
|
@@ -647,7 +661,11 @@ module Cnvrg
|
|
647
661
|
|
648
662
|
def fetch_webapp_slugs(webapp_slug, slugs: nil)
|
649
663
|
response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
|
650
|
-
|
664
|
+
|
665
|
+
if response.key?("experiments")
|
666
|
+
return response["experiments"]
|
667
|
+
end
|
668
|
+
return response["data"]["attributes"]["experiments"]
|
651
669
|
rescue
|
652
670
|
slugs
|
653
671
|
end
|
@@ -699,8 +717,11 @@ module Cnvrg
|
|
699
717
|
res = JSON.parse(resp['result']) rescue nil
|
700
718
|
return if res.blank?
|
701
719
|
config = self.get_config
|
702
|
-
config[:is_git] = res['git']
|
703
720
|
config[:project_name] = res['title']
|
721
|
+
config[:project_slug] = @slug
|
722
|
+
config[:owner] = @owner
|
723
|
+
config[:git] = res['git'] || false
|
724
|
+
config[:is_git] = res['git'] || false
|
704
725
|
self.set_config(config)
|
705
726
|
end
|
706
727
|
|
data/lib/cnvrg/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Cnvrg
|
2
|
-
VERSION = '
|
3
|
-
end
|
2
|
+
VERSION = '2.0.13'
|
3
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cnvrg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yochay Ettun
|
8
8
|
- Leah Kolben
|
9
|
-
|
9
|
+
- Omer Shacham
|
10
|
+
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date: 2021-
|
13
|
+
date: 2021-10-27 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: bundler
|
@@ -321,6 +322,20 @@ dependencies:
|
|
321
322
|
- - "~>"
|
322
323
|
- !ruby/object:Gem::Version
|
323
324
|
version: 0.1.1
|
325
|
+
- !ruby/object:Gem::Dependency
|
326
|
+
name: filewatch
|
327
|
+
requirement: !ruby/object:Gem::Requirement
|
328
|
+
requirements:
|
329
|
+
- - "~>"
|
330
|
+
- !ruby/object:Gem::Version
|
331
|
+
version: 0.9.0
|
332
|
+
type: :runtime
|
333
|
+
prerelease: false
|
334
|
+
version_requirements: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - "~>"
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: 0.9.0
|
324
339
|
- !ruby/object:Gem::Dependency
|
325
340
|
name: parallel
|
326
341
|
requirement: !ruby/object:Gem::Requirement
|
@@ -458,7 +473,7 @@ files:
|
|
458
473
|
homepage: https://cnvrg.io
|
459
474
|
licenses: []
|
460
475
|
metadata: {}
|
461
|
-
post_install_message:
|
476
|
+
post_install_message:
|
462
477
|
rdoc_options: []
|
463
478
|
require_paths:
|
464
479
|
- lib
|
@@ -473,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
473
488
|
- !ruby/object:Gem::Version
|
474
489
|
version: '0'
|
475
490
|
requirements: []
|
476
|
-
rubygems_version: 3.
|
477
|
-
signing_key:
|
491
|
+
rubygems_version: 3.2.22
|
492
|
+
signing_key:
|
478
493
|
specification_version: 4
|
479
494
|
summary: A CLI tool for interacting with cnvrg.io.
|
480
495
|
test_files: []
|