cnvrg 1.11.31 → 2.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a31d199ccd8981e455384ced40cc63a0a9a687982338cbc85e3a9413c2968a2c
4
- data.tar.gz: 10d4da464f6ef7b507d563e2976cf17f5d6cd8bcfeab742c83ec5bfa1ab575e2
3
+ metadata.gz: 0e2e22e961bc723442ec80d457f0dde2b982f1c75beb2e859bb5523e83682fbe
4
+ data.tar.gz: 645a4add593bcf3a63be4d64b4ccb45000e39447e7e11b9fd674347856e2bf97
5
5
  SHA512:
6
- metadata.gz: 2495cc0d77ada40ae2c6da54ada72710f28facba737b0a37c0dd2cb904d37d32dd23d2cba476fb3854ec47e19eaa11fb34913c2038d6ea059378ad0d460e905a
7
- data.tar.gz: dbe50928a5d25259f78fc1dcdab474545b71308b7e1d798678fb2cd8786f0b0b2eda78062bcef6bcfdb16c9481bc47349c6f571de9cfe89f547c57f583bea403
6
+ metadata.gz: 2af7a28b7b428f8b4066a1b3f01bada7273eaa93ea847f87badc253aea2123244ad27ab3804cda5f4cdcee91cc2cc0ed7dd168b347839f59a6c183a702438167
7
+ data.tar.gz: dc151c1ab54d54a86648b9f1341d402d0eb6fb8d578cc0a01f2594a28a34bfb8d97676d07aa389a8b1155fa466845f87be9b1dffbfcc2bcda33376937ecef418
data/Readme.md CHANGED
@@ -18,4 +18,45 @@
18
18
  ## Version v1.11.30
19
19
  2021-04-06
20
20
  ## Version v1.11.31
21
- 2021-04-22
21
+ 2021-04-22
22
+ ## Version v1.11.32
23
+ 2021-05-05
24
+ * DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
25
+ ## Version v2.0.1
26
+ 2021-06-13
27
+ ## Version v2.0.2
28
+ 2021-06-16
29
+ * DEV-9694 - Bug: Download artifacts fails on authorization error
30
+ ## Version v2.0.3
31
+ 2021-06-29
32
+ * DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
33
+ ## Version v2.0.4
34
+ 2021-07-08
35
+ * DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
36
+ ## Version v2.0.5
37
+ 2021-07-11
38
+ * DEV-10171 - Bug: experiment randomly fails with error - "Couldn't clone artifacts"
39
+ * DEV-10189 - Bug: CLI Sync - file/folder with broken symlink will cause sync to fail
40
+ ## Version v2.0.6
41
+ 2021-07-18
42
+ * DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
43
+ ## Version v2.0.7
44
+ 2021-07-27
45
+ * DEV-10186 - Bug: CLI/run an experiment with --local tag gives server error
46
+ ## Version v2.0.8
47
+ 2021-09-06
48
+ * DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
49
+ ## Version v2.0.9
50
+ 2021-09-12
51
+ * DEV-10502 - Bug: Periodic sync stuck
52
+ ## Version v2.0.10
53
+ 2021-09-12
54
+ * DEV-10502 - Bug: Periodic sync stuck
55
+ ## Version v2.0.11
56
+ 2021-10-21
57
+ ## Version v2.0.12
58
+ 2021-10-25
59
+ * DEV-11544 - Sub-bug: local experiment is failing to run
60
+ ## Version v2.0.13
61
+ 2021-10-27
62
+ * DEV-11054 - Task: Create organization and user by default
data/cnvrg.gemspec CHANGED
@@ -6,7 +6,7 @@ require 'cnvrg/version'
6
6
  Gem::Specification.new do |spec|
7
7
  spec.name = 'cnvrg'
8
8
  spec.version = Cnvrg::VERSION
9
- spec.authors = ['Yochay Ettun', 'Leah Kolben']
9
+ spec.authors = ['Yochay Ettun', 'Leah Kolben', 'Omer Shacham']
10
10
  spec.email = ['info@cnvrg.io']
11
11
  spec.summary = %q{A CLI tool for interacting with cnvrg.io.}
12
12
  spec.description = %q{A CLI tool for interacting with cnvrg.io.}
@@ -39,6 +39,7 @@ Gem::Specification.new do |spec|
39
39
  spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
40
40
  spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
41
41
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
42
+ spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
42
43
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
43
44
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
44
45
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
data/lib/cnvrg/api.rb CHANGED
@@ -57,6 +57,7 @@ module Cnvrg
57
57
  conn = Faraday.new "#{endpoint_uri}"
58
58
  end
59
59
  conn.headers['Auth-Token'] = @pass
60
+ conn.headers['Authorization'] = "CAPI #{@pass}"
60
61
  conn.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
61
62
  conn.options.timeout = 420
62
63
  conn.options.open_timeout=180
@@ -72,11 +73,11 @@ module Cnvrg
72
73
  if response.to_hash[:status].to_i != 200
73
74
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
74
75
  end
75
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
76
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
76
77
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
77
78
  success = false
78
79
  sleep(5 * retries)
79
- retries +=1
80
+ retries += 1
80
81
  next
81
82
  end
82
83
  rescue => e
@@ -112,11 +113,11 @@ module Cnvrg
112
113
  if response.to_hash[:status].to_i != 200
113
114
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
114
115
  end
115
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
116
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
116
117
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
117
118
  success = false
118
119
  sleep(5 * retries)
119
- retries +=1
120
+ retries += 1
120
121
  next
121
122
  end
122
123
  rescue => e
@@ -169,6 +170,7 @@ module Cnvrg
169
170
  when 'POST_FILE'
170
171
  conn = Faraday.new do |fr|
171
172
  fr.headers['Auth-Token'] = @pass
173
+ fr.headers['Authorization'] = "CAPI #{@pass}"
172
174
  fr.headers['User-Agent'] = "#{Cnvrg::API::USER_AGENT}"
173
175
  fr.headers["Content-Type"] = "multipart/form-data"
174
176
  if !Helpers.is_verify_ssl
data/lib/cnvrg/api_v2.rb CHANGED
@@ -22,6 +22,7 @@ module Cnvrg
22
22
 
23
23
  conn = Faraday.new endpoint_uri, :ssl => {:verify => !!Helpers.is_verify_ssl}
24
24
  conn.headers['Auth-Token'] = pass
25
+ conn.headers['Authorization'] = "CAPI #{pass}"
25
26
  conn.headers['User-Agent'] = Cnvrg::API::USER_AGENT
26
27
  conn.headers['Content-Type'] = "application/json"
27
28
  conn.options.timeout = 420
data/lib/cnvrg/auth.rb CHANGED
@@ -44,7 +44,7 @@ module Cnvrg
44
44
  end
45
45
  end
46
46
 
47
- def sign_in(email, password)
47
+ def sign_in(email, password, token: nil)
48
48
  url = Cnvrg::API.endpoint_uri()
49
49
  url = URI.parse(url+ "/users/sign_in")
50
50
  http = Net::HTTP.new(url.host, url.port)
@@ -61,6 +61,9 @@ module Cnvrg
61
61
 
62
62
  req.add_field("EMAIL", email)
63
63
  req.add_field("PASSWORD", password)
64
+ if token.present?
65
+ req.add_field("Authorization", "CAPI #{token}")
66
+ end
64
67
 
65
68
  response = http.request(req)
66
69
 
@@ -1,7 +1,7 @@
1
1
  module Cnvrg
2
2
  class LibraryCli < SubCommandBase
3
-
4
- desc "library import", ''
3
+ map push: :import
4
+ desc "library push", 'Push a new library to AI Library'
5
5
  def import
6
6
  unless File.exists? "library.yml"
7
7
  Cnvrg::CLI.log_message("Can't find library.yml", 'red')
data/lib/cnvrg/cli.rb CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
173
173
  desc "data [COMMAND]", "Upload and manage datasets", :hide => false
174
174
  subcommand "data", Data
175
175
 
176
- desc "job", "manage running jobs", :hide => false
176
+ desc "job", "manage running jobs", :hide => true
177
177
  subcommand "job", JobCli
178
178
 
179
179
  desc "ssh", "ssh into running jobs", :hide => false
@@ -415,7 +415,7 @@ module Cnvrg
415
415
  end
416
416
  end
417
417
 
418
- desc 'set_compression_path', 'Set compression path'
418
+ desc 'set_compression_path', 'Set compression path', :hide => true
419
419
  method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
420
420
 
421
421
  def set_compression_path(*compression_path)
@@ -496,8 +496,10 @@ module Cnvrg
496
496
 
497
497
 
498
498
  desc 'login', 'Authenticate with cnvrg.io platform'
499
+ method_option :sso, :type => :boolean, :aliases => ["-s", "--sso"], :default => false
499
500
 
500
501
  def login
502
+ use_token = options["sso"]
501
503
  begin
502
504
  log_handler()
503
505
  log_start(__method__, args, options)
@@ -515,12 +517,21 @@ module Cnvrg
515
517
  exit(0)
516
518
  end
517
519
  @email = ask("Enter your email:")
518
- password = cmd.ask("Enter your password (hidden):") {|q| q.echo = "*"}
519
- result = @auth.sign_in(@email, password)
520
+ if use_token
521
+ @token = cmd.ask("Enter your token (hidden):") {|q| q.echo = "*"}
522
+ netrc[Cnvrg::Helpers.netrc_domain] = @email, @token
523
+ netrc.save
524
+ password = ""
525
+ else
526
+ password = cmd.ask("Enter your password (hidden):") {|q| q.echo = "*"}
527
+ end
528
+ result = @auth.sign_in(@email, password, token: @token)
520
529
 
521
530
  if !result["token"].nil?
522
- netrc[Cnvrg::Helpers.netrc_domain] = @email, result["token"]
523
- netrc.save
531
+ unless use_token
532
+ netrc[Cnvrg::Helpers.netrc_domain] = @email, result["token"]
533
+ netrc.save
534
+ end
524
535
 
525
536
  log_message("Authenticated successfully as #{@email}", Thor::Shell::Color::GREEN)
526
537
 
@@ -2311,6 +2322,7 @@ module Cnvrg
2311
2322
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2312
2323
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2313
2324
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
2325
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
2314
2326
 
2315
2327
  def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
2316
2328
  begin
@@ -2328,6 +2340,8 @@ module Cnvrg
2328
2340
  exp_obj = nil
2329
2341
  end
2330
2342
 
2343
+ local = options["local"]
2344
+
2331
2345
  commit_msg = options["message"]
2332
2346
  if commit_msg.nil? or commit_msg.empty?
2333
2347
  commit_msg = ""
@@ -2349,7 +2363,7 @@ module Cnvrg
2349
2363
  if git_output_dir.ends_with? "/"
2350
2364
  git_output_dir = git_output_dir[0..-2]
2351
2365
  end
2352
- list = @project.generate_output_dir(git_output_dir)
2366
+ list = @project.generate_output_dir(git_output_dir, local: local)
2353
2367
  end
2354
2368
  list += @project.generate_git_diff if options["git_diff"]
2355
2369
  spec_files_to_upload = list
@@ -2668,7 +2682,7 @@ module Cnvrg
2668
2682
  end
2669
2683
  end
2670
2684
 
2671
- desc 'commit before termination', 'Commit job code before termination'
2685
+ desc 'commit before termination', 'Commit job code before termination', :hide => true
2672
2686
  def commit_before_termination()
2673
2687
  job_type = ENV['CNVRG_JOB_TYPE']
2674
2688
  job_id = ENV['CNVRG_JOB_ID']
@@ -2678,7 +2692,7 @@ module Cnvrg
2678
2692
  log_error(e)
2679
2693
  end
2680
2694
 
2681
- desc 'update_job_commit', 'Update job with its last commit'
2695
+ desc 'update_job_commit', 'Update job with its last commit' , :hide => true
2682
2696
  def update_job_commit()
2683
2697
  job_type = ENV['CNVRG_JOB_TYPE']
2684
2698
  job_id = ENV['CNVRG_JOB_ID']
@@ -2868,7 +2882,7 @@ module Cnvrg
2868
2882
 
2869
2883
 
2870
2884
 
2871
- desc 'jump', 'Jump to specific commit'
2885
+ desc 'jump COMMIT_ID', 'Jump to specific commit'
2872
2886
  def jump(commit_sha1)
2873
2887
  begin
2874
2888
  verify_logged_in()
@@ -3003,11 +3017,12 @@ module Cnvrg
3003
3017
  method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
3004
3018
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
3005
3019
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
3006
- method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
3020
+ method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
3007
3021
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
3008
3022
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
3009
3023
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
3010
3024
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
3025
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
3011
3026
 
3012
3027
  def sync(direct = true)
3013
3028
  verify_logged_in(true) if direct
@@ -3030,10 +3045,10 @@ module Cnvrg
3030
3045
  if run_download or options['debug_mode']
3031
3046
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
3032
3047
  end
3033
- invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3048
+ invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3034
3049
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
3035
3050
  :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
3036
- :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
3051
+ :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
3037
3052
 
3038
3053
  end
3039
3054
 
@@ -3143,7 +3158,7 @@ module Cnvrg
3143
3158
  invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
3144
3159
  :log => log, :email_notification => email_notification, :upload_output => upload_output,
3145
3160
  :commit => commit, :image => image, :data => data, :data_commit => data_commit,
3146
- :ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
3161
+ :ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
3147
3162
  return
3148
3163
  end
3149
3164
  else
@@ -3200,6 +3215,7 @@ module Cnvrg
3200
3215
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3216
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3202
3217
  method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3218
+ method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
3203
3219
 
3204
3220
  def exec(*cmd)
3205
3221
  log = []
@@ -3224,6 +3240,7 @@ module Cnvrg
3224
3240
  project_home = get_project_home
3225
3241
  data_query = options["data_query"]
3226
3242
  docker_stats = options["docker_stats"]
3243
+ local = options[:local] || false
3227
3244
  @project = Project.new(project_home)
3228
3245
  if @project.is_git
3229
3246
  sync_before = false
@@ -3316,62 +3333,53 @@ module Cnvrg
3316
3333
  end
3317
3334
  end
3318
3335
  start_time = Time.now
3319
- shell_type = options["use_bash"] ? "bash -l" : "sh"
3320
3336
  if @exp.get_cmd.present?
3321
3337
  cmd = @exp.get_cmd
3322
- if options["docker_id"].present? # Escape for docker exec
3323
- cmd = cmd.gsub("\"", "\\\"")
3324
- end
3325
3338
  end
3326
- if options["docker_id"].present?
3327
- cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3328
- end
3329
- PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3330
- begin
3331
- stdout.each do |line|
3332
- cur_time = Time.now
3333
- real_time = Time.now - real
3334
- cur_log = {time: cur_time,
3335
- message: line,
3336
- type: "stdout",
3337
- real: real_time
3338
- }
3339
- if print_log
3340
- puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
3339
+
3340
+ if local
3341
+ exec_local(cmd, print_log, start_commit, real, start_time)
3342
+ exit_status = $?.exitstatus
3343
+
3344
+ else
3345
+ command_slug = (0...18).map { (65 + rand(26)).chr }.join
3346
+ result_file = "/conf/result-#{command_slug}"
3347
+ data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
3348
+ conn = Cnvrg::Helpers::Executer.get_main_conn
3349
+ response = conn.post('command', data.to_json)
3350
+ if response.to_hash[:status].to_i != 200
3351
+ exit_status = 129
3352
+ raise StandardError.new("Cant send command to slave")
3353
+ end
3354
+ t = FileWatch::Tail.new
3355
+ filename = result_file
3356
+ lines = []
3357
+ t.tail(filename)
3358
+ t.subscribe do |path, line|
3359
+ begin
3360
+ cur_log = JSON.parse(line)
3361
+ if cur_log["type"] == "endMessage"
3362
+ exit_status = cur_log["real"].to_i
3363
+ break
3364
+ else
3365
+ puts(cur_log.to_json)
3366
+ STDOUT.flush
3367
+ cur_log["time"] = Time.parse(cur_log["timestamp"])
3368
+ cur_log["message"] = cur_log["message"].to_s + "\r\n"
3369
+ log << cur_log
3341
3370
  end
3342
- log << cur_log
3343
3371
  if log.size >= 10
3344
- @exp.upload_temp_log(log) unless log.empty?
3372
+ @exp.upload_temp_log(log)
3345
3373
  log = []
3346
- elsif (start_time + 15.seconds) <= Time.now
3374
+ elsif (start_time + 15.seconds) <= Time.now
3347
3375
  @exp.upload_temp_log(log) unless log.empty?
3348
3376
  log = []
3349
3377
  start_time = Time.now
3350
3378
  end
3379
+ rescue => e
3380
+ log_error(e)
3351
3381
  end
3352
- if stderr
3353
- stderr.each do |err|
3354
- log << {time: Time.now, message: err, type: "stderr"}
3355
- end
3356
- end
3357
- rescue Errno::EIO => e
3358
- log_error(e)
3359
- if !log.empty?
3360
- temp_log = log
3361
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3362
- log -= temp_log
3363
- end
3364
- rescue Errno::ENOENT => e
3365
- exp_success = false
3366
- log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
3367
- log_error(e)
3368
- rescue => e
3369
- res = @exp.end(log, 1, start_commit, 0, 0)
3370
- log_message("Error occurred,aborting", Thor::Shell::Color::RED)
3371
- log_error(e)
3372
- exit(0)
3373
3382
  end
3374
- ::Process.wait pid
3375
3383
  end
3376
3384
  end_time = Time.now
3377
3385
  process_running = false
@@ -3379,14 +3387,13 @@ module Cnvrg
3379
3387
  if !log.empty?
3380
3388
 
3381
3389
  temp_log = log
3382
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3390
+ @exp.upload_temp_log(temp_log)
3383
3391
  log -= temp_log
3384
3392
  end
3385
3393
 
3386
3394
  cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
3387
3395
  memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
3388
- exit_status = $?.exitstatus
3389
- if $?.exitstatus != 0
3396
+ if exit_status != 0
3390
3397
  exp_success = false
3391
3398
  end
3392
3399
 
@@ -3430,7 +3437,6 @@ module Cnvrg
3430
3437
  if @exp
3431
3438
  # log_thread.join
3432
3439
  Thread.kill(stats_thread) if docker_stats
3433
- exit_status = $?.exitstatus
3434
3440
  if exit_status.blank?
3435
3441
  exit_status = "-1"
3436
3442
  end
@@ -3443,8 +3449,6 @@ module Cnvrg
3443
3449
 
3444
3450
  exit(1)
3445
3451
  end
3446
-
3447
-
3448
3452
  end
3449
3453
 
3450
3454
  end
@@ -3689,7 +3693,7 @@ module Cnvrg
3689
3693
  end
3690
3694
  end
3691
3695
 
3692
- desc 'deploy', 'Deploys model to production'
3696
+ desc 'deploy', 'Deploys model to production', :hide => true
3693
3697
  method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
3694
3698
  method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
3695
3699
  method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
@@ -3778,7 +3782,7 @@ module Cnvrg
3778
3782
  method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
3779
3783
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
3780
3784
 
3781
- desc 'notebook', 'Starts a notebook session remotely or locally'
3785
+ desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
3782
3786
 
3783
3787
  def notebook
3784
3788
  verify_logged_in(true)
@@ -3905,7 +3909,7 @@ module Cnvrg
3905
3909
  end
3906
3910
  end
3907
3911
 
3908
- desc 'remote_notebook', 'Run notebook server on remote server'
3912
+ desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
3909
3913
  method_option :machine_type, :type => :string, :default => ""
3910
3914
  method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
3911
3915
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
@@ -4264,7 +4268,7 @@ module Cnvrg
4264
4268
 
4265
4269
  end
4266
4270
 
4267
- desc 'notebook_stop', 'Starts a new notebook environment'
4271
+ desc 'notebook_stop', 'Stop notebook', :hide => true
4268
4272
  method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
4269
4273
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
4270
4274
  method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
@@ -4651,10 +4655,16 @@ module Cnvrg
4651
4655
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4652
4656
  method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4653
4657
  method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4658
+ method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
4659
+ method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
4660
+ method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
4654
4661
 
4655
4662
  def collect_metrics
4656
4663
  @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4657
4664
  prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4665
+ prom_user = options[:prom_user]
4666
+ prom_password = options[:prom_password]
4667
+ name = options[:name]
4658
4668
 
4659
4669
  translate_result = Cnvrg::API_V2.request(
4660
4670
  "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
@@ -4679,9 +4689,16 @@ module Cnvrg
4679
4689
  next
4680
4690
  end
4681
4691
  uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4682
- resp = Net::HTTP.get(uri)
4692
+ http = Net::HTTP.new(uri.host, uri.port)
4693
+ http.use_ssl = uri.scheme == "https"
4694
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
4695
+ req = Net::HTTP::Get.new uri.request_uri
4696
+ if prom_user.present?
4697
+ req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
4698
+ end
4699
+ resp = http.request(req)
4683
4700
  begin
4684
- result = JSON.parse(resp)
4701
+ result = JSON.parse(resp.body)
4685
4702
  rescue JSON::ParserError => e
4686
4703
  log_error(e)
4687
4704
  next
@@ -4690,13 +4707,22 @@ module Cnvrg
4690
4707
  next unless data_result
4691
4708
 
4692
4709
  if data_result.size > 1
4693
- stats[query_name] = {}
4710
+ stats[query_name] = {} unless query_name.include? 'block'
4694
4711
  data_result.each_with_index do |res, i|
4695
4712
  timestamp, value = res["value"]
4696
4713
  uuid = res["metric"]["UUID"].presence || i
4697
4714
  stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4698
4715
  stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4699
- stats[query_name][uuid] = stat_value
4716
+ if query_name.include? 'block'
4717
+ uuid = res["metric"]["interface"].presence || i
4718
+ uuid = "#{name}-#{uuid}" if name.present?
4719
+ stats['block_io'] = {} if stats['block_io'].blank?
4720
+ io_type = query_name.split('_')[1]
4721
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4722
+ stats['block_io'][io_type].merge!({ uuid => stat_value })
4723
+ else
4724
+ stats[query_name][uuid] = stat_value
4725
+ end
4700
4726
  end
4701
4727
  else
4702
4728
  timestamp, value = data_result&.first&.dig('value')
@@ -4705,9 +4731,14 @@ module Cnvrg
4705
4731
  if query_name.include? 'block'
4706
4732
  stats['block_io'] = {} if stats['block_io'].blank?
4707
4733
  io_type = query_name.split('_')[1]
4708
- stats['block_io'].merge!({ io_type => stat_value })
4734
+ if name.present?
4735
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4736
+ stats['block_io'][io_type].merge!({ name => stat_value })
4737
+ else
4738
+ stats['block_io'].merge!({ io_type => stat_value })
4739
+ end
4709
4740
  else
4710
- stats[query_name] = stat_value
4741
+ stats[query_name] = name.present? ? { name => stat_value } : stat_value
4711
4742
  end
4712
4743
  end
4713
4744
  end
@@ -4751,7 +4782,7 @@ module Cnvrg
4751
4782
  end
4752
4783
 
4753
4784
 
4754
- desc '', ''
4785
+ desc '', '', :hide => true
4755
4786
 
4756
4787
  def download_built_image(image_name, image_slug)
4757
4788
  begin
@@ -4995,7 +5026,7 @@ module Cnvrg
4995
5026
  end
4996
5027
  end
4997
5028
 
4998
- desc 'experiments', 'List project experiments'
5029
+ desc 'experiments', 'List project experiments', :hide => true
4999
5030
  method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
5000
5031
  method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
5001
5032
 
@@ -5864,6 +5895,57 @@ module Cnvrg
5864
5895
  end
5865
5896
  end
5866
5897
 
5898
+ def exec_local(cmd , print_log, start_commit, real, start_time)
5899
+ log = []
5900
+ PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
5901
+ begin
5902
+ stdout.each do |line|
5903
+ cur_time = Time.now
5904
+ real_time = Time.now - real
5905
+ cur_log = {time: cur_time,
5906
+ message: line,
5907
+ type: "stdout",
5908
+ real: real_time
5909
+ }
5910
+ if print_log
5911
+ puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
5912
+ end
5913
+ log << cur_log
5914
+ if log.size >= 10
5915
+ @exp.upload_temp_log(log) unless log.empty?
5916
+ log = []
5917
+ elsif (start_time + 15.seconds) <= Time.now
5918
+ @exp.upload_temp_log(log) unless log.empty?
5919
+ log = []
5920
+ start_time = Time.now
5921
+ end
5922
+ end
5923
+ if stderr
5924
+ stderr.each do |err|
5925
+ log << {time: Time.now, message: err, type: "stderr"}
5926
+ end
5927
+ end
5928
+ rescue Errno::EIO => e
5929
+ log_error(e)
5930
+ if !log.empty?
5931
+ temp_log = log
5932
+ @exp.upload_temp_log(temp_log) unless temp_log.empty?
5933
+ log -= temp_log
5934
+ end
5935
+ rescue Errno::ENOENT => e
5936
+ exp_success = false
5937
+ log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
5938
+ log_error(e)
5939
+ rescue => e
5940
+ res = @exp.end(log, 1, start_commit, 0, 0)
5941
+ log_message("Error occurred,aborting", Thor::Shell::Color::RED)
5942
+ log_error(e)
5943
+ exit(0)
5944
+ end
5945
+ ::Process.wait pid
5946
+ end
5947
+ end
5948
+
5867
5949
  end
5868
5950
  end
5869
5951
 
@@ -10,20 +10,20 @@ module Cnvrg
10
10
  Cnvrg::Logger.log_info("cnvrg is not configured")
11
11
  end
12
12
 
13
- def start(username, password)
14
- Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
13
+ def start(username, password, no_auth, port: nil)
14
+ Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
15
15
  end
16
16
 
17
17
  def status()
18
18
  Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
19
19
  end
20
20
 
21
- def run_portforward_command(pod_name, port, kubeconfig, namespace)
21
+ def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
22
22
  command = "kubectl"
23
23
  if kubeconfig.present?
24
24
  command = "kubectl --kubeconfig=#{kubeconfig}"
25
25
  end
26
- bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:22"
26
+ bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
27
27
  puts("\nrunning command #{bashCommand}")
28
28
  `#{bashCommand}`
29
29
  end
data/lib/cnvrg/files.rb CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
106
106
  commit: commit_sha1
107
107
  })
108
108
  unless Cnvrg::CLI.is_response_success(resp, false)
109
- raise SignalException.new("Cant upload files to the server.")
109
+ raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
110
110
  end
111
111
  # resolve bucket
112
112
  res = resp['result']
@@ -730,7 +730,11 @@ module Cnvrg
730
730
  end
731
731
  res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
732
732
  unless Cnvrg::CLI.is_response_success(res, false)
733
- raise SignalException.new("Cant download files from the server.")
733
+ begin
734
+ puts(res)
735
+ rescue
736
+ end
737
+ raise StandardError.new("Cant download files from the server.")
734
738
  end
735
739
  self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
736
740
  end
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
18
18
  #### params
19
19
  def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
20
20
  @executer = executer
21
+ @job_id = ENV["CNVRG_JOB_ID"]
21
22
  @slug = slug
22
23
  @files_exist = files_exist
23
24
  @container_name = container_name
24
- @run_in_slave = @container_name.downcase == "slave"
25
+ @is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
26
+ @main_name = @is_new_main ? "main" : "slave"
27
+ @run_in_main = @container_name.downcase == @main_name
25
28
  @log_interval = send_log_interval
26
29
  # https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
27
30
  if timeout.blank? or timeout.negative?
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
37
40
  @sleep_before_retry = sleep_before_retry
38
41
  @real_execution_retries = 0 ## How many times the command really executed until success
39
42
  @single_quotes = single_quotes
40
- @docker_user = ""
41
- @shell_type = use_bash ? "bash -l" : "sh"
42
- if docker_user.present?
43
- @docker_user = " --user #{docker_user}"
44
- end
45
- if @run_in_slave
46
- if @single_quotes
47
- @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
48
- else
49
- @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
50
- end
51
- end
43
+ @docker_user = docker_user
44
+ @use_bash = use_bash
52
45
  @output = []
53
46
  @errors = []
54
47
  @exit_status = nil
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
75
68
 
76
69
  def exec!
77
70
  log_internal("Command: #{@command} with slug: #{@slug} started!")
71
+ command_status = Status::FINISHED
78
72
  if @command.blank?
79
73
  @exit_status = 0
74
+ command_status = Status::ABORTED
80
75
  elsif should_run?
81
76
  send_logs(status: Status::STARTED)
82
77
  periodic_thread_handle = periodic_thread
83
78
  execute_command
84
79
  else
80
+ command_status = Status::ABORTED
85
81
  @exit_status = 127
86
82
  end
87
83
  finish_log = "Command: #{@command} with slug: #{@slug} finished"
88
84
  finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
89
85
  log_internal(finish_log)
90
- send_logs(exit_status: @exit_status, status: Status::FINISHED)
86
+ send_logs(exit_status: @exit_status, status: command_status)
91
87
  if periodic_thread_handle.present?
92
88
  periodic_thread_handle.join
93
89
  end
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
117
113
  execute_command
118
114
  end
119
115
 
116
+ def execute_command_on_slave
117
+ extra_slug = (0...2).map { (65 + rand(26)).chr }.join
118
+ result_file = "/conf/result-#{@slug}-#{extra_slug}"
119
+ Timeout.timeout(@timeout) do
120
+ data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
121
+ conn = Cnvrg::Helpers::Executer.get_main_conn
122
+ response = conn.post('command', data.to_json)
123
+ if response.to_hash[:status].to_i != 200
124
+ @exit_status = 129
125
+ raise StandardError.new("Cant send command to slave")
126
+ end
127
+ t = FileWatch::Tail.new
128
+ filename = result_file
129
+ t.tail(filename)
130
+ t.subscribe do |path, line|
131
+ if line.include?("cnvrg-exit-code")
132
+ @exit_status = line.split("=")[1].to_i
133
+ break
134
+ end
135
+ if !@is_new_main
136
+ log_internal(line, level: LogLevel::PURE)
137
+ end
138
+ line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
139
+ @output << {log: line, timestamp: Time.now}
140
+ end
141
+ end
142
+ rescue Timeout::Error
143
+ @errors << {log: "Command timed out!", timestamp: Time.now}
144
+ log_internal("Command timed out!", level: LogLevel::ERROR)
145
+ @exit_status = 124
146
+ ensure
147
+ retry_command if @retries != 0 and @exit_status !=0
148
+ @exit_status
149
+ end
150
+
120
151
  def execute_command
152
+ return execute_command_on_slave if @run_in_main
121
153
  Timeout.timeout(@timeout) do
122
154
  PTY.spawn(@command) do |stdout, stdin, pid, stderr|
123
155
  @pid = pid
124
156
  begin
125
157
  if stdout.present?
126
158
  stdout.each do |line|
127
- log_internal(line, level: LogLevel::PURE)
159
+ log_internal(line, level: LogLevel::INFO)
128
160
  line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
129
161
  @output << {log: line, timestamp: Time.now}
130
162
  end
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
176
208
  def log_internal(log, level: LogLevel::INFO)
177
209
  if level == LogLevel::PURE
178
210
  puts(log)
179
- else
180
- puts({log: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity}.to_json)
211
+ STDOUT.flush
212
+ return
213
+ end
214
+ to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
215
+ if log.start_with?("{") and log.include?("timestamp")
216
+ log_json = JSON.parse(log)
217
+ to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
181
218
  end
219
+ puts(to_print.to_json)
182
220
  STDOUT.flush
221
+ rescue => e
222
+ Cnvrg::Logger.log_error(e)
183
223
  end
184
224
 
185
225
  def filter_logs_by_regex(logs)
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
190
230
  end
191
231
  end
192
232
  end
193
- end
233
+ end
@@ -1,7 +1,9 @@
1
+ require "filewatch/tail"
1
2
  require 'cnvrg/helpers/agent'
2
3
  class Cnvrg::Helpers::Executer
3
- attr_reader :machine_activity, :agent_id, :slave_id
4
-
4
+ attr_reader :machine_activity, :agent_id, :main_id
5
+ MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
6
+ HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
5
7
 
6
8
  ### this class represent a machine_activity. it will poll the commands, communicate with the
7
9
  # server (poll commands) and let the server know the status of this executer.
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
9
11
  @owner = owner
10
12
  @job_id = job_id
11
13
  @poll_every = poll_every
14
+ @check_main_every = 10
12
15
  @machine_activity = machine_activity
13
16
  @commands_q = Queue.new
14
17
  @files_q = Queue.new
15
18
  @agent_id = nil
16
- @slave_id = nil
19
+ @main_id = nil
20
+ @main_start_time = nil
21
+ @is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
22
+ @main_name = @is_new_main ? "main" : "slave"
17
23
  end
18
24
 
19
25
  def create_file_cmd(path, content)
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
37
43
  def executer_stats
38
44
  return @stats if @stats.present?
39
45
  Cnvrg::Logger.log_info("getting containers")
40
- @agent_id, @slave_id = containers
46
+ @agent_id, @main_id = containers
41
47
  Cnvrg::Logger.log_info("got containers")
42
48
  pod_name, node_name = get_node_and_pod_names
49
+ # For backwards compatibility we still call this slave stats
43
50
  @stats = {
44
51
  pod_name: pod_name,
45
52
  node_name: node_name,
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
53
60
  cnvrg: Cnvrg::VERSION
54
61
  },
55
62
  slave: {
56
- container_id: @slave_id,
57
- workdir: run_in_slave('pwd'),
58
- homedir: slave_homedir,
63
+ container_id: @main_id,
64
+ container_name: @main_name,
65
+ workdir: run_in_main('pwd'),
66
+ homedir: main_homedir,
59
67
  spark_path: spark_path,
60
- user: run_in_slave( 'whoami'),
61
- cnvrg: run_in_slave( 'which cnvrg'),
62
- has_bash: run_in_slave( 'which bash'),
63
- user_id: run_in_slave( 'id -u'),
64
- group_id: run_in_slave( 'id -g'),
65
- python_version: run_in_slave( 'python --version'),
66
- python3_version: run_in_slave( 'python3 --version'),
67
- pip_version: run_in_slave( 'pip --version'),
68
- pip3_version: run_in_slave( 'pip3 --version')
68
+ user: run_in_main( 'whoami'),
69
+ cnvrg: run_in_main( 'which cnvrg'),
70
+ has_bash: run_in_main( 'which bash'),
71
+ user_id: run_in_main( 'id -u'),
72
+ group_id: run_in_main( 'id -g'),
73
+ python_version: run_in_main( 'python --version'),
74
+ python3_version: run_in_main( 'python3 --version'),
75
+ pip_version: run_in_main( 'pip --version'),
76
+ pip3_version: run_in_main( 'pip3 --version')
69
77
  },
70
78
  }
79
+
71
80
  @stats
72
81
  end
73
82
 
74
83
  def containers
75
84
  agent_id = nil
76
- slave_id = nil
77
- while agent_id.blank? or slave_id.blank?
78
- grep_by = @job_id
79
- grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
80
- cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
81
- agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
82
- slave_id = cntrs.find{|container_name| container_name.include? "slave"}.split(",").first rescue nil
83
- sleep(5)
85
+ main_id = nil
86
+ timeout = 2
87
+ timeout = nil if (!@is_new_main || HAS_DOCKER)
88
+ Timeout.timeout(timeout) do
89
+ while agent_id.blank? or main_id.blank?
90
+ grep_by = @job_id
91
+ grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
92
+ cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
93
+ agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
94
+ main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
95
+ sleep(2)
96
+ end
84
97
  end
85
- if slave_id.blank?
86
- raise "Can't find slave id"
98
+ if main_id.blank?
99
+ raise "Can't find main id"
87
100
  end
88
- [agent_id, slave_id]
101
+ [agent_id, main_id]
102
+ rescue => e
103
+ Cnvrg::Logger.log_error(e)
104
+ [agent_id, main_id]
89
105
  end
90
106
 
91
107
  def current_homedir
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
93
109
  end
94
110
 
95
111
  def spark_path
96
- run_in_slave("env | grep SPARK_HOME").strip.split("=").try(:last)
112
+ run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
97
113
  end
98
114
 
99
- def slave_homedir()
100
- run_in_slave("env | grep -w HOME").split("=").try(:last)
115
+ def main_homedir()
116
+ run_in_main("env | grep -w HOME").split("=").try(:last)
101
117
  end
102
118
 
103
- def slave_env
104
- run_in_slave("env").split("\n").map{|x| x.split("=")}
119
+ def main_env
120
+ run_in_main("env").split("\n").map{|x| x.split("=")}
105
121
  end
106
122
 
107
- def run_in_slave(command)
108
- `docker exec -i #{@slave_id} sh -c '#{command}'`.strip
109
- end
123
+ def run_in_main(command)
124
+ data = {cmd: command, async: false, use_sh: true}
110
125
 
126
+ conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
127
+ response = conn.post('command', data.to_json)
128
+ if response.to_hash[:status].to_i != 200
129
+ Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
130
+ return ""
131
+ end
132
+ resp = []
133
+ lines = response.body.split("\n")
134
+ lines.each do |line|
135
+ next if line.strip == nil or line.strip == ""
136
+ if line.include?("cnvrg-exit-code")
137
+ exit_status = line.split("=")[1].to_i
138
+ if exit_status != 0
139
+ Cnvrg::Logger.log_info("failed to run find command #{command} on main")
140
+ return ""
141
+ end
142
+ next
143
+ end
144
+ resp << line
145
+ end
146
+ return resp.join("\n")
147
+ rescue => e
148
+ Cnvrg::Logger.log_error(e)
149
+ return ""
150
+ end
111
151
 
112
152
  def poll
113
153
  resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
124
164
  success = false
125
165
  puts("Agent started, connecting to #{Cnvrg::API.get_api}")
126
166
  STDOUT.flush
167
+ wait_for_main
127
168
  while !success and retries < 100
128
169
  begin
129
170
  resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
154
195
  end
155
196
  end
156
197
 
198
+ def check_main_is_working_thread
199
+ while true
200
+ check_main_alive
201
+ sleep(@check_main_every)
202
+ end
203
+ end
204
+
157
205
  def main_thread
158
206
  init
159
207
  Thread.new do
160
208
  polling_thread
161
209
  end
210
+ Thread.new do
211
+ check_main_is_working_thread
212
+ end
162
213
  execute_cmds
163
214
  end
164
215
 
216
+ def wait_for_main
217
+ copy_file_to_main
218
+ start_tiny_if_missing
219
+ puts("Waiting for main container")
220
+ STDOUT.flush
221
+ got_response = false
222
+ while !got_response do
223
+ begin
224
+ conn = Cnvrg::Helpers::Executer.get_main_conn
225
+ response = conn.get('readiness')
226
+ if response.to_hash[:status].to_i != 200
227
+ sleep(0.1)
228
+ next
229
+ else
230
+ puts("Client container is ready")
231
+ STDOUT.flush
232
+ @main_start_time = response.body.to_i
233
+ got_response = true
234
+ end
235
+ rescue => e
236
+ puts("Failed to connect to main")
237
+ puts(e)
238
+ STDOUT.flush
239
+ sleep(0.1)
240
+ next
241
+ end
242
+ end
243
+ end
244
+
245
+ def copy_file_to_main
246
+ begin
247
+ FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
248
+ FileUtils.cp_r("/scripts", "/conf/scripts-bin")
249
+ FileUtils.touch("/conf/tiny-ready")
250
+ rescue => e
251
+ Cnvrg::Logger.log_error(e)
252
+ end
253
+ end
254
+
255
+ def start_tiny_if_missing
256
+ return unless ENV['MAIN_CONTAINER_PORT'].blank?
257
+ Cnvrg::Logger.log_info("Tiny not found, starting it")
258
+ @agent_id, @main_id = containers
259
+ pid = Process.fork do
260
+ Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny")
261
+ `docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
262
+ end
263
+ Process.detach(pid)
264
+ Cnvrg::Logger.log_info("Tiny started and detached")
265
+ end
266
+
165
267
  def execute_cmds
166
268
  pids = []
167
269
  while true
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
215
317
  Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
216
318
  end
217
319
 
320
+ def check_main_alive
321
+ # Dont check before we got first response
322
+ return if @main_start_time == nil
323
+ conn = Cnvrg::Helpers::Executer.get_main_conn
324
+ response = conn.get('readiness')
325
+ if response.to_hash[:status].to_i != 200
326
+ main_start_time = 0
327
+ else
328
+ main_start_time = response.body.to_i
329
+ end
330
+ if main_start_time != @main_start_time
331
+ puts("Found that main restarted, restarting agent")
332
+ Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
333
+ exit(1)
334
+ end
335
+ end
336
+
218
337
  def get_pod_events(pod_name)
219
338
  return if pod_name.blank?
220
339
  `kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
224
343
  return if node_name.blank?
225
344
  `kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
226
345
  end
346
+
347
+ def self.main_container_url
348
+ if ENV["CNVRG_COMPUTE_CLUSTER"].blank? and ENV["KUBERNETES_SERVICE_HOST"].blank?
349
+ if ENV["MAIN_CONTAINER_PORT"].blank?
350
+ host = "slave"
351
+ else
352
+ host = "main"
353
+ end
354
+ "http://#{host}:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
355
+ else
356
+ "http://localhost:#{Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT}"
357
+ end
358
+ end
359
+
360
+ def self.get_main_conn(timeout: 4, open_timeout: 1)
361
+ conn = Faraday.new(
362
+ url: Cnvrg::Helpers::Executer.main_container_url,
363
+ headers: {'Content-Type' => 'application/json'}
364
+ )
365
+ conn.options.timeout = timeout
366
+ conn.options.open_timeout = open_timeout
367
+ conn
368
+ end
227
369
  end
data/lib/cnvrg/job_ssh.rb CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
5
5
  method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
6
6
  method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
7
7
  method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
8
+ method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
9
+ method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
8
10
  method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
9
11
  def start(job_id)
12
+ no_auth = options["no_auth"]
10
13
  Cnvrg::CLI.new.log_start(__method__, args, options)
11
14
  @job_ssh = ConnectJobSsh.new(job_id)
12
- @job_ssh.start(options['username'], options['password'])
15
+ @job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
13
16
  pod_name = nil
14
17
  namespace = "cnvrg"
15
18
  ssh_ready = false
19
+ internal_port = options['internal_port']
16
20
  while not ssh_ready
17
21
  resp = @job_ssh.status()
18
22
  status = resp["ssh_status"]
@@ -26,13 +30,14 @@ module Cnvrg
26
30
  username = resp["username"]
27
31
  pod_name = resp["pod_name"]
28
32
  namespace = resp["namespace"]
33
+ internal_port = resp["port"] || internal_port
29
34
  ssh_ready = true
30
35
  else
31
36
  puts("Failed to start ssh")
32
37
  break
33
38
  end
34
39
  end
35
- if pod_name.blank? or password.blank? or username.blank?
40
+ if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
36
41
  puts("Failed to get required params")
37
42
  return
38
43
  end
@@ -41,8 +46,8 @@ module Cnvrg
41
46
  puts("host: 127.0.0.1")
42
47
  puts("port: #{options["port"]}")
43
48
  puts("username: #{username}")
44
- puts("password: #{password}")
45
- @job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
49
+ puts("password: #{password}") unless no_auth
50
+ @job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
46
51
  end
47
52
  end
48
53
  end
data/lib/cnvrg/project.rb CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
328
328
  end
329
329
 
330
330
  def get_storage_client
331
- response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
332
- if Cnvrg::CLI.is_response_success(response, false)
333
-
331
+ client_params = nil
332
+ i = 0
333
+ begin
334
+ response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
335
+ unless Cnvrg::CLI.is_response_success(response, false)
336
+ raise StandardError.new("Can't find project credentials")
337
+ end
334
338
  client_params = response['client']
335
- else
336
-
339
+ rescue StandardError
340
+ i += 1
341
+ sleep(5 * i)
342
+ retry if i < 10
337
343
  client_params = get_storage_client_fallback
338
344
  end
339
-
345
+ raise StandardError.new("Can't find project credentials") unless client_params
340
346
  Cnvrg::Downloader::Client.factory(client_params)
341
347
  end
342
348
 
@@ -378,14 +384,18 @@ module Cnvrg
378
384
  []
379
385
  end
380
386
 
381
- def generate_output_dir(output_dir)
387
+ def generate_output_dir(output_dir, local: false)
382
388
  Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
383
389
  upload_list = []
390
+ list = []
384
391
  list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
392
+ if local
393
+ list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
394
+ end
395
+ list.uniq!
385
396
  Parallel.map(list, in_threads: IDXParallelThreads) do |e|
386
397
  next if e.end_with? "/."
387
398
  if File.directory? e
388
-
389
399
  upload_list << e + "/"
390
400
  else
391
401
  upload_list << e
@@ -447,6 +457,10 @@ module Cnvrg
447
457
  if list_ignore_new.include? label
448
458
  next
449
459
  end
460
+ if File.symlink?(e)
461
+ Cnvrg::Logger.log_info("Skipping symlink #{e}")
462
+ next
463
+ end
450
464
  if File.directory? e
451
465
  dir_name = (label.ends_with? "/") ? label : (label + "/")
452
466
  tree_idx[dir_name] = nil
@@ -647,7 +661,11 @@ module Cnvrg
647
661
 
648
662
  def fetch_webapp_slugs(webapp_slug, slugs: nil)
649
663
  response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
650
- return response["experiments"]
664
+
665
+ if response.key?("experiments")
666
+ return response["experiments"]
667
+ end
668
+ return response["data"]["attributes"]["experiments"]
651
669
  rescue
652
670
  slugs
653
671
  end
@@ -699,8 +717,11 @@ module Cnvrg
699
717
  res = JSON.parse(resp['result']) rescue nil
700
718
  return if res.blank?
701
719
  config = self.get_config
702
- config[:is_git] = res['git']
703
720
  config[:project_name] = res['title']
721
+ config[:project_slug] = @slug
722
+ config[:owner] = @owner
723
+ config[:git] = res['git'] || false
724
+ config[:is_git] = res['git'] || false
704
725
  self.set_config(config)
705
726
  end
706
727
 
data/lib/cnvrg/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.31'
3
- end
2
+ VERSION = '2.0.13'
3
+ end
metadata CHANGED
@@ -1,15 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.31
4
+ version: 2.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
8
8
  - Leah Kolben
9
- autorequire:
9
+ - Omer Shacham
10
+ autorequire:
10
11
  bindir: bin
11
12
  cert_chain: []
12
- date: 2021-05-04 00:00:00.000000000 Z
13
+ date: 2021-10-27 00:00:00.000000000 Z
13
14
  dependencies:
14
15
  - !ruby/object:Gem::Dependency
15
16
  name: bundler
@@ -321,6 +322,20 @@ dependencies:
321
322
  - - "~>"
322
323
  - !ruby/object:Gem::Version
323
324
  version: 0.1.1
325
+ - !ruby/object:Gem::Dependency
326
+ name: filewatch
327
+ requirement: !ruby/object:Gem::Requirement
328
+ requirements:
329
+ - - "~>"
330
+ - !ruby/object:Gem::Version
331
+ version: 0.9.0
332
+ type: :runtime
333
+ prerelease: false
334
+ version_requirements: !ruby/object:Gem::Requirement
335
+ requirements:
336
+ - - "~>"
337
+ - !ruby/object:Gem::Version
338
+ version: 0.9.0
324
339
  - !ruby/object:Gem::Dependency
325
340
  name: parallel
326
341
  requirement: !ruby/object:Gem::Requirement
@@ -458,7 +473,7 @@ files:
458
473
  homepage: https://cnvrg.io
459
474
  licenses: []
460
475
  metadata: {}
461
- post_install_message:
476
+ post_install_message:
462
477
  rdoc_options: []
463
478
  require_paths:
464
479
  - lib
@@ -473,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
473
488
  - !ruby/object:Gem::Version
474
489
  version: '0'
475
490
  requirements: []
476
- rubygems_version: 3.0.9
477
- signing_key:
491
+ rubygems_version: 3.2.22
492
+ signing_key:
478
493
  specification_version: 4
479
494
  summary: A CLI tool for interacting with cnvrg.io.
480
495
  test_files: []