cnvrg 1.11.29 → 2.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0b2588acc9e199189983cf67643124e60c8c9c7a5223bbbd85d65994cb3a812c
4
- data.tar.gz: a35822e68bc4a095e7a029d4572bf9a748f2b0f6f0b2f3baa86c04315be3f176
3
+ metadata.gz: 86857e06a0d107172c161467e8ff8cb04a120d8b230c97843e91ce13c826ccce
4
+ data.tar.gz: 272fe88e1e390f2887c36c49915cc89f10a6cf9947bb98ab6fd503476ac03820
5
5
  SHA512:
6
- metadata.gz: c0c64aecb69a20b939c990ed99b26e2387d26046c44a734b46f7f813b271c7552801ea3909478a424db42e80e9df662cc2af428283cd65cbe7eb08d939648aa2
7
- data.tar.gz: a86c3d71aa228f7a387fd484b286e2dacf92483644cd48687eb0d68bfcb2d0c9e68ac2b87b3ff537efbef064f8c915423fb82438ef823f95aa748b71659a5772
6
+ metadata.gz: c28ab54953e47843897273f8038f5e0bfc92e101e161d1edae4477250e1334b5730b5020713572ad2482c5b0c305c5c0b2c5725b8796a44c06750eccf2924e9f
7
+ data.tar.gz: 69be0cd1fc89180ce7b878aaa8e0767b5be9a00cb0c9813c504e62677c1c839831aa6cf4fbf27aeb66aa50b756f7b28f46eb721182e588bcc4b7c65544e6a41b
data/Readme.md CHANGED
@@ -14,4 +14,43 @@
14
14
  * DEV-8284 - Improvement: Use server instead of docker for agent communication
15
15
  * DEV-8434 - Bug: Rerun of experiment in git project doesn't show artifacts
16
16
  * DEV-8539 - Bug: SDK - in Windows: e.sync doesn't perform sync
17
- * DEV-8621 - Improvement: Add more metrics
17
+ * DEV-8621 - Improvement: Add more metrics
18
+ ## Version v1.11.30
19
+ 2021-04-06
20
+ ## Version v1.11.31
21
+ 2021-04-22
22
+ ## Version v1.11.32
23
+ 2021-05-05
24
+ * DEV-8868 - Bug: SDK - e.sync() in git project only creates empty "output" folder in commit
25
+ ## Version v2.0.1
26
+ 2021-06-13
27
+ ## Version v2.0.2
28
+ 2021-06-16
29
+ * DEV-9694 - Bug: Download artifacts fails on authorization error
30
+ ## Version v2.0.3
31
+ 2021-06-29
32
+ * DEV-9919 - Bug: clone artifacts fails on "Not Authorize, Are you logged in?"
33
+ ## Version v2.0.4
34
+ 2021-07-08
35
+ * DEV-9935 - Bug: CLI - cnvrg sync creates new commit but no blob versions
36
+ ## Version v2.0.5
37
+ 2021-07-11
38
+ * DEV-10171 - Bug: experiment randomly fails with error- "Couldn't clone artifacts"
39
+ * DEV-10189 - Bug: CLI Sync -file/folder with broken symlink will cause sync to fail
40
+ ## Version v2.0.6
41
+ 2021-07-18
42
+ * DEV-10209 - Bug: some experiments in grid failed on cnvrg-cli commands (docker container id was missing)
43
+ ## Version v2.0.7
44
+ 2021-07-27
45
+ * DEV-10186 - Bug: CLI/run an experiment with --local tag gives server error
46
+ ## Version v2.0.8
47
+ 2021-09-06
48
+ * DEV-10697 - Bug: Tensorboard not starting in workspace and experiment.
49
+ ## Version v2.0.9
50
+ 2021-09-12
51
+ * DEV-10502 - Bug: Periodic sync stuck
52
+ ## Version v2.0.10
53
+ 2021-09-12
54
+ * DEV-10502 - Bug: Periodic sync stuck
55
+ ## Version v2.0.11
56
+ 2021-10-21
data/cnvrg.gemspec CHANGED
@@ -25,6 +25,7 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency 'aruba'
26
26
  spec.add_development_dependency 'pry'
27
27
 
28
+ spec.add_runtime_dependency 'ffi', '~> 1.9', '>= 1.9.10'
28
29
  spec.add_runtime_dependency 'mimemagic', '~> 0.3.1', '>=0.3.7'
29
30
  spec.add_runtime_dependency 'faraday', '~> 0.15.2'
30
31
  spec.add_runtime_dependency 'netrc', '~> 0.11.0'
@@ -38,6 +39,7 @@ Gem::Specification.new do |spec|
38
39
  spec.add_runtime_dependency 'google-cloud-storage', '~> 1.21.1'
39
40
  spec.add_runtime_dependency 'sucker_punch', '~> 2.0'
40
41
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
42
+ spec.add_runtime_dependency 'filewatch', '~> 0.9.0'
41
43
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
44
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
45
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
data/lib/cnvrg/api.rb CHANGED
@@ -72,11 +72,11 @@ module Cnvrg
72
72
  if response.to_hash[:status].to_i != 200
73
73
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
74
74
  end
75
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
75
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
76
76
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
77
77
  success = false
78
78
  sleep(5 * retries)
79
- retries +=1
79
+ retries += 1
80
80
  next
81
81
  end
82
82
  rescue => e
@@ -112,11 +112,11 @@ module Cnvrg
112
112
  if response.to_hash[:status].to_i != 200
113
113
  Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
114
114
  end
115
- if [503, 502, 429].include?(response.to_hash[:status].to_i)
115
+ if [503, 502, 429, 401].include?(response.to_hash[:status].to_i)
116
116
  Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
117
117
  success = false
118
118
  sleep(5 * retries)
119
- retries +=1
119
+ retries += 1
120
120
  next
121
121
  end
122
122
  rescue => e
@@ -1,7 +1,7 @@
1
1
  module Cnvrg
2
2
  class LibraryCli < SubCommandBase
3
-
4
- desc "library import", ''
3
+ map push: :import
4
+ desc "library push", 'Push a new library to AI Library'
5
5
  def import
6
6
  unless File.exists? "library.yml"
7
7
  Cnvrg::CLI.log_message("Can't find library.yml", 'red')
data/lib/cnvrg/cli.rb CHANGED
@@ -173,7 +173,7 @@ module Cnvrg
173
173
  desc "data [COMMAND]", "Upload and manage datasets", :hide => false
174
174
  subcommand "data", Data
175
175
 
176
- desc "job", "manage running jobs", :hide => false
176
+ desc "job", "manage running jobs", :hide => true
177
177
  subcommand "job", JobCli
178
178
 
179
179
  desc "ssh", "ssh into running jobs", :hide => false
@@ -415,7 +415,7 @@ module Cnvrg
415
415
  end
416
416
  end
417
417
 
418
- desc 'set_compression_path', 'Set compression path'
418
+ desc 'set_compression_path', 'Set compression path', :hide => true
419
419
  method_option :reset, :type => :boolean, :aliases => ["-r", "--reset"], :default => false
420
420
 
421
421
  def set_compression_path(*compression_path)
@@ -2311,6 +2311,7 @@ module Cnvrg
2311
2311
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2312
2312
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2313
2313
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
2314
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
2314
2315
 
2315
2316
  def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true,chunk_size=1000)
2316
2317
  begin
@@ -2328,6 +2329,8 @@ module Cnvrg
2328
2329
  exp_obj = nil
2329
2330
  end
2330
2331
 
2332
+ local = options["local"]
2333
+
2331
2334
  commit_msg = options["message"]
2332
2335
  if commit_msg.nil? or commit_msg.empty?
2333
2336
  commit_msg = ""
@@ -2349,7 +2352,7 @@ module Cnvrg
2349
2352
  if git_output_dir.ends_with? "/"
2350
2353
  git_output_dir = git_output_dir[0..-2]
2351
2354
  end
2352
- list = @project.generate_output_dir(git_output_dir)
2355
+ list = @project.generate_output_dir(git_output_dir, local: local)
2353
2356
  end
2354
2357
  list += @project.generate_git_diff if options["git_diff"]
2355
2358
  spec_files_to_upload = list
@@ -2668,7 +2671,7 @@ module Cnvrg
2668
2671
  end
2669
2672
  end
2670
2673
 
2671
- desc 'commit before termination', 'Commit job code before termination'
2674
+ desc 'commit before termination', 'Commit job code before termination', :hide => true
2672
2675
  def commit_before_termination()
2673
2676
  job_type = ENV['CNVRG_JOB_TYPE']
2674
2677
  job_id = ENV['CNVRG_JOB_ID']
@@ -2678,7 +2681,7 @@ module Cnvrg
2678
2681
  log_error(e)
2679
2682
  end
2680
2683
 
2681
- desc 'update_job_commit', 'Update job with its last commit'
2684
+ desc 'update_job_commit', 'Update job with its last commit' , :hide => true
2682
2685
  def update_job_commit()
2683
2686
  job_type = ENV['CNVRG_JOB_TYPE']
2684
2687
  job_id = ENV['CNVRG_JOB_ID']
@@ -2868,7 +2871,7 @@ module Cnvrg
2868
2871
 
2869
2872
 
2870
2873
 
2871
- desc 'jump', 'Jump to specific commit'
2874
+ desc 'jump COMMIT_ID', 'Jump to specific commit'
2872
2875
  def jump(commit_sha1)
2873
2876
  begin
2874
2877
  verify_logged_in()
@@ -3003,11 +3006,12 @@ module Cnvrg
3003
3006
  method_option :job_slug, :type => :string, :aliases => ["-j", "--job"], :default => nil
3004
3007
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
3005
3008
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
3006
- method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
3009
+ method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => 'output'
3007
3010
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
3008
3011
  method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
3009
3012
  method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
3010
3013
  method_option :chunk_size, :type => :numeric, :aliases => ["--chunk"], :default => 1000
3014
+ method_option :local, :type => :boolean, :aliases => ["--local"], :default => true
3011
3015
 
3012
3016
  def sync(direct = true)
3013
3017
  verify_logged_in(true) if direct
@@ -3030,10 +3034,10 @@ module Cnvrg
3030
3034
  if run_download or options['debug_mode']
3031
3035
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
3032
3036
  end
3033
- invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3037
+ invoke :upload, [false, true, direct, "", in_exp, options[:force], output_dir, job_type, job_slug], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
3034
3038
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
3035
3039
  :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"],
3036
- :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"]
3040
+ :debug_mode => options['debug_mode'], :git_diff => options["git_diff"], :chunk_size => options["chunk_size"], :local => options["local"]
3037
3041
 
3038
3042
  end
3039
3043
 
@@ -3143,7 +3147,7 @@ module Cnvrg
3143
3147
  invoke :exec, [cmd], :sync_before => sync_before, :sync_after => sync_after, :title => title,
3144
3148
  :log => log, :email_notification => email_notification, :upload_output => upload_output,
3145
3149
  :commit => commit, :image => image, :data => data, :data_commit => data_commit,
3146
- :ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query
3150
+ :ignore => ignore, :force => force, :output_dir=>output_dir, :data_query=>data_query, :local => local
3147
3151
  return
3148
3152
  end
3149
3153
  else
@@ -3200,6 +3204,7 @@ module Cnvrg
3200
3204
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3201
3205
  method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3202
3206
  method_option :docker_stats, :type => :boolean, :aliases => ["--docker_stats"], :default => true
3207
+ method_option :local, :type => :boolean, :aliases => ["-l", "--local"], :default => false
3203
3208
 
3204
3209
  def exec(*cmd)
3205
3210
  log = []
@@ -3224,6 +3229,7 @@ module Cnvrg
3224
3229
  project_home = get_project_home
3225
3230
  data_query = options["data_query"]
3226
3231
  docker_stats = options["docker_stats"]
3232
+ local = options[:local] || false
3227
3233
  @project = Project.new(project_home)
3228
3234
  if @project.is_git
3229
3235
  sync_before = false
@@ -3316,62 +3322,53 @@ module Cnvrg
3316
3322
  end
3317
3323
  end
3318
3324
  start_time = Time.now
3319
- shell_type = options["use_bash"] ? "bash -l" : "sh"
3320
3325
  if @exp.get_cmd.present?
3321
3326
  cmd = @exp.get_cmd
3322
- if options["docker_id"].present? # Escape for docker exec
3323
- cmd = cmd.gsub("\"", "\\\"")
3324
- end
3325
- end
3326
- if options["docker_id"].present?
3327
- cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3328
3327
  end
3329
- PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3330
- begin
3331
- stdout.each do |line|
3332
- cur_time = Time.now
3333
- real_time = Time.now - real
3334
- cur_log = {time: cur_time,
3335
- message: line,
3336
- type: "stdout",
3337
- real: real_time
3338
- }
3339
- if print_log
3340
- puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
3328
+
3329
+ if local
3330
+ exec_local(cmd)
3331
+ exit_status = $?.exitstatus
3332
+
3333
+ else
3334
+ command_slug = (0...18).map { (65 + rand(26)).chr }.join
3335
+ result_file = "/conf/result-#{command_slug}"
3336
+ data = {cmd: cmd, async: true, format: true, file_name: result_file, use_script: true, use_bash: options["use_bash"]}
3337
+ conn = Cnvrg::Helpers::Executer.get_main_conn
3338
+ response = conn.post('command', data.to_json)
3339
+ if response.to_hash[:status].to_i != 200
3340
+ exit_status = 129
3341
+ raise StandardError.new("Cant send command to slave")
3342
+ end
3343
+ t = FileWatch::Tail.new
3344
+ filename = result_file
3345
+ lines = []
3346
+ t.tail(filename)
3347
+ t.subscribe do |path, line|
3348
+ begin
3349
+ cur_log = JSON.parse(line)
3350
+ if cur_log["type"] == "endMessage"
3351
+ exit_status = cur_log["real"].to_i
3352
+ break
3353
+ else
3354
+ puts(cur_log.to_json)
3355
+ STDOUT.flush
3356
+ cur_log["time"] = Time.parse(cur_log["timestamp"])
3357
+ cur_log["message"] = cur_log["message"].to_s + "\r\n"
3358
+ log << cur_log
3341
3359
  end
3342
- log << cur_log
3343
3360
  if log.size >= 10
3344
- @exp.upload_temp_log(log) unless log.empty?
3361
+ @exp.upload_temp_log(log)
3345
3362
  log = []
3346
- elsif (start_time + 15.seconds) <= Time.now
3363
+ elsif (start_time + 15.seconds) <= Time.now
3347
3364
  @exp.upload_temp_log(log) unless log.empty?
3348
3365
  log = []
3349
3366
  start_time = Time.now
3350
3367
  end
3368
+ rescue => e
3369
+ log_error(e)
3351
3370
  end
3352
- if stderr
3353
- stderr.each do |err|
3354
- log << {time: Time.now, message: err, type: "stderr"}
3355
- end
3356
- end
3357
- rescue Errno::EIO => e
3358
- log_error(e)
3359
- if !log.empty?
3360
- temp_log = log
3361
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3362
- log -= temp_log
3363
- end
3364
- rescue Errno::ENOENT => e
3365
- exp_success = false
3366
- log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
3367
- log_error(e)
3368
- rescue => e
3369
- res = @exp.end(log, 1, start_commit, 0, 0)
3370
- log_message("Error occurred,aborting", Thor::Shell::Color::RED)
3371
- log_error(e)
3372
- exit(0)
3373
3371
  end
3374
- ::Process.wait pid
3375
3372
  end
3376
3373
  end_time = Time.now
3377
3374
  process_running = false
@@ -3379,14 +3376,13 @@ module Cnvrg
3379
3376
  if !log.empty?
3380
3377
 
3381
3378
  temp_log = log
3382
- @exp.upload_temp_log(temp_log) unless temp_log.empty?
3379
+ @exp.upload_temp_log(temp_log)
3383
3380
  log -= temp_log
3384
3381
  end
3385
3382
 
3386
3383
  cpu_average = cpu_total.inject(0) {|sum, el| sum + el}.to_f / cpu_total.size
3387
3384
  memory_average = memory_total.inject(0) {|sum, el| sum + el}.to_f / memory_total.size
3388
- exit_status = $?.exitstatus
3389
- if $?.exitstatus != 0
3385
+ if exit_status != 0
3390
3386
  exp_success = false
3391
3387
  end
3392
3388
 
@@ -3430,7 +3426,6 @@ module Cnvrg
3430
3426
  if @exp
3431
3427
  # log_thread.join
3432
3428
  Thread.kill(stats_thread) if docker_stats
3433
- exit_status = $?.exitstatus
3434
3429
  if exit_status.blank?
3435
3430
  exit_status = "-1"
3436
3431
  end
@@ -3443,8 +3438,6 @@ module Cnvrg
3443
3438
 
3444
3439
  exit(1)
3445
3440
  end
3446
-
3447
-
3448
3441
  end
3449
3442
 
3450
3443
  end
@@ -3689,7 +3682,7 @@ module Cnvrg
3689
3682
  end
3690
3683
  end
3691
3684
 
3692
- desc 'deploy', 'Deploys model to production'
3685
+ desc 'deploy', 'Deploys model to production', :hide => true
3693
3686
  method_option :small, :type => :boolean, :aliases => ["-s", "--small"], :default => false
3694
3687
  method_option :medium, :type => :boolean, :aliases => ["-m", "--medium"], :default => false
3695
3688
  method_option :large, :type => :boolean, :aliases => ["-l", "--large"], :default => false
@@ -3778,7 +3771,7 @@ module Cnvrg
3778
3771
  method_option :dataset_only_tree, :type => :boolean, :aliases => [ "--dataset_only_tree"], :default => false
3779
3772
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => ""
3780
3773
 
3781
- desc 'notebook', 'Starts a notebook session remotely or locally'
3774
+ desc 'notebook', 'Starts a notebook session remotely or locally', :hide => true
3782
3775
 
3783
3776
  def notebook
3784
3777
  verify_logged_in(true)
@@ -3905,7 +3898,7 @@ module Cnvrg
3905
3898
  end
3906
3899
  end
3907
3900
 
3908
- desc 'remote_notebook', 'Run notebook server on remote server'
3901
+ desc 'remote_notebook', 'Run notebook server on remote server', :hide => true
3909
3902
  method_option :machine_type, :type => :string, :default => ""
3910
3903
  method_option :notebook_type, :type => :string, :aliases => ["-n", "--notebook_type"], :default => ""
3911
3904
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
@@ -4264,7 +4257,7 @@ module Cnvrg
4264
4257
 
4265
4258
  end
4266
4259
 
4267
- desc 'notebook_stop', 'Starts a new notebook environment'
4260
+ desc 'notebook_stop', 'Stop notebook', :hide => true
4268
4261
  method_option :notebook_dir, :type => :string, :aliases => ["-n", "--n"], :default => "", :desc => "relative path to notebook dir from current directory"
4269
4262
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false, :desc => "run on remote machine"
4270
4263
  method_option :verbose, :type => :boolean, :aliases => ["--v"], :default => false
@@ -4651,15 +4644,21 @@ module Cnvrg
4651
4644
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :desc => "collect gpu metrics", :default => false
4652
4645
  method_option :gaudi, :type => :boolean, :aliases => ["--gaudi"], :desc => "collect gaudi metrics", :default => false
4653
4646
  method_option :wait, :type => :numeric, :aliases => ["--wait"], :desc => "to to wait between querying", :default => 30
4647
+ method_option :prom_user, :type => :string, :aliases => ["--prom_user"], :desc => "prometheus username", :default => nil
4648
+ method_option :prom_password, :type => :string, :aliases => ["--prom_password"], :desc => "prometheus password", :default => nil
4649
+ method_option :name, :type => :string, :aliases => ["--name"], :desc => "pod name - used for master-workers jobs", :default => nil
4654
4650
 
4655
4651
  def collect_metrics
4656
4652
  @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4657
4653
  prometheus_url = options[:prometheus_url].ends_with?("/") ? options[:prometheus_url] : "#{options[:prometheus_url]}/"
4654
+ prom_user = options[:prom_user]
4655
+ prom_password = options[:prom_password]
4656
+ name = options[:name]
4658
4657
 
4659
4658
  translate_result = Cnvrg::API_V2.request(
4660
4659
  "#{ENV['CNVRG_OWNER']}/resources/translate_metrics",
4661
4660
  'GET',
4662
- { gpu: options[:gpu] }
4661
+ { gpu: options[:gpu], gaudi: options[:gaudi] }
4663
4662
  )
4664
4663
 
4665
4664
  is_machine = options[:machine]
@@ -4679,9 +4678,16 @@ module Cnvrg
4679
4678
  next
4680
4679
  end
4681
4680
  uri = URI("#{prometheus_url}api/v1/query?query=#{query_content}")
4682
- resp = Net::HTTP.get(uri)
4681
+ http = Net::HTTP.new(uri.host, uri.port)
4682
+ http.use_ssl = uri.scheme == "https"
4683
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
4684
+ req = Net::HTTP::Get.new uri.request_uri
4685
+ if prom_user.present?
4686
+ req.basic_auth(Base64.decode64(prom_user), Base64.decode64(prom_password))
4687
+ end
4688
+ resp = http.request(req)
4683
4689
  begin
4684
- result = JSON.parse(resp)
4690
+ result = JSON.parse(resp.body)
4685
4691
  rescue JSON::ParserError => e
4686
4692
  log_error(e)
4687
4693
  next
@@ -4690,13 +4696,22 @@ module Cnvrg
4690
4696
  next unless data_result
4691
4697
 
4692
4698
  if data_result.size > 1
4693
- stats[query_name] = {}
4699
+ stats[query_name] = {} unless query_name.include? 'block'
4694
4700
  data_result.each_with_index do |res, i|
4695
4701
  timestamp, value = res["value"]
4696
4702
  uuid = res["metric"]["UUID"].presence || i
4697
4703
  stat_value = value.present? ? ("%.2f" % value) : 0 # converting 34.685929244444445 to 34.69
4698
4704
  stat_value = stat_value.to_i == stat_value.to_f ? stat_value.to_i : stat_value.to_f # converting 34.00 to 34
4699
- stats[query_name][uuid] = stat_value
4705
+ if query_name.include? 'block'
4706
+ uuid = res["metric"]["interface"].presence || i
4707
+ uuid = "#{name}-#{uuid}" if name.present?
4708
+ stats['block_io'] = {} if stats['block_io'].blank?
4709
+ io_type = query_name.split('_')[1]
4710
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4711
+ stats['block_io'][io_type].merge!({ uuid => stat_value })
4712
+ else
4713
+ stats[query_name][uuid] = stat_value
4714
+ end
4700
4715
  end
4701
4716
  else
4702
4717
  timestamp, value = data_result&.first&.dig('value')
@@ -4705,9 +4720,14 @@ module Cnvrg
4705
4720
  if query_name.include? 'block'
4706
4721
  stats['block_io'] = {} if stats['block_io'].blank?
4707
4722
  io_type = query_name.split('_')[1]
4708
- stats['block_io'].merge!({ io_type => stat_value })
4723
+ if name.present?
4724
+ stats['block_io'][io_type] = {} if stats['block_io'][io_type].blank?
4725
+ stats['block_io'][io_type].merge!({ name => stat_value })
4726
+ else
4727
+ stats['block_io'].merge!({ io_type => stat_value })
4728
+ end
4709
4729
  else
4710
- stats[query_name] = stat_value
4730
+ stats[query_name] = name.present? ? { name => stat_value } : stat_value
4711
4731
  end
4712
4732
  end
4713
4733
  end
@@ -4751,7 +4771,7 @@ module Cnvrg
4751
4771
  end
4752
4772
 
4753
4773
 
4754
- desc '', ''
4774
+ desc '', '', :hide => true
4755
4775
 
4756
4776
  def download_built_image(image_name, image_slug)
4757
4777
  begin
@@ -4995,7 +5015,7 @@ module Cnvrg
4995
5015
  end
4996
5016
  end
4997
5017
 
4998
- desc 'experiments', 'List project experiments'
5018
+ desc 'experiments', 'List project experiments', :hide => true
4999
5019
  method_option :id, :type => :string, :aliases => ["--id"], :desc => "Get info for specific experiments", :default => ""
5000
5020
  method_option :tag, :type => :string, :aliases => ["-t"], :desc => "Get info for specific experiment tag", :default => ""
5001
5021
 
@@ -5864,6 +5884,56 @@ module Cnvrg
5864
5884
  end
5865
5885
  end
5866
5886
 
5887
+ def exec_local(cmd)
5888
+ PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
5889
+ begin
5890
+ stdout.each do |line|
5891
+ cur_time = Time.now
5892
+ real_time = Time.now - real
5893
+ cur_log = {time: cur_time,
5894
+ message: line,
5895
+ type: "stdout",
5896
+ real: real_time
5897
+ }
5898
+ if print_log
5899
+ puts({log: line, timestamp: Time.now, exp_logs: true}.to_json)
5900
+ end
5901
+ log << cur_log
5902
+ if log.size >= 10
5903
+ @exp.upload_temp_log(log) unless log.empty?
5904
+ log = []
5905
+ elsif (start_time + 15.seconds) <= Time.now
5906
+ @exp.upload_temp_log(log) unless log.empty?
5907
+ log = []
5908
+ start_time = Time.now
5909
+ end
5910
+ end
5911
+ if stderr
5912
+ stderr.each do |err|
5913
+ log << {time: Time.now, message: err, type: "stderr"}
5914
+ end
5915
+ end
5916
+ rescue Errno::EIO => e
5917
+ log_error(e)
5918
+ if !log.empty?
5919
+ temp_log = log
5920
+ @exp.upload_temp_log(temp_log) unless temp_log.empty?
5921
+ log -= temp_log
5922
+ end
5923
+ rescue Errno::ENOENT => e
5924
+ exp_success = false
5925
+ log_message("command \"#{cmd}\" couldn't be executed, verify command is valid", Thor::Shell::Color::RED)
5926
+ log_error(e)
5927
+ rescue => e
5928
+ res = @exp.end(log, 1, start_commit, 0, 0)
5929
+ log_message("Error occurred,aborting", Thor::Shell::Color::RED)
5930
+ log_error(e)
5931
+ exit(0)
5932
+ end
5933
+ ::Process.wait pid
5934
+ end
5935
+ end
5936
+
5867
5937
  end
5868
5938
  end
5869
5939
 
@@ -10,20 +10,20 @@ module Cnvrg
10
10
  Cnvrg::Logger.log_info("cnvrg is not configured")
11
11
  end
12
12
 
13
- def start(username, password)
14
- Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password})
13
+ def start(username, password, no_auth, port: nil)
14
+ Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/start" , 'POST', {username: username, password: password, no_auth: no_auth, port: port})
15
15
  end
16
16
 
17
17
  def status()
18
18
  Cnvrg::API_V2.request("#{@owner}/job_ssh/#{@job_id}/status" , 'GET', nil)
19
19
  end
20
20
 
21
- def run_portforward_command(pod_name, port, kubeconfig, namespace)
21
+ def run_portforward_command(pod_name, port, kubeconfig, namespace, internal_port)
22
22
  command = "kubectl"
23
23
  if kubeconfig.present?
24
24
  command = "kubectl --kubeconfig=#{kubeconfig}"
25
25
  end
26
- bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:22"
26
+ bashCommand = "#{command} -n #{namespace} port-forward #{pod_name} #{port}:#{internal_port}"
27
27
  puts("\nrunning command #{bashCommand}")
28
28
  `#{bashCommand}`
29
29
  end
data/lib/cnvrg/files.rb CHANGED
@@ -106,7 +106,7 @@ module Cnvrg
106
106
  commit: commit_sha1
107
107
  })
108
108
  unless Cnvrg::CLI.is_response_success(resp, false)
109
- raise SignalException.new("Cant upload files to the server.")
109
+ raise StandardError.new("unsupported character: folder name can not include / \\ * : ? \" | ")
110
110
  end
111
111
  # resolve bucket
112
112
  res = resp['result']
@@ -730,7 +730,11 @@ module Cnvrg
730
730
  end
731
731
  res = Cnvrg::API.request(@base_resource + "download_files", 'POST', {files: files, commit: commit})
732
732
  unless Cnvrg::CLI.is_response_success(res, false)
733
- raise SignalException.new("Cant download files from the server.")
733
+ begin
734
+ puts(res)
735
+ rescue
736
+ end
737
+ raise StandardError.new("Cant download files from the server.")
734
738
  end
735
739
  self.download_multiple_files_s3(res['result'], @project_home, postfix: postfix, progress: progress, threads: threads)
736
740
  end
@@ -18,10 +18,13 @@ class Cnvrg::Helpers::Agent
18
18
  #### params
19
19
  def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
20
20
  @executer = executer
21
+ @job_id = ENV["CNVRG_JOB_ID"]
21
22
  @slug = slug
22
23
  @files_exist = files_exist
23
24
  @container_name = container_name
24
- @run_in_slave = @container_name.downcase == "slave"
25
+ @is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
26
+ @main_name = @is_new_main ? "main" : "slave"
27
+ @run_in_main = @container_name.downcase == @main_name
25
28
  @log_interval = send_log_interval
26
29
  # https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
27
30
  if timeout.blank? or timeout.negative?
@@ -37,18 +40,8 @@ class Cnvrg::Helpers::Agent
37
40
  @sleep_before_retry = sleep_before_retry
38
41
  @real_execution_retries = 0 ## How many times the command really executed until success
39
42
  @single_quotes = single_quotes
40
- @docker_user = ""
41
- @shell_type = use_bash ? "bash -l" : "sh"
42
- if docker_user.present?
43
- @docker_user = " --user #{docker_user}"
44
- end
45
- if @run_in_slave
46
- if @single_quotes
47
- @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c '#{@command}'"
48
- else
49
- @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c \"#{@command}\""
50
- end
51
- end
43
+ @docker_user = docker_user
44
+ @use_bash = use_bash
52
45
  @output = []
53
46
  @errors = []
54
47
  @exit_status = nil
@@ -75,19 +68,22 @@ class Cnvrg::Helpers::Agent
75
68
 
76
69
  def exec!
77
70
  log_internal("Command: #{@command} with slug: #{@slug} started!")
71
+ command_status = Status::FINISHED
78
72
  if @command.blank?
79
73
  @exit_status = 0
74
+ command_status = Status::ABORTED
80
75
  elsif should_run?
81
76
  send_logs(status: Status::STARTED)
82
77
  periodic_thread_handle = periodic_thread
83
78
  execute_command
84
79
  else
80
+ command_status = Status::ABORTED
85
81
  @exit_status = 127
86
82
  end
87
83
  finish_log = "Command: #{@command} with slug: #{@slug} finished"
88
84
  finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
89
85
  log_internal(finish_log)
90
- send_logs(exit_status: @exit_status, status: Status::FINISHED)
86
+ send_logs(exit_status: @exit_status, status: command_status)
91
87
  if periodic_thread_handle.present?
92
88
  periodic_thread_handle.join
93
89
  end
@@ -117,14 +113,50 @@ class Cnvrg::Helpers::Agent
117
113
  execute_command
118
114
  end
119
115
 
116
+ def execute_command_on_slave
117
+ extra_slug = (0...2).map { (65 + rand(26)).chr }.join
118
+ result_file = "/conf/result-#{@slug}-#{extra_slug}"
119
+ Timeout.timeout(@timeout) do
120
+ data = {cmd: @command, async: true, file_name: result_file, use_script: true, use_bash: @use_bash, use_sh: !@use_bash, docker_user: @docker_user}
121
+ conn = Cnvrg::Helpers::Executer.get_main_conn
122
+ response = conn.post('command', data.to_json)
123
+ if response.to_hash[:status].to_i != 200
124
+ @exit_status = 129
125
+ raise StandardError.new("Cant send command to slave")
126
+ end
127
+ t = FileWatch::Tail.new
128
+ filename = result_file
129
+ t.tail(filename)
130
+ t.subscribe do |path, line|
131
+ if line.include?("cnvrg-exit-code")
132
+ @exit_status = line.split("=")[1].to_i
133
+ break
134
+ end
135
+ if !@is_new_main
136
+ log_internal(line, level: LogLevel::PURE)
137
+ end
138
+ line = line.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_')
139
+ @output << {log: line, timestamp: Time.now}
140
+ end
141
+ end
142
+ rescue Timeout::Error
143
+ @errors << {log: "Command timed out!", timestamp: Time.now}
144
+ log_internal("Command timed out!", level: LogLevel::ERROR)
145
+ @exit_status = 124
146
+ ensure
147
+ retry_command if @retries != 0 and @exit_status !=0
148
+ @exit_status
149
+ end
150
+
120
151
  def execute_command
152
+ return execute_command_on_slave if @run_in_main
121
153
  Timeout.timeout(@timeout) do
122
154
  PTY.spawn(@command) do |stdout, stdin, pid, stderr|
123
155
  @pid = pid
124
156
  begin
125
157
  if stdout.present?
126
158
  stdout.each do |line|
127
- log_internal(line, level: LogLevel::PURE)
159
+ log_internal(line, level: LogLevel::INFO)
128
160
  line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
129
161
  @output << {log: line, timestamp: Time.now}
130
162
  end
@@ -176,10 +208,18 @@ class Cnvrg::Helpers::Agent
176
208
  def log_internal(log, level: LogLevel::INFO)
177
209
  if level == LogLevel::PURE
178
210
  puts(log)
179
- else
180
- puts({log: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity}.to_json)
211
+ STDOUT.flush
212
+ return
213
+ end
214
+ to_print = {message: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity, job: @job_id}
215
+ if log.start_with?("{") and log.include?("timestamp")
216
+ log_json = JSON.parse(log)
217
+ to_print = to_print.stringify_keys.merge(log_json.stringify_keys)
181
218
  end
219
+ puts(to_print.to_json)
182
220
  STDOUT.flush
221
+ rescue => e
222
+ Cnvrg::Logger.log_error(e)
183
223
  end
184
224
 
185
225
  def filter_logs_by_regex(logs)
@@ -190,4 +230,4 @@ class Cnvrg::Helpers::Agent
190
230
  end
191
231
  end
192
232
  end
193
- end
233
+ end
@@ -1,7 +1,9 @@
1
+ require "filewatch/tail"
1
2
  require 'cnvrg/helpers/agent'
2
3
  class Cnvrg::Helpers::Executer
3
- attr_reader :machine_activity, :agent_id, :slave_id
4
-
4
+ attr_reader :machine_activity, :agent_id, :main_id
5
+ MAIN_CONTAINER_PORT = ENV['MAIN_CONTAINER_PORT'].try(:to_i) || 4000
6
+ HAS_DOCKER = ENV['HAS_DOCKER'] == "true"
5
7
 
6
8
  ### this class represent a machine_activity. it will poll the commands, communicate with the
7
9
  # server (poll commands) and let the server know the status of this executer.
@@ -9,11 +11,15 @@ class Cnvrg::Helpers::Executer
9
11
  @owner = owner
10
12
  @job_id = job_id
11
13
  @poll_every = poll_every
14
+ @check_main_every = 10
12
15
  @machine_activity = machine_activity
13
16
  @commands_q = Queue.new
14
17
  @files_q = Queue.new
15
18
  @agent_id = nil
16
- @slave_id = nil
19
+ @main_id = nil
20
+ @main_start_time = nil
21
+ @is_new_main = !ENV["MAIN_CONTAINER_PORT"].blank?
22
+ @main_name = @is_new_main ? "main" : "slave"
17
23
  end
18
24
 
19
25
  def create_file_cmd(path, content)
@@ -37,9 +43,10 @@ class Cnvrg::Helpers::Executer
37
43
  def executer_stats
38
44
  return @stats if @stats.present?
39
45
  Cnvrg::Logger.log_info("getting containers")
40
- @agent_id, @slave_id = containers
46
+ @agent_id, @main_id = containers
41
47
  Cnvrg::Logger.log_info("got containers")
42
48
  pod_name, node_name = get_node_and_pod_names
49
+ # For backwards compatibility we still call this slave stats
43
50
  @stats = {
44
51
  pod_name: pod_name,
45
52
  node_name: node_name,
@@ -53,39 +60,48 @@ class Cnvrg::Helpers::Executer
53
60
  cnvrg: Cnvrg::VERSION
54
61
  },
55
62
  slave: {
56
- container_id: @slave_id,
57
- workdir: run_in_slave('pwd'),
58
- homedir: slave_homedir,
63
+ container_id: @main_id,
64
+ container_name: @main_name,
65
+ workdir: run_in_main('pwd'),
66
+ homedir: main_homedir,
59
67
  spark_path: spark_path,
60
- user: run_in_slave( 'whoami'),
61
- cnvrg: run_in_slave( 'which cnvrg'),
62
- has_bash: run_in_slave( 'which bash'),
63
- user_id: run_in_slave( 'id -u'),
64
- group_id: run_in_slave( 'id -g'),
65
- python_version: run_in_slave( 'python --version'),
66
- python3_version: run_in_slave( 'python3 --version'),
67
- pip_version: run_in_slave( 'pip --version'),
68
- pip3_version: run_in_slave( 'pip3 --version')
68
+ user: run_in_main( 'whoami'),
69
+ cnvrg: run_in_main( 'which cnvrg'),
70
+ has_bash: run_in_main( 'which bash'),
71
+ user_id: run_in_main( 'id -u'),
72
+ group_id: run_in_main( 'id -g'),
73
+ python_version: run_in_main( 'python --version'),
74
+ python3_version: run_in_main( 'python3 --version'),
75
+ pip_version: run_in_main( 'pip --version'),
76
+ pip3_version: run_in_main( 'pip3 --version')
69
77
  },
70
78
  }
79
+
71
80
  @stats
72
81
  end
73
82
 
74
83
  def containers
75
84
  agent_id = nil
76
- slave_id = nil
77
- while agent_id.blank? or slave_id.blank?
78
- grep_by = @job_id
79
- grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
80
- cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
81
- agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
82
- slave_id = cntrs.find{|container_name| container_name.include? "slave"}.split(",").first rescue nil
83
- sleep(5)
85
+ main_id = nil
86
+ timeout = 2
87
+ timeout = nil if (!@is_new_main || HAS_DOCKER)
88
+ Timeout.timeout(timeout) do
89
+ while agent_id.blank? or main_id.blank?
90
+ grep_by = @job_id
91
+ grep_by = "$(hostname)" if ENV['KUBERNETES_PORT'].present?
92
+ cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map{|x| x.strip}
93
+ agent_id = cntrs.find{|container_name| container_name.include? "agent"}.split(",").first rescue nil
94
+ main_id = cntrs.find{|container_name| container_name.include? @main_name}.split(",").first rescue nil
95
+ sleep(2)
96
+ end
84
97
  end
85
- if slave_id.blank?
86
- raise "Can't find slave id"
98
+ if main_id.blank?
99
+ raise "Can't find main id"
87
100
  end
88
- [agent_id, slave_id]
101
+ [agent_id, main_id]
102
+ rescue => e
103
+ Cnvrg::Logger.log_error(e)
104
+ [agent_id, main_id]
89
105
  end
90
106
 
91
107
  def current_homedir
@@ -93,21 +109,45 @@ class Cnvrg::Helpers::Executer
93
109
  end
94
110
 
95
111
  def spark_path
96
- run_in_slave("env | grep SPARK_HOME").strip.split("=").try(:last)
112
+ run_in_main("env | grep SPARK_HOME").strip.split("=").try(:last)
97
113
  end
98
114
 
99
- def slave_homedir()
100
- run_in_slave("env | grep -w HOME").split("=").try(:last)
115
+ def main_homedir()
116
+ run_in_main("env | grep -w HOME").split("=").try(:last)
101
117
  end
102
118
 
103
- def slave_env
104
- run_in_slave("env").split("\n").map{|x| x.split("=")}
119
+ def main_env
120
+ run_in_main("env").split("\n").map{|x| x.split("=")}
105
121
  end
106
122
 
107
- def run_in_slave(command)
108
- `docker exec -i #{@slave_id} sh -c '#{command}'`.strip
109
- end
123
# Runs `command` synchronously in the main container via its HTTP command
# endpoint and returns the command's output.
#
# command - shell command string; sent with use_sh: true.
#
# Returns the non-blank output lines joined with "\n". The line containing
# "cnvrg-exit-code" is treated as the status marker and is not part of the
# output. Returns "" when the endpoint does not answer 200, when the marker
# reports a non-zero exit code, or when any StandardError is raised (the
# error is logged through Cnvrg::Logger.log_error).
def run_in_main(command)
  payload = {cmd: command, async: false, use_sh: true}
  # Longer timeout than the connection default: this call is synchronous.
  conn = Cnvrg::Helpers::Executer.get_main_conn(timeout: 180)
  response = conn.post('command', payload.to_json)
  status = response.to_hash[:status]
  if status.to_i != 200
    Cnvrg::Logger.log_info("Got back bad status #{status}")
    return ""
  end

  output = []
  response.body.split("\n").each do |line|
    next if line.strip.empty?
    if line.include?("cnvrg-exit-code")
      next if line.split("=")[1].to_i.zero?
      Cnvrg::Logger.log_info("failed to run find command #{command} on main")
      return ""
    end
    output << line
  end
  output.join("\n")
rescue => e
  Cnvrg::Logger.log_error(e)
  ""
end
111
151
 
112
152
  def poll
113
153
  resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
@@ -124,6 +164,7 @@ class Cnvrg::Helpers::Executer
124
164
  success = false
125
165
  puts("Agent started, connecting to #{Cnvrg::API.get_api}")
126
166
  STDOUT.flush
167
+ wait_for_main
127
168
  while !success and retries < 100
128
169
  begin
129
170
  resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
@@ -154,14 +195,75 @@ class Cnvrg::Helpers::Executer
154
195
  end
155
196
  end
156
197
 
198
# Watchdog loop: calls check_main_alive, sleeps @check_main_every seconds,
# and repeats forever. Meant to be run in its own thread.
#
# A StandardError raised by a single probe is logged and the loop carries
# on; without this, one failed probe would end the thread and no later
# check would ever run. Non-StandardError exits (e.g. SystemExit raised by
# `exit`) are not intercepted.
def check_main_is_working_thread
  loop do
    begin
      check_main_alive
    rescue StandardError => e
      Cnvrg::Logger.log_error(e)
    end
    sleep(@check_main_every)
  end
end
204
+
157
205
  def main_thread
158
206
  init
159
207
  Thread.new do
160
208
  polling_thread
161
209
  end
210
+ Thread.new do
211
+ check_main_is_working_thread
212
+ end
162
213
  execute_cmds
163
214
  end
164
215
 
216
# Blocks until the main container's HTTP endpoint reports ready.
#
# First calls copy_file_to_main and start_tiny_if_missing, then polls
# GET 'readiness' on a fresh connection every 0.1s. On the first 200 answer
# the response body is stored (as an Integer) in @main_start_time and the
# method returns. Non-200 answers are retried silently; exceptions are
# printed to stdout and retried. There is no upper bound on the wait.
def wait_for_main
  copy_file_to_main
  start_tiny_if_missing
  puts("Waiting for main container")
  STDOUT.flush
  loop do
    begin
      response = Cnvrg::Helpers::Executer.get_main_conn.get('readiness')
      if response.to_hash[:status].to_i == 200
        puts("Client container is ready")
        STDOUT.flush
        @main_start_time = response.body.to_i
        break
      end
    rescue => e
      puts("Failed to connect to main")
      puts(e)
      STDOUT.flush
    end
    # Reached after a non-200 answer or a failed attempt: back off briefly.
    sleep(0.1)
  end
end
244
+
245
# Publishes the helper binary and scripts under /conf and then creates the
# /conf/tiny-ready marker file.
#
# Copies /cnvrg-tiny to /conf/tiny and the contents of /scripts into
# /conf/scripts-bin. The "/scripts/." source form copies the directory's
# contents, so running this again when /conf/scripts-bin already exists
# refreshes it in place instead of nesting a /conf/scripts-bin/scripts copy.
#
# Best effort: any StandardError is logged through Cnvrg::Logger.log_error
# and not re-raised (the marker is then not created).
def copy_file_to_main
  FileUtils.cp("/cnvrg-tiny", "/conf/tiny")
  FileUtils.cp_r("/scripts/.", "/conf/scripts-bin")
  FileUtils.touch("/conf/tiny-ready")
rescue => e
  Cnvrg::Logger.log_error(e)
end
254
+
255
# Launches /conf/tiny inside the main container when MAIN_CONTAINER_PORT is
# not set in the environment; does nothing when it is set.
#
# Looks up the container ids via `containers` (storing them in @agent_id and
# @main_id), then forks a child that runs `docker exec` against the main
# container and detaches from it so the agent does not wait for it.
#
# If no main container id was found there is nothing to exec into, so this
# is logged and the method returns without forking (instead of running a
# malformed docker command and reporting success).
def start_tiny_if_missing
  return unless ENV['MAIN_CONTAINER_PORT'].blank?
  Cnvrg::Logger.log_info("Tiny not found, starting it")
  @agent_id, @main_id = containers
  if @main_id.blank?
    Cnvrg::Logger.log_info("Can't start tiny: main container id not found")
    return
  end
  pid = Process.fork do
    Cnvrg::Logger.log_info("running docker exec -i #{@main_id} sh -c '/conf/tiny'")
    `docker exec -i #{@main_id} sh -c '/conf/tiny'`.strip
  end
  Process.detach(pid)
  Cnvrg::Logger.log_info("Tiny started and detached")
end
266
+
165
267
  def execute_cmds
166
268
  pids = []
167
269
  while true
@@ -215,6 +317,23 @@ class Cnvrg::Helpers::Executer
215
317
  Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
216
318
  end
217
319
 
320
# Detects a restart of the main container and exits the agent when one is
# found.
#
# Compares the value returned by GET 'readiness' (response body as an
# Integer, or 0 when the answer is not 200) with @main_start_time, the value
# recorded at startup. Any difference is reported and the process exits with
# status 1.
#
# Does nothing until @main_start_time has been set. A probe that raises
# (e.g. the endpoint is momentarily unreachable) is logged and skipped, so
# the caller's periodic loop stays alive and can still notice the restart
# on a later probe.
def check_main_alive
  # Dont check before we got first response
  return if @main_start_time.nil?

  begin
    response = Cnvrg::Helpers::Executer.get_main_conn.get('readiness')
  rescue StandardError => e
    Cnvrg::Logger.log_error(e)
    return
  end

  main_start_time = response.to_hash[:status].to_i == 200 ? response.body.to_i : 0
  return if main_start_time == @main_start_time

  puts("Found that main restarted, restarting agent")
  Cnvrg::Logger.log_info("Found that main restarted, restarting agent")
  exit(1)
end
336
+
218
337
  def get_pod_events(pod_name)
219
338
  return if pod_name.blank?
220
339
  `kubectl get event --field-selector involvedObject.name=#{pod_name} -o json`
@@ -224,4 +343,27 @@ class Cnvrg::Helpers::Executer
224
343
  return if node_name.blank?
225
344
  `kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
226
345
  end
346
+
347
# Base URL of the main container's HTTP endpoint.
#
# The port is always Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT. The host
# is "localhost" when either CNVRG_COMPUTE_CLUSTER or KUBERNETES_SERVICE_HOST
# is set; otherwise it is "main" when MAIN_CONTAINER_PORT is set in the
# environment and "slave" when it is not.
def self.main_container_url
  port = Cnvrg::Helpers::Executer::MAIN_CONTAINER_PORT
  outside_cluster = ENV["CNVRG_COMPUTE_CLUSTER"].blank? && ENV["KUBERNETES_SERVICE_HOST"].blank?
  host =
    if !outside_cluster
      "localhost"
    elsif ENV["MAIN_CONTAINER_PORT"].blank?
      "slave"
    else
      "main"
    end
  "http://#{host}:#{port}"
end
359
+
360
# Builds a Faraday connection to the main container's HTTP endpoint.
#
# timeout      - value assigned to the connection's `options.timeout`.
# open_timeout - value assigned to the connection's `options.open_timeout`.
#
# The connection targets Cnvrg::Helpers::Executer.main_container_url and
# sends a JSON Content-Type header. Returns the configured connection.
def self.get_main_conn(timeout: 4, open_timeout: 1)
  Faraday.new(
    url: Cnvrg::Helpers::Executer.main_container_url,
    headers: {'Content-Type' => 'application/json'}
  ).tap do |connection|
    connection.options.timeout = timeout
    connection.options.open_timeout = open_timeout
  end
end
227
369
  end
data/lib/cnvrg/job_ssh.rb CHANGED
@@ -5,14 +5,18 @@ module Cnvrg
5
5
  method_option :port, :type => :numeric, :aliases => ["-p", "--port"], :desc => "Port to bind into", :default => 2222
6
6
  method_option :username, :type => :string, :aliases => ["-u", "--username"], :desc => "Job container user name", :default => nil
7
7
  method_option :password, :type => :string, :aliases => ["--password"], :desc =>"Job Conatainer user name, will be set by cnvrg", :default => nil
8
+ method_option :no_auth, :type => :boolean, :aliases => ["--no-auth"], :default => false
9
+ method_option :internal_port, :type => :numeric, :aliases => ["--internal-port"], :desc =>"Internal port in the pod for the ssh", :default => 22
8
10
  method_option :kubeconfig, :type => :string, :aliases => ["--kubeconfig"], :desc => "Path to kubeconfig, if blank default config will be used", :default => nil
9
11
  def start(job_id)
12
+ no_auth = options["no_auth"]
10
13
  Cnvrg::CLI.new.log_start(__method__, args, options)
11
14
  @job_ssh = ConnectJobSsh.new(job_id)
12
- @job_ssh.start(options['username'], options['password'])
15
+ @job_ssh.start(options['username'], options['password'], no_auth, port: options['internal_port'])
13
16
  pod_name = nil
14
17
  namespace = "cnvrg"
15
18
  ssh_ready = false
19
+ internal_port = options['internal_port']
16
20
  while not ssh_ready
17
21
  resp = @job_ssh.status()
18
22
  status = resp["ssh_status"]
@@ -26,13 +30,14 @@ module Cnvrg
26
30
  username = resp["username"]
27
31
  pod_name = resp["pod_name"]
28
32
  namespace = resp["namespace"]
33
+ internal_port = resp["port"] || internal_port
29
34
  ssh_ready = true
30
35
  else
31
36
  puts("Failed to start ssh")
32
37
  break
33
38
  end
34
39
  end
35
- if pod_name.blank? or password.blank? or username.blank?
40
+ if pod_name.blank? or (password.blank? and !no_auth) or username.blank?
36
41
  puts("Failed to get required params")
37
42
  return
38
43
  end
@@ -41,8 +46,8 @@ module Cnvrg
41
46
  puts("host: 127.0.0.1")
42
47
  puts("port: #{options["port"]}")
43
48
  puts("username: #{username}")
44
- puts("password: #{password}")
45
- @job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace)
49
+ puts("password: #{password}") unless no_auth
50
+ @job_ssh.run_portforward_command(pod_name, options["port"], options["kubeconfig"], namespace, internal_port)
46
51
  end
47
52
  end
48
53
  end
data/lib/cnvrg/project.rb CHANGED
@@ -328,15 +328,21 @@ module Cnvrg
328
328
  end
329
329
 
330
330
  def get_storage_client
331
- response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
332
- if Cnvrg::CLI.is_response_success(response, false)
333
-
331
+ client_params = nil
332
+ i = 0
333
+ begin
334
+ response = Cnvrg::API.request("users/#{@owner}/projects/#{@slug}/client", 'GET')
335
+ unless Cnvrg::CLI.is_response_success(response, false)
336
+ raise StandardError.new("Can't find project credentials")
337
+ end
334
338
  client_params = response['client']
335
- else
336
-
339
+ rescue StandardError
340
+ i += 1
341
+ sleep(5 * i)
342
+ retry if i < 10
337
343
  client_params = get_storage_client_fallback
338
344
  end
339
-
345
+ raise StandardError.new("Can't find project credentials") unless client_params
340
346
  Cnvrg::Downloader::Client.factory(client_params)
341
347
  end
342
348
 
@@ -378,14 +384,18 @@ module Cnvrg
378
384
  []
379
385
  end
380
386
 
381
- def generate_output_dir(output_dir)
387
+ def generate_output_dir(output_dir, local: false)
382
388
  Cnvrg::Logger.log_info("Generating output dir for #{output_dir}")
383
389
  upload_list = []
390
+ list = []
384
391
  list = Dir.glob("/cnvrg/#{output_dir}/**/*", File::FNM_DOTMATCH)
392
+ if local
393
+ list += Dir.glob("#{output_dir}/**/*", File::FNM_DOTMATCH)
394
+ end
395
+ list.uniq!
385
396
  Parallel.map(list, in_threads: IDXParallelThreads) do |e|
386
397
  next if e.end_with? "/."
387
398
  if File.directory? e
388
-
389
399
  upload_list << e + "/"
390
400
  else
391
401
  upload_list << e
@@ -447,6 +457,10 @@ module Cnvrg
447
457
  if list_ignore_new.include? label
448
458
  next
449
459
  end
460
+ if File.symlink?(e)
461
+ Cnvrg::Logger.log_info("Skipping symlink #{e}")
462
+ next
463
+ end
450
464
  if File.directory? e
451
465
  dir_name = (label.ends_with? "/") ? label : (label + "/")
452
466
  tree_idx[dir_name] = nil
@@ -647,7 +661,11 @@ module Cnvrg
647
661
 
648
662
  def fetch_webapp_slugs(webapp_slug, slugs: nil)
649
663
  response = Cnvrg::API_V2.request("#{self.owner}/projects/#{self.slug}/webapps/#{webapp_slug}" , 'GET')
650
- return response["experiments"]
664
+
665
+ if response.key?("experiments")
666
+ return response["experiments"]
667
+ end
668
+ return response["data"]["attributes"]["experiments"]
651
669
  rescue
652
670
  slugs
653
671
  end
@@ -699,8 +717,11 @@ module Cnvrg
699
717
  res = JSON.parse(resp['result']) rescue nil
700
718
  return if res.blank?
701
719
  config = self.get_config
702
- config[:is_git] = res['git']
703
720
  config[:project_name] = res['title']
721
+ config[:project_slug] = @slug
722
+ config[:owner] = @owner
723
+ config[:git] = res['git'] || false
724
+ config[:is_git] = res['git'] || false
704
725
  self.set_config(config)
705
726
  end
706
727
 
data/lib/cnvrg/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.29'
3
- end
2
+ VERSION = '2.0.11'
3
+ end
metadata CHANGED
@@ -1,16 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.29
4
+ version: 2.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
8
8
  - Leah Kolben
9
9
  - Omer Shacham
10
- autorequire:
10
+ autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2021-03-30 00:00:00.000000000 Z
13
+ date: 2021-10-21 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler
@@ -96,6 +96,26 @@ dependencies:
96
96
  - - ">="
97
97
  - !ruby/object:Gem::Version
98
98
  version: '0'
99
+ - !ruby/object:Gem::Dependency
100
+ name: ffi
101
+ requirement: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - "~>"
104
+ - !ruby/object:Gem::Version
105
+ version: '1.9'
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: 1.9.10
109
+ type: :runtime
110
+ prerelease: false
111
+ version_requirements: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - "~>"
114
+ - !ruby/object:Gem::Version
115
+ version: '1.9'
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: 1.9.10
99
119
  - !ruby/object:Gem::Dependency
100
120
  name: mimemagic
101
121
  requirement: !ruby/object:Gem::Requirement
@@ -302,6 +322,20 @@ dependencies:
302
322
  - - "~>"
303
323
  - !ruby/object:Gem::Version
304
324
  version: 0.1.1
325
+ - !ruby/object:Gem::Dependency
326
+ name: filewatch
327
+ requirement: !ruby/object:Gem::Requirement
328
+ requirements:
329
+ - - "~>"
330
+ - !ruby/object:Gem::Version
331
+ version: 0.9.0
332
+ type: :runtime
333
+ prerelease: false
334
+ version_requirements: !ruby/object:Gem::Requirement
335
+ requirements:
336
+ - - "~>"
337
+ - !ruby/object:Gem::Version
338
+ version: 0.9.0
305
339
  - !ruby/object:Gem::Dependency
306
340
  name: parallel
307
341
  requirement: !ruby/object:Gem::Requirement
@@ -439,7 +473,7 @@ files:
439
473
  homepage: https://cnvrg.io
440
474
  licenses: []
441
475
  metadata: {}
442
- post_install_message:
476
+ post_install_message:
443
477
  rdoc_options: []
444
478
  require_paths:
445
479
  - lib
@@ -454,8 +488,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
454
488
  - !ruby/object:Gem::Version
455
489
  version: '0'
456
490
  requirements: []
457
- rubygems_version: 3.1.2
458
- signing_key:
491
+ rubygems_version: 3.2.22
492
+ signing_key:
459
493
  specification_version: 4
460
494
  summary: A CLI tool for interacting with cnvrg.io.
461
495
  test_files: []