cnvrg 1.11.6 → 1.11.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6f3169f1879068224c90bee508cc59f70ea0511ddf035121f68fa3903a179cb6
4
- data.tar.gz: '008d3c425851891ecc80363c141636c54a9ed3abcc3fa80083f5ef190a7bcba8'
3
+ metadata.gz: cc3f80382d5e1ebdddea819a74e92cd00806a273dfd4d58c484e2a91a7f86932
4
+ data.tar.gz: cfb22dc8d1548b7c6a1f0af0e588e8c03a62bcad67b06fc8c1573116df7db31b
5
5
  SHA512:
6
- metadata.gz: a3b9ad00997260f78fbdd3b492676847d85102ba2e87d6c54d64a8f0703250e87f01cd0346b797355f1206ff0b2c5dd2c1e2b70bbb50ec03dac2ff1a6f2a4c61
7
- data.tar.gz: 87beff960a61713abab357b95e890c6e247fc3605f2cea7fed5767d259a728448a3bdfcd2e810833b4632f9794efc3e043e773ff600d47abf6d49d4792bc9953
6
+ metadata.gz: 3096470b6f4aa425b2804e12b1db724c33700d209387475fbee16977c89ac75d4794f3eea304de454936fca93b5a24df8ba57391f05490fc076abc2c2f3b2a20
7
+ data.tar.gz: dd3dfd71604961c548d197c72cda3a6dd586103e19d38dabc05cc1af2e8262d84bab4126dfe93fe46f920c8fb345508c6626c42b212a7ad5a02b6cca0893c0d2
@@ -65,6 +65,16 @@ module Cnvrg
65
65
  response = conn.get "#{resource}", data
66
66
  success = true
67
67
  Cnvrg::API.parse_version(response)
68
+ if response.to_hash[:status].to_i != 200
69
+ Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
70
+ end
71
+ if [503, 502, 429].include?(response.to_hash[:status].to_i)
72
+ Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
73
+ success = false
74
+ sleep(5 * retries)
75
+ retries +=1
76
+ next
77
+ end
68
78
  rescue => e
69
79
  Cnvrg::Logger.log_error(e)
70
80
  sleep(5)
@@ -95,11 +105,20 @@ module Cnvrg
95
105
  response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
96
106
  success = true
97
107
  Cnvrg::API.parse_version(response)
98
-
108
+ if response.to_hash[:status].to_i != 200
109
+ Cnvrg::Logger.log_info("Got back bad status #{response.to_hash[:status]}")
110
+ end
111
+ if [503, 502, 429].include?(response.to_hash[:status].to_i)
112
+ Cnvrg::Logger.log_info("Got back status #{response.to_hash[:status]}, will retry in #{5 * retries} seconds")
113
+ success = false
114
+ sleep(5 * retries)
115
+ retries +=1
116
+ next
117
+ end
99
118
  rescue => e
100
119
  Cnvrg::Logger.log_error(e)
101
- sleep(5)
102
- retries +=1
120
+ sleep(5)
121
+ retries +=1
103
122
  end
104
123
  end
105
124
  if !success
@@ -972,6 +972,7 @@ module Cnvrg
972
972
 
973
973
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
974
974
  dataset = Dataset.new(dataset_home)
975
+ downloader = dataset.get_storage_client
975
976
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
976
977
  parallel_options = {
977
978
  :progress => {
@@ -994,15 +995,20 @@ module Cnvrg
994
995
  relative_path_dir = relative_path_dir.join("/")
995
996
  abs_path = dataset_home + "/" + relative_path_dir
996
997
  abs_path = dataset_home if flatten
998
+ fullpath = abs_path + "/" + file_name
997
999
 
998
1000
  begin
999
- FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
1001
+ FileUtils.mkdir_p(abs_path) unless File.exist? (fullpath)
1000
1002
  rescue
1001
1003
  log_message("Could not create directory: #{abs_path}", Thor::Shell::Color::RED)
1002
1004
  exit(1)
1003
1005
  end
1004
1006
  begin
1005
- File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1007
+ unless File.exist?(fullpath)
1008
+ downloader.safe_operation("#{abs_path}/#{file_name}") do
1009
+ File.open(fullpath, "w") { |file| file.write open(f["url"]).read }
1010
+ end
1011
+ end
1006
1012
  rescue => e
1007
1013
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
1008
1014
  exit(1)
@@ -3062,6 +3068,7 @@ module Cnvrg
3062
3068
  method_option :notify_on_success, :type => :boolean, :aliases => ["-nos", "--notify_on_success"], :default => nil
3063
3069
  method_option :emails, :type => :string, :aliases => ["-es", "--emails"], :default => "", :desc => "additional emails to notify on success / or error, comma separated"
3064
3070
  method_option :wait, :type => :boolean, :aliases => ["-w", "--wait"], :default => false, :desc => "keep command session open until experiment finished to return exit status"
3071
+ method_option :debug, :type => :boolean, :aliases => ["--debug"], :default => true
3065
3072
 
3066
3073
  def run(*cmd)
3067
3074
  verify_logged_in(true)
@@ -3070,6 +3077,7 @@ module Cnvrg
3070
3077
  sync_before = options["sync_before"]
3071
3078
  sync_after = options["sync_after"]
3072
3079
  log = options["log"]
3080
+ debug = options["debug"]
3073
3081
  title = options["title"]
3074
3082
  commit = options["commit"] || nil
3075
3083
  email_notification = options["email_notification"]
@@ -3103,6 +3111,7 @@ module Cnvrg
3103
3111
  wait = false
3104
3112
  end
3105
3113
 
3114
+
3106
3115
  if !data.present? and data_query.present?
3107
3116
  log_message("Please provide data with data_query", Thor::Shell::Color::RED)
3108
3117
  exit(1)
@@ -3149,7 +3158,7 @@ module Cnvrg
3149
3158
  :image => image, :grid => grid, :data => data, :data_commit => data_commit, :ignore => ignore, :force => force, :sync_before_terminate => sync_before_terminate,
3150
3159
  :max_time => max_time,
3151
3160
  :periodic_sync => periodic_sync, :dataset_only_tree=> dataset_only_tree,
3152
- :output_dir=>output_dir, :data_query=>data_query, :git_commit =>git_commit, :git_branch=> git_branch,
3161
+ :output_dir=>output_dir, :data_query=>data_query, :git_commit =>git_commit, :git_branch=> git_branch, :debug => debug,
3153
3162
  :restart_if_stuck =>restart_if_stuck, :local_folders => local_folders, :datasets => datasets, :prerun => prerun, :requirements => requirements,
3154
3163
  :email_notification_error => email_notification_error, :email_notification_success => email_notification_success, :emails => emails, :wait => wait
3155
3164
 
@@ -3475,6 +3484,7 @@ module Cnvrg
3475
3484
  method_option :email_notification_success, :type => :boolean, :aliases => ["-nos", "--email_notification_success"], :default => true
3476
3485
  method_option :emails, :type => :string, :aliases => ["-es", "--emails"], :default => "", :desc => "additional emails to notify on success / or error"
3477
3486
  method_option :wait, :type => :boolean, :aliases => ["-w", "--wait"], :default => false, :desc => "keep command session open until experiment finished to return exit status"
3487
+ method_option :debug, :type => :boolean, :aliases => ["--debug"], :default => true
3478
3488
 
3479
3489
  def exec_remote(*cmd)
3480
3490
 
@@ -3492,6 +3502,7 @@ module Cnvrg
3492
3502
  data_query = options["data_query"] || nil
3493
3503
  sync_before = options["sync_before"]
3494
3504
  force = options["force"]
3505
+ debug = options["debug"]
3495
3506
  prerun = options["prerun"]
3496
3507
  requirements = options["requirements"]
3497
3508
  email_notification_error = options["email_notification_error"]
@@ -3535,8 +3546,8 @@ module Cnvrg
3535
3546
  local_folders_options = options["local_folders"]
3536
3547
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3537
3548
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3538
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3539
- "requirements", "prerun", "email_notification_error", "email_notification_success", "emails", "wait")
3549
+ "data_query", "git_commit","git_branch","restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3550
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails", "wait","debug")
3540
3551
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3541
3552
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3542
3553
  commit_to_run = options["commit"] || nil
@@ -3579,20 +3590,6 @@ module Cnvrg
3579
3590
  end
3580
3591
  end
3581
3592
 
3582
- if command.include? "'"
3583
- oc = command.to_enum(:scan, /'/).map {Regexp.last_match}
3584
- pairs = oc.enum_for(:each_slice, 2).to_a
3585
- pairs.each_with_index do |p, i|
3586
- add = 0
3587
- if i != 0
3588
- add = 2 * i
3589
- end
3590
- total_loc = command[p[0].offset(0)[0] + add..p[1].offset(0)[0] + add]
3591
- command[p[0].offset(0)[0] + add..p[1].offset(0)[0] + add] = "\"#{total_loc}\""
3592
- end
3593
-
3594
-
3595
- end
3596
3593
  log_message("Running remote experiment", Thor::Shell::Color::BLUE)
3597
3594
  exp = Experiment.new(project.owner, project.slug)
3598
3595
  if forced_commit and (commit_to_run.nil? or commit_to_run.empty?)
@@ -3603,7 +3600,7 @@ module Cnvrg
3603
3600
 
3604
3601
  res = exp.exec_remote(command, commit_to_run, instance_type, image, schedule, local_timestamp, grid, path_to_cmd, data, data_commit,
3605
3602
  periodic_sync, sync_before_terminate, max_time, ds_sync_options,output_dir,
3606
- data_query, git_commit, git_branch, restart_if_stuck,local_folders_options, title, datasets, prerun: prerun, requirements: requirements, recurring: recurring,
3603
+ data_query, git_commit, git_branch,debug, restart_if_stuck,local_folders_options, title, datasets, prerun: prerun, requirements: requirements, recurring: recurring,
3607
3604
  email_notification_error: email_notification_error, email_notification_success: email_notification_success, emails_to_notify: emails)
3608
3605
  if Cnvrg::CLI.is_response_success(res)
3609
3606
  check = Helpers.checkmark()
@@ -3655,6 +3652,7 @@ module Cnvrg
3655
3652
  sleep 3
3656
3653
  tries += 1
3657
3654
  retry if tries <= 5
3655
+ exit(1)
3658
3656
  end
3659
3657
  end
3660
3658
  end
@@ -11,6 +11,7 @@ module Cnvrg
11
11
 
12
12
  LARGE_FILE=1024*1024*5
13
13
  MULTIPART_SPLIT=10000000
14
+ RETRIES = ENV['UPLOAD_FILE_RETRIES'].try(:to_i) || 10
14
15
 
15
16
  attr_reader :base_resource
16
17
 
@@ -256,6 +257,7 @@ module Cnvrg
256
257
  end
257
258
 
258
259
  def delete_file_chunk(commit_sha1, regex_list, chunk_size, offset)
260
+ retry_count = 0
259
261
  begin
260
262
  resp = Cnvrg::API.request(
261
263
  @base_resource + "delete_files_by_chunk",
@@ -268,13 +270,19 @@ module Cnvrg
268
270
  }
269
271
  )
270
272
  unless Cnvrg::CLI.is_response_success(resp, false)
271
- Cnvrg::Logger.log_method(bind: binding)
272
273
  raise Exception.new("Got an error message from server, #{resp.try(:fetch, "message")}")
273
274
  end
274
275
  return resp["total_changes"]
275
276
  rescue => e
276
277
  Cnvrg::Logger.log_method(bind: binding)
277
278
  Cnvrg::Logger.log_error(e)
279
+
280
+ if retry_count < RETRIES
281
+ sleep(2**retry_count) # Exponential backoff
282
+ retry_count += 1
283
+ retry
284
+ end
285
+
278
286
  raise e
279
287
  end
280
288
  end
@@ -1338,7 +1346,7 @@ module Cnvrg
1338
1346
  # Cnvrg::Logger.log_info("Trying to download #{local_path} but its already exists, skipping..")
1339
1347
  # next
1340
1348
  # end
1341
- resp = @downloader.download(storage_path, local_path)
1349
+ resp = @downloader.safe_download(storage_path, local_path)
1342
1350
  Cnvrg::Logger.log_info("Download #{local_path} success resp: #{resp}")
1343
1351
  rescue => e
1344
1352
  Cnvrg::Logger.log_error(e)
@@ -14,11 +14,13 @@ module Cnvrg
14
14
  end
15
15
 
16
16
  def extract_key_iv(sts_path)
17
- count = 20
17
+ count = 0
18
18
  begin
19
19
  count += 1
20
20
  sts = open(sts_path, {ssl_verify_mode: 0}).read rescue nil
21
21
  rescue => e
22
+ backoff_time_seconds = backoff_time(count)
23
+ sleep backoff_time_seconds
22
24
  Cnvrg::Logger.log_error(e)
23
25
  retry if count <= 20
24
26
  raise StandardError.new("Cant access storage: #{e.message}")
@@ -30,10 +32,14 @@ module Cnvrg
30
32
  file.gsub(prefix, '').gsub(/^\/*/, '')
31
33
  end
32
34
 
33
- def download(storage_path, local_path)
35
+ def download(storage_path, local_path, decrypt: true)
34
36
  ### need to be implemented..
35
37
  end
36
38
 
39
+ def safe_download(storage_path, local_path, decrypt: true)
40
+ safe_operation(local_path) { self.download(storage_path, local_path, decrypt: decrypt) }
41
+ end
42
+
37
43
  def upload(storage_path, local_path)
38
44
  ### need to be implemented..
39
45
  end
@@ -51,17 +57,34 @@ module Cnvrg
51
57
  end
52
58
 
53
59
  def safe_upload(storage_path, local_path)
60
+ safe_operation(local_path) { self.upload(storage_path, local_path) }
61
+ end
62
+
63
+ def self.factory(params)
64
+ params = params.as_json
65
+ case params["storage"]
66
+ when 's3', 'minio'
67
+ return Cnvrg::Downloader::Clients::S3Client.new(sts_path: params["path_sts"], access_key: params["sts_a"], secret: params["sts_s"], session_token: params["sts_st"], region: params["region"], bucket: params["bucket"], encryption: params["encryption"], endpoint: params["endpoint"], storage: params["storage"])
68
+ when 'azure'
69
+ azure_params = params.symbolize_keys.slice(*[:storage_account_name, :storage_access_key, :container, :sts])
70
+ return Cnvrg::Downloader::Clients::AzureClient.new(**azure_params)
71
+ when 'gcp'
72
+ return Cnvrg::Downloader::Clients::GcpClient.new(project_id: params["project_id"], credentials: params["credentials"], bucket_name: params["bucket_name"], sts: params["sts"])
73
+ end
74
+ end
75
+
76
+ def safe_operation(local_path)
54
77
  n = 1
55
78
  error = nil
56
79
  while n <= RETRIES
57
80
  begin
58
- self.upload(storage_path, local_path)
81
+ yield
59
82
  error = nil
60
83
  break
61
84
  rescue => e
62
85
  backoff_time_seconds = backoff_time(n)
63
86
 
64
- message = "Got error: #{e.class.name} with message: #{e.message} while uploading a single file: #{local_path}, retry: #{n} of: #{RETRIES}"
87
+ message = "Got error: #{e.class.name} with message: #{e.message} while uploading / downloading a single file: #{local_path}, retry: #{n} of: #{RETRIES}"
65
88
  if n < RETRIES
66
89
  message += ", next retry in: #{backoff_time_seconds} seconds"
67
90
  else
@@ -79,19 +102,6 @@ module Cnvrg
79
102
  true
80
103
  end
81
104
 
82
- def self.factory(params)
83
- params = params.as_json
84
- case params["storage"]
85
- when 's3', 'minio'
86
- return Cnvrg::Downloader::Clients::S3Client.new(sts_path: params["path_sts"], access_key: params["sts_a"], secret: params["sts_s"], session_token: params["sts_st"], region: params["region"], bucket: params["bucket"], encryption: params["encryption"], endpoint: params["endpoint"], storage: params["storage"])
87
- when 'azure'
88
- azure_params = params.symbolize_keys.slice(*[:storage_account_name, :storage_access_key, :container, :sts])
89
- return Cnvrg::Downloader::Clients::AzureClient.new(**azure_params)
90
- when 'gcp'
91
- return Cnvrg::Downloader::Clients::GcpClient.new(project_id: params["project_id"], credentials: params["credentials"], bucket_name: params["bucket_name"], sts: params["sts"])
92
- end
93
- end
94
-
95
105
  private
96
106
 
97
107
  def random_number_milliseconds
@@ -31,7 +31,7 @@ module Cnvrg
31
31
  @tempfile = t
32
32
  end
33
33
 
34
- def download(storage_path, local_path)
34
+ def download(storage_path, local_path, decrypt: true)
35
35
  prepare_download(local_path)
36
36
  file = @bucket.file(decrypt(storage_path))
37
37
  file.download local_path
@@ -110,7 +110,7 @@ module Cnvrg
110
110
 
111
111
  def exec_remote(command, commit_to_run, instance_type, image_slug,schedule,local_timestamp, grid,path_to_cmd,data, data_commit,periodic_sync,
112
112
  sync_before_terminate, max_time, ds_sync_options=0,output_dir=nil,data_query=nil,
113
- git_commit=nil, git_branch=nil, restart_if_stuck=nil, local_folders=nil,title=nil, datasets=nil, prerun: true, requirements: true, recurring: nil,
113
+ git_commit=nil, git_branch=nil,debug=true, restart_if_stuck=nil, local_folders=nil,title=nil, datasets=nil, prerun: true, requirements: true, recurring: nil,
114
114
  email_notification_error: false, email_notification_success: false, emails_to_notify: nil)
115
115
  response = Cnvrg::API.request("users/#{@owner}/projects/#{@project_slug}/experiment/remote", 'POST', {command: command, image_slug: image_slug,
116
116
  commit_sha1: commit_to_run,
@@ -119,6 +119,7 @@ module Cnvrg
119
119
  local_timestamp:local_timestamp,
120
120
  datasets: datasets,
121
121
  grid: grid,
122
+ debug:debug,
122
123
  path_to_cmd:path_to_cmd,dataset_slug:data,
123
124
  dataset_commit: data_commit,max_time:max_time,
124
125
  periodic_sync:periodic_sync, sync_before_terminate:sync_before_terminate,
@@ -789,7 +789,7 @@ module Cnvrg
789
789
  end
790
790
  local_path = project_home+"/"+file_path
791
791
  storage_path = f["path"]
792
- @client.download(storage_path, local_path)
792
+ @client.safe_download(storage_path, local_path)
793
793
  progress.progress += 1 if progress.present?
794
794
  download_succ_count += 1
795
795
  rescue => e
@@ -962,7 +962,7 @@ module Cnvrg
962
962
 
963
963
  def download_file(file_path: '', key: '', iv: '', bucket: '', path: '', client: nil)
964
964
  local_path = @project_home+"/"+file_path
965
- @client.download(path, local_path)
965
+ @client.safe_download(path, local_path)
966
966
  end
967
967
 
968
968
  def delete(file)
@@ -75,9 +75,11 @@ class Cnvrg::Helpers::Agent
75
75
 
76
76
  def exec!
77
77
  log_internal("Command: #{@command} with slug: #{@slug} started!")
78
- if should_run?
78
+ if @command.blank?
79
+ @exit_status = 0
80
+ elsif should_run?
79
81
  send_logs(status: Status::STARTED)
80
- periodic_thread
82
+ periodic_thread_handle = periodic_thread
81
83
  execute_command
82
84
  else
83
85
  @exit_status = 127
@@ -86,6 +88,9 @@ class Cnvrg::Helpers::Agent
86
88
  finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
87
89
  log_internal(finish_log)
88
90
  send_logs(exit_status: @exit_status, status: Status::FINISHED)
91
+ if periodic_thread_handle.present?
92
+ periodic_thread_handle.join
93
+ end
89
94
  end
90
95
 
91
96
  def get_logs_to_send
@@ -120,17 +120,31 @@ class Cnvrg::Helpers::Executer
120
120
  end
121
121
 
122
122
  def init
123
- resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
124
- machine_activity = resp["machine_activity"]
125
- Cnvrg::Logger.log_info("Got back machine activity #{machine_activity}")
126
- if machine_activity.present? and @machine_activity != machine_activity
127
- Cnvrg::Logger.log_info("Changing to machine activity #{machine_activity}")
128
- machine_activity_yml = {slug: machine_activity}
129
- File.open("/conf/.machine_activity.yml", "w+") {|f| f.write machine_activity_yml.to_yaml}
130
- @machine_activity = machine_activity
123
+ retries = 0
124
+ success = false
125
+ puts("Agent started, connecting to #{Cnvrg::API.get_api}")
126
+ STDOUT.flush
127
+ while !success and retries < 100
128
+ begin
129
+ resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
130
+ machine_activity = resp["machine_activity"]
131
+ success = true
132
+ puts("Connected to server")
133
+ STDOUT.flush
134
+ Cnvrg::Logger.log_info("Got back machine activity #{machine_activity}")
135
+ if machine_activity.present? and @machine_activity != machine_activity
136
+ Cnvrg::Logger.log_info("Changing to machine activity #{machine_activity}")
137
+ machine_activity_yml = {slug: machine_activity}
138
+ File.open("/conf/.machine_activity.yml", "w+") {|f| f.write machine_activity_yml.to_yaml}
139
+ @machine_activity = machine_activity
140
+ end
141
+ rescue => e
142
+ Cnvrg::Logger.log_error(e)
143
+ Cnvrg::Logger.info("Sleeping for #{5 * retries}")
144
+ sleep(5 * retries)
145
+ retries +=1
146
+ end
131
147
  end
132
- rescue => e
133
- Cnvrg::Logger.log_error(e)
134
148
  end
135
149
 
136
150
  def polling_thread
@@ -34,7 +34,7 @@ module Cnvrg
34
34
  @element.get_clone_chunk(commit: commit, chunk_size: params[:limit], offset: params[:offset])
35
35
  end
36
36
  action = Proc.new do |storage, local|
37
- @client.download(storage, local)
37
+ @client.safe_download(storage, local)
38
38
  end
39
39
 
40
40
  @stats = @element.get_stats
@@ -1,3 +1,3 @@
1
1
  module Cnvrg
2
- VERSION = '1.11.6'
2
+ VERSION = '1.11.12'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cnvrg
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.6
4
+ version: 1.11.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yochay Ettun
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2020-11-17 00:00:00.000000000 Z
13
+ date: 2021-01-06 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: bundler