cnvrg 1.6.36 → 1.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5e60c1cc7c0af133390dd106ee666db3787b6c995b54d5ef97edff3d073ff14
4
- data.tar.gz: 3d37f71642874898b131812fdeb1ca7be3b7eebddc3a8aa521245c31eaa47a0b
3
+ metadata.gz: 10a6bb8d2946d743e8dd1f609369d503c9bf44a9ba748e1e2dfb33df57444aa0
4
+ data.tar.gz: 382e9b28d7edb8856bcd12d5accabf31eb0f264055b4f652da94508a97458b3c
5
5
  SHA512:
6
- metadata.gz: dc82247f7d78186aa0b35084d83b0d4ae72608f0c765727ee57bebcec76d6bc9c84fad2795c7c7ed7ba6ac64921df49c9c45cd3536128b11b8a5b8203e8e0040
7
- data.tar.gz: 896d84a1615dd3a698c5e78ce92d7422fca9e6f21e1c7e9ca6011ab315437b39b2f3f5fb0790d683397e2ba6bdaaad4185156255e615e298471b50949201f9d2
6
+ metadata.gz: db580ef688cf3c3a1e3c95f62c3c520fa606a13ebc678fee420c312e89b7c2c62e39783d25a7962d1f1ba5fa439c968626164443c0828d4fae5604eb881794fd
7
+ data.tar.gz: 296974a98310ef9ba922124385723a33380cd6fd412d80d18cb3cb0a73b070a1a119d61ce26b8a4fe560e6f7a8b591fb222e3ba6ccc09c046d891a02436358bd
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, message: nil)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1251
1264
 
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1277
+
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1954,8 +2012,6 @@ module Cnvrg
1954
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1955
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1956
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1957
-
1958
-
1959
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1960
2016
  verify_logged_in(true)
1961
2017
  log_start(__method__, args, options)
@@ -1964,11 +2020,13 @@ module Cnvrg
1964
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1965
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1966
2022
  end
1967
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1968
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1969
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1970
2026
 
1971
2027
  end
2028
+
2029
+
1972
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1973
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1974
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2211,15 +2269,27 @@ module Cnvrg
2211
2269
  method_option :return_id, :type => :boolean, :aliases => ["-r", "--return_id"], :default => false
2212
2270
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2213
2271
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2272
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2214
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2215
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2216
2277
 
2217
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2218
2279
  begin
2219
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2220
2281
  verify_logged_in(true)
2221
2282
  log_start(__method__, args, options)
2222
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2223
2293
  commit_msg = options["message"]
2224
2294
  if commit_msg.nil? or commit_msg.empty?
2225
2295
  commit_msg = ""
@@ -2235,19 +2305,21 @@ module Cnvrg
2235
2305
  spec_files_to_upload = spec_files_to_upload.split(",")
2236
2306
  end
2237
2307
  if @project.is_git
2308
+ list = []
2238
2309
  git_output_dir = options["output_dir"] || output_dir
2239
2310
  if git_output_dir.present?
2240
2311
  if git_output_dir.ends_with? "/"
2241
2312
  git_output_dir = git_output_dir[0..-2]
2242
2313
  end
2243
2314
  list = @project.generate_output_dir(git_output_dir)
2244
- spec_files_to_upload = list
2245
- if spec_files_to_upload.blank?
2246
- log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2247
- return true
2248
- end
2249
- force = true
2250
2315
  end
2316
+ list += @project.generate_git_diff if options["git_diff"]
2317
+ spec_files_to_upload = list
2318
+ if spec_files_to_upload.blank?
2319
+ log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2320
+ return true
2321
+ end
2322
+ force = true
2251
2323
  end
2252
2324
 
2253
2325
  if ignore.nil? or ignore.empty?
@@ -2289,8 +2361,6 @@ module Cnvrg
2289
2361
  end
2290
2362
  update_count = 0
2291
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2292
- successful_updates = []
2293
- successful_deletions = []
2294
2364
  if options["verbose"]
2295
2365
  if update_total == 1
2296
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2310,8 +2380,11 @@ module Cnvrg
2310
2380
  end
2311
2381
  job_type = options['job_type'] || job_type
2312
2382
  job_slug = options['job_slug'] || job_slug
2313
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2314
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2315
2388
  # upload / update
2316
2389
  # delete
2317
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2322,32 +2395,30 @@ module Cnvrg
2322
2395
  :starting_at => 0,
2323
2396
  :total => (to_upload.size + deleted.size),
2324
2397
  :autofinish => true)
2325
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2326
2398
 
2327
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2328
2401
 
2329
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2330
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2331
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2332
2414
  raise StandardError.new("Cant end commit")
2333
2415
  end
2416
+
2334
2417
  # save idx
2335
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2336
2419
  @project.update_idx_with_commit!(commit_sha1)
2337
2420
  if options["verbose"]
2338
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2339
- if successful_updates.size > 0
2340
- successful_updates.flatten!
2341
- log_message("Updated:", Thor::Shell::Color::GREEN)
2342
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2343
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2344
- end
2345
- if successful_deletions.size > 0
2346
- successful_deletions.flatten!
2347
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2348
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2349
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2350
- end
2351
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2352
2423
  else
2353
2424
  if return_id
@@ -2372,9 +2443,13 @@ module Cnvrg
2372
2443
  if e.is_a? SignalException
2373
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2374
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2375
2448
  else
2376
2449
  log_message(error_message, Thor::Shell::Color::RED)
2377
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2378
2453
  end
2379
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2380
2455
  print_res = {
@@ -2892,6 +2967,11 @@ module Cnvrg
2892
2967
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
2893
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2894
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2970
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2971
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2972
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2973
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2974
+
2895
2975
  def sync(direct = true)
2896
2976
  verify_logged_in(true) if direct
2897
2977
  @project = Project.new(get_project_home)
@@ -2903,16 +2983,20 @@ module Cnvrg
2903
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2904
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2905
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2906
2988
  run_download = true
2907
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2908
2990
  run_download = false
2909
2991
  end
2910
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2911
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2912
2995
  end
2913
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2914
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2915
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2916
3000
  end
2917
3001
 
2918
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3057,6 +3141,8 @@ module Cnvrg
3057
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3058
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3059
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3060
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3061
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3062
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3064,6 +3150,7 @@ module Cnvrg
3064
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3065
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3066
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3067
3154
 
3068
3155
  def exec(*cmd)
3069
3156
  log = []
@@ -3130,8 +3217,12 @@ module Cnvrg
3130
3217
  end
3131
3218
  remote = options["remote"]
3132
3219
  if remote
3133
- docker_id = `cat /etc/hostname`
3134
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3135
3226
  end
3136
3227
  is_on_gpu = options["gpu"]
3137
3228
  start_commit = @project.last_local_commit
@@ -3141,9 +3232,9 @@ module Cnvrg
3141
3232
 
3142
3233
  platform = RUBY_PLATFORM
3143
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3144
3236
  begin
3145
- machine_activity = @exp.get_machine_activity(working_dir)
3146
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3147
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3148
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3149
3240
  unless @exp.slug.nil?
@@ -3161,7 +3252,7 @@ module Cnvrg
3161
3252
  begin
3162
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3163
3254
  if is_on_gpu
3164
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3165
3256
  stats['gpu_util'] = gu[0]
3166
3257
  stats['gpu'] = gu[1]
3167
3258
  end
@@ -3173,6 +3264,16 @@ module Cnvrg
3173
3264
  end
3174
3265
  end
3175
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3176
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3177
3278
  begin
3178
3279
  stdout.each do |line|
@@ -3187,7 +3288,7 @@ module Cnvrg
3187
3288
  puts line
3188
3289
  end
3189
3290
  log << cur_log
3190
- if log.size >= 5
3291
+ if log.size >= 1
3191
3292
  @exp.upload_temp_log(log) unless log.empty?
3192
3293
  log = []
3193
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3237,29 +3338,26 @@ module Cnvrg
3237
3338
  exp_success = false
3238
3339
  end
3239
3340
 
3240
- if sync_after
3241
- @exp.job_log(["Syncing Experiment"])
3242
- # Sync after run
3243
- if @project.is_git
3244
- output_dir = output_dir || @exp.output_dir
3245
- if output_dir.present?
3246
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3247
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3248
- end
3249
- else
3250
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3251
-
3252
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3253
3348
  end
3254
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3255
3351
  end
3352
+ end
3353
+
3256
3354
  end_commit = @project.last_local_commit
3257
3355
  if end_commit.present?
3258
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3259
3357
  end
3260
3358
 
3261
3359
  # log_thread.join
3262
- stats_thread.join
3360
+ stats_thread.join
3263
3361
 
3264
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3265
3363
 
@@ -3407,8 +3505,8 @@ module Cnvrg
3407
3505
  local_folders_options = options["local_folders"]
3408
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3409
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3410
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3411
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3412
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3413
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3414
3512
  commit_to_run = options["commit"] || nil
@@ -4233,144 +4331,6 @@ module Cnvrg
4233
4331
 
4234
4332
  end
4235
4333
 
4236
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4237
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4238
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4239
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4240
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4241
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4242
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4243
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4244
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4245
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4246
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4247
-
4248
-
4249
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4250
-
4251
- def build_image(image_name)
4252
- begin
4253
- verify_logged_in(false)
4254
- log_start(__method__, args, options)
4255
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4256
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4257
- instance_type = get_instance_type(instances)
4258
- image_extend = options["image"]
4259
- public = options["public"]
4260
- base = options["base"]
4261
- python3 = options["python3"]
4262
- docker_path = options["docker_path"]
4263
- owner = CLI.get_owner
4264
- checks = Helpers.checkmark()
4265
- tar_path = nil
4266
- if !docker_path.nil? and !docker_path.empty?
4267
- docker_path = File.absolute_path(docker_path)
4268
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4269
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4270
- resp = create_docker_tar(docker_path, tar_path)
4271
- if !resp
4272
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4273
- FileUtils.rm_rf tar_path
4274
- exit(1)
4275
- end
4276
- files = Cnvrg::Files.new(owner, "")
4277
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4278
- if resp
4279
- end
4280
- else
4281
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4282
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4283
-
4284
- end
4285
-
4286
- if Cnvrg::CLI.is_response_success(resp, false)
4287
- image_slug = resp["result"]["slug"]
4288
- container = resp["result"]["machine_c"]
4289
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4290
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4291
- ssh = Ssh.new(resp)
4292
- if !ssh.is_ssh
4293
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4294
- Images.revoke_custom_new_image(owner, image_slug)
4295
- end
4296
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4297
- begin
4298
- logs = []
4299
-
4300
- while true
4301
- command = ask("$>")
4302
- logs << {time: Time.now,
4303
- message: command,
4304
- type: "stdout"
4305
- }
4306
- if command.eql? "quit"
4307
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4308
- break
4309
- end
4310
- res = ssh.exec_command(command)
4311
- begin
4312
- res_parsed = JSON.parse(res)
4313
- res = res_parsed.join(",")
4314
- end
4315
-
4316
- puts res
4317
- logs << {time: Time.now,
4318
- message: res,
4319
- type: "stdout"
4320
- }
4321
- logs.flatten!
4322
-
4323
- end
4324
-
4325
- rescue SignalException
4326
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4327
-
4328
- end
4329
- resp = Images.commit_custom_image(owner, image_slug, logs)
4330
- if Cnvrg::CLI.is_response_success(resp, false)
4331
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4332
- else
4333
- if image_slug
4334
- Images.revoke_custom_new_image(owner, image_slug)
4335
- end
4336
- if ssh
4337
- ssh.close_ssh()
4338
- end
4339
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4340
-
4341
- exit(1)
4342
- end
4343
- if ssh
4344
- ssh.close_ssh()
4345
- end
4346
-
4347
-
4348
- end
4349
- rescue => e
4350
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4351
-
4352
- log_error(e)
4353
- if image_slug
4354
- Images.revoke_custom_new_image(owner, image_slug)
4355
- end
4356
- if ssh
4357
- ssh.close_ssh()
4358
- end
4359
-
4360
-
4361
- rescue SignalException
4362
- if image_slug
4363
- Images.revoke_custom_new_image(owner, image_slug)
4364
- end
4365
- if ssh
4366
- ssh.close_ssh
4367
- end
4368
- say "\nAborting"
4369
- exit(1)
4370
- end
4371
-
4372
- end
4373
-
4374
4334
 
4375
4335
  desc 'build', 'run commands inside containers', :hide => true
4376
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4564,66 +4524,7 @@ module Cnvrg
4564
4524
  end
4565
4525
 
4566
4526
 
4567
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4568
-
4569
- def upload_image_old(image_id, is_public, is_base, *message)
4570
- verify_logged_in(true)
4571
- log_start(__method__, args, options)
4572
- image = Docker::Image.get(image_id)
4573
- project_home = get_project_home
4574
- @project = Project.new(project_home)
4575
- last_local_commit = @project.last_local_commit
4576
- image_name = @project.slug + "#{last_local_commit}"
4577
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4578
- owner = Cnvrg::CLI.get_owner()
4579
- if !message.nil? or !message.empty?
4580
- message = message.join(" ")
4581
- end
4582
-
4583
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4584
- image.save(path)
4585
-
4586
- begin
4587
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4588
- gzipRes = system("gzip -f #{path}")
4589
- if !gzipRes
4590
-
4591
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4592
- exit(1)
4593
- end
4594
- path = path + ".gz"
4595
- @files = Cnvrg::Files.new(owner, "")
4596
-
4597
- exit_status = $?.exitstatus
4598
- if exit_status == 0
4599
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4600
-
4601
- diff = container_changes(Dir.pwd)
4602
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4603
- if res
4604
- File.delete(path)
4605
- image_loc = is_project_with_docker(Dir.pwd)
4606
- image_loc.update_slug(res["result"]["id"])
4607
-
4608
- checks = Helpers.checkmark()
4609
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4610
- else
4611
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4612
-
4613
- end
4614
- else
4615
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4616
- exit(1)
4617
- end
4618
- rescue => e
4619
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4620
- log_error(e)
4621
- rescue SignalException
4622
4527
 
4623
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4624
- exit(1)
4625
- end
4626
- end
4627
4528
 
4628
4529
  desc '', '', :hide => true
4629
4530
 
@@ -4634,278 +4535,30 @@ module Cnvrg
4634
4535
 
4635
4536
  end
4636
4537
 
4637
- desc '', '', :hide => true
4638
-
4639
- def exec_container(container_id, *cmd)
4640
- container = Docker::Container.get(container_id)
4641
- container.start()
4642
- cnvrg_command = cmd.join(" ")
4643
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4644
- res = container.exec(command, tty: true, wait: 5400)[0]
4645
- say res
4646
- end
4647
-
4648
- desc '', '', :hide => true
4649
-
4650
- def port_container(container_id)
4651
- container = Docker::Container.get(container_id)
4652
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4653
- end
4654
-
4655
- desc '', '', :hide => true
4656
-
4657
- def tensor_port_container(container_id)
4658
- container = Docker::Container.get(container_id)
4659
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4660
- end
4661
-
4662
- desc '', '', :hide => true
4663
-
4664
- def stop_container(container_id)
4665
- container = Docker::Container.get(container_id)
4666
- container.stop()
4667
- container.remove()
4668
-
4669
- end
4670
-
4671
- desc '', '', :hide => true
4672
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4673
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4674
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4675
-
4676
-
4677
- def config_remote(image_name, port = 7654, tensport = 6006)
4678
- local_images = Docker::Image.all
4679
-
4680
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4681
- if docker_image_local.empty?
4682
- say "no image"
4683
- exit(1)
4684
- end
4685
-
4686
- begin
4687
- login_content = options["login"]
4688
- app_dir = options["app_dir"]
4689
- cmd = options["cmd"]
4690
- volume_from = options["volume"]
4691
-
4692
- image_settings = {
4693
- 'Image' => "#{image_name}:latest",
4694
-
4695
- 'Cmd' => cmd,
4696
- 'WorkingDir' => app_dir,
4697
- 'ExposedPorts' => {
4698
- '8888/tcp' => {},
4699
- },
4700
- 'HostConfig' => {
4701
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4702
- 'PortBindings' => {
4703
- '8888/tcp' => [
4704
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4705
- ],
4706
- '6006/tcp' => [
4707
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4708
- ],
4709
- },
4710
- },
4711
- }
4712
- container = Docker::Container.create(image_settings)
4713
- container.start()
4714
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4715
- container.exec(command, tty: true)
4716
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4717
- # container.exec(command, tty: true)
4718
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4719
- # container.exec(command, tty: true)
4720
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4721
- container.exec(command, tty: true)
4722
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4723
- container.exec(command, tty: true)
4724
- say "#{container.id}:#{port}##{tensport}"
4725
- rescue => e
4726
- puts e
4727
- if e.message.include? "is not running"
4728
- return config_remote(image_name, port - 1, tensport - 1)
4729
- end
4730
-
4731
- if container
4732
- container.kill()
4733
- end
4734
- return false
4735
- end
4736
- end
4737
-
4738
-
4739
- desc '', '', :hide => true
4740
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4741
-
4742
- def config_netrc(container)
4743
-
4744
- login_content = options["login"]
4745
-
4746
- container = Docker::Container.get(container)
4747
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4748
- container.exec(command, tty: true)
4749
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4750
- container.exec(command, tty: true)
4751
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- say "OK"
4754
-
4755
- end
4756
-
4757
- desc '', '', :hide => true
4758
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4759
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4760
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4761
-
4762
-
4763
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4764
- local_images = Docker::Image.all
4765
-
4766
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4767
- if docker_image_local.empty?
4768
- say "no image"
4769
- exit(1)
4770
- end
4771
-
4772
- begin
4773
- login_content = options["login"]
4774
- app_dir = options["app_dir"]
4775
- cmd = options["cmd"]
4776
-
4777
- # image_settings = {
4778
- # 'Image' => "#{image_name}:latest",
4779
- # 'User' => 'ds',
4780
- # 'Cmd' => cmd,
4781
- # 'WorkingDir' => app_dir,
4782
- # 'ExposedPorts' => {
4783
- # '8888/tcp' => {},
4784
- # },
4785
- # 'HostConfig' => {
4786
- # 'PortBindings' => {
4787
- # '8888/tcp' => [
4788
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4789
- # ],
4790
- # '6006/tcp' => [
4791
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4792
- # ],
4793
- # },
4794
- # },
4795
- # }
4796
-
4797
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4798
- container_id = container_id.gsub("\n", "")
4799
- container = Docker::Container.get(container_id)
4800
- # container.start()
4801
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4802
- container.exec(command, tty: true)
4803
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4804
- container.exec(command, tty: true)
4805
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- say "#{container.id}:#{port}##{tensport}"
4808
- rescue => e
4809
- if e.message.include? "is not running"
4810
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4811
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4812
- end
4813
-
4814
- if container
4815
- container.kill()
4816
- end
4817
- return false
4818
- end
4819
- end
4820
-
4821
- desc '', '', :hide => true
4822
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4823
-
4824
- def config_flask_remote(image_name, port = 80)
4825
- local_images = Docker::Image.all
4826
-
4827
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4828
- if docker_image_local.empty?
4829
- say "no image"
4830
- exit(1)
4831
- end
4832
-
4833
- begin
4834
- login_content = options["login"]
4835
- image_settings = {
4836
- 'Image' => "#{image_name}:latest",
4837
- 'User' => 'ds',
4838
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4839
- 'WorkingDir' => '/home/ds/app',
4840
- 'ExposedPorts' => {
4841
- '80/tcp' => {},
4842
- },
4843
- 'HostConfig' => {
4844
- 'PortBindings' => {
4845
- '80/tcp' => [
4846
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4847
- ],
4848
- },
4849
- },
4850
- }
4851
- container = Docker::Container.create(image_settings)
4852
- container.start()
4853
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4854
- container.exec(command, tty: true)
4855
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4856
- container.exec(command, tty: true)
4857
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- say "#{container.id}:#{port}"
4860
- rescue => e
4861
- pus e
4862
- if e.message.include? "is not running"
4863
- return "port is taken"
4864
- end
4865
- puts "error"
4866
- if container
4867
- container.kill()
4538
+ desc 'Collect and send job utilization', '', :hide => true
4539
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
4540
+ method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
+ def get_utilization()
4542
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4543
+ docker_id = options["docker_id"]
4544
+ while true do
4545
+ sleep 30
4546
+ begin
4547
+ stats = usage_metrics_in_docker(docker_id)
4548
+ if options["is_on_gpu"]
4549
+ gu = gpu_util(take_from_docker: true, docker_id: docker_id)
4550
+ stats['gpu_util'] = gu[0]
4551
+ stats['gpu'] = gu[1]
4552
+ end
4553
+ stats['docker_id'] = docker_id
4554
+ @exp.send_machine_stats [stats] unless stats.empty?
4555
+ rescue => e
4556
+ log_error(e)
4557
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4868
4558
  end
4869
- return false
4870
4559
  end
4871
4560
  end
4872
4561
 
4873
- desc '', '', :hide => true
4874
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4875
-
4876
- def config_flask_remote_gpu(image_name, port = 80)
4877
- local_images = Docker::Image.all
4878
-
4879
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4880
- if docker_image_local.empty?
4881
- say "no image"
4882
- exit(1)
4883
- end
4884
-
4885
- begin
4886
- login_content = options["login"]
4887
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4888
- container_id = container_id.gsub("\n", "")
4889
- container = Docker::Container.get(container_id)
4890
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4891
- container.exec(command, tty: true)
4892
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4893
- container.exec(command, tty: true)
4894
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- say "#{container.id}:#{port}"
4897
- rescue => e
4898
- puts e
4899
- if e.message.include? "is not running"
4900
- return "port is taken"
4901
- end
4902
- puts "error"
4903
- if container
4904
- container.kill()
4905
- end
4906
- return false
4907
- end
4908
- end
4909
4562
 
4910
4563
  desc '', '', :hide => true
4911
4564
 
@@ -4931,39 +4584,10 @@ module Cnvrg
4931
4584
 
4932
4585
  end
4933
4586
 
4934
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4935
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4936
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4937
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4938
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4939
- def upload_image(image_name,image_path)
4940
- begin
4941
- verify_logged_in(false)
4942
- log_start(__method__, args, options)
4943
-
4944
- @image = Cnvrg::Images.new()
4945
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4946
- workdir = options[:workdir]
4947
- description = options[:description]
4948
- user = options[:user]
4949
- is_gpu = options[:gpu]
4950
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4951
- if res["status"] == 200
4952
- image_slug = res["id"]
4953
- owner = CLI.get_owner
4954
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4955
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4956
-
4957
-
4958
- else
4959
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4960
-
4961
- end
4962
- rescue => e
4963
- log_error(e)
4964
- end
4965
-
4966
-
4587
+ desc 'file_exists', '', :hide => true
4588
# Reports whether +file+ exists through the process exit status.
#
# Uses File.exist? rather than File.exists?: the plural alias was deprecated
# and then removed in Ruby 3.2, where calling it raises NoMethodError
# instead of producing the intended exit status.
#
# @param file [String] path to check
# @return [void] never returns normally; exits with status 0 when the path
#   exists and with status 1 otherwise
def file_exists(file)
  exit(0) if File.exist?(file)
  exit(1)
end
4968
4592
 
4969
4593
 
@@ -5143,29 +4767,40 @@ module Cnvrg
5143
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5144
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5145
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5146
4771
 
5147
4772
  def compare_experiments
5148
4773
  verify_logged_in(true)
5149
4774
  log_start(__method__, args, options)
5150
4775
  exps_map = {}
4776
+ copied_commits = []
5151
4777
 
5152
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5153
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5154
4780
  return false
5155
4781
  end
5156
- slugs = options[:slugs].split(",")
5157
- if slugs.blank?
5158
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5159
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5160
4784
  end
4785
+
5161
4786
  frequency = options[:frequency] || 5
5162
4787
  namespace = options[:namespace]
5163
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5164
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5165
4799
 
4800
+ log_message("compare is running")
5166
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5167
4803
  slugs.each do |exp_slug|
5168
-
5169
4804
  begin
5170
4805
  if exps_map[exp_slug].blank?
5171
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5179,15 +4814,23 @@ module Cnvrg
5179
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5180
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5181
4816
  exps_map[exp_slug] = exp
5182
- elsif exp["machine_activity"].present?
4817
+ else
5183
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5184
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5185
4825
  end
5186
4826
  rescue => e
5187
4827
  Cnvrg::Logger.log_error(e)
5188
4828
  end
5189
4829
  end
5190
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5191
4834
  end
5192
4835
  end
5193
4836
 
@@ -5273,127 +4916,6 @@ module Cnvrg
5273
4916
  end
5274
4917
 
5275
4918
 
5276
- desc 'pull_image', 'downloads and loads an image', :hide => true
5277
-
5278
- def pull_image(image_name)
5279
- begin
5280
- verify_logged_in(false)
5281
- log_start(__method__, args, options)
5282
- owner = Cnvrg::CLI.get_owner()
5283
- image = Cnvrg::Images.image_exist(owner, image_name)
5284
- if !image
5285
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5286
- exit(1)
5287
- end
5288
- path = download_image(image_name, image["slug"])
5289
- if path
5290
- log_message("Building image", Thor::Shell::Color::BLUE)
5291
- Docker.options[:read_timeout] = 216000
5292
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5293
- begin
5294
- if (log = JSON.parse(v)) && log.has_key?("stream")
5295
- next if log["stream"].starts_with? "Step"
5296
- $stdout.puts log["stream"]
5297
- end
5298
- rescue
5299
- end
5300
-
5301
- end
5302
-
5303
- if not image.nil?
5304
- FileUtils.rm_rf(path)
5305
- checks = Helpers.checkmark()
5306
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5307
- return image
5308
- else
5309
-
5310
- log_message("Could not build image", Thor::Shell::Color::RED)
5311
- return false
5312
- end
5313
- else
5314
-
5315
- log_message("Could not download image", Thor::Shell::Color::RED)
5316
- return false
5317
-
5318
-
5319
- end
5320
-
5321
- # else
5322
- # path = download_image(image_name,image["slug"])
5323
- # if path
5324
- # image = Docker::Image.import(path)
5325
- # image.tag('repo' => image_name, 'tag' => 'latest')
5326
- # if not image.nil?
5327
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5328
- # FileUtils.rm(path)
5329
- # checks = Helpers.checkmark()
5330
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5331
- # log_end(0)
5332
- # return image
5333
- # log_end(0)
5334
- # else
5335
- # say "Could not download image", Thor::Shell::Color::RED
5336
- # return false
5337
- # end
5338
- #
5339
- # end
5340
- # end
5341
- rescue => e
5342
-
5343
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5344
- log_error(e)
5345
-
5346
- rescue SignalException
5347
- say "\nAborting"
5348
- exit(1)
5349
- ensure
5350
- if path
5351
- FileUtils.rm_rf(path)
5352
-
5353
- end
5354
- end
5355
-
5356
-
5357
- end
5358
-
5359
- desc 'set_image', 'set image to a porject', :hide => true
5360
-
5361
- def set_image(docker_image)
5362
- verify_logged_in(true)
5363
- log_start(__method__, args, options)
5364
- working_dir = is_cnvrg_dir
5365
- project = Project.new(working_dir)
5366
-
5367
- local_images = Docker::Image.all
5368
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5369
- if docker_image_local.size == 0
5370
-
5371
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5372
- image = pull(docker_image)
5373
- if image
5374
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5375
- @image = Images.new(working_dir, docker_image)
5376
- else
5377
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5378
- exit(1)
5379
- end
5380
- else
5381
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5382
- exit(1)
5383
-
5384
- end
5385
- elsif docker_image_local.size == 1
5386
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5387
- @image = Images.new(working_dir, docker_image_local[0])
5388
- elsif docker_image_local.size > 1
5389
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5390
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5391
- image_name = image_name.strip
5392
- @image = Images.new(working_dir, image_name)
5393
- end
5394
- @image.update_image_activity(project.last_local_commit, nil)
5395
- end
5396
-
5397
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5398
4920
  def check_pod_restart
5399
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5668,7 +5190,7 @@ module Cnvrg
5668
5190
 
5669
5191
  if dirs.size == 0
5670
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5671
-
5193
+ puts Thread.current.backtrace
5672
5194
  exit(1)
5673
5195
  end
5674
5196
  return dirs.join("/")
@@ -5771,7 +5293,7 @@ module Cnvrg
5771
5293
  is_cnvrg = is_cnvrg_dir
5772
5294
  if !is_cnvrg
5773
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5774
- exit(0)
5296
+ exit(1)
5775
5297
  end
5776
5298
 
5777
5299
  end
@@ -5917,21 +5439,6 @@ module Cnvrg
5917
5439
 
5918
5440
  end
5919
5441
 
5920
- def container_changes(dir)
5921
- container_id = is_project_with_docker(dir)
5922
- if not container_id
5923
- return false
5924
- end
5925
- container = Docker::Container.get(container_id)
5926
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5927
- pip = container.exec(command, tty: true)[0]
5928
- command = ["/bin/bash", "-lc", "dpkg -l"]
5929
- dpkg = container.exec(command, tty: true)[0]
5930
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5931
- history = container.exec(command, tty: true)[0]
5932
- diff = [pip, dpkg, history]
5933
- return diff
5934
- end
5935
5442
 
5936
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5937
5444
  Timeout::timeout(seconds) do
@@ -6114,13 +5621,17 @@ module Cnvrg
6114
5621
 
6115
5622
  end
6116
5623
 
6117
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6118
5625
  if !Helpers.ubuntu?
6119
5626
  return 0.0
6120
5627
  end
6121
5628
  stats = [[],[]]
6122
5629
  begin
6123
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6124
5635
 
6125
5636
  if !gpu_stats.nil?
6126
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]