cnvrg 1.6.35 → 1.9.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f90f59c9a970e96f5ac718fa295cc2ce4f81ff143d8d9f8848b039a4ecdd004
4
- data.tar.gz: 66beb8643ccbe900863b674b20f6138d69380cd0719deba77b15ce836c14d3dd
3
+ metadata.gz: d2e037c6264223158a1d85eb3c570453d0c67982e1088060a439bfcc7ffac37f
4
+ data.tar.gz: e71b3030503d3f128ba912dcb798faa8626ebad4dd0377b94eb6b3ea74df9c5e
5
5
  SHA512:
6
- metadata.gz: 771c2fc62849c1d33363d49aa555ffd7408ef3f821dc3b3106c5c94004ab1076f4d96574b12af92b80f5a7fe8ca38270875cd06801edd26145d6c7eaef249266
7
- data.tar.gz: 53fd6f5f3653ed67777022e9c50f9b042f2a4b35739e23a28ff4777c5480c62e227b62633cd7b34ff44bf19a8d1ed2268555194a5650f2c103578960f2d0bf5f
6
+ metadata.gz: e8b35e3e285a0ee031f4c61680004b91db9be7387c3544f1386eb92f7961ea0b7ef7191594255ef9cd4f3531119db86da682557a7380de2e36973173fc4749ac
7
+ data.tar.gz: f5be3adefda82fa9d37a59055b341b894551ea87e6b65045ae4e3fcec4db65f7d9a02d632e43116de046bc2fd6d17636351bb7af1ee0f3ee2593dbd4890bbc1c
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, message: nil)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1251
1264
 
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1277
+
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1954,8 +2012,6 @@ module Cnvrg
1954
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1955
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1956
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1957
-
1958
-
1959
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1960
2016
  verify_logged_in(true)
1961
2017
  log_start(__method__, args, options)
@@ -1964,11 +2020,13 @@ module Cnvrg
1964
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1965
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1966
2022
  end
1967
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1968
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1969
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1970
2026
 
1971
2027
  end
2028
+
2029
+
1972
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1973
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1974
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2211,15 +2269,27 @@ module Cnvrg
2211
2269
  method_option :return_id, :type => :boolean, :aliases => ["-r", "--return_id"], :default => false
2212
2270
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2213
2271
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2272
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2214
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2215
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2216
2277
 
2217
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2218
2279
  begin
2219
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2220
2281
  verify_logged_in(true)
2221
2282
  log_start(__method__, args, options)
2222
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2223
2293
  commit_msg = options["message"]
2224
2294
  if commit_msg.nil? or commit_msg.empty?
2225
2295
  commit_msg = ""
@@ -2235,19 +2305,21 @@ module Cnvrg
2235
2305
  spec_files_to_upload = spec_files_to_upload.split(",")
2236
2306
  end
2237
2307
  if @project.is_git
2308
+ list = []
2238
2309
  git_output_dir = options["output_dir"] || output_dir
2239
2310
  if git_output_dir.present?
2240
2311
  if git_output_dir.ends_with? "/"
2241
2312
  git_output_dir = git_output_dir[0..-2]
2242
2313
  end
2243
2314
  list = @project.generate_output_dir(git_output_dir)
2244
- spec_files_to_upload = list
2245
- if spec_files_to_upload.blank?
2246
- log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2247
- return true
2248
- end
2249
- force = true
2250
2315
  end
2316
+ list += @project.generate_git_diff if options["git_diff"]
2317
+ spec_files_to_upload = list
2318
+ if spec_files_to_upload.blank?
2319
+ log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2320
+ return true
2321
+ end
2322
+ force = true
2251
2323
  end
2252
2324
 
2253
2325
  if ignore.nil? or ignore.empty?
@@ -2289,8 +2361,6 @@ module Cnvrg
2289
2361
  end
2290
2362
  update_count = 0
2291
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2292
- successful_updates = []
2293
- successful_deletions = []
2294
2364
  if options["verbose"]
2295
2365
  if update_total == 1
2296
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2310,8 +2380,11 @@ module Cnvrg
2310
2380
  end
2311
2381
  job_type = options['job_type'] || job_type
2312
2382
  job_slug = options['job_slug'] || job_slug
2313
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2314
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2315
2388
  # upload / update
2316
2389
  # delete
2317
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2322,32 +2395,30 @@ module Cnvrg
2322
2395
  :starting_at => 0,
2323
2396
  :total => (to_upload.size + deleted.size),
2324
2397
  :autofinish => true)
2325
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2326
2398
 
2327
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2328
2401
 
2329
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2330
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2331
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2332
2414
  raise StandardError.new("Cant end commit")
2333
2415
  end
2416
+
2334
2417
  # save idx
2335
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2336
2419
  @project.update_idx_with_commit!(commit_sha1)
2337
2420
  if options["verbose"]
2338
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2339
- if successful_updates.size > 0
2340
- successful_updates.flatten!
2341
- log_message("Updated:", Thor::Shell::Color::GREEN)
2342
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2343
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2344
- end
2345
- if successful_deletions.size > 0
2346
- successful_deletions.flatten!
2347
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2348
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2349
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2350
- end
2351
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2352
2423
  else
2353
2424
  if return_id
@@ -2372,9 +2443,13 @@ module Cnvrg
2372
2443
  if e.is_a? SignalException
2373
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2374
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2375
2448
  else
2376
2449
  log_message(error_message, Thor::Shell::Color::RED)
2377
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2378
2453
  end
2379
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2380
2455
  print_res = {
@@ -2892,6 +2967,11 @@ module Cnvrg
2892
2967
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
2893
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2894
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2970
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2971
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2972
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2973
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2974
+
2895
2975
  def sync(direct = true)
2896
2976
  verify_logged_in(true) if direct
2897
2977
  @project = Project.new(get_project_home)
@@ -2903,16 +2983,20 @@ module Cnvrg
2903
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2904
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2905
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2906
2988
  run_download = true
2907
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2908
2990
  run_download = false
2909
2991
  end
2910
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2911
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2912
2995
  end
2913
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2914
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2915
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2916
3000
  end
2917
3001
 
2918
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3057,6 +3141,8 @@ module Cnvrg
3057
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3058
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3059
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3060
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3061
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3062
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3064,6 +3150,7 @@ module Cnvrg
3064
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3065
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3066
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3067
3154
 
3068
3155
  def exec(*cmd)
3069
3156
  log = []
@@ -3130,8 +3217,12 @@ module Cnvrg
3130
3217
  end
3131
3218
  remote = options["remote"]
3132
3219
  if remote
3133
- docker_id = `cat /etc/hostname`
3134
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3135
3226
  end
3136
3227
  is_on_gpu = options["gpu"]
3137
3228
  start_commit = @project.last_local_commit
@@ -3141,9 +3232,9 @@ module Cnvrg
3141
3232
 
3142
3233
  platform = RUBY_PLATFORM
3143
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3144
3236
  begin
3145
- machine_activity = @exp.get_machine_activity(working_dir)
3146
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3147
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3148
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3149
3240
  unless @exp.slug.nil?
@@ -3161,7 +3252,7 @@ module Cnvrg
3161
3252
  begin
3162
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3163
3254
  if is_on_gpu
3164
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3165
3256
  stats['gpu_util'] = gu[0]
3166
3257
  stats['gpu'] = gu[1]
3167
3258
  end
@@ -3173,6 +3264,16 @@ module Cnvrg
3173
3264
  end
3174
3265
  end
3175
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3176
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3177
3278
  begin
3178
3279
  stdout.each do |line|
@@ -3187,7 +3288,7 @@ module Cnvrg
3187
3288
  puts line
3188
3289
  end
3189
3290
  log << cur_log
3190
- if log.size >= 5
3291
+ if log.size >= 1
3191
3292
  @exp.upload_temp_log(log) unless log.empty?
3192
3293
  log = []
3193
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3237,29 +3338,26 @@ module Cnvrg
3237
3338
  exp_success = false
3238
3339
  end
3239
3340
 
3240
- if sync_after
3241
- @exp.job_log(["Syncing Experiment"])
3242
- # Sync after run
3243
- if @project.is_git
3244
- output_dir = output_dir || @exp.output_dir
3245
- if output_dir.present?
3246
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3247
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3248
- end
3249
- else
3250
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3251
-
3252
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3253
3348
  end
3254
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3255
3351
  end
3352
+ end
3353
+
3256
3354
  end_commit = @project.last_local_commit
3257
3355
  if end_commit.present?
3258
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3259
3357
  end
3260
3358
 
3261
3359
  # log_thread.join
3262
- stats_thread.join
3360
+ stats_thread.join
3263
3361
 
3264
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3265
3363
 
@@ -3407,8 +3505,8 @@ module Cnvrg
3407
3505
  local_folders_options = options["local_folders"]
3408
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3409
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3410
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3411
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3412
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3413
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3414
3512
  commit_to_run = options["commit"] || nil
@@ -4233,144 +4331,6 @@ module Cnvrg
4233
4331
 
4234
4332
  end
4235
4333
 
4236
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4237
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4238
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4239
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4240
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4241
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4242
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4243
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4244
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4245
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4246
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4247
-
4248
-
4249
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4250
-
4251
- def build_image(image_name)
4252
- begin
4253
- verify_logged_in(false)
4254
- log_start(__method__, args, options)
4255
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4256
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4257
- instance_type = get_instance_type(instances)
4258
- image_extend = options["image"]
4259
- public = options["public"]
4260
- base = options["base"]
4261
- python3 = options["python3"]
4262
- docker_path = options["docker_path"]
4263
- owner = CLI.get_owner
4264
- checks = Helpers.checkmark()
4265
- tar_path = nil
4266
- if !docker_path.nil? and !docker_path.empty?
4267
- docker_path = File.absolute_path(docker_path)
4268
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4269
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4270
- resp = create_docker_tar(docker_path, tar_path)
4271
- if !resp
4272
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4273
- FileUtils.rm_rf tar_path
4274
- exit(1)
4275
- end
4276
- files = Cnvrg::Files.new(owner, "")
4277
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4278
- if resp
4279
- end
4280
- else
4281
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4282
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4283
-
4284
- end
4285
-
4286
- if Cnvrg::CLI.is_response_success(resp, false)
4287
- image_slug = resp["result"]["slug"]
4288
- container = resp["result"]["machine_c"]
4289
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4290
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4291
- ssh = Ssh.new(resp)
4292
- if !ssh.is_ssh
4293
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4294
- Images.revoke_custom_new_image(owner, image_slug)
4295
- end
4296
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4297
- begin
4298
- logs = []
4299
-
4300
- while true
4301
- command = ask("$>")
4302
- logs << {time: Time.now,
4303
- message: command,
4304
- type: "stdout"
4305
- }
4306
- if command.eql? "quit"
4307
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4308
- break
4309
- end
4310
- res = ssh.exec_command(command)
4311
- begin
4312
- res_parsed = JSON.parse(res)
4313
- res = res_parsed.join(",")
4314
- end
4315
-
4316
- puts res
4317
- logs << {time: Time.now,
4318
- message: res,
4319
- type: "stdout"
4320
- }
4321
- logs.flatten!
4322
-
4323
- end
4324
-
4325
- rescue SignalException
4326
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4327
-
4328
- end
4329
- resp = Images.commit_custom_image(owner, image_slug, logs)
4330
- if Cnvrg::CLI.is_response_success(resp, false)
4331
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4332
- else
4333
- if image_slug
4334
- Images.revoke_custom_new_image(owner, image_slug)
4335
- end
4336
- if ssh
4337
- ssh.close_ssh()
4338
- end
4339
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4340
-
4341
- exit(1)
4342
- end
4343
- if ssh
4344
- ssh.close_ssh()
4345
- end
4346
-
4347
-
4348
- end
4349
- rescue => e
4350
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4351
-
4352
- log_error(e)
4353
- if image_slug
4354
- Images.revoke_custom_new_image(owner, image_slug)
4355
- end
4356
- if ssh
4357
- ssh.close_ssh()
4358
- end
4359
-
4360
-
4361
- rescue SignalException
4362
- if image_slug
4363
- Images.revoke_custom_new_image(owner, image_slug)
4364
- end
4365
- if ssh
4366
- ssh.close_ssh
4367
- end
4368
- say "\nAborting"
4369
- exit(1)
4370
- end
4371
-
4372
- end
4373
-
4374
4334
 
4375
4335
  desc 'build', 'run commands inside containers', :hide => true
4376
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4564,66 +4524,7 @@ module Cnvrg
4564
4524
  end
4565
4525
 
4566
4526
 
4567
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4568
-
4569
- def upload_image_old(image_id, is_public, is_base, *message)
4570
- verify_logged_in(true)
4571
- log_start(__method__, args, options)
4572
- image = Docker::Image.get(image_id)
4573
- project_home = get_project_home
4574
- @project = Project.new(project_home)
4575
- last_local_commit = @project.last_local_commit
4576
- image_name = @project.slug + "#{last_local_commit}"
4577
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4578
- owner = Cnvrg::CLI.get_owner()
4579
- if !message.nil? or !message.empty?
4580
- message = message.join(" ")
4581
- end
4582
-
4583
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4584
- image.save(path)
4585
-
4586
- begin
4587
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4588
- gzipRes = system("gzip -f #{path}")
4589
- if !gzipRes
4590
-
4591
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4592
- exit(1)
4593
- end
4594
- path = path + ".gz"
4595
- @files = Cnvrg::Files.new(owner, "")
4596
-
4597
- exit_status = $?.exitstatus
4598
- if exit_status == 0
4599
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4600
-
4601
- diff = container_changes(Dir.pwd)
4602
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4603
- if res
4604
- File.delete(path)
4605
- image_loc = is_project_with_docker(Dir.pwd)
4606
- image_loc.update_slug(res["result"]["id"])
4607
-
4608
- checks = Helpers.checkmark()
4609
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4610
- else
4611
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4612
-
4613
- end
4614
- else
4615
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4616
- exit(1)
4617
- end
4618
- rescue => e
4619
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4620
- log_error(e)
4621
- rescue SignalException
4622
4527
 
4623
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4624
- exit(1)
4625
- end
4626
- end
4627
4528
 
4628
4529
  desc '', '', :hide => true
4629
4530
 
@@ -4634,278 +4535,30 @@ module Cnvrg
4634
4535
 
4635
4536
  end
4636
4537
 
4637
- desc '', '', :hide => true
4638
-
4639
- def exec_container(container_id, *cmd)
4640
- container = Docker::Container.get(container_id)
4641
- container.start()
4642
- cnvrg_command = cmd.join(" ")
4643
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4644
- res = container.exec(command, tty: true, wait: 5400)[0]
4645
- say res
4646
- end
4647
-
4648
- desc '', '', :hide => true
4649
-
4650
- def port_container(container_id)
4651
- container = Docker::Container.get(container_id)
4652
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4653
- end
4654
-
4655
- desc '', '', :hide => true
4656
-
4657
- def tensor_port_container(container_id)
4658
- container = Docker::Container.get(container_id)
4659
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4660
- end
4661
-
4662
- desc '', '', :hide => true
4663
-
4664
- def stop_container(container_id)
4665
- container = Docker::Container.get(container_id)
4666
- container.stop()
4667
- container.remove()
4668
-
4669
- end
4670
-
4671
- desc '', '', :hide => true
4672
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4673
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4674
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4675
-
4676
-
4677
- def config_remote(image_name, port = 7654, tensport = 6006)
4678
- local_images = Docker::Image.all
4679
-
4680
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4681
- if docker_image_local.empty?
4682
- say "no image"
4683
- exit(1)
4684
- end
4685
-
4686
- begin
4687
- login_content = options["login"]
4688
- app_dir = options["app_dir"]
4689
- cmd = options["cmd"]
4690
- volume_from = options["volume"]
4691
-
4692
- image_settings = {
4693
- 'Image' => "#{image_name}:latest",
4694
-
4695
- 'Cmd' => cmd,
4696
- 'WorkingDir' => app_dir,
4697
- 'ExposedPorts' => {
4698
- '8888/tcp' => {},
4699
- },
4700
- 'HostConfig' => {
4701
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4702
- 'PortBindings' => {
4703
- '8888/tcp' => [
4704
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4705
- ],
4706
- '6006/tcp' => [
4707
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4708
- ],
4709
- },
4710
- },
4711
- }
4712
- container = Docker::Container.create(image_settings)
4713
- container.start()
4714
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4715
- container.exec(command, tty: true)
4716
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4717
- # container.exec(command, tty: true)
4718
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4719
- # container.exec(command, tty: true)
4720
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4721
- container.exec(command, tty: true)
4722
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4723
- container.exec(command, tty: true)
4724
- say "#{container.id}:#{port}##{tensport}"
4725
- rescue => e
4726
- puts e
4727
- if e.message.include? "is not running"
4728
- return config_remote(image_name, port - 1, tensport - 1)
4729
- end
4730
-
4731
- if container
4732
- container.kill()
4733
- end
4734
- return false
4735
- end
4736
- end
4737
-
4738
-
4739
- desc '', '', :hide => true
4740
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4741
-
4742
- def config_netrc(container)
4743
-
4744
- login_content = options["login"]
4745
-
4746
- container = Docker::Container.get(container)
4747
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4748
- container.exec(command, tty: true)
4749
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4750
- container.exec(command, tty: true)
4751
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- say "OK"
4754
-
4755
- end
4756
-
4757
- desc '', '', :hide => true
4758
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4759
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4760
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4761
-
4762
-
4763
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4764
- local_images = Docker::Image.all
4765
-
4766
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4767
- if docker_image_local.empty?
4768
- say "no image"
4769
- exit(1)
4770
- end
4771
-
4772
- begin
4773
- login_content = options["login"]
4774
- app_dir = options["app_dir"]
4775
- cmd = options["cmd"]
4776
-
4777
- # image_settings = {
4778
- # 'Image' => "#{image_name}:latest",
4779
- # 'User' => 'ds',
4780
- # 'Cmd' => cmd,
4781
- # 'WorkingDir' => app_dir,
4782
- # 'ExposedPorts' => {
4783
- # '8888/tcp' => {},
4784
- # },
4785
- # 'HostConfig' => {
4786
- # 'PortBindings' => {
4787
- # '8888/tcp' => [
4788
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4789
- # ],
4790
- # '6006/tcp' => [
4791
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4792
- # ],
4793
- # },
4794
- # },
4795
- # }
4796
-
4797
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4798
- container_id = container_id.gsub("\n", "")
4799
- container = Docker::Container.get(container_id)
4800
- # container.start()
4801
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4802
- container.exec(command, tty: true)
4803
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4804
- container.exec(command, tty: true)
4805
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- say "#{container.id}:#{port}##{tensport}"
4808
- rescue => e
4809
- if e.message.include? "is not running"
4810
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4811
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4812
- end
4813
-
4814
- if container
4815
- container.kill()
4816
- end
4817
- return false
4818
- end
4819
- end
4820
-
4821
- desc '', '', :hide => true
4822
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4823
-
4824
- def config_flask_remote(image_name, port = 80)
4825
- local_images = Docker::Image.all
4826
-
4827
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4828
- if docker_image_local.empty?
4829
- say "no image"
4830
- exit(1)
4831
- end
4832
-
4833
- begin
4834
- login_content = options["login"]
4835
- image_settings = {
4836
- 'Image' => "#{image_name}:latest",
4837
- 'User' => 'ds',
4838
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4839
- 'WorkingDir' => '/home/ds/app',
4840
- 'ExposedPorts' => {
4841
- '80/tcp' => {},
4842
- },
4843
- 'HostConfig' => {
4844
- 'PortBindings' => {
4845
- '80/tcp' => [
4846
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4847
- ],
4848
- },
4849
- },
4850
- }
4851
- container = Docker::Container.create(image_settings)
4852
- container.start()
4853
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4854
- container.exec(command, tty: true)
4855
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4856
- container.exec(command, tty: true)
4857
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- say "#{container.id}:#{port}"
4860
- rescue => e
4861
- pus e
4862
- if e.message.include? "is not running"
4863
- return "port is taken"
4864
- end
4865
- puts "error"
4866
- if container
4867
- container.kill()
4538
+ desc 'Collect and send job utilization', '', :hide => true
4539
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
4540
+ method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
+ def get_utilization()
4542
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4543
+ docker_id = options["docker_id"]
4544
+ while true do
4545
+ sleep 30
4546
+ begin
4547
+ stats = usage_metrics_in_docker(docker_id)
4548
+ if options["is_on_gpu"]
4549
+ gu = gpu_util(take_from_docker: true, docker_id: docker_id)
4550
+ stats['gpu_util'] = gu[0]
4551
+ stats['gpu'] = gu[1]
4552
+ end
4553
+ stats['docker_id'] = docker_id
4554
+ @exp.send_machine_stats [stats] unless stats.empty?
4555
+ rescue => e
4556
+ log_error(e)
4557
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4868
4558
  end
4869
- return false
4870
4559
  end
4871
4560
  end
4872
4561
 
4873
- desc '', '', :hide => true
4874
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4875
-
4876
- def config_flask_remote_gpu(image_name, port = 80)
4877
- local_images = Docker::Image.all
4878
-
4879
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4880
- if docker_image_local.empty?
4881
- say "no image"
4882
- exit(1)
4883
- end
4884
-
4885
- begin
4886
- login_content = options["login"]
4887
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4888
- container_id = container_id.gsub("\n", "")
4889
- container = Docker::Container.get(container_id)
4890
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4891
- container.exec(command, tty: true)
4892
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4893
- container.exec(command, tty: true)
4894
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- say "#{container.id}:#{port}"
4897
- rescue => e
4898
- puts e
4899
- if e.message.include? "is not running"
4900
- return "port is taken"
4901
- end
4902
- puts "error"
4903
- if container
4904
- container.kill()
4905
- end
4906
- return false
4907
- end
4908
- end
4909
4562
 
4910
4563
  desc '', '', :hide => true
4911
4564
 
@@ -4931,39 +4584,10 @@ module Cnvrg
4931
4584
 
4932
4585
  end
4933
4586
 
4934
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4935
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4936
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4937
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4938
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4939
- def upload_image(image_name,image_path)
4940
- begin
4941
- verify_logged_in(false)
4942
- log_start(__method__, args, options)
4943
-
4944
- @image = Cnvrg::Images.new()
4945
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4946
- workdir = options[:workdir]
4947
- description = options[:description]
4948
- user = options[:user]
4949
- is_gpu = options[:gpu]
4950
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4951
- if res["status"] == 200
4952
- image_slug = res["id"]
4953
- owner = CLI.get_owner
4954
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4955
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4956
-
4957
-
4958
- else
4959
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4960
-
4961
- end
4962
- rescue => e
4963
- log_error(e)
4964
- end
4965
-
4966
-
4587
+ desc 'file_exists', description: '', hide: true
4588
+ def file_exists(file)
4589
+ exit(0) if File.exists? file
4590
+ exit(1)
4967
4591
  end
4968
4592
 
4969
4593
 
@@ -5143,29 +4767,40 @@ module Cnvrg
5143
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5144
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5145
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5146
4771
 
5147
4772
  def compare_experiments
5148
4773
  verify_logged_in(true)
5149
4774
  log_start(__method__, args, options)
5150
4775
  exps_map = {}
4776
+ copied_commits = []
5151
4777
 
5152
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5153
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5154
4780
  return false
5155
4781
  end
5156
- slugs = options[:slugs].split(",")
5157
- if slugs.blank?
5158
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5159
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5160
4784
  end
4785
+
5161
4786
  frequency = options[:frequency] || 5
5162
4787
  namespace = options[:namespace]
5163
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5164
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5165
4799
 
4800
+ log_message("compare is running")
5166
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5167
4803
  slugs.each do |exp_slug|
5168
-
5169
4804
  begin
5170
4805
  if exps_map[exp_slug].blank?
5171
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5179,15 +4814,23 @@ module Cnvrg
5179
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5180
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5181
4816
  exps_map[exp_slug] = exp
5182
- elsif exp["machine_activity"].present?
4817
+ else
5183
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5184
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5185
4825
  end
5186
4826
  rescue => e
5187
4827
  Cnvrg::Logger.log_error(e)
5188
4828
  end
5189
4829
  end
5190
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5191
4834
  end
5192
4835
  end
5193
4836
 
@@ -5273,127 +4916,6 @@ module Cnvrg
5273
4916
  end
5274
4917
 
5275
4918
 
5276
- desc 'pull_image', 'downloads and loads an image', :hide => true
5277
-
5278
- def pull_image(image_name)
5279
- begin
5280
- verify_logged_in(false)
5281
- log_start(__method__, args, options)
5282
- owner = Cnvrg::CLI.get_owner()
5283
- image = Cnvrg::Images.image_exist(owner, image_name)
5284
- if !image
5285
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5286
- exit(1)
5287
- end
5288
- path = download_image(image_name, image["slug"])
5289
- if path
5290
- log_message("Building image", Thor::Shell::Color::BLUE)
5291
- Docker.options[:read_timeout] = 216000
5292
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5293
- begin
5294
- if (log = JSON.parse(v)) && log.has_key?("stream")
5295
- next if log["stream"].starts_with? "Step"
5296
- $stdout.puts log["stream"]
5297
- end
5298
- rescue
5299
- end
5300
-
5301
- end
5302
-
5303
- if not image.nil?
5304
- FileUtils.rm_rf(path)
5305
- checks = Helpers.checkmark()
5306
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5307
- return image
5308
- else
5309
-
5310
- log_message("Could not build image", Thor::Shell::Color::RED)
5311
- return false
5312
- end
5313
- else
5314
-
5315
- log_message("Could not download image", Thor::Shell::Color::RED)
5316
- return false
5317
-
5318
-
5319
- end
5320
-
5321
- # else
5322
- # path = download_image(image_name,image["slug"])
5323
- # if path
5324
- # image = Docker::Image.import(path)
5325
- # image.tag('repo' => image_name, 'tag' => 'latest')
5326
- # if not image.nil?
5327
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5328
- # FileUtils.rm(path)
5329
- # checks = Helpers.checkmark()
5330
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5331
- # log_end(0)
5332
- # return image
5333
- # log_end(0)
5334
- # else
5335
- # say "Could not download image", Thor::Shell::Color::RED
5336
- # return false
5337
- # end
5338
- #
5339
- # end
5340
- # end
5341
- rescue => e
5342
-
5343
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5344
- log_error(e)
5345
-
5346
- rescue SignalException
5347
- say "\nAborting"
5348
- exit(1)
5349
- ensure
5350
- if path
5351
- FileUtils.rm_rf(path)
5352
-
5353
- end
5354
- end
5355
-
5356
-
5357
- end
5358
-
5359
- desc 'set_image', 'set image to a porject', :hide => true
5360
-
5361
- def set_image(docker_image)
5362
- verify_logged_in(true)
5363
- log_start(__method__, args, options)
5364
- working_dir = is_cnvrg_dir
5365
- project = Project.new(working_dir)
5366
-
5367
- local_images = Docker::Image.all
5368
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5369
- if docker_image_local.size == 0
5370
-
5371
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5372
- image = pull(docker_image)
5373
- if image
5374
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5375
- @image = Images.new(working_dir, docker_image)
5376
- else
5377
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5378
- exit(1)
5379
- end
5380
- else
5381
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5382
- exit(1)
5383
-
5384
- end
5385
- elsif docker_image_local.size == 1
5386
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5387
- @image = Images.new(working_dir, docker_image_local[0])
5388
- elsif docker_image_local.size > 1
5389
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5390
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5391
- image_name = image_name.strip
5392
- @image = Images.new(working_dir, image_name)
5393
- end
5394
- @image.update_image_activity(project.last_local_commit, nil)
5395
- end
5396
-
5397
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5398
4920
  def check_pod_restart
5399
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5668,7 +5190,7 @@ module Cnvrg
5668
5190
 
5669
5191
  if dirs.size == 0
5670
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5671
-
5193
+ puts Thread.current.backtrace
5672
5194
  exit(1)
5673
5195
  end
5674
5196
  return dirs.join("/")
@@ -5771,7 +5293,7 @@ module Cnvrg
5771
5293
  is_cnvrg = is_cnvrg_dir
5772
5294
  if !is_cnvrg
5773
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5774
- exit(0)
5296
+ exit(1)
5775
5297
  end
5776
5298
 
5777
5299
  end
@@ -5917,21 +5439,6 @@ module Cnvrg
5917
5439
 
5918
5440
  end
5919
5441
 
5920
- def container_changes(dir)
5921
- container_id = is_project_with_docker(dir)
5922
- if not container_id
5923
- return false
5924
- end
5925
- container = Docker::Container.get(container_id)
5926
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5927
- pip = container.exec(command, tty: true)[0]
5928
- command = ["/bin/bash", "-lc", "dpkg -l"]
5929
- dpkg = container.exec(command, tty: true)[0]
5930
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5931
- history = container.exec(command, tty: true)[0]
5932
- diff = [pip, dpkg, history]
5933
- return diff
5934
- end
5935
5442
 
5936
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5937
5444
  Timeout::timeout(seconds) do
@@ -6114,13 +5621,17 @@ module Cnvrg
6114
5621
 
6115
5622
  end
6116
5623
 
6117
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6118
5625
  if !Helpers.ubuntu?
6119
5626
  return 0.0
6120
5627
  end
6121
5628
  stats = [[],[]]
6122
5629
  begin
6123
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6124
5635
 
6125
5636
  if !gpu_stats.nil?
6126
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]