cnvrg 1.6.33 → 1.9.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 465c50d73c84ef9a768c647926ab576225b5c4344db62b815f64685340b29e38
4
- data.tar.gz: 6bf6610502b0b43c94d9cf5036b509e39da10210dc1cf6eacadb4dce1c837f60
3
+ metadata.gz: 84b1379d995cf6475f7a0a946560bb3c1d438fd514c83df2123251dc4f7de560
4
+ data.tar.gz: 862caef093f400c6645b2cb311379fd976539fde8f7544a1146d603e0cdb3a13
5
5
  SHA512:
6
- metadata.gz: 06fc439571eb477075c79d816b16409eb33b3171d6ebb710cce394e44ab5c37ca0da02d370a29c2957169c6a779f37b29415d8ee7af576cd7a6cf975de11dfe0
7
- data.tar.gz: 5c7f72eff0112c158b3048fde389a75108151f9b527e017782c73229434180f280308d2e85d52eb765f4cba356232bcd96844aaafed0335d11f455aa73ba486b
6
+ metadata.gz: e4548991e1debc2c5c389af062cc0f73da8d172cf2ee1614315356e47203f490f2b147010e1a41913c75b071ccbd4174dbff7a44fea0816135b7f1f1b8965c21
7
+ data.tar.gz: f29420115fd784a0e57d44af5ec265d5c39f2636f998c630d0c21138c46b3fd517997c05d3eb48d8917c3c6ef0e5689081df052b01080185387240222003fd13
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk-s3', '~> 1'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1204,7 +1207,7 @@ module Cnvrg
1204
1207
  log_message("Uploading #{@files.size} files", Thor::Shell::Color::GREEN)
1205
1208
  number_of_chunks = (@files.size.to_f / chunk_size).ceil
1206
1209
  if commit.blank?
1207
- response = @datafiles.start_commit(false, true, chunks: number_of_chunks)
1210
+ response = @datafiles.start_commit(false, true, chunks: number_of_chunks, message: message )
1208
1211
  unless response #means we failed in the start commit.
1209
1212
  raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1210
1213
  end
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1264
+
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1251
1277
 
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1837,8 +1895,6 @@ module Cnvrg
1837
1895
  end
1838
1896
  clone_resp = Project.clone_dir(slug, owner, project_name,git)
1839
1897
  project_home = Dir.pwd + "/" + project_name
1840
-
1841
-
1842
1898
  end
1843
1899
 
1844
1900
  if clone_resp
@@ -1956,8 +2012,6 @@ module Cnvrg
1956
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1957
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1958
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1959
-
1960
-
1961
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1962
2016
  verify_logged_in(true)
1963
2017
  log_start(__method__, args, options)
@@ -1966,11 +2020,13 @@ module Cnvrg
1966
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1967
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1968
2022
  end
1969
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1970
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1971
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1972
2026
 
1973
2027
  end
2028
+
2029
+
1974
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1975
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1976
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2213,15 +2269,27 @@ module Cnvrg
2213
2269
  method_option :return_id, :type => :boolean, :aliases => ["-r", "--return_id"], :default => false
2214
2270
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2215
2271
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2272
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2216
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2217
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2218
2277
 
2219
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2220
2279
  begin
2221
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2222
2281
  verify_logged_in(true)
2223
2282
  log_start(__method__, args, options)
2224
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2225
2293
  commit_msg = options["message"]
2226
2294
  if commit_msg.nil? or commit_msg.empty?
2227
2295
  commit_msg = ""
@@ -2237,19 +2305,21 @@ module Cnvrg
2237
2305
  spec_files_to_upload = spec_files_to_upload.split(",")
2238
2306
  end
2239
2307
  if @project.is_git
2308
+ list = []
2240
2309
  git_output_dir = options["output_dir"] || output_dir
2241
2310
  if git_output_dir.present?
2242
2311
  if git_output_dir.ends_with? "/"
2243
2312
  git_output_dir = git_output_dir[0..-2]
2244
2313
  end
2245
2314
  list = @project.generate_output_dir(git_output_dir)
2246
- spec_files_to_upload = list
2247
- if spec_files_to_upload.blank?
2248
- log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2249
- return true
2250
- end
2251
- force = true
2252
2315
  end
2316
+ list += @project.generate_git_diff if options["git_diff"]
2317
+ spec_files_to_upload = list
2318
+ if spec_files_to_upload.blank?
2319
+ log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2320
+ return true
2321
+ end
2322
+ force = true
2253
2323
  end
2254
2324
 
2255
2325
  if ignore.nil? or ignore.empty?
@@ -2291,8 +2361,6 @@ module Cnvrg
2291
2361
  end
2292
2362
  update_count = 0
2293
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2294
- successful_updates = []
2295
- successful_deletions = []
2296
2364
  if options["verbose"]
2297
2365
  if update_total == 1
2298
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2312,8 +2380,11 @@ module Cnvrg
2312
2380
  end
2313
2381
  job_type = options['job_type'] || job_type
2314
2382
  job_slug = options['job_slug'] || job_slug
2315
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2316
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2317
2388
  # upload / update
2318
2389
  # delete
2319
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2324,32 +2395,30 @@ module Cnvrg
2324
2395
  :starting_at => 0,
2325
2396
  :total => (to_upload.size + deleted.size),
2326
2397
  :autofinish => true)
2327
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2328
2398
 
2329
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2330
2401
 
2331
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2332
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2333
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2334
2414
  raise StandardError.new("Cant end commit")
2335
2415
  end
2416
+
2336
2417
  # save idx
2337
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2338
2419
  @project.update_idx_with_commit!(commit_sha1)
2339
2420
  if options["verbose"]
2340
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2341
- if successful_updates.size > 0
2342
- successful_updates.flatten!
2343
- log_message("Updated:", Thor::Shell::Color::GREEN)
2344
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2345
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2346
- end
2347
- if successful_deletions.size > 0
2348
- successful_deletions.flatten!
2349
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2350
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2351
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2352
- end
2353
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2354
2423
  else
2355
2424
  if return_id
@@ -2374,9 +2443,13 @@ module Cnvrg
2374
2443
  if e.is_a? SignalException
2375
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2376
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2377
2448
  else
2378
2449
  log_message(error_message, Thor::Shell::Color::RED)
2379
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2380
2453
  end
2381
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2382
2455
  print_res = {
@@ -2894,6 +2967,11 @@ module Cnvrg
2894
2967
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
2895
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2896
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2970
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2971
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2972
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2973
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2974
+
2897
2975
  def sync(direct = true)
2898
2976
  verify_logged_in(true) if direct
2899
2977
  @project = Project.new(get_project_home)
@@ -2905,16 +2983,20 @@ module Cnvrg
2905
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2906
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2907
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2908
2988
  run_download = true
2909
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2910
2990
  run_download = false
2911
2991
  end
2912
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2913
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2914
2995
  end
2915
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2916
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2917
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2918
3000
  end
2919
3001
 
2920
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3059,6 +3141,8 @@ module Cnvrg
3059
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3060
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3061
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3062
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3063
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3064
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3066,6 +3150,7 @@ module Cnvrg
3066
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3067
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3068
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3069
3154
 
3070
3155
  def exec(*cmd)
3071
3156
  log = []
@@ -3132,8 +3217,12 @@ module Cnvrg
3132
3217
  end
3133
3218
  remote = options["remote"]
3134
3219
  if remote
3135
- docker_id = `cat /etc/hostname`
3136
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3137
3226
  end
3138
3227
  is_on_gpu = options["gpu"]
3139
3228
  start_commit = @project.last_local_commit
@@ -3143,9 +3232,9 @@ module Cnvrg
3143
3232
 
3144
3233
  platform = RUBY_PLATFORM
3145
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3146
3236
  begin
3147
- machine_activity = @exp.get_machine_activity(working_dir)
3148
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3149
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3150
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3151
3240
  unless @exp.slug.nil?
@@ -3163,7 +3252,7 @@ module Cnvrg
3163
3252
  begin
3164
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3165
3254
  if is_on_gpu
3166
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3167
3256
  stats['gpu_util'] = gu[0]
3168
3257
  stats['gpu'] = gu[1]
3169
3258
  end
@@ -3175,6 +3264,16 @@ module Cnvrg
3175
3264
  end
3176
3265
  end
3177
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3178
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3179
3278
  begin
3180
3279
  stdout.each do |line|
@@ -3189,7 +3288,7 @@ module Cnvrg
3189
3288
  puts line
3190
3289
  end
3191
3290
  log << cur_log
3192
- if log.size >= 5
3291
+ if log.size >= 1
3193
3292
  @exp.upload_temp_log(log) unless log.empty?
3194
3293
  log = []
3195
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3239,29 +3338,26 @@ module Cnvrg
3239
3338
  exp_success = false
3240
3339
  end
3241
3340
 
3242
- if sync_after
3243
- @exp.job_log(["Syncing Experiment"])
3244
- # Sync after run
3245
- if @project.is_git
3246
- output_dir = output_dir || @exp.output_dir
3247
- if output_dir.present?
3248
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3249
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3250
- end
3251
- else
3252
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3253
-
3254
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3255
3348
  end
3256
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3257
3351
  end
3352
+ end
3353
+
3258
3354
  end_commit = @project.last_local_commit
3259
3355
  if end_commit.present?
3260
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3261
3357
  end
3262
3358
 
3263
3359
  # log_thread.join
3264
- stats_thread.join
3360
+ stats_thread.join
3265
3361
 
3266
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3267
3363
 
@@ -3409,8 +3505,8 @@ module Cnvrg
3409
3505
  local_folders_options = options["local_folders"]
3410
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3411
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3412
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3413
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3414
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3415
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3416
3512
  commit_to_run = options["commit"] || nil
@@ -4235,144 +4331,6 @@ module Cnvrg
4235
4331
 
4236
4332
  end
4237
4333
 
4238
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4239
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4240
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4241
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4242
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4243
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4244
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4245
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4246
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4247
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4248
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4249
-
4250
-
4251
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4252
-
4253
- def build_image(image_name)
4254
- begin
4255
- verify_logged_in(false)
4256
- log_start(__method__, args, options)
4257
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4258
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4259
- instance_type = get_instance_type(instances)
4260
- image_extend = options["image"]
4261
- public = options["public"]
4262
- base = options["base"]
4263
- python3 = options["python3"]
4264
- docker_path = options["docker_path"]
4265
- owner = CLI.get_owner
4266
- checks = Helpers.checkmark()
4267
- tar_path = nil
4268
- if !docker_path.nil? and !docker_path.empty?
4269
- docker_path = File.absolute_path(docker_path)
4270
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4271
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4272
- resp = create_docker_tar(docker_path, tar_path)
4273
- if !resp
4274
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4275
- FileUtils.rm_rf tar_path
4276
- exit(1)
4277
- end
4278
- files = Cnvrg::Files.new(owner, "")
4279
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4280
- if resp
4281
- end
4282
- else
4283
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4284
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4285
-
4286
- end
4287
-
4288
- if Cnvrg::CLI.is_response_success(resp, false)
4289
- image_slug = resp["result"]["slug"]
4290
- container = resp["result"]["machine_c"]
4291
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4292
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4293
- ssh = Ssh.new(resp)
4294
- if !ssh.is_ssh
4295
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4296
- Images.revoke_custom_new_image(owner, image_slug)
4297
- end
4298
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4299
- begin
4300
- logs = []
4301
-
4302
- while true
4303
- command = ask("$>")
4304
- logs << {time: Time.now,
4305
- message: command,
4306
- type: "stdout"
4307
- }
4308
- if command.eql? "quit"
4309
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4310
- break
4311
- end
4312
- res = ssh.exec_command(command)
4313
- begin
4314
- res_parsed = JSON.parse(res)
4315
- res = res_parsed.join(",")
4316
- end
4317
-
4318
- puts res
4319
- logs << {time: Time.now,
4320
- message: res,
4321
- type: "stdout"
4322
- }
4323
- logs.flatten!
4324
-
4325
- end
4326
-
4327
- rescue SignalException
4328
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4329
-
4330
- end
4331
- resp = Images.commit_custom_image(owner, image_slug, logs)
4332
- if Cnvrg::CLI.is_response_success(resp, false)
4333
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4334
- else
4335
- if image_slug
4336
- Images.revoke_custom_new_image(owner, image_slug)
4337
- end
4338
- if ssh
4339
- ssh.close_ssh()
4340
- end
4341
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4342
-
4343
- exit(1)
4344
- end
4345
- if ssh
4346
- ssh.close_ssh()
4347
- end
4348
-
4349
-
4350
- end
4351
- rescue => e
4352
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4353
-
4354
- log_error(e)
4355
- if image_slug
4356
- Images.revoke_custom_new_image(owner, image_slug)
4357
- end
4358
- if ssh
4359
- ssh.close_ssh()
4360
- end
4361
-
4362
-
4363
- rescue SignalException
4364
- if image_slug
4365
- Images.revoke_custom_new_image(owner, image_slug)
4366
- end
4367
- if ssh
4368
- ssh.close_ssh
4369
- end
4370
- say "\nAborting"
4371
- exit(1)
4372
- end
4373
-
4374
- end
4375
-
4376
4334
 
4377
4335
  desc 'build', 'run commands inside containers', :hide => true
4378
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4566,66 +4524,7 @@ module Cnvrg
4566
4524
  end
4567
4525
 
4568
4526
 
4569
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4570
-
4571
- def upload_image_old(image_id, is_public, is_base, *message)
4572
- verify_logged_in(true)
4573
- log_start(__method__, args, options)
4574
- image = Docker::Image.get(image_id)
4575
- project_home = get_project_home
4576
- @project = Project.new(project_home)
4577
- last_local_commit = @project.last_local_commit
4578
- image_name = @project.slug + "#{last_local_commit}"
4579
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4580
- owner = Cnvrg::CLI.get_owner()
4581
- if !message.nil? or !message.empty?
4582
- message = message.join(" ")
4583
- end
4584
-
4585
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4586
- image.save(path)
4587
-
4588
- begin
4589
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4590
- gzipRes = system("gzip -f #{path}")
4591
- if !gzipRes
4592
-
4593
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4594
- exit(1)
4595
- end
4596
- path = path + ".gz"
4597
- @files = Cnvrg::Files.new(owner, "")
4598
-
4599
- exit_status = $?.exitstatus
4600
- if exit_status == 0
4601
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4602
-
4603
- diff = container_changes(Dir.pwd)
4604
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4605
- if res
4606
- File.delete(path)
4607
- image_loc = is_project_with_docker(Dir.pwd)
4608
- image_loc.update_slug(res["result"]["id"])
4609
-
4610
- checks = Helpers.checkmark()
4611
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4612
- else
4613
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4614
-
4615
- end
4616
- else
4617
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4618
- exit(1)
4619
- end
4620
- rescue => e
4621
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4622
- log_error(e)
4623
- rescue SignalException
4624
4527
 
4625
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4626
- exit(1)
4627
- end
4628
- end
4629
4528
 
4630
4529
  desc '', '', :hide => true
4631
4530
 
@@ -4636,278 +4535,30 @@ module Cnvrg
4636
4535
 
4637
4536
  end
4638
4537
 
4639
- desc '', '', :hide => true
4640
-
4641
- def exec_container(container_id, *cmd)
4642
- container = Docker::Container.get(container_id)
4643
- container.start()
4644
- cnvrg_command = cmd.join(" ")
4645
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4646
- res = container.exec(command, tty: true, wait: 5400)[0]
4647
- say res
4648
- end
4649
-
4650
- desc '', '', :hide => true
4651
-
4652
- def port_container(container_id)
4653
- container = Docker::Container.get(container_id)
4654
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4655
- end
4656
-
4657
- desc '', '', :hide => true
4658
-
4659
- def tensor_port_container(container_id)
4660
- container = Docker::Container.get(container_id)
4661
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4662
- end
4663
-
4664
- desc '', '', :hide => true
4665
-
4666
- def stop_container(container_id)
4667
- container = Docker::Container.get(container_id)
4668
- container.stop()
4669
- container.remove()
4670
-
4671
- end
4672
-
4673
- desc '', '', :hide => true
4674
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4675
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4676
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4677
-
4678
-
4679
- def config_remote(image_name, port = 7654, tensport = 6006)
4680
- local_images = Docker::Image.all
4681
-
4682
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4683
- if docker_image_local.empty?
4684
- say "no image"
4685
- exit(1)
4686
- end
4687
-
4688
- begin
4689
- login_content = options["login"]
4690
- app_dir = options["app_dir"]
4691
- cmd = options["cmd"]
4692
- volume_from = options["volume"]
4693
-
4694
- image_settings = {
4695
- 'Image' => "#{image_name}:latest",
4696
-
4697
- 'Cmd' => cmd,
4698
- 'WorkingDir' => app_dir,
4699
- 'ExposedPorts' => {
4700
- '8888/tcp' => {},
4701
- },
4702
- 'HostConfig' => {
4703
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4704
- 'PortBindings' => {
4705
- '8888/tcp' => [
4706
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4707
- ],
4708
- '6006/tcp' => [
4709
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4710
- ],
4711
- },
4712
- },
4713
- }
4714
- container = Docker::Container.create(image_settings)
4715
- container.start()
4716
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4717
- container.exec(command, tty: true)
4718
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4719
- # container.exec(command, tty: true)
4720
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4721
- # container.exec(command, tty: true)
4722
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4723
- container.exec(command, tty: true)
4724
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4725
- container.exec(command, tty: true)
4726
- say "#{container.id}:#{port}##{tensport}"
4727
- rescue => e
4728
- puts e
4729
- if e.message.include? "is not running"
4730
- return config_remote(image_name, port - 1, tensport - 1)
4731
- end
4732
-
4733
- if container
4734
- container.kill()
4735
- end
4736
- return false
4737
- end
4738
- end
4739
-
4740
-
4741
- desc '', '', :hide => true
4742
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4743
-
4744
- def config_netrc(container)
4745
-
4746
- login_content = options["login"]
4747
-
4748
- container = Docker::Container.get(container)
4749
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4750
- container.exec(command, tty: true)
4751
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4754
- container.exec(command, tty: true)
4755
- say "OK"
4756
-
4757
- end
4758
-
4759
- desc '', '', :hide => true
4760
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4761
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4762
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4763
-
4764
-
4765
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4766
- local_images = Docker::Image.all
4767
-
4768
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4769
- if docker_image_local.empty?
4770
- say "no image"
4771
- exit(1)
4772
- end
4773
-
4774
- begin
4775
- login_content = options["login"]
4776
- app_dir = options["app_dir"]
4777
- cmd = options["cmd"]
4778
-
4779
- # image_settings = {
4780
- # 'Image' => "#{image_name}:latest",
4781
- # 'User' => 'ds',
4782
- # 'Cmd' => cmd,
4783
- # 'WorkingDir' => app_dir,
4784
- # 'ExposedPorts' => {
4785
- # '8888/tcp' => {},
4786
- # },
4787
- # 'HostConfig' => {
4788
- # 'PortBindings' => {
4789
- # '8888/tcp' => [
4790
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4791
- # ],
4792
- # '6006/tcp' => [
4793
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4794
- # ],
4795
- # },
4796
- # },
4797
- # }
4798
-
4799
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4800
- container_id = container_id.gsub("\n", "")
4801
- container = Docker::Container.get(container_id)
4802
- # container.start()
4803
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4804
- container.exec(command, tty: true)
4805
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4808
- container.exec(command, tty: true)
4809
- say "#{container.id}:#{port}##{tensport}"
4810
- rescue => e
4811
- if e.message.include? "is not running"
4812
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4813
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4814
- end
4815
-
4816
- if container
4817
- container.kill()
4538
+ desc 'Collect and send job utilization', '', :hide => true
4539
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
4540
+ method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
+ def get_utilization()
4542
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4543
+ docker_id = options["docker_id"]
4544
+ while true do
4545
+ sleep 30
4546
+ begin
4547
+ stats = usage_metrics_in_docker(docker_id)
4548
+ if options["is_on_gpu"]
4549
+ gu = gpu_util(take_from_docker: true, docker_id: docker_id)
4550
+ stats['gpu_util'] = gu[0]
4551
+ stats['gpu'] = gu[1]
4552
+ end
4553
+ stats['docker_id'] = docker_id
4554
+ @exp.send_machine_stats [stats] unless stats.empty?
4555
+ rescue => e
4556
+ log_error(e)
4557
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4818
4558
  end
4819
- return false
4820
4559
  end
4821
4560
  end
4822
4561
 
4823
- desc '', '', :hide => true
4824
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4825
-
4826
- def config_flask_remote(image_name, port = 80)
4827
- local_images = Docker::Image.all
4828
-
4829
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4830
- if docker_image_local.empty?
4831
- say "no image"
4832
- exit(1)
4833
- end
4834
-
4835
- begin
4836
- login_content = options["login"]
4837
- image_settings = {
4838
- 'Image' => "#{image_name}:latest",
4839
- 'User' => 'ds',
4840
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4841
- 'WorkingDir' => '/home/ds/app',
4842
- 'ExposedPorts' => {
4843
- '80/tcp' => {},
4844
- },
4845
- 'HostConfig' => {
4846
- 'PortBindings' => {
4847
- '80/tcp' => [
4848
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4849
- ],
4850
- },
4851
- },
4852
- }
4853
- container = Docker::Container.create(image_settings)
4854
- container.start()
4855
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4856
- container.exec(command, tty: true)
4857
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4860
- container.exec(command, tty: true)
4861
- say "#{container.id}:#{port}"
4862
- rescue => e
4863
- pus e
4864
- if e.message.include? "is not running"
4865
- return "port is taken"
4866
- end
4867
- puts "error"
4868
- if container
4869
- container.kill()
4870
- end
4871
- return false
4872
- end
4873
- end
4874
-
4875
- desc '', '', :hide => true
4876
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4877
-
4878
- def config_flask_remote_gpu(image_name, port = 80)
4879
- local_images = Docker::Image.all
4880
-
4881
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4882
- if docker_image_local.empty?
4883
- say "no image"
4884
- exit(1)
4885
- end
4886
-
4887
- begin
4888
- login_content = options["login"]
4889
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4890
- container_id = container_id.gsub("\n", "")
4891
- container = Docker::Container.get(container_id)
4892
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4893
- container.exec(command, tty: true)
4894
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4897
- container.exec(command, tty: true)
4898
- say "#{container.id}:#{port}"
4899
- rescue => e
4900
- puts e
4901
- if e.message.include? "is not running"
4902
- return "port is taken"
4903
- end
4904
- puts "error"
4905
- if container
4906
- container.kill()
4907
- end
4908
- return false
4909
- end
4910
- end
4911
4562
 
4912
4563
  desc '', '', :hide => true
4913
4564
 
@@ -4933,39 +4584,10 @@ module Cnvrg
4933
4584
 
4934
4585
  end
4935
4586
 
4936
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4937
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4938
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4939
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4940
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4941
- def upload_image(image_name,image_path)
4942
- begin
4943
- verify_logged_in(false)
4944
- log_start(__method__, args, options)
4945
-
4946
- @image = Cnvrg::Images.new()
4947
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4948
- workdir = options[:workdir]
4949
- description = options[:description]
4950
- user = options[:user]
4951
- is_gpu = options[:gpu]
4952
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4953
- if res["status"] == 200
4954
- image_slug = res["id"]
4955
- owner = CLI.get_owner
4956
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4957
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4958
-
4959
-
4960
- else
4961
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4962
-
4963
- end
4964
- rescue => e
4965
- log_error(e)
4966
- end
4967
-
4968
-
4587
+ desc 'file_exists', description: '', hide: true
4588
+ def file_exists(file)
4589
+ exit(0) if File.exists? file
4590
+ exit(1)
4969
4591
  end
4970
4592
 
4971
4593
 
@@ -5145,29 +4767,40 @@ module Cnvrg
5145
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5146
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5147
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5148
4771
 
5149
4772
  def compare_experiments
5150
4773
  verify_logged_in(true)
5151
4774
  log_start(__method__, args, options)
5152
4775
  exps_map = {}
4776
+ copied_commits = []
5153
4777
 
5154
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5155
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5156
4780
  return false
5157
4781
  end
5158
- slugs = options[:slugs].split(",")
5159
- if slugs.blank?
5160
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5161
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5162
4784
  end
4785
+
5163
4786
  frequency = options[:frequency] || 5
5164
4787
  namespace = options[:namespace]
5165
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5166
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5167
4799
 
4800
+ log_message("compare is running")
5168
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5169
4803
  slugs.each do |exp_slug|
5170
-
5171
4804
  begin
5172
4805
  if exps_map[exp_slug].blank?
5173
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5181,15 +4814,23 @@ module Cnvrg
5181
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5182
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5183
4816
  exps_map[exp_slug] = exp
5184
- elsif exp["machine_activity"].present?
4817
+ else
5185
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5186
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5187
4825
  end
5188
4826
  rescue => e
5189
4827
  Cnvrg::Logger.log_error(e)
5190
4828
  end
5191
4829
  end
5192
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5193
4834
  end
5194
4835
  end
5195
4836
 
@@ -5275,127 +4916,6 @@ module Cnvrg
5275
4916
  end
5276
4917
 
5277
4918
 
5278
- desc 'pull_image', 'downloads and loads an image', :hide => true
5279
-
5280
- def pull_image(image_name)
5281
- begin
5282
- verify_logged_in(false)
5283
- log_start(__method__, args, options)
5284
- owner = Cnvrg::CLI.get_owner()
5285
- image = Cnvrg::Images.image_exist(owner, image_name)
5286
- if !image
5287
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5288
- exit(1)
5289
- end
5290
- path = download_image(image_name, image["slug"])
5291
- if path
5292
- log_message("Building image", Thor::Shell::Color::BLUE)
5293
- Docker.options[:read_timeout] = 216000
5294
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5295
- begin
5296
- if (log = JSON.parse(v)) && log.has_key?("stream")
5297
- next if log["stream"].starts_with? "Step"
5298
- $stdout.puts log["stream"]
5299
- end
5300
- rescue
5301
- end
5302
-
5303
- end
5304
-
5305
- if not image.nil?
5306
- FileUtils.rm_rf(path)
5307
- checks = Helpers.checkmark()
5308
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5309
- return image
5310
- else
5311
-
5312
- log_message("Could not build image", Thor::Shell::Color::RED)
5313
- return false
5314
- end
5315
- else
5316
-
5317
- log_message("Could not download image", Thor::Shell::Color::RED)
5318
- return false
5319
-
5320
-
5321
- end
5322
-
5323
- # else
5324
- # path = download_image(image_name,image["slug"])
5325
- # if path
5326
- # image = Docker::Image.import(path)
5327
- # image.tag('repo' => image_name, 'tag' => 'latest')
5328
- # if not image.nil?
5329
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5330
- # FileUtils.rm(path)
5331
- # checks = Helpers.checkmark()
5332
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5333
- # log_end(0)
5334
- # return image
5335
- # log_end(0)
5336
- # else
5337
- # say "Could not download image", Thor::Shell::Color::RED
5338
- # return false
5339
- # end
5340
- #
5341
- # end
5342
- # end
5343
- rescue => e
5344
-
5345
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5346
- log_error(e)
5347
-
5348
- rescue SignalException
5349
- say "\nAborting"
5350
- exit(1)
5351
- ensure
5352
- if path
5353
- FileUtils.rm_rf(path)
5354
-
5355
- end
5356
- end
5357
-
5358
-
5359
- end
5360
-
5361
- desc 'set_image', 'set image to a porject', :hide => true
5362
-
5363
- def set_image(docker_image)
5364
- verify_logged_in(true)
5365
- log_start(__method__, args, options)
5366
- working_dir = is_cnvrg_dir
5367
- project = Project.new(working_dir)
5368
-
5369
- local_images = Docker::Image.all
5370
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5371
- if docker_image_local.size == 0
5372
-
5373
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5374
- image = pull(docker_image)
5375
- if image
5376
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5377
- @image = Images.new(working_dir, docker_image)
5378
- else
5379
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5380
- exit(1)
5381
- end
5382
- else
5383
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5384
- exit(1)
5385
-
5386
- end
5387
- elsif docker_image_local.size == 1
5388
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5389
- @image = Images.new(working_dir, docker_image_local[0])
5390
- elsif docker_image_local.size > 1
5391
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5392
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5393
- image_name = image_name.strip
5394
- @image = Images.new(working_dir, image_name)
5395
- end
5396
- @image.update_image_activity(project.last_local_commit, nil)
5397
- end
5398
-
5399
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5400
4920
  def check_pod_restart
5401
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5670,7 +5190,7 @@ module Cnvrg
5670
5190
 
5671
5191
  if dirs.size == 0
5672
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5673
-
5193
+ puts Thread.current.backtrace
5674
5194
  exit(1)
5675
5195
  end
5676
5196
  return dirs.join("/")
@@ -5773,7 +5293,7 @@ module Cnvrg
5773
5293
  is_cnvrg = is_cnvrg_dir
5774
5294
  if !is_cnvrg
5775
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5776
- exit(0)
5296
+ exit(1)
5777
5297
  end
5778
5298
 
5779
5299
  end
@@ -5919,21 +5439,6 @@ module Cnvrg
5919
5439
 
5920
5440
  end
5921
5441
 
5922
- def container_changes(dir)
5923
- container_id = is_project_with_docker(dir)
5924
- if not container_id
5925
- return false
5926
- end
5927
- container = Docker::Container.get(container_id)
5928
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5929
- pip = container.exec(command, tty: true)[0]
5930
- command = ["/bin/bash", "-lc", "dpkg -l"]
5931
- dpkg = container.exec(command, tty: true)[0]
5932
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5933
- history = container.exec(command, tty: true)[0]
5934
- diff = [pip, dpkg, history]
5935
- return diff
5936
- end
5937
5442
 
5938
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5939
5444
  Timeout::timeout(seconds) do
@@ -6116,13 +5621,17 @@ module Cnvrg
6116
5621
 
6117
5622
  end
6118
5623
 
6119
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6120
5625
  if !Helpers.ubuntu?
6121
5626
  return 0.0
6122
5627
  end
6123
5628
  stats = [[],[]]
6124
5629
  begin
6125
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6126
5635
 
6127
5636
  if !gpu_stats.nil?
6128
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]