cnvrg 1.6.38 → 1.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e5953510af9633c925be99ac1463aef31667eed5f83f5b0c610672ab9fe375cb
4
- data.tar.gz: 7bd8fce2d93a20ed695e4c202f2b29092de84bf90332385a3921844046a0c591
3
+ metadata.gz: e708ef034df38ed0b4f5c1ac4bb02fa79a26c93b188f571256f75dbc9d2eaaa6
4
+ data.tar.gz: 6badf54b65660776e63c02c7d3c5dbbab83d0e1e83f6e877b48d77fad5ba3036
5
5
  SHA512:
6
- metadata.gz: 8b93cd899060800095895967dbe2b8f3391e2d77b6f29ea05cd6651de4d26de2ba24fdaedaa8dafad82a22e250d95abf7651d2b77afe2dcf68b883461774c965
7
- data.tar.gz: a4dc324e6f58b628013bb00278a74233f6b2c6e4ffa260e9edf74ea2557ca144907449078defb1abdb22ffe411fdde196b2e029bd66e67233140cf54d6caf67d
6
+ metadata.gz: 21d89ec4fb99c4102bc1e8e0e50df516339a1c9e9660ee8f0dd8acf3ae30bd27067f5ea4fe979de3b737bd6f748ced98f023100487a4226b7f21eed17975142c
7
+ data.tar.gz: 91fb2d10994c11e9b28ef3bbc128f847ac2efd641892c29ec1ec2b16d4b125266e85a6166153b66ab9e9e1c475190f6eca771e42d739a02c1136dbe8cb6c3abb
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk', '~> 3.0'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, message: nil)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1251
1264
 
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1277
+
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1954,8 +2012,6 @@ module Cnvrg
1954
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1955
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1956
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1957
-
1958
-
1959
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1960
2016
  verify_logged_in(true)
1961
2017
  log_start(__method__, args, options)
@@ -1964,11 +2020,13 @@ module Cnvrg
1964
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1965
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1966
2022
  end
1967
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1968
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1969
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1970
2026
 
1971
2027
  end
2028
+
2029
+
1972
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1973
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1974
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2214,13 +2272,24 @@ module Cnvrg
2214
2272
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2215
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2216
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2217
2277
 
2218
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2219
2279
  begin
2220
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2221
2281
  verify_logged_in(true)
2222
2282
  log_start(__method__, args, options)
2223
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2224
2293
  commit_msg = options["message"]
2225
2294
  if commit_msg.nil? or commit_msg.empty?
2226
2295
  commit_msg = ""
@@ -2292,8 +2361,6 @@ module Cnvrg
2292
2361
  end
2293
2362
  update_count = 0
2294
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2295
- successful_updates = []
2296
- successful_deletions = []
2297
2364
  if options["verbose"]
2298
2365
  if update_total == 1
2299
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2313,8 +2380,11 @@ module Cnvrg
2313
2380
  end
2314
2381
  job_type = options['job_type'] || job_type
2315
2382
  job_slug = options['job_slug'] || job_slug
2316
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2317
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2318
2388
  # upload / update
2319
2389
  # delete
2320
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2325,32 +2395,30 @@ module Cnvrg
2325
2395
  :starting_at => 0,
2326
2396
  :total => (to_upload.size + deleted.size),
2327
2397
  :autofinish => true)
2328
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2329
2398
 
2330
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2331
2401
 
2332
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2333
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2334
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2335
2414
  raise StandardError.new("Cant end commit")
2336
2415
  end
2416
+
2337
2417
  # save idx
2338
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2339
2419
  @project.update_idx_with_commit!(commit_sha1)
2340
2420
  if options["verbose"]
2341
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2342
- if successful_updates.size > 0
2343
- successful_updates.flatten!
2344
- log_message("Updated:", Thor::Shell::Color::GREEN)
2345
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2346
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2347
- end
2348
- if successful_deletions.size > 0
2349
- successful_deletions.flatten!
2350
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2351
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2352
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2353
- end
2354
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2355
2423
  else
2356
2424
  if return_id
@@ -2375,9 +2443,13 @@ module Cnvrg
2375
2443
  if e.is_a? SignalException
2376
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2377
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2378
2448
  else
2379
2449
  log_message(error_message, Thor::Shell::Color::RED)
2380
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2381
2453
  end
2382
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2383
2455
  print_res = {
@@ -2896,6 +2968,10 @@ module Cnvrg
2896
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2897
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2898
2970
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2971
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2972
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2973
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2974
+
2899
2975
  def sync(direct = true)
2900
2976
  verify_logged_in(true) if direct
2901
2977
  @project = Project.new(get_project_home)
@@ -2907,16 +2983,20 @@ module Cnvrg
2907
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2908
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2909
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2910
2988
  run_download = true
2911
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2912
2990
  run_download = false
2913
2991
  end
2914
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2915
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2916
2995
  end
2917
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2918
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2919
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type, :git_diff=> options["git_diff"]
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2920
3000
  end
2921
3001
 
2922
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3061,6 +3141,8 @@ module Cnvrg
3061
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3062
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3063
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3064
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3065
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3066
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3068,6 +3150,7 @@ module Cnvrg
3068
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3069
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3070
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3071
3154
 
3072
3155
  def exec(*cmd)
3073
3156
  log = []
@@ -3134,8 +3217,12 @@ module Cnvrg
3134
3217
  end
3135
3218
  remote = options["remote"]
3136
3219
  if remote
3137
- docker_id = `cat /etc/hostname`
3138
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3139
3226
  end
3140
3227
  is_on_gpu = options["gpu"]
3141
3228
  start_commit = @project.last_local_commit
@@ -3145,9 +3232,9 @@ module Cnvrg
3145
3232
 
3146
3233
  platform = RUBY_PLATFORM
3147
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3148
3236
  begin
3149
- machine_activity = @exp.get_machine_activity(working_dir)
3150
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3151
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3152
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3153
3240
  unless @exp.slug.nil?
@@ -3165,7 +3252,7 @@ module Cnvrg
3165
3252
  begin
3166
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3167
3254
  if is_on_gpu
3168
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3169
3256
  stats['gpu_util'] = gu[0]
3170
3257
  stats['gpu'] = gu[1]
3171
3258
  end
@@ -3177,6 +3264,16 @@ module Cnvrg
3177
3264
  end
3178
3265
  end
3179
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3180
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3181
3278
  begin
3182
3279
  stdout.each do |line|
@@ -3191,7 +3288,7 @@ module Cnvrg
3191
3288
  puts line
3192
3289
  end
3193
3290
  log << cur_log
3194
- if log.size >= 5
3291
+ if log.size >= 1
3195
3292
  @exp.upload_temp_log(log) unless log.empty?
3196
3293
  log = []
3197
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3241,29 +3338,26 @@ module Cnvrg
3241
3338
  exp_success = false
3242
3339
  end
3243
3340
 
3244
- if sync_after
3245
- @exp.job_log(["Syncing Experiment"])
3246
- # Sync after run
3247
- if @project.is_git
3248
- output_dir = output_dir || @exp.output_dir
3249
- if output_dir.present?
3250
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3251
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3252
- end
3253
- else
3254
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3255
-
3256
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3257
3348
  end
3258
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3259
3351
  end
3352
+ end
3353
+
3260
3354
  end_commit = @project.last_local_commit
3261
3355
  if end_commit.present?
3262
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3263
3357
  end
3264
3358
 
3265
3359
  # log_thread.join
3266
- stats_thread.join
3360
+ stats_thread.join
3267
3361
 
3268
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3269
3363
 
@@ -3411,8 +3505,8 @@ module Cnvrg
3411
3505
  local_folders_options = options["local_folders"]
3412
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3413
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3414
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3415
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3416
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3417
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3418
3512
  commit_to_run = options["commit"] || nil
@@ -4237,144 +4331,6 @@ module Cnvrg
4237
4331
 
4238
4332
  end
4239
4333
 
4240
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4241
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4242
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4243
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4244
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4245
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4246
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4247
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4248
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4249
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4250
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4251
-
4252
-
4253
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4254
-
4255
- def build_image(image_name)
4256
- begin
4257
- verify_logged_in(false)
4258
- log_start(__method__, args, options)
4259
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4260
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4261
- instance_type = get_instance_type(instances)
4262
- image_extend = options["image"]
4263
- public = options["public"]
4264
- base = options["base"]
4265
- python3 = options["python3"]
4266
- docker_path = options["docker_path"]
4267
- owner = CLI.get_owner
4268
- checks = Helpers.checkmark()
4269
- tar_path = nil
4270
- if !docker_path.nil? and !docker_path.empty?
4271
- docker_path = File.absolute_path(docker_path)
4272
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4273
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4274
- resp = create_docker_tar(docker_path, tar_path)
4275
- if !resp
4276
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4277
- FileUtils.rm_rf tar_path
4278
- exit(1)
4279
- end
4280
- files = Cnvrg::Files.new(owner, "")
4281
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4282
- if resp
4283
- end
4284
- else
4285
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4286
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4287
-
4288
- end
4289
-
4290
- if Cnvrg::CLI.is_response_success(resp, false)
4291
- image_slug = resp["result"]["slug"]
4292
- container = resp["result"]["machine_c"]
4293
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4294
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4295
- ssh = Ssh.new(resp)
4296
- if !ssh.is_ssh
4297
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4298
- Images.revoke_custom_new_image(owner, image_slug)
4299
- end
4300
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4301
- begin
4302
- logs = []
4303
-
4304
- while true
4305
- command = ask("$>")
4306
- logs << {time: Time.now,
4307
- message: command,
4308
- type: "stdout"
4309
- }
4310
- if command.eql? "quit"
4311
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4312
- break
4313
- end
4314
- res = ssh.exec_command(command)
4315
- begin
4316
- res_parsed = JSON.parse(res)
4317
- res = res_parsed.join(",")
4318
- end
4319
-
4320
- puts res
4321
- logs << {time: Time.now,
4322
- message: res,
4323
- type: "stdout"
4324
- }
4325
- logs.flatten!
4326
-
4327
- end
4328
-
4329
- rescue SignalException
4330
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4331
-
4332
- end
4333
- resp = Images.commit_custom_image(owner, image_slug, logs)
4334
- if Cnvrg::CLI.is_response_success(resp, false)
4335
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4336
- else
4337
- if image_slug
4338
- Images.revoke_custom_new_image(owner, image_slug)
4339
- end
4340
- if ssh
4341
- ssh.close_ssh()
4342
- end
4343
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4344
-
4345
- exit(1)
4346
- end
4347
- if ssh
4348
- ssh.close_ssh()
4349
- end
4350
-
4351
-
4352
- end
4353
- rescue => e
4354
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4355
-
4356
- log_error(e)
4357
- if image_slug
4358
- Images.revoke_custom_new_image(owner, image_slug)
4359
- end
4360
- if ssh
4361
- ssh.close_ssh()
4362
- end
4363
-
4364
-
4365
- rescue SignalException
4366
- if image_slug
4367
- Images.revoke_custom_new_image(owner, image_slug)
4368
- end
4369
- if ssh
4370
- ssh.close_ssh
4371
- end
4372
- say "\nAborting"
4373
- exit(1)
4374
- end
4375
-
4376
- end
4377
-
4378
4334
 
4379
4335
  desc 'build', 'run commands inside containers', :hide => true
4380
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4568,66 +4524,7 @@ module Cnvrg
4568
4524
  end
4569
4525
 
4570
4526
 
4571
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4572
-
4573
- def upload_image_old(image_id, is_public, is_base, *message)
4574
- verify_logged_in(true)
4575
- log_start(__method__, args, options)
4576
- image = Docker::Image.get(image_id)
4577
- project_home = get_project_home
4578
- @project = Project.new(project_home)
4579
- last_local_commit = @project.last_local_commit
4580
- image_name = @project.slug + "#{last_local_commit}"
4581
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4582
- owner = Cnvrg::CLI.get_owner()
4583
- if !message.nil? or !message.empty?
4584
- message = message.join(" ")
4585
- end
4586
-
4587
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4588
- image.save(path)
4589
-
4590
- begin
4591
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4592
- gzipRes = system("gzip -f #{path}")
4593
- if !gzipRes
4594
-
4595
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4596
- exit(1)
4597
- end
4598
- path = path + ".gz"
4599
- @files = Cnvrg::Files.new(owner, "")
4600
-
4601
- exit_status = $?.exitstatus
4602
- if exit_status == 0
4603
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4604
-
4605
- diff = container_changes(Dir.pwd)
4606
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4607
- if res
4608
- File.delete(path)
4609
- image_loc = is_project_with_docker(Dir.pwd)
4610
- image_loc.update_slug(res["result"]["id"])
4611
-
4612
- checks = Helpers.checkmark()
4613
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4614
- else
4615
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4616
-
4617
- end
4618
- else
4619
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4620
- exit(1)
4621
- end
4622
- rescue => e
4623
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4624
- log_error(e)
4625
- rescue SignalException
4626
4527
 
4627
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4628
- exit(1)
4629
- end
4630
- end
4631
4528
 
4632
4529
  desc '', '', :hide => true
4633
4530
 
@@ -4638,278 +4535,30 @@ module Cnvrg
4638
4535
 
4639
4536
  end
4640
4537
 
4641
- desc '', '', :hide => true
4642
-
4643
- def exec_container(container_id, *cmd)
4644
- container = Docker::Container.get(container_id)
4645
- container.start()
4646
- cnvrg_command = cmd.join(" ")
4647
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4648
- res = container.exec(command, tty: true, wait: 5400)[0]
4649
- say res
4650
- end
4651
-
4652
- desc '', '', :hide => true
4653
-
4654
- def port_container(container_id)
4655
- container = Docker::Container.get(container_id)
4656
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4657
- end
4658
-
4659
- desc '', '', :hide => true
4660
-
4661
- def tensor_port_container(container_id)
4662
- container = Docker::Container.get(container_id)
4663
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4664
- end
4665
-
4666
- desc '', '', :hide => true
4667
-
4668
- def stop_container(container_id)
4669
- container = Docker::Container.get(container_id)
4670
- container.stop()
4671
- container.remove()
4672
-
4673
- end
4674
-
4675
- desc '', '', :hide => true
4676
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4677
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4678
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4679
-
4680
-
4681
- def config_remote(image_name, port = 7654, tensport = 6006)
4682
- local_images = Docker::Image.all
4683
-
4684
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4685
- if docker_image_local.empty?
4686
- say "no image"
4687
- exit(1)
4688
- end
4689
-
4690
- begin
4691
- login_content = options["login"]
4692
- app_dir = options["app_dir"]
4693
- cmd = options["cmd"]
4694
- volume_from = options["volume"]
4695
-
4696
- image_settings = {
4697
- 'Image' => "#{image_name}:latest",
4698
-
4699
- 'Cmd' => cmd,
4700
- 'WorkingDir' => app_dir,
4701
- 'ExposedPorts' => {
4702
- '8888/tcp' => {},
4703
- },
4704
- 'HostConfig' => {
4705
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4706
- 'PortBindings' => {
4707
- '8888/tcp' => [
4708
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4709
- ],
4710
- '6006/tcp' => [
4711
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4712
- ],
4713
- },
4714
- },
4715
- }
4716
- container = Docker::Container.create(image_settings)
4717
- container.start()
4718
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4719
- container.exec(command, tty: true)
4720
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4721
- # container.exec(command, tty: true)
4722
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4723
- # container.exec(command, tty: true)
4724
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4725
- container.exec(command, tty: true)
4726
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4727
- container.exec(command, tty: true)
4728
- say "#{container.id}:#{port}##{tensport}"
4729
- rescue => e
4730
- puts e
4731
- if e.message.include? "is not running"
4732
- return config_remote(image_name, port - 1, tensport - 1)
4733
- end
4734
-
4735
- if container
4736
- container.kill()
4737
- end
4738
- return false
4739
- end
4740
- end
4741
-
4742
-
4743
- desc '', '', :hide => true
4744
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4745
-
4746
- def config_netrc(container)
4747
-
4748
- login_content = options["login"]
4749
-
4750
- container = Docker::Container.get(container)
4751
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4754
- container.exec(command, tty: true)
4755
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4756
- container.exec(command, tty: true)
4757
- say "OK"
4758
-
4759
- end
4760
-
4761
- desc '', '', :hide => true
4762
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4763
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4764
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4765
-
4766
-
4767
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4768
- local_images = Docker::Image.all
4769
-
4770
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4771
- if docker_image_local.empty?
4772
- say "no image"
4773
- exit(1)
4774
- end
4775
-
4776
- begin
4777
- login_content = options["login"]
4778
- app_dir = options["app_dir"]
4779
- cmd = options["cmd"]
4780
-
4781
- # image_settings = {
4782
- # 'Image' => "#{image_name}:latest",
4783
- # 'User' => 'ds',
4784
- # 'Cmd' => cmd,
4785
- # 'WorkingDir' => app_dir,
4786
- # 'ExposedPorts' => {
4787
- # '8888/tcp' => {},
4788
- # },
4789
- # 'HostConfig' => {
4790
- # 'PortBindings' => {
4791
- # '8888/tcp' => [
4792
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4793
- # ],
4794
- # '6006/tcp' => [
4795
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4796
- # ],
4797
- # },
4798
- # },
4799
- # }
4800
-
4801
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4802
- container_id = container_id.gsub("\n", "")
4803
- container = Docker::Container.get(container_id)
4804
- # container.start()
4805
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4808
- container.exec(command, tty: true)
4809
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4810
- container.exec(command, tty: true)
4811
- say "#{container.id}:#{port}##{tensport}"
4812
- rescue => e
4813
- if e.message.include? "is not running"
4814
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4815
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4816
- end
4817
-
4818
- if container
4819
- container.kill()
4538
+ desc 'Collect and send job utilization', '', :hide => true
4539
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
4540
+ method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
+ def get_utilization()
4542
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4543
+ docker_id = options["docker_id"]
4544
+ while true do
4545
+ sleep 30
4546
+ begin
4547
+ stats = usage_metrics_in_docker(docker_id)
4548
+ if options["is_on_gpu"]
4549
+ gu = gpu_util(take_from_docker: true, docker_id: docker_id)
4550
+ stats['gpu_util'] = gu[0]
4551
+ stats['gpu'] = gu[1]
4552
+ end
4553
+ stats['docker_id'] = docker_id
4554
+ @exp.send_machine_stats [stats] unless stats.empty?
4555
+ rescue => e
4556
+ log_error(e)
4557
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4820
4558
  end
4821
- return false
4822
4559
  end
4823
4560
  end
4824
4561
 
4825
- desc '', '', :hide => true
4826
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4827
-
4828
- def config_flask_remote(image_name, port = 80)
4829
- local_images = Docker::Image.all
4830
-
4831
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4832
- if docker_image_local.empty?
4833
- say "no image"
4834
- exit(1)
4835
- end
4836
-
4837
- begin
4838
- login_content = options["login"]
4839
- image_settings = {
4840
- 'Image' => "#{image_name}:latest",
4841
- 'User' => 'ds',
4842
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4843
- 'WorkingDir' => '/home/ds/app',
4844
- 'ExposedPorts' => {
4845
- '80/tcp' => {},
4846
- },
4847
- 'HostConfig' => {
4848
- 'PortBindings' => {
4849
- '80/tcp' => [
4850
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4851
- ],
4852
- },
4853
- },
4854
- }
4855
- container = Docker::Container.create(image_settings)
4856
- container.start()
4857
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4860
- container.exec(command, tty: true)
4861
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4862
- container.exec(command, tty: true)
4863
- say "#{container.id}:#{port}"
4864
- rescue => e
4865
- pus e
4866
- if e.message.include? "is not running"
4867
- return "port is taken"
4868
- end
4869
- puts "error"
4870
- if container
4871
- container.kill()
4872
- end
4873
- return false
4874
- end
4875
- end
4876
-
4877
- desc '', '', :hide => true
4878
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4879
-
4880
- def config_flask_remote_gpu(image_name, port = 80)
4881
- local_images = Docker::Image.all
4882
-
4883
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4884
- if docker_image_local.empty?
4885
- say "no image"
4886
- exit(1)
4887
- end
4888
-
4889
- begin
4890
- login_content = options["login"]
4891
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4892
- container_id = container_id.gsub("\n", "")
4893
- container = Docker::Container.get(container_id)
4894
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4897
- container.exec(command, tty: true)
4898
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4899
- container.exec(command, tty: true)
4900
- say "#{container.id}:#{port}"
4901
- rescue => e
4902
- puts e
4903
- if e.message.include? "is not running"
4904
- return "port is taken"
4905
- end
4906
- puts "error"
4907
- if container
4908
- container.kill()
4909
- end
4910
- return false
4911
- end
4912
- end
4913
4562
 
4914
4563
  desc '', '', :hide => true
4915
4564
 
@@ -4935,39 +4584,10 @@ module Cnvrg
4935
4584
 
4936
4585
  end
4937
4586
 
4938
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4939
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4940
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4941
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4942
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4943
- def upload_image(image_name,image_path)
4944
- begin
4945
- verify_logged_in(false)
4946
- log_start(__method__, args, options)
4947
-
4948
- @image = Cnvrg::Images.new()
4949
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4950
- workdir = options[:workdir]
4951
- description = options[:description]
4952
- user = options[:user]
4953
- is_gpu = options[:gpu]
4954
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4955
- if res["status"] == 200
4956
- image_slug = res["id"]
4957
- owner = CLI.get_owner
4958
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4959
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4960
-
4961
-
4962
- else
4963
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4964
-
4965
- end
4966
- rescue => e
4967
- log_error(e)
4968
- end
4969
-
4970
-
4587
+ desc 'file_exists', description: '', hide: true
4588
+ def file_exists(file)
4589
+ exit(0) if File.exists? file
4590
+ exit(1)
4971
4591
  end
4972
4592
 
4973
4593
 
@@ -5147,29 +4767,40 @@ module Cnvrg
5147
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5148
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5149
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5150
4771
 
5151
4772
  def compare_experiments
5152
4773
  verify_logged_in(true)
5153
4774
  log_start(__method__, args, options)
5154
4775
  exps_map = {}
4776
+ copied_commits = []
5155
4777
 
5156
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5157
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5158
4780
  return false
5159
4781
  end
5160
- slugs = options[:slugs].split(",")
5161
- if slugs.blank?
5162
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5163
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5164
4784
  end
4785
+
5165
4786
  frequency = options[:frequency] || 5
5166
4787
  namespace = options[:namespace]
5167
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5168
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5169
4799
 
4800
+ log_message("compare is running")
5170
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5171
4803
  slugs.each do |exp_slug|
5172
-
5173
4804
  begin
5174
4805
  if exps_map[exp_slug].blank?
5175
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5183,15 +4814,23 @@ module Cnvrg
5183
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5184
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5185
4816
  exps_map[exp_slug] = exp
5186
- elsif exp["machine_activity"].present?
4817
+ else
5187
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5188
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5189
4825
  end
5190
4826
  rescue => e
5191
4827
  Cnvrg::Logger.log_error(e)
5192
4828
  end
5193
4829
  end
5194
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5195
4834
  end
5196
4835
  end
5197
4836
 
@@ -5277,127 +4916,6 @@ module Cnvrg
5277
4916
  end
5278
4917
 
5279
4918
 
5280
- desc 'pull_image', 'downloads and loads an image', :hide => true
5281
-
5282
- def pull_image(image_name)
5283
- begin
5284
- verify_logged_in(false)
5285
- log_start(__method__, args, options)
5286
- owner = Cnvrg::CLI.get_owner()
5287
- image = Cnvrg::Images.image_exist(owner, image_name)
5288
- if !image
5289
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5290
- exit(1)
5291
- end
5292
- path = download_image(image_name, image["slug"])
5293
- if path
5294
- log_message("Building image", Thor::Shell::Color::BLUE)
5295
- Docker.options[:read_timeout] = 216000
5296
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5297
- begin
5298
- if (log = JSON.parse(v)) && log.has_key?("stream")
5299
- next if log["stream"].starts_with? "Step"
5300
- $stdout.puts log["stream"]
5301
- end
5302
- rescue
5303
- end
5304
-
5305
- end
5306
-
5307
- if not image.nil?
5308
- FileUtils.rm_rf(path)
5309
- checks = Helpers.checkmark()
5310
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5311
- return image
5312
- else
5313
-
5314
- log_message("Could not build image", Thor::Shell::Color::RED)
5315
- return false
5316
- end
5317
- else
5318
-
5319
- log_message("Could not download image", Thor::Shell::Color::RED)
5320
- return false
5321
-
5322
-
5323
- end
5324
-
5325
- # else
5326
- # path = download_image(image_name,image["slug"])
5327
- # if path
5328
- # image = Docker::Image.import(path)
5329
- # image.tag('repo' => image_name, 'tag' => 'latest')
5330
- # if not image.nil?
5331
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5332
- # FileUtils.rm(path)
5333
- # checks = Helpers.checkmark()
5334
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5335
- # log_end(0)
5336
- # return image
5337
- # log_end(0)
5338
- # else
5339
- # say "Could not download image", Thor::Shell::Color::RED
5340
- # return false
5341
- # end
5342
- #
5343
- # end
5344
- # end
5345
- rescue => e
5346
-
5347
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5348
- log_error(e)
5349
-
5350
- rescue SignalException
5351
- say "\nAborting"
5352
- exit(1)
5353
- ensure
5354
- if path
5355
- FileUtils.rm_rf(path)
5356
-
5357
- end
5358
- end
5359
-
5360
-
5361
- end
5362
-
5363
- desc 'set_image', 'set image to a porject', :hide => true
5364
-
5365
- def set_image(docker_image)
5366
- verify_logged_in(true)
5367
- log_start(__method__, args, options)
5368
- working_dir = is_cnvrg_dir
5369
- project = Project.new(working_dir)
5370
-
5371
- local_images = Docker::Image.all
5372
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5373
- if docker_image_local.size == 0
5374
-
5375
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5376
- image = pull(docker_image)
5377
- if image
5378
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5379
- @image = Images.new(working_dir, docker_image)
5380
- else
5381
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5382
- exit(1)
5383
- end
5384
- else
5385
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5386
- exit(1)
5387
-
5388
- end
5389
- elsif docker_image_local.size == 1
5390
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5391
- @image = Images.new(working_dir, docker_image_local[0])
5392
- elsif docker_image_local.size > 1
5393
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5394
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5395
- image_name = image_name.strip
5396
- @image = Images.new(working_dir, image_name)
5397
- end
5398
- @image.update_image_activity(project.last_local_commit, nil)
5399
- end
5400
-
5401
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5402
4920
  def check_pod_restart
5403
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5672,7 +5190,7 @@ module Cnvrg
5672
5190
 
5673
5191
  if dirs.size == 0
5674
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5675
-
5193
+ puts Thread.current.backtrace
5676
5194
  exit(1)
5677
5195
  end
5678
5196
  return dirs.join("/")
@@ -5775,7 +5293,7 @@ module Cnvrg
5775
5293
  is_cnvrg = is_cnvrg_dir
5776
5294
  if !is_cnvrg
5777
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5778
- exit(0)
5296
+ exit(1)
5779
5297
  end
5780
5298
 
5781
5299
  end
@@ -5921,21 +5439,6 @@ module Cnvrg
5921
5439
 
5922
5440
  end
5923
5441
 
5924
- def container_changes(dir)
5925
- container_id = is_project_with_docker(dir)
5926
- if not container_id
5927
- return false
5928
- end
5929
- container = Docker::Container.get(container_id)
5930
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5931
- pip = container.exec(command, tty: true)[0]
5932
- command = ["/bin/bash", "-lc", "dpkg -l"]
5933
- dpkg = container.exec(command, tty: true)[0]
5934
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5935
- history = container.exec(command, tty: true)[0]
5936
- diff = [pip, dpkg, history]
5937
- return diff
5938
- end
5939
5442
 
5940
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5941
5444
  Timeout::timeout(seconds) do
@@ -6118,13 +5621,17 @@ module Cnvrg
6118
5621
 
6119
5622
  end
6120
5623
 
6121
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6122
5625
  if !Helpers.ubuntu?
6123
5626
  return 0.0
6124
5627
  end
6125
5628
  stats = [[],[]]
6126
5629
  begin
6127
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6128
5635
 
6129
5636
  if !gpu_stats.nil?
6130
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]