cnvrg 1.6.38 → 1.9.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e5953510af9633c925be99ac1463aef31667eed5f83f5b0c610672ab9fe375cb
4
- data.tar.gz: 7bd8fce2d93a20ed695e4c202f2b29092de84bf90332385a3921844046a0c591
3
+ metadata.gz: e708ef034df38ed0b4f5c1ac4bb02fa79a26c93b188f571256f75dbc9d2eaaa6
4
+ data.tar.gz: 6badf54b65660776e63c02c7d3c5dbbab83d0e1e83f6e877b48d77fad5ba3036
5
5
  SHA512:
6
- metadata.gz: 8b93cd899060800095895967dbe2b8f3391e2d77b6f29ea05cd6651de4d26de2ba24fdaedaa8dafad82a22e250d95abf7651d2b77afe2dcf68b883461774c965
7
- data.tar.gz: a4dc324e6f58b628013bb00278a74233f6b2c6e4ffa260e9edf74ea2557ca144907449078defb1abdb22ffe411fdde196b2e029bd66e67233140cf54d6caf67d
6
+ metadata.gz: 21d89ec4fb99c4102bc1e8e0e50df516339a1c9e9660ee8f0dd8acf3ae30bd27067f5ea4fe979de3b737bd6f748ced98f023100487a4226b7f21eed17975142c
7
+ data.tar.gz: 91fb2d10994c11e9b28ef3bbc128f847ac2efd641892c29ec1ec2b16d4b125266e85a6166153b66ab9e9e1c475190f6eca771e42d739a02c1136dbe8cb6c3abb
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk', '~> 3.0'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, message: nil)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1251
1264
 
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1277
+
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1954,8 +2012,6 @@ module Cnvrg
1954
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1955
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1956
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1957
-
1958
-
1959
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1960
2016
  verify_logged_in(true)
1961
2017
  log_start(__method__, args, options)
@@ -1964,11 +2020,13 @@ module Cnvrg
1964
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1965
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1966
2022
  end
1967
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1968
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1969
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1970
2026
 
1971
2027
  end
2028
+
2029
+
1972
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1973
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1974
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2214,13 +2272,24 @@ module Cnvrg
2214
2272
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2215
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2216
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2217
2277
 
2218
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2219
2279
  begin
2220
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2221
2281
  verify_logged_in(true)
2222
2282
  log_start(__method__, args, options)
2223
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2224
2293
  commit_msg = options["message"]
2225
2294
  if commit_msg.nil? or commit_msg.empty?
2226
2295
  commit_msg = ""
@@ -2292,8 +2361,6 @@ module Cnvrg
2292
2361
  end
2293
2362
  update_count = 0
2294
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2295
- successful_updates = []
2296
- successful_deletions = []
2297
2364
  if options["verbose"]
2298
2365
  if update_total == 1
2299
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2313,8 +2380,11 @@ module Cnvrg
2313
2380
  end
2314
2381
  job_type = options['job_type'] || job_type
2315
2382
  job_slug = options['job_slug'] || job_slug
2316
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2317
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2318
2388
  # upload / update
2319
2389
  # delete
2320
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2325,32 +2395,30 @@ module Cnvrg
2325
2395
  :starting_at => 0,
2326
2396
  :total => (to_upload.size + deleted.size),
2327
2397
  :autofinish => true)
2328
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2329
2398
 
2330
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2331
2401
 
2332
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2333
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2334
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2335
2414
  raise StandardError.new("Cant end commit")
2336
2415
  end
2416
+
2337
2417
  # save idx
2338
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2339
2419
  @project.update_idx_with_commit!(commit_sha1)
2340
2420
  if options["verbose"]
2341
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2342
- if successful_updates.size > 0
2343
- successful_updates.flatten!
2344
- log_message("Updated:", Thor::Shell::Color::GREEN)
2345
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2346
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2347
- end
2348
- if successful_deletions.size > 0
2349
- successful_deletions.flatten!
2350
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2351
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2352
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2353
- end
2354
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2355
2423
  else
2356
2424
  if return_id
@@ -2375,9 +2443,13 @@ module Cnvrg
2375
2443
  if e.is_a? SignalException
2376
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2377
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2378
2448
  else
2379
2449
  log_message(error_message, Thor::Shell::Color::RED)
2380
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2381
2453
  end
2382
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2383
2455
  print_res = {
@@ -2896,6 +2968,10 @@ module Cnvrg
2896
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2897
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2898
2970
  method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2971
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2972
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2973
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2974
+
2899
2975
  def sync(direct = true)
2900
2976
  verify_logged_in(true) if direct
2901
2977
  @project = Project.new(get_project_home)
@@ -2907,16 +2983,20 @@ module Cnvrg
2907
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2908
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2909
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2910
2988
  run_download = true
2911
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2912
2990
  run_download = false
2913
2991
  end
2914
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2915
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2916
2995
  end
2917
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2918
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2919
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type, :git_diff=> options["git_diff"]
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2920
3000
  end
2921
3001
 
2922
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3061,6 +3141,8 @@ module Cnvrg
3061
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3062
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3063
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3064
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3065
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3066
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3068,6 +3150,7 @@ module Cnvrg
3068
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3069
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3070
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3071
3154
 
3072
3155
  def exec(*cmd)
3073
3156
  log = []
@@ -3134,8 +3217,12 @@ module Cnvrg
3134
3217
  end
3135
3218
  remote = options["remote"]
3136
3219
  if remote
3137
- docker_id = `cat /etc/hostname`
3138
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3139
3226
  end
3140
3227
  is_on_gpu = options["gpu"]
3141
3228
  start_commit = @project.last_local_commit
@@ -3145,9 +3232,9 @@ module Cnvrg
3145
3232
 
3146
3233
  platform = RUBY_PLATFORM
3147
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3148
3236
  begin
3149
- machine_activity = @exp.get_machine_activity(working_dir)
3150
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3151
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3152
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3153
3240
  unless @exp.slug.nil?
@@ -3165,7 +3252,7 @@ module Cnvrg
3165
3252
  begin
3166
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3167
3254
  if is_on_gpu
3168
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3169
3256
  stats['gpu_util'] = gu[0]
3170
3257
  stats['gpu'] = gu[1]
3171
3258
  end
@@ -3177,6 +3264,16 @@ module Cnvrg
3177
3264
  end
3178
3265
  end
3179
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3180
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3181
3278
  begin
3182
3279
  stdout.each do |line|
@@ -3191,7 +3288,7 @@ module Cnvrg
3191
3288
  puts line
3192
3289
  end
3193
3290
  log << cur_log
3194
- if log.size >= 5
3291
+ if log.size >= 1
3195
3292
  @exp.upload_temp_log(log) unless log.empty?
3196
3293
  log = []
3197
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3241,29 +3338,26 @@ module Cnvrg
3241
3338
  exp_success = false
3242
3339
  end
3243
3340
 
3244
- if sync_after
3245
- @exp.job_log(["Syncing Experiment"])
3246
- # Sync after run
3247
- if @project.is_git
3248
- output_dir = output_dir || @exp.output_dir
3249
- if output_dir.present?
3250
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3251
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3252
- end
3253
- else
3254
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3255
-
3256
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3257
3348
  end
3258
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3259
3351
  end
3352
+ end
3353
+
3260
3354
  end_commit = @project.last_local_commit
3261
3355
  if end_commit.present?
3262
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3263
3357
  end
3264
3358
 
3265
3359
  # log_thread.join
3266
- stats_thread.join
3360
+ stats_thread.join
3267
3361
 
3268
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3269
3363
 
@@ -3411,8 +3505,8 @@ module Cnvrg
3411
3505
  local_folders_options = options["local_folders"]
3412
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3413
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3414
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3415
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3416
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3417
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3418
3512
  commit_to_run = options["commit"] || nil
@@ -4237,144 +4331,6 @@ module Cnvrg
4237
4331
 
4238
4332
  end
4239
4333
 
4240
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4241
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4242
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4243
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4244
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4245
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4246
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4247
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4248
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4249
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4250
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4251
-
4252
-
4253
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4254
-
4255
- def build_image(image_name)
4256
- begin
4257
- verify_logged_in(false)
4258
- log_start(__method__, args, options)
4259
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4260
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4261
- instance_type = get_instance_type(instances)
4262
- image_extend = options["image"]
4263
- public = options["public"]
4264
- base = options["base"]
4265
- python3 = options["python3"]
4266
- docker_path = options["docker_path"]
4267
- owner = CLI.get_owner
4268
- checks = Helpers.checkmark()
4269
- tar_path = nil
4270
- if !docker_path.nil? and !docker_path.empty?
4271
- docker_path = File.absolute_path(docker_path)
4272
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4273
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4274
- resp = create_docker_tar(docker_path, tar_path)
4275
- if !resp
4276
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4277
- FileUtils.rm_rf tar_path
4278
- exit(1)
4279
- end
4280
- files = Cnvrg::Files.new(owner, "")
4281
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4282
- if resp
4283
- end
4284
- else
4285
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4286
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4287
-
4288
- end
4289
-
4290
- if Cnvrg::CLI.is_response_success(resp, false)
4291
- image_slug = resp["result"]["slug"]
4292
- container = resp["result"]["machine_c"]
4293
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4294
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4295
- ssh = Ssh.new(resp)
4296
- if !ssh.is_ssh
4297
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4298
- Images.revoke_custom_new_image(owner, image_slug)
4299
- end
4300
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4301
- begin
4302
- logs = []
4303
-
4304
- while true
4305
- command = ask("$>")
4306
- logs << {time: Time.now,
4307
- message: command,
4308
- type: "stdout"
4309
- }
4310
- if command.eql? "quit"
4311
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4312
- break
4313
- end
4314
- res = ssh.exec_command(command)
4315
- begin
4316
- res_parsed = JSON.parse(res)
4317
- res = res_parsed.join(",")
4318
- end
4319
-
4320
- puts res
4321
- logs << {time: Time.now,
4322
- message: res,
4323
- type: "stdout"
4324
- }
4325
- logs.flatten!
4326
-
4327
- end
4328
-
4329
- rescue SignalException
4330
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4331
-
4332
- end
4333
- resp = Images.commit_custom_image(owner, image_slug, logs)
4334
- if Cnvrg::CLI.is_response_success(resp, false)
4335
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4336
- else
4337
- if image_slug
4338
- Images.revoke_custom_new_image(owner, image_slug)
4339
- end
4340
- if ssh
4341
- ssh.close_ssh()
4342
- end
4343
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4344
-
4345
- exit(1)
4346
- end
4347
- if ssh
4348
- ssh.close_ssh()
4349
- end
4350
-
4351
-
4352
- end
4353
- rescue => e
4354
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4355
-
4356
- log_error(e)
4357
- if image_slug
4358
- Images.revoke_custom_new_image(owner, image_slug)
4359
- end
4360
- if ssh
4361
- ssh.close_ssh()
4362
- end
4363
-
4364
-
4365
- rescue SignalException
4366
- if image_slug
4367
- Images.revoke_custom_new_image(owner, image_slug)
4368
- end
4369
- if ssh
4370
- ssh.close_ssh
4371
- end
4372
- say "\nAborting"
4373
- exit(1)
4374
- end
4375
-
4376
- end
4377
-
4378
4334
 
4379
4335
  desc 'build', 'run commands inside containers', :hide => true
4380
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4568,66 +4524,7 @@ module Cnvrg
4568
4524
  end
4569
4525
 
4570
4526
 
4571
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4572
-
4573
- def upload_image_old(image_id, is_public, is_base, *message)
4574
- verify_logged_in(true)
4575
- log_start(__method__, args, options)
4576
- image = Docker::Image.get(image_id)
4577
- project_home = get_project_home
4578
- @project = Project.new(project_home)
4579
- last_local_commit = @project.last_local_commit
4580
- image_name = @project.slug + "#{last_local_commit}"
4581
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4582
- owner = Cnvrg::CLI.get_owner()
4583
- if !message.nil? or !message.empty?
4584
- message = message.join(" ")
4585
- end
4586
-
4587
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4588
- image.save(path)
4589
-
4590
- begin
4591
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4592
- gzipRes = system("gzip -f #{path}")
4593
- if !gzipRes
4594
-
4595
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4596
- exit(1)
4597
- end
4598
- path = path + ".gz"
4599
- @files = Cnvrg::Files.new(owner, "")
4600
-
4601
- exit_status = $?.exitstatus
4602
- if exit_status == 0
4603
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4604
-
4605
- diff = container_changes(Dir.pwd)
4606
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4607
- if res
4608
- File.delete(path)
4609
- image_loc = is_project_with_docker(Dir.pwd)
4610
- image_loc.update_slug(res["result"]["id"])
4611
-
4612
- checks = Helpers.checkmark()
4613
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4614
- else
4615
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4616
-
4617
- end
4618
- else
4619
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4620
- exit(1)
4621
- end
4622
- rescue => e
4623
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4624
- log_error(e)
4625
- rescue SignalException
4626
4527
 
4627
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4628
- exit(1)
4629
- end
4630
- end
4631
4528
 
4632
4529
  desc '', '', :hide => true
4633
4530
 
@@ -4638,278 +4535,30 @@ module Cnvrg
4638
4535
 
4639
4536
  end
4640
4537
 
4641
- desc '', '', :hide => true
4642
-
4643
- def exec_container(container_id, *cmd)
4644
- container = Docker::Container.get(container_id)
4645
- container.start()
4646
- cnvrg_command = cmd.join(" ")
4647
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4648
- res = container.exec(command, tty: true, wait: 5400)[0]
4649
- say res
4650
- end
4651
-
4652
- desc '', '', :hide => true
4653
-
4654
- def port_container(container_id)
4655
- container = Docker::Container.get(container_id)
4656
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4657
- end
4658
-
4659
- desc '', '', :hide => true
4660
-
4661
- def tensor_port_container(container_id)
4662
- container = Docker::Container.get(container_id)
4663
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4664
- end
4665
-
4666
- desc '', '', :hide => true
4667
-
4668
- def stop_container(container_id)
4669
- container = Docker::Container.get(container_id)
4670
- container.stop()
4671
- container.remove()
4672
-
4673
- end
4674
-
4675
- desc '', '', :hide => true
4676
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4677
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4678
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4679
-
4680
-
4681
- def config_remote(image_name, port = 7654, tensport = 6006)
4682
- local_images = Docker::Image.all
4683
-
4684
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4685
- if docker_image_local.empty?
4686
- say "no image"
4687
- exit(1)
4688
- end
4689
-
4690
- begin
4691
- login_content = options["login"]
4692
- app_dir = options["app_dir"]
4693
- cmd = options["cmd"]
4694
- volume_from = options["volume"]
4695
-
4696
- image_settings = {
4697
- 'Image' => "#{image_name}:latest",
4698
-
4699
- 'Cmd' => cmd,
4700
- 'WorkingDir' => app_dir,
4701
- 'ExposedPorts' => {
4702
- '8888/tcp' => {},
4703
- },
4704
- 'HostConfig' => {
4705
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4706
- 'PortBindings' => {
4707
- '8888/tcp' => [
4708
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4709
- ],
4710
- '6006/tcp' => [
4711
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4712
- ],
4713
- },
4714
- },
4715
- }
4716
- container = Docker::Container.create(image_settings)
4717
- container.start()
4718
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4719
- container.exec(command, tty: true)
4720
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4721
- # container.exec(command, tty: true)
4722
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4723
- # container.exec(command, tty: true)
4724
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4725
- container.exec(command, tty: true)
4726
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4727
- container.exec(command, tty: true)
4728
- say "#{container.id}:#{port}##{tensport}"
4729
- rescue => e
4730
- puts e
4731
- if e.message.include? "is not running"
4732
- return config_remote(image_name, port - 1, tensport - 1)
4733
- end
4734
-
4735
- if container
4736
- container.kill()
4737
- end
4738
- return false
4739
- end
4740
- end
4741
-
4742
-
4743
- desc '', '', :hide => true
4744
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4745
-
4746
- def config_netrc(container)
4747
-
4748
- login_content = options["login"]
4749
-
4750
- container = Docker::Container.get(container)
4751
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4754
- container.exec(command, tty: true)
4755
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4756
- container.exec(command, tty: true)
4757
- say "OK"
4758
-
4759
- end
4760
-
4761
- desc '', '', :hide => true
4762
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4763
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4764
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4765
-
4766
-
4767
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4768
- local_images = Docker::Image.all
4769
-
4770
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4771
- if docker_image_local.empty?
4772
- say "no image"
4773
- exit(1)
4774
- end
4775
-
4776
- begin
4777
- login_content = options["login"]
4778
- app_dir = options["app_dir"]
4779
- cmd = options["cmd"]
4780
-
4781
- # image_settings = {
4782
- # 'Image' => "#{image_name}:latest",
4783
- # 'User' => 'ds',
4784
- # 'Cmd' => cmd,
4785
- # 'WorkingDir' => app_dir,
4786
- # 'ExposedPorts' => {
4787
- # '8888/tcp' => {},
4788
- # },
4789
- # 'HostConfig' => {
4790
- # 'PortBindings' => {
4791
- # '8888/tcp' => [
4792
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4793
- # ],
4794
- # '6006/tcp' => [
4795
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4796
- # ],
4797
- # },
4798
- # },
4799
- # }
4800
-
4801
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4802
- container_id = container_id.gsub("\n", "")
4803
- container = Docker::Container.get(container_id)
4804
- # container.start()
4805
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4808
- container.exec(command, tty: true)
4809
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4810
- container.exec(command, tty: true)
4811
- say "#{container.id}:#{port}##{tensport}"
4812
- rescue => e
4813
- if e.message.include? "is not running"
4814
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4815
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4816
- end
4817
-
4818
- if container
4819
- container.kill()
4538
+ desc 'Collect and send job utilization', '', :hide => true
4539
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
4540
+ method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
+ def get_utilization()
4542
+ @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
4543
+ docker_id = options["docker_id"]
4544
+ while true do
4545
+ sleep 30
4546
+ begin
4547
+ stats = usage_metrics_in_docker(docker_id)
4548
+ if options["is_on_gpu"]
4549
+ gu = gpu_util(take_from_docker: true, docker_id: docker_id)
4550
+ stats['gpu_util'] = gu[0]
4551
+ stats['gpu'] = gu[1]
4552
+ end
4553
+ stats['docker_id'] = docker_id
4554
+ @exp.send_machine_stats [stats] unless stats.empty?
4555
+ rescue => e
4556
+ log_error(e)
4557
+ log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
4820
4558
  end
4821
- return false
4822
4559
  end
4823
4560
  end
4824
4561
 
4825
- desc '', '', :hide => true
4826
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4827
-
4828
- def config_flask_remote(image_name, port = 80)
4829
- local_images = Docker::Image.all
4830
-
4831
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4832
- if docker_image_local.empty?
4833
- say "no image"
4834
- exit(1)
4835
- end
4836
-
4837
- begin
4838
- login_content = options["login"]
4839
- image_settings = {
4840
- 'Image' => "#{image_name}:latest",
4841
- 'User' => 'ds',
4842
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4843
- 'WorkingDir' => '/home/ds/app',
4844
- 'ExposedPorts' => {
4845
- '80/tcp' => {},
4846
- },
4847
- 'HostConfig' => {
4848
- 'PortBindings' => {
4849
- '80/tcp' => [
4850
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4851
- ],
4852
- },
4853
- },
4854
- }
4855
- container = Docker::Container.create(image_settings)
4856
- container.start()
4857
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4860
- container.exec(command, tty: true)
4861
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4862
- container.exec(command, tty: true)
4863
- say "#{container.id}:#{port}"
4864
- rescue => e
4865
- pus e
4866
- if e.message.include? "is not running"
4867
- return "port is taken"
4868
- end
4869
- puts "error"
4870
- if container
4871
- container.kill()
4872
- end
4873
- return false
4874
- end
4875
- end
4876
-
4877
- desc '', '', :hide => true
4878
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4879
-
4880
- def config_flask_remote_gpu(image_name, port = 80)
4881
- local_images = Docker::Image.all
4882
-
4883
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4884
- if docker_image_local.empty?
4885
- say "no image"
4886
- exit(1)
4887
- end
4888
-
4889
- begin
4890
- login_content = options["login"]
4891
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4892
- container_id = container_id.gsub("\n", "")
4893
- container = Docker::Container.get(container_id)
4894
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4897
- container.exec(command, tty: true)
4898
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4899
- container.exec(command, tty: true)
4900
- say "#{container.id}:#{port}"
4901
- rescue => e
4902
- puts e
4903
- if e.message.include? "is not running"
4904
- return "port is taken"
4905
- end
4906
- puts "error"
4907
- if container
4908
- container.kill()
4909
- end
4910
- return false
4911
- end
4912
- end
4913
4562
 
4914
4563
  desc '', '', :hide => true
4915
4564
 
@@ -4935,39 +4584,10 @@ module Cnvrg
4935
4584
 
4936
4585
  end
4937
4586
 
4938
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4939
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4940
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4941
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4942
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4943
- def upload_image(image_name,image_path)
4944
- begin
4945
- verify_logged_in(false)
4946
- log_start(__method__, args, options)
4947
-
4948
- @image = Cnvrg::Images.new()
4949
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4950
- workdir = options[:workdir]
4951
- description = options[:description]
4952
- user = options[:user]
4953
- is_gpu = options[:gpu]
4954
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4955
- if res["status"] == 200
4956
- image_slug = res["id"]
4957
- owner = CLI.get_owner
4958
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4959
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4960
-
4961
-
4962
- else
4963
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4964
-
4965
- end
4966
- rescue => e
4967
- log_error(e)
4968
- end
4969
-
4970
-
4587
+ desc 'file_exists', description: '', hide: true
4588
+ def file_exists(file)
4589
+ exit(0) if File.exists? file
4590
+ exit(1)
4971
4591
  end
4972
4592
 
4973
4593
 
@@ -5147,29 +4767,40 @@ module Cnvrg
5147
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5148
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5149
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5150
4771
 
5151
4772
  def compare_experiments
5152
4773
  verify_logged_in(true)
5153
4774
  log_start(__method__, args, options)
5154
4775
  exps_map = {}
4776
+ copied_commits = []
5155
4777
 
5156
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5157
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5158
4780
  return false
5159
4781
  end
5160
- slugs = options[:slugs].split(",")
5161
- if slugs.blank?
5162
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5163
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5164
4784
  end
4785
+
5165
4786
  frequency = options[:frequency] || 5
5166
4787
  namespace = options[:namespace]
5167
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5168
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5169
4799
 
4800
+ log_message("compare is running")
5170
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5171
4803
  slugs.each do |exp_slug|
5172
-
5173
4804
  begin
5174
4805
  if exps_map[exp_slug].blank?
5175
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5183,15 +4814,23 @@ module Cnvrg
5183
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5184
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5185
4816
  exps_map[exp_slug] = exp
5186
- elsif exp["machine_activity"].present?
4817
+ else
5187
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5188
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5189
4825
  end
5190
4826
  rescue => e
5191
4827
  Cnvrg::Logger.log_error(e)
5192
4828
  end
5193
4829
  end
5194
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5195
4834
  end
5196
4835
  end
5197
4836
 
@@ -5277,127 +4916,6 @@ module Cnvrg
5277
4916
  end
5278
4917
 
5279
4918
 
5280
- desc 'pull_image', 'downloads and loads an image', :hide => true
5281
-
5282
- def pull_image(image_name)
5283
- begin
5284
- verify_logged_in(false)
5285
- log_start(__method__, args, options)
5286
- owner = Cnvrg::CLI.get_owner()
5287
- image = Cnvrg::Images.image_exist(owner, image_name)
5288
- if !image
5289
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5290
- exit(1)
5291
- end
5292
- path = download_image(image_name, image["slug"])
5293
- if path
5294
- log_message("Building image", Thor::Shell::Color::BLUE)
5295
- Docker.options[:read_timeout] = 216000
5296
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5297
- begin
5298
- if (log = JSON.parse(v)) && log.has_key?("stream")
5299
- next if log["stream"].starts_with? "Step"
5300
- $stdout.puts log["stream"]
5301
- end
5302
- rescue
5303
- end
5304
-
5305
- end
5306
-
5307
- if not image.nil?
5308
- FileUtils.rm_rf(path)
5309
- checks = Helpers.checkmark()
5310
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5311
- return image
5312
- else
5313
-
5314
- log_message("Could not build image", Thor::Shell::Color::RED)
5315
- return false
5316
- end
5317
- else
5318
-
5319
- log_message("Could not download image", Thor::Shell::Color::RED)
5320
- return false
5321
-
5322
-
5323
- end
5324
-
5325
- # else
5326
- # path = download_image(image_name,image["slug"])
5327
- # if path
5328
- # image = Docker::Image.import(path)
5329
- # image.tag('repo' => image_name, 'tag' => 'latest')
5330
- # if not image.nil?
5331
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5332
- # FileUtils.rm(path)
5333
- # checks = Helpers.checkmark()
5334
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5335
- # log_end(0)
5336
- # return image
5337
- # log_end(0)
5338
- # else
5339
- # say "Could not download image", Thor::Shell::Color::RED
5340
- # return false
5341
- # end
5342
- #
5343
- # end
5344
- # end
5345
- rescue => e
5346
-
5347
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5348
- log_error(e)
5349
-
5350
- rescue SignalException
5351
- say "\nAborting"
5352
- exit(1)
5353
- ensure
5354
- if path
5355
- FileUtils.rm_rf(path)
5356
-
5357
- end
5358
- end
5359
-
5360
-
5361
- end
5362
-
5363
- desc 'set_image', 'set image to a porject', :hide => true
5364
-
5365
- def set_image(docker_image)
5366
- verify_logged_in(true)
5367
- log_start(__method__, args, options)
5368
- working_dir = is_cnvrg_dir
5369
- project = Project.new(working_dir)
5370
-
5371
- local_images = Docker::Image.all
5372
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5373
- if docker_image_local.size == 0
5374
-
5375
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5376
- image = pull(docker_image)
5377
- if image
5378
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5379
- @image = Images.new(working_dir, docker_image)
5380
- else
5381
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5382
- exit(1)
5383
- end
5384
- else
5385
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5386
- exit(1)
5387
-
5388
- end
5389
- elsif docker_image_local.size == 1
5390
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5391
- @image = Images.new(working_dir, docker_image_local[0])
5392
- elsif docker_image_local.size > 1
5393
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5394
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5395
- image_name = image_name.strip
5396
- @image = Images.new(working_dir, image_name)
5397
- end
5398
- @image.update_image_activity(project.last_local_commit, nil)
5399
- end
5400
-
5401
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5402
4920
  def check_pod_restart
5403
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5672,7 +5190,7 @@ module Cnvrg
5672
5190
 
5673
5191
  if dirs.size == 0
5674
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5675
-
5193
+ puts Thread.current.backtrace
5676
5194
  exit(1)
5677
5195
  end
5678
5196
  return dirs.join("/")
@@ -5775,7 +5293,7 @@ module Cnvrg
5775
5293
  is_cnvrg = is_cnvrg_dir
5776
5294
  if !is_cnvrg
5777
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5778
- exit(0)
5296
+ exit(1)
5779
5297
  end
5780
5298
 
5781
5299
  end
@@ -5921,21 +5439,6 @@ module Cnvrg
5921
5439
 
5922
5440
  end
5923
5441
 
5924
- def container_changes(dir)
5925
- container_id = is_project_with_docker(dir)
5926
- if not container_id
5927
- return false
5928
- end
5929
- container = Docker::Container.get(container_id)
5930
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5931
- pip = container.exec(command, tty: true)[0]
5932
- command = ["/bin/bash", "-lc", "dpkg -l"]
5933
- dpkg = container.exec(command, tty: true)[0]
5934
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5935
- history = container.exec(command, tty: true)[0]
5936
- diff = [pip, dpkg, history]
5937
- return diff
5938
- end
5939
5442
 
5940
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5941
5444
  Timeout::timeout(seconds) do
@@ -6118,13 +5621,17 @@ module Cnvrg
6118
5621
 
6119
5622
  end
6120
5623
 
6121
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6122
5625
  if !Helpers.ubuntu?
6123
5626
  return 0.0
6124
5627
  end
6125
5628
  stats = [[],[]]
6126
5629
  begin
6127
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6128
5635
 
6129
5636
  if !gpu_stats.nil?
6130
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]