cnvrg 1.6.32 → 1.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ccd271bedf17f196897bdc28e238698676613ae09f391a9b412e3135703e3f7b
4
- data.tar.gz: 3f61217606f6f9c50e0e27dee3c9f3f4d00d1eec7bd48622d4c62b4de5562f5f
3
+ metadata.gz: e708ef034df38ed0b4f5c1ac4bb02fa79a26c93b188f571256f75dbc9d2eaaa6
4
+ data.tar.gz: 6badf54b65660776e63c02c7d3c5dbbab83d0e1e83f6e877b48d77fad5ba3036
5
5
  SHA512:
6
- metadata.gz: afd4299de4463f503d3e45528974ad451b941331af03cb067c6a801fdb25d49197ae5673505f8bd30f03c258026b63324b952d80270925040cf4cf344409a4ee
7
- data.tar.gz: 6d1716507fdf9abbc1199a5815f653242bfc0f43bac340b018696eed869e22f6be89cf0ef1d27e1a44516541d69e2e1332b3eb5a6aba25f098504d97a0eb03f4
6
+ metadata.gz: 21d89ec4fb99c4102bc1e8e0e50df516339a1c9e9660ee8f0dd8acf3ae30bd27067f5ea4fe979de3b737bd6f748ced98f023100487a4226b7f21eed17975142c
7
+ data.tar.gz: 91fb2d10994c11e9b28ef3bbc128f847ac2efd641892c29ec1ec2b16d4b125266e85a6166153b66ab9e9e1c475190f6eca771e42d739a02c1136dbe8cb6c3abb
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_runtime_dependency 'open4', '~> 1.3', '>= 1.3.4'
32
32
  spec.add_runtime_dependency 'highline', '~> 1.7', '>= 1.7.8'
33
33
  spec.add_runtime_dependency 'thor', '~> 0.19.0','>=0.19.1'
34
- spec.add_runtime_dependency 'aws-sdk', '~> 2.11.417'
34
+ spec.add_runtime_dependency 'aws-sdk', '~> 3.0'
35
35
  spec.add_runtime_dependency 'signet', '~> 0.11.0'
36
36
  spec.add_runtime_dependency 'google-cloud-env', '~> 1.2.1'
37
37
  spec.add_runtime_dependency 'google-cloud-core', '~> 1.3.2'
@@ -40,11 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'urlcrypt', '~> 0.1.1'
41
41
  spec.add_runtime_dependency 'parallel', '~> 1.12.0'
42
42
  spec.add_runtime_dependency 'azure-storage-blob', '~> 1.1.0'
43
-
44
43
  spec.add_runtime_dependency 'logstash-logger', '~> 0.22.1'
45
- spec.add_runtime_dependency 'docker-api', '~> 1.33'
46
44
  spec.add_runtime_dependency 'activesupport', '~> 5.2.0'
47
45
  spec.add_runtime_dependency 'ruby-progressbar'
48
- spec.add_runtime_dependency 'net-ssh'
49
46
  spec.add_runtime_dependency 'down'
50
47
  end
@@ -1,7 +1,5 @@
1
1
  require 'fileutils'
2
2
  require 'cnvrg/files'
3
- require 'docker'
4
- require 'net/ssh'
5
3
  require 'mimemagic'
6
4
 
7
5
 
@@ -175,58 +173,6 @@ module Cnvrg
175
173
  response = Cnvrg::API.request("users/#{owner}/images/#{slug}/commit_custom_image", 'POST', {image_logs:logs})
176
174
  return response
177
175
  end
178
- def self.ssh_to_machine(resp)
179
-
180
- sts_path = resp["result"]["sts_path"]
181
-
182
- uri = URI.parse(sts_path)
183
-
184
- http_object = Net::HTTP.new(uri.host, uri.port)
185
- http_object.use_ssl = true if uri.scheme == 'https'
186
- request = Net::HTTP::Get.new(sts_path)
187
-
188
- body = ""
189
- http_object.start do |http|
190
- response = http.request request
191
- body = response.read_body
192
- end
193
-
194
- URLcrypt::key = [body].pack('H*')
195
-
196
- ip = URLcrypt.decrypt(resp["result"]["machine_i"])
197
-
198
- user = URLcrypt.decrypt(resp["result"]["machine_u"])
199
- key = URLcrypt.decrypt(resp["result"]["machine_k"])
200
- tempssh = Tempfile.new "sshkey"
201
- tempssh.write open(key).read
202
- tempssh.rewind
203
- key_path = tempssh.path
204
- count = 0
205
- while count < 5
206
-
207
- begin
208
- ssh = Net::SSH.start(ip, user=user, :keys => key_path, :timeout => 10)
209
- if !ssh.nil?
210
- return ssh
211
- else
212
- count+=1
213
- sleep(2)
214
-
215
- end
216
- rescue
217
- count+=1
218
- sleep(2)
219
-
220
-
221
- end
222
- end
223
- if tempssh
224
- tempssh.close
225
- tempssh.unlink
226
- end
227
- return false
228
- end
229
-
230
176
 
231
177
 
232
178
  def create_custom_image(new_image_name,working_dir,stored_commands)
@@ -270,100 +216,6 @@ module Cnvrg
270
216
  File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
271
217
  end
272
218
 
273
- def get_container(stop=false)
274
- begin
275
- container_id=is_container_exist()
276
-
277
- if !container_id
278
- return create_container()
279
- else
280
- container = Docker::Container.get(container_id)
281
- status = container.json["State"]["Status"]
282
-
283
- if status == "running"
284
- return container
285
- else
286
- if stop
287
- return false
288
- end
289
- res = container.start()
290
- if res.info["State"]["Status"].eql? "exited" and res.info["State"]["Error"].include? "port is already allocated"
291
- return create_container()
292
- end
293
- return container
294
- end
295
- end
296
- rescue => e
297
- if e.message.include? "No such container"
298
-
299
- return create_container()
300
- else
301
- return false
302
- end
303
- end
304
-
305
- end
306
-
307
- def create_container(port=7654, is_remote=false)
308
- begin
309
- image_settings = {
310
- 'Image' => "#{@image_name}:latest",
311
- 'User' => 'ds',
312
- 'Cmd' => '/usr/local/cnvrg/run_ipython.sh',
313
- 'WorkingDir' => '/home/ds/notebooks',
314
- 'ExposedPorts' => {
315
- '8888/tcp' => {},
316
- },
317
- 'HostConfig' => {
318
- 'Binds' => ["#{@working_dir}:/home/ds/notebooks"],
319
- 'PortBindings' => {
320
- '8888/tcp' => [
321
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
322
- ],
323
- },
324
- },
325
- }
326
- container = Docker::Container.create(image_settings)
327
- container.start()
328
- netrc = File.open(File.expand_path('~')+"/.netrc", "rb")
329
- netrc_content = netrc.read
330
- container.store_file("/home/ds/.netrc", netrc_content)
331
- command = ["/bin/bash", "-lc", "sudo chmod 600 /home/ds/.netrc"]
332
- p = container.exec(command, tty: true)
333
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.netrc"]
334
- p = container.exec(command, tty: true)
335
- config = File.open(File.expand_path('~')+"/.cnvrg/config.yml", "rb")
336
- config_content = config.read
337
- container.store_file("/home/ds/.cnvrg/config.yml", config_content)
338
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg"]
339
- container.exec(command, tty: true)
340
- # Libraries instlled
341
- save_installed_libraries(container)
342
- config = {project_name: @project_name,
343
- project_slug: @project_slug,
344
- owner: @owner,
345
- docker: true, image_base: @image_name, image_tag: @image_tag, container: container.id, port: port, image_slug: @image_slug}
346
-
347
- File.open(@working_dir+"/.cnvrg/config.yml", "w+") { |f| f.write config.to_yaml }
348
-
349
-
350
- return container
351
-
352
-
353
- rescue => e
354
- if e.message.include? "is not running"
355
- return create_container(port-1)
356
- end
357
- return false
358
- rescue SignalException
359
-
360
- say "\nAborting", Thor::Shell::Color::RED
361
- exit(1)
362
- end
363
-
364
-
365
- end
366
-
367
219
  def save_installed_libraries(container)
368
220
  begin
369
221
  command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
@@ -77,20 +77,22 @@ module Cnvrg
77
77
  if response.to_hash[:status] == 404
78
78
  return false
79
79
  end
80
- if parse_request == true
80
+ if parse_request
81
81
  JSON.parse(response.body)
82
82
  else
83
83
  response
84
84
  end
85
- when 'POST', 'PUT'
85
+ when 'POST', 'PUT'
86
86
  conn.options.timeout = 4200
87
- conn.options.open_timeout=180
87
+ conn.options.open_timeout = 180
88
+ conn.headers['Content-Type'] = "application/json"
88
89
  retries = 0
89
90
  success = false
91
+ data = data || {}
90
92
  while !success and retries < 20
91
93
  begin
92
- response = conn.post "#{resource}", data if method.eql? 'POST'
93
- response = conn.put "#{resource}", data if method.eql? 'PUT'
94
+ response = conn.post "#{resource}", data.to_json if method.eql? 'POST'
95
+ response = conn.put "#{resource}", data.to_json if method.eql? 'PUT'
94
96
  success = true
95
97
  Cnvrg::API.parse_version(response)
96
98
 
@@ -113,7 +115,7 @@ module Cnvrg
113
115
  end
114
116
  when 'POST_JSON'
115
117
  conn.options.timeout = 4200
116
- conn.options.open_timeout =4200
118
+ conn.options.open_timeout = 4200
117
119
  conn.headers['Content-Type'] = "application/json"
118
120
  new_data = JSON.dump(data)
119
121
 
@@ -124,8 +126,6 @@ module Cnvrg
124
126
  begin
125
127
  response = conn.post "#{resource}", new_data
126
128
  success = true
127
- Cnvrg::API.parse_version(response)
128
-
129
129
  rescue => e
130
130
  Cnvrg::Logger.log_error(e)
131
131
  sleep(5)
@@ -0,0 +1,14 @@
1
+ module Cnvrg
2
+ class API_V2 < API
3
+ ENDPOINT_VERSION = 'v2'
4
+
5
+ def self.endpoint_uri
6
+ api = get_api()
7
+ return "#{api}/#{Cnvrg::API_V2::ENDPOINT_VERSION}"
8
+ end
9
+
10
+ def self.is_response_success(response)
11
+ raise Exception.new("Bad status in response #{response.status}") if response.status != 200
12
+ end
13
+ end
14
+ end
@@ -12,7 +12,6 @@ require 'digest' # sha1up
12
12
  require "highline/import"
13
13
  require 'socket'
14
14
  require 'thor'
15
- require 'docker'
16
15
  require 'socket'
17
16
  require 'timeout'
18
17
  require 'fileutils'
@@ -28,13 +27,11 @@ require 'cnvrg/auth'
28
27
  require 'cnvrg/project'
29
28
  require 'cnvrg/files'
30
29
  require 'cnvrg/experiment'
31
- require 'cnvrg/Images'
32
30
  require 'cnvrg/image'
33
31
  require 'cnvrg/dataset'
34
32
  require 'cnvrg/datafiles'
35
33
  require 'cnvrg/data'
36
34
  require 'cnvrg/storage'
37
- require 'cnvrg/ssh'
38
35
  require 'cnvrg/result'
39
36
  require 'cnvrg/logger'
40
37
  require 'cnvrg/org_helpers'
@@ -49,6 +46,9 @@ require 'cnvrg/downloader/clients/s3_client'
49
46
  require 'cnvrg/downloader/clients/gcp_client'
50
47
  require 'cnvrg/downloader/clients/azure_client'
51
48
  require 'cnvrg/job_cli'
49
+ require 'cnvrg/job_ssh'
50
+ require 'cnvrg/connect_job_ssh'
51
+ require 'cnvrg/api_v2'
52
52
 
53
53
  class Thor
54
54
  module Base
@@ -175,6 +175,9 @@ module Cnvrg
175
175
  desc "job", "manage running jobs", :hide => false
176
176
  subcommand "job", JobCli
177
177
 
178
+ desc "ssh", "ssh into running jobs", :hide => false
179
+ subcommand "ssh", JobSsh
180
+
178
181
  desc "image [COMMAND]", "build existing images", :hide => true
179
182
  subcommand "image", ImageCli
180
183
 
@@ -819,9 +822,9 @@ module Cnvrg
819
822
  end
820
823
 
821
824
  desc 'data verify', 'Verify datasets', :hide => true
822
- method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => 15
825
+ method_option :timeout, :type => :numeric, :aliases => ["-t", "--timeout"], :desc => "Time to wait before returning final answer", :default => nil
823
826
 
824
- def verify_datasets(dataset_titles, timeout=0)
827
+ def verify_datasets(dataset_titles, timeout=nil)
825
828
  begin
826
829
  verify_logged_in(false)
827
830
  log_start(__method__, args, options)
@@ -830,21 +833,31 @@ module Cnvrg
830
833
  log_message("All datasets are verified", Thor::Shell::Color::BLUE) if verified
831
834
  log_message("Failed to verify datasets", Thor::Shell::Color::RED) if !verified
832
835
  exit(1) if !verified
833
-
834
836
  rescue SignalException
835
837
  say "\nAborting", Thor::Shell::Color::RED
836
838
  exit(1)
837
839
  end
838
840
  end
839
841
 
842
+ desc 'data scan', 'Lookup datasets', :hide => true
843
+ def scan_datasets()
844
+ begin
845
+ verify_logged_in(false)
846
+ log_start(__method__, args, options)
847
+ log_message("Scanning datasets", Thor::Shell::Color::BLUE)
848
+ datasets = Dataset.scan_datasets()
849
+ puts(datasets.to_json)
850
+ end
851
+ end
852
+
840
853
  desc 'data clone', 'Clone dataset', :hide => true
841
854
  method_option :commit, :type => :string, :aliases => ["-c", "--commit"], :default => ""
842
855
  method_option :only_tree, :type => :boolean, :aliases => ["-t", "--tree"], :default => false
843
856
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => nil
844
857
  method_option :read, :type => :boolean, :aliases => ["-r", "--read"], :default => false
845
858
  method_option :remote, :type => :boolean, :aliases => ["-h", "--remote"], :default => false
846
-
847
- def clone_data(dataset_url,only_tree=false,commit=nil,query=nil,read=false,remote=false, relative: false)
859
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
860
+ def clone_data(dataset_url, only_tree=false, commit=nil, query=nil, read=false, remote=false, flatten: false, relative: false, soft: false)
848
861
  begin
849
862
  verify_logged_in(false)
850
863
  log_start(__method__, args, options)
@@ -853,10 +866,10 @@ module Cnvrg
853
866
  read = options["read"] || read || false
854
867
  remote = options["remote"] || remote || false
855
868
  query = options['query'].presence || query.presence
869
+ soft = options['soft'] || soft
856
870
  if query.present?
857
- return clone_data_query(dataset_url, query)
871
+ return clone_data_query(dataset_url, query, flatten, soft: soft)
858
872
  end
859
- @executer = Cnvrg::Helpers::Executer.get_executer
860
873
 
861
874
  url_parts = dataset_url.split("/")
862
875
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
@@ -868,6 +881,8 @@ module Cnvrg
868
881
  dataset_name = response["result"]["name"]
869
882
  dataset_home = Dir.pwd+"/"+dataset_name
870
883
 
884
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name, commit: response["result"]["commit"]) if soft
885
+
871
886
  check = Helpers.checkmark
872
887
  if @dataset.init_home(remote:remote)
873
888
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -875,14 +890,12 @@ module Cnvrg
875
890
  log_message("Downloading files", Thor::Shell::Color::BLUE)
876
891
  if @dataset.softlinked?
877
892
  @files.cp_ds(relative: relative)
878
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
879
893
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
880
894
  @dataset.write_success
881
895
  return
882
896
  end
883
897
 
884
898
  if only_tree
885
-
886
899
  success = Dataset.clone_tree(commit: commit, dataset_home: dataset_home)
887
900
  return if success
888
901
  end
@@ -900,7 +913,7 @@ module Cnvrg
900
913
 
901
914
  while files['keys'].length > 0
902
915
  Cnvrg::Logger.log_info("download multiple files, #{downloaded_files.size} files downloaded")
903
- @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read)
916
+ @files.download_multiple_files_s3(files, @dataset.local_path, progressbar: progressbar, read_only: read, flatten: flatten)
904
917
 
905
918
  downloaded_files += files['keys'].length
906
919
  files = @files.get_clone_chunk(commit: commit, latest_id: files['latest'])
@@ -908,7 +921,6 @@ module Cnvrg
908
921
  progressbar.finish
909
922
  if downloaded_files == files_count
910
923
  Dataset.verify_cnvrgignore_exist(dataset_name, false)
911
- @executer.set_dataset_status(dataset: @dataset.slug, status: "cloned") if @executer
912
924
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
913
925
  @dataset.write_success
914
926
  ### if read, dont generate idx (but create idx.yml) if not read, generate idx.
@@ -930,12 +942,14 @@ module Cnvrg
930
942
 
931
943
  desc 'data clone_query', 'Clone dataset _query', :hide => true
932
944
  method_option :query, :type => :string, :aliases => ["-q", "--query"], :default => ""
933
- def clone_data_query(dataset_url,query=nil)
945
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
946
+ def clone_data_query(dataset_url, query=nil, flatten=false, soft: false)
934
947
  begin
935
948
  verify_logged_in(false)
936
- @executer = Cnvrg::Helpers::Executer.get_executer
949
+ #@executer = Cnvrg::Helpers::Executer.get_executer
937
950
  log_start(__method__, args, options)
938
951
  query = options["query"] || query
952
+ soft = options["soft"] || soft
939
953
  if !query.present?
940
954
  log_message("Argument missing : query", Thor::Shell::Color::RED)
941
955
  exit(1)
@@ -945,13 +959,14 @@ module Cnvrg
945
959
  project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
946
960
  slug = url_parts[project_index + 1]
947
961
  owner = url_parts[project_index - 1]
948
-
949
962
  response = Cnvrg::API.request("users/#{owner}/datasets/#{slug}/search/#{query}", 'GET')
950
963
  Cnvrg::CLI.is_response_success(response,true)
951
964
  dataset_name = response["results"]["name"]
952
965
  dataset_slug = response["results"]["slug"]
953
- dataset_home = File.join(Dir.pwd, dataset_name)
966
+ dataset_home = Dir.pwd+"/"+dataset_slug
967
+ Dataset.stop_if_dataset_present(dataset_home, dataset_name) if soft
954
968
 
969
+ # dataset_home = Dir.pwd
955
970
  if Dataset.blank_clone(owner, dataset_name, dataset_slug)
956
971
  dataset = Dataset.new(dataset_home)
957
972
  log_message("Cloning #{dataset_name}", Thor::Shell::Color::BLUE)
@@ -966,6 +981,7 @@ module Cnvrg
966
981
  },
967
982
  in_threads: ParallelThreads
968
983
  }
984
+
969
985
  begin
970
986
  log_message("Downloading files", Thor::Shell::Color::BLUE)
971
987
  Parallel.map((response["results"]["query_files"]), parallel_options) do |f|
@@ -974,6 +990,7 @@ module Cnvrg
974
990
  file_name = relative_path_dir.pop()
975
991
  relative_path_dir = relative_path_dir.join("/")
976
992
  abs_path = dataset_home + "/" + relative_path_dir
993
+ abs_path = dataset_home if flatten
977
994
  begin
978
995
  FileUtils.mkdir_p(abs_path) unless File.exist? (abs_path + "/" + file_name)
979
996
  rescue
@@ -981,14 +998,14 @@ module Cnvrg
981
998
  exit(1)
982
999
  end
983
1000
  begin
984
- File.write "#{abs_path}/#{file_name}", open(f["s3_url"]).read unless File.exist? (abs_path + "/" + file_name)
985
- rescue
1001
+ File.write "#{abs_path}/#{file_name}", open(f["url"]).read unless File.exist? (abs_path + "/" + file_name)
1002
+ rescue => e
986
1003
  log_message("Could not download file: #{f["fullpath"]}", Thor::Shell::Color::RED)
987
1004
  exit(1)
988
1005
  end
989
1006
 
990
1007
  end
991
- @executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
1008
+ #@executer.set_dataset_status(dataset: dataset.slug, status: "cloned") if @executer.present?
992
1009
  rescue Interrupt
993
1010
  log_message("Couldn't download", Thor::Shell::Color::RED)
994
1011
  exit(1)
@@ -998,7 +1015,7 @@ module Cnvrg
998
1015
  check = Helpers.checkmark
999
1016
  log_message("#{check} Clone finished successfully", Thor::Shell::Color::GREEN)
1000
1017
  dataset.write_success(in_folder=true)
1001
- rescue
1018
+ rescue => e
1002
1019
  exit(1)
1003
1020
  end
1004
1021
  end
@@ -1008,32 +1025,6 @@ module Cnvrg
1008
1025
  end
1009
1026
  end
1010
1027
 
1011
- desc 'init_data_container', 'Init dataset directory', :hide => true
1012
- method_option :login_content, :type => :string, :aliases => ["-l"], :default => ""
1013
-
1014
- def init_data_container(container)
1015
- begin
1016
- login_content = options["login_content"]
1017
-
1018
- container = Docker::Container.get(container)
1019
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
1020
- container.exec(command, tty: true)
1021
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
1022
- container.exec(command, tty: true)
1023
- command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
1024
- container.exec(command, tty: true)
1025
- command = ["/bin/bash", "-lc", "sudo chown -R ds /home/ds/.cnvrg /home/ds/.netrc"]
1026
- container.exec(command, tty: true)
1027
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
1028
- container.exec(command, tty: true)
1029
-
1030
- rescue SignalException
1031
-
1032
- say "\nAborting", Thor::Shell::Color::RED
1033
- exit(1)
1034
- end
1035
- end
1036
-
1037
1028
  desc 'data_snap', 'Init dataset directory', :hide => true
1038
1029
  method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
1039
1030
 
@@ -1184,17 +1175,29 @@ module Cnvrg
1184
1175
  end
1185
1176
 
1186
1177
  desc '', '', :hide => true
1187
- def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000)
1178
+ def get_owner_slug(url_or_slug)
1179
+ if url_or_slug =~ URI::regexp
1180
+ # Find owner and slug in url
1181
+ url_parts = url_or_slug.split("/")
1182
+ project_index = Cnvrg::Helpers.look_for_in_path(url_or_slug, "datasets")
1183
+ slug = url_parts[project_index + 1]
1184
+ owner = url_parts[project_index - 1]
1185
+ else
1186
+ # Find owner in config file
1187
+ owner = CLI.get_owner
1188
+ slug = url_or_slug
1189
+ end
1190
+ return owner, slug
1191
+ end
1192
+
1193
+ desc '', '', :hide => true
1194
+ def data_put(dataset_url, files: [], dir: '', commit: '', chunk_size: 1000, force: false, threads: 15, message: nil)
1188
1195
  begin
1189
1196
  verify_logged_in(false)
1190
1197
  log_start(__method__, args, options)
1191
1198
 
1192
- #find owner and slug in url
1193
- url_parts = dataset_url.split("/")
1194
- project_index = Cnvrg::Helpers.look_for_in_path(dataset_url, "datasets")
1195
- slug = url_parts[project_index + 1]
1196
- owner = url_parts[project_index - 1]
1197
- @dataset = Dataset.new(dataset_url: dataset_url)
1199
+ owner, slug = get_owner_slug(dataset_url)
1200
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1198
1201
  @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1199
1202
  @files = @datafiles.verify_files_exists(files)
1200
1203
 
@@ -1204,7 +1207,7 @@ module Cnvrg
1204
1207
  log_message("Uploading #{@files.size} files", Thor::Shell::Color::GREEN)
1205
1208
  number_of_chunks = (@files.size.to_f / chunk_size).ceil
1206
1209
  if commit.blank?
1207
- response = @datafiles.start_commit(false, true, chunks: number_of_chunks)
1210
+ response = @datafiles.start_commit(false, true, chunks: number_of_chunks, message: message )
1208
1211
  unless response #means we failed in the start commit.
1209
1212
  raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1210
1213
  end
@@ -1218,28 +1221,33 @@ module Cnvrg
1218
1221
  else
1219
1222
  @commit = commit
1220
1223
  end
1221
- #dir shouldnt have starting or ending slash.
1224
+
1225
+ # dir shouldnt have starting or ending slash.
1222
1226
  dir = dir[0..-2] if dir.end_with? '/'
1223
1227
  dir = dir[1..-1] if dir.start_with? '/'
1224
1228
 
1225
- @files.each_slice(chunk_size).each do |list_files|
1226
- temp_tree = @dataset.generate_chunked_idx(list_files, prefix: dir)
1227
- #will throw a signal exception if something goes wrong.
1228
- @datafiles.upload_multiple_files(@commit, temp_tree, force: true, prefix: dir, total: @files.size)
1229
+ @datafiles.upload_multiple_files_optimized(
1230
+ @files,
1231
+ @commit,
1232
+ force: force,
1233
+ chunk_size: chunk_size,
1234
+ prefix: dir,
1235
+ threads: threads
1236
+ )
1237
+
1238
+ # This is for backwards compatibility only and should be removed in future versions:
1239
+ res = @datafiles.put_commit(@commit)
1240
+ unless res.is_success?
1241
+ raise SignalException.new(1, res.msg)
1229
1242
  end
1230
- if commit.blank?
1231
- res = @datafiles.put_commit(@commit)
1232
- unless res.is_success?
1233
- raise SignalException.new(1, res.msg)
1234
- end
1235
- else
1236
- res = @datafiles.end_commit(@commit,false, success: true )
1237
- msg = res['result']
1238
- response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1239
- unless response.is_success?
1240
- raise SignalException.new(1, res.msg)
1241
- end
1243
+
1244
+ res = @datafiles.end_commit(@commit,false, success: true, commit_type: "put")
1245
+ msg = res['result']
1246
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1247
+ unless response.is_success?
1248
+ raise SignalException.new(1, res.msg)
1242
1249
  end
1250
+
1243
1251
  log_message("Uploading files finished Successfully", Thor::Shell::Color::GREEN)
1244
1252
  rescue SignalException => e
1245
1253
  log_message(e.message, Thor::Shell::Color::RED)
@@ -1248,7 +1256,49 @@ module Cnvrg
1248
1256
  end
1249
1257
 
1250
1258
 
1259
+ desc '', '', :hide => true
1260
+ def data_rm(dataset_url, regex_list: [], commit: '', message: nil)
1261
+ begin
1262
+ verify_logged_in(false)
1263
+ log_start(__method__, args, options)
1264
+
1265
+ owner, slug = get_owner_slug(dataset_url)
1266
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1267
+ @datafiles = Cnvrg::Datafiles.new(owner, slug, dataset: @dataset)
1268
+
1269
+ # Init a new commit
1270
+ response = @datafiles.start_commit(false, true, chunks: 1, message: message )
1271
+ unless response #means we failed in the start commit.
1272
+ raise SignalException.new(1, "Cant put files into dataset, check the dataset id")
1273
+ end
1274
+ @commit = response['result']['commit_sha1']
1275
+ files_to_delete, folders_to_delete, job_id = @datafiles.delete_multiple_files(@commit, regex_list)
1276
+ log_message("Deleting #{files_to_delete} files and #{folders_to_delete} folders", Thor::Shell::Color::GREEN)
1251
1277
 
1278
+ total_files = files_to_delete + folders_to_delete
1279
+ current_progress = 0
1280
+ progressbar = @datafiles.create_progressbar("Delete Progress", total_files)
1281
+ chunk_size = 1000
1282
+ offset = 0
1283
+ while current_progress < total_files
1284
+ current_progress = @datafiles.delete_file_chunk(@commit, regex_list, chunk_size, offset)
1285
+ progressbar.progress = current_progress
1286
+ offset += chunk_size
1287
+ end
1288
+
1289
+ res = @datafiles.end_commit(@commit,false, success: true)
1290
+ msg = res['result']
1291
+ response = Cnvrg::Result.new(Cnvrg::CLI.is_response_success(res, true), msg)
1292
+ unless response.is_success?
1293
+ raise SignalException.new(1, res.msg)
1294
+ end
1295
+
1296
+ log_message("Deleting files finished Successfully", Thor::Shell::Color::GREEN)
1297
+ rescue SignalException => e
1298
+ log_message(e.message, Thor::Shell::Color::RED)
1299
+ return false
1300
+ end
1301
+ end
1252
1302
 
1253
1303
  desc 'upload_data', 'Upload data files', :hide => true
1254
1304
  method_option :ignore, :type => :array, :aliases => ["-i", "--i"], :desc => "ignore following files"
@@ -1699,18 +1749,22 @@ module Cnvrg
1699
1749
  end
1700
1750
 
1701
1751
  desc 'data commits', 'List all commits for a specific dataset', :hide => true
1702
-
1703
- def list_dataset_commits()
1704
- verify_logged_in(true)
1752
+ def list_dataset_commits(dataset_url, commit_sha1: nil)
1753
+ verify_logged_in(false)
1705
1754
  log_start(__method__, args, options)
1706
1755
 
1707
- dataset_dir = is_cnvrg_dir(Dir.pwd)
1708
- @dataset = Dataset.new(dataset_dir)
1709
- result = @dataset.list_commits()
1756
+ if dataset_url == "."
1757
+ dataset_dir = is_cnvrg_dir(Dir.pwd)
1758
+ @dataset = Dataset.new(dataset_dir)
1759
+ else
1760
+ owner, slug = get_owner_slug(dataset_url)
1761
+ @dataset = Dataset.new(dataset_info: {:owner => owner, :slug => slug})
1762
+ end
1763
+
1764
+ result = @dataset.list_commits(commit_sha1:commit_sha1)
1710
1765
  list = result["result"]["list"]
1711
1766
 
1712
1767
  print_table(list)
1713
-
1714
1768
  end
1715
1769
 
1716
1770
  desc 'commits', 'List all commits for a specific Project'
@@ -1741,17 +1795,17 @@ module Cnvrg
1741
1795
 
1742
1796
 
1743
1797
  desc 'git_clone', 'Clone project'
1798
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1744
1799
  def git_clone(slug, owner)
1745
1800
  verify_logged_in(false)
1746
1801
  log_start(__method__, args, options)
1747
-
1802
+ project_home = Dir.pwd
1803
+ soft = options["soft"] || false
1804
+ Project.stop_if_project_present(project_home, slug) if soft
1748
1805
  clone_resp = Project.clone_dir_remote(slug, owner, slug,true)
1749
- idx_status = Project.new(get_project_home).generate_idx
1806
+ exit 1 if not clone_resp
1807
+ idx_status = Project.new(get_project_home).generate_idx(files:[])
1750
1808
  FileUtils.mkdir_p File.join(get_project_home, ENV['CNVRG_OUTPUT_DIR']) if ENV['CNVRG_OUTPUT_DIR'].present?
1751
- @executer = Cnvrg::Helpers::Executer.get_executer
1752
- if @executer.present?
1753
- @executer.update_git_commit
1754
- end
1755
1809
  end
1756
1810
 
1757
1811
 
@@ -1791,7 +1845,7 @@ module Cnvrg
1791
1845
  desc 'clone PROJECT_URL', 'Clone project'
1792
1846
  method_option :remote, :type => :boolean, :aliases => ["-r", "--r"], :default => false
1793
1847
  method_option :commit, :type => :string, :aliases => ["-c", "--c"], :default => nil
1794
-
1848
+ method_option :soft, :type => :boolean, :aliases => ["-s", "--soft"], :default => false, :hide => true
1795
1849
  def clone(project_url)
1796
1850
  begin
1797
1851
  verify_logged_in(false)
@@ -1801,6 +1855,8 @@ module Cnvrg
1801
1855
  slug = url_parts[project_index + 1]
1802
1856
  owner = url_parts[project_index - 1]
1803
1857
  remote = options["remote"] || false
1858
+ soft = options["soft"] || false
1859
+
1804
1860
 
1805
1861
  response = Cnvrg::API.request("users/#{owner}/projects/#{slug}/get_project", 'GET')
1806
1862
  Cnvrg::CLI.is_response_success(response)
@@ -1814,6 +1870,8 @@ module Cnvrg
1814
1870
  clone_resp = false
1815
1871
  project_home = Dir.pwd
1816
1872
 
1873
+ Project.stop_if_project_present(project_home, project_name) if soft
1874
+
1817
1875
  if remote and !git
1818
1876
  clone_resp = Project.clone_dir_remote(slug, owner, project_name,git)
1819
1877
  elsif git
@@ -1837,8 +1895,6 @@ module Cnvrg
1837
1895
  end
1838
1896
  clone_resp = Project.clone_dir(slug, owner, project_name,git)
1839
1897
  project_home = Dir.pwd + "/" + project_name
1840
-
1841
-
1842
1898
  end
1843
1899
 
1844
1900
  if clone_resp
@@ -1956,8 +2012,6 @@ module Cnvrg
1956
2012
  method_option :parallel, :type => :numeric, :aliases => ["-p", "--parallel"], :desc => "uparallel upload at the same time", :default => 15
1957
2013
  method_option :init, :type => :boolean, :aliases => ["--initial"], :desc => "initial sync", :default => false
1958
2014
  method_option :message, :type => :string, :aliases => ["--message"], :desc => "create commit with message", :default => nil
1959
-
1960
-
1961
2015
  def sync_data_new(new_branch, force, verbose, commit, all_files, tags ,parallel, chunk_size, init, message)
1962
2016
  verify_logged_in(true)
1963
2017
  log_start(__method__, args, options)
@@ -1966,11 +2020,13 @@ module Cnvrg
1966
2020
  # w(verbose=false, new_branch=false,sync=false, commit=nil,all_files=true)
1967
2021
  total_deleted, total_downloaded = invoke :download_data_new,[verbose, new_branch, true, commit, all_files], :new_branch=>new_branch, :direct=>false, :force =>force
1968
2022
  end
1969
- # w(new_branch, verbose,sync,force, tags, chunk_size)
2023
+
1970
2024
  invoke :upload_data_new,[new_branch, verbose, true, force, tags, chunk_size, message:message, total_deleted: total_deleted, total_downloaded: total_downloaded],
1971
2025
  :new_branch=>new_branch, :direct=>false, :force =>force, :sync =>true, :tags =>tags, :parallel => parallel, :message => message
1972
2026
 
1973
2027
  end
2028
+
2029
+
1974
2030
  desc 'upload_data_new', 'upload_data_new', :hide => true
1975
2031
  method_option :verbose, :type => :boolean, :aliases => ["-v"], :default => false
1976
2032
  method_option :new_branch, :type => :boolean, :aliases => ["-nb"], :desc => "create new branch of commits"
@@ -2213,15 +2269,27 @@ module Cnvrg
2213
2269
  method_option :return_id, :type => :boolean, :aliases => ["-r", "--return_id"], :default => false
2214
2270
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2215
2271
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2272
+ method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
2216
2273
  method_option :job_slug, :type => :string, :aliases => ["--job"], :default => nil, :hide=>true
2217
2274
  method_option :job_type, :type => :string, :aliases => [ "--job_type"], :default => nil, :hide=>true
2275
+ method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
2276
+ method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2218
2277
 
2219
- def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil)
2278
+ def upload(link = false, sync = false, direct = false, ignore_list = "", in_exp = false, force = false, output_dir = "output", job_type = nil, job_slug = nil, suppress_exceptions = true)
2220
2279
  begin
2221
2280
  # we are passing "force" twice.. doesnt really make sense :\\
2222
2281
  verify_logged_in(true)
2223
2282
  log_start(__method__, args, options)
2224
2283
  @project = Project.new(get_project_home)
2284
+
2285
+ # Enable local/experiment exception logging
2286
+ suppress_exceptions = suppress_exceptions ? suppress_exceptions : options[:suppress_exceptions]
2287
+ if in_exp
2288
+ exp_obj = Experiment.new(@project.owner, @project.slug, job_id: job_slug)
2289
+ else
2290
+ exp_obj = nil
2291
+ end
2292
+
2225
2293
  commit_msg = options["message"]
2226
2294
  if commit_msg.nil? or commit_msg.empty?
2227
2295
  commit_msg = ""
@@ -2237,19 +2305,21 @@ module Cnvrg
2237
2305
  spec_files_to_upload = spec_files_to_upload.split(",")
2238
2306
  end
2239
2307
  if @project.is_git
2308
+ list = []
2240
2309
  git_output_dir = options["output_dir"] || output_dir
2241
2310
  if git_output_dir.present?
2242
2311
  if git_output_dir.ends_with? "/"
2243
2312
  git_output_dir = git_output_dir[0..-2]
2244
2313
  end
2245
2314
  list = @project.generate_output_dir(git_output_dir)
2246
- spec_files_to_upload = list
2247
- if spec_files_to_upload.blank?
2248
- log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2249
- return true
2250
- end
2251
- force = true
2252
2315
  end
2316
+ list += @project.generate_git_diff if options["git_diff"]
2317
+ spec_files_to_upload = list
2318
+ if spec_files_to_upload.blank?
2319
+ log_message("#{check} Project is up to date", Thor::Shell::Color::GREEN, (((options["sync"] or sync) and !direct) ? false : true))
2320
+ return true
2321
+ end
2322
+ force = true
2253
2323
  end
2254
2324
 
2255
2325
  if ignore.nil? or ignore.empty?
@@ -2291,8 +2361,6 @@ module Cnvrg
2291
2361
  end
2292
2362
  update_count = 0
2293
2363
  update_total = result["added"].size + result["updated_on_local"].size + result["deleted"].size
2294
- successful_updates = []
2295
- successful_deletions = []
2296
2364
  if options["verbose"]
2297
2365
  if update_total == 1
2298
2366
  log_message("Updating #{update_total} file", Thor::Shell::Color::BLUE)
@@ -2312,8 +2380,11 @@ module Cnvrg
2312
2380
  end
2313
2381
  job_type = options['job_type'] || job_type
2314
2382
  job_slug = options['job_slug'] || job_slug
2315
- commit_sha1 = @files.start_commit(new_branch, force: force, exp_start_commit: exp_start_commit,
2316
- job_type: job_type, job_slug: job_slug, start_commit: current_commit, message: options["message"])["result"]["commit_sha1"]
2383
+ commit_sha1 = @files.start_commit(
2384
+ new_branch, force: force, exp_start_commit: exp_start_commit,
2385
+ job_type: job_type, job_slug: job_slug, start_commit: current_commit,message: options["message"],
2386
+ debug_mode: options["debug_mode"]
2387
+ )["result"]["commit_sha1"]
2317
2388
  # upload / update
2318
2389
  # delete
2319
2390
  to_upload = result["added"] + result["updated_on_local"]
@@ -2324,32 +2395,30 @@ module Cnvrg
2324
2395
  :starting_at => 0,
2325
2396
  :total => (to_upload.size + deleted.size),
2326
2397
  :autofinish => true)
2327
- @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar)
2328
2398
 
2329
- @files.delete_files_from_server(deleted, commit_sha1)
2399
+ buffered_errors = @files.upload_multiple_files(to_upload, commit_sha1, progress: progressbar, suppress_exceptions: suppress_exceptions)
2400
+ @files.delete_files_from_server(deleted, commit_sha1, suppress_exceptions: suppress_exceptions)
2330
2401
 
2331
2402
  progressbar.finish
2403
+
2404
+ if buffered_errors.is_a?(Hash)
2405
+ buffered_errors.keys.each do |file|
2406
+ to_upload.delete(file)
2407
+ Cnvrg::CLI.log_message(buffered_errors[file], 'red')
2408
+ exp_obj.job_log([buffered_errors[file]]) unless exp_obj.nil?
2409
+ end
2410
+ end
2411
+
2332
2412
  res = @files.end_commit(commit_sha1, force: force, message: commit_msg)
2333
2413
  unless Cnvrg::CLI.is_response_success(res, false)
2334
2414
  raise StandardError.new("Cant end commit")
2335
2415
  end
2416
+
2336
2417
  # save idx
2337
2418
  @project.update_idx_with_files_commits!((to_upload + deleted), res["result"]["commit_time"])
2338
2419
  @project.update_idx_with_commit!(commit_sha1)
2339
2420
  if options["verbose"]
2340
2421
  log_message("#{check} Done", Thor::Shell::Color::BLUE)
2341
- if successful_updates.size > 0
2342
- successful_updates.flatten!
2343
- log_message("Updated:", Thor::Shell::Color::GREEN)
2344
- suc = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2345
- log_message(suc.join("\n"), Thor::Shell::Color::GREEN)
2346
- end
2347
- if successful_deletions.size > 0
2348
- successful_deletions.flatten!
2349
- log_message("Deleted:", Thor::Shell::Color::GREEN)
2350
- del = successful_updates.map {|x| x = Helpers.checkmark() + " " + x}
2351
- log_message(del.join("\n"), Thor::Shell::Color::GREEN)
2352
- end
2353
2422
  log_message("Total of #{update_count} / #{update_total} files.", Thor::Shell::Color::GREEN)
2354
2423
  else
2355
2424
  if return_id
@@ -2374,9 +2443,13 @@ module Cnvrg
2374
2443
  if e.is_a? SignalException
2375
2444
  say "\nAborting", Thor::Shell::Color::BLUE
2376
2445
  say "\nRolling back all changes", Thor::Shell::Color::BLUE
2446
+
2447
+ exp_obj.job_log(["Aborting", "Rolling back all changes"]) unless exp_obj.nil?
2377
2448
  else
2378
2449
  log_message(error_message, Thor::Shell::Color::RED)
2379
2450
  log_error(e)
2451
+
2452
+ exp_obj.job_log([error_message, e]) unless exp_obj.nil?
2380
2453
  end
2381
2454
  @files.rollback_commit(commit_sha1) unless commit_sha1.nil?
2382
2455
  print_res = {
@@ -2894,6 +2967,11 @@ module Cnvrg
2894
2967
  method_option :job_type, :type => :string, :aliases => ["-jt", "--job_type"], :default => nil
2895
2968
  method_option :files, :type => :string, :aliases => ["--files"], :default => nil
2896
2969
  method_option :output_dir, :type => :string, :aliases => ["--output_dir"], :default => nil
2970
# Options that the sync command forwards to `upload` (see the invoke call in sync).
# :git_diff used to be declared twice with identical arguments; one declaration is enough.
method_option :git_diff, :type => :boolean, :aliases => ["--git_diff"], :default => false
method_option :suppress_exceptions, :type => :boolean, :aliases => ["--suppress-exceptions"], :default => true
method_option :debug_mode, :type => :boolean, :aliases => ["--debug-mode"], :default => false
2974
+
2897
2975
  def sync(direct = true)
2898
2976
  verify_logged_in(true) if direct
2899
2977
  @project = Project.new(get_project_home)
@@ -2905,16 +2983,20 @@ module Cnvrg
2905
2983
  is_git = ENV['CNVRG_GIT_PROJECT'] == "true" || @project.is_git
2906
2984
  in_exp = options["in_exp"] || (job_slug.present? and job_type.present?)
2907
2985
  in_exp = false if job_type.present? and job_type == "NotebookSession"
2986
+ output_dir = options["output_dir"] || ENV['CNVRG_OUTPUT_DIR']
2987
+
2908
2988
  run_download = true
2909
- if options[:force] or options[:files].present? or options[:output_dir].present? or in_exp or @project.is_branch
2989
+ if (job_type == "NotebookSession" and is_git) or job_type == "Experiment" or options['force']
2910
2990
  run_download = false
2911
2991
  end
2912
- if run_download
2992
+
2993
+ if run_download or options['debug_mode']
2913
2994
  invoke :download, [true, "", in_exp ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true
2914
2995
  end
2915
- invoke :upload, [false, true, direct, "",in_exp,options[:force], options["output_dir"],job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2996
+ invoke :upload, [false, true, direct, "",in_exp,options[:force], output_dir, job_type, job_slug ], :new_branch => options["new_branch"], :verbose => options["verbose"], :sync => true,
2916
2997
  :ignore => options[:ignore], :force => options[:force], :message => options[:message], :deploy => options["deploy"], :return_id => options["return_id"],
2917
- :files => options["files"], :output_dir => options["output_dir"], :job_slug => job_slug, :job_type => job_type
2998
+ :files => options["files"], :output_dir => output_dir, :job_slug => job_slug, :job_type => job_type, :suppress_exceptions => options["suppress_exceptions"], :debug_mode => options['debug_mode'], :git_diff => options["git_diff"]
2999
+
2918
3000
  end
2919
3001
 
2920
3002
  desc 'run cmd', 'Runs an experiment'
@@ -3059,6 +3141,8 @@ module Cnvrg
3059
3141
  method_option :data, :type => :string, :aliases => ["-d", "--data"], :default => ""
3060
3142
  method_option :data_commit, :type => :string, :aliases => ["-dc", "--data_commit"], :default => ""
3061
3143
  method_option :ignore, :type => :string, :aliases => ["-i", "--ignore"], :desc => "ignore following files", :default => ""
3144
+ method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch", :default => ""
3145
+ method_option :gpu_util_from_docker, :type => :boolean, :aliases => ["--gpu-util-from-docker"], :desc => "take gpu utilization from job docker", :default => false
3062
3146
  method_option :remote, :type => :boolean, :aliases => ["--remote"], :default => false
3063
3147
  method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
3064
3148
  method_option :force, :type => :boolean, :aliases => ["-f", "--force"], :default => false
@@ -3066,6 +3150,7 @@ module Cnvrg
3066
3150
  method_option :periodic_sync, :type => :string, :aliases => ["-ps", "--periodic_sync"], :default => ""
3067
3151
  method_option :output_dir, :type => :string, :aliases => ["-o", "--output_dir"], :default => nil
3068
3152
  method_option :data_query, :type => :string, :aliases => ["-q", "--query"], :default => nil
3153
+ method_option :use_bash, :type => :boolean, :aliases => ["-b", "--use_bash"], :default => false
3069
3154
 
3070
3155
  def exec(*cmd)
3071
3156
  log = []
@@ -3132,8 +3217,12 @@ module Cnvrg
3132
3217
  end
3133
3218
  remote = options["remote"]
3134
3219
  if remote
3135
- docker_id = `cat /etc/hostname`
3136
- docker_id = docker_id.strip()
3220
+ if options["docker_id"].present?
3221
+ docker_id = options["docker_id"]
3222
+ else
3223
+ docker_id = `cat /etc/hostname`
3224
+ docker_id = docker_id.strip()
3225
+ end
3137
3226
  end
3138
3227
  is_on_gpu = options["gpu"]
3139
3228
  start_commit = @project.last_local_commit
@@ -3143,9 +3232,9 @@ module Cnvrg
3143
3232
 
3144
3233
  platform = RUBY_PLATFORM
3145
3234
  machine_name = Socket.gethostname
3235
+ machine_activity_slug = ENV["CNVRG_MACHINE_ACTIVITY"]
3146
3236
  begin
3147
- machine_activity = @exp.get_machine_activity(working_dir)
3148
- @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity, script_path, sync_before_terminate, periodic_sync)
3237
+ @exp.start(cmd, platform, machine_name, start_commit, title, email_notification, machine_activity_slug, script_path, sync_before_terminate, periodic_sync)
3149
3238
  log_message("Experiment's live results: #{Cnvrg::Helpers.remote_url}/#{@project.owner}/projects/#{@project.slug}/experiments/#{@exp.slug}", Thor::Shell::Color::GREEN)
3150
3239
  log_message("Running: #{cmd}\n", Thor::Shell::Color::BLUE)
3151
3240
  unless @exp.slug.nil?
@@ -3163,7 +3252,7 @@ module Cnvrg
3163
3252
  begin
3164
3253
  stats = remote ? usage_metrics_in_docker(docker_id) : Helpers.ubuntu? ? {memory: memory_usage, cpu: cpu_usage} : {}
3165
3254
  if is_on_gpu
3166
- gu = gpu_util
3255
+ gu = gpu_util(take_from_docker: options["gpu_util_from_docker"], docker_id: docker_id)
3167
3256
  stats['gpu_util'] = gu[0]
3168
3257
  stats['gpu'] = gu[1]
3169
3258
  end
@@ -3175,6 +3264,16 @@ module Cnvrg
3175
3264
  end
3176
3265
  end
3177
3266
  start_time = Time.now
3267
+ shell_type = options["use_bash"] ? "bash -l" : "sh"
3268
+ if @exp.get_cmd.present?
3269
+ cmd = @exp.get_cmd
3270
+ if options["docker_id"].present? # Escape for docker exec
3271
+ cmd = cmd.gsub("\"", "\\\"")
3272
+ end
3273
+ end
3274
+ if options["docker_id"].present?
3275
+ cmd = "docker exec -it #{options["docker_id"]} #{shell_type} -c \"#{cmd}\""
3276
+ end
3178
3277
  PTY.spawn(@exp.as_env, cmd) do |stdout, stdin, pid, stderr|
3179
3278
  begin
3180
3279
  stdout.each do |line|
@@ -3189,7 +3288,7 @@ module Cnvrg
3189
3288
  puts line
3190
3289
  end
3191
3290
  log << cur_log
3192
- if log.size >= 5
3291
+ if log.size >= 1
3193
3292
  @exp.upload_temp_log(log) unless log.empty?
3194
3293
  log = []
3195
3294
  elsif (start_time + 15.seconds) <= Time.now
@@ -3239,29 +3338,26 @@ module Cnvrg
3239
3338
  exp_success = false
3240
3339
  end
3241
3340
 
3242
- if sync_after
3243
- @exp.job_log(["Syncing Experiment"])
3244
- # Sync after run
3245
- if @project.is_git
3246
- output_dir = output_dir || @exp.output_dir
3247
- if output_dir.present?
3248
- upload(false, false, true, ignore, true, true,output_dir,"Experiment",@exp.slug )
3249
- # invoke :upload, [false, false, true, ignore, true, true], :output_dir => output_dir, :force=>true, :job_type=>'Experiment', :job_slug=>@exp.slug
3250
- end
3251
- else
3252
- upload(false, false, true, ignore, true, true,nil,"Experiment",@exp.slug )
3253
-
3254
- # invoke :upload, [false, false, true, ignore,true, true], :job_type=>'Experiment', :job_slug=>@exp.slug, :force=>true
3341
+ if sync_after
3342
+ @exp.job_log(["Syncing Experiment"])
3343
+ # Sync after run
3344
+ if @project.is_git
3345
+ output_dir = output_dir || @exp.output_dir
3346
+ if output_dir.present?
3347
+ upload(false, false, true, ignore, true, true, output_dir, "Experiment", @exp.slug, true )
3255
3348
  end
3256
-
3349
+ else
3350
+ upload(false, false, true, ignore, true, true, nil, "Experiment", @exp.slug, true )
3257
3351
  end
3352
+ end
3353
+
3258
3354
  end_commit = @project.last_local_commit
3259
3355
  if end_commit.present?
3260
3356
  @exp.job_log(["Experiment end commit: #{end_commit}"])
3261
3357
  end
3262
3358
 
3263
3359
  # log_thread.join
3264
- stats_thread.join
3360
+ stats_thread.join
3265
3361
 
3266
3362
  res = @exp.end(log, exit_status, end_commit, cpu_average, memory_average, end_time: end_time)
3267
3363
 
@@ -3409,8 +3505,8 @@ module Cnvrg
3409
3505
  local_folders_options = options["local_folders"]
3410
3506
  options_hash.except!("schedule", "recurring", "machine_type", "image", "upload_output", "grid", "data", "data_commit", "title",
3411
3507
  "local", "small", "medium", "large", "gpu", "gpuxl", "gpuxxl","max_time","dataset_only_tree",
3412
- "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets", "requirements", "prerun",
3413
- "email_notification_error", "email_notification_success", "emails")
3508
+ "data_query", "git_commit","git_branch", "restart_if_stuck","local_folders","output_dir", "commit", "datasets",
3509
+ "requirements", "prerun", "email_notification_error", "email_notification_success", "emails")
3414
3510
  exec_options = options_hash.map {|x| "--#{x[0]}=#{x[1]}"}.flatten.join(" ")
3415
3511
  command = "#{exec_options} #{remote} #{upload_output_option} #{cmd.flatten.join(" ")}"
3416
3512
  commit_to_run = options["commit"] || nil
@@ -4235,144 +4331,6 @@ module Cnvrg
4235
4331
 
4236
4332
  end
4237
4333
 
4238
- method_option :small, :type => :boolean, :aliases => ["-sm", "--small"], :default => false
4239
- method_option :medium, :type => :boolean, :aliases => ["-md", "--medium"], :default => false
4240
- method_option :large, :type => :boolean, :aliases => ["-lg", "--large"], :default => false
4241
- method_option :gpu, :type => :boolean, :aliases => ["--gpu"], :default => false
4242
- method_option :gpuxl, :type => :boolean, :aliases => ["--gpuxl"], :default => false
4243
- method_option :gpuxxl, :type => :boolean, :aliases => ["--gpuxxl"], :default => false
4244
- method_option :image, :type => :string, :aliases => ["-i", "--image"], :default => ""
4245
- method_option :public, :type => :boolean, :aliases => ["-p", "--public"], :default => false
4246
- method_option :base, :type => :boolean, :aliases => ["-b", "--base"], :default => false
4247
- method_option :python3, :type => :boolean, :aliases => ["--python3"], :default => false
4248
- method_option :docker_path, :type => :string, :aliases => ["--docker_path"], :default => ""
4249
-
4250
-
4251
- desc 'create_custom_image', 'run commands inside containers', :hide => true
4252
-
4253
- def build_image(image_name)
4254
- begin
4255
- verify_logged_in(false)
4256
- log_start(__method__, args, options)
4257
- instances = {"small" => options["small"], "medium" => options["medium"], "large" => options["large"],
4258
- "gpu" => options["gpu"], "gpuxl" => options["gpuxl"], "gpuxxl" => options["gpuxxl"]}
4259
- instance_type = get_instance_type(instances)
4260
- image_extend = options["image"]
4261
- public = options["public"]
4262
- base = options["base"]
4263
- python3 = options["python3"]
4264
- docker_path = options["docker_path"]
4265
- owner = CLI.get_owner
4266
- checks = Helpers.checkmark()
4267
- tar_path = nil
4268
- if !docker_path.nil? and !docker_path.empty?
4269
- docker_path = File.absolute_path(docker_path)
4270
- #create tar of the docker path: it could be a docker file, and it could be a docker folder
4271
- tar_path = File.expand_path('~') + "/.cnvrg/tmp/docker_#{File.basename docker_path}.tar.gz"
4272
- resp = create_docker_tar(docker_path, tar_path)
4273
- if !resp
4274
- log_message("Couldn't create tar from docker path", Thor::Shell::Color::RED)
4275
- FileUtils.rm_rf tar_path
4276
- exit(1)
4277
- end
4278
- files = Cnvrg::Files.new(owner, "")
4279
- resp = Images.create_new_custom_image_with_docker(instance_type, owner, image_name, public, base, image_extend, python3, tar_path, files)
4280
- if resp
4281
- end
4282
- else
4283
- log_message("Creating machine for your custom image, this may take a few moments...", Thor::Shell::Color::BLUE)
4284
- resp = Images.create_new_custom_image(instance_type, owner, image_name, public, base, image_extend, python3, nil)
4285
-
4286
- end
4287
-
4288
- if Cnvrg::CLI.is_response_success(resp, false)
4289
- image_slug = resp["result"]["slug"]
4290
- container = resp["result"]["machine_c"]
4291
- log_message("#{checks} Created image and machine successfully", Thor::Shell::Color::GREEN)
4292
- log_message("Connecting to machine", Thor::Shell::Color::BLUE)
4293
- ssh = Ssh.new(resp)
4294
- if !ssh.is_ssh
4295
- log_message("Couldn't connect to machine,aborting", Thor::Shell::Color::RED)
4296
- Images.revoke_custom_new_image(owner, image_slug)
4297
- end
4298
- log_message("run command until ctrl + c or quit is initiated", Thor::Shell::Color::BLUE)
4299
- begin
4300
- logs = []
4301
-
4302
- while true
4303
- command = ask("$>")
4304
- logs << {time: Time.now,
4305
- message: command,
4306
- type: "stdout"
4307
- }
4308
- if command.eql? "quit"
4309
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4310
- break
4311
- end
4312
- res = ssh.exec_command(command)
4313
- begin
4314
- res_parsed = JSON.parse(res)
4315
- res = res_parsed.join(",")
4316
- end
4317
-
4318
- puts res
4319
- logs << {time: Time.now,
4320
- message: res,
4321
- type: "stdout"
4322
- }
4323
- logs.flatten!
4324
-
4325
- end
4326
-
4327
- rescue SignalException
4328
- log_message("Commiting Image..", Thor::Shell::Color::BLUE)
4329
-
4330
- end
4331
- resp = Images.commit_custom_image(owner, image_slug, logs)
4332
- if Cnvrg::CLI.is_response_success(resp, false)
4333
- log_message("#{checks} Image commited successfuly, email will be sent when image is ready", Thor::Shell::Color::GREEN)
4334
- else
4335
- if image_slug
4336
- Images.revoke_custom_new_image(owner, image_slug)
4337
- end
4338
- if ssh
4339
- ssh.close_ssh()
4340
- end
4341
- log_message("Image couldn't be commited, rolling back changes", Thor::Shell::Color::RED)
4342
-
4343
- exit(1)
4344
- end
4345
- if ssh
4346
- ssh.close_ssh()
4347
- end
4348
-
4349
-
4350
- end
4351
- rescue => e
4352
- log_message("Error occurd, aborting", Thor::Shell::Color::RED)
4353
-
4354
- log_error(e)
4355
- if image_slug
4356
- Images.revoke_custom_new_image(owner, image_slug)
4357
- end
4358
- if ssh
4359
- ssh.close_ssh()
4360
- end
4361
-
4362
-
4363
- rescue SignalException
4364
- if image_slug
4365
- Images.revoke_custom_new_image(owner, image_slug)
4366
- end
4367
- if ssh
4368
- ssh.close_ssh
4369
- end
4370
- say "\nAborting"
4371
- exit(1)
4372
- end
4373
-
4374
- end
4375
-
4376
4334
 
4377
4335
  desc 'build', 'run commands inside containers', :hide => true
4378
4336
  method_option :install, :type => :string, :aliases => ["--i"], :default => nil, :desc => "Install from the given instructions file"
@@ -4566,66 +4524,7 @@ module Cnvrg
4566
4524
  end
4567
4525
 
4568
4526
 
4569
- desc 'upload_image', 'commit notebook changes to create a new notebook image', :hide =>true
4570
-
4571
- def upload_image_old(image_id, is_public, is_base, *message)
4572
- verify_logged_in(true)
4573
- log_start(__method__, args, options)
4574
- image = Docker::Image.get(image_id)
4575
- project_home = get_project_home
4576
- @project = Project.new(project_home)
4577
- last_local_commit = @project.last_local_commit
4578
- image_name = @project.slug + "#{last_local_commit}"
4579
- path = File.expand_path('~') + "/.cnvrg/tmp/#{image_name}.tar"
4580
- owner = Cnvrg::CLI.get_owner()
4581
- if !message.nil? or !message.empty?
4582
- message = message.join(" ")
4583
- end
4584
-
4585
- log_message("Saving image's current state", Thor::Shell::Color::BLUE)
4586
- image.save(path)
4587
-
4588
- begin
4589
- log_message("Compressing image file to upload", Thor::Shell::Color::BLUE)
4590
- gzipRes = system("gzip -f #{path}")
4591
- if !gzipRes
4592
-
4593
- log_message("Couldn't create tar file from image", Thor::Shell::Color::RED)
4594
- exit(1)
4595
- end
4596
- path = path + ".gz"
4597
- @files = Cnvrg::Files.new(owner, "")
4598
-
4599
- exit_status = $?.exitstatus
4600
- if exit_status == 0
4601
- log_message("Uploading image file", Thor::Shell::Color::BLUE)
4602
-
4603
- diff = container_changes(Dir.pwd)
4604
- res = @files.upload_image(path, image_name, owner, is_public, is_base, diff[1], diff[0], diff[2], message, image.commit_id)
4605
- if res
4606
- File.delete(path)
4607
- image_loc = is_project_with_docker(Dir.pwd)
4608
- image_loc.update_slug(res["result"]["id"])
4609
-
4610
- checks = Helpers.checkmark()
4611
- log_message("#{checks} Done", Thor::Shell::Color::GREEN)
4612
- else
4613
- log_message("Couldn't upload image", Thor::Shell::Color::RED)
4614
-
4615
- end
4616
- else
4617
- log_message("Couldn't create image file for: #{image_name}", Thor::Shell::Color::RED)
4618
- exit(1)
4619
- end
4620
- rescue => e
4621
- log_message("Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED)
4622
- log_error(e)
4623
- rescue SignalException
4624
4527
 
4625
- say "Couldn't upload image file for: #{image_name}", Thor::Shell::Color::RED
4626
- exit(1)
4627
- end
4628
- end
4629
4528
 
4630
4529
  desc '', '', :hide => true
4631
4530
 
@@ -4636,278 +4535,30 @@ module Cnvrg
4636
4535
 
4637
4536
  end
4638
4537
 
4639
- desc '', '', :hide => true
4640
-
4641
- def exec_container(container_id, *cmd)
4642
- container = Docker::Container.get(container_id)
4643
- container.start()
4644
- cnvrg_command = cmd.join(" ")
4645
- command = ["/bin/bash", "-lc", "#{cnvrg_command}"]
4646
- res = container.exec(command, tty: true, wait: 5400)[0]
4647
- say res
4648
- end
4649
-
4650
- desc '', '', :hide => true
4651
-
4652
- def port_container(container_id)
4653
- container = Docker::Container.get(container_id)
4654
- say container.json["HostConfig"]["PortBindings"]["8888/tcp"][0]["HostPort"]
4655
- end
4656
-
4657
- desc '', '', :hide => true
4658
-
4659
- def tensor_port_container(container_id)
4660
- container = Docker::Container.get(container_id)
4661
- say container.json["HostConfig"]["PortBindings"]["6006/tcp"][0]["HostPort"]
4662
- end
4663
-
4664
- desc '', '', :hide => true
4665
-
4666
- def stop_container(container_id)
4667
- container = Docker::Container.get(container_id)
4668
- container.stop()
4669
- container.remove()
4670
-
4671
- end
4672
-
4673
- desc '', '', :hide => true
4674
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4675
- method_option :app_dir, :type => :string, :aliases => ["-d"], :default => "/home/ds/notebooks"
4676
- method_option :cmd, :type => :string, :aliases => ["-c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4677
-
4678
-
4679
- def config_remote(image_name, port = 7654, tensport = 6006)
4680
- local_images = Docker::Image.all
4681
-
4682
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4683
- if docker_image_local.empty?
4684
- say "no image"
4685
- exit(1)
4686
- end
4687
-
4688
- begin
4689
- login_content = options["login"]
4690
- app_dir = options["app_dir"]
4691
- cmd = options["cmd"]
4692
- volume_from = options["volume"]
4693
-
4694
- image_settings = {
4695
- 'Image' => "#{image_name}:latest",
4696
-
4697
- 'Cmd' => cmd,
4698
- 'WorkingDir' => app_dir,
4699
- 'ExposedPorts' => {
4700
- '8888/tcp' => {},
4701
- },
4702
- 'HostConfig' => {
4703
- 'Binds' => ["/var/run/docker.sock:/var/run/docker.sock", "/usr/bin/docker:/usr/bin/docker"],
4704
- 'PortBindings' => {
4705
- '8888/tcp' => [
4706
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4707
- ],
4708
- '6006/tcp' => [
4709
- {'HostPort' => "#{tensport}", 'HostIp' => 'localhost'}
4710
- ],
4711
- },
4712
- },
4713
- }
4714
- container = Docker::Container.create(image_settings)
4715
- container.start()
4716
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4717
- container.exec(command, tty: true)
4718
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg"]
4719
- # container.exec(command, tty: true)
4720
- # command = ["/bin/bash", "-lc", "mkdir /home/ds/.cnvrg/tmp"]
4721
- # container.exec(command, tty: true)
4722
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4723
- container.exec(command, tty: true)
4724
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4725
- container.exec(command, tty: true)
4726
- say "#{container.id}:#{port}##{tensport}"
4727
- rescue => e
4728
- puts e
4729
- if e.message.include? "is not running"
4730
- return config_remote(image_name, port - 1, tensport - 1)
4731
- end
4732
-
4733
- if container
4734
- container.kill()
4735
- end
4736
- return false
4737
- end
4738
- end
4739
-
4740
-
4741
- desc '', '', :hide => true
4742
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4743
-
4744
- def config_netrc(container)
4745
-
4746
- login_content = options["login"]
4747
-
4748
- container = Docker::Container.get(container)
4749
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4750
- container.exec(command, tty: true)
4751
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4752
- container.exec(command, tty: true)
4753
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4754
- container.exec(command, tty: true)
4755
- say "OK"
4756
-
4757
- end
4758
-
4759
- desc '', '', :hide => true
4760
- method_option :login, :type => :string, :aliases => ["-l", "--l"], :default => ""
4761
- method_option :app_dir, :type => :string, :aliases => ["-d", "--d"], :default => "/home/ds/notebooks"
4762
- method_option :cmd, :type => :string, :aliases => ["-c", "--c"], :default => "/usr/local/cnvrg/run_ipython.sh"
4763
-
4764
-
4765
- def config_remote_gpu(image_name, port = 7654, tensport = 6006)
4766
- local_images = Docker::Image.all
4767
-
4768
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4769
- if docker_image_local.empty?
4770
- say "no image"
4771
- exit(1)
4772
- end
4773
-
4774
- begin
4775
- login_content = options["login"]
4776
- app_dir = options["app_dir"]
4777
- cmd = options["cmd"]
4778
-
4779
- # image_settings = {
4780
- # 'Image' => "#{image_name}:latest",
4781
- # 'User' => 'ds',
4782
- # 'Cmd' => cmd,
4783
- # 'WorkingDir' => app_dir,
4784
- # 'ExposedPorts' => {
4785
- # '8888/tcp' => {},
4786
- # },
4787
- # 'HostConfig' => {
4788
- # 'PortBindings' => {
4789
- # '8888/tcp' => [
4790
- # {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4791
- # ],
4792
- # '6006/tcp' => [
4793
- # {'HostPort' => "6006", 'HostIp' => 'localhost'}
4794
- # ],
4795
- # },
4796
- # },
4797
- # }
4798
-
4799
- container_id = `nvidia-docker run -itd -p #{port}:8888 -p #{tensport}:6006 -w #{app_dir} -v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker #{image_name}:latest #{cmd} `
4800
- container_id = container_id.gsub("\n", "")
4801
- container = Docker::Container.get(container_id)
4802
- # container.start()
4803
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4804
- container.exec(command, tty: true)
4805
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4806
- container.exec(command, tty: true)
4807
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4808
- container.exec(command, tty: true)
4809
- say "#{container.id}:#{port}##{tensport}"
4810
- rescue => e
4811
- if e.message.include? "is not running"
4812
- puts "running asgain with: #{port - 1} #{tensport - 1}"
4813
- return config_remote_gpu(image_name, port - 1, tensport - 1)
4814
- end
4815
-
4816
- if container
4817
- container.kill()
4538
# Thor declaration for the hidden get_utilization command.
# desc takes (usage, description, options): the usage slot holds the command
# name and the human-readable text goes in the description, as the other
# commands in this class do.
desc 'get_utilization', 'Collect and send job utilization', :hide => true
method_option :docker_id, :type => :string, :aliases => ["--docker_id"], :desc => "docker id to watch"
method_option :is_on_gpu, :type => :boolean, :aliases => ["--is_on_gpu"], :desc => "is on gpu", :default => true
4541
# Samples resource usage for a job's docker container and reports it to the
# experiment, in an endless loop.
#
# The experiment is built from the CNVRG_OWNER, CNVRG_PROJECT and CNVRG_JOB_ID
# environment variables; the container id comes from the --docker_id option.
# Each iteration sleeps 30 seconds, reads usage_metrics_in_docker, adds the GPU
# figures when --is_on_gpu is set, tags the sample with the container id and
# sends it. A StandardError raised while sampling or sending is logged and the
# loop continues, so the method never returns on its own.
def get_utilization
  @exp = Experiment.new(ENV['CNVRG_OWNER'], ENV['CNVRG_PROJECT'], job_id: ENV['CNVRG_JOB_ID'])
  container_id = options["docker_id"]
  loop do
    sleep 30
    begin
      sample = usage_metrics_in_docker(container_id)
      if options["is_on_gpu"]
        gpu_sample = gpu_util(take_from_docker: true, docker_id: container_id)
        sample['gpu_util'] = gpu_sample[0]
        sample['gpu'] = gpu_sample[1]
      end
      sample['docker_id'] = container_id
      # NOTE(review): 'docker_id' has just been stored, so if sample is a Hash this
      # emptiness guard can no longer skip anything; confirm whether empty
      # samples were meant to be dropped before tagging.
      @exp.send_machine_stats [sample] unless sample.empty?
    rescue => e
      # Best effort: a failed sample must not stop the monitoring loop.
      log_error(e)
      log_message("Failed to upload ongoing stats, continuing with experiment", Thor::Shell::Color::YELLOW)
    end
  end
end
4822
4561
 
4823
- desc '', '', :hide => true
4824
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4825
-
4826
- def config_flask_remote(image_name, port = 80)
4827
- local_images = Docker::Image.all
4828
-
4829
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4830
- if docker_image_local.empty?
4831
- say "no image"
4832
- exit(1)
4833
- end
4834
-
4835
- begin
4836
- login_content = options["login"]
4837
- image_settings = {
4838
- 'Image' => "#{image_name}:latest",
4839
- 'User' => 'ds',
4840
- 'Cmd' => '/usr/local/cnvrg/start_super.sh',
4841
- 'WorkingDir' => '/home/ds/app',
4842
- 'ExposedPorts' => {
4843
- '80/tcp' => {},
4844
- },
4845
- 'HostConfig' => {
4846
- 'PortBindings' => {
4847
- '80/tcp' => [
4848
- {'HostPort' => "#{port}", 'HostIp' => 'localhost'}
4849
- ],
4850
- },
4851
- },
4852
- }
4853
- container = Docker::Container.create(image_settings)
4854
- container.start()
4855
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4856
- container.exec(command, tty: true)
4857
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4858
- container.exec(command, tty: true)
4859
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4860
- container.exec(command, tty: true)
4861
- say "#{container.id}:#{port}"
4862
- rescue => e
4863
- pus e
4864
- if e.message.include? "is not running"
4865
- return "port is taken"
4866
- end
4867
- puts "error"
4868
- if container
4869
- container.kill()
4870
- end
4871
- return false
4872
- end
4873
- end
4874
-
4875
- desc '', '', :hide => true
4876
- method_option :login, :type => :string, :aliases => ["-l"], :default => ""
4877
-
4878
- def config_flask_remote_gpu(image_name, port = 80)
4879
- local_images = Docker::Image.all
4880
-
4881
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.eql? "#{image_name}:latest"}.flatten
4882
- if docker_image_local.empty?
4883
- say "no image"
4884
- exit(1)
4885
- end
4886
-
4887
- begin
4888
- login_content = options["login"]
4889
- container_id = `nvidia-docker run -itd -p 80:80 -w /home/ds/app #{image_name}:latest /usr/local/cnvrg/start_super.sh`
4890
- container_id = container_id.gsub("\n", "")
4891
- container = Docker::Container.get(container_id)
4892
- command = ["/bin/bash", "-lc", "sudo echo -e \"#{login_content}\" >/home/ds/.netrc"]
4893
- container.exec(command, tty: true)
4894
- command = ["/bin/bash", "-lc", "sudo chown -R ds:ds /home/ds/.netrc"]
4895
- container.exec(command, tty: true)
4896
- command = ["/bin/bash", "-lc", "sudo chmod 0600 /home/ds/.netrc"]
4897
- container.exec(command, tty: true)
4898
- say "#{container.id}:#{port}"
4899
- rescue => e
4900
- puts e
4901
- if e.message.include? "is not running"
4902
- return "port is taken"
4903
- end
4904
- puts "error"
4905
- if container
4906
- container.kill()
4907
- end
4908
- return false
4909
- end
4910
- end
4911
4562
 
4912
4563
  desc '', '', :hide => true
4913
4564
 
@@ -4933,39 +4584,10 @@ module Cnvrg
4933
4584
 
4934
4585
  end
4935
4586
 
4936
- desc 'upload_image', 'Upload new docker image to cnvrg', :hide => true
4937
- method_option :workdir, :type => :string, :aliases => ["-w","--workdir"], :desc => "workdir of docker image", :default => "/root"
4938
- method_option :description, :type => :string, :aliases => ["-d", "--description"], :desc => "description for docker image", :default => ""
4939
- method_option :user, :type => :string, :aliases => ["-u","--user"], :default => "root"
4940
- method_option :gpu, :type => :boolean, :aliases => ["-g","--gpu"], :default => false
4941
- def upload_image(image_name,image_path)
4942
- begin
4943
- verify_logged_in(false)
4944
- log_start(__method__, args, options)
4945
-
4946
- @image = Cnvrg::Images.new()
4947
- say "Uploading new docker image file", Thor::Shell::Color::BLUE
4948
- workdir = options[:workdir]
4949
- description = options[:description]
4950
- user = options[:user]
4951
- is_gpu = options[:gpu]
4952
- res = @image.upload_docker_image(image_path, image_name, workdir, user, description, is_gpu)
4953
- if res["status"] == 200
4954
- image_slug = res["id"]
4955
- owner = CLI.get_owner
4956
- image_url = "#{Cnvrg::Helpers.remote_url}/#{owner}/settings/images/#{image_slug}"
4957
- log_message("Successfully uploaded image: #{image_url}", Thor::Shell::Color::GREEN, true)
4958
-
4959
-
4960
- else
4961
- log_message("Couldn't upload image: #{image_name}", Thor::Shell::Color::RED, true)
4962
-
4963
- end
4964
- rescue => e
4965
- log_error(e)
4966
- end
4967
-
4968
-
4587
+ desc 'file_exists', description: '', hide: true
4588
+ def file_exists(file)
4589
+ exit(0) if File.exists? file
4590
+ exit(1)
4969
4591
  end
4970
4592
 
4971
4593
 
@@ -5145,29 +4767,40 @@ module Cnvrg
5145
4767
  method_option :project_slug, :type => :string, :aliases => ["-s"], :desc => "project slug"
5146
4768
  method_option :project_owner, :type => :string, :aliases => ["-o"], :desc => "project slug"
5147
4769
  method_option :frequency, :type => :numeric, :aliases => ["-f"], :desc => "poll frequency"
4770
+ method_option :fetch_slugs, :type => :boolean, :default => false, :desc => "Fetch experiments slugs to compare"
5148
4771
 
5149
4772
  def compare_experiments
5150
4773
  verify_logged_in(true)
5151
4774
  log_start(__method__, args, options)
5152
4775
  exps_map = {}
4776
+ copied_commits = []
5153
4777
 
5154
- if options[:slugs].blank?
4778
+ if options[:slugs].blank? and options[:fetch_slugs].blank?
5155
4779
  log_message("No experiments slugs given", Thor::Shell::Color::RED)
5156
4780
  return false
5157
4781
  end
5158
- slugs = options[:slugs].split(",")
5159
- if slugs.blank?
5160
- log_message("No experiments slugs given", Thor::Shell::Color::RED)
5161
- return false
4782
+ if options[:slugs].present?
4783
+ slugs = options[:slugs].split(",")
5162
4784
  end
4785
+
5163
4786
  frequency = options[:frequency] || 5
5164
4787
  namespace = options[:namespace]
5165
4788
  project_dir = is_cnvrg_dir(Dir.pwd)
5166
4789
  @project = Project.new(project_home=project_dir, slug: options[:project_slug], owner: options[:project_owner])
4790
+ fetch_slugs = options[:fetch_slugs]
4791
+ webapp_slug = ENV["CNVRG_JOB_ID"]
4792
+ if fetch_slugs and webapp_slug.present?
4793
+ slugs = @project.fetch_webapp_slugs(webapp_slug)
4794
+ end
4795
+ if slugs.blank?
4796
+ log_message("No experiments slugs given", Thor::Shell::Color::RED)
4797
+ return false
4798
+ end
5167
4799
 
4800
+ log_message("compare is running")
5168
4801
  while true
4802
+ log_message("compare is running for slugs #{slugs}")
5169
4803
  slugs.each do |exp_slug|
5170
-
5171
4804
  begin
5172
4805
  if exps_map[exp_slug].blank?
5173
4806
  exp = @project.get_experiment(exp_slug)["experiment"]
@@ -5181,15 +4814,23 @@ module Cnvrg
5181
4814
  log_message("#{exp_name} has ended, getting files from end commit", Thor::Shell::Color::BLUE)
5182
4815
  Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project)
5183
4816
  exps_map[exp_slug] = exp
5184
- elsif exp["machine_activity"].present?
4817
+ else
5185
4818
  log_message("#{exp_name} is running should get logs", Thor::Shell::Color::BLUE)
5186
- Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4819
+ success = Cnvrg::Helpers.get_experiment_events_log_via_kubectl(exp, namespace)
4820
+ if !success and exp["last_successful_commit"].present? and !copied_commits.include?(exp["last_successful_commit"])
4821
+ log_message("Failed to get kube files, using last commit", Thor::Shell::Color::BLUE)
4822
+ Cnvrg::Helpers.get_experiment_events_log_from_server(exp, @project, commit: exp["last_successful_commit"])
4823
+ copied_commits << exp["last_successful_commit"]
4824
+ end
5187
4825
  end
5188
4826
  rescue => e
5189
4827
  Cnvrg::Logger.log_error(e)
5190
4828
  end
5191
4829
  end
5192
4830
  sleep frequency
4831
+ if fetch_slugs
4832
+ slugs = @project.fetch_webapp_slugs(webapp_slug, slugs: slugs)
4833
+ end
5193
4834
  end
5194
4835
  end
5195
4836
 
@@ -5275,127 +4916,6 @@ module Cnvrg
5275
4916
  end
5276
4917
 
5277
4918
 
5278
- desc 'pull_image', 'downloads and loads an image', :hide => true
5279
-
5280
- def pull_image(image_name)
5281
- begin
5282
- verify_logged_in(false)
5283
- log_start(__method__, args, options)
5284
- owner = Cnvrg::CLI.get_owner()
5285
- image = Cnvrg::Images.image_exist(owner, image_name)
5286
- if !image
5287
- log_message("Couldn't find image in cnvrg repository", Thor::Shell::Color::RED)
5288
- exit(1)
5289
- end
5290
- path = download_image(image_name, image["slug"])
5291
- if path
5292
- log_message("Building image", Thor::Shell::Color::BLUE)
5293
- Docker.options[:read_timeout] = 216000
5294
- image = Docker::Image.build_from_dir(path, {'dockerfile' => 'Dockerfile.cpu', 't' => "#{image_name}:latest"}) do |v|
5295
- begin
5296
- if (log = JSON.parse(v)) && log.has_key?("stream")
5297
- next if log["stream"].starts_with? "Step"
5298
- $stdout.puts log["stream"]
5299
- end
5300
- rescue
5301
- end
5302
-
5303
- end
5304
-
5305
- if not image.nil?
5306
- FileUtils.rm_rf(path)
5307
- checks = Helpers.checkmark()
5308
- log_message("#{checks} Image built successfully", Thor::Shell::Color::GREEN)
5309
- return image
5310
- else
5311
-
5312
- log_message("Could not build image", Thor::Shell::Color::RED)
5313
- return false
5314
- end
5315
- else
5316
-
5317
- log_message("Could not download image", Thor::Shell::Color::RED)
5318
- return false
5319
-
5320
-
5321
- end
5322
-
5323
- # else
5324
- # path = download_image(image_name,image["slug"])
5325
- # if path
5326
- # image = Docker::Image.import(path)
5327
- # image.tag('repo' => image_name, 'tag' => 'latest')
5328
- # if not image.nil?
5329
- # say "Finished downloading image, cleaning up..", Thor::Shell::Color::GREEN
5330
- # FileUtils.rm(path)
5331
- # checks = Helpers.checkmark()
5332
- # say "#{checks} Done", Thor::Shell::Color::GREEN
5333
- # log_end(0)
5334
- # return image
5335
- # log_end(0)
5336
- # else
5337
- # say "Could not download image", Thor::Shell::Color::RED
5338
- # return false
5339
- # end
5340
- #
5341
- # end
5342
- # end
5343
- rescue => e
5344
-
5345
- log_message "Error: couldn't build image", Thor::Shell::Color::RED
5346
- log_error(e)
5347
-
5348
- rescue SignalException
5349
- say "\nAborting"
5350
- exit(1)
5351
- ensure
5352
- if path
5353
- FileUtils.rm_rf(path)
5354
-
5355
- end
5356
- end
5357
-
5358
-
5359
- end
5360
-
5361
- desc 'set_image', 'set image to a porject', :hide => true
5362
-
5363
- def set_image(docker_image)
5364
- verify_logged_in(true)
5365
- log_start(__method__, args, options)
5366
- working_dir = is_cnvrg_dir
5367
- project = Project.new(working_dir)
5368
-
5369
- local_images = Docker::Image.all
5370
- docker_image_local = local_images.map {|x| x.info["RepoTags"]}.flatten.select {|y| y.include? docker_image}.flatten
5371
- if docker_image_local.size == 0
5372
-
5373
- if yes? "Image wasn't found locally, pull image from cnvrg repository?", Thor::Shell::Color::YELLOW
5374
- image = pull(docker_image)
5375
- if image
5376
- log_message("downloaded image: #{docker_image}", Thor::Shell::Color::BLUE)
5377
- @image = Images.new(working_dir, docker_image)
5378
- else
5379
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5380
- exit(1)
5381
- end
5382
- else
5383
- log_message("Could not create a new project with docker, image was not found", Thor::Shell::Color::RED)
5384
- exit(1)
5385
-
5386
- end
5387
- elsif docker_image_local.size == 1
5388
- log_message("found image: #{docker_image_local[0]}, setting it up..", Thor::Shell::Color::BLUE)
5389
- @image = Images.new(working_dir, docker_image_local[0])
5390
- elsif docker_image_local.size > 1
5391
- log_message("found #{docker_image_local.size} images, choose the image name you want to use", Thor::Shell::Color::BLUE)
5392
- image_name = ask "#{docker_image_local.join("\n")}\n", Thor::Shell::Color::BLUE
5393
- image_name = image_name.strip
5394
- @image = Images.new(working_dir, image_name)
5395
- end
5396
- @image.update_image_activity(project.last_local_commit, nil)
5397
- end
5398
-
5399
4919
  desc 'check_pod_restart', 'Check pod restart', :hide => true
5400
4920
  def check_pod_restart
5401
4921
  Cnvrg::CLI.new.log_start(__method__, args, options)
@@ -5670,7 +5190,7 @@ module Cnvrg
5670
5190
 
5671
5191
  if dirs.size == 0
5672
5192
  log_message("Couldn't find cnvrg directory. Please start a new project", Thor::Shell::Color::RED)
5673
-
5193
+ puts Thread.current.backtrace
5674
5194
  exit(1)
5675
5195
  end
5676
5196
  return dirs.join("/")
@@ -5773,7 +5293,7 @@ module Cnvrg
5773
5293
  is_cnvrg = is_cnvrg_dir
5774
5294
  if !is_cnvrg
5775
5295
  say "You're not in a cnvrg project directory", Thor::Shell::Color::RED
5776
- exit(0)
5296
+ exit(1)
5777
5297
  end
5778
5298
 
5779
5299
  end
@@ -5919,21 +5439,6 @@ module Cnvrg
5919
5439
 
5920
5440
  end
5921
5441
 
5922
- def container_changes(dir)
5923
- container_id = is_project_with_docker(dir)
5924
- if not container_id
5925
- return false
5926
- end
5927
- container = Docker::Container.get(container_id)
5928
- command = ['/bin/bash', '-lc', '/opt/ds/bin/pip freeze']
5929
- pip = container.exec(command, tty: true)[0]
5930
- command = ["/bin/bash", "-lc", "dpkg -l"]
5931
- dpkg = container.exec(command, tty: true)[0]
5932
- command = ["/bin/bash", "-lc", "cat /home/ds/.bash_history"]
5933
- history = container.exec(command, tty: true)[0]
5934
- diff = [pip, dpkg, history]
5935
- return diff
5936
- end
5937
5442
 
5938
5443
  def is_port_taken(ip = Cnvrg::CLI::IP, port = Cnvrg::CLI::PORT, seconds = 1)
5939
5444
  Timeout::timeout(seconds) do
@@ -6116,13 +5621,17 @@ module Cnvrg
6116
5621
 
6117
5622
  end
6118
5623
 
6119
- def gpu_util
5624
+ def gpu_util(take_from_docker: false, docker_id: nil)
6120
5625
  if !Helpers.ubuntu?
6121
5626
  return 0.0
6122
5627
  end
6123
5628
  stats = [[],[]]
6124
5629
  begin
6125
- gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5630
+ if take_from_docker
5631
+ gpu_stats = `docker exec -it #{docker_id} sh -c 'nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv'`
5632
+ else
5633
+ gpu_stats = `nvidia-smi --query-gpu=utilization.gpu,utilization.memory --format=csv`
5634
+ end
6126
5635
 
6127
5636
  if !gpu_stats.nil?
6128
5637
  gpu_stats = gpu_stats.split("\n")[1..-1]