cnvrg 1.9.9.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/cnvrg +9 -0
  3. data/cnvrg.gemspec +47 -0
  4. data/lib/cnvrg.rb +7 -0
  5. data/lib/cnvrg/Images.rb +351 -0
  6. data/lib/cnvrg/api.rb +247 -0
  7. data/lib/cnvrg/api_v2.rb +14 -0
  8. data/lib/cnvrg/auth.rb +79 -0
  9. data/lib/cnvrg/cli.rb +5715 -0
  10. data/lib/cnvrg/cli/flow.rb +166 -0
  11. data/lib/cnvrg/cli/library_cli.rb +33 -0
  12. data/lib/cnvrg/cli/subcommand.rb +28 -0
  13. data/lib/cnvrg/cli/task.rb +116 -0
  14. data/lib/cnvrg/colors.rb +8 -0
  15. data/lib/cnvrg/connect_job_ssh.rb +31 -0
  16. data/lib/cnvrg/data.rb +335 -0
  17. data/lib/cnvrg/datafiles.rb +1325 -0
  18. data/lib/cnvrg/dataset.rb +892 -0
  19. data/lib/cnvrg/downloader/client.rb +101 -0
  20. data/lib/cnvrg/downloader/clients/azure_client.rb +45 -0
  21. data/lib/cnvrg/downloader/clients/gcp_client.rb +50 -0
  22. data/lib/cnvrg/downloader/clients/s3_client.rb +78 -0
  23. data/lib/cnvrg/experiment.rb +209 -0
  24. data/lib/cnvrg/files.rb +1047 -0
  25. data/lib/cnvrg/flow.rb +137 -0
  26. data/lib/cnvrg/helpers.rb +422 -0
  27. data/lib/cnvrg/helpers/agent.rb +188 -0
  28. data/lib/cnvrg/helpers/executer.rb +213 -0
  29. data/lib/cnvrg/hyper.rb +21 -0
  30. data/lib/cnvrg/image.rb +113 -0
  31. data/lib/cnvrg/image_cli.rb +25 -0
  32. data/lib/cnvrg/job_cli.rb +73 -0
  33. data/lib/cnvrg/job_ssh.rb +48 -0
  34. data/lib/cnvrg/logger.rb +111 -0
  35. data/lib/cnvrg/org_helpers.rb +5 -0
  36. data/lib/cnvrg/project.rb +822 -0
  37. data/lib/cnvrg/result.rb +29 -0
  38. data/lib/cnvrg/runner.rb +49 -0
  39. data/lib/cnvrg/ssh.rb +94 -0
  40. data/lib/cnvrg/storage.rb +128 -0
  41. data/lib/cnvrg/task.rb +165 -0
  42. data/lib/cnvrg/version.rb +3 -0
  43. metadata +460 -0
@@ -0,0 +1,188 @@
1
+ class Cnvrg::Helpers::Agent
2
+
3
# Lifecycle states a command reports to the server.
module Status
  STARTED  = :started
  RUNNING  = :running
  FINISHED = :finished
  # A String rather than a Symbol: Executer#execute_cmds compares it with the
  # "status" value read from the server's command JSON.
  ABORTED  = "aborted"
end
9
+
10
# Levels understood by #log_internal. PURE lines are printed as-is; every
# other level is wrapped in a JSON envelope.
module LogLevel
  INFO  = :info
  PURE  = :pure
  ERROR = :error
end
15
+
16
+ #### This class represents a single command in the system.
17
+ #### It runs under an executer (machine_activity), so it should receive all of the executer's
18
+ #### params.
19
# Builds a single command to be run under an executer (machine_activity).
#
# @param executer           object providing #slave_id, #activity_url and #machine_activity
# @param slug               identifier of the command on the server
# @param command            shell command line to execute
# @param container_name     when it equals "slave" (case-insensitive) the command is wrapped in `docker exec`
# @param send_log_interval  seconds the periodic log thread sleeps between sends
# @param timeout            seconds before the command is aborted; blank or negative means no limit
# @param logs_regex         regexp sources used to filter which output lines are sent
# @param async              stored as given
# @param send_logs          when truthy, every output line is sent (no regexp filtering)
# @param files_exist        paths that must all exist for the command to run
# @param retries            how many times to re-run a failing command
# @param sleep_before_retry seconds to wait before each retry
# @param single_quotes      wrap the docker-exec'd command in single instead of double quotes
# @param docker_user        optional value for `docker exec --user`
# @param use_bash           run through `bash -l` instead of `sh` inside the container
# @param kwargs             any extra keys are accepted and ignored
def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
  @executer = executer
  @slug = slug
  @files_exist = files_exist
  @container_name = container_name
  # container_name defaults to nil; to_s keeps the comparison from raising NoMethodError.
  @run_in_slave = @container_name.to_s.downcase == "slave"
  @log_interval = send_log_interval
  # https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
  @timeout = (timeout.blank? || timeout.negative?) ? 0 : timeout
  @logs_regex = logs_regex || []
  @async = async
  @command = command
  @send_logs = send_logs
  # How many times the user asked to try to execute the command again.
  # to_i (rather than try(:to_i)) turns nil into 0 so #retry_command can always decrement it.
  @retries = retries.to_i
  @sleep_before_retry = sleep_before_retry
  @real_execution_retries = 0 ## How many times the command really executed until success
  @single_quotes = single_quotes
  @shell_type = use_bash ? "bash -l" : "sh"
  @docker_user = docker_user.present? ? " --user #{docker_user}" : ""
  if @run_in_slave
    quote = @single_quotes ? "'" : "\""
    @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c #{quote}#{@command}#{quote}"
  end
  @output = []
  @errors = []
  @exit_status = nil
  @is_running = true
  @pid = nil
end
58
+
59
# API path of this command: "<executer activity url>/commands/<slug>".
def base_url
  "#{@executer.activity_url}/commands/#{@slug}"
end
62
+
63
# Tells whether the command may run: every path listed in @files_exist must
# exist on disk. When one is missing it is logged and false is returned.
#
# @return [Boolean]
def should_run?
  return true if @files_exist.blank?
  # File.exist? replaces File.exists?, which is deprecated and removed in Ruby 3.2.
  missing_file = @files_exist.find { |file| !File.exist?(file) }
  return true if missing_file.blank?
  log_internal("Can't find file #{missing_file}, stopping the job")
  false
end
74
+
75
+
76
# Runs the command from start to finish: logs the start, executes it when
# #should_run? allows (otherwise sets exit status 127), then logs and reports
# the final status to the server.
def exec!
  log_internal("Command: #{@command} with slug: #{@slug} started!")
  runnable = should_run?
  if runnable
    send_logs(status: Status::STARTED)
    periodic_thread
    execute_command
  else
    @exit_status = 127
  end
  retry_suffix = @real_execution_retries > 0 ? " after #{@real_execution_retries} retries" : ""
  log_internal("Command: #{@command} with slug: #{@slug} finished#{retry_suffix}")
  send_logs(exit_status: @exit_status, status: Status::FINISHED)
end
90
+
91
# Drains the buffered output and error entries.
#
# @return [Array(Array, Array)] [output entries, error entries]; both buffers are left empty
def get_logs_to_send
  [@output, @errors].map { |buffer| buffer.pop(buffer.length) }
end
96
+
97
+
98
# Starts a background thread that calls #send_logs every @log_interval
# seconds until an exit status is set. The thread exits immediately when
# no interval is configured.
#
# @return [Thread]
def periodic_thread
  Thread.new do
    until @exit_status.present?
      Thread.exit if @log_interval.blank?
      sleep(@log_interval)
      send_logs
    end
  end
end
107
+
108
# Consumes one retry: waits @sleep_before_retry seconds, counts the extra
# execution and runs the command again.
def retry_command
  @retries -= 1
  sleep(@sleep_before_retry)
  @real_execution_retries += 1
  execute_command
end
114
+
115
# Spawns @command on a pseudo-terminal, buffers every output line in @output
# and records the exit status in @exit_status.
#
# * A timeout (see @timeout; 0 means no limit) terminates the child, records
#   "Command timed out!" and sets exit status 124.
# * While @retries is non-zero and the exit status is not 0, the command is
#   retried through #retry_command.
#
# @return [Integer, nil] exit status of this attempt
def execute_command
  Timeout.timeout(@timeout) do
    # PTY.spawn yields (reader, writer, pid) only; the child's stderr arrives on the same pty.
    PTY.spawn(@command) do |stdout, _stdin, pid|
      @pid = pid
      begin
        stdout.each do |line|
          log_internal(line, level: LogLevel::PURE)
          # Strip ANSI colour sequences before buffering the line.
          line = line.strip.gsub(/\e\[([;\d]+)?m/, '')
          @output << {log: line, timestamp: Time.now}
        end
      rescue Errno::EIO
        # On Linux, reading the pty after the child closed it raises EIO: this is
        # the normal end of output, so fall through and reap the child below.
      rescue => e
        log_internal(e.message, level: LogLevel::ERROR)
        log_internal(e.backtrace.join("\n"), level: LogLevel::ERROR)
        @errors << {log: e.message, timestamp: Time.now}
      end
      # Always reap the child here so the status belongs to this command
      # (never a stale or nil $?).
      _, status = ::Process.wait2(pid)
      @exit_status = status.exitstatus
    end
  end
  @exit_status
rescue Timeout::Error
  begin
    # Signal 0 only probes for existence; TERM actually stops the timed-out child.
    ::Process.kill("TERM", @pid) if @pid
  rescue Errno::ESRCH
    # The child is already gone.
  end
  @errors << {log: "Command timed out!", timestamp: Time.now}
  log_internal("Command timed out!", level: LogLevel::ERROR)
  @exit_status = 124
ensure
  retry_command if @retries != 0 and @exit_status != 0
end
155
+
156
+ private
157
# Sends the buffered logs and the current status of the command to the server.
#
# Output lines are filtered by regexp unless the command failed (non-zero
# exit status). Nothing is sent when there are no logs, no exit status and
# the status is RUNNING, which lets the periodic thread call this blindly.
def send_logs(exit_status: nil, status: Status::RUNNING)
  logs, error_logs = get_logs_to_send
  not_failed = exit_status.blank? || exit_status == 0
  logs = filter_logs_by_regex(logs) if not_failed

  nothing_to_report = logs.blank? && error_logs.blank? && exit_status.blank? && status == Status::RUNNING
  return if nothing_to_report

  payload = {
    logs: logs,
    error_logs: error_logs,
    exit_status: exit_status,
    status: status,
    execution_retries: @real_execution_retries,
    pid: @pid
  }
  Cnvrg::API.request(base_url, 'PUT', payload)
end
170
+
171
# Prints a log line on stdout and flushes it. PURE lines are printed
# verbatim; any other level is emitted as a JSON object carrying the level,
# timestamp, command slug and machine activity.
def log_internal(log, level: LogLevel::INFO)
  line =
    if level == LogLevel::PURE
      log
    else
      {log: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity}.to_json
    end
  puts(line)
  STDOUT.flush
end
179
+
180
# Keeps the log entries worth sending: all of them when @send_logs is set,
# otherwise only those whose :log text matches one of @logs_regex.
def filter_logs_by_regex(logs)
  logs.select do |entry|
    @send_logs || @logs_regex.any? { |pattern| !Regexp.new(pattern).match(entry[:log]).nil? }
  end
end
188
+ end
@@ -0,0 +1,213 @@
1
+ require 'cnvrg/helpers/agent'
2
+ class Cnvrg::Helpers::Executer
3
+ attr_reader :machine_activity, :agent_id, :slave_id
4
+
5
+
6
+ ### This class represents a machine_activity. It polls the commands, communicates with the
7
+ # server (poll commands) and lets the server know the status of this executer.
8
# @param owner            owner segment used in the activity URL
# @param machine_activity machine activity identifier used in the activity URL
# @param poll_every       seconds to sleep between two polls for commands
# @param job_id           used to grep the job's containers in `docker ps`
def initialize(owner: nil, machine_activity: nil, poll_every: 30, job_id: nil)
  @owner            = owner
  @machine_activity = machine_activity
  @job_id           = job_id
  @poll_every       = poll_every
  # Container ids are discovered later by #executer_stats.
  @agent_id = nil
  @slave_id = nil
  @commands_q = Queue.new
  @files_q    = Queue.new
end
18
+
19
# Writes content to path, creating parent directories as needed. Paths
# containing "~" are passed through File.expand_path first.
def create_file_cmd(path, content)
  target = path.include?("~") ? File.expand_path(path) : path
  FileUtils.mkdir_p(File.dirname(target))
  File.open(target, "w+") { |file| file.write(content) }
end
26
+
27
# Creates every file of a {path => content} mapping; nil means no files.
def handle_files(files)
  entries = files || {}
  entries.each { |path, content| create_file_cmd(path, content) }
end
32
+
33
# API path of this machine activity.
def activity_url
  "users/#{@owner}/machine_activities/#{@machine_activity}"
end
36
+
37
# Collects (once, then memoizes in @stats) a description of the environment:
# pod and node names, plus user/dir/version details of the agent (this
# process) and of the slave container (queried through #run_in_slave).
# Also stores the discovered container ids in @agent_id and @slave_id.
def executer_stats
  return @stats if @stats.present?

  Cnvrg::Logger.log_info("getting containers")
  @agent_id, @slave_id = containers
  Cnvrg::Logger.log_info("got containers")
  pod_name, node_name = get_node_and_pod_names

  agent_stats = {
    container_id: @agent_id,
    workdir: `pwd`.strip,
    homedir: current_homedir,
    user: `whoami`.strip,
    user_id: `id -u`.strip,
    group_id: `id -g`.strip,
    cnvrg: Cnvrg::VERSION
  }
  slave_stats = {
    container_id: @slave_id,
    workdir: run_in_slave('pwd'),
    homedir: slave_homedir,
    spark_path: spark_path,
    user: run_in_slave('whoami'),
    cnvrg: run_in_slave('which cnvrg'),
    has_bash: run_in_slave('which bash'),
    user_id: run_in_slave('id -u'),
    group_id: run_in_slave('id -g'),
    python_version: run_in_slave('python --version'),
    python3_version: run_in_slave('python3 --version'),
    pip_version: run_in_slave('pip --version'),
    pip3_version: run_in_slave('pip3 --version')
  }
  @stats = {pod_name: pod_name, node_name: node_name, agent: agent_stats, slave: slave_stats}
end
73
+
74
# Finds the ids of the job's "agent" and "slave" containers from `docker ps`,
# polling every 5 seconds until both are found.
#
# Rows are grepped by @job_id, or by the host name when KUBERNETES_PORT is set.
#
# @return [Array(String, String)] [agent container id, slave container id]
def containers
  # Id of the first row mentioning the role, or nil when there is none.
  id_for = lambda do |rows, role|
    row = rows.find { |candidate| candidate.include?(role) }
    row ? row.split(",").first : nil
  end

  loop do
    grep_by = ENV['KUBERNETES_PORT'].present? ? "$(hostname)" : @job_id
    rows = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map { |row| row.strip }
    agent_id = id_for.call(rows, "agent")
    slave_id = id_for.call(rows, "slave")
    # Return as soon as both are known; only sleep when another poll is needed.
    return [agent_id, slave_id] if agent_id.present? && slave_id.present?
    sleep(5)
  end
end
90
+
91
# Home directory of the agent process, or nil when HOME is not set.
#
# Reads the environment directly instead of parsing `env | grep -w HOME`,
# which truncated values containing "=" and could pick up unrelated lines
# that merely mention HOME.
def current_homedir
  ENV["HOME"]
end
94
+
95
# Value of SPARK_HOME inside the slave container: the text after the last
# "=" of the `env | grep SPARK_HOME` output, or nil when there is no output.
def spark_path
  env_line = run_in_slave("env | grep SPARK_HOME").strip
  env_line.split("=").last
end
98
+
99
# Value of HOME inside the slave container: the text after the last "=" of
# the `env | grep -w HOME` output, or nil when there is no output.
def slave_homedir()
  env_line = run_in_slave("env | grep -w HOME")
  env_line.split("=").last
end
102
+
103
# Environment of the slave container as [name, value] pairs.
#
# Each line is split on the first "=" only, so values that themselves
# contain "=" are kept whole.
#
# @return [Array<Array<String>>]
def slave_env
  run_in_slave("env").split("\n").map { |line| line.split("=", 2) }
end
106
+
107
# Runs a shell command inside the slave container (`docker exec -i <id> sh -c <command>`)
# and returns its stripped standard output.
#
# The argument-vector form of IO.popen is used so the command reaches the
# container's `sh -c` verbatim: no outer shell re-parses it, which means a
# command containing single quotes can no longer break or inject.
#
# @param command [String] command line for the container's shell
# @return [String]
def run_in_slave(command)
  argv = ["docker", "exec", "-i", @slave_id.to_s, "sh", "-c", command]
  IO.popen(argv, &:read).strip
end
110
+
111
+
112
# Asks the server for pending work: creates the files it returns and queues
# the commands on @commands_q. Any error is logged, never raised.
def poll
  commands_url = [activity_url, "commands"].join('/')
  resp = Cnvrg::API.request(commands_url, "POST")
  commands = resp["commands"]
  files = resp["files"]
  handle_files(files)
  commands.each { |cmd| @commands_q.push(cmd) }
rescue => e
  Cnvrg::Logger.log_error(e)
end
121
+
122
# Registers the executer stats with the server. When the server answers with
# a different machine activity, it is persisted to /conf/.machine_activity.yml
# and adopted. Any error is logged, never raised.
def init
  resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
  machine_activity = resp["machine_activity"]
  Cnvrg::Logger.log_info("Got back machine activity #{machine_activity}")
  return if machine_activity.blank? || @machine_activity == machine_activity

  Cnvrg::Logger.log_info("Changing to machine activity #{machine_activity}")
  machine_activity_yml = {slug: machine_activity}
  File.open("/conf/.machine_activity.yml", "w+") { |file| file.write(machine_activity_yml.to_yaml) }
  @machine_activity = machine_activity
rescue => e
  Cnvrg::Logger.log_error(e)
end
135
+
136
# Polls the server forever, sleeping @poll_every seconds between polls.
def polling_thread
  loop do
    poll
    sleep(@poll_every)
  end
end
142
+
143
# Entry point of the executer: registers with the server, starts polling in
# a background thread and executes queued commands on the calling thread.
def main_thread
  init
  Thread.new { polling_thread }
  execute_cmds
end
150
+
151
# Consumes @commands_q forever. Each command is re-fetched from the server;
# aborted ones are skipped, the others run in a forked process through
# Cnvrg::Helpers::Agent. Commands without :async are waited for, async ones
# are detached.
def execute_cmds
  pids = []
  loop do
    if @commands_q.empty?
      sleep(5)
      next
    end

    cmd = @commands_q.pop.symbolize_keys
    command_url = [activity_url, "commands", cmd[:slug]].join('/')
    command_json = Cnvrg::API.request(command_url, "GET")
    cmd_status = begin
      command_json["status"]
    rescue StandardError
      ""
    end

    if cmd_status == Cnvrg::Helpers::Agent::Status::ABORTED
      Cnvrg::Logger.log_info("stopping job because command #{cmd[:slug]} with status #{cmd_status}")
      next
    end

    pid = Process.fork do
      Cnvrg::Helpers::Agent.new(executer: self, **cmd).exec!
    end
    if cmd[:async].blank?
      Process.waitpid(pid)
    else
      Process.detach(pid)
    end
    pids << pid
  end
  pids
end
180
+
181
# Merges log entries sharing the same timestamp (compared as strings) into
# one entry whose :logs joins their :log texts with newlines.
#
# @param logs [Array<Hash>] entries with :timestamp and :log
# @return [Array<Hash>] entries with :timestamp (String) and :logs
def merge_log_block(logs)
  grouped = logs.group_by { |entry| entry[:timestamp].to_s }
  grouped.map do |stamp, entries|
    {timestamp: stamp, logs: entries.map { |entry| entry[:log] }.join("\n")}
  end
end
185
+
186
# Best-effort lookup of the pod name (`hostname`) and of the node it runs on
# (spec.nodeName of `kubectl get pod`). Every failure yields nil for the
# corresponding value.
#
# @return [Array] [pod_name, node_name]
def get_node_and_pod_names
  pod_name = begin
    `hostname`.strip
  rescue StandardError
    nil
  end

  node_name = nil
  if pod_name.present?
    raw_pod = begin
      `kubectl -n cnvrg get pod #{pod_name} -o json`
    rescue StandardError
      nil
    end
    pod_describe = begin
      JSON.parse(raw_pod)
    rescue StandardError
      {}
    end
    node_name = begin
      pod_describe["spec"]["nodeName"]
    rescue StandardError
      nil
    end
  end
  [pod_name, node_name]
end
196
+
197
# Posts the Kubernetes events of the current pod and node to the server's
# job_events endpoint.
def pre_pod_stop
  pod_name, node_name = get_node_and_pod_names
  pod_events = get_pod_events(pod_name)
  node_events = get_node_events(node_name)
  events_url = [activity_url, "job_events"].join('/')
  Cnvrg::API.request(events_url, "POST", {pod_events: pod_events, node_events: node_events})
end
203
+
204
# Raw JSON output of `kubectl get event` for the pod in the cnvrg namespace,
# or nil when no pod name is given.
def get_pod_events(pod_name)
  unless pod_name.blank?
    `kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json`
  end
end
208
+
209
# Raw JSON output of `kubectl get event` for the node across all namespaces,
# or nil when no node name is given.
def get_node_events(node_name)
  unless node_name.blank?
    `kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
  end
end
213
+ end
@@ -0,0 +1,21 @@
1
module Cnvrg
  # Resolves a hyper-parameter search definition (a YAML file) into concrete
  # parameters through the server's resolve_grid endpoint.
  class Hyper
    # @param project_path path handed to Cnvrg::Project.new
    # @param path         path of the YAML file describing the search
    def initialize(project_path, path)
      @project = Cnvrg::Project.new(project_path)
      @content = YAML.load_file(path)
      @base_resource = "users/#{@project.owner}"
      @params = []
    end

    # Posts the search definition and appends one {key:, value:} entry per
    # returned parameter to @params, where value is the comma-joined list of
    # its values.
    #
    # @return [Array<Hash>, nil] the accumulated params, or nil when the
    #   response is not successful
    def resolve_params
      endpoint = @base_resource + "/resolve_grid"
      resp = Cnvrg::API.request(endpoint, "POST", {hyper_search: @content})
      return nil unless Cnvrg::CLI.is_response_success(resp, false)

      resp['result']['params'].each do |param|
        name = param.first.keys.first
        joined_values = param.map { |option| option.values }.flatten.join(',')
        @params << {key: name, value: joined_values}
      end
      @params
    end
  end
end
@@ -0,0 +1,113 @@
1
+ module Cnvrg
2
+ class Image
3
+ # attr_reader :image_name, :image_tag, :is_docker, :project_slug, :commit_id, :owner, :port, :image_slug
4
+
5
+
6
# Loads the owner and username from ~/.cnvrg/config.yml.
#
# When the configuration cannot be loaded, owner and username fall back to
# empty strings and "cnvrg is not configured" is logged through the CLI (when
# the CLI itself could be created).
#
# @param image_id identifier of the image to build
def initialize(image_id)
  # Set first so the id is available even when loading the configuration fails.
  @image_id = image_id
  begin
    @cli = Cnvrg::CLI.new
    home_dir = File.expand_path('~')
    config = YAML.load_file(home_dir + "/.cnvrg/config.yml").to_h
    @owner = config[:owner]
    @username = config[:username]
  rescue => e
    @owner = ""
    @username = ""
    # @cli is nil when Cnvrg::CLI.new itself raised; don't fail again in the handler.
    @cli.log_message("cnvrg is not configured") if @cli
  end
end
20
+
21
# Builds and pushes the docker image described by the server.
#
# Steps: write "Dockerfile-<image_id>" (generated from a base image plus
# requirements.txt, or copied from the given docker file path), `docker build`,
# optional `docker login` (when CNVRG_IMAGE_BUILD_USERNAME and
# CNVRG_IMAGE_BUILD_PASSWORD are set), `docker push`, then report success.
# Any error is logged and reported to the server as a failed build.
#
# NOTE(review): Helpers::Executer as defined in helpers/executer.rb of this
# release accepts owner:/machine_activity:/poll_every:/job_id: and defines no
# #execute — confirm which Executer this method is meant to use.
def build
  image_data = get_image_data
  file_name = "Dockerfile-#{@image_id}"
  if image_data["reqs_file_path"].present? and image_data["from_image_name"]
    File.open(file_name, "w+") do |dockerfile|
      # puts (not write): each Dockerfile instruction must be on its own line.
      dockerfile.puts("FROM #{image_data["from_image_name"]}")
      dockerfile.puts("ADD requirements.txt requirements.txt")
      dockerfile.puts("RUN pip3 install -r requirements.txt")
    end
  else
    File.open(file_name, 'wb') do |file|
      # NOTE(review): Kernel#open on a server-supplied value can run commands
      # ("| cmd") and only reads URLs when open-uri is loaded — confirm the
      # expected kind of path before changing.
      source = open(image_data["docker_file_path"])
      begin
        file << source.read
      ensure
        source.close
      end
    end
  end

  docker_url = "#{image_data["docker_name"]}"
  command = {:type=>"notify",
             :title=>"docker build",
             :logs=>true,
             :before_execute_log=>"Building docker image",
             :timeout=>3600,
             :command=>"sudo docker build . -t #{docker_url} -f #{file_name}"}
  @executer = Helpers::Executer.new(project: @project, job_type: "image", job_id: @image_id, image: self)
  execute_build_step(command)

  if ENV["CNVRG_IMAGE_BUILD_USERNAME"].present? and ENV["CNVRG_IMAGE_BUILD_PASSWORD"].present?
    registry = ENV["CNVRG_IMAGE_BUILD_REGISTRY"]
    login_target = registry.present? ? "#{registry} " : ""
    # NOTE(review): the password is passed on the command line; consider --password-stdin.
    command = {:type=>"notify",
               :no_stdout => true,
               :title=>"docker login",
               :logs=>true,
               :command=>"sudo docker login #{login_target}--username=#{ENV["CNVRG_IMAGE_BUILD_USERNAME"]} --password=\"#{ENV["CNVRG_IMAGE_BUILD_PASSWORD"]}\""}
    execute_build_step(command)
  end

  command = {:type=>"notify",
             :title=>"docker push",
             :logs=>true,
             :before_execute_log=>"Pushing docker image",
             :timeout=>3600,
             :command=>"sudo docker push #{docker_url}"}
  execute_build_step(command)
  post_build_update(true)
rescue => e
  @cli.log_message("Image Build failed")
  post_build_update(false, e.message)
end

# Runs one build command through @executer and raises StandardError carrying
# the joined logs when its exit status is not 0.
def execute_build_step(command)
  exit_status, output, errors, _, _ = @executer.execute(command)
  all_logs = join_logs(output, errors)
  raise StandardError.new(all_logs) if exit_status != 0
end
85
+
86
# Tells the server the build is starting and returns the "image" entry of
# its response.
def get_image_data
  start_url = "users/#{@owner}/images/#{@image_id}/image_start_build"
  response = Cnvrg::API.request(start_url, 'GET')
  CLI.is_response_success(response)
  response["image"]
end
91
+
92
# Reports the outcome of the build to the server and returns the "image"
# entry of its response.
#
# @param success whether the build succeeded
# @param message failure details, empty by default
def post_build_update(success, message = "")
  end_url = "users/#{@owner}/images/#{@image_id}/image_end_build"
  response = Cnvrg::API.request(end_url, 'POST', {success: success, message: message})
  CLI.is_response_success(response)
  response["image"]
end
97
+
98
+
99
# Posts log lines to the image's log endpoint in batches of 10, sleeping one
# second after each batch.
#
# @param logs a single log line or an (optionally nested) array of lines
def job_log(logs, level: 'info', step: nil, job_type: "image", job_id: @image_id)
  log_url = "users/#{@owner}/images/#{@image_id}/log"
  [logs].flatten.each_slice(10) do |batch|
    payload = {job_type: job_type, job_id: job_id, logs: batch, log_level: level, step: step, timestamp: Time.now}
    Cnvrg::API.request(log_url, "POST", payload)
    sleep(1)
  end
end
106
+
107
+
108
# Joins the :logs values of the output entries, then those of the error
# entries, into one space-separated string.
def join_logs(output, errors)
  [output, errors].map { |entries| entries.map { |entry| entry[:logs] }.join(" ") }.join(" ")
end
111
+
112
+ end
113
+ end