cnvrg 1.9.9.9.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/cnvrg +9 -0
  3. data/cnvrg.gemspec +47 -0
  4. data/lib/cnvrg.rb +7 -0
  5. data/lib/cnvrg/Images.rb +351 -0
  6. data/lib/cnvrg/api.rb +247 -0
  7. data/lib/cnvrg/api_v2.rb +14 -0
  8. data/lib/cnvrg/auth.rb +79 -0
  9. data/lib/cnvrg/cli.rb +5715 -0
  10. data/lib/cnvrg/cli/flow.rb +166 -0
  11. data/lib/cnvrg/cli/library_cli.rb +33 -0
  12. data/lib/cnvrg/cli/subcommand.rb +28 -0
  13. data/lib/cnvrg/cli/task.rb +116 -0
  14. data/lib/cnvrg/colors.rb +8 -0
  15. data/lib/cnvrg/connect_job_ssh.rb +31 -0
  16. data/lib/cnvrg/data.rb +335 -0
  17. data/lib/cnvrg/datafiles.rb +1325 -0
  18. data/lib/cnvrg/dataset.rb +892 -0
  19. data/lib/cnvrg/downloader/client.rb +101 -0
  20. data/lib/cnvrg/downloader/clients/azure_client.rb +45 -0
  21. data/lib/cnvrg/downloader/clients/gcp_client.rb +50 -0
  22. data/lib/cnvrg/downloader/clients/s3_client.rb +78 -0
  23. data/lib/cnvrg/experiment.rb +209 -0
  24. data/lib/cnvrg/files.rb +1047 -0
  25. data/lib/cnvrg/flow.rb +137 -0
  26. data/lib/cnvrg/helpers.rb +422 -0
  27. data/lib/cnvrg/helpers/agent.rb +188 -0
  28. data/lib/cnvrg/helpers/executer.rb +213 -0
  29. data/lib/cnvrg/hyper.rb +21 -0
  30. data/lib/cnvrg/image.rb +113 -0
  31. data/lib/cnvrg/image_cli.rb +25 -0
  32. data/lib/cnvrg/job_cli.rb +73 -0
  33. data/lib/cnvrg/job_ssh.rb +48 -0
  34. data/lib/cnvrg/logger.rb +111 -0
  35. data/lib/cnvrg/org_helpers.rb +5 -0
  36. data/lib/cnvrg/project.rb +822 -0
  37. data/lib/cnvrg/result.rb +29 -0
  38. data/lib/cnvrg/runner.rb +49 -0
  39. data/lib/cnvrg/ssh.rb +94 -0
  40. data/lib/cnvrg/storage.rb +128 -0
  41. data/lib/cnvrg/task.rb +165 -0
  42. data/lib/cnvrg/version.rb +3 -0
  43. metadata +460 -0
@@ -0,0 +1,188 @@
# Represents a single command in the system.
# It runs under an executer (machine_activity), so it receives the executer and
# uses it for the activity url, the slave container id and log metadata.
class Cnvrg::Helpers::Agent

  # Lifecycle states sent to the server together with log updates.
  module Status
    STARTED = :started
    RUNNING = :running
    FINISHED = :finished
    # A String (not a Symbol): Executer#execute_cmds compares it with the
    # "status" value read from a server response.
    ABORTED = "aborted"
  end

  # Levels understood by #log_internal. PURE lines are printed as-is,
  # every other level is wrapped in a JSON envelope.
  module LogLevel
    INFO = :info
    PURE = :pure
    ERROR = :error
  end

  # @param executer [Cnvrg::Helpers::Executer] owner of this command
  # @param slug [String] command identifier, used in the report url and in logs
  # @param command [String] shell command to run
  # @param container_name [String, nil] "slave" (case-insensitive) wraps the command in `docker exec`
  # @param send_log_interval [Numeric, nil] seconds between periodic log uploads; blank disables them
  # @param timeout [Numeric, nil] seconds before the command is interrupted; blank/negative means no limit
  # @param logs_regex [Array<String>] only stdout lines matching one of these are uploaded (unless send_logs)
  # @param send_logs [Boolean] upload every stdout line, ignoring logs_regex
  # @param files_exist [Array<String>] paths that must all exist for the command to run
  # @param retries [Integer, nil] how many times to re-run a failed command
  # @param sleep_before_retry [Numeric] seconds to wait before each retry
  # @param single_quotes [Boolean] wrap the in-container command with single instead of double quotes
  # @param docker_user [String, nil] passed to `docker exec --user`
  # @param use_bash [Boolean] run through `bash -l` instead of `sh`
  def initialize(executer: nil, slug: nil, command: nil, container_name: nil, send_log_interval: 60, timeout: -1, logs_regex: [], async: false, send_logs: false, files_exist: [], retries: 0, sleep_before_retry: 30, single_quotes: false, docker_user: nil, use_bash: false, **kwargs)
    @executer = executer
    @slug = slug
    @files_exist = files_exist
    @container_name = container_name
    # to_s: container_name defaults to nil, which has no #downcase
    @run_in_slave = @container_name.to_s.downcase == "slave"
    @log_interval = send_log_interval
    # https://ruby-doc.org/stdlib-2.5.1/libdoc/timeout/rdoc/Timeout.html timeout should be 0 for running forever
    if timeout.blank? || timeout.negative?
      @timeout = 0
    else
      @timeout = timeout
    end
    @logs_regex = logs_regex || []
    @async = async
    @command = command
    @send_logs = send_logs
    ## How many times the user asked to try to execute the command again.
    ## Falls back to 0 so a nil value cannot reach the arithmetic in #retry_command.
    @retries = retries.try(:to_i) || 0
    @sleep_before_retry = sleep_before_retry
    @real_execution_retries = 0 ## How many times the command really executed until success
    @single_quotes = single_quotes
    @docker_user = ""
    @shell_type = use_bash ? "bash -l" : "sh"
    if docker_user.present?
      @docker_user = " --user #{docker_user}"
    end
    if @run_in_slave
      quote = @single_quotes ? "'" : "\""
      @command = "docker exec #{@docker_user} -it #{@executer.slave_id} #{@shell_type} -c #{quote}#{@command}#{quote}"
    end
    @output = []
    @errors = []
    @exit_status = nil
    @is_running = true
    @pid = nil
  end

  # Relative API path used to report on this command.
  def base_url
    [@executer.activity_url, "commands", @slug].join("/")
  end

  # @return [Boolean] false (after logging the first missing path) when any of
  #   the required files does not exist, true otherwise
  def should_run?
    return true unless @files_exist.present?
    # File.exist? - File.exists? is deprecated and was removed in Ruby 3.2
    missing_file = @files_exist.find { |file| !File.exist?(file) }
    return true if missing_file.blank?
    log_internal("Can't find file #{missing_file}, stopping the job")
    false
  end

  # Runs the command (when its preconditions hold) and reports the result.
  # Exit status 127 is reported when a required file is missing.
  def exec!
    log_internal("Command: #{@command} with slug: #{@slug} started!")
    if should_run?
      send_logs(status: Status::STARTED)
      periodic_thread
      execute_command
    else
      @exit_status = 127
    end
    finish_log = "Command: #{@command} with slug: #{@slug} finished"
    finish_log += " after #{@real_execution_retries} retries" if @real_execution_retries > 0
    log_internal(finish_log)
    send_logs(exit_status: @exit_status, status: Status::FINISHED)
  end

  # Drains the buffered output and error entries.
  # @return [Array(Array<Hash>, Array<Hash>)] [logs, errors] in arrival order
  def get_logs_to_send
    new_logs = @output.pop(@output.length)
    new_errors = @errors.pop(@errors.length)
    [new_logs, new_errors]
  end

  # Background thread that uploads buffered logs every @log_interval seconds
  # for as long as no exit status has been recorded.
  def periodic_thread
    Thread.new do
      while @exit_status.blank?
        Thread.exit if @log_interval.blank?
        sleep(@log_interval)
        send_logs
      end
    end
  end

  # Consumes one retry, waits @sleep_before_retry seconds and runs the command again.
  def retry_command
    @retries -= 1
    sleep @sleep_before_retry
    @real_execution_retries += 1
    execute_command
  end

  # Spawns the command on a pty, buffers every output line and records the
  # exit status. On timeout the process is signalled and 124 is recorded.
  # A non-zero status triggers #retry_command while retries remain.
  def execute_command
    process_status = nil
    Timeout.timeout(@timeout) do
      # PTY.spawn yields only (reader, writer, pid); the child's stderr is
      # attached to the same pty, so it arrives on the reader as well.
      PTY.spawn(@command) do |stdout, _stdin, pid|
        @pid = pid
        begin
          stdout.each do |line|
            log_internal(line, level: LogLevel::PURE)
            line = line.strip.gsub(/\e\[([;\d]+)?m/, '') # strip ANSI color codes
            @output << {log: line, timestamp: Time.now}
          end
        rescue Errno::EIO
          # Raised on Linux when the child closes its side of the pty: this is
          # the normal end of output. Fall through so the child is still waited
          # for below (leaving the block here would skip collecting its status).
        rescue => e
          log_internal(e.message, level: LogLevel::ERROR)
          log_internal(e.backtrace.join("\n"), level: LogLevel::ERROR)
          @errors << {log: e.message, timestamp: Time.now}
        end
        _, process_status = ::Process.wait2(pid)
      end
    end
    @exit_status = process_status.exitstatus
  rescue Timeout::Error
    terminate_command
    @errors << {log: "Command timed out!", timestamp: Time.now}
    log_internal("Command timed out!", level: LogLevel::ERROR)
    @exit_status = 124
  ensure
    retry_command if @retries != 0 && @exit_status != 0
    @exit_status
  end

  private

  # Asks the spawned process to stop. Signal 0 would only probe for the
  # process' existence, so TERM is sent; a process that is already gone is ignored.
  def terminate_command
    ::Process.kill("TERM", @pid) if @pid
  rescue Errno::ESRCH
    # the process exited on its own before the signal was delivered
  end

  # Uploads the buffered logs together with the given state.
  # Does nothing when there is nothing new to report for a running command.
  def send_logs(exit_status: nil, status: Status::RUNNING)
    logs, error_logs = get_logs_to_send
    # Filter logs only if not failed
    if exit_status.blank? || exit_status == 0
      logs = filter_logs_by_regex(logs)
    end
    ### there is no logs, no exit_status and status is running.
    ### this condition let us call "send_logs" every interval iteration.
    if logs.blank? && error_logs.blank? && exit_status.blank? && status == Status::RUNNING
      return
    end
    Cnvrg::API.request(base_url, 'PUT', {logs: logs, error_logs: error_logs, exit_status: exit_status, status: status, execution_retries: @real_execution_retries, pid: @pid})
  end

  # Prints a log line to stdout: raw for LogLevel::PURE, otherwise as a JSON
  # object carrying level, timestamp, command slug and machine activity.
  def log_internal(log, level: LogLevel::INFO)
    if level == LogLevel::PURE
      puts(log)
    else
      puts({log: log, level: level, timestamp: Time.now, command: @slug, machine_activity: @executer.machine_activity}.to_json)
    end
    STDOUT.flush
  end

  # Keeps every entry when @send_logs is set, otherwise only the entries whose
  # text matches at least one of the @logs_regex patterns.
  def filter_logs_by_regex(logs)
    logs.select do |log|
      next true if @send_logs
      @logs_regex.any? { |regexp_str| Regexp.new(regexp_str).match?(log[:log]) }
    end
  end
end
@@ -0,0 +1,213 @@
1
+ require 'cnvrg/helpers/agent'
### This class represents a machine_activity. It polls the server for commands,
### runs each one through Cnvrg::Helpers::Agent in a forked process, and lets
### the server know the status of this executer.
class Cnvrg::Helpers::Executer
  attr_reader :machine_activity, :agent_id, :slave_id

  # @param owner [String] user segment of the activity url
  # @param machine_activity [String] machine activity slug
  # @param poll_every [Numeric] seconds between two polls for new commands
  # @param job_id [String] used to locate this job's containers in `docker ps`
  def initialize(owner: nil, machine_activity: nil, poll_every: 30, job_id: nil)
    @owner = owner
    @job_id = job_id
    @poll_every = poll_every
    @machine_activity = machine_activity
    @commands_q = Queue.new
    @files_q = Queue.new
    @agent_id = nil
    @slave_id = nil
  end

  # Writes content to path, creating parent directories and expanding "~".
  def create_file_cmd(path, content)
    path = File.expand_path(path) if path.include? "~"
    FileUtils.mkdir_p(File.dirname(path))
    File.open(path, "w+") { |f| f.write(content) }
  end

  # @param files [Hash, nil] path => content pairs to write to disk
  def handle_files(files)
    (files || {}).each do |path, content|
      create_file_cmd(path, content)
    end
  end

  # Relative API path of this machine activity.
  def activity_url
    ['users', @owner, 'machine_activities', @machine_activity].join("/")
  end

  # Collects (once, then memoized) facts about the pod, the agent container
  # and the slave container. Also sets @agent_id / @slave_id.
  def executer_stats
    return @stats if @stats.present?
    Cnvrg::Logger.log_info("getting containers")
    @agent_id, @slave_id = containers
    Cnvrg::Logger.log_info("got containers")
    pod_name, node_name = get_node_and_pod_names
    @stats = {
      pod_name: pod_name,
      node_name: node_name,
      agent: {
        container_id: @agent_id,
        workdir: `pwd`.strip,
        homedir: current_homedir,
        user: `whoami`.strip,
        user_id: `id -u`.strip,
        group_id: `id -g`.strip,
        cnvrg: Cnvrg::VERSION
      },
      slave: {
        container_id: @slave_id,
        workdir: run_in_slave('pwd'),
        homedir: slave_homedir,
        spark_path: spark_path,
        user: run_in_slave('whoami'),
        cnvrg: run_in_slave('which cnvrg'),
        has_bash: run_in_slave('which bash'),
        user_id: run_in_slave('id -u'),
        group_id: run_in_slave('id -g'),
        python_version: run_in_slave('python --version'),
        python3_version: run_in_slave('python3 --version'),
        pip_version: run_in_slave('pip --version'),
        pip3_version: run_in_slave('pip3 --version')
      },
    }
    @stats
  end

  # Looks up the agent and slave container ids in `docker ps`, retrying every
  # 5 seconds until both are found (there is no upper bound on the wait).
  # @return [Array(String, String)] [agent_id, slave_id]
  def containers
    loop do
      grep_by = ENV['KUBERNETES_PORT'].present? ? "$(hostname)" : @job_id
      cntrs = `docker ps --format "table {{.ID}},{{.Names}}" | grep -i #{grep_by}`.split("\n").map(&:strip)
      agent_id = container_id_for(cntrs, "agent")
      slave_id = container_id_for(cntrs, "slave")
      # return immediately on success; only a failed lookup waits before retrying
      return [agent_id, slave_id] if agent_id.present? && slave_id.present?
      sleep(5)
    end
  end

  # Home directory of the current process' user, nil when HOME is not set.
  def current_homedir
    ENV["HOME"]
  end

  def spark_path
    run_in_slave("env | grep SPARK_HOME").strip.split("=").try(:last)
  end

  def slave_homedir()
    run_in_slave("env | grep -w HOME").split("=").try(:last)
  end

  # @return [Array<Array<String>>] the slave container's environment as
  #   [name, value] pairs
  def slave_env
    # limit 2: a value may itself contain "="
    run_in_slave("env").split("\n").map { |x| x.split("=", 2) }
  end

  # Runs command with `sh -c` inside the slave container and returns its
  # stripped stdout. The command is interpolated inside single quotes, so it
  # must not contain single quotes itself.
  def run_in_slave(command)
    `docker exec -i #{@slave_id} sh -c '#{command}'`.strip
  end

  # Asks the server for pending work: writes the returned files to disk and
  # queues the returned commands. Errors are logged, never raised.
  def poll
    resp = Cnvrg::API.request([activity_url, "commands"].join('/'), "POST")
    handle_files(resp["files"])
    (resp["commands"] || []).each { |cmd| @commands_q.push(cmd) }
  rescue => e
    Cnvrg::Logger.log_error(e)
  end

  # Reports the executer stats to the server. When the server answers with a
  # different machine activity, persists it and switches to it.
  def init
    resp = Cnvrg::API.request(activity_url, "PUT", {stats: executer_stats})
    machine_activity = resp["machine_activity"]
    Cnvrg::Logger.log_info("Got back machine activity #{machine_activity}")
    if machine_activity.present? && @machine_activity != machine_activity
      Cnvrg::Logger.log_info("Changing to machine activity #{machine_activity}")
      machine_activity_yml = {slug: machine_activity}
      File.open("/conf/.machine_activity.yml", "w+") { |f| f.write machine_activity_yml.to_yaml }
      @machine_activity = machine_activity
    end
  rescue => e
    Cnvrg::Logger.log_error(e)
  end

  # Polls forever, sleeping @poll_every seconds between polls.
  def polling_thread
    loop do
      poll
      sleep(@poll_every)
    end
  end

  # Entry point: registers the executer, starts polling in a background thread
  # and executes queued commands on the calling thread.
  def main_thread
    init
    Thread.new do
      polling_thread
    end
    execute_cmds
  end

  # Endless loop: takes commands off the queue and runs each in a forked
  # process. Commands the server reports as aborted are skipped. Commands
  # without a truthy :async are waited for; async ones are detached.
  # Never returns.
  def execute_cmds
    loop do
      if @commands_q.empty?
        sleep(5)
        next
      end
      cmd = @commands_q.pop.symbolize_keys
      command_json = Cnvrg::API.request([activity_url, "commands", cmd[:slug]].join('/'), "GET")

      # any failure to read the status (e.g. a nil response) counts as "not aborted"
      cmd_status = command_json["status"] rescue ""

      if cmd_status == Cnvrg::Helpers::Agent::Status::ABORTED
        Cnvrg::Logger.log_info("stopping job because command #{cmd[:slug]} with status #{cmd_status}")
        next
      end
      pid = Process.fork do
        Cnvrg::Helpers::Agent.new(executer: self, **cmd).exec!
      end
      if cmd[:async].blank?
        Process.waitpid(pid)
      else
        Process.detach(pid)
      end
    end
  end

  # Groups log entries by timestamp and joins the texts of each group with newlines.
  # @param logs [Array<Hash>] entries with :timestamp and :log
  # @return [Array<Hash>] entries with :timestamp (String) and :logs (String)
  def merge_log_block(logs)
    logs.group_by { |log| log[:timestamp].to_s }
        .map { |ts, logz| {timestamp: ts, logs: logz.map { |l| l[:log] }.join("\n")} }
  end

  # @return [Array(String, String)] [pod_name, node_name]; pod_name is the
  #   hostname, node_name is read from `kubectl get pod` and is nil when it
  #   cannot be determined
  def get_node_and_pod_names
    pod_name = `hostname`.strip rescue nil
    node_name = nil
    if pod_name.present?
      pod_describe = `kubectl -n cnvrg get pod #{pod_name} -o json` rescue nil
      pod_describe = JSON.parse(pod_describe) rescue {}
      node_name = pod_describe["spec"]["nodeName"] rescue nil
    end
    [pod_name, node_name]
  end

  # Sends the kubernetes events of this pod and its node to the server.
  def pre_pod_stop
    pod_name, node_name = get_node_and_pod_names
    pod_events = get_pod_events(pod_name)
    node_events = get_node_events(node_name)
    Cnvrg::API.request([activity_url, "job_events"].join('/'), "POST", {pod_events: pod_events, node_events: node_events})
  end

  # @return [String, nil] raw JSON output of kubectl, nil for a blank pod name
  def get_pod_events(pod_name)
    return if pod_name.blank?
    `kubectl get event --namespace cnvrg --field-selector involvedObject.name=#{pod_name} -o json`
  end

  # @return [String, nil] raw JSON output of kubectl, nil for a blank node name
  def get_node_events(node_name)
    return if node_name.blank?
    `kubectl get event --all-namespaces --field-selector involvedObject.name=#{node_name} -o json`
  end

  private

  # Picks the id out of the first "id,name" line whose text includes name_part.
  # @return [String, nil]
  def container_id_for(container_lines, name_part)
    container_lines.find { |line| line.include?(name_part) }&.split(",")&.first
  end
end
@@ -0,0 +1,21 @@
module Cnvrg
  # Resolves a hyper-parameter search definition (a YAML file) into concrete
  # parameters by asking the server.
  class Hyper
    # @param project_path [String] path handed to Cnvrg::Project.new
    # @param path [String] path of the YAML file describing the search
    def initialize(project_path, path)
      @project = Cnvrg::Project.new(project_path)
      @content = YAML.load_file(path)
      @base_resource = "users/#{@project.owner}"
      @params = []
    end

    # Posts the search definition to "resolve_grid" and converts the answer.
    # Each entry of the response's params is a list of single-key hashes; it
    # becomes {key: <first key>, value: <all values joined with ",">}.
    #
    # The result is rebuilt on every call, so calling this twice does not
    # accumulate duplicate entries.
    #
    # @return [Array<Hash>, nil] the resolved params, nil when the request failed
    def resolve_params
      resp = Cnvrg::API.request(@base_resource + "/resolve_grid", "POST", {hyper_search: @content})
      return nil unless Cnvrg::CLI.is_response_success(resp, false)
      # a successful response without params resolves to an empty list
      grid = resp.dig('result', 'params') || []
      @params = grid.map do |param|
        {key: param.first.keys.first, value: param.map { |p| p.values }.flatten.join(',')}
      end
    end
  end
end
@@ -0,0 +1,113 @@
module Cnvrg
  # Builds a docker image described by the server, pushes it and reports the
  # outcome back.
  class Image
    # attr_reader :image_name, :image_tag, :is_docker, :project_slug, :commit_id, :owner, :port, :image_slug

    # Reads owner/username from ~/.cnvrg/config.yml. When the config cannot be
    # read both are set to "" and a message is logged.
    # @param image_id [String] id of the image to build
    def initialize(image_id)
      # set first so the id is available even when the config cannot be read
      @image_id = image_id
      @cli = Cnvrg::CLI.new
      begin
        home_dir = File.expand_path('~')
        config = YAML.load_file(home_dir+"/.cnvrg/config.yml")
        @owner = config.to_h[:owner]
        @username = config.to_h[:username]
      rescue => e
        @owner = ""
        @username = ""
        @cli.log_message("cnvrg is not configured")
      end
    end

    # Writes "Dockerfile-<image_id>", then runs docker build, an optional
    # docker login (when CNVRG_IMAGE_BUILD_USERNAME and
    # CNVRG_IMAGE_BUILD_PASSWORD are set) and docker push. The outcome is
    # reported through #post_build_update; any error is reported as a failed
    # build with the error message.
    #
    # NOTE(review): the keywords passed to Helpers::Executer.new and the
    # #execute call do not match the Cnvrg::Helpers::Executer in
    # helpers/executer.rb, and @project is never assigned in this class -
    # confirm which executer this is meant to use.
    def build
      image_data = get_image_data
      file_name = "Dockerfile-#{@image_id}"
      write_dockerfile(file_name, image_data)
      docker_url = "#{image_data["docker_name"]}"
      @executer = Helpers::Executer.new(project: @project, job_type: "image", job_id: @image_id, image: self)
      run_build_step({:type=>"notify",
                      :title=>"docker build",
                      :logs=>true,
                      :before_execute_log=>"Building docker image",
                      :timeout=>3600,
                      :command=>"sudo docker build . -t #{docker_url} -f #{file_name}"})
      if ENV["CNVRG_IMAGE_BUILD_USERNAME"].present? && ENV["CNVRG_IMAGE_BUILD_PASSWORD"].present?
        run_build_step(docker_login_command)
      end
      run_build_step({:type=>"notify",
                      :title=>"docker push",
                      :logs=>true,
                      :before_execute_log=>"Pushing docker image",
                      :timeout=>3600,
                      :command=>"sudo docker push #{docker_url}"})
      post_build_update(true)
    rescue => e
      @cli.log_message("Image Build failed")
      post_build_update(false, e.message)
    end

    # Fetches the build description of this image from the server.
    # @return [Hash] the "image" part of the response
    def get_image_data
      response = Cnvrg::API.request("users/#{@owner}/images/#{@image_id}/image_start_build", 'GET')
      CLI.is_response_success(response)
      return response["image"]
    end

    # Reports the build outcome to the server.
    # @param success [Boolean]
    # @param message [String] failure details
    # @return [Hash] the "image" part of the response
    def post_build_update(success, message = "")
      response = Cnvrg::API.request("users/#{@owner}/images/#{@image_id}/image_end_build", 'POST', {success: success, message: message})
      CLI.is_response_success(response)
      return response["image"]
    end

    # Uploads logs in batches of 10, pausing one second after each batch.
    # @param logs [String, Array<String>]
    def job_log(logs, level: 'info', step: nil, job_type: "image", job_id: @image_id)
      [logs].flatten.each_slice(10) do |temp_logs|
        Cnvrg::API.request("users/#{@owner}/images/#{@image_id}/log", "POST", {job_type: job_type, job_id: job_id, logs: temp_logs, log_level: level, step: step, timestamp: Time.now})
        sleep(1)
      end
    end

    # Joins the :logs texts of all output entries, then of all error entries,
    # separated by single spaces.
    # @param output [Array<Hash>]
    # @param errors [Array<Hash>]
    # @return [String]
    def join_logs(output, errors)
      output.map { |o| o[:logs] }.join(" ") + " " + errors.map { |o| o[:logs] }.join(" ")
    end

    private

    # Creates the Dockerfile: generated from the base image plus
    # requirements.txt when both are given, otherwise copied from
    # image_data["docker_file_path"].
    def write_dockerfile(file_name, image_data)
      if image_data["reqs_file_path"].present? && image_data["from_image_name"]
        # one instruction per line - docker cannot parse them when concatenated
        File.open(file_name, "w+") do |dockerfile|
          dockerfile.puts("FROM #{image_data["from_image_name"]}")
          dockerfile.puts("ADD requirements.txt requirements.txt")
          dockerfile.puts("RUN pip3 install -r requirements.txt")
        end
      else
        # NOTE(review): Kernel#open is kept because the path may be something
        # only it can open (e.g. a URL through open-uri) - confirm, and make
        # sure the value is trusted, since Kernel#open also runs "|command".
        source = open(image_data["docker_file_path"]) { |io| io.read }
        File.open(file_name, 'wb') { |file| file << source }
      end
    end

    # Command hash for `docker login`, including the registry when
    # CNVRG_IMAGE_BUILD_REGISTRY is set.
    # NOTE(review): the password is passed on the command line, where it is
    # visible to other users of the machine.
    def docker_login_command
      registry = ENV["CNVRG_IMAGE_BUILD_REGISTRY"]
      target = registry.present? ? "#{registry} " : ""
      {:type=>"notify",
       :no_stdout => true,
       :title=>"docker login",
       :logs=>true,
       :command=>"sudo docker login #{target}--username=#{ENV["CNVRG_IMAGE_BUILD_USERNAME"]} --password=\"#{ENV["CNVRG_IMAGE_BUILD_PASSWORD"]}\""}
    end

    # Runs one command through the executer and raises StandardError carrying
    # the joined logs when it exits with a non-zero status.
    def run_build_step(command)
      exit_status, output, errors, _, _ = @executer.execute(command)
      raise StandardError, join_logs(output, errors) if exit_status != 0
    end

  end
end