ood_core 0.12.0 → 0.16.0

data/lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb ADDED
@@ -0,0 +1,9 @@
+ # An object that describes a submitted kubernetes job with extended information
+ class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
+   attr_reader :ood_connection_info
+
+   def initialize(ood_connection_info: {}, **options)
+     super(options)
+     @ood_connection_info = ood_connection_info
+   end
+ end
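A rough usage sketch (values are illustrative; the keyword arguments other than ood_connection_info are the usual OodCore::Job::Info fields):

    require 'ood_core'

    # ood_connection_info is whatever the kubernetes adapter extracted from
    # the running pod, e.g. the host and port to connect back to.
    info = OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(
      id: 'ood-job-123',
      status: :running,
      ood_connection_info: { host: '10.20.0.40', port: 8080 }
    )

    info.ood_connection_info  # => { host: '10.20.0.40', port: 8080 }
    info.id                   # => "ood-job-123" (inherited from Info)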
data/lib/ood_core/job/adapters/kubernetes/resources.rb ADDED
@@ -0,0 +1,82 @@
+ module OodCore::Job::Adapters::Kubernetes::Resources
+
+   class ConfigMap
+     attr_accessor :name, :files
+
+     def initialize(name, files)
+       @name = name
+       @files = []
+       files.each do |f|
+         @files << ConfigMapFile.new(f)
+       end
+     end
+
+     def mounts?
+       @files.any? { |f| f.mount_path }
+     end
+
+     def init_mounts?
+       @files.any? { |f| f.init_mount_path }
+     end
+   end
+
+   class ConfigMapFile
+     attr_accessor :filename, :data, :mount_path, :sub_path, :init_mount_path, :init_sub_path
+
+     def initialize(data)
+       @filename = data[:filename]
+       @data = data[:data]
+       @mount_path = data[:mount_path]
+       @sub_path = data[:sub_path]
+       @init_mount_path = data[:init_mount_path]
+       @init_sub_path = data[:init_sub_path]
+     end
+   end
+
+   class Container
+     attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
+                   :restart_policy, :image_pull_secret, :supplemental_groups
+
+     def initialize(
+       name, image, command: [], port: nil, env: {}, memory: "4Gi", cpu: "1",
+       working_dir: "", restart_policy: "Never", image_pull_secret: nil, supplemental_groups: []
+     )
+       raise ArgumentError, "containers need valid names and images" unless name && image
+
+       @name = name
+       @image = image
+       @command = command.nil? ? [] : command
+       @port = port&.to_i
+       @env = env.nil? ? {} : env
+       @memory = memory.nil? ? "4Gi" : memory
+       @cpu = cpu.nil? ? "1" : cpu
+       @working_dir = working_dir.nil? ? "" : working_dir
+       @restart_policy = restart_policy.nil? ? "Never" : restart_policy
+       @image_pull_secret = image_pull_secret
+       @supplemental_groups = supplemental_groups.nil? ? [] : supplemental_groups
+     end
+
+     def ==(other)
+       name == other.name &&
+         image == other.image &&
+         command == other.command &&
+         port == other.port &&
+         env == other.env &&
+         memory == other.memory &&
+         cpu == other.cpu &&
+         working_dir == other.working_dir &&
+         restart_policy == other.restart_policy &&
+         image_pull_secret == other.image_pull_secret &&
+         supplemental_groups == other.supplemental_groups
+     end
+   end
+
+   class PodSpec
+     attr_accessor :container, :init_containers
+     def initialize(container, init_containers: nil)
+       @container = container
+       @init_containers = init_containers
+     end
+   end
+
+ end
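A minimal sketch of how these value objects compose (names, image, and paths are illustrative; the file hash keys mirror ConfigMapFile#initialize):

    K8s = OodCore::Job::Adapters::Kubernetes

    ctr = K8s::Resources::Container.new(
      'main', 'ruby:2.7',
      command: ['rails', 's'], port: 3000, env: { 'RAILS_ENV' => 'production' }
    )

    cfg = K8s::Resources::ConfigMap.new(
      'job-config',
      [{ filename: 'config.yml', data: "key: value\n", mount_path: '/etc/app' }]
    )

    cfg.mounts?       # => true  (a file has a mount_path)
    cfg.init_mounts?  # => false (no file has an init_mount_path)

    spec = K8s::Resources::PodSpec.new(ctr)
    spec.container.memory  # => "4Gi" (the default limit/request)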
data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb ADDED
@@ -0,0 +1,188 @@
+ apiVersion: v1
+ kind: Pod
+ metadata:
+   namespace: <%= namespace %>
+   name: <%= id %>
+   labels:
+     job: <%= id %>
+     app.kubernetes.io/name: <%= container.name %>
+     app.kubernetes.io/managed-by: open-ondemand
+   <%- if !script.accounting_id.nil? && script.accounting_id != "" -%>
+     account: <%= script.accounting_id %>
+   <%- end -%>
+   annotations:
+     <%- unless script.wall_time.nil? -%>
+     pod.kubernetes.io/lifetime: <%= helper.seconds_to_duration(script.wall_time) %>
+     <%- end -%>
+ spec:
+   restartPolicy: <%= spec.container.restart_policy %>
+   securityContext:
+     runAsUser: <%= run_as_user %>
+     runAsGroup: <%= run_as_group %>
+     runAsNonRoot: true
+     <%- if spec.container.supplemental_groups.empty? -%>
+     supplementalGroups: []
+     <%- else -%>
+     supplementalGroups:
+     <%- spec.container.supplemental_groups.each do |supplemental_group| -%>
+       - "<%= supplemental_group %>"
+     <%- end -%>
+     <%- end -%>
+     fsGroup: <%= fs_group %>
+   hostNetwork: false
+   hostIPC: false
+   hostPID: false
+   <%- unless spec.container.image_pull_secret.nil? -%>
+   imagePullSecrets:
+   - name: <%= spec.container.image_pull_secret %>
+   <%- end -%>
+   containers:
+   - name: "<%= spec.container.name %>"
+     image: <%= spec.container.image %>
+     imagePullPolicy: IfNotPresent
+     <%- unless spec.container.working_dir.empty? -%>
+     workingDir: "<%= spec.container.working_dir %>"
+     <%- end -%>
+     env:
+     - name: POD_NAME
+       valueFrom:
+         fieldRef:
+           fieldPath: metadata.name
+     <%- spec.container.env.each_pair do |name, value| -%>
+     - name: <%= name %>
+       value: "<%= value %>"
+     <%- end # for each env -%>
+     <%- unless spec.container.command.empty? -%>
+     command:
+     <%- spec.container.command.each do |cmd| -%>
+     - "<%= cmd %>"
+     <%- end # for each command -%>
+     <%- end # unless command is nil -%>
+     <%- unless spec.container.port.nil? -%>
+     ports:
+     - containerPort: <%= spec.container.port %>
+     <%- end -%>
+     <%- if configmap.mounts? || !all_mounts.empty? -%>
+     volumeMounts:
+     <%- configmap.files.each do |file| -%>
+     <%- next if file.mount_path.nil? -%>
+     - name: configmap-volume
+       mountPath: <%= file.mount_path %>
+       <%- unless file.sub_path.nil? -%>
+       subPath: <%= file.sub_path %>
+       <%- end # end unless file.sub_path.nil? -%>
+     <%- end # end configmap.files.each -%>
+     <%- all_mounts.each do |mount| -%>
+     - name: <%= mount[:name] %>
+       mountPath: <%= mount[:destination_path] %>
+     <%- end # for each mount -%>
+     <%- end # configmap mounts? and all_mounts not empty -%>
+     resources:
+       limits:
+         memory: "<%= spec.container.memory %>"
+         cpu: "<%= spec.container.cpu %>"
+       requests:
+         memory: "<%= spec.container.memory %>"
+         cpu: "<%= spec.container.cpu %>"
+     securityContext:
+       allowPrivilegeEscalation: false
+       capabilities:
+         drop:
+         - all
+       privileged: false
+   <%- unless spec.init_containers.nil? -%>
+   initContainers:
+   <%- spec.init_containers.each do |ctr| -%>
+   - name: "<%= ctr.name %>"
+     image: "<%= ctr.image %>"
+     env:
+     - name: POD_NAME
+       valueFrom:
+         fieldRef:
+           fieldPath: metadata.name
+     <%- ctr.env.each_pair do |name, value| -%>
+     - name: <%= name %>
+       value: "<%= value %>"
+     <%- end # for each env -%>
+     command:
+     <%- ctr.command.each do |cmd| -%>
+     - "<%= cmd %>"
+     <%- end # command loop -%>
+     <%- if configmap.init_mounts? || !all_mounts.empty? -%>
+     volumeMounts:
+     <%- configmap.files.each do |file| -%>
+     <%- next if file.init_mount_path.nil? -%>
+     - name: configmap-volume
+       mountPath: <%= file.init_mount_path %>
+       <%- unless file.init_sub_path.nil? -%>
+       subPath: <%= file.init_sub_path %>
+       <%- end # end unless file.sub_path.nil? -%>
+     <%- end # end configmap.files.each -%>
+     <%- all_mounts.each do |mount| -%>
+     - name: <%= mount[:name] %>
+       mountPath: <%= mount[:destination_path] %>
+     <%- end # for each mount -%>
+     <%- end # if config_map init mounts and all_mounts not empty -%>
+     securityContext:
+       allowPrivilegeEscalation: false
+       capabilities:
+         drop:
+         - all
+       privileged: false
+   <%- end # init container loop -%>
+   <%- end # if init containers -%>
+   <%- unless (configmap.to_s.empty? && all_mounts.empty?) -%>
+   volumes:
+   <%- unless configmap.to_s.empty? -%>
+   - name: configmap-volume
+     configMap:
+       name: <%= configmap_name(id) %>
+   <%- end -%>
+   <%- all_mounts.each do |mount| -%>
+   <%- if mount[:type] == 'nfs' -%>
+   - name: <%= mount[:name] %>
+     nfs:
+       server: <%= mount[:host] %>
+       path: <%= mount[:path] %>
+   <%- elsif mount[:type] == 'host' -%>
+   - name: <%= mount[:name] %>
+     hostPath:
+       path: <%= mount[:path] %>
+       type: <%= mount[:host_type] %>
+   <%- end # if mount is [host,nfs] -%>
+   <%- end # for each mount -%>
+   <%- end # (configmap.to_s.empty? || all_mounts.empty?) -%>
+ ---
+ <%- unless spec.container.port.nil? -%>
+ apiVersion: v1
+ kind: Service
+ metadata:
+   name: <%= service_name(id) %>
+   namespace: <%= namespace %>
+   labels:
+     job: <%= id %>
+ spec:
+   selector:
+     job: <%= id %>
+   ports:
+   - protocol: TCP
+     port: 80
+     targetPort: <%= spec.container.port %>
+   type: NodePort
+ <%- end # end for service -%>
+ ---
+ <%- unless configmap.nil? -%>
+ apiVersion: v1
+ kind: ConfigMap
+ metadata:
+   name: <%= configmap_name(id) %>
+   namespace: <%= namespace %>
+   labels:
+     job: <%= id %>
+ data:
+   <%- configmap.files.each do |file| -%>
+   <%- next if file.data.nil? || file.filename.nil? -%>
+   <%= file.filename %>: |
+     <% config_data_lines(file.data).each do |line| %><%= line %><% end %>
+   <%- end # end for configmap files -%>
+ <%- end # end configmap.nil? %>
data/lib/ood_core/job/adapters/linux_host/launcher.rb CHANGED
@@ -16,7 +16,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
    # from
    class Error < StandardError; end
 
-   UNIT_SEPARATOR = "\x1F"
+   UNIT_SEPARATOR = ","
 
    # @param debug Whether the adapter should be used in debug mode
    # @param site_timeout [#to_i] A period after which the job should be killed or nil
@@ -80,12 +80,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
 
      call(*cmd, stdin: kill_cmd)
    rescue Error => e
-     raise e unless (
-       # The tmux server not running is not an error
-       e.message.include?('failed to connect to server') ||
-       # The session not being found is not an error
-       e.message.include?("session not found: #{session_name_label}")
-     )
+     interpret_and_raise(e)
    end
 
    def list_remote_sessions(host: nil)
@@ -166,7 +161,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
        'email_on_terminated' => script_email_on_event(script, 'terminated'),
        'email_on_start' => script_email_on_event(script, 'started'),
        'environment' => export_env(script),
-       'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
+       'error_path' => error_path(script),
        'job_name' => script.job_name.to_s,
        'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
        'script_content' => content,
@@ -176,6 +171,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
        'singularity_image' => singularity_image(script.native),
        'ssh_hosts' => ssh_hosts,
        'tmux_bin' => tmux_bin,
+       'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
      }.each{
        |key, value| bnd.local_variable_set(key, value)
      }
@@ -263,8 +259,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
        |session_hash| session_hash[:session_name].start_with?(session_name_label)
      }
    rescue Error => e
-     # The tmux server not running is not an error
-     raise e unless e.message.include?('failed to connect to server')
+     interpret_and_raise(e)
      []
    end
 
@@ -272,4 +267,24 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
      return false if script.content.empty?
      script.content.split("\n").first.start_with?('#!/')
    end
+
+   def error_path(script)
+     return script.error_path.to_s if script.error_path
+     return script.output_path.to_s if script.output_path
+
+     '/dev/null'
+   end
+
+   # Under some conditions tmux exits with status 1 even though there is no
+   # actual error: when the session is not found or when there are no
+   # sessions at all.
+   def interpret_and_raise(error)
+     if error.message.include?('failed to connect to server') # no sessions in tmux 1.8
+       nil
+     elsif error.message.include?('no server running on') # no sessions in tmux 2.7+
+       nil
+     else
+       raise error
+     end
+   end
  end
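The effect, sketched (launcher stands in for an instance of this class; the messages are the tmux strings matched above):

    # Benign tmux failures are swallowed; the method simply returns nil:
    launcher.send(:interpret_and_raise, Error.new('failed to connect to server'))
    launcher.send(:interpret_and_raise, Error.new('no server running on /tmp/tmux-1000/default'))

    # Anything else is re-raised to the caller:
    launcher.send(:interpret_and_raise, Error.new('unknown option -- Z'))
    # => raises the Error defined above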
data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh CHANGED
@@ -16,13 +16,9 @@ fi
  echo $hostname
 
  # Put the script into a temp file on localhost
- <% if debug %>
- singularity_tmp_file=$(mktemp -p "$HOME" --suffix '_sing')
- tmux_tmp_file=$(mktemp -p "$HOME" --suffix "_tmux")
- <% else %>
- singularity_tmp_file=$(mktemp)
- tmux_tmp_file=$(mktemp)
- <% end %>
+ singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
+ tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
+
 
  # Create an executable to run in a tmux session
  # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
  chmod +x "$singularity_tmp_file"
  chmod +x "$tmux_tmp_file"
  <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
-
- # Remove the file
- <% if ! debug %>
- # Wait 1 second to ensure that tmux session has started before the file is removed
- sleep 1
- rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
- <% end %>
data/lib/ood_core/job/adapters/slurm.rb CHANGED
@@ -80,6 +80,9 @@ module OodCore
        # from
        class Error < StandardError; end
 
+       # An error indicating the slurm command timed out
+       class SlurmTimeoutError < Error; end
+
        # @param cluster [#to_s, nil] the cluster name
        # @param conf [#to_s, nil] path to the slurm conf
        # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
            end
            jobs
          end
+       rescue SlurmTimeoutError
+         # TODO: could use a log entry here
+         return [{ id: id, state: 'undetermined' }]
        end
 
        def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore
 
          cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
          o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
-         s.success? ? o : raise(Error, e)
+         s.success? ? interpret_and_raise(o, e) : raise(Error, e)
+       end
+
+       # Helper function to raise an error based on the contents of stderr.
+       # Slurm exits 0 even when the command fails, so we need to interpret stderr
+       # to see if the command was actually successful.
+       def interpret_and_raise(stdout, stderr)
+         return stdout if stderr.empty?
+
+         raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
+
+         stdout
        end
 
        def squeue_attrs_for_info_attrs(attrs)
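In effect (a sketch; batch stands in for this adapter object, and the stderr text mirrors the pattern matched above):

    # Empty stderr: stdout passes through untouched.
    batch.send(:interpret_and_raise, "123|RUNNING\n", '')
    # => "123|RUNNING\n"

    # The squeue socket-timeout message raises SlurmTimeoutError, which the
    # rescue above turns into an 'undetermined' placeholder job.
    batch.send(:interpret_and_raise, '', 'slurm_load_jobs error: Socket timed out on send/recv operation')
    # => raises SlurmTimeoutError

    # Other stderr chatter is tolerated and stdout is still returned.
    batch.send(:interpret_and_raise, "123|RUNNING\n", 'some warning')
    # => "123|RUNNING\n"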
data/lib/ood_core/version.rb CHANGED
@@ -1,4 +1,4 @@
  module OodCore
    # The current version of {OodCore}
-   VERSION = "0.12.0"
+   VERSION = "0.16.0"
  end
data/ood_core.gemspec CHANGED
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
 
    spec.add_runtime_dependency "ood_support", "~> 0.0.2"
    spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
-   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "bundler", "~> 2.1"
+   spec.add_runtime_dependency "activesupport", ">= 5.2", "< 6.0"
    spec.add_development_dependency "rake", "~> 13.0.1"
    spec.add_development_dependency "rspec", "~> 3.0"
    spec.add_development_dependency "pry", "~> 0.10"
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: ood_core
  version: !ruby/object:Gem::Version
-   version: 0.12.0
+   version: 0.16.0
  platform: ruby
  authors:
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-08-05 00:00:00.000000000 Z
+ date: 2021-04-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: ood_support
@@ -52,14 +52,34 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.7'
+         version: '2.1'
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: '1.7'
+         version: '2.1'
+ - !ruby/object:Gem::Dependency
+   name: activesupport
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '5.2'
+     - - "<"
+       - !ruby/object:Gem::Version
+         version: '6.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '5.2'
+     - - "<"
+       - !ruby/object:Gem::Version
+         version: '6.0'
  - !ruby/object:Gem::Dependency
    name: rake
    requirement: !ruby/object:Gem::Requirement
@@ -140,9 +160,9 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".github/workflows/test.yml"
  - ".gitignore"
  - ".rspec"
- - ".travis.yml"
  - CHANGELOG.md
  - Gemfile
  - LICENSE.txt
@@ -163,8 +183,15 @@ files:
  - lib/ood_core/errors.rb
  - lib/ood_core/invalid_cluster.rb
  - lib/ood_core/job/adapter.rb
+ - lib/ood_core/job/adapters/ccq.rb
  - lib/ood_core/job/adapters/drmaa.rb
  - lib/ood_core/job/adapters/helper.rb
+ - lib/ood_core/job/adapters/kubernetes.rb
+ - lib/ood_core/job/adapters/kubernetes/batch.rb
+ - lib/ood_core/job/adapters/kubernetes/helper.rb
+ - lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb
+ - lib/ood_core/job/adapters/kubernetes/resources.rb
+ - lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
  - lib/ood_core/job/adapters/linux_host.rb
  - lib/ood_core/job/adapters/linux_host/launcher.rb
  - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -215,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
    - !ruby/object:Gem::Version
      version: '0'
  requirements: []
- rubygems_version: 3.0.3
+ rubygems_version: 3.1.2
  signing_key:
  specification_version: 4
  summary: Open OnDemand core library