ood_core 0.12.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
# An object that describes a submitted kubernetes job with extended information
class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
  # @return [Hash] the connection info handed to the constructor (empty hash by default)
  attr_reader :ood_connection_info

  # @param ood_connection_info [Hash] stored as given and exposed through the reader
  # @param options [Hash] every other keyword argument; forwarded to the parent initializer
  def initialize(ood_connection_info: {}, **options)
    # Forward the remaining options as keywords rather than as one positional
    # hash. Ruby 3 no longer converts a positional hash into keyword arguments.
    # NOTE(review): OodCore::Job::Info#initialize is not visible here; it
    # presumably takes keyword arguments (id:, status:, ...) - confirm.
    super(**options)
    @ood_connection_info = ood_connection_info
  end
end
@@ -0,0 +1,82 @@
1
+ module OodCore::Job::Adapters::Kubernetes::Resources
2
+
3
# A named set of files that the pod template renders into a ConfigMap.
class ConfigMap
  attr_accessor :name, :files

  # @param name [String] name of the configmap
  # @param files [Array<Hash>, nil] one entry per file; each entry is passed to
  #   ConfigMapFile.new. nil is treated as "no files" instead of raising.
  def initialize(name, files)
    @name = name
    @files = (files || []).map { |file| ConfigMapFile.new(file) }
  end

  # @return [Boolean] true when at least one file has a mount_path
  def mounts?
    @files.any?(&:mount_path)
  end

  # @return [Boolean] true when at least one file has an init_mount_path
  def init_mounts?
    @files.any?(&:init_mount_path)
  end
end
22
+
23
# One file entry of a ConfigMap together with its optional mount locations.
class ConfigMapFile
  attr_accessor :filename, :data, :mount_path, :sub_path, :init_mount_path, :init_sub_path

  # @param data [Hash] read with the symbol keys :filename, :data, :mount_path,
  #   :sub_path, :init_mount_path and :init_sub_path (a plain Hash yields nil
  #   for any key that is absent)
  def initialize(data)
    @filename, @data = data[:filename], data[:data]
    @mount_path, @sub_path = data[:mount_path], data[:sub_path]
    @init_mount_path, @init_sub_path = data[:init_mount_path], data[:init_sub_path]
  end
end
35
+
36
# Describes one container of a pod: image, command, port, environment,
# resource sizes and related pod-level settings.
class Container
  attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
                :restart_policy, :image_pull_secret, :supplemental_groups

  # Explicitly passing nil (or false) for an option that has a default falls
  # back to that default, so callers can forward optional configuration as is.
  #
  # @param name [String] container name (required)
  # @param image [String] container image (required)
  # @param command [Array, nil] defaults to []
  # @param port [#to_i, nil] converted with to_i when given, otherwise nil
  # @param env [Hash, nil] defaults to {}
  # @param memory [String, nil] defaults to "4Gi"
  # @param cpu [String, nil] defaults to "1"
  # @param working_dir [String, nil] defaults to ""
  # @param restart_policy [String, nil] defaults to "Never"
  # @param image_pull_secret [String, nil] stored as given
  # @param supplemental_groups [Array, nil] defaults to []
  # @raise [ArgumentError] when name or image is missing
  def initialize(
    name, image, command: [], port: nil, env: {}, memory: "4Gi", cpu: "1",
    working_dir: "", restart_policy: "Never", image_pull_secret: nil, supplemental_groups: []
  )
    raise ArgumentError, "containers need valid names and images" unless name && image

    @name = name
    @image = image
    @command = command || []
    @port = port&.to_i
    @env = env || {}
    @memory = memory || "4Gi"
    @cpu = cpu || "1"
    @working_dir = working_dir || ""
    @restart_policy = restart_policy || "Never"
    @image_pull_secret = image_pull_secret
    @supplemental_groups = supplemental_groups || []
  end

  # Attribute-wise equality.
  #
  # @param other [Object] anything; objects that are not a Container are never equal
  # @return [Boolean]
  def ==(other)
    # Guard first so comparing with nil or an unrelated object returns false
    # instead of raising NoMethodError.
    return false unless other.is_a?(Container)

    %i[
      name image command port env memory cpu working_dir
      restart_policy image_pull_secret supplemental_groups
    ].all? { |attribute| public_send(attribute) == other.public_send(attribute) }
  end
end
73
+
74
# Pairs a pod's main container with its optional init containers.
class PodSpec
  attr_accessor :container, :init_containers

  # @param container [Object] the main container, stored as given
  # @param init_containers [Array, nil] init containers; nil (the default) is
  #   kept as nil
  def initialize(container, init_containers: nil)
    @container, @init_containers = container, init_containers
  end
end
81
+
82
+ end
@@ -0,0 +1,188 @@
1
+ apiVersion: v1
2
+ kind: Pod
3
+ metadata:
4
+ namespace: <%= namespace %>
5
+ name: <%= id %>
6
+ labels:
7
+ job: <%= id %>
8
+ app.kubernetes.io/name: <%= container.name %>
9
+ app.kubernetes.io/managed-by: open-ondemand
10
+ <%- if !script.accounting_id.nil? && script.accounting_id != "" -%>
11
+ account: <%= script.accounting_id %>
12
+ <%- end # if accounting_id is set and non-empty -%>
13
+ annotations:
14
+ <%- unless script.wall_time.nil? -%>
15
+ pod.kubernetes.io/lifetime: <%= helper.seconds_to_duration(script.wall_time) %>
16
+ <%- end # unless wall_time is nil -%>
17
+ spec:
18
+ restartPolicy: <%= spec.container.restart_policy %>
19
+ securityContext:
20
+ runAsUser: <%= run_as_user %>
21
+ runAsGroup: <%= run_as_group %>
22
+ runAsNonRoot: true
23
+ <%- if spec.container.supplemental_groups.empty? -%>
24
+ supplementalGroups: []
25
+ <%- else -%>
26
+ supplementalGroups:
27
+ <%- spec.container.supplemental_groups.each do |supplemental_group| -%>
28
+ - "<%= supplemental_group %>"
29
+ <%- end # for each supplemental group -%>
30
+ <%- end # if supplemental_groups is empty -%>
31
+ fsGroup: <%= fs_group %>
32
+ hostNetwork: false
33
+ hostIPC: false
34
+ hostPID: false
35
+ <%- unless spec.container.image_pull_secret.nil? -%>
36
+ imagePullSecrets:
37
+ - name: <%= spec.container.image_pull_secret %>
38
+ <%- end # unless image_pull_secret is nil -%>
39
+ containers:
40
+ - name: "<%= spec.container.name %>"
41
+ image: <%= spec.container.image %>
42
+ imagePullPolicy: IfNotPresent
43
+ <%- unless spec.container.working_dir.empty? -%>
44
+ workingDir: "<%= spec.container.working_dir %>"
45
+ <%- end # unless working_dir is empty -%>
46
+ env:
47
+ - name: POD_NAME
48
+ valueFrom:
49
+ fieldRef:
50
+ fieldPath: metadata.name
51
+ <%- spec.container.env.each_pair do |name, value| -%>
52
+ - name: <%= name %>
53
+ value: "<%= value %>"
54
+ <%- end # for each env -%>
55
+ <%- unless spec.container.command.empty? -%>
56
+ command:
57
+ <%- spec.container.command.each do |cmd| -%>
58
+ - "<%= cmd %>"
59
+ <%- end # for each command -%>
60
+ <%- end # unless command is empty -%>
61
+ <%- unless spec.container.port.nil? -%>
62
+ ports:
63
+ - containerPort: <%= spec.container.port %>
64
+ <%- end # unless port is nil -%>
65
+ <%- if configmap.mounts? || !all_mounts.empty? -%>
66
+ volumeMounts:
67
+ <%- configmap.files.each do |file| -%>
68
+ <%- next if file.mount_path.nil? -%>
69
+ - name: configmap-volume
70
+ mountPath: <%= file.mount_path %>
71
+ <%- unless file.sub_path.nil? -%>
72
+ subPath: <%= file.sub_path %>
73
+ <%- end # end unless file.sub_path.nil? -%>
74
+ <%- end # end configmap.files.each -%>
75
+ <%- all_mounts.each do |mount| -%>
76
+ - name: <%= mount[:name] %>
77
+ mountPath: <%= mount[:destination_path] %>
78
+ <%- end # for each mount -%>
79
+ <%- end # if configmap.mounts? or all_mounts not empty -%>
80
+ resources:
81
+ limits:
82
+ memory: "<%= spec.container.memory %>"
83
+ cpu: "<%= spec.container.cpu %>"
84
+ requests:
85
+ memory: "<%= spec.container.memory %>"
86
+ cpu: "<%= spec.container.cpu %>"
87
+ securityContext:
88
+ allowPrivilegeEscalation: false
89
+ capabilities:
90
+ drop:
91
+ - all
92
+ privileged: false
93
+ <%- unless spec.init_containers.nil? -%>
94
+ initContainers:
95
+ <%- spec.init_containers.each do |ctr| -%>
96
+ - name: "<%= ctr.name %>"
97
+ image: "<%= ctr.image %>"
98
+ env:
99
+ - name: POD_NAME
100
+ valueFrom:
101
+ fieldRef:
102
+ fieldPath: metadata.name
103
+ <%- ctr.env.each_pair do |name, value| -%>
104
+ - name: <%= name %>
105
+ value: "<%= value %>"
106
+ <%- end # for each env -%>
107
+ command:
108
+ <%- ctr.command.each do |cmd| -%>
109
+ - "<%= cmd %>"
110
+ <%- end # command loop -%>
111
+ <%- if configmap.init_mounts? || !all_mounts.empty? -%>
112
+ volumeMounts:
113
+ <%- configmap.files.each do |file| -%>
114
+ <%- next if file.init_mount_path.nil? -%>
115
+ - name: configmap-volume
116
+ mountPath: <%= file.init_mount_path %>
117
+ <%- unless file.init_sub_path.nil? -%>
118
+ subPath: <%= file.init_sub_path %>
119
+ <%- end # end unless file.init_sub_path.nil? -%>
120
+ <%- end # end configmap.files.each -%>
121
+ <%- all_mounts.each do |mount| -%>
122
+ - name: <%= mount[:name] %>
123
+ mountPath: <%= mount[:destination_path] %>
124
+ <%- end # for each mount -%>
125
+ <%- end # if configmap.init_mounts? or all_mounts not empty -%>
126
+ securityContext:
127
+ allowPrivilegeEscalation: false
128
+ capabilities:
129
+ drop:
130
+ - all
131
+ privileged: false
132
+ <%- end # init container loop -%>
133
+ <%- end # unless init_containers is nil -%>
134
+ <%- unless (configmap.to_s.empty? && all_mounts.empty?) -%>
135
+ volumes:
136
+ <%- unless configmap.to_s.empty? -%>
137
+ - name: configmap-volume
138
+ configMap:
139
+ name: <%= configmap_name(id) %>
140
+ <%- end # unless configmap.to_s.empty? -%>
141
+ <%- all_mounts.each do |mount| -%>
142
+ <%- if mount[:type] == 'nfs' -%>
143
+ - name: <%= mount[:name] %>
144
+ nfs:
145
+ server: <%= mount[:host] %>
146
+ path: <%= mount[:path] %>
147
+ <%- elsif mount[:type] == 'host' -%>
148
+ - name: <%= mount[:name] %>
149
+ hostPath:
150
+ path: <%= mount[:path] %>
151
+ type: <%= mount[:host_type] %>
152
+ <%- end # if mount is [host,nfs] -%>
153
+ <%- end # for each mount -%>
154
+ <%- end # unless (configmap.to_s.empty? && all_mounts.empty?) -%>
155
+ ---
156
+ <%- unless spec.container.port.nil? -%>
157
+ apiVersion: v1
158
+ kind: Service
159
+ metadata:
160
+ name: <%= service_name(id) %>
161
+ namespace: <%= namespace %>
162
+ labels:
163
+ job: <%= id %>
164
+ spec:
165
+ selector:
166
+ job: <%= id %>
167
+ ports:
168
+ - protocol: TCP
169
+ port: 80
170
+ targetPort: <%= spec.container.port %>
171
+ type: NodePort
172
+ <%- end # unless port is nil: the Service is only rendered when the container has a port -%>
173
+ ---
174
+ <%- unless configmap.nil? -%>
175
+ apiVersion: v1
176
+ kind: ConfigMap
177
+ metadata:
178
+ name: <%= configmap_name(id) %>
179
+ namespace: <%= namespace %>
180
+ labels:
181
+ job: <%= id %>
182
+ data:
183
+ <%- configmap.files.each do |file| -%>
184
+ <%- next if file.data.nil? || file.filename.nil? -%>
185
+ <%= file.filename %>: |
186
+ <% config_data_lines(file.data).each do |line| %><%= line %><% end %>
187
+ <%- end # for each configmap file (entries without data or filename are skipped) -%>
188
+ <%- end # unless configmap.nil? %>
@@ -16,7 +16,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
16
16
  # from
17
17
  class Error < StandardError; end
18
18
 
19
- UNIT_SEPARATOR = "\x1F"
19
+ UNIT_SEPARATOR = ","
20
20
 
21
21
  # @param debug Whether the adapter should be used in debug mode
22
22
  # @param site_timeout [#to_i] A period after which the job should be killed or nil
@@ -80,12 +80,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
80
80
 
81
81
  call(*cmd, stdin: kill_cmd)
82
82
  rescue Error => e
83
- raise e unless (
84
- # The tmux server not running is not an error
85
- e.message.include?('failed to connect to server') ||
86
- # The session not being found is not an error
87
- e.message.include?("session not found: #{session_name_label}")
88
- )
83
+ interpret_and_raise(e)
89
84
  end
90
85
 
91
86
  def list_remote_sessions(host: nil)
@@ -166,7 +161,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
166
161
  'email_on_terminated' => script_email_on_event(script, 'terminated'),
167
162
  'email_on_start' => script_email_on_event(script, 'started'),
168
163
  'environment' => export_env(script),
169
- 'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
164
+ 'error_path' => error_path(script),
170
165
  'job_name' => script.job_name.to_s,
171
166
  'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
172
167
  'script_content' => content,
@@ -176,6 +171,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
176
171
  'singularity_image' => singularity_image(script.native),
177
172
  'ssh_hosts' => ssh_hosts,
178
173
  'tmux_bin' => tmux_bin,
174
+ 'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
179
175
  }.each{
180
176
  |key, value| bnd.local_variable_set(key, value)
181
177
  }
@@ -263,8 +259,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
263
259
  |session_hash| session_hash[:session_name].start_with?(session_name_label)
264
260
  }
265
261
  rescue Error => e
266
- # The tmux server not running is not an error
267
- raise e unless e.message.include?('failed to connect to server')
262
+ interpret_and_raise(e)
268
263
  []
269
264
  end
270
265
 
@@ -272,4 +267,24 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
272
267
  return false if script.content.empty?
273
268
  script.content.split("\n").first.start_with?('#!/')
274
269
  end
270
+
271
# Picks where the job's stderr is written.
#
# @param script [#error_path, #output_path] the job script
# @return [String] the script's error_path when set, otherwise its
#   output_path, otherwise '/dev/null'
def error_path(script)
  destination = script.error_path || script.output_path
  destination ? destination.to_s : '/dev/null'
end
277
+
278
# Under some conditions tmux returns status code 1 but it's not an actual
# error. These are when the session is not found or there are no sessions
# at all. Such errors are swallowed; anything else is re-raised.
#
# @param error [#message] the error raised by the tmux call
# @return [nil] when the error is one of the known harmless cases
# @raise [Exception] the given error, when it is not a known harmless case
def interpret_and_raise(error)
  harmless_fragments = [
    'failed to connect to server', # no sessions in tmux 1.8
    'no server running on',        # no sessions in tmux 2.7+ message
    'session not found',           # session already gone (was tolerated before this helper existed)
    "can't find session"           # presumably the newer tmux wording for the same case - TODO confirm
  ]
  return nil if harmless_fragments.any? { |fragment| error.message.include?(fragment) }

  raise error
end
275
290
  end
@@ -16,13 +16,9 @@ fi
16
16
  echo $hostname
17
17
 
18
18
  # Put the script into a temp file on localhost
19
- <% if debug %>
20
- singularity_tmp_file=$(mktemp -p "$HOME" --suffix '_sing')
21
- tmux_tmp_file=$(mktemp -p "$HOME" --suffix "_tmux")
22
- <% else %>
23
- singularity_tmp_file=$(mktemp)
24
- tmux_tmp_file=$(mktemp)
25
- <% end %>
19
+ singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
20
+ tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
21
+
26
22
 
27
23
  # Create an executable to run in a tmux session
28
24
  # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
69
65
  chmod +x "$singularity_tmp_file"
70
66
  chmod +x "$tmux_tmp_file"
71
67
  <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
72
-
73
- # Remove the file
74
- <% if ! debug %>
75
- # Wait 1 second to ensure that tmux session has started before the file is removed
76
- sleep 1
77
- rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
78
- <% end %>
@@ -80,6 +80,9 @@ module OodCore
80
80
  # from
81
81
  class Error < StandardError; end
82
82
 
83
+ # An error indicating the slurm command timed out
84
+ class SlurmTimeoutError < Error; end
85
+
83
86
  # @param cluster [#to_s, nil] the cluster name
84
87
  # @param conf [#to_s, nil] path to the slurm conf
85
88
  # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
147
150
  end
148
151
  jobs
149
152
  end
153
+ rescue SlurmTimeoutError
154
+ # TODO: could use a log entry here
155
+ return [{ id: id, state: 'undetermined' }]
150
156
  end
151
157
 
152
158
  def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore
303
309
 
304
310
  cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
305
311
  o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
306
- s.success? ? o : raise(Error, e)
312
+ s.success? ? interpret_and_raise(o, e) : raise(Error, e)
313
+ end
314
+
315
# Inspects stderr of a command that exited successfully and raises when it
# reports a failure anyway. Slurm exits 0 even when the command fails, so the
# exit status alone is not enough to tell whether the command worked.
#
# @param stdout [String] the command's standard output
# @param stderr [String] the command's standard error
# @return [String] stdout, unchanged
# @raise [SlurmTimeoutError] when a line of stderr starts with
#   "slurm_load_jobs error: Socket timed out"
def interpret_and_raise(stdout, stderr)
  unless stderr.empty?
    timed_out = stderr =~ /^slurm_load_jobs error: Socket timed out/
    raise SlurmTimeoutError, stderr if timed_out
  end

  stdout
end
308
325
 
309
326
  def squeue_attrs_for_info_attrs(attrs)
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.12.0"
3
+ VERSION = "0.16.0"
4
4
  end
data/ood_core.gemspec CHANGED
@@ -24,7 +24,8 @@ Gem::Specification.new do |spec|
24
24
 
25
25
  spec.add_runtime_dependency "ood_support", "~> 0.0.2"
26
26
  spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
27
- spec.add_development_dependency "bundler", "~> 1.7"
27
+ spec.add_development_dependency "bundler", "~> 2.1"
28
+ spec.add_runtime_dependency "activesupport", ">= 5.2", "< 6.0"
28
29
  spec.add_development_dependency "rake", "~> 13.0.1"
29
30
  spec.add_development_dependency "rspec", "~> 3.0"
30
31
  spec.add_development_dependency "pry", "~> 0.10"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2020-08-05 00:00:00.000000000 Z
13
+ date: 2021-04-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -52,14 +52,34 @@ dependencies:
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '1.7'
55
+ version: '2.1'
56
56
  type: :development
57
57
  prerelease: false
58
58
  version_requirements: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '1.7'
62
+ version: '2.1'
63
+ - !ruby/object:Gem::Dependency
64
+ name: activesupport
65
+ requirement: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '5.2'
70
+ - - "<"
71
+ - !ruby/object:Gem::Version
72
+ version: '6.0'
73
+ type: :runtime
74
+ prerelease: false
75
+ version_requirements: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: '5.2'
80
+ - - "<"
81
+ - !ruby/object:Gem::Version
82
+ version: '6.0'
63
83
  - !ruby/object:Gem::Dependency
64
84
  name: rake
65
85
  requirement: !ruby/object:Gem::Requirement
@@ -140,9 +160,9 @@ executables: []
140
160
  extensions: []
141
161
  extra_rdoc_files: []
142
162
  files:
163
+ - ".github/workflows/test.yml"
143
164
  - ".gitignore"
144
165
  - ".rspec"
145
- - ".travis.yml"
146
166
  - CHANGELOG.md
147
167
  - Gemfile
148
168
  - LICENSE.txt
@@ -163,8 +183,15 @@ files:
163
183
  - lib/ood_core/errors.rb
164
184
  - lib/ood_core/invalid_cluster.rb
165
185
  - lib/ood_core/job/adapter.rb
186
+ - lib/ood_core/job/adapters/ccq.rb
166
187
  - lib/ood_core/job/adapters/drmaa.rb
167
188
  - lib/ood_core/job/adapters/helper.rb
189
+ - lib/ood_core/job/adapters/kubernetes.rb
190
+ - lib/ood_core/job/adapters/kubernetes/batch.rb
191
+ - lib/ood_core/job/adapters/kubernetes/helper.rb
192
+ - lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb
193
+ - lib/ood_core/job/adapters/kubernetes/resources.rb
194
+ - lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
168
195
  - lib/ood_core/job/adapters/linux_host.rb
169
196
  - lib/ood_core/job/adapters/linux_host/launcher.rb
170
197
  - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -215,7 +242,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
242
  - !ruby/object:Gem::Version
216
243
  version: '0'
217
244
  requirements: []
218
- rubygems_version: 3.0.3
245
+ rubygems_version: 3.1.2
219
246
  signing_key:
220
247
  specification_version: 4
221
248
  summary: Open OnDemand core library