scout-gear 10.11.1 → 10.11.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aca5b47a57bc63063f01af5278f878b9cf6205f16c7bf7154db2c8ba30043627
4
- data.tar.gz: b9df56fe4d812cba82bb2d30a7474a64867e99a6181501d8136da3861aaf72dd
3
+ metadata.gz: 59a4e58c16692963459b2f3993829a34a48bedae88a879e5b0a349acb279e135
4
+ data.tar.gz: 5473ffe02a923a32ea39d1c8b72e9841c25716382e13975d93f846b83943c43a
5
5
  SHA512:
6
- metadata.gz: 0bfd403c3ecbe1750e5937c27db75b58d92e6bcccdb96b1470cbfb20e1c8c0cbe3fa14d2eef1dd2d8780a0edede5a600cd4968144c939be9074fcbf2c352ea27
7
- data.tar.gz: ab126a4dfe118ec5083f2121909c6b0b51a0bddc617b9678908472a23eeb6dcc81c23e37a6e75d82a22381a096339e6fd92f6018501589141aaa650b8991f1ab
6
+ metadata.gz: f0937e1aedac6d478e5164a26802f342e759c71a62d6e6abf81b3621191c1b7c4ad72b9db1f5579c6b1478fff749b734b5fe9926774aa783a54c3130cc676e79
7
+ data.tar.gz: fe8ba7061d5d381097acab46ec9ad591526fb2623a9b8e7739b1afd20079f0df5fce7ca35418c781b416993afd9dcdc5d611c6db008c7f246a45fea37c150f42
data/.vimproject CHANGED
@@ -3,6 +3,8 @@ scout-gear=/$PWD filter="*.rb *.yaml" {
3
3
  README.md
4
4
  chats=chats filter="*"{
5
5
 
6
+ semaphore
7
+
6
8
  deploy
7
9
 
8
10
 
@@ -202,6 +204,7 @@ scout-gear=/$PWD filter="*.rb *.yaml" {
202
204
  cmd
203
205
  }
204
206
  batch=batch{
207
+ tail
205
208
  list
206
209
  clean
207
210
  }
data/VERSION CHANGED
@@ -1 +1 @@
1
- 10.11.1
1
+ 10.11.3
@@ -23,70 +23,226 @@ if continue
23
23
  #include <fcntl.h>
24
24
  EOF
25
25
 
26
+ # Create a named semaphore. Return 0 on success, -errno on error.
26
27
  builder.c_singleton <<-EOF
27
- void create_semaphore(char* name, int value){
28
- sem_open(name, O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO, value);
28
+ int create_semaphore(char* name, int value){
29
+ sem_t* sem;
30
+ sem = sem_open(name, O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO, value);
31
+ if (sem == SEM_FAILED){
32
+ return -errno;
33
+ }
34
+ /* close our handle; the semaphore lives on until unlinked and all handles closed */
35
+ sem_close(sem);
36
+ return 0;
29
37
  }
30
38
  EOF
39
+
40
+ # Unlink (remove) a named semaphore. Return 0 on success, -errno on error.
31
41
  builder.c_singleton <<-EOF
32
- void delete_semaphore(char* name){
33
- sem_unlink(name);
42
+ int delete_semaphore(char* name){
43
+ int ret = sem_unlink(name);
44
+ if (ret == -1) {
45
+ return -errno;
46
+ }
47
+ return 0;
34
48
  }
35
49
  EOF
36
50
 
51
+ # Wait (sem_wait) on a named semaphore. Return 0 on success, -errno on error.
37
52
  builder.c_singleton <<-EOF
38
53
  int wait_semaphore(char* name){
39
- int ret;
40
54
  sem_t* sem;
41
55
  sem = sem_open(name, 0);
42
56
  if (sem == SEM_FAILED){
43
- return(errno);
57
+ return -errno;
44
58
  }
45
- ret = sem_wait(sem);
59
+
60
+ int ret;
61
+ /* retry if interrupted by signal; stop on success or other error */
62
+ do {
63
+ ret = sem_wait(sem);
64
+ } while (ret == -1 && errno == EINTR);
65
+
46
66
  if (ret == -1){
47
- return(errno);
67
+ int e = errno;
68
+ sem_close(sem);
69
+ return -e;
48
70
  }
71
+
49
72
  sem_close(sem);
50
- return(ret);
73
+ return 0;
51
74
  }
52
75
  EOF
53
76
 
77
+ # Post (sem_post) on a named semaphore. Return 0 on success, -errno on error.
54
78
  builder.c_singleton <<-EOF
55
- void post_semaphore(char* name){
79
+ int post_semaphore(char* name){
56
80
  sem_t* sem;
57
81
  sem = sem_open(name, 0);
58
- sem_post(sem);
82
+ if (sem == SEM_FAILED){
83
+ return -errno;
84
+ }
85
+
86
+ int ret;
87
+ /* retry post if interrupted */
88
+ do {
89
+ ret = sem_post(sem);
90
+ } while (ret == -1 && errno == EINTR);
91
+
92
+ if (ret == -1) {
93
+ int e = errno;
94
+ sem_close(sem);
95
+ return -e;
96
+ }
97
+
59
98
  sem_close(sem);
99
+ return 0;
60
100
  }
61
101
  EOF
62
102
 
63
103
  end
64
104
 
65
105
  SEM_MUTEX = Mutex.new
106
+
107
+ def self.ensure_semaphore_name(file)
108
+ # Ensure a valid POSIX named semaphore name: must start with '/'
109
+ s = file.to_s.dup
110
+ # strip leading slashes and replace other slashes with underscores, then prepend single '/'
111
+ s.gsub!(%r{^/+}, '')
112
+ s = '/' + s.gsub('/', '_')
113
+ s
114
+ end
115
+
116
+ # Errno numeric lists
117
+ RETRIABLE_ERRNOS = [
118
+ Errno::ENOENT,
119
+ Errno::EIDRM,
120
+ Errno::EAGAIN,
121
+ Errno::EMFILE,
122
+ Errno::ENFILE,
123
+ Errno::EINTR
124
+ ].map { |c| c.new.errno }
125
+
126
+ FATAL_ERRNOS = [
127
+ Errno::EINVAL,
128
+ Errno::EACCES
129
+ ].map { |c| c.new.errno }
130
+
131
+ # Generic retry wrapper with exponential backoff + jitter
132
+ def self.with_retry(max_attempts: 6, base_delay: 0.01, max_delay: 1.0, jitter: 0.5, retriable: RETRIABLE_ERRNOS)
133
+ attempts = 0
134
+ while true
135
+ attempts += 1
136
+ ret = yield
137
+ # caller expects 0 on success, negative errno on failure
138
+ return ret if ret >= 0
139
+
140
+ err = -ret
141
+ # don't retry if it's clearly fatal or not in retriable list
142
+ if FATAL_ERRNOS.include?(err) || attempts >= max_attempts || !retriable.include?(err)
143
+ return ret
144
+ end
145
+
146
+ # exponential backoff with jitter
147
+ base = base_delay * (2 ** (attempts - 1))
148
+ sleep_time = [base, max_delay].min
149
+ # add jitter in range [0, jitter * sleep_time)
150
+ sleep_time += rand * jitter * sleep_time
151
+
152
+ Log.warn "Semaphore operation failed (errno=#{err}), retrying in #{'%.3f' % sleep_time}s (attempt #{attempts}/#{max_attempts})"
153
+ sleep(sleep_time)
154
+ end
155
+ end
156
+
157
+ # Safe wrappers that raise SystemCallError on final failure
158
+ def self.safe_create_semaphore(name, value, **opts)
159
+ ret = with_retry(**opts) { ScoutSemaphore.create_semaphore(name, value) }
160
+ if ret < 0
161
+ raise SystemCallError.new("create_semaphore(#{name}) failed", -ret)
162
+ end
163
+ ret
164
+ end
165
+
166
+ def self.safe_delete_semaphore(name, **opts)
167
+ ret = with_retry(**opts) { ScoutSemaphore.delete_semaphore(name) }
168
+ if ret < 0
169
+ raise SystemCallError.new("delete_semaphore(#{name}) failed", -ret)
170
+ end
171
+ ret
172
+ end
173
+
174
+ def self.safe_wait_semaphore(name, **opts)
175
+ ret = with_retry(**opts) { ScoutSemaphore.wait_semaphore(name) }
176
+ if ret < 0
177
+ err = -ret
178
+ if err == Errno::EINTR.new.errno
179
+ raise SemaphoreInterrupted
180
+ else
181
+ raise SystemCallError.new("wait_semaphore(#{name}) failed", err)
182
+ end
183
+ end
184
+ ret
185
+ end
186
+
187
+ def self.safe_post_semaphore(name, **opts)
188
+ ret = with_retry(**opts) { ScoutSemaphore.post_semaphore(name) }
189
+ if ret < 0
190
+ raise SystemCallError.new("post_semaphore(#{name}) failed", -ret)
191
+ end
192
+ ret
193
+ end
194
+
66
195
  def self.synchronize(sem)
67
- ret = ScoutSemaphore.wait_semaphore(sem)
68
- raise SemaphoreInterrupted if ret == -1
196
+ # Ensure name is normalized (caller should pass normalized name, but be safe)
197
+ sem = ensure_semaphore_name(sem)
198
+
199
+ # wait_semaphore returns 0 on success or -errno on error
200
+ begin
201
+ ScoutSemaphore.safe_wait_semaphore(sem)
202
+ rescue SemaphoreInterrupted
203
+ raise
204
+ rescue SystemCallError => e
205
+ # bubble up for callers to handle
206
+ raise
207
+ end
208
+
69
209
  begin
70
210
  yield
71
211
  ensure
72
- ScoutSemaphore.post_semaphore(sem)
212
+ begin
213
+ ScoutSemaphore.safe_post_semaphore(sem)
214
+ rescue SystemCallError => e
215
+ # Log but don't raise from ensure
216
+ Log.warn "post_semaphore(#{sem}) failed in ensure: #{e.message}"
217
+ end
73
218
  end
74
219
  end
75
220
 
76
221
  def self.with_semaphore(size, file = nil)
77
222
  if file.nil?
78
- file = "/scout-" + Misc.digest(rand(100000000000).to_s)[0..10] if file.nil?
223
+ file = "/scout-" + Misc.digest(rand(100000000000).to_s)[0..10]
79
224
  else
80
- file = file.gsub('/', '_') if file
225
+ # ensure valid POSIX name
226
+ file = ensure_semaphore_name(file)
81
227
  end
82
228
 
83
229
  begin
84
230
  Log.debug "Creating semaphore (#{ size }): #{file}"
85
- ScoutSemaphore.create_semaphore(file, size)
231
+ begin
232
+ ScoutSemaphore.safe_create_semaphore(file, size)
233
+ rescue SystemCallError => e
234
+ Log.error "Failed to create semaphore #{file}: #{e.message}"
235
+ raise
236
+ end
237
+
86
238
  yield file
87
239
  ensure
88
240
  Log.debug "Removing semaphore #{ file }"
89
- ScoutSemaphore.delete_semaphore(file)
241
+ begin
242
+ ScoutSemaphore.safe_delete_semaphore(file)
243
+ rescue SystemCallError => e
244
+ Log.warn "delete_semaphore(#{file}) failed: #{e.message}"
245
+ end
90
246
  end
91
247
  end
92
248
 
@@ -114,16 +270,17 @@ if continue
114
270
 
115
271
  threads = []
116
272
  wait_mutex.synchronize do
117
- threads = elems.collect do |elem|
273
+ threads = elems.collect do |elem|
118
274
  Thread.new(elem) do |elem|
119
275
 
120
276
  continue = false
121
277
  mutex.synchronize do
122
278
  while not continue do
123
- if count < size
279
+ if count < size
124
280
  continue = true
125
281
  count += 1
126
282
  end
283
+ # wait briefly to avoid busy loop; ConditionVariable could be used here properly
127
284
  mutex.sleep 1 unless continue
128
285
  end
129
286
  end
@@ -143,8 +300,8 @@ if continue
143
300
  end
144
301
  end
145
302
 
146
- threads.each do |thread|
147
- thread.join
303
+ threads.each do |thread|
304
+ thread.join
148
305
  end
149
306
  rescue Exception
150
307
  Log.exception $!
@@ -152,6 +309,5 @@ if continue
152
309
  threads.each do |thread| thread.kill end
153
310
  end
154
311
  end
155
- end
312
+ end
156
313
  end
157
-
@@ -71,9 +71,8 @@ class WorkQueue
71
71
  Thread.current.report_on_exception = false
72
72
  Thread.current["name"] = "Output reader #{queue_id}"
73
73
  @done_workers ||= []
74
- #while true
75
- # obj = @output.read
76
- while obj = @output.read
74
+ while true
75
+ obj = @output.read
77
76
  if DoneProcessing === obj
78
77
 
79
78
  done = @worker_mutex.synchronize do
@@ -90,6 +89,7 @@ class WorkQueue
90
89
  callback.call obj if callback
91
90
  end
92
91
  end
92
+ @waiter.join if @workers.any?
93
93
  rescue DoneProcessing
94
94
  rescue Aborted
95
95
  rescue WorkerException
@@ -135,9 +135,6 @@ class WorkQueue
135
135
  end
136
136
 
137
137
  raise exceptions.first if exceptions.any?
138
- if @workers.empty? && ! @closed
139
- @output.write DoneProcessing.new
140
- end
141
138
  end
142
139
  end
143
140
 
@@ -45,6 +45,7 @@ class Workflow::LocalExecutor
45
45
  end
46
46
 
47
47
  def process_batches(batches)
48
+ retry_jobs = []
48
49
  failed_jobs = []
49
50
 
50
51
  while batches.reject{|b| Workflow::Orchestrator.done_batch?(b) }.any?
@@ -54,6 +55,16 @@ class Workflow::LocalExecutor
54
55
 
55
56
  raise NoWork, "No candidates and no running jobs #{Log.fingerprint batches}" if resources_used.empty? && top_level_jobs.empty?
56
57
 
58
+ if candidates.reject{|batch| failed_jobs.include? batch[:top_level] }.empty? && resources_used.empty? && top_level_jobs.empty?
59
+ exception = failed_jobs.collect(&:get_exception).compact.first
60
+ if exception
61
+ Log.warn 'Some work failed'
62
+ raise exception
63
+ else
64
+ raise 'Some work failed'
65
+ end
66
+ end
67
+
57
68
  candidates.each do |batch|
58
69
  begin
59
70
 
@@ -63,15 +74,18 @@ class Workflow::LocalExecutor
63
74
  when (job.error? || job.aborted?)
64
75
  begin
65
76
  if job.recoverable_error?
66
- if failed_jobs.include?(job)
77
+ if retry_jobs.include?(job)
67
78
  Log.warn "Failed twice #{job.path} with recoverable error"
79
+ retry_jobs.delete job
80
+ failed_jobs << job
68
81
  next
69
82
  else
70
- failed_jobs << job
83
+ retry_jobs << job
71
84
  job.clean
72
85
  raise TryAgain
73
86
  end
74
87
  else
88
+ failed_jobs << job
75
89
  Log.warn "Non-recoverable error in #{job.path}"
76
90
  next
77
91
  end
@@ -695,8 +695,17 @@ env > #{batch_options[:fenv]}
695
695
  out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exist?(fout) and not tail == :STDERR
696
696
  err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exist?(ferr)
697
697
 
698
- terr = Misc.consume_stream(err, true, STDERR) if err
699
- tout = Misc.consume_stream(out, true, STDOUT) if out
698
+ tout = Thread.new do
699
+ while c = out.getc
700
+ STDOUT << c
701
+ end
702
+ end
703
+
704
+ terr = Thread.new do
705
+ while c = err.getc
706
+ STDERR << c
707
+ end
708
+ end
700
709
 
701
710
  sleep 3 while job_queued(job)
702
711
  rescue Aborted
@@ -206,7 +206,7 @@ class Step
206
206
  Marshal.load(Base64.decode64(info[:exception]))
207
207
  rescue
208
208
  Log.exception $!
209
- nil
209
+ return Exception.new messages.last
210
210
  end
211
211
  end
212
212
 
@@ -19,6 +19,6 @@ class Step
19
19
  end
20
20
 
21
21
  def to_json(...)
22
- self.path
22
+ self.path.to_json
23
23
  end
24
24
  end
data/scout-gear.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: scout-gear 10.11.1 ruby lib
5
+ # stub: scout-gear 10.11.3 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "scout-gear".freeze
9
- s.version = "10.11.1".freeze
9
+ s.version = "10.11.3".freeze
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib".freeze]
@@ -147,6 +147,7 @@ Gem::Specification.new do |s|
147
147
  "scout_commands/alias",
148
148
  "scout_commands/batch/clean",
149
149
  "scout_commands/batch/list",
150
+ "scout_commands/batch/tail",
150
151
  "scout_commands/cat",
151
152
  "scout_commands/doc",
152
153
  "scout_commands/entity",
@@ -282,7 +283,7 @@ Gem::Specification.new do |s|
282
283
  ]
283
284
  s.homepage = "http://github.com/mikisvaz/scout-gear".freeze
284
285
  s.licenses = ["MIT".freeze]
285
- s.rubygems_version = "3.7.2".freeze
286
+ s.rubygems_version = "3.7.0.dev".freeze
286
287
  s.summary = "basic gear for scouts".freeze
287
288
 
288
289
  s.specification_version = 4
@@ -189,7 +189,7 @@ workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
189
189
 
190
190
  count += 1
191
191
 
192
- if options[:compressed]
192
+ if options[:compressed] && tail.nil?
193
193
  status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
194
194
  if different_system
195
195
  status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scout'
4
+
5
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
6
+
7
+ options = SOPT.setup <<EOF
8
+
9
+ Queue a job in Marenostrum
10
+
11
+ $ rbbt slurm tail <directory|jobid> [options]
12
+
13
+ -h--help Print this help
14
+ EOF
15
+
16
+ if options[:help]
17
+ if defined? rbbt_usage
18
+ rbbt_usage
19
+ else
20
+ puts SOPT.doc
21
+ end
22
+ exit 0
23
+ end
24
+
25
+ batch_system = options.delete :batch_system
26
+ batch_system ||= 'auto'
27
+
28
+ directory = ARGV.shift
29
+
30
+ raise ParameterException if directory.nil?
31
+
32
+ if directory =~ /^[0-9]*$/
33
+ workdir = File.expand_path('~/scout-batch')
34
+ Path.setup(workdir)
35
+
36
+ workdir.glob("**/job.id").each do |file|
37
+ next unless directory == Open.read(file).strip
38
+ directory = File.dirname(file)
39
+ break
40
+ end
41
+ end
42
+
43
+ raise ParameterException, "Could not identify job #{directory}" unless File.exist?(directory)
44
+
45
+ require 'rbbt/hpc/slurm'
46
+
47
+ command_txt = Open.read(File.join(directory, 'command.batch'))
48
+ if m = command_txt.match(/#STEP_PATH: (.*)/)
49
+ step_path = m[1]
50
+ else
51
+ step_path = nil
52
+ end
53
+
54
+ puts Log.color(:magenta, "Directory: ") + directory if directory
55
+ puts Log.color(:magenta, "Step path: ") + step_path if step_path
56
+
57
+ SLURM.follow_job directory, true
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scout-gear
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.11.1
4
+ version: 10.11.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
@@ -230,6 +230,7 @@ files:
230
230
  - scout_commands/alias
231
231
  - scout_commands/batch/clean
232
232
  - scout_commands/batch/list
233
+ - scout_commands/batch/tail
233
234
  - scout_commands/cat
234
235
  - scout_commands/doc
235
236
  - scout_commands/entity
@@ -380,7 +381,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
380
381
  - !ruby/object:Gem::Version
381
382
  version: '0'
382
383
  requirements: []
383
- rubygems_version: 3.7.2
384
+ rubygems_version: 3.7.0.dev
384
385
  specification_version: 4
385
386
  summary: basic gear for scouts
386
387
  test_files: []