scout-gear 10.11.2 → 10.11.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77727bffd885a022ae17b23477024af34da91584f2ac55cd410483d1b6074e5a
4
- data.tar.gz: b6631c3b02c5c90194c60e68c0a6d6abd1b45ea74c3152422328488e8397f524
3
+ metadata.gz: f4ce69534bfbce4c385cfc9c23b280c6bb935a9f7228afbf7e585c3f093a5a18
4
+ data.tar.gz: 98dee8673df93baf527d75df6fb9358b9ec1ec54a37e60c38f0bcb34920fb124
5
5
  SHA512:
6
- metadata.gz: 1ad9229647dfe17146a3e5d3eac31347dde32e5b74f5e866c42b24a1e82816a2a9ef9a3ee61f020e18479decdf0d1c0c7b8fb33afa742f9d3b88602b86dc6c04
7
- data.tar.gz: f51b0bdb79d3aa42f27697ad01b598ee79e0eb71ecaa172f38005926c1a0c13b08751475ff7d374699b7d1f42922222c025a5b6cb2430864e69b064b63168009
6
+ metadata.gz: 81b897cea0e0d4153fd1d97cae601a17dc75c4c36ebb0c6fe0699a4440d3b7baddbb501fbdb48c409f89386379fcc153a67b7b53b71f95d460366c6993637986
7
+ data.tar.gz: d0cb23b7fa0b9b04e5719cab7ee365231105fb16e32f62aed37c22f886054de9538922b07cc4495371eb1356fbc9b98707ebfb23635dee847564623b53c50bce
data/.vimproject CHANGED
@@ -3,6 +3,8 @@ scout-gear=/$PWD filter="*.rb *.yaml" {
3
3
  README.md
4
4
  chats=chats filter="*"{
5
5
 
6
+ semaphore
7
+
6
8
  deploy
7
9
 
8
10
 
@@ -202,6 +204,7 @@ scout-gear=/$PWD filter="*.rb *.yaml" {
202
204
  cmd
203
205
  }
204
206
  batch=batch{
207
+ tail
205
208
  list
206
209
  clean
207
210
  }
data/VERSION CHANGED
@@ -1 +1 @@
1
- 10.11.2
1
+ 10.11.4
@@ -23,70 +23,229 @@ if continue
23
23
  #include <fcntl.h>
24
24
  EOF
25
25
 
26
+ # Create a named semaphore. Return 0 on success, -errno on error.
26
27
  builder.c_singleton <<-EOF
27
- void create_semaphore(char* name, int value){
28
- sem_open(name, O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO, value);
28
+ int create_semaphore(char* name, int value){
29
+ sem_t* sem;
30
+ sem = sem_open(name, O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO, value);
31
+ if (sem == SEM_FAILED){
32
+ return -errno;
33
+ }
34
+ /* close our handle; the semaphore lives on until unlinked and all handles closed */
35
+ sem_close(sem);
36
+ return 0;
29
37
  }
30
38
  EOF
39
+
40
+ # Unlink (remove) a named semaphore. Return 0 on success, -errno on error.
31
41
  builder.c_singleton <<-EOF
32
- void delete_semaphore(char* name){
33
- sem_unlink(name);
42
+ int delete_semaphore(char* name){
43
+ int ret = sem_unlink(name);
44
+ if (ret == -1) {
45
+ return -errno;
46
+ }
47
+ return 0;
34
48
  }
35
49
  EOF
36
50
 
51
+ # Wait (sem_wait) on a named semaphore. Return 0 on success, -errno on error.
37
52
  builder.c_singleton <<-EOF
38
53
  int wait_semaphore(char* name){
39
- int ret;
40
54
  sem_t* sem;
41
55
  sem = sem_open(name, 0);
42
56
  if (sem == SEM_FAILED){
43
- return(errno);
57
+ return -errno;
44
58
  }
45
- ret = sem_wait(sem);
59
+
60
+ int ret;
61
+ /* retry if interrupted by signal; stop on success or other error */
62
+ do {
63
+ ret = sem_wait(sem);
64
+ } while (ret == -1 && errno == EINTR);
65
+
46
66
  if (ret == -1){
47
- return(errno);
67
+ int e = errno;
68
+ sem_close(sem);
69
+ return -e;
48
70
  }
71
+
49
72
  sem_close(sem);
50
- return(ret);
73
+ return 0;
51
74
  }
52
75
  EOF
53
76
 
77
+ # Post (sem_post) on a named semaphore. Return 0 on success, -errno on error.
54
78
  builder.c_singleton <<-EOF
55
- void post_semaphore(char* name){
79
+ int post_semaphore(char* name){
56
80
  sem_t* sem;
57
81
  sem = sem_open(name, 0);
58
- sem_post(sem);
82
+ if (sem == SEM_FAILED){
83
+ return -errno;
84
+ }
85
+
86
+ int ret;
87
+ /* retry post if interrupted */
88
+ do {
89
+ ret = sem_post(sem);
90
+ } while (ret == -1 && errno == EINTR);
91
+
92
+ if (ret == -1) {
93
+ int e = errno;
94
+ sem_close(sem);
95
+ return -e;
96
+ }
97
+
59
98
  sem_close(sem);
99
+ return 0;
60
100
  }
61
101
  EOF
62
102
 
63
103
  end
64
104
 
65
105
  SEM_MUTEX = Mutex.new
106
+
107
+ def self.ensure_semaphore_name(file)
108
+ # Ensure a valid POSIX named semaphore name: must start with '/'
109
+ s = file.to_s.dup
110
+ # strip leading slashes and replace other slashes with underscores, then prepend single '/'
111
+ s.gsub!(%r{^/+}, '')
112
+ s = '/' + s.gsub('/', '_')
113
+ s
114
+ end
115
+
116
+ # Errno numeric lists
117
+ RETRIABLE_ERRNOS = [
118
+ Errno::ENOENT,
119
+ Errno::EIDRM,
120
+ Errno::EAGAIN,
121
+ Errno::EMFILE,
122
+ Errno::ENFILE,
123
+ Errno::EINTR
124
+ ].map { |c| c.new.errno }
125
+
126
+ FATAL_ERRNOS = [
127
+ Errno::EINVAL,
128
+ Errno::EACCES
129
+ ].map { |c| c.new.errno }
130
+
131
+ # Generic retry wrapper with exponential backoff + jitter
132
+ def self.with_retry(max_attempts: 6, base_delay: 0.01, max_delay: 1.0, jitter: 0.5, retriable: RETRIABLE_ERRNOS)
133
+ attempts = 0
134
+ while true
135
+ attempts += 1
136
+ ret = yield
137
+ # caller expects 0 on success, negative errno on failure
138
+ return ret if ret >= 0
139
+
140
+ err = -ret
141
+ # don't retry if it's clearly fatal or not in retriable list
142
+ if FATAL_ERRNOS.include?(err) || attempts >= max_attempts || !retriable.include?(err)
143
+ return ret
144
+ end
145
+
146
+ # exponential backoff with jitter
147
+ base = base_delay * (2 ** (attempts - 1))
148
+ sleep_time = [base, max_delay].min
149
+ # add jitter in range [0, jitter * sleep_time)
150
+ sleep_time += rand * jitter * sleep_time
151
+
152
+ Log.warn "Semaphore operation failed (errno=#{err}), retrying in #{'%.3f' % sleep_time}s (attempt #{attempts}/#{max_attempts})"
153
+ sleep(sleep_time)
154
+ end
155
+ end
156
+
157
+ # Safe wrappers that raise SystemCallError on final failure
158
+ def self.safe_create_semaphore(name, value, **opts)
159
+ ret = with_retry(**opts) { ScoutSemaphore.create_semaphore(name, value) }
160
+ if ret < 0
161
+ raise SystemCallError.new("create_semaphore(#{name}) failed", -ret)
162
+ end
163
+ ret
164
+ end
165
+
166
+ def self.safe_delete_semaphore(name, **opts)
167
+ ret = with_retry(**opts) { ScoutSemaphore.delete_semaphore(name) }
168
+ if ret < 0
169
+ raise SystemCallError.new("delete_semaphore(#{name}) failed", -ret)
170
+ end
171
+ ret
172
+ end
173
+
174
+ def self.safe_wait_semaphore(name, **opts)
175
+ ret = with_retry(**opts) { ScoutSemaphore.wait_semaphore(name) }
176
+ if ret < 0
177
+ err = -ret
178
+ if err == Errno::EINTR.new.errno
179
+ raise SemaphoreInterrupted
180
+ else
181
+ raise SystemCallError.new("wait_semaphore(#{name}) failed", err)
182
+ end
183
+ end
184
+ ret
185
+ end
186
+
187
+ def self.safe_post_semaphore(name, **opts)
188
+ ret = with_retry(**opts) { ScoutSemaphore.post_semaphore(name) }
189
+ if ret < 0
190
+ raise SystemCallError.new("post_semaphore(#{name}) failed", -ret)
191
+ end
192
+ ret
193
+ end
194
+
66
195
  def self.synchronize(sem)
67
- ret = ScoutSemaphore.wait_semaphore(sem)
68
- raise SemaphoreInterrupted if ret == -1
196
+ # Ensure name is normalized (caller should pass normalized name, but be safe)
197
+ sem = ensure_semaphore_name(sem)
198
+
199
+ # wait_semaphore returns 0 on success or -errno on error
200
+ begin
201
+ ScoutSemaphore.safe_wait_semaphore(sem)
202
+ rescue SemaphoreInterrupted
203
+ raise
204
+ rescue SystemCallError => e
205
+ # bubble up for callers to handle
206
+ raise
207
+ end
208
+
69
209
  begin
70
210
  yield
71
211
  ensure
72
- ScoutSemaphore.post_semaphore(sem)
212
+ begin
213
+ ScoutSemaphore.safe_post_semaphore(sem)
214
+ rescue SystemCallError => e
215
+ # Log but don't raise from ensure
216
+ # Log.warn "post_semaphore(#{sem}) failed in ensure: #{e.message}"
217
+
218
+ # Actually, do raise
219
+ raise e
220
+ end
73
221
  end
74
222
  end
75
223
 
76
224
  def self.with_semaphore(size, file = nil)
77
225
  if file.nil?
78
- file = "/scout-" + Misc.digest(rand(100000000000).to_s)[0..10] if file.nil?
226
+ file = "/scout-" + Misc.digest(rand(100000000000).to_s)[0..10]
79
227
  else
80
- file = file.gsub('/', '_') if file
228
+ # ensure valid POSIX name
229
+ file = ensure_semaphore_name(file)
81
230
  end
82
231
 
83
232
  begin
84
233
  Log.debug "Creating semaphore (#{ size }): #{file}"
85
- ScoutSemaphore.create_semaphore(file, size)
234
+ begin
235
+ ScoutSemaphore.safe_create_semaphore(file, size)
236
+ rescue SystemCallError => e
237
+ Log.error "Failed to create semaphore #{file}: #{e.message}"
238
+ raise
239
+ end
240
+
86
241
  yield file
87
242
  ensure
88
243
  Log.debug "Removing semaphore #{ file }"
89
- ScoutSemaphore.delete_semaphore(file)
244
+ begin
245
+ ScoutSemaphore.safe_delete_semaphore(file)
246
+ rescue SystemCallError => e
247
+ Log.warn "delete_semaphore(#{file}) failed: #{e.message}"
248
+ end
90
249
  end
91
250
  end
92
251
 
@@ -114,16 +273,17 @@ if continue
114
273
 
115
274
  threads = []
116
275
  wait_mutex.synchronize do
117
- threads = elems.collect do |elem|
276
+ threads = elems.collect do |elem|
118
277
  Thread.new(elem) do |elem|
119
278
 
120
279
  continue = false
121
280
  mutex.synchronize do
122
281
  while not continue do
123
- if count < size
282
+ if count < size
124
283
  continue = true
125
284
  count += 1
126
285
  end
286
+ # wait briefly to avoid busy loop; ConditionVariable could be used here properly
127
287
  mutex.sleep 1 unless continue
128
288
  end
129
289
  end
@@ -143,8 +303,8 @@ if continue
143
303
  end
144
304
  end
145
305
 
146
- threads.each do |thread|
147
- thread.join
306
+ threads.each do |thread|
307
+ thread.join
148
308
  end
149
309
  rescue Exception
150
310
  Log.exception $!
@@ -152,6 +312,5 @@ if continue
152
312
  threads.each do |thread| thread.kill end
153
313
  end
154
314
  end
155
- end
315
+ end
156
316
  end
157
-
@@ -51,8 +51,9 @@ class WorkQueue
51
51
  rescue Exception
52
52
  begin
53
53
  output.write WorkerException.new($!, Process.pid)
54
- ensure
55
54
  exit EXIT_STATUS
55
+ rescue
56
+ exit -1
56
57
  end
57
58
  end
58
59
  exit 0
@@ -71,9 +71,8 @@ class WorkQueue
71
71
  Thread.current.report_on_exception = false
72
72
  Thread.current["name"] = "Output reader #{queue_id}"
73
73
  @done_workers ||= []
74
- #while true
75
- # obj = @output.read
76
- while obj = @output.read
74
+ while true
75
+ obj = @output.read
77
76
  if DoneProcessing === obj
78
77
 
79
78
  done = @worker_mutex.synchronize do
@@ -90,6 +89,7 @@ class WorkQueue
90
89
  callback.call obj if callback
91
90
  end
92
91
  end
92
+ @waiter.join if @workers.any?
93
93
  rescue DoneProcessing
94
94
  rescue Aborted
95
95
  rescue WorkerException
@@ -115,7 +115,7 @@ class WorkQueue
115
115
  break if @worker_mutex.synchronize{ @workers.empty? }
116
116
  threads = @workers.collect do |w|
117
117
  t = Thread.new do
118
- Thread.report_on_exception = false
118
+ Thread.current.report_on_exception = false
119
119
  Thread.current["name"] = "Worker waiter #{queue_id} worker #{w.pid}"
120
120
  pid, status = Process.wait2 w.pid
121
121
  remove_worker(pid) if pid
@@ -135,9 +135,6 @@ class WorkQueue
135
135
  end
136
136
 
137
137
  raise exceptions.first if exceptions.any?
138
- if @workers.empty? && ! @closed
139
- @output.write DoneProcessing.new
140
- end
141
138
  end
142
139
  end
143
140
 
@@ -172,7 +169,7 @@ class WorkQueue
172
169
  @worker_mutex.synchronize{ @workers.length }.times do
173
170
  begin
174
171
  @input.write DoneProcessing.new() unless @input.closed_write?
175
- rescue IOError
172
+ rescue IOError,Errno::ENOENT
176
173
  end
177
174
  end
178
175
  end
@@ -695,8 +695,17 @@ env > #{batch_options[:fenv]}
695
695
  out = CMD.cmd("tail -f '#{fout}'", :pipe => true) if File.exist?(fout) and not tail == :STDERR
696
696
  err = CMD.cmd("tail -f '#{ferr}'", :pipe => true) if File.exist?(ferr)
697
697
 
698
- terr = Misc.consume_stream(err, true, STDERR) if err
699
- tout = Misc.consume_stream(out, true, STDOUT) if out
698
+ tout = Thread.new do
699
+ while c = out.getc
700
+ STDOUT << c
701
+ end
702
+ end
703
+
704
+ terr = Thread.new do
705
+ while c = err.getc
706
+ STDERR << c
707
+ end
708
+ end
700
709
 
701
710
  sleep 3 while job_queued(job)
702
711
  rescue Aborted
@@ -206,7 +206,7 @@ class Step
206
206
  Marshal.load(Base64.decode64(info[:exception]))
207
207
  rescue
208
208
  Log.exception $!
209
- nil
209
+ return Exception.new messages.last
210
210
  end
211
211
  end
212
212
 
data/scout-gear.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: scout-gear 10.11.2 ruby lib
5
+ # stub: scout-gear 10.11.4 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "scout-gear".freeze
9
- s.version = "10.11.2".freeze
9
+ s.version = "10.11.4".freeze
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib".freeze]
@@ -147,6 +147,7 @@ Gem::Specification.new do |s|
147
147
  "scout_commands/alias",
148
148
  "scout_commands/batch/clean",
149
149
  "scout_commands/batch/list",
150
+ "scout_commands/batch/tail",
150
151
  "scout_commands/cat",
151
152
  "scout_commands/doc",
152
153
  "scout_commands/entity",
@@ -282,7 +283,7 @@ Gem::Specification.new do |s|
282
283
  ]
283
284
  s.homepage = "http://github.com/mikisvaz/scout-gear".freeze
284
285
  s.licenses = ["MIT".freeze]
285
- s.rubygems_version = "3.7.2".freeze
286
+ s.rubygems_version = "3.7.0.dev".freeze
286
287
  s.summary = "basic gear for scouts".freeze
287
288
 
288
289
  s.specification_version = 4
@@ -189,7 +189,7 @@ workdir.glob("**/command.batch").sort_by{|f| File.mtime(f)}.each do |fcmd|
189
189
 
190
190
  count += 1
191
191
 
192
- if options[:compressed]
192
+ if options[:compressed] && tail.nil?
193
193
  status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) : Log.color(:green, id)
194
194
  if different_system
195
195
  status = exit_status ? (exit_status == 0 ? Log.color(:green, "Done") : Log.color(:red, "Error")) + " (#{ id })" : Log.color(:green, id)
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scout'
4
+
5
+ #$0 = "rbbt #{$previous_commands*""} #{ File.basename(__FILE__) }" if $previous_commands
6
+
7
+ options = SOPT.setup <<EOF
8
+
9
+ Queue a job in Marenostrum
10
+
11
+ $ rbbt slurm tail <directory|jobid> [options]
12
+
13
+ -h--help Print this help
14
+ EOF
15
+
16
+ if options[:help]
17
+ if defined? rbbt_usage
18
+ rbbt_usage
19
+ else
20
+ puts SOPT.doc
21
+ end
22
+ exit 0
23
+ end
24
+
25
+ batch_system = options.delete :batch_system
26
+ batch_system ||= 'auto'
27
+
28
+ directory = ARGV.shift
29
+
30
+ raise ParameterException if directory.nil?
31
+
32
+ if directory =~ /^[0-9]*$/
33
+ workdir = File.expand_path('~/scout-batch')
34
+ Path.setup(workdir)
35
+
36
+ workdir.glob("**/job.id").each do |file|
37
+ next unless directory == Open.read(file).strip
38
+ directory = File.dirname(file)
39
+ break
40
+ end
41
+ end
42
+
43
+ raise ParameterException, "Could not identify job #{directory}" unless File.exist?(directory)
44
+
45
+ require 'rbbt/hpc/slurm'
46
+
47
+ command_txt = Open.read(File.join(directory, 'command.batch'))
48
+ if m = command_txt.match(/#STEP_PATH: (.*)/)
49
+ step_path = m[1]
50
+ else
51
+ step_path = nil
52
+ end
53
+
54
+ puts Log.color(:magenta, "Directory: ") + directory if directory
55
+ puts Log.color(:magenta, "Step path: ") + step_path if step_path
56
+
57
+ SLURM.follow_job directory, true
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scout-gear
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.11.2
4
+ version: 10.11.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
@@ -230,6 +230,7 @@ files:
230
230
  - scout_commands/alias
231
231
  - scout_commands/batch/clean
232
232
  - scout_commands/batch/list
233
+ - scout_commands/batch/tail
233
234
  - scout_commands/cat
234
235
  - scout_commands/doc
235
236
  - scout_commands/entity
@@ -380,7 +381,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
380
381
  - !ruby/object:Gem::Version
381
382
  version: '0'
382
383
  requirements: []
383
- rubygems_version: 3.7.2
384
+ rubygems_version: 3.7.0.dev
384
385
  specification_version: 4
385
386
  summary: basic gear for scouts
386
387
  test_files: []