evilution 0.32.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.beads/interactions.jsonl +28 -0
  3. data/.rubocop_todo.yml +1 -0
  4. data/CHANGELOG.md +31 -0
  5. data/README.md +12 -10
  6. data/docs/integrations.md +15 -0
  7. data/docs/isolation.md +46 -2
  8. data/lib/evilution/baseline.rb +11 -4
  9. data/lib/evilution/cli/parser/options_builder.rb +17 -0
  10. data/lib/evilution/config/validators/example_targeting_strategy.rb +22 -0
  11. data/lib/evilution/config.rb +16 -2
  12. data/lib/evilution/coverage/digest.rb +16 -0
  13. data/lib/evilution/coverage/map.rb +64 -0
  14. data/lib/evilution/coverage/map_builder.rb +82 -0
  15. data/lib/evilution/coverage/map_store.rb +87 -0
  16. data/lib/evilution/coverage/recorder.rb +85 -0
  17. data/lib/evilution/coverage.rb +8 -0
  18. data/lib/evilution/coverage_example_filter.rb +41 -0
  19. data/lib/evilution/integration/loading/test_load_path.rb +76 -0
  20. data/lib/evilution/integration/minitest.rb +5 -1
  21. data/lib/evilution/integration/rspec/state_guard/configuration_state.rb +72 -0
  22. data/lib/evilution/integration/rspec/state_guard/configuration_streams.rb +45 -0
  23. data/lib/evilution/integration/rspec/state_guard.rb +3 -1
  24. data/lib/evilution/integration/test_unit.rb +12 -4
  25. data/lib/evilution/isolation/fork.rb +38 -50
  26. data/lib/evilution/parallel/work_queue/dispatcher/deadline_tracker.rb +63 -0
  27. data/lib/evilution/parallel/work_queue/dispatcher.rb +70 -25
  28. data/lib/evilution/parallel/work_queue/worker.rb +50 -14
  29. data/lib/evilution/parallel/work_queue.rb +8 -0
  30. data/lib/evilution/process_supervisor.rb +259 -0
  31. data/lib/evilution/reporter/cli/line_formatters/unresolved_rate_warning.rb +50 -0
  32. data/lib/evilution/reporter/cli/metrics_block.rb +2 -0
  33. data/lib/evilution/runner/baseline_runner.rb +52 -0
  34. data/lib/evilution/runner/isolation_resolver.rb +106 -12
  35. data/lib/evilution/runner/mutation_executor/strategy/parallel.rb +28 -1
  36. data/lib/evilution/runner.rb +7 -0
  37. data/lib/evilution/spec_resolver.rb +147 -9
  38. data/lib/evilution/spec_selector.rb +14 -4
  39. data/lib/evilution/version.rb +1 -1
  40. data/lib/evilution.rb +1 -0
  41. data/lib/tasks/stress.rake +15 -0
  42. data/scripts/canary_manifest.yml +47 -0
  43. data/scripts/compare_targeting +277 -0
  44. data/scripts/compare_targeting.example.yml +24 -0
  45. metadata +20 -2
@@ -5,7 +5,7 @@ require "tmpdir"
5
5
  require_relative "../memory"
6
6
  require_relative "../temp_dir_tracker"
7
7
  require_relative "../child_output"
8
- require_relative "../process_cleanup"
8
+ require_relative "../process_supervisor"
9
9
 
10
10
  require_relative "../isolation"
11
11
 
@@ -15,21 +15,25 @@ class Evilution::Isolation::Fork
15
15
 
16
16
  def initialize(hooks: nil)
17
17
  @hooks = hooks
18
+ # EV-3aw3 / EV-5rrh step 2: the supervisor owns this path's lifecycle --
19
+ # spawn + process-group isolation, the TERM/grace/KILL ladder, and reap +
20
+ # sandbox removal. fork.rb keeps only the marshal-pipe read protocol.
21
+ @supervisor = Evilution::ProcessSupervisor.new
18
22
  end
19
23
 
20
24
  def call(mutation:, test_command:, timeout:)
21
- pid = nil
25
+ handle = nil
22
26
  sandbox_dir = Dir.mktmpdir("evilution-run")
23
27
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
24
28
  parent_rss = Evilution::Memory.rss_kb
25
29
  read_io, write_io = binary_pipe
26
- pid = fork_child(read_io, write_io, sandbox_dir, mutation, test_command)
30
+ handle = spawn_child(read_io, write_io, sandbox_dir, mutation, test_command)
27
31
  write_io.close
28
- result = wait_for_result(pid, read_io, timeout)
32
+ result = wait_for_result(handle, read_io, timeout)
29
33
  duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
30
34
  build_mutation_result(mutation, result, duration, parent_rss)
31
35
  ensure
32
- cleanup_resources(read_io, write_io, pid, sandbox_dir)
36
+ cleanup_resources(read_io, write_io, handle, sandbox_dir)
33
37
  end
34
38
 
35
39
  private
@@ -46,13 +50,18 @@ class Evilution::Isolation::Fork
46
50
  [read_io, write_io]
47
51
  end
48
52
 
49
- def fork_child(read_io, write_io, sandbox_dir, mutation, test_command)
50
- ::Process.fork do
53
+ # Supervisor.spawn makes the child its own process-group leader (setpgid)
54
+ # before this block runs, so any grandchildren test_command forks inherit the
55
+ # group and the TERM/KILL ladder sweeps the whole subtree on timeout (EV-2sh8
56
+ # / GH #1330). The block keeps the marshal-pipe protocol: write a
57
+ # length-prefixed payload, then exit with the pass/fail code.
58
+ def spawn_child(read_io, write_io, sandbox_dir, mutation, test_command)
59
+ @supervisor.spawn(sandbox_dir: sandbox_dir) do
51
60
  ENV["TMPDIR"] = sandbox_dir
52
61
  # Path-relativizing mutations (e.g. File.join(dir, name) -> name) would
53
62
  # otherwise write into the parent's CWD (typically the repo root) and
54
63
  # leak past the run. chdir here keeps such writes inside sandbox_dir,
55
- # which the ensure block of #call removes. The in_isolated_worker! flag
64
+ # which the supervisor removes on reap. The in_isolated_worker! flag
56
65
  # signals the rest of evilution (SpecResolver/SpecSelector/SpecAstCache/
57
66
  # MutationApplier/SourceEvaluator/Integration) to anchor project-relative
58
67
  # paths to Evilution::PROJECT_ROOT instead of the sandbox CWD.
@@ -70,12 +79,20 @@ class Evilution::Isolation::Fork
70
79
  end
71
80
  end
72
81
 
73
- def cleanup_resources(read_io, write_io, pid, sandbox_dir)
82
+ # The parent owns read_io/write_io (write_io is closed right after spawn so
83
+ # read_io can see EOF), so they are closed here rather than handed to the
84
+ # supervisor. The supervisor reaps the child and removes the sandbox dir; on
85
+ # the early-failure path (binary_pipe raised before spawn) handle is nil, so
86
+ # the orphaned sandbox is removed directly.
87
+ def cleanup_resources(read_io, write_io, handle, sandbox_dir)
74
88
  read_io.close unless read_io.nil?
75
89
  write_io.close unless write_io.nil?
76
- ensure_reaped(pid)
90
+ if handle
91
+ @supervisor.terminate(handle, grace: GRACE_PERIOD)
92
+ elsif sandbox_dir
93
+ FileUtils.rm_rf(sandbox_dir)
94
+ end
77
95
  restore_original_source
78
- FileUtils.rm_rf(sandbox_dir) if sandbox_dir
79
96
  end
80
97
 
81
98
  def restore_original_source
@@ -109,21 +126,21 @@ class Evilution::Isolation::Fork
109
126
  # never sees EOF and hangs forever. The length prefix makes payload reads
110
127
  # bounded; the waitpid-WNOHANG check inside the poll loop lets us exit
111
128
  # promptly when the child died without writing anything.
112
- def wait_for_result(pid, read_io, timeout)
129
+ def wait_for_result(handle, read_io, timeout)
113
130
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
114
131
  loop do
115
132
  remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
116
- return timeout_result(pid) if remaining <= 0
133
+ return timeout_result(handle) if remaining <= 0
117
134
 
118
135
  if read_io.wait_readable([remaining, 0.5].min)
119
136
  payload = read_payload(read_io, deadline)
120
- return reap_and_decode(pid, payload) if payload
137
+ return reap_and_decode(handle, payload) if payload
121
138
  end
122
139
 
123
- next unless ::Process.waitpid(pid, ::Process::WNOHANG)
140
+ next unless @supervisor.reap_nonblock(handle)
124
141
 
125
142
  # Child exited. Drain any final payload that arrived between
126
- # wait_readable timeout and waitpid (race) before declaring empty.
143
+ # wait_readable timeout and the reap (race) before declaring empty.
127
144
  final = read_payload(read_io, Process.clock_gettime(Process::CLOCK_MONOTONIC) + 0.1)
128
145
  return decode_payload(final) if final
129
146
 
@@ -137,13 +154,13 @@ class Evilution::Isolation::Fork
137
154
  # in execute_in_child waiting on a subject grandchild the mutation broke.
138
155
  # wait_for_result has already returned by this point, so the per-mutation
139
156
  # timeout cannot fire. Bound the wait and fall back to the TERM/KILL ladder.
140
- def reap_and_decode(pid, payload)
157
+ def reap_and_decode(handle, payload)
141
158
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + REAP_DEADLINE
142
159
  loop do
143
- break if ::Process.waitpid(pid, ::Process::WNOHANG)
160
+ break if @supervisor.reap_nonblock(handle)
144
161
 
145
162
  if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline
146
- terminate_child(pid)
163
+ @supervisor.terminate(handle, grace: GRACE_PERIOD)
147
164
  break
148
165
  end
149
166
  sleep 0.05
@@ -194,40 +211,11 @@ class Evilution::Isolation::Fork
194
211
  { timeout: false, passed: false, error: "empty result from child" }
195
212
  end
196
213
 
197
- def timeout_result(pid)
198
- terminate_child(pid)
214
+ def timeout_result(handle)
215
+ @supervisor.terminate(handle, grace: GRACE_PERIOD)
199
216
  { timeout: true }
200
217
  end
201
218
 
202
- # Defensive reap: if normal control flow raised before wait_for_result
203
- # reaped the child (e.g. Marshal.load on corrupt payload), the child becomes
204
- # a zombie. Reuse terminate_child for the bounded TERM + GRACE_PERIOD + KILL
205
- # ladder so this never hangs the ensure path; swallow SystemCallError so
206
- # cleanup can't mask the primary failure.
207
- def ensure_reaped(pid)
208
- return unless pid
209
-
210
- reaped = ::Process.waitpid(pid, ::Process::WNOHANG)
211
- return if reaped
212
-
213
- terminate_child(pid)
214
- rescue SystemCallError
215
- nil
216
- end
217
-
218
- def terminate_child(pid)
219
- Evilution::ProcessCleanup.safe_kill("TERM", pid)
220
- _, status = ::Process.waitpid2(pid, ::Process::WNOHANG)
221
- return if status
222
-
223
- sleep(GRACE_PERIOD)
224
- _, status = ::Process.waitpid2(pid, ::Process::WNOHANG)
225
- return if status
226
-
227
- Evilution::ProcessCleanup.safe_kill("KILL", pid)
228
- Evilution::ProcessCleanup.safe_wait(pid)
229
- end
230
-
231
219
  def classify_status(result)
232
220
  return :timeout if result[:timeout]
233
221
  return :killed if result[:test_crashed]
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../dispatcher"
4
+
5
+ # Owns the per-worker item-timeout deadline clock for the Dispatcher: arming a
6
+ # worker's deadline when it goes busy, re-arming it on each result, surfacing the
7
+ # workers whose deadline has passed, and computing how long IO.select may block.
8
+ # Each worker carries its own deadline so a single stuck worker is reaped in
9
+ # isolation rather than aborting the whole pool (EV-gl1e). Pulling this cohesive
10
+ # timeout concern out of the Dispatcher keeps the dispatcher focused on the
11
+ # collect/recycle orchestration (EV-9mij).
12
+ #
13
+ # `workers` is the Dispatcher's live array (mutated in place as workers recycle),
14
+ # so the tracker always reads the current pool. `clock` is injectable for tests.
15
+ class Evilution::Parallel::WorkQueue::Dispatcher::DeadlineTracker
16
+ def initialize(item_timeout:, workers:, clock: -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) })
17
+ @item_timeout = item_timeout
18
+ @workers = workers
19
+ @clock = clock
20
+ end
21
+
22
+ def enabled?
23
+ !@item_timeout.nil?
24
+ end
25
+
26
+ # Seconds IO.select may block: until the nearest worker deadline (never
27
+ # negative), or the raw timeout when no worker is currently on the clock.
28
+ def select_timeout
29
+ return @item_timeout unless enabled?
30
+
31
+ deadlines = @workers.filter_map(&:deadline)
32
+ return @item_timeout if deadlines.empty?
33
+
34
+ [deadlines.min - now, 0].max
35
+ end
36
+
37
+ # Workers whose deadline has passed while still holding in-flight work.
38
+ def overdue
39
+ return [] unless enabled?
40
+
41
+ moment = now
42
+ @workers.select { |worker| worker.deadline && worker.deadline <= moment && worker.pending.positive? }
43
+ end
44
+
45
+ # Arm a worker's clock when it first goes busy; idempotent for the in-flight
46
+ # item so a refresh does not extend an already-running deadline.
47
+ def start(worker)
48
+ return unless enabled?
49
+
50
+ worker.deadline ||= now + @item_timeout
51
+ end
52
+
53
+ # After a result: re-arm while work remains, otherwise stop the clock.
54
+ def refresh(worker)
55
+ worker.deadline = (now + @item_timeout if enabled? && worker.pending.positive?)
56
+ end
57
+
58
+ private
59
+
60
+ def now
61
+ @clock.call
62
+ end
63
+ end
@@ -12,11 +12,11 @@ class Evilution::Parallel::WorkQueue::Dispatcher
12
12
  @workers = workers
13
13
  @items = items
14
14
  @prefetch = prefetch
15
- @item_timeout = item_timeout
16
15
  @worker_max_items = worker_max_items
17
16
  @recycle_factory = recycle_factory
18
17
  @state = Evilution::Parallel::WorkQueue.send(:const_get, :CollectionState).new(items.length)
19
18
  @retired = []
19
+ @deadlines = DeadlineTracker.new(item_timeout:, workers: @workers)
20
20
  end
21
21
 
22
22
  def run
@@ -38,24 +38,34 @@ class Evilution::Parallel::WorkQueue::Dispatcher
38
38
  end
39
39
  end
40
40
 
41
+ # Each worker carries its own deadline (set when it goes busy, refreshed on
42
+ # every result). The select blocks only until the nearest worker deadline,
43
+ # so a single stuck worker is reaped in isolation -- its in-flight item gets
44
+ # the WorkQueue::TIMED_OUT sentinel and the worker is recycled -- instead of
45
+ # the old pool-wide watchdog that SIGKILLed every worker and aborted the run.
41
46
  def collect
42
47
  io_to_worker = @workers.to_h { |w| [w.res_io, w] }
43
48
  result_ios = io_to_worker.keys
44
49
 
45
50
  while @state.in_flight.positive?
46
- readable, = IO.select(result_ios, nil, nil, @item_timeout)
47
- if readable.nil?
48
- record_timeout
49
- break
50
- end
51
+ readable, = IO.select(result_ios, nil, nil, @deadlines.select_timeout)
52
+ reap_timed_out(io_to_worker, result_ios)
53
+ next if readable.nil?
51
54
 
52
- readable.each { |io| process_readable(io, io_to_worker, result_ios) }
55
+ readable.each do |io|
56
+ process_readable(io, io_to_worker, result_ios) if result_ios.include?(io)
57
+ end
53
58
  end
54
59
  end
55
60
 
56
- def record_timeout
57
- terminate_stuck
58
- @state.first_error ||= Evilution::Error.new("worker timed out after #{@item_timeout}s")
61
+ def reap_timed_out(io_to_worker, result_ios)
62
+ @deadlines.overdue.each { |worker| time_out_worker(worker, io_to_worker, result_ios) }
63
+ end
64
+
65
+ def time_out_worker(worker, io_to_worker, result_ios)
66
+ worker.kill
67
+ mark_unfinished(worker, Evilution::Parallel::WorkQueue::TIMED_OUT)
68
+ retire_or_replace(worker, io_to_worker, result_ios)
59
69
  end
60
70
 
61
71
  def process_readable(io, io_to_worker, result_ios)
@@ -65,7 +75,7 @@ class Evilution::Parallel::WorkQueue::Dispatcher
65
75
 
66
76
  def handle(worker, io_to_worker, result_ios)
67
77
  message = worker.read_result
68
- return handle_dead(worker) if message.nil?
78
+ return handle_dead(worker, io_to_worker, result_ios) if message.nil?
69
79
 
70
80
  record(message, worker)
71
81
  return false if recycle_and_dispatch(worker, io_to_worker, result_ios)
@@ -82,13 +92,24 @@ class Evilution::Parallel::WorkQueue::Dispatcher
82
92
  @state.in_flight -= 1
83
93
  worker.pending -= 1
84
94
  worker.items_completed += 1
95
+ worker.in_flight_indices.delete(index)
96
+ @deadlines.refresh(worker)
97
+ end
98
+
99
+ # A worker that exited without replying loses only its in-flight item(s)
100
+ # (marked :died) and is recycled; the run continues rather than aborting.
101
+ def handle_dead(worker, io_to_worker, result_ios)
102
+ mark_unfinished(worker, Evilution::Parallel::WorkQueue::DIED)
103
+ retire_or_replace(worker, io_to_worker, result_ios)
104
+ false
85
105
  end
86
106
 
87
- def handle_dead(worker)
88
- @state.first_error ||= Evilution::Error.new("worker process exited unexpectedly")
107
+ def mark_unfinished(worker, sentinel)
108
+ worker.in_flight_indices.each { |index| @state.results[index] = sentinel }
89
109
  @state.in_flight -= worker.pending
90
110
  worker.pending = 0
91
- false
111
+ worker.in_flight_indices.clear
112
+ worker.deadline = nil
92
113
  end
93
114
 
94
115
  def draining_for_recycle?(worker)
@@ -113,28 +134,52 @@ class Evilution::Parallel::WorkQueue::Dispatcher
113
134
  end
114
135
 
115
136
  def recycle(old_worker, io_to_worker, result_ios)
116
- io_to_worker.delete(old_worker.res_io)
117
- result_ios.delete(old_worker.res_io)
118
- @retired << old_worker.retire
119
-
137
+ index = @workers.index(old_worker)
138
+ detach(old_worker, io_to_worker, result_ios)
120
139
  new_worker = @recycle_factory.call(old_worker)
121
- @workers[@workers.index(old_worker)] = new_worker
122
- io_to_worker[new_worker.res_io] = new_worker
123
- result_ios << new_worker.res_io
140
+ @workers[index] = new_worker
141
+ attach(new_worker, io_to_worker, result_ios)
124
142
  new_worker
125
143
  end
126
144
 
145
+ # Shared failure-path recovery: retire the worker, and as long as work
146
+ # remains spin up a replacement to keep the pool full and hand it the next
147
+ # item. When the queue is already drained, just drop the worker.
148
+ def retire_or_replace(worker, io_to_worker, result_ios)
149
+ index = @workers.index(worker)
150
+ detach(worker, io_to_worker, result_ios)
151
+
152
+ if more_to_send? && @state.first_error.nil?
153
+ new_worker = @recycle_factory.call(worker)
154
+ @workers[index] = new_worker
155
+ attach(new_worker, io_to_worker, result_ios)
156
+ send_item(new_worker)
157
+ else
158
+ @workers.delete_at(index)
159
+ end
160
+ end
161
+
162
+ def detach(worker, io_to_worker, result_ios)
163
+ io_to_worker.delete(worker.res_io)
164
+ result_ios.delete(worker.res_io)
165
+ @retired << worker.retire
166
+ end
167
+
168
+ def attach(worker, io_to_worker, result_ios)
169
+ io_to_worker[worker.res_io] = worker
170
+ result_ios << worker.res_io
171
+ end
172
+
127
173
  def send_item(worker)
128
174
  worker.send_item(@state.next_index, @items[@state.next_index])
129
175
  @state.next_index += 1
130
176
  @state.in_flight += 1
177
+ @deadlines.start(worker)
131
178
  end
132
179
 
133
180
  def more_to_send?
134
181
  @state.next_index < @items.length
135
182
  end
136
-
137
- def terminate_stuck
138
- @workers.each(&:kill)
139
- end
140
183
  end
184
+
185
+ require_relative "dispatcher/deadline_tracker"
@@ -2,23 +2,34 @@
2
2
 
3
3
  require_relative "../work_queue"
4
4
  require_relative "../../child_output"
5
+ require_relative "../../process_supervisor"
6
+ require_relative "../../temp_dir_tracker"
5
7
  require_relative "channel"
6
8
  require_relative "channel/frame"
7
9
 
8
10
  class Evilution::Parallel::WorkQueue::Worker
9
11
  Timing = Data.define(:busy, :wall)
10
12
 
11
- attr_reader :pid, :worker_index
12
- attr_accessor :items_completed, :pending, :busy_time, :wall_time
13
-
14
- def self.spawn(worker_index:, hooks:, &block)
13
+ attr_reader :pid, :worker_index, :in_flight_indices
14
+ attr_accessor :items_completed, :pending, :busy_time, :wall_time, :deadline
15
+
16
+ # EV-dg69 / EV-5rrh step 3: the supervisor owns the worker's process-group
17
+ # isolation, signal-safe registry, group-kill and reap. spawn passes
18
+ # isolate_in_child: false so the worker becomes its own group leader only
19
+ # parent-side, AFTER the supervisor has registered it -- preserving the
20
+ # EV-jwao register-before-isolate ordering (the trap can never see a leader
21
+ # missing from the registry). EV-cnx8 group-leadership (so #kill sweeps the
22
+ # whole subtree) is still established, now by the supervisor's parent-side
23
+ # setpgid.
24
+ def self.spawn(worker_index:, hooks:, supervisor: Evilution::ProcessSupervisor.new, &block)
15
25
  cmd_read, cmd_write = IO.pipe
16
26
  res_read, res_write = IO.pipe
17
27
  [cmd_read, cmd_write, res_read, res_write].each(&:binmode)
18
28
 
19
- pid = Process.fork do
29
+ handle = supervisor.spawn(isolate_in_child: false) do
20
30
  cmd_write.close
21
31
  res_read.close
32
+ install_child_signal_handlers
22
33
  ENV["TEST_ENV_NUMBER"] = test_env_number_for(worker_index)
23
34
  Evilution::ChildOutput.redirect!
24
35
  Loop.run(cmd_read, res_write, hooks: hooks, &block)
@@ -26,7 +37,25 @@ class Evilution::Parallel::WorkQueue::Worker
26
37
 
27
38
  cmd_read.close
28
39
  res_write.close
29
- new(pid: pid, cmd_write: cmd_write, res_read: res_read, worker_index: worker_index)
40
+ new(handle:, supervisor:, cmd_write:, res_read:, worker_index:)
41
+ end
42
+
43
+ # EV-7a91: a worker is the parent of the inner per-mutation Fork children it
44
+ # spawns, and those children are their own process-group leaders (EV-2sh8), so
45
+ # the Runner's group-kill of the worker never reaches them. On a terminal
46
+ # INT/TERM the worker must therefore tear down AND reap the inner children it
47
+ # owns before it dies, or they survive as zombies (their parent gone) until an
48
+ # ancestor exits. cleanup_all clears any per-mutation sandbox dirs the inner
49
+ # children registered in this worker's TempDirTracker.
50
+ def self.install_child_signal_handlers
51
+ %w[INT TERM].each do |sig|
52
+ Signal.trap(sig) do
53
+ Evilution::TempDirTracker.cleanup_all
54
+ Evilution::ProcessSupervisor.kill_and_reap_all
55
+ Signal.trap(sig, "DEFAULT")
56
+ Process.kill(sig, Process.pid)
57
+ end
58
+ end
30
59
  end
31
60
 
32
61
  # EV-kdns / GH #817: translate 0-based worker slot to parallel_tests'
@@ -37,8 +66,10 @@ class Evilution::Parallel::WorkQueue::Worker
37
66
  worker_index.zero? ? "" : (worker_index + 1).to_s
38
67
  end
39
68
 
40
- def initialize(pid:, cmd_write:, res_read:, worker_index:)
41
- @pid = pid
69
+ def initialize(handle:, supervisor:, cmd_write:, res_read:, worker_index:)
70
+ @handle = handle
71
+ @supervisor = supervisor
72
+ @pid = handle.pid
42
73
  @cmd_write = cmd_write
43
74
  @res_read = res_read
44
75
  @worker_index = worker_index
@@ -46,6 +77,8 @@ class Evilution::Parallel::WorkQueue::Worker
46
77
  @pending = 0
47
78
  @busy_time = 0.0
48
79
  @wall_time = 0.0
80
+ @in_flight_indices = []
81
+ @deadline = nil
49
82
  end
50
83
 
51
84
  def res_io
@@ -55,6 +88,7 @@ class Evilution::Parallel::WorkQueue::Worker
55
88
  def send_item(index, item)
56
89
  Evilution::Parallel::WorkQueue::Channel.write(@cmd_write, [index, item])
57
90
  @pending += 1
91
+ @in_flight_indices << index
58
92
  end
59
93
 
60
94
  def read_result
@@ -67,10 +101,11 @@ class Evilution::Parallel::WorkQueue::Worker
67
101
  nil
68
102
  end
69
103
 
104
+ # SIGKILL the worker's whole process group (negative pid), reaping any
105
+ # grandchildren it forked, with the bare pid as a fallback for the case where
106
+ # the group is gone (already reaped, or setpgid did not take).
70
107
  def kill
71
- Process.kill("KILL", @pid)
72
- rescue Errno::ESRCH
73
- nil
108
+ @supervisor.signal_group("KILL", @handle)
74
109
  end
75
110
 
76
111
  def close_pipes
@@ -78,10 +113,11 @@ class Evilution::Parallel::WorkQueue::Worker
78
113
  @res_read.close unless @res_read.closed?
79
114
  end
80
115
 
116
+ # Reap the leader and drop it from the registry so the trap never signals a
117
+ # group whose pid the OS may have recycled. ECHILD-tolerant; unregister is a
118
+ # no-op if it was never registered.
81
119
  def reap
82
- Process.wait(@pid)
83
- rescue Errno::ECHILD
84
- nil
120
+ @supervisor.reap(@handle)
85
121
  end
86
122
 
87
123
  def retire
@@ -9,6 +9,14 @@ class Evilution::Parallel::WorkQueue
9
9
 
10
10
  TIMING_GRACE_PERIOD = 5
11
11
 
12
+ # Sentinel results for items whose worker never produced a value. The
13
+ # dispatcher writes these into the results array (instead of aborting the
14
+ # whole run) so a single stuck/dead worker only loses its own in-flight
15
+ # item(s). Mutation-aware callers translate the reason into a status.
16
+ Unfinished = Data.define(:reason)
17
+ TIMED_OUT = Unfinished.new(reason: :timeout)
18
+ DIED = Unfinished.new(reason: :died)
19
+
12
20
  def initialize(size:, hooks: nil, prefetch: 1, item_timeout: nil, worker_max_items: nil)
13
21
  Validators::PositiveInt.call!(:size, size)
14
22
  Validators::PositiveInt.call!(:prefetch, prefetch)