polyrun 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +21 -0
  3. data/lib/polyrun/cli/ci_shard_hooks.rb +12 -4
  4. data/lib/polyrun/cli/ci_shard_run_command.rb +3 -1
  5. data/lib/polyrun/cli/help.rb +3 -0
  6. data/lib/polyrun/cli/helpers.rb +22 -0
  7. data/lib/polyrun/cli/run_shards_parallel_children.rb +26 -34
  8. data/lib/polyrun/cli/run_shards_parallel_wait.rb +267 -0
  9. data/lib/polyrun/cli/run_shards_plan_boot_phases.rb +34 -1
  10. data/lib/polyrun/cli/run_shards_plan_options.rb +6 -2
  11. data/lib/polyrun/cli/run_shards_run.rb +7 -33
  12. data/lib/polyrun/cli/run_shards_worker_interrupt.rb +75 -0
  13. data/lib/polyrun/coverage/collector_finish.rb +3 -2
  14. data/lib/polyrun/coverage/formatter.rb +2 -1
  15. data/lib/polyrun/coverage/merge/formatters_html.rb +191 -43
  16. data/lib/polyrun/coverage/merge/html/_file_list.html.erb +21 -0
  17. data/lib/polyrun/coverage/merge/html/_file_section.html.erb +26 -0
  18. data/lib/polyrun/coverage/merge/html/_groups_table.html.erb +18 -0
  19. data/lib/polyrun/coverage/merge/html/_overview.html.erb +47 -0
  20. data/lib/polyrun/coverage/merge/html/report.css +147 -0
  21. data/lib/polyrun/coverage/merge/html/report.js +48 -0
  22. data/lib/polyrun/coverage/merge/html/template.html.erb +30 -0
  23. data/lib/polyrun/coverage/track_files.rb +9 -0
  24. data/lib/polyrun/hooks.rb +9 -1
  25. data/lib/polyrun/log.rb +16 -0
  26. data/lib/polyrun/minitest.rb +34 -0
  27. data/lib/polyrun/quick/example_runner.rb +11 -0
  28. data/lib/polyrun/rspec.rb +18 -0
  29. data/lib/polyrun/version.rb +1 -1
  30. data/lib/polyrun/worker_ping.rb +74 -0
  31. data/sig/polyrun/minitest.rbs +2 -0
  32. data/sig/polyrun/rspec.rbs +4 -0
  33. data/sig/polyrun/worker_ping.rbs +10 -0
  34. metadata +12 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7666af9186562083f29dc56e6c867e48b877acdff6ad28ff8c351e8d3c308582
4
- data.tar.gz: 503f5435deb22112044f7841a82728e6782a770eb656859419e8412d623dcff0
3
+ metadata.gz: 1d4fc5867eb97f45848d6da2b7ac8d0c3906de5cd0df849f02042aaaee1e9bbf
4
+ data.tar.gz: a10e216d02b76c722627ab5d0c73e5fe061c118d14f356bdc620655a7e83454a
5
5
  SHA512:
6
- metadata.gz: d0d5d248f1e072c446049bafff111db6e29d784a4a0992528214a6e29cca7b156b67144e6b9c22fc71fa9143f110ac443cda2953219aa816c5562d3247b5e02b
7
- data.tar.gz: d0be776ce5d4a7a5acacfe237a4d07a47c078562aee6a054fb5b63a31a78e199bc2b5ee276b0a8e77a0f0cacd3e4f03cb72d8fd5037baf782d0efb8a1cdf1b04
6
+ metadata.gz: fd37e0e3c6f3afccb8da9dba32b0b836d6b0ddf25181b0bbc2b3d20be561d93518af54ffb929cb62511259a788ad080f9f7080252231d3554aee5ab056c3c841
7
+ data.tar.gz: 92cb1e5ded19005ccca5975754525857ae5f1b9561947b9a6d1b736e62c098a61e377e3465dba1add9d265277f44672911be48e27097b9c3813ef188341f3a94
data/CHANGELOG.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## Unreleased
4
+
5
+ ## 1.5.0 (2026-05-04)
6
+
7
+ - Add `run-shards --worker-timeout SEC` and `POLYRUN_WORKER_TIMEOUT_SEC` (wall time per worker since spawn); stop stuck workers; record exit 124 for that shard.
8
+ - Add `run-shards --worker-idle-timeout SEC` and `POLYRUN_WORKER_IDLE_TIMEOUT_SEC`; parent reads monotonic timestamps from `POLYRUN_WORKER_PING_FILE`; record exit 125 when the last ping is stale. Idle applies only after a valid positive ping (use wall timeout until the first ping).
9
+ - Add `Polyrun::WorkerPing` (`ping!`, `ensure_interval_ping_thread!` when `POLYRUN_WORKER_PING_THREAD`). Add `Polyrun::RSpec.install_worker_ping!` and `Polyrun::Minitest.install_worker_ping!`; Polyrun Quick calls `WorkerPing.ping!` around each example. Parent creates ping paths under `tmp/polyrun/` and unlinks files after workers exit.
10
+ - Poll every live shard worker together when timeouts are enabled so idle and wall limits apply to all children, not only the first waiter.
11
+ - Split parallel worker teardown into `RunShardsParallelWait` and `RunShardsWorkerInterrupt`; keep spawn logic in `RunShardsParallelChildren`.
12
+ - Add `Polyrun::Log.orchestration_warn`; when `POLYRUN_ORCHESTRATION_STDERR=1`, copy one line to process `$stderr` if `Log.stderr` is not the same object (custom/null sinks).
13
+ - Wire `env_worker_timeout_sec` / `env_worker_idle_timeout_sec` into `ci-shard-run` plan context. Rescue `Interrupt` around `after_suite` in `run-shards` and `ci-shard` orchestration where suite hooks run.
14
+ - In `Polyrun::Hooks#run_phase`, rescue `Interrupt` for Ruby DSL and shell hook phases (return 130).
15
+ - Document worker timeout, idle ping, and `POLYRUN_ORCHESTRATION_STDERR` in `polyrun help`. Add `sig/polyrun/worker_ping.rbs` and extend `Polyrun::RSpec` / `Polyrun::Minitest` installer signatures.
16
+
17
+ ## 1.4.2 (2026-04-24)
18
+
19
+ - Add richer HTML coverage reports: summary cards, group coverage, sortable file tables, project-relative paths, and per-file source detail.
20
+ - Refactor HTML coverage rendering into stdlib `ERB` templates with `_*.html.erb` partials and isolated `report.css` / `report.js` assets; inline assets into final standalone report.
21
+ - Fix `track_files` coverage scope in `Collector.finish`: keep only files matched by tracked globs, drop unrelated loaded runtime files, and add unloaded tracked files only for non-sharded runs.
22
+ - Add coverage specs for divergent `track_under` / `track_files` configs in serial and sharded finish paths; add `TrackFiles.keep_tracked_files`.
23
+
3
24
  ## 1.4.1 (2026-04-16)
4
25
 
5
26
  - Add `polyrun merge-failures` and `run-shards --merge-failures` / `--merge-failures-output` / `--merge-failures-format`; merge per-worker JSONL under `tmp/polyrun_failures/polyrun-failure-fragment-*.jsonl` (or RSpec JSON via `-i`). Run merge after all workers exit, including when a shard failed (`--merge-coverage` still runs only after all shards succeed).
data/lib/polyrun/cli/ci_shard_hooks.rb CHANGED
@@ -49,7 +49,11 @@ module Polyrun
49
49
  "POLYRUN_SHARD_TOTAL" => ctx[:workers].to_s,
50
50
  "POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
51
51
  )
52
- hook_cfg.run_phase_if_enabled(:after_suite, env_after)
52
+ begin
53
+ hook_cfg.run_phase_if_enabled(:after_suite, env_after)
54
+ rescue Interrupt
55
+ Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
56
+ end
53
57
  end
54
58
  end
55
59
  end
@@ -109,9 +113,13 @@ module Polyrun
109
113
  exit_code
110
114
  ensure
111
115
  if suite_started
112
- hook_cfg.run_phase_if_enabled(:after_suite, env_orch.merge(
113
- "POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
114
- ))
116
+ begin
117
+ hook_cfg.run_phase_if_enabled(:after_suite, env_orch.merge(
118
+ "POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
119
+ ))
120
+ rescue Interrupt
121
+ Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
122
+ end
115
123
  end
116
124
  end
117
125
  end
data/lib/polyrun/cli/ci_shard_run_command.rb CHANGED
@@ -66,7 +66,9 @@ module Polyrun
66
66
  merge_format: nil,
67
67
  config_path: config_path,
68
68
  matrix_shard_index: mx,
69
- matrix_shard_total: mt
69
+ matrix_shard_total: mt,
70
+ worker_timeout_sec: env_worker_timeout_sec,
71
+ worker_idle_timeout_sec: env_worker_idle_timeout_sec
70
72
  }
71
73
  end
72
74
 
data/lib/polyrun/cli/help.rb CHANGED
@@ -23,6 +23,9 @@ module Polyrun
23
23
  Warn if merge-coverage wall time exceeds N seconds (default 10): POLYRUN_MERGE_SLOW_WARN_SECONDS (0 disables)
24
24
  Failure fragments (run-shards --merge-failures): POLYRUN_MERGE_FAILURES=1; parent sets POLYRUN_FAILURE_FRAGMENTS=1 in workers; POLYRUN_FAILURE_FRAGMENT_DIR, POLYRUN_MERGED_FAILURES_OUT, POLYRUN_MERGED_FAILURES_FORMAT; after_suite sets POLYRUN_MERGED_FAILURES_PATH when merge ran
25
25
  Parallel RSpec workers: POLYRUN_WORKERS default 5, max 10 (run-shards / parallel-rspec / start); distinct from POLYRUN_SHARD_PROCESSES / ci-shard --shard-processes (local processes per CI matrix job)
26
+ Per-worker wall timeout: run-shards --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC (max time since each worker spawn). Parent polls all live workers together. Exit 124; remaining workers stopped.
27
+ Per-worker idle timeout: --worker-idle-timeout SEC or POLYRUN_WORKER_IDLE_TIMEOUT_SEC counts only after a successful ping timestamp (positive float in POLYRUN_WORKER_PING_FILE); empty or unreadable pings do not satisfy idle enforcement—use wall timeout until the first ping. RSpec/Minitest/Quick installers call Polyrun::WorkerPing.ping! per example/suite. Ping files live under tmp/polyrun/ (gitignored via tmp/); parent unlinks each after its worker exits. Exit 125. Optional outer cap: --worker-timeout (exit 124). Optional periodic pings: POLYRUN_WORKER_PING_THREAD=1 (POLYRUN_WORKER_PING_INTERVAL_SEC); WorkerPing.ensure_interval_ping_thread! (installers invoke it—call yourself if wiring workers without install_worker_ping!).
28
+ If Polyrun::Log.stderr is null or redirected away, set POLYRUN_ORCHESTRATION_STDERR=1 to also print timeout/SIGINT summary lines to process stderr.
26
29
  Partition timing granularity (default file): POLYRUN_TIMING_GRANULARITY=file|example (experimental per-example; see partition.timing_granularity)
27
30
 
28
31
  commands:
data/lib/polyrun/cli/helpers.rb CHANGED
@@ -11,6 +11,28 @@ module Polyrun
11
11
  Polyrun::Config::Resolver.env_int(name, fallback)
12
12
  end
13
13
 
14
+ # Per-worker wall clock (from spawn) for run-shards / ci-shard fan-out; unset or invalid means no limit.
15
+ def env_worker_timeout_sec
16
+ s = ENV["POLYRUN_WORKER_TIMEOUT_SEC"].to_s.strip
17
+ return nil if s.empty?
18
+
19
+ f = Float(s, exception: false)
20
+ return nil if f.nil? || f <= 0
21
+
22
+ f
23
+ end
24
+
25
+ # Max seconds without a new monotonic timestamp ping in the worker (see +polyrun/worker_ping+).
26
+ def env_worker_idle_timeout_sec
27
+ s = ENV["POLYRUN_WORKER_IDLE_TIMEOUT_SEC"].to_s.strip
28
+ return nil if s.empty?
29
+
30
+ f = Float(s, exception: false)
31
+ return nil if f.nil? || f <= 0
32
+
33
+ f
34
+ end
35
+
14
36
  def resolve_shard_index(pc)
15
37
  Polyrun::Config::Resolver.resolve_shard_index(pc)
16
38
  end
data/lib/polyrun/cli/run_shards_parallel_children.rb CHANGED
@@ -1,7 +1,13 @@
1
+ require "fileutils"
2
+
3
+ require_relative "run_shards_parallel_wait"
4
+
1
5
  module Polyrun
2
6
  class CLI
3
- # Spawns and waits on worker processes for +run-shards+ / +ci-shard-*+ fan-out.
7
+ # Spawns worker processes for +run-shards+ / +ci-shard-*+ fan-out. See {RunShardsParallelWait} for wait/timeout.
4
8
  module RunShardsParallelChildren
9
+ include RunShardsParallelWait
10
+
5
11
  private
6
12
 
7
13
  # @return [Array(Array, Integer, nil)] +[pids, spawn_error_code]+; +spawn_error_code+ is +nil+ when all spawns succeeded
@@ -45,9 +51,12 @@ module Polyrun
45
51
  child_env = child_env.merge("POLYRUN_HOOK_ORCHESTRATOR" => "0")
46
52
  child_env = hook_cfg.merge_worker_ruby_env(child_env)
47
53
 
54
+ ping_path = run_shards_prepare_worker_ping!(ctx, child_env, shard)
55
+
48
56
  Polyrun::Log.warn "polyrun run-shards: shard #{shard} → #{paths.size} file(s)" if @verbose
57
+ spawned_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
49
58
  pid = run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
50
- pids << {pid: pid, shard: shard}
59
+ pids << {pid: pid, shard: shard, spawned_at: spawned_at, ping_path: ping_path}
51
60
  Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.spawn shard=#{shard} child_pid=#{pid} spec_files=#{paths.size}")
52
61
  Polyrun::Log.warn "polyrun run-shards: started shard #{shard} pid=#{pid} (#{paths.size} file(s))" if parallel
53
62
  end
@@ -55,6 +64,21 @@ module Polyrun
55
64
  end
56
65
  # rubocop:enable Metrics/AbcSize
57
66
 
67
+ def run_shards_prepare_worker_ping!(ctx, child_env, shard)
68
+ idle_sec = ctx[:worker_idle_timeout_sec]
69
+ idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
70
+ return nil unless idle_sec
71
+
72
+ dir = File.join(Dir.pwd, "tmp", "polyrun")
73
+ FileUtils.mkdir_p(dir)
74
+ path = File.expand_path("worker-ping-#{$$}-#{shard}.txt", dir)
75
+ File.binwrite(path, "")
76
+ child_env["POLYRUN_WORKER_PING_FILE"] = path
77
+ interval = ENV["POLYRUN_WORKER_PING_INTERVAL_SEC"].to_s.strip
78
+ child_env["POLYRUN_WORKER_PING_INTERVAL_SEC"] = interval.empty? ? "15" : interval
79
+ path
80
+ end
81
+
58
82
  def run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
59
83
  if hook_cfg.worker_hooks? && !Polyrun::Hooks.disabled?
60
84
  Process.spawn(child_env, "sh", "-c", hook_cfg.build_worker_shell_script(cmd, paths))
@@ -62,38 +86,6 @@ module Polyrun
62
86
  Process.spawn(child_env, *cmd, *paths)
63
87
  end
64
88
  end
65
-
66
- # @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
67
- def run_shards_wait_all_children(pids, hook_cfg, ctx)
68
- workers = ctx[:workers]
69
- shard_results = []
70
- after_hook_err = 0
71
- Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
72
- pids.each do |h|
73
- Process.wait(h[:pid])
74
- exitstatus = $?.exitstatus
75
- ok = $?.success?
76
- Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
77
- env_after = ENV.to_h.merge(
78
- "POLYRUN_HOOK_ORCHESTRATOR" => "1",
79
- "POLYRUN_SHARD_INDEX" => h[:shard].to_s,
80
- "POLYRUN_SHARD_TOTAL" => workers.to_s,
81
- "POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
82
- )
83
- rc = hook_cfg.run_phase_if_enabled(:after_shard, env_after)
84
- after_hook_err = rc if rc != 0 && after_hook_err == 0
85
- shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
86
- end
87
- rescue Interrupt
88
- # Do not trap SIGINT: Process.wait raises Interrupt; a trap races and prints Interrupt + SystemExit traces.
89
- run_shards_shutdown_on_signal!(pids, 130)
90
- rescue SignalException => e
91
- raise unless e.signm == "SIGTERM"
92
-
93
- run_shards_shutdown_on_signal!(pids, 143)
94
- end
95
- [shard_results, after_hook_err]
96
- end
97
89
  end
98
90
  end
99
91
  end
data/lib/polyrun/cli/run_shards_parallel_wait.rb ADDED
@@ -0,0 +1,267 @@
1
+ # rubocop:disable Polyrun/FileLength, Metrics/ModuleLength -- wait loop + idle/wall flush (kept out of spawn module)
2
+ module Polyrun
3
+ class CLI
4
+ # Wait, wall/idle timeout, and +after_shard+ hooks for parallel workers (+run-shards+ / +ci-shard-*+).
5
+ module RunShardsParallelWait
6
+ WORKER_TIMEOUT_EXIT_STATUS = 124
7
+ WORKER_IDLE_TIMEOUT_EXIT_STATUS = 125
8
+
9
+ private
10
+
11
+ # @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
12
+ # rubocop:disable Metrics/AbcSize -- wait loop + timeout flush
13
+ def run_shards_wait_all_children(pids, hook_cfg, ctx)
14
+ workers = ctx[:workers]
15
+ shard_results = []
16
+ after_hook_err = 0
17
+ timeout_sec = ctx[:worker_timeout_sec]
18
+ timeout_sec = nil if timeout_sec.is_a?(Numeric) && timeout_sec <= 0
19
+ idle_sec = ctx[:worker_idle_timeout_sec]
20
+ idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
21
+
22
+ Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
23
+ if timeout_sec || idle_sec
24
+ run_shards_wait_all_children_multiplex(
25
+ pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err
26
+ )
27
+ else
28
+ run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
29
+ end
30
+ rescue Interrupt
31
+ run_shards_shutdown_on_signal!(pids, 130)
32
+ rescue SignalException => e
33
+ raise unless e.signm == "SIGTERM"
34
+
35
+ run_shards_shutdown_on_signal!(pids, 143)
36
+ end
37
+ end
38
+ # rubocop:enable Metrics/AbcSize
39
+
40
+ def run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
41
+ pids.each do |h|
42
+ Process.wait(h[:pid])
43
+ st = $?
44
+ after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
45
+ end
46
+ [shard_results, after_hook_err]
47
+ end
48
+
49
+ # Poll every live PID each tick so wall and idle timeouts apply to all workers, not only the first in wait order.
50
+ def run_shards_wait_all_children_multiplex(pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err)
51
+ pending = pids.dup
52
+
53
+ loop do
54
+ pending.delete_if do |h|
55
+ wpid = Process.wait(h[:pid], Process::WNOHANG)
56
+ next false unless wpid == h[:pid]
57
+
58
+ st = $?
59
+ after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
60
+ true
61
+ end
62
+
63
+ return [shard_results, after_hook_err] if pending.empty?
64
+
65
+ now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
66
+ violation = run_shards_timeout_violation(pids, pending, ctx, now, timeout_sec, idle_sec)
67
+ if violation
68
+ reason, timed_h = violation
69
+ others = pending.reject { |x| x[:pid] == timed_h[:pid] }
70
+ case reason
71
+ when :wall_timeout
72
+ return run_shards_wait_flush_after_worker_timeout!(
73
+ timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err
74
+ )
75
+ when :idle_timeout
76
+ return run_shards_wait_flush_after_worker_idle!(
77
+ timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err
78
+ )
79
+ end
80
+ end
81
+
82
+ sleep(0.2)
83
+ end
84
+ end
85
+
86
+ # @return [(Symbol, Hash), nil] e.g. +[:wall_timeout, h]+ when a limit is exceeded
87
+ def run_shards_timeout_violation(pids_order, pending, ctx, now, timeout_sec, idle_sec)
88
+ pids_order.each do |h|
89
+ next unless pending.any? { |p| p[:pid] == h[:pid] }
90
+
91
+ if timeout_sec && timeout_sec > 0
92
+ spawned_at = h[:spawned_at] || ctx[:run_t0]
93
+ return [:wall_timeout, h] if now >= spawned_at + timeout_sec
94
+ end
95
+ end
96
+
97
+ pids_order.each do |h|
98
+ next unless pending.any? { |p| p[:pid] == h[:pid] }
99
+
100
+ next unless idle_sec && idle_sec > 0 && h[:ping_path]
101
+
102
+ t, = run_shards_read_worker_ping_payload(h[:ping_path])
103
+ return [:idle_timeout, h] if t && (now - t) > idle_sec
104
+ end
105
+
106
+ nil
107
+ end
108
+
109
+ def run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
110
+ exitstatus = st.exitstatus
111
+ ok = st.success?
112
+ Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
113
+ rc = run_shards_invoke_after_shard!(hook_cfg, h[:shard], workers, exitstatus)
114
+ after_hook_err = rc if rc != 0 && after_hook_err == 0
115
+ shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
116
+ run_shards_unlink_ping_path(h[:ping_path])
117
+ after_hook_err
118
+ end
119
+
120
+ def run_shards_unlink_ping_path(path)
121
+ s = path.to_s.strip
122
+ return if s.empty?
123
+
124
+ File.unlink(s) if File.file?(s)
125
+ rescue SystemCallError
126
+ # best-effort cleanup of tmp/polyrun/worker-ping-*.txt
127
+ end
128
+
129
+ def run_shards_read_worker_ping_time(path)
130
+ run_shards_read_worker_ping_payload(path)[0]
131
+ end
132
+
133
+ # @return [Array(Float?, String?)] monotonic time and optional location line (path:line)
134
+ def run_shards_read_worker_ping_payload(path)
135
+ return [nil, nil] unless path && File.file?(path)
136
+
137
+ s = File.binread(path)
138
+ return [nil, nil] if s.nil? || s.strip.empty?
139
+
140
+ time_line, rest = s.split("\n", 2)
141
+ first = time_line.to_s.strip
142
+ return [nil, nil] if first.empty?
143
+
144
+ f = first.to_f
145
+ t = f.positive? ? f : nil
146
+ loc = rest.to_s.strip
147
+ loc = nil if loc.empty?
148
+ [t, loc]
149
+ rescue SystemCallError
150
+ [nil, nil]
151
+ end
152
+
153
+ def run_shards_wait_flush_after_worker_idle!(timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err)
154
+ run_shards_warn_worker_idle!(timed_h, ctx, idle_sec)
155
+ run_shards_force_stop_pid_status(timed_h[:pid])
156
+ run_shards_unlink_ping_path(timed_h[:ping_path])
157
+ rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_IDLE_TIMEOUT_EXIT_STATUS)
158
+ after_hook_err = rc if rc != 0 && after_hook_err == 0
159
+ shard_results << {shard: timed_h[:shard], exitstatus: WORKER_IDLE_TIMEOUT_EXIT_STATUS, success: false}
160
+ others.each do |h2|
161
+ st2 = run_shards_wait_or_force_stop_status(h2[:pid])
162
+ exit2 = st2&.exitstatus
163
+ ok2 = st2 ? st2.success? : false
164
+ exit2 = WORKER_IDLE_TIMEOUT_EXIT_STATUS if exit2.nil?
165
+ run_shards_unlink_ping_path(h2[:ping_path])
166
+ rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
167
+ after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
168
+ shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
169
+ end
170
+ [shard_results, after_hook_err]
171
+ end
172
+
173
+ def run_shards_wait_flush_after_worker_timeout!(timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err)
174
+ run_shards_warn_worker_timeout!(timed_h, ctx, timeout_sec)
175
+ run_shards_force_stop_pid_status(timed_h[:pid])
176
+ run_shards_unlink_ping_path(timed_h[:ping_path])
177
+ rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_TIMEOUT_EXIT_STATUS)
178
+ after_hook_err = rc if rc != 0 && after_hook_err == 0
179
+ shard_results << {shard: timed_h[:shard], exitstatus: WORKER_TIMEOUT_EXIT_STATUS, success: false}
180
+ others.each do |h2|
181
+ st2 = run_shards_wait_or_force_stop_status(h2[:pid])
182
+ exit2 = st2&.exitstatus
183
+ ok2 = st2 ? st2.success? : false
184
+ exit2 = WORKER_TIMEOUT_EXIT_STATUS if exit2.nil?
185
+ run_shards_unlink_ping_path(h2[:ping_path])
186
+ rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
187
+ after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
188
+ shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
189
+ end
190
+ [shard_results, after_hook_err]
191
+ end
192
+
193
+ def run_shards_invoke_after_shard!(hook_cfg, shard, workers, exitstatus)
194
+ env_after = ENV.to_h.merge(
195
+ "POLYRUN_HOOK_ORCHESTRATOR" => "1",
196
+ "POLYRUN_SHARD_INDEX" => shard.to_s,
197
+ "POLYRUN_SHARD_TOTAL" => workers.to_s,
198
+ "POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
199
+ )
200
+ hook_cfg.run_phase_if_enabled(:after_shard, env_after)
201
+ end
202
+
203
+ def run_shards_warn_worker_idle!(h, ctx, idle_sec)
204
+ paths = ctx[:plan].shard(h[:shard])
205
+ sample = paths.first(5).join(", ")
206
+ suffix =
207
+ if paths.empty?
208
+ " (no paths)"
209
+ elsif paths.size > 5
210
+ " (#{paths.size} files total)"
211
+ else
212
+ ""
213
+ end
214
+ _t, loc = run_shards_read_worker_ping_payload(h[:ping_path])
215
+ ping_suffix = (loc && !loc.to_s.strip.empty?) ? "; last ping #{loc.to_s.strip}" : ""
216
+ Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER IDLE TIMEOUT after #{idle_sec}s since last per-example progress ping — shard #{h[:shard]} pid #{h[:pid]}#{ping_suffix}."
217
+ Polyrun::Log.warn "polyrun run-shards: idle shard file sample: #{sample}#{suffix}"
218
+ Polyrun::Log.warn "polyrun run-shards: use Polyrun::RSpec.install_worker_ping! / Polyrun::Minitest.install_worker_ping! (Polyrun Quick calls ping! each example); exit #{WORKER_IDLE_TIMEOUT_EXIT_STATUS}."
219
+ end
220
+
221
+ def run_shards_wait_or_force_stop_status(pid)
222
+ wpid = Process.wait(pid, Process::WNOHANG)
223
+ return $? if wpid == pid
224
+
225
+ run_shards_force_stop_pid_status(pid)
226
+ rescue Errno::ECHILD
227
+ nil
228
+ end
229
+
230
+ def run_shards_force_stop_pid_status(pid)
231
+ Process.kill(:KILL, pid)
232
+ st = nil
233
+ begin
234
+ Process.wait(pid)
235
+ st = $?
236
+ rescue Errno::ECHILD
237
+ # child already reaped
238
+ end
239
+ st
240
+ rescue Errno::ESRCH
241
+ begin
242
+ Process.wait(pid)
243
+ $?
244
+ rescue Errno::ECHILD
245
+ nil
246
+ end
247
+ end
248
+
249
+ def run_shards_warn_worker_timeout!(h, ctx, timeout_sec)
250
+ paths = ctx[:plan].shard(h[:shard])
251
+ sample = paths.first(5).join(", ")
252
+ suffix =
253
+ if paths.empty?
254
+ " (no paths)"
255
+ elsif paths.size > 5
256
+ " (#{paths.size} files total)"
257
+ else
258
+ ""
259
+ end
260
+ Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER TIMEOUT after #{timeout_sec}s (wall time since worker spawn) — shard #{h[:shard]} pid #{h[:pid]}."
261
+ Polyrun::Log.warn "polyrun run-shards: timeout shard includes: #{sample}#{suffix}"
262
+ Polyrun::Log.warn "polyrun run-shards: override with --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC; recorded exit #{WORKER_TIMEOUT_EXIT_STATUS} for this worker."
263
+ end
264
+ end
265
+ end
266
+ end
267
+ # rubocop:enable Polyrun/FileLength, Metrics/ModuleLength
data/lib/polyrun/cli/run_shards_plan_boot_phases.rb CHANGED
@@ -19,6 +19,9 @@ module Polyrun
19
19
  err = run_shards_validate_cmd(cmd)
20
20
  return [:fail, err] if err
21
21
 
22
+ run_shards_normalize_worker_timeout_option!(o)
23
+ run_shards_normalize_worker_idle_timeout_option!(o)
24
+
22
25
  cmd = Shellwords.split(cmd.first) if cmd.size == 1 && cmd.first.include?(" ")
23
26
 
24
27
  [:ok, o, cmd]
@@ -118,9 +121,39 @@ module Polyrun
118
121
  merge_failures: run_shards_merge_failures_flag(o, cfg),
119
122
  merge_failures_output: run_shards_merge_failures_output_opt(o, cfg),
120
123
  merge_failures_format: run_shards_merge_failures_format_opt(o, cfg),
121
- config_path: config_path
124
+ config_path: config_path,
125
+ worker_timeout_sec: run_shards_resolved_worker_timeout_sec(o),
126
+ worker_idle_timeout_sec: run_shards_resolved_worker_idle_timeout_sec(o)
122
127
  }
123
128
  end
129
+
130
+ def run_shards_normalize_worker_idle_timeout_option!(o)
131
+ v = o[:worker_idle_timeout_sec]
132
+ return if v.nil?
133
+
134
+ o[:worker_idle_timeout_sec] = nil if v <= 0
135
+ end
136
+
137
+ def run_shards_resolved_worker_idle_timeout_sec(o)
138
+ cli = o[:worker_idle_timeout_sec]
139
+ return cli.to_f if cli.is_a?(Numeric) && cli > 0
140
+
141
+ env_worker_idle_timeout_sec
142
+ end
143
+
144
+ def run_shards_normalize_worker_timeout_option!(o)
145
+ v = o[:worker_timeout_sec]
146
+ return if v.nil?
147
+
148
+ o[:worker_timeout_sec] = nil if v <= 0
149
+ end
150
+
151
+ def run_shards_resolved_worker_timeout_sec(o)
152
+ cli = o[:worker_timeout_sec]
153
+ return cli.to_f if cli.is_a?(Numeric) && cli > 0
154
+
155
+ env_worker_timeout_sec
156
+ end
124
157
  end
125
158
  end
126
159
  end
data/lib/polyrun/cli/run_shards_plan_options.rb CHANGED
@@ -27,7 +27,9 @@ module Polyrun
27
27
  merge_format: nil,
28
28
  merge_failures: false,
29
29
  merge_failures_output: nil,
30
- merge_failures_format: nil
30
+ merge_failures_format: nil,
31
+ worker_timeout_sec: nil,
32
+ worker_idle_timeout_sec: nil
31
33
  }
32
34
  end
33
35
 
@@ -39,8 +41,10 @@ module Polyrun
39
41
 
40
42
  # rubocop:disable Metrics/AbcSize -- one argv block for run-shards
41
43
  def run_shards_plan_options_register!(opts, st)
42
- opts.banner = "usage: polyrun run-shards [--workers N] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
44
+ opts.banner = "usage: polyrun run-shards [--workers N] [--worker-timeout SEC] [--worker-idle-timeout SEC] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
43
45
  opts.on("--workers N", Integer) { |v| st[:workers] = v }
46
+ opts.on("--worker-timeout SEC", Float, "Max seconds per worker since spawn (also POLYRUN_WORKER_TIMEOUT_SEC); kills stuck workers (exit 124)") { |v| st[:worker_timeout_sec] = v }
47
+ opts.on("--worker-idle-timeout SEC", Float, "Max seconds since last valid WorkerPing timestamp in POLYRUN_WORKER_PING_FILE (needs prior ping); RSpec/Minitest: install_worker_ping!; Quick: automatic; exit 125") { |v| st[:worker_idle_timeout_sec] = v }
44
48
  opts.on("--strategy NAME", String) { |v| st[:strategy] = v }
45
49
  opts.on("--seed VAL") { |v| st[:seed] = v }
46
50
  opts.on("--paths-file PATH", String) { |v| st[:paths_file] = v }
data/lib/polyrun/cli/run_shards_run.rb CHANGED
@@ -2,6 +2,7 @@ require "shellwords"
2
2
  require "rbconfig"
3
3
 
4
4
  require_relative "run_shards_planning"
5
+ require_relative "run_shards_worker_interrupt"
5
6
  require_relative "run_shards_parallel_children"
6
7
 
7
8
  module Polyrun
@@ -9,6 +10,7 @@ module Polyrun
9
10
  # Partition + spawn workers for `polyrun run-shards` (keeps {RunShardsCommand} file small).
10
11
  module RunShardsRun
11
12
  include RunShardsPlanning
13
+ include RunShardsWorkerInterrupt
12
14
  include RunShardsParallelChildren
13
15
 
14
16
  private
@@ -93,7 +95,11 @@ module Polyrun
93
95
  "POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s,
94
96
  "POLYRUN_MERGED_FAILURES_PATH" => merged_failures_path.to_s
95
97
  )
96
- hook_cfg.run_phase_if_enabled(:after_suite, env_after)
98
+ begin
99
+ hook_cfg.run_phase_if_enabled(:after_suite, env_after)
100
+ rescue Interrupt
101
+ Polyrun::Log.warn "polyrun run-shards: after_suite hook interrupted; workers are stopped or were not started"
102
+ end
97
103
  end
98
104
  end
99
105
  end
@@ -106,38 +112,6 @@ module Polyrun
106
112
  Polyrun::Log.warn "polyrun run-shards: each worker prints its own summary line; the last \"N examples\" line is not a total across shards."
107
113
  end
108
114
 
109
- # Best-effort worker teardown then exit. Does not return.
110
- def run_shards_shutdown_on_signal!(pids, code)
111
- run_shards_terminate_children!(pids)
112
- exit(code)
113
- rescue Interrupt
114
- pids.each do |h|
115
- Process.kill(:KILL, h[:pid])
116
- rescue Errno::ESRCH
117
- # already reaped
118
- end
119
- pids.each do |h|
120
- Process.wait(h[:pid])
121
- rescue Errno::ESRCH, Errno::ECHILD, Interrupt
122
- # already reaped or give up
123
- end
124
- exit(code)
125
- end
126
-
127
- # Send SIGTERM to each worker PID and wait so Ctrl+C / SIGTERM does not leave orphans.
128
- def run_shards_terminate_children!(pids)
129
- pids.each do |h|
130
- Process.kill(:TERM, h[:pid])
131
- rescue Errno::ESRCH
132
- # already reaped
133
- end
134
- pids.each do |h|
135
- Process.wait(h[:pid])
136
- rescue Errno::ESRCH, Errno::ECHILD
137
- # already reaped
138
- end
139
- end
140
-
141
115
  def run_shards_merge_or_hint_coverage(ctx)
142
116
  if ctx[:merge_coverage]
143
117
  mo = ctx[:merge_output] || "coverage/merged.json"