polyrun 1.4.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/lib/polyrun/cli/ci_shard_hooks.rb +12 -4
- data/lib/polyrun/cli/ci_shard_run_command.rb +3 -1
- data/lib/polyrun/cli/help.rb +3 -0
- data/lib/polyrun/cli/helpers.rb +22 -0
- data/lib/polyrun/cli/run_shards_parallel_children.rb +26 -34
- data/lib/polyrun/cli/run_shards_parallel_wait.rb +267 -0
- data/lib/polyrun/cli/run_shards_plan_boot_phases.rb +34 -1
- data/lib/polyrun/cli/run_shards_plan_options.rb +6 -2
- data/lib/polyrun/cli/run_shards_run.rb +7 -33
- data/lib/polyrun/cli/run_shards_worker_interrupt.rb +75 -0
- data/lib/polyrun/coverage/merge/formatters_html.rb +4 -0
- data/lib/polyrun/hooks.rb +9 -1
- data/lib/polyrun/log.rb +16 -0
- data/lib/polyrun/minitest.rb +34 -0
- data/lib/polyrun/quick/example_runner.rb +11 -0
- data/lib/polyrun/rspec.rb +18 -0
- data/lib/polyrun/version.rb +1 -1
- data/lib/polyrun/worker_ping.rb +74 -0
- data/sig/polyrun/minitest.rbs +2 -0
- data/sig/polyrun/rspec.rbs +4 -0
- data/sig/polyrun/worker_ping.rbs +10 -0
- metadata +5 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1d4fc5867eb97f45848d6da2b7ac8d0c3906de5cd0df849f02042aaaee1e9bbf
|
|
4
|
+
data.tar.gz: a10e216d02b76c722627ab5d0c73e5fe061c118d14f356bdc620655a7e83454a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fd37e0e3c6f3afccb8da9dba32b0b836d6b0ddf25181b0bbc2b3d20be561d93518af54ffb929cb62511259a788ad080f9f7080252231d3554aee5ab056c3c841
|
|
7
|
+
data.tar.gz: 92cb1e5ded19005ccca5975754525857ae5f1b9561947b9a6d1b736e62c098a61e377e3465dba1add9d265277f44672911be48e27097b9c3813ef188341f3a94
|
data/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
## Unreleased
|
|
4
4
|
|
|
5
|
+
## 1.5.0 (2026-05-04)
|
|
6
|
+
|
|
7
|
+
- Add `run-shards --worker-timeout SEC` and `POLYRUN_WORKER_TIMEOUT_SEC` (wall time per worker since spawn); stop stuck workers; record exit 124 for that shard.
|
|
8
|
+
- Add `run-shards --worker-idle-timeout SEC` and `POLYRUN_WORKER_IDLE_TIMEOUT_SEC`; parent reads monotonic timestamps from `POLYRUN_WORKER_PING_FILE`; record exit 125 when the last ping is stale. Idle applies only after a valid positive ping (use wall timeout until the first ping).
|
|
9
|
+
- Add `Polyrun::WorkerPing` (`ping!`, `ensure_interval_ping_thread!` when `POLYRUN_WORKER_PING_THREAD`). Add `Polyrun::RSpec.install_worker_ping!` and `Polyrun::Minitest.install_worker_ping!`; Polyrun Quick calls `WorkerPing.ping!` around each example. Parent creates ping paths under `tmp/polyrun/` and unlinks files after workers exit.
|
|
10
|
+
- Poll every live shard worker together when timeouts are enabled so idle and wall limits apply to all children, not only the first waiter.
|
|
11
|
+
- Split parallel worker teardown into `RunShardsParallelWait` and `RunShardsWorkerInterrupt`; keep spawn logic in `RunShardsParallelChildren`.
|
|
12
|
+
- Add `Polyrun::Log.orchestration_warn`; when `POLYRUN_ORCHESTRATION_STDERR=1`, copy one line to process `$stderr` if `Log.stderr` is not the same object (custom/null sinks).
|
|
13
|
+
- Wire `env_worker_timeout_sec` / `env_worker_idle_timeout_sec` into `ci-shard-run` plan context. Rescue `Interrupt` around `after_suite` in `run-shards` and `ci-shard` orchestration where suite hooks run.
|
|
14
|
+
- In `Polyrun::Hooks#run_phase`, rescue `Interrupt` for Ruby DSL and shell hook phases (return 130).
|
|
15
|
+
- Document worker timeout, idle ping, and `POLYRUN_ORCHESTRATION_STDERR` in `polyrun help`. Add `sig/polyrun/worker_ping.rbs` and extend `Polyrun::RSpec` / `Polyrun::Minitest` installer signatures.
|
|
16
|
+
|
|
5
17
|
## 1.4.2 (2026-04-24)
|
|
6
18
|
|
|
7
19
|
- Add richer HTML coverage reports: summary cards, group coverage, sortable file tables, project-relative paths, and per-file source detail.
|
|
@@ -49,7 +49,11 @@ module Polyrun
|
|
|
49
49
|
"POLYRUN_SHARD_TOTAL" => ctx[:workers].to_s,
|
|
50
50
|
"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
|
|
51
51
|
)
|
|
52
|
-
|
|
52
|
+
begin
|
|
53
|
+
hook_cfg.run_phase_if_enabled(:after_suite, env_after)
|
|
54
|
+
rescue Interrupt
|
|
55
|
+
Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
|
|
56
|
+
end
|
|
53
57
|
end
|
|
54
58
|
end
|
|
55
59
|
end
|
|
@@ -109,9 +113,13 @@ module Polyrun
|
|
|
109
113
|
exit_code
|
|
110
114
|
ensure
|
|
111
115
|
if suite_started
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
116
|
+
begin
|
|
117
|
+
hook_cfg.run_phase_if_enabled(:after_suite, env_orch.merge(
|
|
118
|
+
"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s
|
|
119
|
+
))
|
|
120
|
+
rescue Interrupt
|
|
121
|
+
Polyrun::Log.warn "polyrun ci-shard: after_suite hook interrupted"
|
|
122
|
+
end
|
|
115
123
|
end
|
|
116
124
|
end
|
|
117
125
|
end
|
|
@@ -66,7 +66,9 @@ module Polyrun
|
|
|
66
66
|
merge_format: nil,
|
|
67
67
|
config_path: config_path,
|
|
68
68
|
matrix_shard_index: mx,
|
|
69
|
-
matrix_shard_total: mt
|
|
69
|
+
matrix_shard_total: mt,
|
|
70
|
+
worker_timeout_sec: env_worker_timeout_sec,
|
|
71
|
+
worker_idle_timeout_sec: env_worker_idle_timeout_sec
|
|
70
72
|
}
|
|
71
73
|
end
|
|
72
74
|
|
data/lib/polyrun/cli/help.rb
CHANGED
|
@@ -23,6 +23,9 @@ module Polyrun
|
|
|
23
23
|
Warn if merge-coverage wall time exceeds N seconds (default 10): POLYRUN_MERGE_SLOW_WARN_SECONDS (0 disables)
|
|
24
24
|
Failure fragments (run-shards --merge-failures): POLYRUN_MERGE_FAILURES=1; parent sets POLYRUN_FAILURE_FRAGMENTS=1 in workers; POLYRUN_FAILURE_FRAGMENT_DIR, POLYRUN_MERGED_FAILURES_OUT, POLYRUN_MERGED_FAILURES_FORMAT; after_suite sets POLYRUN_MERGED_FAILURES_PATH when merge ran
|
|
25
25
|
Parallel RSpec workers: POLYRUN_WORKERS default 5, max 10 (run-shards / parallel-rspec / start); distinct from POLYRUN_SHARD_PROCESSES / ci-shard --shard-processes (local processes per CI matrix job)
|
|
26
|
+
Per-worker wall timeout: run-shards --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC (max time since each worker spawn). Parent polls all live workers together. Exit 124; remaining workers stopped.
|
|
27
|
+
Per-worker idle timeout: --worker-idle-timeout SEC or POLYRUN_WORKER_IDLE_TIMEOUT_SEC counts only after a successful ping timestamp (positive float in POLYRUN_WORKER_PING_FILE); empty or unreadable pings do not satisfy idle enforcement—use wall timeout until the first ping. RSpec/Minitest/Quick installers call Polyrun::WorkerPing.ping! per example/suite. Ping files live under tmp/polyrun/ (gitignored via tmp/); parent unlinks each after its worker exits. Exit 125. Optional outer cap: --worker-timeout (exit 124). Optional periodic pings: POLYRUN_WORKER_PING_THREAD=1 (POLYRUN_WORKER_PING_INTERVAL_SEC); WorkerPing.ensure_interval_ping_thread! (installers invoke it—call yourself if wiring workers without install_worker_ping!).
|
|
28
|
+
If Polyrun::Log.stderr is null or redirected away, set POLYRUN_ORCHESTRATION_STDERR=1 to also print timeout/SIGINT summary lines to process stderr.
|
|
26
29
|
Partition timing granularity (default file): POLYRUN_TIMING_GRANULARITY=file|example (experimental per-example; see partition.timing_granularity)
|
|
27
30
|
|
|
28
31
|
commands:
|
data/lib/polyrun/cli/helpers.rb
CHANGED
|
@@ -11,6 +11,28 @@ module Polyrun
|
|
|
11
11
|
Polyrun::Config::Resolver.env_int(name, fallback)
|
|
12
12
|
end
|
|
13
13
|
|
|
14
|
+
# Per-worker wall clock (from spawn) for run-shards / ci-shard fan-out; unset or invalid means no limit.
|
|
15
|
+
def env_worker_timeout_sec
|
|
16
|
+
s = ENV["POLYRUN_WORKER_TIMEOUT_SEC"].to_s.strip
|
|
17
|
+
return nil if s.empty?
|
|
18
|
+
|
|
19
|
+
f = Float(s, exception: false)
|
|
20
|
+
return nil if f.nil? || f <= 0
|
|
21
|
+
|
|
22
|
+
f
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Max seconds without a new monotonic timestamp ping in the worker (see +polyrun/worker_ping+).
|
|
26
|
+
def env_worker_idle_timeout_sec
|
|
27
|
+
s = ENV["POLYRUN_WORKER_IDLE_TIMEOUT_SEC"].to_s.strip
|
|
28
|
+
return nil if s.empty?
|
|
29
|
+
|
|
30
|
+
f = Float(s, exception: false)
|
|
31
|
+
return nil if f.nil? || f <= 0
|
|
32
|
+
|
|
33
|
+
f
|
|
34
|
+
end
|
|
35
|
+
|
|
14
36
|
def resolve_shard_index(pc)
|
|
15
37
|
Polyrun::Config::Resolver.resolve_shard_index(pc)
|
|
16
38
|
end
|
|
@@ -1,7 +1,13 @@
|
|
|
1
|
+
require "fileutils"
|
|
2
|
+
|
|
3
|
+
require_relative "run_shards_parallel_wait"
|
|
4
|
+
|
|
1
5
|
module Polyrun
|
|
2
6
|
class CLI
|
|
3
|
-
# Spawns
|
|
7
|
+
# Spawns worker processes for +run-shards+ / +ci-shard-*+ fan-out. See {RunShardsParallelWait} for wait/timeout.
|
|
4
8
|
module RunShardsParallelChildren
|
|
9
|
+
include RunShardsParallelWait
|
|
10
|
+
|
|
5
11
|
private
|
|
6
12
|
|
|
7
13
|
# @return [Array(Array, Integer, nil)] +[pids, spawn_error_code]+; +spawn_error_code+ is +nil+ when all spawns succeeded
|
|
@@ -45,9 +51,12 @@ module Polyrun
|
|
|
45
51
|
child_env = child_env.merge("POLYRUN_HOOK_ORCHESTRATOR" => "0")
|
|
46
52
|
child_env = hook_cfg.merge_worker_ruby_env(child_env)
|
|
47
53
|
|
|
54
|
+
ping_path = run_shards_prepare_worker_ping!(ctx, child_env, shard)
|
|
55
|
+
|
|
48
56
|
Polyrun::Log.warn "polyrun run-shards: shard #{shard} → #{paths.size} file(s)" if @verbose
|
|
57
|
+
spawned_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
49
58
|
pid = run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
|
|
50
|
-
pids << {pid: pid, shard: shard}
|
|
59
|
+
pids << {pid: pid, shard: shard, spawned_at: spawned_at, ping_path: ping_path}
|
|
51
60
|
Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.spawn shard=#{shard} child_pid=#{pid} spec_files=#{paths.size}")
|
|
52
61
|
Polyrun::Log.warn "polyrun run-shards: started shard #{shard} pid=#{pid} (#{paths.size} file(s))" if parallel
|
|
53
62
|
end
|
|
@@ -55,6 +64,21 @@ module Polyrun
|
|
|
55
64
|
end
|
|
56
65
|
# rubocop:enable Metrics/AbcSize
|
|
57
66
|
|
|
67
|
+
def run_shards_prepare_worker_ping!(ctx, child_env, shard)
|
|
68
|
+
idle_sec = ctx[:worker_idle_timeout_sec]
|
|
69
|
+
idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
|
|
70
|
+
return nil unless idle_sec
|
|
71
|
+
|
|
72
|
+
dir = File.join(Dir.pwd, "tmp", "polyrun")
|
|
73
|
+
FileUtils.mkdir_p(dir)
|
|
74
|
+
path = File.expand_path("worker-ping-#{$$}-#{shard}.txt", dir)
|
|
75
|
+
File.binwrite(path, "")
|
|
76
|
+
child_env["POLYRUN_WORKER_PING_FILE"] = path
|
|
77
|
+
interval = ENV["POLYRUN_WORKER_PING_INTERVAL_SEC"].to_s.strip
|
|
78
|
+
child_env["POLYRUN_WORKER_PING_INTERVAL_SEC"] = interval.empty? ? "15" : interval
|
|
79
|
+
path
|
|
80
|
+
end
|
|
81
|
+
|
|
58
82
|
def run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
|
|
59
83
|
if hook_cfg.worker_hooks? && !Polyrun::Hooks.disabled?
|
|
60
84
|
Process.spawn(child_env, "sh", "-c", hook_cfg.build_worker_shell_script(cmd, paths))
|
|
@@ -62,38 +86,6 @@ module Polyrun
|
|
|
62
86
|
Process.spawn(child_env, *cmd, *paths)
|
|
63
87
|
end
|
|
64
88
|
end
|
|
65
|
-
|
|
66
|
-
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
|
|
67
|
-
def run_shards_wait_all_children(pids, hook_cfg, ctx)
|
|
68
|
-
workers = ctx[:workers]
|
|
69
|
-
shard_results = []
|
|
70
|
-
after_hook_err = 0
|
|
71
|
-
Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
|
|
72
|
-
pids.each do |h|
|
|
73
|
-
Process.wait(h[:pid])
|
|
74
|
-
exitstatus = $?.exitstatus
|
|
75
|
-
ok = $?.success?
|
|
76
|
-
Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
|
|
77
|
-
env_after = ENV.to_h.merge(
|
|
78
|
-
"POLYRUN_HOOK_ORCHESTRATOR" => "1",
|
|
79
|
-
"POLYRUN_SHARD_INDEX" => h[:shard].to_s,
|
|
80
|
-
"POLYRUN_SHARD_TOTAL" => workers.to_s,
|
|
81
|
-
"POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
|
|
82
|
-
)
|
|
83
|
-
rc = hook_cfg.run_phase_if_enabled(:after_shard, env_after)
|
|
84
|
-
after_hook_err = rc if rc != 0 && after_hook_err == 0
|
|
85
|
-
shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
|
|
86
|
-
end
|
|
87
|
-
rescue Interrupt
|
|
88
|
-
# Do not trap SIGINT: Process.wait raises Interrupt; a trap races and prints Interrupt + SystemExit traces.
|
|
89
|
-
run_shards_shutdown_on_signal!(pids, 130)
|
|
90
|
-
rescue SignalException => e
|
|
91
|
-
raise unless e.signm == "SIGTERM"
|
|
92
|
-
|
|
93
|
-
run_shards_shutdown_on_signal!(pids, 143)
|
|
94
|
-
end
|
|
95
|
-
[shard_results, after_hook_err]
|
|
96
|
-
end
|
|
97
89
|
end
|
|
98
90
|
end
|
|
99
91
|
end
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# rubocop:disable Polyrun/FileLength, Metrics/ModuleLength -- wait loop + idle/wall flush (kept out of spawn module)
|
|
2
|
+
module Polyrun
|
|
3
|
+
class CLI
|
|
4
|
+
# Wait, wall/idle timeout, and +after_shard+ hooks for parallel workers (+run-shards+ / +ci-shard-*+).
|
|
5
|
+
module RunShardsParallelWait
|
|
6
|
+
WORKER_TIMEOUT_EXIT_STATUS = 124
|
|
7
|
+
WORKER_IDLE_TIMEOUT_EXIT_STATUS = 125
|
|
8
|
+
|
|
9
|
+
private
|
|
10
|
+
|
|
11
|
+
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
|
|
12
|
+
# rubocop:disable Metrics/AbcSize -- wait loop + timeout flush
|
|
13
|
+
def run_shards_wait_all_children(pids, hook_cfg, ctx)
|
|
14
|
+
workers = ctx[:workers]
|
|
15
|
+
shard_results = []
|
|
16
|
+
after_hook_err = 0
|
|
17
|
+
timeout_sec = ctx[:worker_timeout_sec]
|
|
18
|
+
timeout_sec = nil if timeout_sec.is_a?(Numeric) && timeout_sec <= 0
|
|
19
|
+
idle_sec = ctx[:worker_idle_timeout_sec]
|
|
20
|
+
idle_sec = nil if idle_sec.is_a?(Numeric) && idle_sec <= 0
|
|
21
|
+
|
|
22
|
+
Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
|
|
23
|
+
if timeout_sec || idle_sec
|
|
24
|
+
run_shards_wait_all_children_multiplex(
|
|
25
|
+
pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err
|
|
26
|
+
)
|
|
27
|
+
else
|
|
28
|
+
run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
|
|
29
|
+
end
|
|
30
|
+
rescue Interrupt
|
|
31
|
+
run_shards_shutdown_on_signal!(pids, 130)
|
|
32
|
+
rescue SignalException => e
|
|
33
|
+
raise unless e.signm == "SIGTERM"
|
|
34
|
+
|
|
35
|
+
run_shards_shutdown_on_signal!(pids, 143)
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
# rubocop:enable Metrics/AbcSize
|
|
39
|
+
|
|
40
|
+
def run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
|
|
41
|
+
pids.each do |h|
|
|
42
|
+
Process.wait(h[:pid])
|
|
43
|
+
st = $?
|
|
44
|
+
after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
|
|
45
|
+
end
|
|
46
|
+
[shard_results, after_hook_err]
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Poll every live PID each tick so wall and idle timeouts apply to all workers, not only the first in wait order.
|
|
50
|
+
def run_shards_wait_all_children_multiplex(pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err)
|
|
51
|
+
pending = pids.dup
|
|
52
|
+
|
|
53
|
+
loop do
|
|
54
|
+
pending.delete_if do |h|
|
|
55
|
+
wpid = Process.wait(h[:pid], Process::WNOHANG)
|
|
56
|
+
next false unless wpid == h[:pid]
|
|
57
|
+
|
|
58
|
+
st = $?
|
|
59
|
+
after_hook_err = run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
|
|
60
|
+
true
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
return [shard_results, after_hook_err] if pending.empty?
|
|
64
|
+
|
|
65
|
+
now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
66
|
+
violation = run_shards_timeout_violation(pids, pending, ctx, now, timeout_sec, idle_sec)
|
|
67
|
+
if violation
|
|
68
|
+
reason, timed_h = violation
|
|
69
|
+
others = pending.reject { |x| x[:pid] == timed_h[:pid] }
|
|
70
|
+
case reason
|
|
71
|
+
when :wall_timeout
|
|
72
|
+
return run_shards_wait_flush_after_worker_timeout!(
|
|
73
|
+
timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err
|
|
74
|
+
)
|
|
75
|
+
when :idle_timeout
|
|
76
|
+
return run_shards_wait_flush_after_worker_idle!(
|
|
77
|
+
timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err
|
|
78
|
+
)
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
sleep(0.2)
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# @return [(Symbol, Hash), nil] e.g. +[:wall_timeout, h]+ when a limit is exceeded
|
|
87
|
+
def run_shards_timeout_violation(pids_order, pending, ctx, now, timeout_sec, idle_sec)
|
|
88
|
+
pids_order.each do |h|
|
|
89
|
+
next unless pending.any? { |p| p[:pid] == h[:pid] }
|
|
90
|
+
|
|
91
|
+
if timeout_sec && timeout_sec > 0
|
|
92
|
+
spawned_at = h[:spawned_at] || ctx[:run_t0]
|
|
93
|
+
return [:wall_timeout, h] if now >= spawned_at + timeout_sec
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
pids_order.each do |h|
|
|
98
|
+
next unless pending.any? { |p| p[:pid] == h[:pid] }
|
|
99
|
+
|
|
100
|
+
next unless idle_sec && idle_sec > 0 && h[:ping_path]
|
|
101
|
+
|
|
102
|
+
t, = run_shards_read_worker_ping_payload(h[:ping_path])
|
|
103
|
+
return [:idle_timeout, h] if t && (now - t) > idle_sec
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
nil
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
|
|
110
|
+
exitstatus = st.exitstatus
|
|
111
|
+
ok = st.success?
|
|
112
|
+
Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
|
|
113
|
+
rc = run_shards_invoke_after_shard!(hook_cfg, h[:shard], workers, exitstatus)
|
|
114
|
+
after_hook_err = rc if rc != 0 && after_hook_err == 0
|
|
115
|
+
shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
|
|
116
|
+
run_shards_unlink_ping_path(h[:ping_path])
|
|
117
|
+
after_hook_err
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def run_shards_unlink_ping_path(path)
|
|
121
|
+
s = path.to_s.strip
|
|
122
|
+
return if s.empty?
|
|
123
|
+
|
|
124
|
+
File.unlink(s) if File.file?(s)
|
|
125
|
+
rescue SystemCallError
|
|
126
|
+
# best-effort cleanup of tmp/polyrun/worker-ping-*.txt
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
def run_shards_read_worker_ping_time(path)
|
|
130
|
+
run_shards_read_worker_ping_payload(path)[0]
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# @return [Array(Float?, String?)] monotonic time and optional location line (path:line)
|
|
134
|
+
def run_shards_read_worker_ping_payload(path)
|
|
135
|
+
return [nil, nil] unless path && File.file?(path)
|
|
136
|
+
|
|
137
|
+
s = File.binread(path)
|
|
138
|
+
return [nil, nil] if s.nil? || s.strip.empty?
|
|
139
|
+
|
|
140
|
+
time_line, rest = s.split("\n", 2)
|
|
141
|
+
first = time_line.to_s.strip
|
|
142
|
+
return [nil, nil] if first.empty?
|
|
143
|
+
|
|
144
|
+
f = first.to_f
|
|
145
|
+
t = f.positive? ? f : nil
|
|
146
|
+
loc = rest.to_s.strip
|
|
147
|
+
loc = nil if loc.empty?
|
|
148
|
+
[t, loc]
|
|
149
|
+
rescue SystemCallError
|
|
150
|
+
[nil, nil]
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def run_shards_wait_flush_after_worker_idle!(timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err)
|
|
154
|
+
run_shards_warn_worker_idle!(timed_h, ctx, idle_sec)
|
|
155
|
+
run_shards_force_stop_pid_status(timed_h[:pid])
|
|
156
|
+
run_shards_unlink_ping_path(timed_h[:ping_path])
|
|
157
|
+
rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_IDLE_TIMEOUT_EXIT_STATUS)
|
|
158
|
+
after_hook_err = rc if rc != 0 && after_hook_err == 0
|
|
159
|
+
shard_results << {shard: timed_h[:shard], exitstatus: WORKER_IDLE_TIMEOUT_EXIT_STATUS, success: false}
|
|
160
|
+
others.each do |h2|
|
|
161
|
+
st2 = run_shards_wait_or_force_stop_status(h2[:pid])
|
|
162
|
+
exit2 = st2&.exitstatus
|
|
163
|
+
ok2 = st2 ? st2.success? : false
|
|
164
|
+
exit2 = WORKER_IDLE_TIMEOUT_EXIT_STATUS if exit2.nil?
|
|
165
|
+
run_shards_unlink_ping_path(h2[:ping_path])
|
|
166
|
+
rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
|
|
167
|
+
after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
|
|
168
|
+
shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
|
|
169
|
+
end
|
|
170
|
+
[shard_results, after_hook_err]
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def run_shards_wait_flush_after_worker_timeout!(timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err)
|
|
174
|
+
run_shards_warn_worker_timeout!(timed_h, ctx, timeout_sec)
|
|
175
|
+
run_shards_force_stop_pid_status(timed_h[:pid])
|
|
176
|
+
run_shards_unlink_ping_path(timed_h[:ping_path])
|
|
177
|
+
rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, WORKER_TIMEOUT_EXIT_STATUS)
|
|
178
|
+
after_hook_err = rc if rc != 0 && after_hook_err == 0
|
|
179
|
+
shard_results << {shard: timed_h[:shard], exitstatus: WORKER_TIMEOUT_EXIT_STATUS, success: false}
|
|
180
|
+
others.each do |h2|
|
|
181
|
+
st2 = run_shards_wait_or_force_stop_status(h2[:pid])
|
|
182
|
+
exit2 = st2&.exitstatus
|
|
183
|
+
ok2 = st2 ? st2.success? : false
|
|
184
|
+
exit2 = WORKER_TIMEOUT_EXIT_STATUS if exit2.nil?
|
|
185
|
+
run_shards_unlink_ping_path(h2[:ping_path])
|
|
186
|
+
rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
|
|
187
|
+
after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
|
|
188
|
+
shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
|
|
189
|
+
end
|
|
190
|
+
[shard_results, after_hook_err]
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def run_shards_invoke_after_shard!(hook_cfg, shard, workers, exitstatus)
|
|
194
|
+
env_after = ENV.to_h.merge(
|
|
195
|
+
"POLYRUN_HOOK_ORCHESTRATOR" => "1",
|
|
196
|
+
"POLYRUN_SHARD_INDEX" => shard.to_s,
|
|
197
|
+
"POLYRUN_SHARD_TOTAL" => workers.to_s,
|
|
198
|
+
"POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
|
|
199
|
+
)
|
|
200
|
+
hook_cfg.run_phase_if_enabled(:after_shard, env_after)
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def run_shards_warn_worker_idle!(h, ctx, idle_sec)
|
|
204
|
+
paths = ctx[:plan].shard(h[:shard])
|
|
205
|
+
sample = paths.first(5).join(", ")
|
|
206
|
+
suffix =
|
|
207
|
+
if paths.empty?
|
|
208
|
+
" (no paths)"
|
|
209
|
+
elsif paths.size > 5
|
|
210
|
+
" (#{paths.size} files total)"
|
|
211
|
+
else
|
|
212
|
+
""
|
|
213
|
+
end
|
|
214
|
+
_t, loc = run_shards_read_worker_ping_payload(h[:ping_path])
|
|
215
|
+
ping_suffix = (loc && !loc.to_s.strip.empty?) ? "; last ping #{loc.to_s.strip}" : ""
|
|
216
|
+
Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER IDLE TIMEOUT after #{idle_sec}s since last per-example progress ping — shard #{h[:shard]} pid #{h[:pid]}#{ping_suffix}."
|
|
217
|
+
Polyrun::Log.warn "polyrun run-shards: idle shard file sample: #{sample}#{suffix}"
|
|
218
|
+
Polyrun::Log.warn "polyrun run-shards: use Polyrun::RSpec.install_worker_ping! / Polyrun::Minitest.install_worker_ping! (Polyrun Quick calls ping! each example); exit #{WORKER_IDLE_TIMEOUT_EXIT_STATUS}."
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def run_shards_wait_or_force_stop_status(pid)
|
|
222
|
+
wpid = Process.wait(pid, Process::WNOHANG)
|
|
223
|
+
return $? if wpid == pid
|
|
224
|
+
|
|
225
|
+
run_shards_force_stop_pid_status(pid)
|
|
226
|
+
rescue Errno::ECHILD
|
|
227
|
+
nil
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
def run_shards_force_stop_pid_status(pid)
|
|
231
|
+
Process.kill(:KILL, pid)
|
|
232
|
+
st = nil
|
|
233
|
+
begin
|
|
234
|
+
Process.wait(pid)
|
|
235
|
+
st = $?
|
|
236
|
+
rescue Errno::ECHILD
|
|
237
|
+
# child already reaped
|
|
238
|
+
end
|
|
239
|
+
st
|
|
240
|
+
rescue Errno::ESRCH
|
|
241
|
+
begin
|
|
242
|
+
Process.wait(pid)
|
|
243
|
+
$?
|
|
244
|
+
rescue Errno::ECHILD
|
|
245
|
+
nil
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
def run_shards_warn_worker_timeout!(h, ctx, timeout_sec)
|
|
250
|
+
paths = ctx[:plan].shard(h[:shard])
|
|
251
|
+
sample = paths.first(5).join(", ")
|
|
252
|
+
suffix =
|
|
253
|
+
if paths.empty?
|
|
254
|
+
" (no paths)"
|
|
255
|
+
elsif paths.size > 5
|
|
256
|
+
" (#{paths.size} files total)"
|
|
257
|
+
else
|
|
258
|
+
""
|
|
259
|
+
end
|
|
260
|
+
Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER TIMEOUT after #{timeout_sec}s (wall time since worker spawn) — shard #{h[:shard]} pid #{h[:pid]}."
|
|
261
|
+
Polyrun::Log.warn "polyrun run-shards: timeout shard includes: #{sample}#{suffix}"
|
|
262
|
+
Polyrun::Log.warn "polyrun run-shards: override with --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC; recorded exit #{WORKER_TIMEOUT_EXIT_STATUS} for this worker."
|
|
263
|
+
end
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
267
|
+
# rubocop:enable Polyrun/FileLength, Metrics/ModuleLength
|
|
@@ -19,6 +19,9 @@ module Polyrun
|
|
|
19
19
|
err = run_shards_validate_cmd(cmd)
|
|
20
20
|
return [:fail, err] if err
|
|
21
21
|
|
|
22
|
+
run_shards_normalize_worker_timeout_option!(o)
|
|
23
|
+
run_shards_normalize_worker_idle_timeout_option!(o)
|
|
24
|
+
|
|
22
25
|
cmd = Shellwords.split(cmd.first) if cmd.size == 1 && cmd.first.include?(" ")
|
|
23
26
|
|
|
24
27
|
[:ok, o, cmd]
|
|
@@ -118,9 +121,39 @@ module Polyrun
|
|
|
118
121
|
merge_failures: run_shards_merge_failures_flag(o, cfg),
|
|
119
122
|
merge_failures_output: run_shards_merge_failures_output_opt(o, cfg),
|
|
120
123
|
merge_failures_format: run_shards_merge_failures_format_opt(o, cfg),
|
|
121
|
-
config_path: config_path
|
|
124
|
+
config_path: config_path,
|
|
125
|
+
worker_timeout_sec: run_shards_resolved_worker_timeout_sec(o),
|
|
126
|
+
worker_idle_timeout_sec: run_shards_resolved_worker_idle_timeout_sec(o)
|
|
122
127
|
}
|
|
123
128
|
end
|
|
129
|
+
|
|
130
|
+
def run_shards_normalize_worker_idle_timeout_option!(o)
|
|
131
|
+
v = o[:worker_idle_timeout_sec]
|
|
132
|
+
return if v.nil?
|
|
133
|
+
|
|
134
|
+
o[:worker_idle_timeout_sec] = nil if v <= 0
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def run_shards_resolved_worker_idle_timeout_sec(o)
|
|
138
|
+
cli = o[:worker_idle_timeout_sec]
|
|
139
|
+
return cli.to_f if cli.is_a?(Numeric) && cli > 0
|
|
140
|
+
|
|
141
|
+
env_worker_idle_timeout_sec
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def run_shards_normalize_worker_timeout_option!(o)
|
|
145
|
+
v = o[:worker_timeout_sec]
|
|
146
|
+
return if v.nil?
|
|
147
|
+
|
|
148
|
+
o[:worker_timeout_sec] = nil if v <= 0
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def run_shards_resolved_worker_timeout_sec(o)
|
|
152
|
+
cli = o[:worker_timeout_sec]
|
|
153
|
+
return cli.to_f if cli.is_a?(Numeric) && cli > 0
|
|
154
|
+
|
|
155
|
+
env_worker_timeout_sec
|
|
156
|
+
end
|
|
124
157
|
end
|
|
125
158
|
end
|
|
126
159
|
end
|
|
@@ -27,7 +27,9 @@ module Polyrun
|
|
|
27
27
|
merge_format: nil,
|
|
28
28
|
merge_failures: false,
|
|
29
29
|
merge_failures_output: nil,
|
|
30
|
-
merge_failures_format: nil
|
|
30
|
+
merge_failures_format: nil,
|
|
31
|
+
worker_timeout_sec: nil,
|
|
32
|
+
worker_idle_timeout_sec: nil
|
|
31
33
|
}
|
|
32
34
|
end
|
|
33
35
|
|
|
@@ -39,8 +41,10 @@ module Polyrun
|
|
|
39
41
|
|
|
40
42
|
# rubocop:disable Metrics/AbcSize -- one argv block for run-shards
|
|
41
43
|
def run_shards_plan_options_register!(opts, st)
|
|
42
|
-
opts.banner = "usage: polyrun run-shards [--workers N] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
|
|
44
|
+
opts.banner = "usage: polyrun run-shards [--workers N] [--worker-timeout SEC] [--worker-idle-timeout SEC] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
|
|
43
45
|
opts.on("--workers N", Integer) { |v| st[:workers] = v }
|
|
46
|
+
opts.on("--worker-timeout SEC", Float, "Max seconds per worker since spawn (also POLYRUN_WORKER_TIMEOUT_SEC); kills stuck workers (exit 124)") { |v| st[:worker_timeout_sec] = v }
|
|
47
|
+
opts.on("--worker-idle-timeout SEC", Float, "Max seconds since last valid WorkerPing timestamp in POLYRUN_WORKER_PING_FILE (needs prior ping); RSpec/Minitest: install_worker_ping!; Quick: automatic; exit 125") { |v| st[:worker_idle_timeout_sec] = v }
|
|
44
48
|
opts.on("--strategy NAME", String) { |v| st[:strategy] = v }
|
|
45
49
|
opts.on("--seed VAL") { |v| st[:seed] = v }
|
|
46
50
|
opts.on("--paths-file PATH", String) { |v| st[:paths_file] = v }
|
|
@@ -2,6 +2,7 @@ require "shellwords"
|
|
|
2
2
|
require "rbconfig"
|
|
3
3
|
|
|
4
4
|
require_relative "run_shards_planning"
|
|
5
|
+
require_relative "run_shards_worker_interrupt"
|
|
5
6
|
require_relative "run_shards_parallel_children"
|
|
6
7
|
|
|
7
8
|
module Polyrun
|
|
@@ -9,6 +10,7 @@ module Polyrun
|
|
|
9
10
|
# Partition + spawn workers for `polyrun run-shards` (keeps {RunShardsCommand} file small).
|
|
10
11
|
module RunShardsRun
|
|
11
12
|
include RunShardsPlanning
|
|
13
|
+
include RunShardsWorkerInterrupt
|
|
12
14
|
include RunShardsParallelChildren
|
|
13
15
|
|
|
14
16
|
private
|
|
@@ -93,7 +95,11 @@ module Polyrun
|
|
|
93
95
|
"POLYRUN_SUITE_EXIT_STATUS" => exit_code.to_s,
|
|
94
96
|
"POLYRUN_MERGED_FAILURES_PATH" => merged_failures_path.to_s
|
|
95
97
|
)
|
|
96
|
-
|
|
98
|
+
begin
|
|
99
|
+
hook_cfg.run_phase_if_enabled(:after_suite, env_after)
|
|
100
|
+
rescue Interrupt
|
|
101
|
+
Polyrun::Log.warn "polyrun run-shards: after_suite hook interrupted; workers are stopped or were not started"
|
|
102
|
+
end
|
|
97
103
|
end
|
|
98
104
|
end
|
|
99
105
|
end
|
|
@@ -106,38 +112,6 @@ module Polyrun
|
|
|
106
112
|
Polyrun::Log.warn "polyrun run-shards: each worker prints its own summary line; the last \"N examples\" line is not a total across shards."
|
|
107
113
|
end
|
|
108
114
|
|
|
109
|
-
# Best-effort worker teardown then exit. Does not return.
|
|
110
|
-
def run_shards_shutdown_on_signal!(pids, code)
|
|
111
|
-
run_shards_terminate_children!(pids)
|
|
112
|
-
exit(code)
|
|
113
|
-
rescue Interrupt
|
|
114
|
-
pids.each do |h|
|
|
115
|
-
Process.kill(:KILL, h[:pid])
|
|
116
|
-
rescue Errno::ESRCH
|
|
117
|
-
# already reaped
|
|
118
|
-
end
|
|
119
|
-
pids.each do |h|
|
|
120
|
-
Process.wait(h[:pid])
|
|
121
|
-
rescue Errno::ESRCH, Errno::ECHILD, Interrupt
|
|
122
|
-
# already reaped or give up
|
|
123
|
-
end
|
|
124
|
-
exit(code)
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
# Send SIGTERM to each worker PID and wait so Ctrl+C / SIGTERM does not leave orphans.
|
|
128
|
-
def run_shards_terminate_children!(pids)
|
|
129
|
-
pids.each do |h|
|
|
130
|
-
Process.kill(:TERM, h[:pid])
|
|
131
|
-
rescue Errno::ESRCH
|
|
132
|
-
# already reaped
|
|
133
|
-
end
|
|
134
|
-
pids.each do |h|
|
|
135
|
-
Process.wait(h[:pid])
|
|
136
|
-
rescue Errno::ESRCH, Errno::ECHILD
|
|
137
|
-
# already reaped
|
|
138
|
-
end
|
|
139
|
-
end
|
|
140
|
-
|
|
141
115
|
def run_shards_merge_or_hint_coverage(ctx)
|
|
142
116
|
if ctx[:merge_coverage]
|
|
143
117
|
mo = ctx[:merge_output] || "coverage/merged.json"
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module Polyrun
|
|
2
|
+
class CLI
|
|
3
|
+
# SIGINT/SIGTERM handling and non-blocking reap for parallel worker PIDs (used by run-shards / ci-shard fan-out).
|
|
4
|
+
module RunShardsWorkerInterrupt
|
|
5
|
+
private
|
|
6
|
+
|
|
7
|
+
def run_shards_log_interrupt_workers(pids, _ctx)
|
|
8
|
+
parts = pids.map { |h| "shard=#{h[:shard]} pid=#{h[:pid]}" }
|
|
9
|
+
Polyrun::Log.orchestration_warn "polyrun run-shards: SIGINT/SIGTERM while waiting on workers — stopping: #{parts.join(", ")}"
|
|
10
|
+
Polyrun::Log.warn "polyrun run-shards: search this log for each shard's started … pid= line and RSpec output; repeat SIGINT during cleanup escalates to SIGKILL"
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
# Best-effort worker teardown then exit. Does not return.
|
|
14
|
+
def run_shards_shutdown_on_signal!(pids, code)
|
|
15
|
+
run_shards_log_interrupt_workers(pids, nil)
|
|
16
|
+
run_shards_terminate_children!(pids)
|
|
17
|
+
exit(code)
|
|
18
|
+
rescue Interrupt
|
|
19
|
+
run_shards_signal_workers_kill(pids)
|
|
20
|
+
run_shards_reap_worker_pids_interruptible(pids.map { |h| h[:pid] })
|
|
21
|
+
exit(code)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# Send SIGTERM to each worker PID and wait so Ctrl+C / SIGTERM does not leave orphans.
|
|
25
|
+
def run_shards_terminate_children!(pids)
|
|
26
|
+
run_shards_signal_workers_term(pids)
|
|
27
|
+
run_shards_reap_worker_pids_interruptible(pids.map { |h| h[:pid] })
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def run_shards_signal_workers_term(pids)
|
|
31
|
+
pids.each do |h|
|
|
32
|
+
Process.kill(:TERM, h[:pid])
|
|
33
|
+
rescue Errno::ESRCH
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def run_shards_signal_workers_kill(pids)
|
|
38
|
+
pids.each do |h|
|
|
39
|
+
Process.kill(:KILL, h[:pid])
|
|
40
|
+
rescue Errno::ESRCH
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# Reap child PIDs without blocking uninterruptibly on one stuck zombie (avoids noisy stacks on repeat Ctrl+C).
|
|
45
|
+
def run_shards_reap_worker_pids_interruptible(pids)
|
|
46
|
+
pending = pids.compact.uniq
|
|
47
|
+
force_note = false
|
|
48
|
+
until pending.empty?
|
|
49
|
+
pending.reject! do |pid|
|
|
50
|
+
w = Process.wait(pid, Process::WNOHANG)
|
|
51
|
+
next true if w == pid
|
|
52
|
+
|
|
53
|
+
false
|
|
54
|
+
rescue Errno::ECHILD
|
|
55
|
+
true
|
|
56
|
+
end
|
|
57
|
+
break if pending.empty?
|
|
58
|
+
|
|
59
|
+
begin
|
|
60
|
+
sleep(0.05)
|
|
61
|
+
rescue Interrupt
|
|
62
|
+
unless force_note
|
|
63
|
+
force_note = true
|
|
64
|
+
Polyrun::Log.orchestration_warn "polyrun run-shards: repeated SIGINT during worker cleanup — SIGKILL to #{pending.size} process(es)"
|
|
65
|
+
end
|
|
66
|
+
pending.each do |pid|
|
|
67
|
+
Process.kill(:KILL, pid)
|
|
68
|
+
rescue Errno::ESRCH
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# rubocop:disable Polyrun/FileLength -- HTML merge formatter + helpers in one file
|
|
1
2
|
require "cgi"
|
|
2
3
|
require "digest/sha1"
|
|
3
4
|
require "erb"
|
|
@@ -9,6 +10,7 @@ module Polyrun
|
|
|
9
10
|
module_function
|
|
10
11
|
|
|
11
12
|
# Standalone HTML report with summary, file table, and per-file source details.
|
|
13
|
+
# rubocop:disable Metrics/AbcSize -- linear assembly of overview, file table, sections, asset reads
|
|
12
14
|
def emit_html(coverage_blob, title: "Polyrun coverage", root: nil, groups: nil, generated_at: Time.now)
|
|
13
15
|
files = coverage_blob.keys.sort.map { |path| html_file_payload(path, coverage_blob[path], root) }
|
|
14
16
|
summary = html_summary(files)
|
|
@@ -31,6 +33,7 @@ module Polyrun
|
|
|
31
33
|
javascript: File.read(html_javascript_path)
|
|
32
34
|
)
|
|
33
35
|
end
|
|
36
|
+
# rubocop:enable Metrics/AbcSize
|
|
34
37
|
|
|
35
38
|
def html_asset_dir
|
|
36
39
|
File.join(__dir__, "html")
|
|
@@ -197,3 +200,4 @@ module Polyrun
|
|
|
197
200
|
end
|
|
198
201
|
end
|
|
199
202
|
end
|
|
203
|
+
# rubocop:enable Polyrun/FileLength
|
data/lib/polyrun/hooks.rb
CHANGED
|
@@ -118,6 +118,9 @@ module Polyrun
|
|
|
118
118
|
if reg&.any?(phase)
|
|
119
119
|
begin
|
|
120
120
|
reg.run(phase, merged)
|
|
121
|
+
rescue Interrupt
|
|
122
|
+
Polyrun::Log.warn "polyrun hooks: #{phase} ruby hook interrupted"
|
|
123
|
+
return 130
|
|
121
124
|
rescue => e
|
|
122
125
|
Polyrun::Log.warn "polyrun hooks: #{phase} ruby hook failed: #{e.class}: #{e.message}"
|
|
123
126
|
return 1
|
|
@@ -125,7 +128,12 @@ module Polyrun
|
|
|
125
128
|
end
|
|
126
129
|
|
|
127
130
|
commands_for(phase).each do |cmd|
|
|
128
|
-
ok =
|
|
131
|
+
ok = begin
|
|
132
|
+
system(merged, "sh", "-c", cmd)
|
|
133
|
+
rescue Interrupt
|
|
134
|
+
Polyrun::Log.warn "polyrun hooks: #{phase} shell hook interrupted"
|
|
135
|
+
return 130
|
|
136
|
+
end
|
|
129
137
|
return $?.exitstatus unless ok
|
|
130
138
|
end
|
|
131
139
|
0
|
data/lib/polyrun/log.rb
CHANGED
|
@@ -6,6 +6,9 @@ module Polyrun
|
|
|
6
6
|
#
|
|
7
7
|
# Polyrun::Log.stderr = Logger.new($stderr)
|
|
8
8
|
# Polyrun::Log.stdout = StringIO.new
|
|
9
|
+
#
|
|
10
|
+
# Orchestration (+orchestration_warn+): worker timeout and SIGINT lines use the same sink as +warn+ unless
|
|
11
|
+
# +POLYRUN_ORCHESTRATION_STDERR=1+ and stderr is not process +$stderr+ (then the summary is copied to +$stderr+).
|
|
9
12
|
module Log
|
|
10
13
|
class << self
|
|
11
14
|
attr_writer :stderr
|
|
@@ -25,6 +28,19 @@ module Polyrun
|
|
|
25
28
|
emit_line(stderr, msg)
|
|
26
29
|
end
|
|
27
30
|
|
|
31
|
+
# Like {#warn}, and when +POLYRUN_ORCHESTRATION_STDERR=1+ and {#stderr} is not the process +$stderr+,
|
|
32
|
+
# also writes one line to +$stderr+ so timeout/interrupt attribution survives custom/null Log sinks.
|
|
33
|
+
def orchestration_warn(msg)
|
|
34
|
+
warn(msg)
|
|
35
|
+
return unless %w[1 true yes].include?(ENV["POLYRUN_ORCHESTRATION_STDERR"]&.downcase)
|
|
36
|
+
return if stderr.equal?($stderr)
|
|
37
|
+
|
|
38
|
+
# Intentionally the real stderr stream (+Kernel#warn+ routes through +Log.stderr+).
|
|
39
|
+
# rubocop:disable Style/StderrPuts
|
|
40
|
+
$stderr.puts(msg.to_s.chomp)
|
|
41
|
+
# rubocop:enable Style/StderrPuts
|
|
42
|
+
end
|
|
43
|
+
|
|
28
44
|
def puts(msg = "")
|
|
29
45
|
if msg.nil?
|
|
30
46
|
stdout.write("\n")
|
data/lib/polyrun/minitest.rb
CHANGED
|
@@ -7,11 +7,45 @@ module Polyrun
|
|
|
7
7
|
# after Rails / DB configuration (same timing as a direct call to
|
|
8
8
|
# {Data::ParallelProvisioning.run_suite_hooks!}).
|
|
9
9
|
module Minitest
|
|
10
|
+
module WorkerPingTestHook
|
|
11
|
+
def setup
|
|
12
|
+
Polyrun::WorkerPing.ping!(location: polyrun_minitest_location)
|
|
13
|
+
super
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def teardown
|
|
17
|
+
super
|
|
18
|
+
Polyrun::WorkerPing.ping!(location: polyrun_minitest_location)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
private
|
|
22
|
+
|
|
23
|
+
def polyrun_minitest_location
|
|
24
|
+
file, line = method(name).source_location
|
|
25
|
+
(file && line) ? "#{file}:#{line}" : nil
|
|
26
|
+
rescue NameError
|
|
27
|
+
nil
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
10
31
|
module_function
|
|
11
32
|
|
|
12
33
|
# Runs {Data::ParallelProvisioning.run_suite_hooks!} (serial vs shard worker hooks).
|
|
13
34
|
def install_parallel_provisioning!
|
|
14
35
|
Polyrun::Data::ParallelProvisioning.run_suite_hooks!
|
|
15
36
|
end
|
|
37
|
+
|
|
38
|
+
# Same ping semantics as {RSpec.install_worker_ping!}: +ping!+ at test +setup+ and +teardown+.
|
|
39
|
+
# Requires +minitest+ to be loaded first (+Minitest::Test+ defined).
|
|
40
|
+
def install_worker_ping!
|
|
41
|
+
require_relative "worker_ping"
|
|
42
|
+
unless defined?(::Minitest::Test)
|
|
43
|
+
Polyrun::Log.warn "polyrun minitest: install_worker_ping! skipped (load minitest/autorun or minitest/test first)"
|
|
44
|
+
return
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
::Minitest::Test.send(:prepend, WorkerPingTestHook)
|
|
48
|
+
Polyrun::WorkerPing.ensure_interval_ping_thread!
|
|
49
|
+
end
|
|
16
50
|
end
|
|
17
51
|
end
|
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
require_relative "../worker_ping"
|
|
1
2
|
require_relative "assertions"
|
|
2
3
|
require_relative "errors"
|
|
3
4
|
require_relative "matchers"
|
|
4
5
|
|
|
6
|
+
Polyrun::WorkerPing.ensure_interval_ping_thread!
|
|
7
|
+
|
|
5
8
|
module Polyrun
|
|
6
9
|
module Quick
|
|
7
10
|
# Per-example execution: merged lets, hooks, assertions, optional Capybara::DSL.
|
|
@@ -20,6 +23,8 @@ module Polyrun
|
|
|
20
23
|
define_let_methods!
|
|
21
24
|
run_let_bangs_from_chain
|
|
22
25
|
extend_capybara_if_enabled!
|
|
26
|
+
qloc = quick_example_location(block)
|
|
27
|
+
Polyrun::WorkerPing.ping!(location: qloc)
|
|
23
28
|
begin
|
|
24
29
|
run_before_hooks_from_chain(ancestor_chain)
|
|
25
30
|
instance_eval(&block)
|
|
@@ -32,11 +37,17 @@ module Polyrun
|
|
|
32
37
|
run_after_hooks_from_chain(ancestor_chain)
|
|
33
38
|
reset_capybara_if_enabled!
|
|
34
39
|
@_let_cache = {}
|
|
40
|
+
Polyrun::WorkerPing.ping!(location: qloc)
|
|
35
41
|
end
|
|
36
42
|
end
|
|
37
43
|
|
|
38
44
|
private
|
|
39
45
|
|
|
46
|
+
def quick_example_location(block)
|
|
47
|
+
loc = block&.source_location
|
|
48
|
+
loc ? "#{loc[0]}:#{loc[1]}" : nil
|
|
49
|
+
end
|
|
50
|
+
|
|
40
51
|
def merge_lets_from_chain(ancestor_chain)
|
|
41
52
|
@merged_lets = {}
|
|
42
53
|
ancestor_chain.each do |g|
|
data/lib/polyrun/rspec.rb
CHANGED
|
@@ -44,5 +44,23 @@ module Polyrun
|
|
|
44
44
|
config.add_formatter Polyrun::Reporting::RspecFailureFragmentFormatter
|
|
45
45
|
end
|
|
46
46
|
end
|
|
47
|
+
|
|
48
|
+
# Writes {WorkerPing} after suite start, before/after each example (+location+ is file:line from metadata).
|
|
49
|
+
# Keeps +--worker-idle-timeout+ sensitive to example progress (not only a background thread).
|
|
50
|
+
def install_worker_ping!
|
|
51
|
+
require "rspec/core"
|
|
52
|
+
require_relative "worker_ping"
|
|
53
|
+
::RSpec.configure do |config|
|
|
54
|
+
config.before(:suite) { Polyrun::WorkerPing.ping! }
|
|
55
|
+
config.before(:each) do |example|
|
|
56
|
+
Polyrun::WorkerPing.ping!(location: example.metadata[:location] || example.location)
|
|
57
|
+
end
|
|
58
|
+
config.after(:each) do |example|
|
|
59
|
+
Polyrun::WorkerPing.ping!(location: example.metadata[:location] || example.location)
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
Polyrun::WorkerPing.ensure_interval_ping_thread!
|
|
64
|
+
end
|
|
47
65
|
end
|
|
48
66
|
end
|
data/lib/polyrun/version.rb
CHANGED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module Polyrun
|
|
2
|
+
# Writes a monotonic timestamp to +POLYRUN_WORKER_PING_FILE+ when the test process advances
|
|
3
|
+
# (typically once per example). When +location:+ is passed (path:line of the example), the file
|
|
4
|
+
# is two lines: timestamp, then that string. Parents use +--worker-idle-timeout+ to detect a worker with no
|
|
5
|
+
# progress *inside* a single example—unlike a background thread, +ping!+ does not run while Ruby
|
|
6
|
+
# is busy on the main thread, so a tight CPU loop or stuck native code leaves the timestamp stale.
|
|
7
|
+
#
|
|
8
|
+
# Prefer framework installs (call from helpers *after* loading the runner):
|
|
9
|
+
#
|
|
10
|
+
# require "polyrun/rspec"
|
|
11
|
+
# Polyrun::RSpec.install_worker_ping!
|
|
12
|
+
#
|
|
13
|
+
# require "polyrun/minitest"
|
|
14
|
+
# Polyrun::Minitest.install_worker_ping!
|
|
15
|
+
#
|
|
16
|
+
# Polyrun Quick runs +ping!+ automatically when requiring the Quick stack.
|
|
17
|
+
#
|
|
18
|
+
# Optional interval thread (+POLYRUN_WORKER_PING_THREAD=1+, +POLYRUN_WORKER_PING_INTERVAL_SEC+): call
|
|
19
|
+
# {#ensure_interval_ping_thread!} once at worker startup if you rely on periodic pings without per-example {#ping!};
|
|
20
|
+
# installers call this so the env toggle works out of the box.
|
|
21
|
+
module WorkerPing
|
|
22
|
+
class << self
|
|
23
|
+
def ping!(location: nil)
|
|
24
|
+
path = ping_file_path
|
|
25
|
+
return if path.empty?
|
|
26
|
+
|
|
27
|
+
t = Process.clock_gettime(Process::CLOCK_MONOTONIC).to_s
|
|
28
|
+
loc = location.to_s.strip
|
|
29
|
+
payload = loc.empty? ? t : "#{t}\n#{loc}"
|
|
30
|
+
File.binwrite(path, payload)
|
|
31
|
+
rescue SystemCallError
|
|
32
|
+
# best-effort
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def ping_file_path
|
|
36
|
+
ENV["POLYRUN_WORKER_PING_FILE"].to_s.strip
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Starts a periodic +ping!+ thread when +POLYRUN_WORKER_PING_THREAD+ is truthy and +POLYRUN_WORKER_PING_FILE+ is set.
|
|
40
|
+
# Prefer per-example {#ping!}; safe to call more than once (idempotent).
|
|
41
|
+
# rubocop:disable ThreadSafety/ClassInstanceVariable -- idempotent once-per-process latch
|
|
42
|
+
def ensure_interval_ping_thread!
|
|
43
|
+
thread_flag = ENV["POLYRUN_WORKER_PING_THREAD"]
|
|
44
|
+
return unless %w[1 true yes].include?(thread_flag&.downcase)
|
|
45
|
+
|
|
46
|
+
path = ping_file_path
|
|
47
|
+
return if path.empty?
|
|
48
|
+
|
|
49
|
+
@interval_ping_mx ||= Mutex.new
|
|
50
|
+
@interval_ping_mx.synchronize do
|
|
51
|
+
return if @interval_ping_started
|
|
52
|
+
|
|
53
|
+
raw = ENV["POLYRUN_WORKER_PING_INTERVAL_SEC"].to_s.strip
|
|
54
|
+
interval = Float(raw.empty? ? "15" : raw, exception: false) || 15.0
|
|
55
|
+
interval = 1.0 if interval < 1.0
|
|
56
|
+
|
|
57
|
+
ping!
|
|
58
|
+
# rubocop:disable ThreadSafety/NewThread -- optional periodic ping alongside per-example ping!
|
|
59
|
+
Thread.new do
|
|
60
|
+
loop do
|
|
61
|
+
sleep(interval)
|
|
62
|
+
ping!
|
|
63
|
+
rescue SystemCallError, Interrupt
|
|
64
|
+
break
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
# rubocop:enable ThreadSafety/NewThread
|
|
68
|
+
@interval_ping_started = true
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
# rubocop:enable ThreadSafety/ClassInstanceVariable
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
data/sig/polyrun/minitest.rbs
CHANGED
data/sig/polyrun/rspec.rbs
CHANGED
|
@@ -3,5 +3,9 @@ module Polyrun
|
|
|
3
3
|
def self.install_parallel_provisioning!: (untyped rspec_config) -> void
|
|
4
4
|
|
|
5
5
|
def self.install_example_timing!: (?output_path: String? ) -> void
|
|
6
|
+
|
|
7
|
+
def self.install_failure_fragments!: (?only_if: untyped?) -> void
|
|
8
|
+
|
|
9
|
+
def self.install_worker_ping!: () -> void
|
|
6
10
|
end
|
|
7
11
|
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
module Polyrun
|
|
2
|
+
module WorkerPing
|
|
3
|
+
def self.ping!: (?location: String?) -> void
|
|
4
|
+
|
|
5
|
+
def self.ping_file_path: () -> String
|
|
6
|
+
|
|
7
|
+
# Idempotent — installers call once; POLYRUN_WORKER_PING_THREAD gates the periodic thread.
|
|
8
|
+
def self.ensure_interval_ping_thread!: () -> void
|
|
9
|
+
end
|
|
10
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: polyrun
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.2
|
|
4
|
+
version: 1.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrei Makarov
|
|
@@ -188,10 +188,12 @@ files:
|
|
|
188
188
|
- lib/polyrun/cli/report_commands.rb
|
|
189
189
|
- lib/polyrun/cli/run_shards_command.rb
|
|
190
190
|
- lib/polyrun/cli/run_shards_parallel_children.rb
|
|
191
|
+
- lib/polyrun/cli/run_shards_parallel_wait.rb
|
|
191
192
|
- lib/polyrun/cli/run_shards_plan_boot_phases.rb
|
|
192
193
|
- lib/polyrun/cli/run_shards_plan_options.rb
|
|
193
194
|
- lib/polyrun/cli/run_shards_planning.rb
|
|
194
195
|
- lib/polyrun/cli/run_shards_run.rb
|
|
196
|
+
- lib/polyrun/cli/run_shards_worker_interrupt.rb
|
|
195
197
|
- lib/polyrun/cli/start_bootstrap.rb
|
|
196
198
|
- lib/polyrun/cli/timing_command.rb
|
|
197
199
|
- lib/polyrun/config.rb
|
|
@@ -280,6 +282,7 @@ files:
|
|
|
280
282
|
- lib/polyrun/timing/rspec_example_formatter.rb
|
|
281
283
|
- lib/polyrun/timing/summary.rb
|
|
282
284
|
- lib/polyrun/version.rb
|
|
285
|
+
- lib/polyrun/worker_ping.rb
|
|
283
286
|
- polyrun.gemspec
|
|
284
287
|
- sig/polyrun.rbs
|
|
285
288
|
- sig/polyrun/cli.rbs
|
|
@@ -289,6 +292,7 @@ files:
|
|
|
289
292
|
- sig/polyrun/minitest.rbs
|
|
290
293
|
- sig/polyrun/quick.rbs
|
|
291
294
|
- sig/polyrun/rspec.rbs
|
|
295
|
+
- sig/polyrun/worker_ping.rbs
|
|
292
296
|
homepage: https://github.com/amkisko/polyrun.rb
|
|
293
297
|
licenses:
|
|
294
298
|
- MIT
|