polyrun 1.4.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +38 -0
  3. data/README.md +2 -2
  4. data/docs/SETUP_PROFILE.md +2 -0
  5. data/lib/polyrun/cli/ci_shard_hooks.rb +12 -4
  6. data/lib/polyrun/cli/ci_shard_run_command.rb +3 -1
  7. data/lib/polyrun/cli/help.rb +10 -2
  8. data/lib/polyrun/cli/helpers.rb +38 -0
  9. data/lib/polyrun/cli/init_command.rb +8 -1
  10. data/lib/polyrun/cli/partition_diagnostics.rb +22 -0
  11. data/lib/polyrun/cli/plan_command.rb +47 -18
  12. data/lib/polyrun/cli/queue_command.rb +25 -2
  13. data/lib/polyrun/cli/run_queue_command.rb +145 -0
  14. data/lib/polyrun/cli/run_shards_command.rb +6 -1
  15. data/lib/polyrun/cli/run_shards_parallel_children.rb +28 -35
  16. data/lib/polyrun/cli/run_shards_parallel_wait.rb +267 -0
  17. data/lib/polyrun/cli/run_shards_plan_boot_phases.rb +81 -3
  18. data/lib/polyrun/cli/run_shards_plan_options.rb +17 -3
  19. data/lib/polyrun/cli/run_shards_planning.rb +20 -12
  20. data/lib/polyrun/cli/run_shards_run.rb +28 -37
  21. data/lib/polyrun/cli/run_shards_worker_interrupt.rb +75 -0
  22. data/lib/polyrun/cli/spec_quality_commands.rb +140 -0
  23. data/lib/polyrun/cli.rb +16 -2
  24. data/lib/polyrun/coverage/example_diff.rb +122 -0
  25. data/lib/polyrun/coverage/merge/formatters_html.rb +4 -0
  26. data/lib/polyrun/data/factory_counts.rb +14 -1
  27. data/lib/polyrun/database/clone_shards.rb +2 -0
  28. data/lib/polyrun/database/shard.rb +2 -1
  29. data/lib/polyrun/hooks.rb +9 -1
  30. data/lib/polyrun/log.rb +16 -0
  31. data/lib/polyrun/minitest.rb +43 -0
  32. data/lib/polyrun/partition/hrw.rb +40 -3
  33. data/lib/polyrun/partition/paths_build.rb +8 -3
  34. data/lib/polyrun/partition/plan.rb +88 -19
  35. data/lib/polyrun/partition/plan_lpt.rb +49 -7
  36. data/lib/polyrun/partition/plan_sharding.rb +8 -0
  37. data/lib/polyrun/partition/reports.rb +139 -0
  38. data/lib/polyrun/partition/timing_diagnostics.rb +139 -0
  39. data/lib/polyrun/partition/timing_keys.rb +2 -1
  40. data/lib/polyrun/queue/duration.rb +30 -0
  41. data/lib/polyrun/queue/file_store.rb +107 -3
  42. data/lib/polyrun/quick/example_runner.rb +13 -0
  43. data/lib/polyrun/quick/runner.rb +21 -0
  44. data/lib/polyrun/rspec.rb +26 -0
  45. data/lib/polyrun/spec_quality/config.rb +134 -0
  46. data/lib/polyrun/spec_quality/fragment.rb +39 -0
  47. data/lib/polyrun/spec_quality/merge.rb +78 -0
  48. data/lib/polyrun/spec_quality/minitest_hook.rb +42 -0
  49. data/lib/polyrun/spec_quality/plan_loader.rb +47 -0
  50. data/lib/polyrun/spec_quality/profile.rb +91 -0
  51. data/lib/polyrun/spec_quality/report.rb +261 -0
  52. data/lib/polyrun/spec_quality/rspec_hook.rb +55 -0
  53. data/lib/polyrun/spec_quality/sql_counter.rb +34 -0
  54. data/lib/polyrun/spec_quality.rb +205 -0
  55. data/lib/polyrun/templates/POLYRUN.md +6 -0
  56. data/lib/polyrun/templates/ci_matrix.polyrun.yml +4 -0
  57. data/lib/polyrun/templates/polyrun_hooks_spec_quality.rb +12 -0
  58. data/lib/polyrun/templates/polyrun_spec_quality.yml +20 -0
  59. data/lib/polyrun/templates/rails_prepare.polyrun.yml +5 -0
  60. data/lib/polyrun/timing/merge.rb +5 -5
  61. data/lib/polyrun/timing/stats.rb +76 -0
  62. data/lib/polyrun/timing/summary.rb +5 -2
  63. data/lib/polyrun/timing/variance_report.rb +51 -0
  64. data/lib/polyrun/version.rb +1 -1
  65. data/lib/polyrun/worker_ping.rb +74 -0
  66. data/sig/polyrun/minitest.rbs +2 -0
  67. data/sig/polyrun/rspec.rbs +4 -0
  68. data/sig/polyrun/worker_ping.rbs +10 -0
  69. metadata +26 -1
@@ -3,6 +3,7 @@ require "rbconfig"
3
3
 
4
4
  require_relative "start_bootstrap"
5
5
  require_relative "failure_commands"
6
+ require_relative "spec_quality_commands"
6
7
  require_relative "run_shards_run"
7
8
 
8
9
  module Polyrun
@@ -10,6 +11,7 @@ module Polyrun
10
11
  module RunShardsCommand
11
12
  include StartBootstrap
12
13
  include FailureCommands
14
+ include SpecQualityCommands
13
15
  include RunShardsRun
14
16
 
15
17
  private
@@ -116,11 +118,13 @@ module Polyrun
116
118
  # ENV for a worker process: POLYRUN_SHARD_* plus per-shard database URLs from polyrun.yml or DATABASE_URL.
117
119
  # When +matrix_total+ > 1 with multiple local workers, sets +POLYRUN_SHARD_MATRIX_INDEX+ / +POLYRUN_SHARD_MATRIX_TOTAL+
118
120
  # so {Coverage::Collector} can name fragments uniquely across CI matrix jobs (NxM sharding).
119
- def shard_child_env(cfg:, workers:, shard:, matrix_index: nil, matrix_total: nil, failure_fragments: false)
121
+ # rubocop:disable Metrics/AbcSize -- shard ENV: matrix, DB URLs, fragment flags
122
+ def shard_child_env(cfg:, workers:, shard:, matrix_index: nil, matrix_total: nil, failure_fragments: false, spec_quality_fragments: false)
120
123
  child_env = ENV.to_h.merge(
121
124
  Polyrun::Database::Shard.env_map(shard_index: shard, shard_total: workers)
122
125
  )
123
126
  child_env["POLYRUN_FAILURE_FRAGMENTS"] = "1" if failure_fragments
127
+ child_env["POLYRUN_SPEC_QUALITY_FRAGMENTS"] = "1" if spec_quality_fragments
124
128
  mt = matrix_total.nil? ? 0 : Integer(matrix_total)
125
129
  if mt > 1
126
130
  if matrix_index.nil?
@@ -138,6 +142,7 @@ module Polyrun
138
142
  end
139
143
  child_env
140
144
  end
145
+ # rubocop:enable Metrics/AbcSize
141
146
 
142
147
  def cmd_build_paths(config_path)
143
148
  cfg = Polyrun::Config.load(path: config_path || ENV["POLYRUN_CONFIG"])
@@ -1,7 +1,13 @@
1
+ require "fileutils"
2
+
3
+ require_relative "run_shards_parallel_wait"
4
+
1
5
  module Polyrun
2
6
  class CLI
3
- # Spawns and waits on worker processes for +run-shards+ / +ci-shard-*+ fan-out.
7
+ # Spawns worker processes for +run-shards+ / +ci-shard-*+ fan-out. See {RunShardsParallelWait} for wait/timeout.
4
8
  module RunShardsParallelChildren
9
+ include RunShardsParallelWait
10
+
5
11
  private
6
12
 
7
13
  # @return [Array(Array, Integer, nil)] +[pids, spawn_error_code]+; +spawn_error_code+ is +nil+ when all spawns succeeded
@@ -40,14 +46,18 @@ module Polyrun
40
46
  shard: shard,
41
47
  matrix_index: mx,
42
48
  matrix_total: mt,
43
- failure_fragments: ctx[:merge_failures]
49
+ failure_fragments: ctx[:merge_failures],
50
+ spec_quality_fragments: ctx[:merge_spec_quality]
44
51
  )
45
52
  child_env = child_env.merge("POLYRUN_HOOK_ORCHESTRATOR" => "0")
46
53
  child_env = hook_cfg.merge_worker_ruby_env(child_env)
47
54
 
55
+ ping_path = run_shards_prepare_worker_ping!(ctx, child_env, shard)
56
+
48
57
  Polyrun::Log.warn "polyrun run-shards: shard #{shard} → #{paths.size} file(s)" if @verbose
58
+ spawned_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
49
59
  pid = run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
50
- pids << {pid: pid, shard: shard}
60
+ pids << {pid: pid, shard: shard, spawned_at: spawned_at, ping_path: ping_path}
51
61
  Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.spawn shard=#{shard} child_pid=#{pid} spec_files=#{paths.size}")
52
62
  Polyrun::Log.warn "polyrun run-shards: started shard #{shard} pid=#{pid} (#{paths.size} file(s))" if parallel
53
63
  end
@@ -55,6 +65,21 @@ module Polyrun
55
65
  end
56
66
  # rubocop:enable Metrics/AbcSize
57
67
 
68
# Prepares the ping file that idle-timeout detection reads for one shard.
#
# Returns +nil+ without touching +child_env+ when +ctx[:worker_idle_timeout_sec]+ is
# unset or a non-positive number. Otherwise creates an empty
# +tmp/polyrun/worker-ping-<parent pid>-<shard>.txt+ under the current directory and
# exports +POLYRUN_WORKER_PING_FILE+ / +POLYRUN_WORKER_PING_INTERVAL_SEC+ into
# +child_env+ (interval is taken from the parent's ENV, falling back to "15").
#
# @param ctx [Hash] run context; only +:worker_idle_timeout_sec+ is read here
# @param child_env [Hash] worker environment, mutated in place
# @param shard [Integer] shard index used in the file name
# @return [String, nil] absolute ping file path, or +nil+ when idle timeout is off
def run_shards_prepare_worker_ping!(ctx, child_env, shard)
  limit = ctx[:worker_idle_timeout_sec]
  return nil unless limit
  return nil if limit.is_a?(Numeric) && limit <= 0

  ping_dir = File.join(Dir.pwd, "tmp", "polyrun")
  FileUtils.mkdir_p(ping_dir)
  ping_file = File.expand_path("worker-ping-#{$$}-#{shard}.txt", ping_dir)
  # Start from an empty file so a stale timestamp is never read back.
  File.binwrite(ping_file, "")

  configured = ENV["POLYRUN_WORKER_PING_INTERVAL_SEC"].to_s.strip
  child_env["POLYRUN_WORKER_PING_FILE"] = ping_file
  child_env["POLYRUN_WORKER_PING_INTERVAL_SEC"] =
    if configured.empty?
      "15"
    else
      configured
    end
  ping_file
end
82
+
58
83
  def run_shards_spawn_one_worker(child_env, cmd, paths, hook_cfg)
59
84
  if hook_cfg.worker_hooks? && !Polyrun::Hooks.disabled?
60
85
  Process.spawn(child_env, "sh", "-c", hook_cfg.build_worker_shell_script(cmd, paths))
@@ -62,38 +87,6 @@ module Polyrun
62
87
  Process.spawn(child_env, *cmd, *paths)
63
88
  end
64
89
  end
65
-
66
- # @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
67
- def run_shards_wait_all_children(pids, hook_cfg, ctx)
68
- workers = ctx[:workers]
69
- shard_results = []
70
- after_hook_err = 0
71
- Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
72
- pids.each do |h|
73
- Process.wait(h[:pid])
74
- exitstatus = $?.exitstatus
75
- ok = $?.success?
76
- Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{exitstatus} success=#{ok}")
77
- env_after = ENV.to_h.merge(
78
- "POLYRUN_HOOK_ORCHESTRATOR" => "1",
79
- "POLYRUN_SHARD_INDEX" => h[:shard].to_s,
80
- "POLYRUN_SHARD_TOTAL" => workers.to_s,
81
- "POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
82
- )
83
- rc = hook_cfg.run_phase_if_enabled(:after_shard, env_after)
84
- after_hook_err = rc if rc != 0 && after_hook_err == 0
85
- shard_results << {shard: h[:shard], exitstatus: exitstatus, success: ok}
86
- end
87
- rescue Interrupt
88
- # Do not trap SIGINT: Process.wait raises Interrupt; a trap races and prints Interrupt + SystemExit traces.
89
- run_shards_shutdown_on_signal!(pids, 130)
90
- rescue SignalException => e
91
- raise unless e.signm == "SIGTERM"
92
-
93
- run_shards_shutdown_on_signal!(pids, 143)
94
- end
95
- [shard_results, after_hook_err]
96
- end
97
90
  end
98
91
  end
99
92
  end
@@ -0,0 +1,267 @@
1
+ # rubocop:disable Polyrun/FileLength, Metrics/ModuleLength -- wait loop + idle/wall flush (kept out of spawn module)
2
+ module Polyrun
3
+ class CLI
4
+ # Wait, wall/idle timeout, and +after_shard+ hooks for parallel workers (+run-shards+ / +ci-shard-*+).
5
+ module RunShardsParallelWait
6
+ WORKER_TIMEOUT_EXIT_STATUS = 124
7
+ WORKER_IDLE_TIMEOUT_EXIT_STATUS = 125
8
+
9
+ private
10
+
11
# Waits for every spawned worker and runs the +after_shard+ hook as each one is reaped.
#
# When +ctx+ carries a positive wall (+:worker_timeout_sec+) or idle
# (+:worker_idle_timeout_sec+) limit the polling wait is used so limits apply to all
# workers; otherwise each PID is waited on in order. +Interrupt+ is routed to
# +run_shards_shutdown_on_signal!+ with 130 and SIGTERM with 143; any other signal
# is re-raised.
#
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+ (0 when all +after_shard+ hooks passed)
#   NOTE(review): the pair is returned through +Polyrun::Debug.time+, which presumably yields its block's value — confirm.
# rubocop:disable Metrics/AbcSize -- wait loop + timeout flush
def run_shards_wait_all_children(pids, hook_cfg, ctx)
  worker_count = ctx[:workers]
  results = []
  hook_rc = 0
  # A non-positive numeric limit means "disabled".
  wall_limit, idle_limit = %i[worker_timeout_sec worker_idle_timeout_sec].map do |key|
    limit = ctx[key]
    (limit.is_a?(Numeric) && limit <= 0) ? nil : limit
  end

  Polyrun::Debug.time("Process.wait (#{pids.size} worker process(es))") do
    if !wall_limit && !idle_limit
      run_shards_wait_all_children_sequential(pids, hook_cfg, worker_count, results, hook_rc)
    else
      run_shards_wait_all_children_multiplex(
        pids, hook_cfg, ctx, worker_count, wall_limit, idle_limit, results, hook_rc
      )
    end
  rescue Interrupt
    run_shards_shutdown_on_signal!(pids, 130)
  rescue SignalException => e
    raise if e.signm != "SIGTERM"

    run_shards_shutdown_on_signal!(pids, 143)
  end
end
# rubocop:enable Metrics/AbcSize
39
+
40
# Blocking wait on each worker in spawn order (used when no wall/idle limit is set).
#
# Each reaped worker is passed to +run_shards_finalize_reaped_worker!+, which appends
# to +shard_results+ and returns the running hook error code.
#
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+
def run_shards_wait_all_children_sequential(pids, hook_cfg, workers, shard_results, after_hook_err)
  hook_rc = pids.reduce(after_hook_err) do |rc, worker|
    _reaped, status = Process.wait2(worker[:pid])
    run_shards_finalize_reaped_worker!(worker, hook_cfg, workers, status, shard_results, rc)
  end
  [shard_results, hook_rc]
end
48
+
49
# Poll every live PID each tick so wall and idle timeouts apply to all workers, not only the first in wait order.
#
# Each tick: reap whatever has exited (non-blocking), stop when nothing is left, then
# ask +run_shards_timeout_violation+ whether a limit was exceeded. On a violation the
# matching flush method takes over and its result is returned. Otherwise sleeps 0.2s.
#
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+
def run_shards_wait_all_children_multiplex(pids, hook_cfg, ctx, workers, timeout_sec, idle_sec, shard_results, after_hook_err)
  live = pids.dup
  hook_rc = after_hook_err

  loop do
    live.delete_if do |worker|
      reaped, status = Process.wait2(worker[:pid], Process::WNOHANG)
      next false if reaped != worker[:pid]

      hook_rc = run_shards_finalize_reaped_worker!(worker, hook_cfg, workers, status, shard_results, hook_rc)
      true
    end

    break if live.empty?

    tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    reason, culprit = run_shards_timeout_violation(pids, live, ctx, tick, timeout_sec, idle_sec)
    if reason == :wall_timeout
      bystanders = live.reject { |w| w[:pid] == culprit[:pid] }
      return run_shards_wait_flush_after_worker_timeout!(
        culprit, bystanders, hook_cfg, ctx, timeout_sec, workers, shard_results, hook_rc
      )
    elsif reason == :idle_timeout
      bystanders = live.reject { |w| w[:pid] == culprit[:pid] }
      return run_shards_wait_flush_after_worker_idle!(
        culprit, bystanders, hook_cfg, ctx, idle_sec, workers, shard_results, hook_rc
      )
    end

    sleep(0.2)
  end

  [shard_results, hook_rc]
end
85
+
86
# Finds the first still-pending worker that exceeded a limit, scanning in spawn order.
#
# Wall-clock violations are checked for all pending workers before any idle check.
# A worker's wall clock starts at its +:spawned_at+ (falling back to +ctx[:run_t0]+).
# Idle checks only apply to workers with a +:ping_path+ whose ping file yields a
# timestamp; a worker that never pinged is not reported as idle.
#
# @return [(Symbol, Hash), nil] e.g. +[:wall_timeout, h]+ when a limit is exceeded
def run_shards_timeout_violation(pids_order, pending, ctx, now, timeout_sec, idle_sec)
  live = pids_order.select { |h| pending.any? { |p| p[:pid] == h[:pid] } }

  if timeout_sec && timeout_sec > 0
    overdue = live.find { |h| now >= (h[:spawned_at] || ctx[:run_t0]) + timeout_sec }
    return [:wall_timeout, overdue] if overdue
  end

  if idle_sec && idle_sec > 0
    stalled = live.find do |h|
      next false unless h[:ping_path]

      last_ping, = run_shards_read_worker_ping_payload(h[:ping_path])
      last_ping && (now - last_ping) > idle_sec
    end
    return [:idle_timeout, stalled] if stalled
  end

  nil
end
108
+
109
# Book-keeping for a worker that has just been reaped: debug log, +after_shard+ hook,
# result row, ping file cleanup.
#
# NOTE(review): +st.exitstatus+ is nil when the worker was terminated by a signal, so
# the hook then sees an empty POLYRUN_WORKER_EXIT_STATUS and the result row carries
# +exitstatus: nil+ — confirm downstream consumers expect that.
#
# @param h [Hash] worker record (+:pid+, +:shard+, +:ping_path+)
# @param st [Process::Status] status of the reaped worker
# @return [Integer] running hook error code: the first non-zero +after_shard+ result seen so far
def run_shards_finalize_reaped_worker!(h, hook_cfg, workers, st, shard_results, after_hook_err)
  code = st.exitstatus
  passed = st.success?
  Polyrun::Debug.log("[parent pid=#{$$}] run-shards: Process.wait child_pid=#{h[:pid]} shard=#{h[:shard]} exit=#{code} success=#{passed}")

  hook_rc = run_shards_invoke_after_shard!(hook_cfg, h[:shard], workers, code)
  shard_results << {shard: h[:shard], exitstatus: code, success: passed}
  run_shards_unlink_ping_path(h[:ping_path])

  # Keep the first hook failure; later failures do not overwrite it.
  (after_hook_err == 0 && hook_rc != 0) ? hook_rc : after_hook_err
end
119
+
120
# Removes a worker ping file if it exists. Blank or nil paths and anything that is not
# a regular file are ignored, and filesystem errors are swallowed on purpose.
#
# @param path [String, nil] ping file path (surrounding whitespace is stripped)
def run_shards_unlink_ping_path(path)
  target = path.to_s.strip
  return if target.empty?
  return unless File.file?(target)

  File.unlink(target)
rescue SystemCallError
  # best-effort cleanup of tmp/polyrun/worker-ping-*.txt
end
128
+
129
# Timestamp part of the ping payload at +path+ (first element of
# +run_shards_read_worker_ping_payload+), or +nil+ when none could be read.
def run_shards_read_worker_ping_time(path)
  stamp, = run_shards_read_worker_ping_payload(path)
  stamp
end
132
+
133
# Reads a worker ping file: first line is a timestamp, everything after the first
# newline is an optional location.
#
# Missing path, non-file, unreadable, blank content, or a blank first line all yield
# +[nil, nil]+. A first line that does not parse to a positive number yields a +nil+
# time while the location is still returned.
#
# @return [Array(Float?, String?)] monotonic time and optional location line (path:line)
def run_shards_read_worker_ping_payload(path)
  return [nil, nil] if !path || !File.file?(path)

  raw = File.binread(path)
  return [nil, nil] if raw.nil? || raw.strip.empty?

  header, _newline, tail = raw.partition("\n")
  stamp_text = header.strip
  return [nil, nil] if stamp_text.empty?

  # String#to_f returns 0.0 for non-numeric text, which the positive? check rejects.
  stamp = stamp_text.to_f
  stamp = nil unless stamp.positive?
  location = tail.strip
  [stamp, (location unless location.empty?)]
rescue SystemCallError
  [nil, nil]
end
152
+
153
# Idle-timeout path: warn, then stop the idle worker and every other pending worker.
# The idle worker is recorded with +WORKER_IDLE_TIMEOUT_EXIT_STATUS+.
#
# @param timed_h [Hash] worker record that exceeded the idle limit
# @param others [Array<Hash>] remaining pending workers
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+
def run_shards_wait_flush_after_worker_idle!(timed_h, others, hook_cfg, ctx, idle_sec, workers, shard_results, after_hook_err)
  run_shards_warn_worker_idle!(timed_h, ctx, idle_sec)
  run_shards_flush_after_limit_violation!(
    timed_h, others, hook_cfg, workers, shard_results, after_hook_err, WORKER_IDLE_TIMEOUT_EXIT_STATUS
  )
end

# Wall-timeout path: warn, then stop the timed-out worker and every other pending worker.
# The timed-out worker is recorded with +WORKER_TIMEOUT_EXIT_STATUS+.
#
# @param timed_h [Hash] worker record that exceeded the wall limit
# @param others [Array<Hash>] remaining pending workers
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+
def run_shards_wait_flush_after_worker_timeout!(timed_h, others, hook_cfg, ctx, timeout_sec, workers, shard_results, after_hook_err)
  run_shards_warn_worker_timeout!(timed_h, ctx, timeout_sec)
  run_shards_flush_after_limit_violation!(
    timed_h, others, hook_cfg, workers, shard_results, after_hook_err, WORKER_TIMEOUT_EXIT_STATUS
  )
end

# Shared teardown for both limit kinds (previously duplicated in each caller).
#
# Kills and reaps the offending worker, records it as failed with +limit_status+, then
# for each other worker: takes its real status if it already exited, otherwise kills
# it. A worker with no exit code (e.g. it was killed here) is recorded with
# +limit_status+ as well. +after_shard+ runs once per worker; the first non-zero hook
# result is kept.
#
# @param limit_status [Integer] exit status to record for the offending worker
# @return [Array(Array, Integer)] +[shard_results, after_shard_hook_error_code]+
def run_shards_flush_after_limit_violation!(timed_h, others, hook_cfg, workers, shard_results, after_hook_err, limit_status)
  run_shards_force_stop_pid_status(timed_h[:pid])
  run_shards_unlink_ping_path(timed_h[:ping_path])
  rc = run_shards_invoke_after_shard!(hook_cfg, timed_h[:shard], workers, limit_status)
  after_hook_err = rc if rc != 0 && after_hook_err == 0
  shard_results << {shard: timed_h[:shard], exitstatus: limit_status, success: false}

  others.each do |h2|
    st2 = run_shards_wait_or_force_stop_status(h2[:pid])
    ok2 = st2 ? st2.success? : false
    # exitstatus is nil when there is no status or the worker died from a signal.
    exit2 = st2&.exitstatus || limit_status
    run_shards_unlink_ping_path(h2[:ping_path])
    rc2 = run_shards_invoke_after_shard!(hook_cfg, h2[:shard], workers, exit2)
    after_hook_err = rc2 if rc2 != 0 && after_hook_err == 0
    shard_results << {shard: h2[:shard], exitstatus: exit2, success: ok2}
  end

  [shard_results, after_hook_err]
end
192
+
193
# Runs the +after_shard+ hook phase for one shard with the orchestrator's environment
# plus shard index, shard total and the worker's exit status (all as strings).
#
# @return whatever +hook_cfg.run_phase_if_enabled+ returns (callers treat non-zero as failure)
def run_shards_invoke_after_shard!(hook_cfg, shard, workers, exitstatus)
  shard_vars = {
    "POLYRUN_HOOK_ORCHESTRATOR" => "1",
    "POLYRUN_SHARD_INDEX" => shard.to_s,
    "POLYRUN_SHARD_TOTAL" => workers.to_s,
    "POLYRUN_WORKER_EXIT_STATUS" => exitstatus.to_s
  }
  hook_cfg.run_phase_if_enabled(:after_shard, ENV.to_h.merge(shard_vars))
end
202
+
203
# Logs the idle-timeout diagnosis for a worker: headline (with the last ping location
# when the ping file has one), up to five of the shard's paths, and a remediation hint.
#
# @param h [Hash] worker record (+:shard+, +:pid+, +:ping_path+)
# @param ctx [Hash] run context; +ctx[:plan].shard(index)+ supplies the shard's paths
# @param idle_sec [Numeric] idle limit that was exceeded
def run_shards_warn_worker_idle!(h, ctx, idle_sec)
  shard_paths = ctx[:plan].shard(h[:shard])
  listed = shard_paths.first(5).join(", ")
  note = ""
  note = " (#{shard_paths.size} files total)" if shard_paths.size > 5
  note = " (no paths)" if shard_paths.empty?

  _stamp, loc = run_shards_read_worker_ping_payload(h[:ping_path])
  where = loc.to_s.strip
  ping_note = (loc && !where.empty?) ? "; last ping #{where}" : ""

  Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER IDLE TIMEOUT after #{idle_sec}s since last per-example progress ping — shard #{h[:shard]} pid #{h[:pid]}#{ping_note}."
  Polyrun::Log.warn "polyrun run-shards: idle shard file sample: #{listed}#{note}"
  Polyrun::Log.warn "polyrun run-shards: use Polyrun::RSpec.install_worker_ping! / Polyrun::Minitest.install_worker_ping! (Polyrun Quick calls ping! each example); exit #{WORKER_IDLE_TIMEOUT_EXIT_STATUS}."
end
220
+
221
# Returns the status of +pid+ if it has already exited (non-blocking check); otherwise
# force-stops it via +run_shards_force_stop_pid_status+.
#
# @return [Process::Status, nil] +nil+ when there is no such child to wait for
def run_shards_wait_or_force_stop_status(pid)
  reaped, status = Process.wait2(pid, Process::WNOHANG)
  if reaped == pid
    status
  else
    run_shards_force_stop_pid_status(pid)
  end
rescue Errno::ECHILD
  nil
end
229
+
230
# Sends SIGKILL to +pid+ and reaps it.
#
# A missing process (+ESRCH+ from kill) is not an error: the reap is still attempted
# in case the child exited but was not yet waited for.
#
# @return [Process::Status, nil] status of the reaped child, or +nil+ if it was already reaped
def run_shards_force_stop_pid_status(pid)
  begin
    Process.kill(:KILL, pid)
  rescue Errno::ESRCH
    # process already gone; fall through and try to collect its status
  end

  _reaped, status = Process.wait2(pid)
  status
rescue Errno::ECHILD
  # child already reaped
  nil
end
248
+
249
# Logs the wall-timeout diagnosis for a worker: headline, up to five of the shard's
# paths, and how to override the limit.
#
# @param h [Hash] worker record (+:shard+, +:pid+)
# @param ctx [Hash] run context; +ctx[:plan].shard(index)+ supplies the shard's paths
# @param timeout_sec [Numeric] wall limit that was exceeded
def run_shards_warn_worker_timeout!(h, ctx, timeout_sec)
  shard_paths = ctx[:plan].shard(h[:shard])
  listed = shard_paths.first(5).join(", ")
  note = ""
  note = " (#{shard_paths.size} files total)" if shard_paths.size > 5
  note = " (no paths)" if shard_paths.empty?

  Polyrun::Log.orchestration_warn "polyrun run-shards: WORKER TIMEOUT after #{timeout_sec}s (wall time since worker spawn) — shard #{h[:shard]} pid #{h[:pid]}."
  Polyrun::Log.warn "polyrun run-shards: timeout shard includes: #{listed}#{note}"
  Polyrun::Log.warn "polyrun run-shards: override with --worker-timeout SEC or POLYRUN_WORKER_TIMEOUT_SEC; recorded exit #{WORKER_TIMEOUT_EXIT_STATUS} for this worker."
end
264
+ end
265
+ end
266
+ end
267
+ # rubocop:enable Polyrun/FileLength, Metrics/ModuleLength
@@ -1,3 +1,4 @@
1
+ # rubocop:disable Polyrun/FileLength -- run-shards boot phases A/B
1
2
  require "shellwords"
2
3
 
3
4
  module Polyrun
@@ -19,22 +20,43 @@ module Polyrun
19
20
  err = run_shards_validate_cmd(cmd)
20
21
  return [:fail, err] if err
21
22
 
23
+ run_shards_normalize_worker_timeout_option!(o)
24
+ run_shards_normalize_worker_idle_timeout_option!(o)
25
+
22
26
  cmd = Shellwords.split(cmd.first) if cmd.size == 1 && cmd.first.include?(" ")
23
27
 
24
28
  [:ok, o, cmd]
25
29
  end
26
30
 
31
+ # rubocop:disable Metrics/AbcSize -- items + costs + plan emit for run-shards
27
32
  def run_shards_plan_phase_b(o, cmd, cfg, pc, run_t0, config_path)
28
33
  items, paths_source, err = run_shards_resolve_items(o[:paths_file], pc)
29
34
  return [err, nil] if err
30
35
 
31
- costs, strategy, err = run_shards_resolve_costs(o[:timing_path], o[:strategy], o[:timing_granularity])
36
+ costs, strategy, err = run_shards_resolve_costs(
37
+ o[:timing_path],
38
+ o[:strategy],
39
+ o[:timing_granularity],
40
+ strategy_explicit: o[:strategy_explicit]
41
+ )
32
42
  return [err, nil] if err
33
43
 
34
44
  run_shards_plan_ready_log(o, cfg, strategy, cmd, paths_source, items.size)
35
45
 
36
46
  constraints = load_partition_constraints(pc, o[:constraints_path])
37
- plan = run_shards_make_plan(items, o[:workers], strategy, o[:seed], costs, constraints, o[:timing_granularity])
47
+ stable = load_stable_assignment(pc)
48
+ plan = run_shards_make_plan(
49
+ items, o[:workers], strategy, o[:seed], costs, constraints, o[:timing_granularity], stable,
50
+ shard_weights: pc["shard_weights"] || pc[:shard_weights]
51
+ )
52
+
53
+ partition_emit_diagnostics!(
54
+ plan: plan,
55
+ items: items,
56
+ costs: costs,
57
+ timing_path: o[:timing_path],
58
+ granularity: o[:timing_granularity]
59
+ )
38
60
 
39
61
  run_shards_debug_shard_sizes(plan, o[:workers])
40
62
  Polyrun::Log.warn "polyrun run-shards: #{items.size} paths → #{o[:workers]} workers (#{strategy})" if @verbose
@@ -44,6 +66,7 @@ module Polyrun
44
66
 
45
67
  [nil, run_shards_plan_context_hash(o, cmd, cfg, plan, run_t0, parallel, config_path)]
46
68
  end
69
+ # rubocop:enable Metrics/AbcSize
47
70
 
48
71
  def run_shards_plan_boot(argv, config_path)
49
72
  run_t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
@@ -66,6 +89,7 @@ module Polyrun
66
89
  strategy: strategy,
67
90
  merge_coverage: o[:merge_coverage],
68
91
  merge_failures: run_shards_merge_failures_flag(o, cfg),
92
+ merge_spec_quality: run_shards_merge_spec_quality_flag(o, cfg),
69
93
  command: cmd,
70
94
  timing_path: o[:timing_path],
71
95
  paths_source: paths_source,
@@ -73,6 +97,26 @@ module Polyrun
73
97
  )
74
98
  end
75
99
 
100
# Whether spec-quality fragments should be merged after the run.
#
# Sources, first match wins: the parsed option +o[:merge_spec_quality]+, the
# +POLYRUN_MERGE_SPEC_QUALITY+ environment variable, then +merge_spec_quality+ in
# +cfg.reporting+ (string or symbol key). "1", "true" and "yes" (any case) count as on.
#
# @return [Boolean]
def run_shards_merge_spec_quality_flag(o, cfg)
  switched_on = ->(raw) { %w[1 true yes].include?(raw.to_s.downcase) }
  return true if o[:merge_spec_quality] || switched_on.call(ENV["POLYRUN_MERGE_SPEC_QUALITY"])

  reporting = cfg.reporting
  setting = reporting["merge_spec_quality"] || reporting[:merge_spec_quality]
  setting == true || switched_on.call(setting)
end
108
+
109
# Output path for the merged spec-quality report.
#
# Sources, first non-blank wins: +o[:merge_spec_quality_output]+, then the
# +POLYRUN_MERGED_SPEC_QUALITY_OUT+ environment variable; otherwise
# +merge_spec_quality_output+ from +cfg.reporting+ (string or symbol key) is returned
# as-is, which may be +nil+.
def run_shards_merge_spec_quality_output_opt(o, cfg)
  [o[:merge_spec_quality_output], ENV["POLYRUN_MERGED_SPEC_QUALITY_OUT"]].each do |candidate|
    return candidate if candidate && !candidate.to_s.strip.empty?
  end

  reporting = cfg.reporting
  reporting["merge_spec_quality_output"] || reporting[:merge_spec_quality_output]
end
119
+
76
120
  def run_shards_merge_failures_flag(o, cfg)
77
121
  return true if o[:merge_failures]
78
122
  return true if %w[1 true yes].include?(ENV["POLYRUN_MERGE_FAILURES"].to_s.downcase)
@@ -118,9 +162,43 @@ module Polyrun
118
162
  merge_failures: run_shards_merge_failures_flag(o, cfg),
119
163
  merge_failures_output: run_shards_merge_failures_output_opt(o, cfg),
120
164
  merge_failures_format: run_shards_merge_failures_format_opt(o, cfg),
121
- config_path: config_path
165
+ merge_spec_quality: run_shards_merge_spec_quality_flag(o, cfg),
166
+ merge_spec_quality_output: run_shards_merge_spec_quality_output_opt(o, cfg),
167
+ report_spec_quality: o[:report_spec_quality] != false,
168
+ config_path: config_path,
169
+ worker_timeout_sec: run_shards_resolved_worker_timeout_sec(o),
170
+ worker_idle_timeout_sec: run_shards_resolved_worker_idle_timeout_sec(o)
122
171
  }
123
172
  end
173
+
174
# Treats a zero or negative +--worker-idle-timeout+ as "not given" by resetting
# +o[:worker_idle_timeout_sec]+ to +nil+. Unset and positive values are left alone.
def run_shards_normalize_worker_idle_timeout_option!(o)
  seconds = o[:worker_idle_timeout_sec]
  o[:worker_idle_timeout_sec] = nil if !seconds.nil? && seconds <= 0
end
180
+
181
# Effective idle timeout: a positive numeric +o[:worker_idle_timeout_sec]+ wins (as a
# Float); anything else defers to +env_worker_idle_timeout_sec+.
def run_shards_resolved_worker_idle_timeout_sec(o)
  requested = o[:worker_idle_timeout_sec]
  if requested.is_a?(Numeric) && requested > 0
    requested.to_f
  else
    env_worker_idle_timeout_sec
  end
end
187
+
188
# Treats a zero or negative +--worker-timeout+ as "not given" by resetting
# +o[:worker_timeout_sec]+ to +nil+. Unset and positive values are left alone.
def run_shards_normalize_worker_timeout_option!(o)
  seconds = o[:worker_timeout_sec]
  o[:worker_timeout_sec] = nil if !seconds.nil? && seconds <= 0
end
194
+
195
# Effective wall timeout: a positive numeric +o[:worker_timeout_sec]+ wins (as a
# Float); anything else defers to +env_worker_timeout_sec+.
def run_shards_resolved_worker_timeout_sec(o)
  requested = o[:worker_timeout_sec]
  if requested.is_a?(Numeric) && requested > 0
    requested.to_f
  else
    env_worker_timeout_sec
  end
end
124
201
  end
125
202
  end
126
203
  end
204
+ # rubocop:enable Polyrun/FileLength
@@ -18,6 +18,7 @@ module Polyrun
18
18
  workers: env_int("POLYRUN_WORKERS", Polyrun::Config::DEFAULT_PARALLEL_WORKERS),
19
19
  paths_file: nil,
20
20
  strategy: (pc["strategy"] || pc[:strategy] || "round_robin").to_s,
21
+ strategy_explicit: !!(pc["strategy"] || pc[:strategy]),
21
22
  seed: pc["seed"] || pc[:seed],
22
23
  timing_path: nil,
23
24
  constraints_path: nil,
@@ -27,7 +28,12 @@ module Polyrun
27
28
  merge_format: nil,
28
29
  merge_failures: false,
29
30
  merge_failures_output: nil,
30
- merge_failures_format: nil
31
+ merge_failures_format: nil,
32
+ merge_spec_quality: false,
33
+ merge_spec_quality_output: nil,
34
+ report_spec_quality: true,
35
+ worker_timeout_sec: nil,
36
+ worker_idle_timeout_sec: nil
31
37
  }
32
38
  end
33
39
 
@@ -39,9 +45,14 @@ module Polyrun
39
45
 
40
46
  # rubocop:disable Metrics/AbcSize -- one argv block for run-shards
41
47
  def run_shards_plan_options_register!(opts, st)
42
- opts.banner = "usage: polyrun run-shards [--workers N] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--] <command> [args...]"
48
+ opts.banner = "usage: polyrun run-shards [--workers N] [--worker-timeout SEC] [--worker-idle-timeout SEC] [--strategy NAME] [--paths-file P] [--timing P] [--timing-granularity VAL] [--constraints P] [--seed S] [--merge-coverage] [--merge-output P] [--merge-format LIST] [--merge-failures] [--merge-failures-output P] [--merge-failures-format jsonl|json] [--merge-spec-quality] [--merge-spec-quality-output P] [--no-report-spec-quality] [--] <command> [args...]"
43
49
  opts.on("--workers N", Integer) { |v| st[:workers] = v }
44
- opts.on("--strategy NAME", String) { |v| st[:strategy] = v }
50
+ opts.on("--worker-timeout SEC", Float, "Max seconds per worker since spawn (also POLYRUN_WORKER_TIMEOUT_SEC); kills stuck workers (exit 124)") { |v| st[:worker_timeout_sec] = v }
51
+ opts.on("--worker-idle-timeout SEC", Float, "Max seconds since last valid WorkerPing timestamp in POLYRUN_WORKER_PING_FILE (needs prior ping); RSpec/Minitest: install_worker_ping!; Quick: automatic; exit 125") { |v| st[:worker_idle_timeout_sec] = v }
52
+ opts.on("--strategy NAME", String) do |v|
53
+ st[:strategy] = v
54
+ st[:strategy_explicit] = true
55
+ end
45
56
  opts.on("--seed VAL") { |v| st[:seed] = v }
46
57
  opts.on("--paths-file PATH", String) { |v| st[:paths_file] = v }
47
58
  opts.on("--constraints PATH", String) { |v| st[:constraints_path] = v }
@@ -53,6 +64,9 @@ module Polyrun
53
64
  opts.on("--merge-failures", "After all workers exit, merge tmp/polyrun_failures/polyrun-failure-fragment-*.jsonl (use Polyrun::RSpec.install_failure_fragments!)") { st[:merge_failures] = true }
54
65
  opts.on("--merge-failures-output PATH", String) { |v| st[:merge_failures_output] = v }
55
66
  opts.on("--merge-failures-format VAL", "jsonl (default) or json") { |v| st[:merge_failures_format] = v }
67
+ opts.on("--merge-spec-quality", "After workers exit, merge coverage/polyrun-spec-quality-fragment-*.jsonl (POLYRUN_SPEC_QUALITY in workers)") { st[:merge_spec_quality] = true }
68
+ opts.on("--merge-spec-quality-output PATH", String) { |v| st[:merge_spec_quality_output] = v }
69
+ opts.on("--no-report-spec-quality", "Skip printing spec-quality report after merge") { st[:report_spec_quality] = false }
56
70
  end
57
71
  # rubocop:enable Metrics/AbcSize
58
72
  end
@@ -23,13 +23,10 @@ module Polyrun
23
23
  run_shards_plan_phase_b(o, cmd, cfg, pc, run_t0, config_path)
24
24
  end
25
25
 
26
- def run_shards_default_timing_path(pc, timing_path, strategy)
26
+ def run_shards_default_timing_path(pc, timing_path, _strategy = nil)
27
27
  return timing_path if timing_path
28
28
 
29
- tf = pc["timing_file"] || pc[:timing_file]
30
- return tf if tf && (Polyrun::Partition::Plan.cost_strategy?(strategy) || Polyrun::Partition::Plan.hrw_strategy?(strategy))
31
-
32
- nil
29
+ pc["timing_file"] || pc[:timing_file]
33
30
  end
34
31
 
35
32
  def run_shards_validate_workers!(o)
@@ -70,8 +67,13 @@ module Polyrun
70
67
  [items, paths_source, nil]
71
68
  end
72
69
 
73
- def run_shards_resolve_costs(timing_path, strategy, timing_granularity)
70
+ def run_shards_resolve_costs(timing_path, strategy, timing_granularity, strategy_explicit: false)
71
+ strategy = strategy.to_s
74
72
  if timing_path
73
+ if strategy_explicit && strategy == "round_robin"
74
+ return [nil, strategy, nil]
75
+ end
76
+
75
77
  costs = Polyrun::Partition::Plan.load_timing_costs(
76
78
  File.expand_path(timing_path.to_s, Dir.pwd),
77
79
  granularity: timing_granularity
@@ -80,12 +82,16 @@ module Polyrun
80
82
  Polyrun::Log.warn "polyrun run-shards: timing file missing or empty: #{timing_path}"
81
83
  return [nil, nil, 2]
82
84
  end
83
- unless Polyrun::Partition::Plan.cost_strategy?(strategy) || Polyrun::Partition::Plan.hrw_strategy?(strategy)
85
+ if Polyrun::Partition::Plan.timing_load_strategy?(strategy)
86
+ return [costs, strategy, nil]
87
+ end
88
+ unless strategy_explicit
84
89
  Polyrun::Log.warn "polyrun run-shards: using cost_binpack (timing data present)" if @verbose
85
- strategy = "cost_binpack"
90
+ return [costs, "cost_binpack", nil]
86
91
  end
87
- [costs, strategy, nil]
88
- elsif Polyrun::Partition::Plan.cost_strategy?(strategy)
92
+
93
+ [nil, strategy, nil]
94
+ elsif Polyrun::Partition::Plan.cost_strategy?(strategy) || Polyrun::Partition::Plan.lazy_robin_strategy?(strategy)
89
95
  Polyrun::Log.warn "polyrun run-shards: --timing or partition.timing_file required for strategy #{strategy}"
90
96
  [nil, nil, 2]
91
97
  else
@@ -93,7 +99,7 @@ module Polyrun
93
99
  end
94
100
  end
95
101
 
96
- def run_shards_make_plan(items, workers, strategy, seed, costs, constraints, timing_granularity)
102
+ def run_shards_make_plan(items, workers, strategy, seed, costs, constraints, timing_granularity, stable_assignment = nil, shard_weights: nil)
97
103
  Polyrun::Debug.time("Partition::Plan.new (partition #{items.size} paths → #{workers} shards)") do
98
104
  Polyrun::Partition::Plan.new(
99
105
  items: items,
@@ -103,7 +109,9 @@ module Polyrun
103
109
  costs: costs,
104
110
  constraints: constraints,
105
111
  root: Dir.pwd,
106
- timing_granularity: timing_granularity
112
+ timing_granularity: timing_granularity,
113
+ stable_assignment: stable_assignment,
114
+ shard_weights: shard_weights
107
115
  )
108
116
  end
109
117
  end