chrono_forge 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ {
2
+ "planPath": "docs/superpowers/plans/2026-07-01-workflow-definition-dag.md",
3
+ "tasks": [
4
+ {
5
+ "id": 1,
6
+ "subject": "Task 1: Definition value objects + Prism dep",
7
+ "status": "completed",
8
+ "description": "Graph data model (Definition/Node/Edge) + prism runtime dependency, round-trippable to_h.\n\n```json:metadata\n{\"files\": [\"lib/chrono_forge/definition.rb\", \"chrono_forge.gemspec\", \"test/definition_test.rb\"], \"verifyCommand\": \"bundle exec ruby -I test test/definition_test.rb\", \"acceptanceCriteria\": [\"Definition holds nodes/edges/warnings; to_h JSON-safe\", \"Node#dynamic?\", \"prism declared dependency\"], \"requiresUserVerification\": false}\n```"
9
+ },
10
+ {
11
+ "id": 2,
12
+ "subject": "Task 2: Analyzer — linear steps",
13
+ "status": "completed",
14
+ "blockedBy": [1],
15
+ "description": "DefinitionAnalyzer.call resolves perform via Prism; a node per straight-line durable call with sequential edges from start.\n\n```json:metadata\n{\"files\": [\"lib/chrono_forge/definition_analyzer.rb\", \"test/support/definition_fixtures.rb\", \"test/definition_analyzer_test.rb\"], \"verifyCommand\": \"bundle exec ruby -I test test/definition_analyzer_test.rb\", \"acceptanceCriteria\": [\"node per durable call in source order with exact step_name\", \"seq edges start->n1->n2\", \"non-durable Ruby ignored\", \"no DB/exec\"], \"requiresUserVerification\": false}\n```"
16
+ },
17
+ {
18
+ "id": 3,
19
+ "subject": "Task 3: Analyzer — conditionals & guards",
20
+ "status": "completed",
21
+ "blockedBy": [2],
22
+ "description": "if/unless/case around durable calls -> :conditional edges with guard labels; rejoin skip/body; continue_if false path -> :terminal.\n\n```json:metadata\n{\"files\": [\"lib/chrono_forge/definition_analyzer.rb\", \"test/support/definition_fixtures.rb\", \"test/definition_analyzer_test.rb\"], \"verifyCommand\": \"bundle exec ruby -I test test/definition_analyzer_test.rb\", \"acceptanceCriteria\": [\"guarded conditional edge with source guard\", \"rejoin skip+body\", \"continue_if terminal edge\"], \"requiresUserVerification\": false}\n```"
23
+ },
24
+ {
25
+ "id": 4,
26
+ "subject": "Task 4: Analyzer — branch fan-out + merge join",
27
+ "status": "completed",
28
+ "blockedBy": [3],
29
+ "description": "branch block -> :branch fan-out node + child-group node via :fanout edge; merge_branches -> :join edge from branch.\n\n```json:metadata\n{\"files\": [\"lib/chrono_forge/definition_analyzer.rb\", \"test/support/definition_fixtures.rb\", \"test/definition_analyzer_test.rb\"], \"verifyCommand\": \"bundle exec ruby -I test test/definition_analyzer_test.rb\", \"acceptanceCriteria\": [\"branch$name node + child-group with pattern\", \"branch->child :fanout\", \"branch->merge :join\"], \"requiresUserVerification\": false}\n```"
30
+ },
31
+ {
32
+ "id": 5,
33
+ "subject": "Task 5: Analyzer — repeat, helper tracing, loop warnings",
34
+ "status": "completed",
35
+ "blockedBy": [4],
36
+ "description": "durably_repeat -> :repeat node; trace durable calls in same-class helpers (recursion-guarded); durable call inside a loop -> warning, no crash.\n\n```json:metadata\n{\"files\": [\"lib/chrono_forge/definition_analyzer.rb\", \"test/support/definition_fixtures.rb\", \"test/definition_analyzer_test.rb\"], \"verifyCommand\": \"bundle exec ruby -I test test/definition_analyzer_test.rb\", \"acceptanceCriteria\": [\"durably_repeat$tick single repeat node\", \"same-class helper traced in position\", \"loop-with-durable warns\", \"no infinite recursion\"], \"requiresUserVerification\": false}\n```"
37
+ },
38
+ {
39
+ "id": 6,
40
+ "subject": "Task 6: Dashboard — DefinitionOverlay",
41
+ "status": "completed",
42
+ "blockedBy": [1],
43
+ "description": "Annotate nodes with runtime status from execution_logs; fan-out/repeat aggregates via BranchProbe/rep logs; append unmapped nodes.\n\n```json:metadata\n{\"files\": [\"chrono_forge-dashboard/app/presenters/chrono_forge/dashboard/definition_overlay.rb\", \"chrono_forge-dashboard/test/definition_overlay_test.rb\"], \"verifyCommand\": \"cd chrono_forge-dashboard && bundle exec rake test TEST=test/definition_overlay_test.rb\", \"acceptanceCriteria\": [\"exact-name status\", \"branch/merge counts\", \"repeat repetitions\", \"unmapped logs appended\"], \"requiresUserVerification\": false}\n```"
44
+ },
45
+ {
46
+ "id": 7,
47
+ "subject": "Task 7: Dashboard — MermaidRenderer",
48
+ "status": "completed",
49
+ "blockedBy": [6],
50
+ "description": "Statused nodes + edges -> Mermaid flowchart TD string with shapes by kind, :::status classes, guard edge labels, classDef lines.\n\n```json:metadata\n{\"files\": [\"chrono_forge-dashboard/app/presenters/chrono_forge/dashboard/mermaid_renderer.rb\", \"chrono_forge-dashboard/test/mermaid_renderer_test.rb\"], \"verifyCommand\": \"cd chrono_forge-dashboard && bundle exec rake test TEST=test/mermaid_renderer_test.rb\", \"acceptanceCriteria\": [\"flowchart TD header\", \"node line per node with :::status\", \"edge lines with guard labels\", \"classDef per used status\"], \"requiresUserVerification\": false}\n```"
51
+ },
52
+ {
53
+ "id": 8,
54
+ "subject": "Task 8: Dashboard — definition page (route/controller/view/Mermaid/link)",
55
+ "status": "completed",
56
+ "blockedBy": [5, 6, 7],
57
+ "description": "GET workflows/:id/definition page — analyze class, overlay run, render Mermaid client-side (vendored), warnings panel, graceful degradation, link from detail page.\n\n```json:metadata\n{\"files\": [\"chrono_forge-dashboard/config/routes.rb\", \"chrono_forge-dashboard/app/controllers/chrono_forge/dashboard/definitions_controller.rb\", \"chrono_forge-dashboard/app/views/chrono_forge/dashboard/definitions/show.html.erb\", \"chrono_forge-dashboard/app/assets/chrono_forge/dashboard/mermaid.min.js\", \"chrono_forge-dashboard/app/views/chrono_forge/dashboard/workflows/show.html.erb\", \"chrono_forge-dashboard/test/definitions_controller_test.rb\"], \"verifyCommand\": \"cd chrono_forge-dashboard && bundle exec rake test TEST=test/definitions_controller_test.rb\", \"acceptanceCriteria\": [\"200 with flowchart TD for analyzable wf\", \"unknown class degrades to warning not 500\", \"detail page links to page\"], \"requiresUserVerification\": false}\n```"
58
+ },
59
+ {
60
+ "id": 9,
61
+ "subject": "Task 9: Full suite + docs",
62
+ "status": "completed",
63
+ "blockedBy": [8],
64
+ "description": "Both packages green, lint clean, document the definition page.\n\n```json:metadata\n{\"files\": [\"chrono_forge-dashboard/README.md\"], \"verifyCommand\": \"bundle exec rake test && cd chrono_forge-dashboard && bundle exec rake test\", \"acceptanceCriteria\": [\"core suite green\", \"dashboard suite green\", \"lint clean on new files\"], \"requiresUserVerification\": false}\n```"
65
+ }
66
+ ],
67
+ "lastUpdated": "2026-07-01T00:00:00Z"
68
+ }
@@ -0,0 +1,203 @@
1
+ # Workflow Definition DAG — static "future timeline" for ChronoForge
2
+
3
+ **Status:** Design approved (pending written-spec review)
4
+ **Date:** 2026-07-01
5
+ **Reference:** the `durable_flow` gem's `DefinitionAnalyzer` (Prism-based static analyzer + definition DAG overlaid with runtime status).
6
+
7
+ ## Problem
8
+
9
+ The dashboard today shows only the **historical** timeline of a workflow — the
10
+ `execution_logs` that have already run. There is no forward view: an operator
11
+ can't see the steps a workflow *will* run, where the current run sits in the
12
+ overall shape, or which branches/loops are still ahead.
13
+
14
+ ChronoForge workflows are plain Ruby: a `perform` method that the engine
15
+ **replays** every resume, with each durable step identified by a string name
16
+ (`durably_execute$name`, `wait_until$cond`, `branch$name`, `merge$a,b`,
17
+ `durably_repeat$name$<ts>`). Because the structure is expressed in source, we can
18
+ recover a *projection* of the step sequence by statically parsing `perform` with
19
+ Prism — without executing anything — and then paint the run's actual status onto
20
+ that static map.
21
+
22
+ ## Goal
23
+
24
+ A **new per-run dashboard page** that renders a workflow's **conditional DAG**
25
+ (the static definition graph) with the current run's `execution_logs` **overlaid**
26
+ as node status. The existing workflow detail page is unchanged; it gains a link
27
+ to this page.
28
+
29
+ Non-goals for v1 are listed under [Scope](#scope-v1).
30
+
31
+ ## Key decisions (locked during brainstorming)
32
+
33
+ 1. **Primary consumer:** dashboard overlay — run status painted on the static map
34
+ (mirrors durable_flow's run → definition-DAG view).
35
+ 2. **Map shape:** a **conditional DAG** — guarded edges for `if`/`continue_if`,
36
+ fan-out groups for branches, joins for merges.
37
+ 3. **Fidelity:** **conservative + trace same-class helper methods**. Resolve step
38
+ names statically where possible; anything unresolvable (computed `name:`,
39
+ data-dependent loop count, a durable call behind an unknown/external call)
40
+ becomes an explicit **`dynamic` node with a warning**. No unrolling, no
41
+ cross-class tracing.
42
+ 4. **Rendering:** **Mermaid.js** (client-side, vendored). The analyzer's graph
43
+ model is rendering-agnostic; a renderer emits Mermaid flowchart text with
44
+ status encoded as node classes.
45
+ 5. **Static vs runtime:** static Prism analysis is the source of the *shape*
46
+ (only it can show not-yet-run steps and untaken branches); the run log is the
47
+ *overlay*, never the source of the graph.
48
+ 6. **Placement:** a **new route/page**, not an inline addition to the detail page.
49
+
50
+ ## Architecture
51
+
52
+ ```
53
+ workflow_class
54
+ │ DefinitionAnalyzer.call (core gem; Prism; memoized by class + source digest)
55
+
56
+ Definition (Node[], Edge[], warnings) (plain, JSON-serializable value objects)
57
+ │ DefinitionOverlay(execution_logs) (dashboard; read-only queries; per-run; never cached)
58
+
59
+ statused Definition
60
+ │ MermaidRenderer (dashboard; statused graph → flowchart text)
61
+
62
+ new DAG page → vendored Mermaid JS renders client-side (inside data-poll-region)
63
+ ```
64
+
65
+ ### Core gem — `lib/chrono_forge/` (rendering-agnostic, no dashboard/DB dependency)
66
+
67
+ - **`ChronoForge::DefinitionAnalyzer`** — `.call(workflow_class) → Definition`.
68
+ - Resolves `workflow_class.instance_method(:perform).source_location`, reads the
69
+ file, `Prism.parse`, locates the `perform` def node, and walks its body with a
70
+ visitor.
71
+ - **Traces durable calls in same-class helper methods** to a fixed point within
72
+ the class (a call to a method defined on the same class whose body contains
73
+ durable DSL calls is expanded inline; recursion is guarded).
74
+ - Emits nodes, edges, and warnings. **Only reads source text — never touches the
75
+ DB, never executes workflow code.**
76
+ - **`ChronoForge::Definition`** (+ `Node`, `Edge`) — plain value objects,
77
+ JSON-serializable so a `Definition` can be cached.
78
+ - `Node`: `id`, `kind` ∈ `{:execute, :wait, :wait_until, :continue_if, :branch,
79
+ :merge, :repeat, :dynamic}`, `label`, and **either** an exact `step_name`
80
+ **or** a `step_name_pattern` (fan-out/repeat/dynamic), plus optional `guard`
81
+ (condition source label) and `warnings`.
82
+ - `Edge`: `from`, `to`, optional `guard` label, and a `kind` (`:seq`,
83
+ `:conditional`, `:fanout`, `:join`, `:terminal`).
84
+
85
+ ### Dashboard package — `chrono_forge-dashboard/`
86
+
87
+ - **`DefinitionOverlay`** — takes a `Definition` + a workflow's `execution_logs`
88
+ (and, for `:branch`/`:merge` nodes, child-workflow state counts via the existing
89
+ `BranchProbe`) and annotates each node with a runtime `status`. Read-only.
90
+ - **`MermaidRenderer`** — `statused Definition → flowchart text`; status encoded
91
+ as `classDef` + `class` assignments.
92
+ - **New controller action + view** — `GET workflows/:id/definition`, plus a
93
+ "Definition graph" link from the existing detail page.
94
+ - **Vendored Mermaid JS** — the dashboard's first client script, initialized
95
+ inside the existing `data-poll-region` so the DAG re-renders on the normal
96
+ page refresh.
97
+
98
+ ## Node → step-name binding
99
+
100
+ Each node knows the step-name it *would* produce, so the overlay is a lookup, not
101
+ guesswork:
102
+
103
+ | DSL call | Node kind | Binds to |
104
+ |---|---|---|
105
+ | `durably_execute :m` / `name: "x"` | `:execute` | exact `durably_execute$m`, or `durably_execute$x` when `name: "x"` is given |
106
+ | `durably_execute :m, name: <expr>` | `:dynamic` | prefix `durably_execute$`, by ordinal |
107
+ | `wait <duration>, "n"` | `:wait` | exact `wait$n` (name is the 2nd positional) |
108
+ | `wait_until :cond` | `:wait_until` | exact `wait_until$cond` |
109
+ | `continue_if :cond` | `:continue_if` | exact `continue_if$cond` |
110
+ | `branch :name { spawn/spawn_each }` | `:branch` (fan-out) | `branch$name` + child-workflow aggregate |
111
+ | `merge_branches :a, :b` | `:merge` (join) | `merge$a,b` (names sorted) |
112
+ | `durably_repeat :name` | `:repeat` (loop) | `durably_repeat$name` coordination log + `$<ts>` repetition logs |
113
+
114
+ **Fan-out (`branch`/`spawn_each`) and `durably_repeat` collapse to a single node
115
+ with aggregate status** — not one node per child/iteration.
116
+
117
+ ## Overlay status vocabulary (→ Mermaid classes)
118
+
119
+ - `done` — matching log is `completed`.
120
+ - `active` — log is `started`/`running`, not completed.
121
+ - `pending` — reached but not done (a coordination log exists, work outstanding).
122
+ - `not_reached` — no log yet.
123
+ - `failed` / `stalled` — from the log state.
124
+ - `conditional` — statically guarded; may be skipped.
125
+ - `dynamic` — unresolved name; bound by prefix + ordinal.
126
+ - `unmapped` — **a runtime log with no matching static node**; appended so
127
+ analyzer gaps are surfaced, not hidden.
128
+
129
+ Aggregates:
130
+ - `:repeat` → "N done, current active, `till` met?" from the coordination log +
131
+ its `$<ts>` repetition logs.
132
+ - `:branch`/`:merge` → child-workflow state counts (running/idle/completed/failed)
133
+ via `BranchProbe`.
134
+
135
+ ## Edges & conditionals
136
+
137
+ - Sequential DSL calls → `:seq` edges.
138
+ - `if`/`unless`/`case`/`&&`/`||`/early-return around a step → `:conditional` edge
139
+ labeled with the condition source; steps only reachable under a guard render
140
+ `conditional`.
141
+ - `continue_if` → a gate node; its false path is a `:terminal` edge (workflow
142
+ halts).
143
+ - `branch` block → fans out (`:fanout`) to its spawn/`spawn_each` child-group;
144
+ `merge_branches` is the `:join` those edges reconnect into.
145
+ - `each`/`times`/`while` containing durable calls → one node + a "dynamic loop
146
+ count" **warning** (conservative — no unrolling).
147
+
148
+ ## Error handling
149
+
150
+ The analyzer must never break the dashboard:
151
+
152
+ - Source unavailable (`source_location` nil, C-defined, `eval`'d, unreadable
153
+ file) → return a `Definition` carrying a single `unavailable` warning; the page
154
+ renders "can't be statically analyzed" gracefully. Never raises.
155
+ - Any Prism parse issue degrades the same way (Prism is error-tolerant).
156
+ - Missing/unloadable `job_class`, or a partially-resolved analysis → render what
157
+ was found plus a warnings panel.
158
+ - **Analyzer is pure/read-only over source text**; the overlay does read-only
159
+ queries only.
160
+
161
+ ## Caching
162
+
163
+ - Memoize `Definition` by `job_class` + source-file digest — auto-invalidates on
164
+ dev code reload, stable in prod.
165
+ - The **overlay is never cached** — it is per-run and changes every poll.
166
+
167
+ ## Testing
168
+
169
+ - **Analyzer unit tests (no DB):** a fixture set of workflow classes — linear,
170
+ conditional/`continue_if`, `branch`+`spawn_each`, `durably_repeat`, dynamic
171
+ `name:`, helper-traced, unanalyzable loop — asserting node kinds, edges, guards,
172
+ and warnings. Deterministic and fast.
173
+ - **Overlay tests (dashboard harness):** seed `execution_logs` + child workflows;
174
+ assert per-node status, fan-out aggregates, repeat counts, and the `unmapped`
175
+ path.
176
+ - **`MermaidRenderer`:** golden-text tests (statused `Definition` → expected
177
+ flowchart string).
178
+
179
+ ## Scope (v1)
180
+
181
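+ **In:** all seven primitives (`durably_execute`, `wait`, `wait_until`, `continue_if`, `branch`, `merge_branches`, `durably_repeat`) as nodes + conditional edges + fan-out/repeat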
182
+ aggregation + the overlay + the new per-run DAG page + Mermaid rendering;
183
+ same-class helper tracing.
184
+
185
+ **Out (deferred):**
186
+ - Cross-class helper tracing.
187
+ - Recursively expanding a spawned child *workflow class* into its own graph
188
+ (v1 shows it as one fan-out node; "drill into child" is a future feature).
189
+ - Per-node ETA/timing beyond status + counts (that's the separate progress/ETA
190
+ feature).
191
+ - A class-level (no-overlay) definition view (trivial later addition).
192
+
193
+ ## Open questions / risks
194
+
195
+ - **Helper-tracing fixed point:** need a clear rule for what counts as "a durable
196
+ call inside a same-class method" vs. ordinary work, and recursion/mutual-call
197
+ guards. The analyzer stays conservative — when in doubt, emit a `dynamic` node +
198
+ warning rather than a confident-but-wrong expansion.
199
+ - **Ordinal binding for dynamic siblings** is best-effort; if two dynamic
200
+ `durably_execute` calls interleave at runtime out of source order, the overlay
201
+ may mis-bind. Acceptable for v1 (surfaced as `dynamic`).
202
+ - **Mermaid as first client dependency** — keep it vendored and isolated so the
203
+ rest of the dashboard stays server-rendered.
@@ -4,7 +4,22 @@ module ChronoForge
4
4
  # Lightweight poller that joins one or more branches. NOT a workflow — it holds
5
5
  # no lock, does no replay, and carries no context. It exists so the heavy parent
6
6
  # workflow is replayed only twice per merge (kick off + completion wake).
7
+ #
8
+ # DEPLOY NOTE — queue placement matters. merge_branches enqueues this poller
9
+ # AFTER dispatching the branch's children, so if it runs on the SAME queue as a
10
+ # large fan-out's children it is starved behind the whole backlog and only gets a
11
+ # worker slot near the end. It then polls once, at pending≈0, with no prior sample
12
+ # (rate 0) and backs off to max_interval — so the parent's convergence lags by up
13
+ # to max_interval and no mid-drain throughput sample is ever recorded. Set
14
+ # ChronoForge.config.branch_merge_queue to a queue NOT saturated by the fan-out's
15
+ # own children so it polls throughout the drain (ETA cadence then converges
16
+ # tightly). See ChronoForge::Configuration and docs/fanout-scale-test.md.
7
17
  class BranchMergeJob < ActiveJob::Base
18
+ # Resolved per-enqueue from config (a block, so changing the config takes effect
19
+ # without redefining the job — and it can't be silently reset by a code reload,
20
+ # unlike a queue_as monkey-patch in a to_prepare block).
21
+ queue_as { ChronoForge.config.branch_merge_queue }
22
+
8
23
  # The poller is the parent's only wake mechanism, so survive TRANSIENT
9
24
  # infrastructure errors (DB connection/timeout/deadlock) with backoff. Any
10
25
  # other error — a programming bug, a bad guard — is NOT retried: it propagates
@@ -16,8 +31,7 @@ module ChronoForge
16
31
  ActiveRecord::LockWaitTimeout,
17
32
  wait: :polynomially_longer, attempts: 25
18
33
 
19
- CAP = 5_000 # cap the pending count; beyond it we just pick max_interval
20
- FACTOR = 0.06 # seconds of delay per pending child
34
+ ETA_FRACTION = 0.5 # poll at this fraction of the projected time-to-drain
21
35
  REKICK_AFTER = 5.minutes
22
36
  REKICK_BATCH = 200 # bound per-run rekicks; later polls handle the rest
23
37
 
@@ -29,44 +43,135 @@ module ChronoForge
29
43
  # re-enqueue) holds a stale token. It stops quietly — no poll, no wake, no
30
44
  # reschedule — leaving only the newest chain to drive the merge. (A nil token
31
45
  # is a pre-upgrade job enqueued before fencing existed; it runs unfenced.)
32
- return if superseded?(branch_log_ids, token)
46
+ logs = ExecutionLog.where(id: branch_log_ids).to_a
47
+ return if superseded?(logs, token)
33
48
 
34
49
  # Per-branch probe (kept as maps so we can persist each branch's own state,
35
50
  # not just the merge aggregate). Same query count as a plain sum/all?.
36
- pending_by_branch = branch_log_ids.to_h { |id| [id, BranchProbe.incomplete(id).limit(CAP).count] }
51
+ # The pending count is UNCAPPED: it feeds the drain signal below (a change in
52
+ # pending since the prior poll), which a CAP would flatten into a false
53
+ # "not draining" for large branches.
54
+ prev_pending_by_branch = logs.to_h { |l| [l.id, l.metadata&.dig("poll", "pending")] }
55
+ pending_by_branch = branch_log_ids.to_h { |id| [id, BranchProbe.incomplete(id).count] }
37
56
  sealed_by_branch = branch_log_ids.to_h { |id| [id, BranchProbe.sealed?(id)] }
38
57
  pending = pending_by_branch.values.sum
39
58
  sealed = sealed_by_branch.values.all?
40
59
 
60
+ # Total children spawned per branch. Immutable once the branch is SEALED
61
+ # (dispatch done), so we count it exactly ONCE and cache it on the metadata;
62
+ # every later poll (and the dashboard) reuses the cached value, never recounting.
63
+ # Unsealed (mid-spawn, count still climbing) => nil, and the dashboard falls back
64
+ # to its capped live count until the seal freezes the total.
65
+ logs_by_id = logs.index_by(&:id)
66
+ spawned_by_branch = branch_log_ids.to_h do |id|
67
+ cached = logs_by_id[id]&.metadata&.dig("poll", "spawned")
68
+ [id, cached || (sealed_by_branch[id] ? BranchProbe.spawned(id).count : nil)]
69
+ end
70
+
41
71
  if sealed && pending.zero?
42
- record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at: nil)
72
+ record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at: nil, interval: nil,
73
+ rate_by_branch: {}, never_started_by_branch: {}, spawned_by_branch: spawned_by_branch, rekicked_by_branch: {})
43
74
  parent_job_class.constantize.perform_later(parent_key)
44
75
  return
45
76
  end
46
77
 
47
- rekick_dropped_jobs(branch_log_ids)
78
+ # DISPATCHED (never-started) count per branch — the rekick drain signal. A
79
+ # drop since the prior poll means workers are consuming this branch's queue,
80
+ # so a still-queued child is in line; a flat count with stale never-started
81
+ # children is a dropped job to recover. Keyed off this, NOT total pending,
82
+ # which a wait/wait_until child completing would drop without any never-started
83
+ # child moving (masking a genuinely-dropped one behind staggered waits).
84
+ prev_never_started_by_branch = logs.to_h { |l| [l.id, l.metadata&.dig("poll", "never_started")] }
85
+ never_started_by_branch = branch_log_ids.to_h { |id| [id, BranchProbe.never_started(id).count] }
86
+
87
+ rekicked_by_branch = rekick_dropped_jobs(branch_log_ids, never_started_by_branch, prev_never_started_by_branch)
88
+
89
+ # Cadence is driven by ESTIMATED TIME-TO-DRAIN, measured from the prior
90
+ # poll's persisted pending. `motion` (EXISTS probes) is the fallback signal
91
+ # when nothing completed this interval: :running => a live worker is
92
+ # executing a child (hold the floor, it'll finish); :never_started => the only
93
+ # motion is a queued/rekicked-but-unpicked child (back off exponentially,
94
+ # it may never be picked up); :none => blocked/waiting (max backstop).
95
+ # See reschedule_delay. Computed lazily below, only off the drain path.
96
+ prior = logs.map { |l| l.metadata&.dig("poll") }
97
+ # Only trust the AGGREGATE prev_pending when every requested branch log is
98
+ # loaded AND carries a prior sample — otherwise `pending` (over all
99
+ # branch_log_ids) and prev_pending (over loaded logs) would cover different
100
+ # sets and yield a bogus aggregate rate. Missing/partial => no sample =>
101
+ # bootstrap. Per-branch rate below is independently safe (missing => nil => 0).
102
+ complete_prior = logs.size == branch_log_ids.size && prior.all?
103
+ prev_pending = (prior.sum { |p| p["pending"].to_i } if complete_prior)
104
+ prev_polled_at = prior.filter_map { |p| p && p["last_polled_at"] }.map { |s| Time.zone.parse(s) }.min
105
+ elapsed = prev_polled_at && (Time.current - prev_polled_at)
106
+ prev_delay = prior.filter_map { |p| p && p["interval"] }.max
107
+
108
+ # Drain rate = children completed / second since the prior poll — THIS is the
109
+ # throughput surfaced on the dashboard. Per branch for display; aggregated for
110
+ # the ETA. Zero when nothing drained (a no-headway poll, or a cold poll with no prior sample).
111
+ # NOTE: the aggregate ETA blurs a heterogeneous multi-branch merge; acceptable
112
+ # (the common case is single-branch; clamp + per-poll re-estimate bound any
113
+ # skew, and only poll timing is affected — the parent is still woken).
114
+ drained = ->(pend, prev) { prev && elapsed && elapsed > 0 && pend < prev }
115
+ rate_by_branch = pending_by_branch.to_h do |id, pend|
116
+ prev = prev_pending_by_branch[id]
117
+ [id, drained.call(pend, prev) ? (prev - pend) / elapsed.to_f : 0.0]
118
+ end
119
+ rate = drained.call(pending, prev_pending) ? (prev_pending - pending) / elapsed.to_f : 0.0
120
+
121
+ # Only needed when the ETA branch won't be taken (rate == 0); computing the
122
+ # EXISTS probes lazily keeps them off the hot drain path. See reschedule_delay.
123
+ motion = if rate > 0 then nil
124
+ elsif branch_log_ids.any? { |id| BranchProbe.running?(id) } then :running
125
+ elsif never_started_by_branch.values.any?(&:positive?) then :never_started
126
+ else :none
127
+ end
48
128
 
49
- delay = reschedule_delay(pending, min_interval, max_interval)
50
- record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at: delay.seconds.from_now)
129
+ delay = reschedule_delay(pending, rate, motion, prev_delay, min_interval, max_interval)
130
+ record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at: delay.seconds.from_now,
131
+ interval: delay, rate_by_branch: rate_by_branch, never_started_by_branch: never_started_by_branch,
132
+ spawned_by_branch: spawned_by_branch, rekicked_by_branch: rekicked_by_branch)
51
133
  self.class.set(wait: delay.seconds)
52
134
  .perform_later(parent_key, parent_job_class, branch_log_ids, min_interval, max_interval, token)
53
135
  end
54
136
 
55
137
  private
56
138
 
57
- # Adaptive poll cadence: scale the wait with the number of pending children,
58
- # clamped to [min_interval, max_interval]. min_interval <= max_interval is
59
- # enforced up front in merge_branches, so the clamp can't raise here.
60
- def reschedule_delay(pending, min_interval, max_interval)
61
- (pending * FACTOR).clamp(min_interval, max_interval)
139
+ # Adaptive poll cadence driven by ESTIMATED TIME-TO-DRAIN, not backlog size.
140
+ # When the branch-set drained since the last poll we project completion from
141
+ # the measured rate and poll at ETA_FRACTION of it, clamped [min, max]. Because
142
+ # each poll re-estimates against the shrinking remainder, cadence converges
143
+ # geometrically and detects the merge within ~min_interval of the last child
144
+ # finishing — where the old count-based cadence polled SLOWEST (max_interval)
145
+ # exactly when a fast-draining backlog was about to complete.
146
+ #
147
+ # No completion observed this interval — fall back on `motion`:
148
+ # :running => a live worker is executing a child; it will finish, so hold
149
+ # the responsive floor (matches prior behaviour and avoids
150
+ # waking the parent late for a slow/low-fan-out child).
151
+ # :never_started => the only motion is a queued/rekicked-but-unpicked child that
152
+ # may never be picked up => exponential backoff from the floor
153
+ # (double prev_delay, capped at max), catching a quick recovery
154
+ # within seconds without spinning on a dead dispatch.
155
+ # :none => nothing can progress (blocked/failed or parked on a wait) =>
156
+ # straight to max_interval, the cheap recovery backstop.
157
+ # min_interval <= max_interval is enforced in merge_branches, so clamp is safe.
158
+ # `rate` is children/s measured by the caller (0 => nothing completed since the
159
+ # prior poll / cold poll).
160
+ def reschedule_delay(pending, rate, motion, prev_delay, min_interval, max_interval)
161
+ return (pending / rate * ETA_FRACTION).clamp(min_interval, max_interval) if rate > 0
162
+
163
+ case motion
164
+ when :running then min_interval
165
+ when :never_started then prev_delay ? (prev_delay * 2).clamp(min_interval, max_interval) : min_interval
166
+ else max_interval
167
+ end
62
168
  end
63
169
 
64
170
  # A poller is superseded when its token no longer matches what's stored on the
65
171
  # branch logs (a newer merge_branches pass rotated it). A plain read is enough
66
172
  # for the early-out; the persisting write in record_poll! re-checks the token
67
173
  # under a row lock so it can never clobber the newer chain.
68
- def superseded?(branch_log_ids, token)
69
- logs = ExecutionLog.where(id: branch_log_ids).to_a
174
+ def superseded?(logs, token)
70
175
  logs.empty? || logs.any? { |log| log.metadata&.dig("poll_token") != token }
71
176
  end
72
177
 
@@ -78,7 +183,7 @@ module ChronoForge
78
183
  # work still pending is the signal that the poller was dropped). This is purely
79
184
  # observational — replay and correctness never read it. It writes a "poll"
80
185
  # sub-key, leaving spawn_each's "cursors" metadata untouched.
81
- def record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at:)
186
+ def record_poll!(pending_by_branch, sealed_by_branch, token, next_poll_at:, interval:, rate_by_branch:, never_started_by_branch:, spawned_by_branch:, rekicked_by_branch:)
82
187
  now = Time.current
83
188
  ExecutionLog.where(id: pending_by_branch.keys).find_each do |log|
84
189
  # Lock the row so this read-modify-write can't clobber a concurrent token
@@ -88,12 +193,25 @@ module ChronoForge
88
193
  log.with_lock do
89
194
  meta = log.metadata || {}
90
195
  next unless meta["poll_token"] == token
196
+ prev = meta["poll"] || {}
197
+ n = rekicked_by_branch[log.id].to_i
198
+ pend = pending_by_branch[log.id]
199
+ rate = rate_by_branch[log.id].to_f
91
200
  meta["poll"] = {
92
201
  "last_polled_at" => now.iso8601,
93
202
  "next_poll_at" => next_poll_at&.iso8601,
94
- "pending" => pending_by_branch[log.id],
203
+ "interval" => interval,
204
+ "pending" => pend,
205
+ "never_started" => never_started_by_branch[log.id], # never-started count (rekick drain signal)
206
+ "spawned" => prev["spawned"] || spawned_by_branch[log.id], # total spawned; immutable once sealed, so sticky
95
207
  "sealed" => sealed_by_branch[log.id],
96
- "polls" => meta.dig("poll", "polls").to_i + 1
208
+ "rate" => rate.round(3), # children/s (round(3), not (2), so a
209
+ # very slow but real drain still reads > 0)
210
+ "eta_seconds" => (rate > 0 ? (pend / rate).round : nil),
211
+ "polls" => prev["polls"].to_i + 1,
212
+ "rekicked" => n,
213
+ "rekick_total" => prev["rekick_total"].to_i + n,
214
+ "last_rekick_at" => (n.positive? ? now.iso8601 : prev["last_rekick_at"])
97
215
  }
98
216
  log.update!(metadata: meta)
99
217
  end
@@ -111,10 +229,22 @@ module ChronoForge
111
229
  # keep the :idle guard (a running/failed/stalled child must never be
112
230
  # re-dispatched). Re-enqueue of an :idle child a worker just grabbed is still
113
231
  # safe — the lock guard rejects the duplicate. Capped per run.
114
- def rekick_dropped_jobs(branch_log_ids)
115
- branch_log_ids.each do |id|
232
+ def rekick_dropped_jobs(branch_log_ids, never_started_by_branch, prev_never_started_by_branch)
233
+ cutoff = REKICK_AFTER.ago
234
+ branch_log_ids.to_h do |id|
235
+ # Skip a branch whose NEVER-STARTED count dropped since the last poll:
236
+ # workers are pulling its dispatched children off the queue, so a still-
237
+ # queued child is in line, not dropped. Deliberately NOT total pending —
238
+ # a wait/wait_until child completing would drop pending without any
239
+ # never-started child moving, masking a genuinely-dropped child behind
240
+ # staggered waits. With no prior sample (cold poll) we don't gate — the
241
+ # per-child staleness filter below still spares freshly-dispatched rows.
242
+ prev = prev_never_started_by_branch[id]
243
+ next [id, 0] if prev && never_started_by_branch[id] < prev
244
+
245
+ count = 0
116
246
  Workflow.where(parent_execution_log_id: id, state: Workflow.states[:idle], started_at: nil)
117
- .where("updated_at < ?", REKICK_AFTER.ago)
247
+ .where("updated_at < ?", cutoff)
118
248
  .limit(REKICK_BATCH)
119
249
  .find_each do |child|
120
250
  # Intentionally uses the GUARDED perform_later (single-child path),
@@ -126,12 +256,19 @@ module ChronoForge
126
256
  # error — dead-letter the poller, orphaning every healthy sibling. Catch
127
257
  # per child, log, and let the next poll retry it (it's still idle+stale).
128
258
  child.job_klass.perform_later(child.key, **child.kwargs.symbolize_keys)
259
+ # Debounce: bump updated_at so this child isn't re-rekicked until it's
260
+ # been unstarted for another REKICK_AFTER — one redelivery window for a
261
+ # worker to pick it up. Only on a SUCCESSFUL enqueue; a rescued failure
262
+ # leaves it stale so the next poll retries.
263
+ child.touch
264
+ count += 1
129
265
  rescue => e
130
266
  Rails.logger.error do
131
267
  "ChronoForge:BranchMergeJob rekick failed for child #{child.key}: " \
132
268
  "#{e.class}: #{e.message}"
133
269
  end
134
270
  end
271
+ [id, count]
135
272
  end
136
273
  end
137
274
  end
@@ -19,6 +19,50 @@ module ChronoForge
19
19
  .where.not(state: Workflow.states[:completed])
20
20
  end
21
21
 
22
+ # Relation of children that can advance on their own — actively running, or
23
+ # dispatched-but-not-yet-started (started_at nil). This drives the adaptive
24
+ # poll cadence. Deliberately EXCLUDES waiting children (idle with started_at
25
+ # SET — parked on a wait/wait_until) and blocked children (failed/stalled —
26
+ # awaiting operator recovery): polling can't make either progress, so they
27
+ # must not pin the cadence at the responsive floor. They still count as
28
+ # +incomplete+ (the branch stays open), they just don't accelerate polling.
29
def progressing(branch_log_id)
  # Every child of the branch, whatever its state; both arms below narrow this.
  children = Workflow.where(parent_execution_log_id: branch_log_id)
  # Arm 1: children in the running state.
  executing = children.where(state: Workflow.states[:running])
  # Arm 2: idle children whose started_at is still unset (the #never_started filter).
  unstarted = children.where(state: Workflow.states[:idle], started_at: nil)
  executing.or(unstarted)
end
34
+
35
+ # A child of this branch is actively executing — a live worker will complete
36
+ # it, so the poller can hold its responsive floor rather than backing off.
37
def running?(branch_log_id)
  # Existence probe only — no child rows are loaded.
  Workflow.exists?(parent_execution_log_id: branch_log_id, state: Workflow.states[:running])
end
40
+
41
+ # Children dispatched but not yet started (idle, started_at nil) — the queue of
42
+ # never-started work for this branch. A DROP in this count between polls means
43
+ # workers are actively pulling it off the queue (so a still-queued child is in
44
+ # line, not dropped); the rekick gate keys off that. Distinct from total pending,
45
+ # which a wait/wait_until child completing would drop without any never-started
46
+ # child moving. (Not to be confused with the dashboard's "Dispatched" column,
47
+ # which is the TOTAL children spawned.)
48
def never_started(branch_log_id)
  Workflow
    .where(parent_execution_log_id: branch_log_id)
    # idle + no started_at: dispatched, but no worker has picked it up yet
    .where(state: Workflow.states[:idle], started_at: nil)
end
52
+
53
+ # A child was dispatched but no worker has started it yet. If this is the only
54
+ # motion left, it's a queued/rekicked-but-unpicked straggler (which may never be
55
+ # picked up), NOT active work — so the poller backs off.
56
def never_started?(branch_log_id)
  # Delegates to the #never_started relation; asks only whether any row exists.
  never_started(branch_log_id).exists?
end
57
+
58
+ # All children spawned into this branch (every state) — the dispatch total. Fixed
59
+ # once the branch is sealed, so the poller counts it exactly once and caches it on
60
+ # the branch-log metadata. This is the dashboard's "Spawned" column. Distinct from
61
+ # #never_started, which is only the idle-and-unstarted subset.
62
def spawned(branch_log_id) = Workflow.where(parent_execution_log_id: branch_log_id)
65
+
22
66
  def done?(branch_log_id)
23
67
  sealed?(branch_log_id) && !incomplete(branch_log_id).exists?
24
68
  end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
module ChronoForge
  # Holds engine-wide settings. Populate it from an initializer through
  # ChronoForge.configure.
  class Configuration
    # Name of the queue that BranchMergeJob (the branch-merge poller) runs on.
    #
    # Do NOT point this at a queue that a fan-out's own children can saturate.
    # merge_branches enqueues the poller only AFTER the branch's children have
    # been dispatched, so on a shared queue the poller waits behind the entire
    # backlog and gets a worker slot only near the end. It then polls once, at
    # pending≈0, and backs off — the parent's convergence lags by up to
    # max_interval and no mid-drain throughput is recorded.
    #
    # The poller is the engine's code rather than the user's job, which is why
    # its placement is a first-class setting instead of something to
    # monkey-patch onto BranchMergeJob.
    #
    # The :default value is fine when fan-outs run on their own queues. For
    # large fan-outs, use a dedicated queue with its own worker so the poller
    # runs promptly throughout the drain.
    attr_accessor :branch_merge_queue

    # Starts every setting at its default value.
    def initialize
      self.branch_merge_queue = :default
    end
  end
end