@aztec/prover-node 5.0.0-nightly.20260531 → 5.0.0-nightly.20260610

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/README.md +506 -0
  2. package/dest/actions/rerun-epoch-proving-job.d.ts +3 -3
  3. package/dest/actions/rerun-epoch-proving-job.d.ts.map +1 -1
  4. package/dest/actions/rerun-epoch-proving-job.js +97 -17
  5. package/dest/checkpoint-store.d.ts +83 -0
  6. package/dest/checkpoint-store.d.ts.map +1 -0
  7. package/dest/checkpoint-store.js +181 -0
  8. package/dest/config.d.ts +1 -1
  9. package/dest/config.d.ts.map +1 -1
  10. package/dest/config.js +1 -1
  11. package/dest/factory.d.ts +1 -1
  12. package/dest/factory.d.ts.map +1 -1
  13. package/dest/factory.js +1 -6
  14. package/dest/index.d.ts +2 -1
  15. package/dest/index.d.ts.map +1 -1
  16. package/dest/index.js +1 -0
  17. package/dest/job/checkpoint-prover.d.ts +134 -0
  18. package/dest/job/checkpoint-prover.d.ts.map +1 -0
  19. package/dest/job/checkpoint-prover.js +350 -0
  20. package/dest/job/epoch-session.d.ts +146 -0
  21. package/dest/job/epoch-session.d.ts.map +1 -0
  22. package/dest/job/epoch-session.js +709 -0
  23. package/dest/job/top-tree-job.d.ts +82 -0
  24. package/dest/job/top-tree-job.d.ts.map +1 -0
  25. package/dest/job/top-tree-job.js +152 -0
  26. package/dest/metrics.d.ts +19 -5
  27. package/dest/metrics.d.ts.map +1 -1
  28. package/dest/metrics.js +51 -9
  29. package/dest/proof-publishing-service.d.ts +159 -0
  30. package/dest/proof-publishing-service.d.ts.map +1 -0
  31. package/dest/proof-publishing-service.js +334 -0
  32. package/dest/prover-node-publisher.d.ts +3 -15
  33. package/dest/prover-node-publisher.d.ts.map +1 -1
  34. package/dest/prover-node-publisher.js +36 -93
  35. package/dest/prover-node.d.ts +96 -67
  36. package/dest/prover-node.d.ts.map +1 -1
  37. package/dest/prover-node.js +374 -219
  38. package/dest/session-manager.d.ts +158 -0
  39. package/dest/session-manager.d.ts.map +1 -0
  40. package/dest/session-manager.js +452 -0
  41. package/dest/test/index.d.ts +7 -6
  42. package/dest/test/index.d.ts.map +1 -1
  43. package/package.json +23 -23
  44. package/src/actions/rerun-epoch-proving-job.ts +101 -26
  45. package/src/checkpoint-store.ts +213 -0
  46. package/src/config.ts +2 -1
  47. package/src/factory.ts +0 -8
  48. package/src/index.ts +1 -0
  49. package/src/job/checkpoint-prover.ts +465 -0
  50. package/src/job/epoch-session.ts +424 -0
  51. package/src/job/top-tree-job.ts +227 -0
  52. package/src/metrics.ts +51 -12
  53. package/src/proof-publishing-service.ts +424 -0
  54. package/src/prover-node-publisher.ts +41 -111
  55. package/src/prover-node.ts +440 -242
  56. package/src/session-manager.ts +552 -0
  57. package/src/test/index.ts +6 -6
  58. package/dest/job/epoch-proving-job.d.ts +0 -67
  59. package/dest/job/epoch-proving-job.d.ts.map +0 -1
  60. package/dest/job/epoch-proving-job.js +0 -912
  61. package/src/job/epoch-proving-job.ts +0 -531
package/README.md CHANGED
@@ -1 +1,507 @@
1
1
  # Prover Node
2
+
3
+ The prover node turns sequenced checkpoints into epoch proofs that get submitted to the L1
4
+ rollup contract. It runs alongside an Aztec validator/full-node and consumes the
5
+ canonical chain view those nodes emit, proving epochs **optimistically** — sub-tree
6
+ work begins the moment a checkpoint lands on L1, not when the epoch closes.
7
+
8
+ This document describes the internal architecture: the state held by the prover-node,
9
+ the events that drive it, and the data flow from a fresh `chain-checkpointed` event
10
+ through to a `submitEpochRootProof` on L1.
11
+
12
+ ## Contents
13
+
14
+ 1. [Architecture](#architecture)
15
+ 2. [CheckpointProver lifecycle](#checkpointprover-lifecycle)
16
+ 3. [EpochSession lifecycle](#epochsession-lifecycle)
17
+ 4. [Event flow](#event-flow)
18
+ 5. [Walkthroughs](#walkthroughs)
19
+ 6. [Design rationale](#design-rationale)
20
+ 7. [Configuration](#configuration)
21
+ 8. [Failure handling and observability](#failure-handling-and-observability)
22
+
23
+ ## Architecture
24
+
25
+ ```mermaid
26
+ flowchart TB
27
+ L2BlockStream -->|chain-checkpointed| ProverNode
28
+ L2BlockStream -->|chain-pruned| ProverNode
29
+ L2BlockStream -->|chain-proven| ProverNode
30
+ L2BlockStream -->|any event| ProverNode
31
+ ProverNode --> CheckpointStore
32
+ ProverNode --> ChonkCache
33
+ ProverNode --> SessionManager
34
+ ProverNode --> ProofPublishingService
35
+ SessionManager --> EpochTicker[(periodic tick)]
36
+ SessionManager --> FullSessions[(fullSessions)]
37
+ SessionManager --> PartialSessions[(partialSessions)]
38
+ CheckpointStore --> SlotWatcher
39
+ FullSessions -.referenced checkpoints.-> CheckpointStore
40
+ PartialSessions -.referenced checkpoints.-> CheckpointStore
41
+ FullSessions --> TopTreeJob
42
+ PartialSessions --> TopTreeJob
43
+ TopTreeJob -->|PublishCandidate| ProofPublishingService
44
+ ProofPublishingService -->|fresh per publish| ProverNodePublisher
45
+ ProverNodePublisher --> L1[L1 Rollup]
46
+ ```
47
+
48
+ The prover-node splits responsibility between four classes:
49
+
50
+ - **`ProverNode`** — owns the long-lived collections, wires the L2BlockStream, and
51
+ translates each chain event into a single method call on the `SessionManager` or
52
+ `ProofPublishingService`. It also performs the per-event side effects that don't
53
+ belong on an `EpochSession` (registering new checkpoints with the store, sweeping
54
+ expired epochs out of the cache and the store, etc.) and runs the failure-upload
55
+ action when an `EpochSession` exits with `failed`.
56
+ - **`CheckpointStore`** — a registry of `CheckpointProver` instances keyed by
57
+ `(checkpointNumber, slot, archiveRoot)`. Each `CheckpointProver` runs its own sub-tree pipeline
58
+ (tx gather → block processing → block-rollup proofs), starting eagerly the moment a
59
+ checkpoint is registered. The store is the single source of truth for canonical-vs-pruned
60
+ checkpoint content that `EpochSession`s query when assembling their subsets.
61
+ - **`SessionManager`** — owns every live `EpochSession`, the serial reconcile queue,
62
+ the periodic tick, and all `EpochSession` lifecycle decisions. `ProverNode` calls into it
63
+ via `onCheckpointAdded`, `onPrune`, and `startProof`. Every trigger it receives is
64
+ translated into a `reconcile(trigger)` call, a single idempotent function that walks
65
+ all `EpochSession`s, cancels any whose canonical content has shifted, re-creates them with
66
+ the new content, and opens fresh full `EpochSession`s for any epoch that has become provable.
67
+ Reconcile runs on a `SerialQueue` (from `@aztec/foundation/queue`), so two concurrent
68
+ triggers can never interleave on an `await` and race on the `EpochSession` maps.
69
+ - **`ProofPublishingService`** — central owner of L1 proof submission. `EpochSession`s hand
70
+ their top-tree proofs to the service as `PublishCandidate`s; the service serialises
71
+ one publish at a time against a freshly-created `ProverNodePublisher`, gates eligibility
72
+ on the proven block tip, picks the longest candidate per epoch as the winner
73
+ (others resolve `'superseded'`), and enforces a per-candidate `deadline`. It runs its
74
+ own `drain()` on a separate `SerialQueue`: submits, withdrawals, chain-proven advances,
75
+ and per-candidate deadline expiries all enqueue a drain pass, so the eligibility
76
+ re-check and the L1 publish never interleave with each other.
77
+
78
+ ## CheckpointProver lifecycle
79
+
80
+ A `CheckpointProver` is content-addressed by `(checkpoint.number, slot, archiveRoot)`,
81
+ where `archiveRoot` is the checkpoint's own archive root (its post-state). Keying on the
82
+ post-state makes the identity precise: two checkpoints are "the same" iff they produce
83
+ the same archive — so a reorg branch, or a replacement built on the same predecessor but
84
+ with different content, yields a different archive root and a distinct `CheckpointProver`, while an
85
+ identical re-add collapses to the same `CheckpointProver` and reuses its in-flight work.
86
+
87
+ ```mermaid
88
+ stateDiagram-v2
89
+ [*] --> Created
90
+ Created --> Proving: gather + execute
91
+ Proving --> Proven: sub-tree resolves blockProofs
92
+ Proving --> Cancelled: cancel()
93
+ Proven --> Reaped: reapExpired(epoch)
94
+ Cancelled --> [*]
95
+
96
+ state "Pruned (side)" as Pruned
97
+ Proving --> Pruned: markPruned()
98
+ Pruned --> Proving: markCanonical()
99
+ Proven --> Pruned: markPruned()
100
+ Pruned --> Reaped: SlotWatcher (slot < syncedSlot)
101
+ ```
102
+
103
+ The **`Pruned`** state is a side flag, not a place in the main lifecycle: sub-tree
104
+ proving keeps running underneath, so a brief reorg that prunes and immediately
105
+ re-adds the same checkpoint avoids any re-proving. The flag only gates *eligibility*
106
+ to be included in an `EpochSession` — `EpochSession`s ask the store for *canonical* (non-pruned)
107
+ checkpoints when assembling their subsets.
108
+
109
+ ### Reaping rules
110
+
111
+ - **Pruned**: the `SlotWatcher` (a `RunningPromise` polling
112
+ `l2BlockSource.getSyncedL2SlotNumber`) reaps a pruned `CheckpointProver` when the chain's
113
+ synced slot has moved past the `CheckpointProver`'s slot. Once the chain is past that slot,
114
+ a re-add with the same content is impossible.
115
+ - **Canonical**: `CheckpointStore.reapExpired(expiredEpoch)` drops any canonical
116
+ `CheckpointProver` whose epoch is at or below the supplied expired epoch. Once an epoch's
117
+ proof-submission window has closed, its proof can no longer be accepted on L1,
118
+ so the `CheckpointProver` is no longer needed.
119
+ - **Cancelled**: removed immediately by whichever path called `cancel()` (store
120
+ shutdown, prune past-slot, `EpochSession` error).
121
+
122
+ ### Eager tx gathering
123
+
124
+ A `CheckpointProver` starts its tx gather + sub-tree pipeline **in its constructor**.
125
+ The tx provider is injected as a dependency,
126
+ and the `CheckpointProver` pulls its own txs via `txProvider.getTxsForBlock(block)` for each
127
+ block in its checkpoint.
128
+
129
+ This means the moment a checkpoint lands on L1, sub-tree proving is already in flight.
130
+ By the time the epoch closes (and the `EpochSession` is constructed), most or all of the
131
+ block-rollup proofs are already done — the `EpochSession` only has to drive the top tree.
132
+
133
+ ## EpochSession lifecycle
134
+
135
+ An `EpochSession` is identified by a slot-based **spec**:
136
+
137
+ ```ts
138
+ interface SessionSpec {
139
+ kind: 'full' | 'partial';
140
+ epochNumber: EpochNumber;
141
+ fromSlot: SlotNumber;
142
+ toSlot: SlotNumber;
143
+ }
144
+ ```
145
+
146
+ The spec declares *what to prove* (a slot range). The concrete checkpoint set the
147
+ `EpochSession` holds is the *implementation* of the spec — frozen at construction time,
148
+ derived from the canonical content for that slot range.
149
+
150
+ ```mermaid
151
+ stateDiagram-v2
152
+ [*] --> initialized
153
+ initialized --> awaiting_checkpoints: start()
154
+ awaiting_checkpoints --> completed: publish succeeds
155
+ awaiting_checkpoints --> superseded: longer same-epoch candidate wins
156
+ awaiting_checkpoints --> failed: L1 submission errored
157
+ awaiting_checkpoints --> cancelled: cancel()
158
+ initialized --> timed_out: deadline
159
+ awaiting_checkpoints --> timed_out: deadline (EpochSession or candidate)
160
+ completed --> [*]
161
+ superseded --> [*]
162
+ cancelled --> [*]
163
+ timed_out --> [*]
164
+ failed --> [*]
165
+ ```
166
+
167
+ The `awaiting-checkpoints` state (`awaiting_checkpoints` in the diagram above) covers the window between `start()` and the L1
168
+ submission: a `TopTreeJob` is running over the `EpochSession`'s frozen checkpoint set,
169
+ awaiting each checkpoint's sub-tree result (`CheckpointProver.whenBlockProofsReady`)
170
+ and assembling the epoch proof.
171
+
172
+ The `EpochSession` does three sequential things: (1) run a `TopTreeJob` over the frozen
173
+ checkpoint subset, (2) hand the resulting proof to `ProofPublishingService` as a
174
+ `PublishCandidate`, (3) translate the service's outcome into a terminal state.
175
+ Predecessor gating, same-epoch dedup, deadline enforcement, and the L1 tx are all
176
+ the `ProofPublishingService`'s concern; the `EpochSession` is just the producer of one
177
+ candidate and the observer of its outcome.
178
+
179
+ Outcome → state mapping:
180
+
181
+ | `PublishOutcome` | `EpochSession` state |
182
+ |---|---|
183
+ | `published` | `completed` |
184
+ | `superseded` | `superseded` |
185
+ | `failed` | `failed` |
186
+ | `expired` | `timed-out` |
187
+ | `withdrawn` | `cancelled` |
188
+
189
+ There is a single deadline — the proof submission window — that applies across both
190
+ proving and publishing. Before submission, the `EpochSession` arms its own timer against
191
+ it: if proving doesn't finish in time, the `EpochSession` enters `timed-out` via
192
+ `cancel('deadline')`. After submission, the publishing service enforces the same deadline
193
+ on the candidate. It's the same instant throughout; only which component enforces it
194
+ changes once the candidate has been handed off.
195
+
196
+ ### Full vs partial
197
+
198
+ Every `EpochSession` — full or partial — has `fromSlot = firstSlotOfEpoch(N)`. The L1 rollup
199
+ contract requires every proof to extend from the previous epoch's proven tip, so
200
+ there's no value in starting later than the epoch boundary. The two kinds differ
201
+ only in `toSlot` and in how the publishing service treats their candidate:
202
+
203
+ - **Full** `EpochSession`s are opened by reconcile when the epoch is complete on L1 *and*
204
+ every archiver-reported checkpoint is present in the store. Their `toSlot` is
205
+ the epoch's last slot. The publishing service never auto-supersedes a `full`
206
+ candidate on proven-tip subsumption — the L1 contract records an `(epoch, prover-id)`
207
+ submission for every full-epoch proof, so even after another prover-node has
208
+ landed first, this prover's submission is still worthwhile.
209
+ - **Partial** `EpochSession`s are constructed by an explicit `startProof(epochNumber)` API
210
+ call. Their `toSlot` is the last canonical slot present at request time, which may
211
+ be earlier than the epoch's last slot. Partial candidates are an early-finish
212
+ optimisation: if the proven chain has caught up to or past `endBlock` by the time
213
+ the publishing service picks the winner, the partial resolves `'superseded'`
214
+ without spending L1 gas. Dedup: if the partial's spec collapses to the full's spec
215
+ (canonical content already covers the whole epoch), `startProof` awaits the
216
+ existing full `EpochSession` instead of opening a duplicate.
217
+
218
+ ## ProofPublishingService
219
+
220
+ The service is a single per-prover-node owner of L1 submission. `EpochSession`s call
221
+ `submit(candidate)` and await one of five outcomes:
222
+
223
+ | Outcome | Meaning |
224
+ |---|---|
225
+ | `published` | L1 accepted the proof. |
226
+ | `superseded` | A longer same-epoch candidate won, or (for `partial` candidates) the proven tip has caught up to `endBlock`. |
227
+ | `failed` | L1 submission errored. |
228
+ | `expired` | The candidate's `deadline` elapsed before publishing started. |
229
+ | `withdrawn` | An `EpochSession` called `withdraw(uuid)` on a still-queued candidate. |
230
+
231
+ Key invariants:
232
+
233
+ - **One publish at a time** via a `SerialQueue` drain.
234
+ - **Fresh publisher per publish.** Each drain call constructs a new `ProverNodePublisher`
235
+ via the factory. There is no shared in-memory state across publishes.
236
+ - **Once an L1 publish starts, it runs to completion.** `withdraw` is a queue-only
237
+ operation: it removes a candidate that hasn't started publishing. An in-flight
238
+ candidate is left alone and its outcome reports whatever L1 returned. The
239
+ originating `EpochSession` has already moved to a terminal state via `cancel()` and
240
+ ignores the late outcome.
241
+ - **Drain reads the proven block number afresh** from `l2BlockSource` inside the
242
+ serial queue, so the eligibility
243
+ check is consistent with the publish that follows it on the same drain pass.
244
+ - **Per-candidate `deadline`** arms a `setTimeout` (against the injected `DateProvider`).
245
+ When it fires, a still-queued candidate resolves `'expired'`. An in-flight publish
246
+ is left alone (its outcome reports the natural L1 result).
247
+ - **Transient `publisherFactory.create()` failures are retried.** Instead of resolving
248
+ the candidate as `'failed'`, the service schedules another drain after a 1s backoff
249
+ and leaves the candidate in the queue. The candidate's `deadline` caps the total
250
+ retry window — persistent acquire failure resolves as `'expired'`.
251
+
252
+ ### Eligibility
253
+
254
+ A candidate is eligible to publish when its **predecessor block is proven**
255
+ (`startBlock - 1 <= proven`). Among eligible candidates for the same epoch, the
256
+ one with the **highest `endBlock`** wins; the others resolve `'superseded'`.
257
+ Partial candidates whose `endBlock <= proven` are dropped before this check
258
+ (early-finish optimisation no longer helps); full candidates are never
259
+ auto-superseded on the proven tip.
260
+
261
+ ## Event flow
262
+
263
+ ### chain-checkpointed
264
+
265
+ ```mermaid
266
+ sequenceDiagram
267
+ participant L2 as L2BlockStream
268
+ participant PN as ProverNode
269
+ participant CS as CheckpointStore
270
+ participant CP as CheckpointProver
271
+ participant SM as SessionManager
272
+
273
+ L2->>PN: chain-checkpointed{checkpoint}
274
+ PN->>PN: collectRegisterData (prev-header, l1ToL2 messages, sibling path)
275
+ PN->>CS: addOrUpdate(checkpoint, data)
276
+ alt content key new
277
+ CS->>CP: new CheckpointProver(args)
278
+ CP->>CP: eager gather + sub-tree start
279
+ else content key matches
280
+ CS->>CP: markCanonical()
281
+ end
282
+ PN->>SM: onCheckpointAdded(epoch)
283
+ SM->>SM: queue reconcile({kind:'checkpoint', epoch})
284
+ SM->>SM: walk EpochSessions, recreate invalid
285
+ SM->>SM: open full EpochSession if epoch ready
286
+ ```
287
+
288
+ ### chain-pruned
289
+
290
+ ```mermaid
291
+ sequenceDiagram
292
+ participant L2 as L2BlockStream
293
+ participant PN as ProverNode
294
+ participant CS as CheckpointStore
295
+ participant SM as SessionManager
296
+
297
+ L2->>PN: chain-pruned{checkpoint}
298
+ PN->>CS: markPrunedAfter(checkpoint.number)
299
+ CS->>CS: flip every CheckpointProver above threshold to pruned (sub-tree keeps running)
300
+ PN->>SM: onPrune(affectedEpochs)
301
+ SM->>SM: queue reconcile({kind:'prune', affectedEpochs})
302
+ SM->>SM: walk EpochSessions, cancel-and-recreate those with shifted content
303
+ ```
304
+
305
+ ### chain-proven
306
+
307
+ ```mermaid
308
+ sequenceDiagram
309
+ participant L2 as L2BlockStream
310
+ participant PN as ProverNode
311
+ participant PS as ProofPublishingService
312
+
313
+ L2->>PN: chain-proven{block}
314
+ PN->>PS: onChainProven(blockNumber)
315
+ PS->>PS: scheduleDrain (wake-up only, no state cached)
316
+ PS->>PS: drain reads proven afresh, re-checks eligibility
317
+ ```
318
+
319
+ ### Per-event expiry sweep
320
+
321
+ ```mermaid
322
+ sequenceDiagram
323
+ participant L2 as L2BlockStream
324
+ participant PN as ProverNode
325
+ participant CC as ChonkCache
326
+ participant CS as CheckpointStore
327
+
328
+ L2->>PN: any event
329
+ PN->>L2: getSyncedL2SlotNumber()
330
+ PN->>PN: latestEpoch = getEpochAtSlot(latestSlot)
331
+ PN->>PN: newlyExpiredUpTo = latestEpoch - (proofSubmissionEpochs + 1)
332
+ loop for each newly-expired epoch
333
+ PN->>L2: getCheckpointsData({epoch}) + getBlocks(...)
334
+ PN->>CC: releaseForBlocks(blocks)
335
+ PN->>CS: reapExpired(epoch)
336
+ end
337
+ ```
338
+
339
+ Expiry runs at the end of every `handleBlockStreamEvent` call (not on any specific
340
+ event type). An epoch `E` is expired once the chain reaches the start of epoch
341
+ `E + proofSubmissionEpochs + 1` — the deadline beyond which an L1 submission for
342
+ `E` would be rejected. A monotonic high-water mark (`lastExpiredEpoch`) makes the
343
+ sweep cheap: it advances per event and never revisits an epoch. It is seeded at
344
+ `start()` from the last fully-proven epoch (computed in `computeStartupState`),
345
+ so on a restart we never re-sweep epochs that already reached L1.
346
+
347
+ ### Periodic tick
348
+
349
+ `SessionManager.start()` arms a `RunningPromise` that fires
350
+ `reconcile({ kind: 'tick' })` every `tickIntervalMs`. The tick picks up epochs that
351
+ became complete by time alone (no fresh checkpoint event) and advances to the
352
+ next unproven epoch once the previous one lands on L1. A monotonic high-water
353
+ mark (`lastTickEpoch`) prevents the tick from re-opening an epoch whose `EpochSession`
354
+ already terminated; the mark advances only after an `EpochSession` actually exists for
355
+ the epoch, so transient blockers (max-pending-jobs reached, archiver still
356
+ indexing) leave the mark in place and the next tick retries.
357
+
358
+ ## Walkthroughs
359
+
360
+ ### checkpoint-added → prune → checkpoint-added (reorg resilience)
361
+
362
+ State: epoch N has checkpoints c1..c4 all canonical (slots s1..s4). `fullSessions[N]`
363
+ holds `EpochSession` **A** with spec `{kind:'full', N, fromSlot:s1, toSlot:s4}`, referencing
364
+ checkpoints `[c1, c2, c3, c4]`.
365
+
366
+ 1. **chain-pruned arrives, target c3.** Store flips c4 to pruned. Reconcile fires:
367
+ for `EpochSession` A, canonical content for `(s1, s4)` is now `[c1, c2, c3]` (c4 pruned).
368
+ The frozen set `[c1, c2, c3, c4]` no longer matches → `A.cancel('canonical content
369
+ changed')`. Epoch N still complete on L1 → reconcile constructs `EpochSession` **B** with
370
+ the same spec `{full, N, s1, s4}` but checkpoints `[c1, c2, c3]`.
371
+
372
+ 2. **`EpochSession` B starts top-tree proving over [c1, c2, c3].**
373
+
374
+ 3. **chain-checkpointed arrives, target c4_re (same content key as old c4).** The
375
+ store finds the existing `CheckpointProver` at `(c4.number, s4, c4.archive.root)`
376
+ and calls `markCanonical()`. The sub-tree work that never stopped is visible to
377
+ `EpochSession`s again. (A re-add with *different* content would have a different archive
378
+ root and so get a fresh `CheckpointProver` instead.)
379
+
380
+ 4. **Reconcile fires.** `EpochSession` B's canonical content for `(s1, s4)` is now `[c1, c2,
381
+ c3, c4]`, doesn't match its frozen `[c1, c2, c3]` → `B.cancel(...)`. Construct
382
+ `EpochSession` **C** with same spec but checkpoints `[c1, c2, c3, c4]`.
383
+
384
+ 5. **`EpochSession` C reuses the long-lived c1..c4 `CheckpointProver` instances.** Sub-tree
385
+ work may already be complete; only the top-tree is recomputed. The chonk cache
386
+ survived the reorg because no epoch in this range has expired yet.
387
+
388
+
389
+ ### Partial request dedups against a running full `EpochSession`
390
+
391
+ The operator calls `startProof(N)` while the full `EpochSession` for epoch N is running with
392
+ c1..c4. Current canonical slot range is `(s1, s4)`, so the partial's computed spec is
393
+ `{partial, N, s1, s4}` — its `fromSlot`/`toSlot` exactly match the running full `EpochSession`'s. `startProof`
394
+ detects this and awaits the existing full instead of opening a duplicate: no partial
395
+ `EpochSession` is created and no second `TopTreeJob` is built. The caller simply blocks on the
396
+ full session's result and the epoch is proven once.
397
+
398
+ ### True partial proof
399
+
400
+ The operator calls `startProof(N)` when only c1, c2 are canonical (epoch incomplete).
401
+ `fromSlot` is the epoch's first slot; `toSlot` is `s2` (the last canonical slot).
402
+ Partial `EpochSession` created with spec `{partial, N, firstSlotOfEpoch(N), s2}` and
403
+ checkpoints `[c1, c2]`.
404
+
405
+ When c3 later arrives in slot s3, the partial is **not** invalidated — c3's slot is
406
+ outside its range. If c2 is then pruned, the partial **is** invalidated (canonical
407
+ content for the same slot range is now just `[c1]`) and recreated with the same
408
+ spec but checkpoints `[c1]`. If c2 re-adds, the partial is invalidated again and
409
+ recreated with `[c1, c2]`.
410
+
411
+ ## Design rationale
412
+
413
+ ### Why slot-based specs (not checkpoint-based)?
414
+
415
+ A spec like "prove checkpoints 7..10" is invalidated by any reorg that renumbers
416
+ those checkpoints. A spec like "prove slots 350..399" survives renumbering — the
417
+ slot range is determined by epoch math and L1 constants, not by which checkpoints
418
+ happen to be canonical at the moment. Reconciliation preserves the slot range
419
+ across cancel-and-recreate cycles.
420
+
421
+ ### Why does every `EpochSession` start at the epoch's first slot?
422
+
423
+ The L1 rollup contract validates that every submitted proof extends from the previous
424
+ proven tip — the `fromCheckpoint` of any submission must be the checkpoint immediately
425
+ after the current L1 proven head. Starting a partial `EpochSession` at a later slot would
426
+ mean the partial's `fromCheckpoint` lies past the proven tip, which the contract
427
+ rejects. Fixing `fromSlot` to `firstSlotOfEpoch(N)` for both kinds means partials and
428
+ fulls always share the same starting point; they differ only in `toSlot` and in the
429
+ submission decision.
430
+
431
+ ### Why does a publishing service own L1 submission instead of the `EpochSession`?
432
+
433
+ Concentrating L1 submission gives us three properties for free that were awkward
434
+ or impossible when each `EpochSession` called the publisher directly:
435
+
436
+ 1. **Atomic same-epoch dedup.** Multiple candidates for the same epoch (full +
437
+ partial, or partial-then-full as canonical content extends) can be in flight
438
+ at once; the service picks the winner under the serial drain so only one L1
439
+ tx is ever sent, and it is for the longest candidate.
440
+ 2. **One source of truth for the proven tip.** Reading the proven block number
441
+ inside the drain means the eligibility check and the publish that follows are
442
+ guaranteed to use the same value. `EpochSession`s can't race each other on stale
443
+ reads.
444
+ 3. **Per-candidate deadline and retry.** The service owns expiry timers and the
445
+ `publisherFactory.create()` retry loop. `EpochSession`s don't need to know about
446
+ either — they just await the outcome.
447
+
448
+ ### Why is the chonk cache keyed by tx hash and released on finality?
449
+
450
+ Chonk-verifier proofs are tx-scoped: they prove a transaction's chonk circuit is
451
+ valid, independently of which block or epoch the tx lands in. A tx that gets
452
+ reorged out of one block and re-mined into another should not need to be re-proved.
453
+ Keying by tx hash makes the cache survive any reorg up to finality; releasing on
454
+ finality means we don't grow the cache indefinitely while still keeping every
455
+ reorg-relevant proof.
456
+
457
+ ### Why does the slot watcher only reap pruned `CheckpointProver`s?
458
+
459
+ Canonical `CheckpointProver`s can't be reaped on a slot heuristic — they're still part of the
460
+ proven-chain story. Pruned `CheckpointProver`s, on the other hand, are only kept around in
461
+ case the chain re-adds the same content; once the synced slot has moved past, that
462
+ re-add is impossible, and the `CheckpointProver` can go. Finality is the right signal for
463
+ canonical reaping, because finality is the only state that rules out future reorgs.
464
+
465
+ ## Configuration
466
+
467
+ | Env var | Description |
468
+ |---|---|
469
+ | `PROVER_NODE_POLLING_INTERVAL_MS` | Polling interval for the L2BlockStream, the checkpoint-store slot watcher, and the SessionManager periodic tick. Default 1000 ms. |
470
+ | `PROVER_NODE_MAX_PENDING_JOBS` | Cap on the number of non-terminal `EpochSession`s (full + partial). When at the limit, reconcile defers opening new full `EpochSession`s; explicit `startProof` calls throw. |
471
+ | `PROVER_NODE_EPOCH_PROVING_DELAY_MS` | Optional sleep at the start of each `EpochSession`, before the TopTreeJob is constructed. Used in tests to give late events time to land. |
472
+ | `TX_GATHERING_TIMEOUT_MS` | Per-block tx gather deadline used by each `CheckpointProver`. |
473
+ | `PROVER_NODE_FAILED_EPOCH_STORE` | If set, failed `EpochSession`s upload their proving data (every `CheckpointProver`'s txs + register-time data, regardless of sub-tree completion) to this file store. |
474
+ | `PROVER_NODE_DISABLE_PROOF_PUBLISH` | If true, the publishing service runs `analyzeEpochProofSubmission` (estimates L1 fees) instead of actually submitting. |
475
+
476
+ ## Failure handling and observability
477
+
478
+ Loggers:
479
+
480
+ - `prover-node` — `ProverNode` itself (event dispatch, lifecycle).
481
+ - `prover-node:session-manager` — reconcile decisions, `EpochSession` opens / drops, tick.
482
+ - `prover-node:epoch-session` — per-`EpochSession` lifecycle (`Created EpochSession`,
483
+ `Top-tree proof ready`, `Submitted proof for epoch N`, etc.).
484
+ - `prover-node:proof-publishing-service` — candidate submit / withdraw / expire,
485
+ drain, publish attempts, transient acquire retries.
486
+ - `prover-node:l1-tx-publisher` — the per-publish `ProverNodePublisher`'s L1 work.
487
+ - `prover-node:checkpoint-store` — content-key collisions, reap decisions.
488
+ - `prover-node:checkpoint-prover` — sub-tree pipeline (gather, block processing).
489
+ - `prover-client:chonk-cache` — chonk-verifier cache enqueue / release events.
490
+
491
+ On `failed` exit, `SessionManager.runSession` invokes the `onSessionFailed` callback
492
+ the manager was constructed with. `ProverNode` wires this to `tryUploadSessionFailure`,
493
+ which calls `SessionManager.buildSessionProvingData(session)` to walk every `CheckpointProver`
494
+ referenced by the `EpochSession` and assemble an `EpochProvingJobData` snapshot — including
495
+ every `CheckpointProver`'s txs and register-time data even if its sub-tree never reached
496
+ `isCompleted()`. This snapshot is what `uploadEpochProofFailure` ships to the
497
+ configured file store along with a world-state + archiver backup, so the failure
498
+ can be reproduced offline via `rerunEpochProvingJob`.
499
+
500
+ Metrics emitted by `EpochSession`s:
501
+
502
+ - `aztec.prover_node.execution_duration` — wall-clock time from `EpochSession` start to its terminal state.
503
+ - `aztec.prover_node.job_duration` — same, in seconds.
504
+ - `aztec.prover_node.job_checkpoints` / `_blocks` / `_transactions` — sizes of the
505
+ proven range.
506
+ - `aztec.prover_node.block_processing_duration` /
507
+ `aztec.prover_node.checkpoint_processing_duration` — sub-tree breakdown.
@@ -6,8 +6,8 @@ import type { DataStoreConfig } from '@aztec/stdlib/kv-store';
6
6
  import type { GenesisData } from '@aztec/stdlib/world-state';
7
7
  /**
8
8
  * Given a local folder where `downloadEpochProvingJob` was called, creates a new archiver and world state
9
- * using the state snapshots, and creates a new epoch proving job to prove the downloaded proving job.
9
+ * using the state snapshots, and creates a new epoch proving session to prove the downloaded proving job.
10
10
  * Proving is done with a local proving broker and agents as specified by the config.
11
11
  */
12
- export declare function rerunEpochProvingJob(localPath: string, log: Logger, config: DataStoreConfig & ProverBrokerConfig & ProverClientConfig & Pick<L1ContractsConfig, 'aztecEpochDuration'>, genesis?: GenesisData): Promise<"awaiting-prover" | "completed" | "failed" | "initialized" | "processing" | "publishing-proof" | "reorg" | "stopped" | "timed-out">;
13
- //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicmVydW4tZXBvY2gtcHJvdmluZy1qb2IuZC50cyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9hY3Rpb25zL3JlcnVuLWVwb2NoLXByb3Zpbmctam9iLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sS0FBSyxFQUFFLGlCQUFpQixFQUFFLE1BQU0sd0JBQXdCLENBQUM7QUFDaEUsT0FBTyxLQUFLLEVBQUUsTUFBTSxFQUFFLE1BQU0sdUJBQXVCLENBQUM7QUFDcEQsT0FBTyxFQUFFLEtBQUssa0JBQWtCLEVBQXNCLE1BQU0sc0JBQXNCLENBQUM7QUFDbkYsT0FBTyxFQUFFLGtCQUFrQixFQUErQixNQUFNLDZCQUE2QixDQUFDO0FBRTlGLE9BQU8sS0FBSyxFQUFFLGVBQWUsRUFBRSxNQUFNLHdCQUF3QixDQUFDO0FBQzlELE9BQU8sS0FBSyxFQUFFLFdBQVcsRUFBRSxNQUFNLDJCQUEyQixDQUFDO0FBVTdEOzs7O0dBSUc7QUFDSCx3QkFBc0Isb0JBQW9CLENBQ3hDLFNBQVMsRUFBRSxNQUFNLEVBQ2pCLEdBQUcsRUFBRSxNQUFNLEVBQ1gsTUFBTSxFQUFFLGVBQWUsR0FBRyxrQkFBa0IsR0FBRyxrQkFBa0IsR0FBRyxJQUFJLENBQUMsaUJBQWlCLEVBQUUsb0JBQW9CLENBQUMsRUFDakgsT0FBTyxDQUFDLEVBQUUsV0FBVywrSUErQ3RCIn0=
12
+ export declare function rerunEpochProvingJob(localPath: string, log: Logger, config: DataStoreConfig & ProverBrokerConfig & ProverClientConfig & Pick<L1ContractsConfig, 'aztecEpochDuration'>, genesis?: GenesisData): Promise<"awaiting-checkpoints" | "awaiting-predecessor" | "cancelled" | "completed" | "failed" | "initialized" | "publishing-proof" | "stopped" | "superseded" | "timed-out">;
13
+ //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicmVydW4tZXBvY2gtcHJvdmluZy1qb2IuZC50cyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9hY3Rpb25zL3JlcnVuLWVwb2NoLXByb3Zpbmctam9iLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sS0FBSyxFQUFFLGlCQUFpQixFQUFFLE1BQU0sd0JBQXdCLENBQUM7QUFFaEUsT0FBTyxLQUFLLEVBQUUsTUFBTSxFQUFFLE1BQU0sdUJBQXVCLENBQUM7QUFFcEQsT0FBTyxFQUFFLEtBQUssa0JBQWtCLEVBQXNCLE1BQU0sc0JBQXNCLENBQUM7QUFDbkYsT0FBTyxFQUFFLGtCQUFrQixFQUErQixNQUFNLDZCQUE2QixDQUFDO0FBTzlGLE9BQU8sS0FBSyxFQUFFLGVBQWUsRUFBRSxNQUFNLHdCQUF3QixDQUFDO0FBRzlELE9BQU8sS0FBSyxFQUFFLFdBQVcsRUFBRSxNQUFNLDJCQUEyQixDQUFDO0FBVzdEOzs7O0dBSUc7QUFDSCx3QkFBc0Isb0JBQW9CLENBQ3hDLFNBQVMsRUFBRSxNQUFNLEVBQ2pCLEdBQUcsRUFBRSxNQUFNLEVBQ1gsTUFBTSxFQUFFLGVBQWUsR0FBRyxrQkFBa0IsR0FBRyxrQkFBa0IsR0FBRyxJQUFJLENBQUMsaUJBQWlCLEVBQUUsb0JBQW9CLENBQUMsRUFDakgsT0FBTyxDQUFDLEVBQUUsV0FBVyxpTEF5RnRCIn0=
@@ -1 +1 @@
1
- {"version":3,"file":"rerun-epoch-proving-job.d.ts","sourceRoot":"","sources":["../../src/actions/rerun-epoch-proving-job.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAChE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AACpD,OAAO,EAAE,KAAK,kBAAkB,EAAsB,MAAM,sBAAsB,CAAC;AACnF,OAAO,EAAE,kBAAkB,EAA+B,MAAM,6BAA6B,CAAC;AAE9F,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAU7D;;;;GAIG;AACH,wBAAsB,oBAAoB,CACxC,SAAS,EAAE,MAAM,EACjB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,eAAe,GAAG,kBAAkB,GAAG,kBAAkB,GAAG,IAAI,CAAC,iBAAiB,EAAE,oBAAoB,CAAC,EACjH,OAAO,CAAC,EAAE,WAAW,+IA+CtB"}
1
+ {"version":3,"file":"rerun-epoch-proving-job.d.ts","sourceRoot":"","sources":["../../src/actions/rerun-epoch-proving-job.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAEhE,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAEpD,OAAO,EAAE,KAAK,kBAAkB,EAAsB,MAAM,sBAAsB,CAAC;AACnF,OAAO,EAAE,kBAAkB,EAA+B,MAAM,6BAA6B,CAAC;AAO9F,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAG9D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAW7D;;;;GAIG;AACH,wBAAsB,oBAAoB,CACxC,SAAS,EAAE,MAAM,EACjB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,eAAe,GAAG,kBAAkB,GAAG,kBAAkB,GAAG,IAAI,CAAC,iBAAiB,EAAE,oBAAoB,CAAC,EACjH,OAAO,CAAC,EAAE,WAAW,iLAyFtB"}
@@ -1,16 +1,23 @@
1
1
  import { createArchiverStore, createContractDataSource } from '@aztec/archiver';
2
+ import { BlockNumber } from '@aztec/foundation/branded-types';
3
+ import { DateProvider } from '@aztec/foundation/timer';
2
4
  import { createProverClient } from '@aztec/prover-client';
3
5
  import { createAndStartProvingBroker } from '@aztec/prover-client/broker';
6
+ import { getLastSiblingPath } from '@aztec/prover-client/helpers';
7
+ import { ChonkCache } from '@aztec/prover-client/orchestrator';
4
8
  import { PublicProcessorFactory } from '@aztec/simulator/server';
9
+ import { getEpochAtSlot, getSlotRangeForEpoch } from '@aztec/stdlib/epoch-helpers';
10
+ import { MerkleTreeId } from '@aztec/stdlib/trees';
5
11
  import { getTelemetryClient } from '@aztec/telemetry-client';
6
12
  import { createWorldState } from '@aztec/world-state';
7
13
  import { readFileSync } from 'fs';
14
+ import { CheckpointProver } from '../job/checkpoint-prover.js';
8
15
  import { deserializeEpochProvingJobData } from '../job/epoch-proving-job-data.js';
9
- import { EpochProvingJob } from '../job/epoch-proving-job.js';
16
+ import { EpochSession } from '../job/epoch-session.js';
10
17
  import { ProverNodeJobMetrics } from '../metrics.js';
11
18
  /**
12
19
  * Given a local folder where `downloadEpochProvingJob` was called, creates a new archiver and world state
13
- * using the state snapshots, and creates a new epoch proving job to prove the downloaded proving job.
20
+ * using the state snapshots, and creates a new epoch proving session to prove the downloaded proving job.
14
21
  * Proving is done with a local proving broker and agents as specified by the config.
15
22
  */ export async function rerunEpochProvingJob(localPath, log, config, genesis) {
16
23
  const jobData = deserializeEpochProvingJobData(readFileSync(localPath));
@@ -21,22 +28,95 @@ import { ProverNodeJobMetrics } from '../metrics.js';
21
28
  const initialBlockHash = await worldState.getInitialHeader().hash();
22
29
  const archiver = await createArchiverStore(config, initialBlockHash);
23
30
  const publicProcessorFactory = new PublicProcessorFactory(createContractDataSource(archiver), undefined, undefined, log.getBindings());
24
- const publisher = {
25
- submitEpochProof: ()=>Promise.resolve(true),
26
- analyzeEpochProofSubmission: ()=>Promise.resolve()
31
+ // Local rerun never publishes — stub the service so submit() always resolves 'published'
32
+ // and withdraw is a no-op.
33
+ const publishingService = {
34
+ submit: ()=>Promise.resolve('published'),
35
+ withdraw: ()=>{}
27
36
  };
28
- const l2BlockSourceForReorgDetection = undefined;
29
- const deadline = undefined;
30
- // This starts a local proving broker that does not get exposed as a service. This should be good enough for
31
- // smallish epochs to be proven if we run on a large machine, but as epochs grow larger, we may want to switch
32
- // this out for a live proving broker with multiple agents that we can connect to.
33
37
  const broker = await createAndStartProvingBroker(config, telemetry);
34
38
  const prover = await createProverClient(config, worldState, broker, telemetry);
35
- const provingJob = new EpochProvingJob(jobData, worldState, prover.createEpochProver(), publicProcessorFactory, publisher, l2BlockSourceForReorgDetection, metrics, deadline, {
36
- skipEpochCheck: true
37
- }, log.getBindings());
38
- log.info(`Rerunning epoch proving job for epoch ${jobData.epochNumber}`);
39
- await provingJob.run();
40
- log.info(`Completed job for epoch ${jobData.epochNumber} with status ${provingJob.getState()}`);
41
- return provingJob.getState();
39
+ const chonkCache = new ChonkCache(log.getBindings());
40
+ const txProvider = makeReplayingTxProvider(jobData.txs);
41
+ log.info(`Rerunning epoch proving for epoch ${jobData.epochNumber}`);
42
+ const provers = [];
43
+ for(let i = 0; i < jobData.checkpoints.length; i++){
44
+ const checkpoint = jobData.checkpoints[i];
45
+ const previousBlockHeader = i === 0 ? jobData.previousBlockHeader : jobData.checkpoints[i - 1].blocks.at(-1).header;
46
+ const l1ToL2Messages = jobData.l1ToL2Messages[checkpoint.number] ?? [];
47
+ const previousArchiveSiblingPath = await getLastSiblingPath(MerkleTreeId.ARCHIVE, worldState.getSnapshot(BlockNumber(checkpoint.blocks[0].number - 1)));
48
+ const attestations = checkpoint.number === jobData.checkpoints.at(-1).number ? jobData.attestations : [];
49
+ provers.push(new CheckpointProver({
50
+ checkpoint,
51
+ epochNumber: jobData.epochNumber,
52
+ attestations,
53
+ previousBlockHeader,
54
+ l1ToL2Messages,
55
+ previousArchiveSiblingPath
56
+ }, {
57
+ proverFactory: prover,
58
+ chonkCache,
59
+ publicProcessorFactory,
60
+ dbProvider: worldState,
61
+ txProvider,
62
+ dateProvider: new DateProvider(),
63
+ proverId: prover.getProverId(),
64
+ metrics,
65
+ txGatheringTimeoutMs: 120_000,
66
+ deadline: undefined,
67
+ log
68
+ }));
69
+ }
70
+ const l1Constants = {
71
+ epochDuration: config.aztecEpochDuration
72
+ };
73
+ const [fromSlot, toSlot] = getSlotRangeForEpoch(jobData.epochNumber, l1Constants);
74
+ const spec = {
75
+ kind: 'full',
76
+ epochNumber: jobData.epochNumber,
77
+ fromSlot,
78
+ toSlot
79
+ };
80
+ const session = new EpochSession(spec, provers, {
81
+ proverFactory: prover,
82
+ proverId: prover.getProverId(),
83
+ publishingService,
84
+ metrics,
85
+ dateProvider: new DateProvider(),
86
+ deadline: undefined,
87
+ config: {},
88
+ bindings: log.getBindings()
89
+ });
90
+ const finalState = await session.start();
91
+ log.info(`Completed proving for epoch ${jobData.epochNumber} with status ${finalState}`, {
92
+ derivedEpoch: getEpochAtSlot(provers[0].slotNumber, l1Constants)
93
+ });
94
+ return finalState;
95
+ }
96
+ /** Build a synthetic ITxProvider that returns the supplied txs map by lookup. */ function makeReplayingTxProvider(txs) {
97
+ const lookup = (hashes)=>{
98
+ const found = [];
99
+ const missing = [];
100
+ for (const hash of hashes){
101
+ const tx = txs.get(hash.toString());
102
+ if (tx) {
103
+ found.push(tx);
104
+ } else {
105
+ missing.push(hash);
106
+ }
107
+ }
108
+ return {
109
+ txs: found,
110
+ missingTxs: missing
111
+ };
112
+ };
113
+ return {
114
+ getAvailableTxs: (hashes)=>Promise.resolve(lookup(hashes)),
115
+ hasTxs: (hashes)=>Promise.resolve(hashes.map((h)=>txs.has(h.toString()))),
116
+ getTxsForBlockProposal: ()=>Promise.resolve({
117
+ txs: [],
118
+ missingTxs: []
119
+ }),
120
+ getTxsForBlock: (block)=>Promise.resolve(lookup(block.body.txEffects.map((e)=>e.txHash)))
121
+ };
42
122
  }