@aztec/prover-client 0.66.0 → 0.67.1-devnet

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/dest/block_builder/light.d.ts +4 -3
  2. package/dest/block_builder/light.d.ts.map +1 -1
  3. package/dest/block_builder/light.js +30 -20
  4. package/dest/index.d.ts +0 -1
  5. package/dest/index.d.ts.map +1 -1
  6. package/dest/index.js +1 -2
  7. package/dest/mocks/fixtures.d.ts +3 -3
  8. package/dest/mocks/fixtures.d.ts.map +1 -1
  9. package/dest/mocks/fixtures.js +2 -2
  10. package/dest/mocks/test_context.d.ts +10 -9
  11. package/dest/mocks/test_context.d.ts.map +1 -1
  12. package/dest/mocks/test_context.js +24 -13
  13. package/dest/orchestrator/block-building-helpers.d.ts +10 -6
  14. package/dest/orchestrator/block-building-helpers.d.ts.map +1 -1
  15. package/dest/orchestrator/block-building-helpers.js +27 -16
  16. package/dest/orchestrator/block-proving-state.d.ts +6 -5
  17. package/dest/orchestrator/block-proving-state.d.ts.map +1 -1
  18. package/dest/orchestrator/block-proving-state.js +16 -8
  19. package/dest/orchestrator/epoch-proving-state.d.ts +1 -1
  20. package/dest/orchestrator/epoch-proving-state.d.ts.map +1 -1
  21. package/dest/orchestrator/epoch-proving-state.js +3 -3
  22. package/dest/orchestrator/orchestrator.d.ts +11 -8
  23. package/dest/orchestrator/orchestrator.d.ts.map +1 -1
  24. package/dest/orchestrator/orchestrator.js +94 -58
  25. package/dest/orchestrator/orchestrator_metrics.d.ts.map +1 -1
  26. package/dest/orchestrator/orchestrator_metrics.js +2 -5
  27. package/dest/prover-agent/memory-proving-queue.d.ts +2 -1
  28. package/dest/prover-agent/memory-proving-queue.d.ts.map +1 -1
  29. package/dest/prover-agent/memory-proving-queue.js +241 -224
  30. package/dest/prover-agent/prover-agent.d.ts +11 -2
  31. package/dest/prover-agent/prover-agent.d.ts.map +1 -1
  32. package/dest/prover-agent/prover-agent.js +187 -160
  33. package/dest/prover-client/prover-client.d.ts +2 -3
  34. package/dest/prover-client/prover-client.d.ts.map +1 -1
  35. package/dest/prover-client/prover-client.js +6 -9
  36. package/dest/proving_broker/broker_prover_facade.d.ts +26 -0
  37. package/dest/proving_broker/broker_prover_facade.d.ts.map +1 -0
  38. package/dest/proving_broker/broker_prover_facade.js +107 -0
  39. package/dest/proving_broker/proving_agent.d.ts +4 -3
  40. package/dest/proving_broker/proving_agent.d.ts.map +1 -1
  41. package/dest/proving_broker/proving_agent.js +74 -65
  42. package/dest/proving_broker/proving_broker.d.ts +27 -7
  43. package/dest/proving_broker/proving_broker.d.ts.map +1 -1
  44. package/dest/proving_broker/proving_broker.js +405 -258
  45. package/dest/proving_broker/proving_broker_database/persisted.d.ts.map +1 -1
  46. package/dest/proving_broker/proving_broker_database/persisted.js +4 -8
  47. package/dest/proving_broker/proving_broker_instrumentation.d.ts.map +1 -1
  48. package/dest/proving_broker/proving_broker_instrumentation.js +2 -8
  49. package/dest/proving_broker/proving_job_controller.d.ts +2 -1
  50. package/dest/proving_broker/proving_job_controller.d.ts.map +1 -1
  51. package/dest/proving_broker/proving_job_controller.js +15 -14
  52. package/dest/proving_broker/rpc.js +2 -2
  53. package/dest/test/mock_prover.d.ts +6 -6
  54. package/dest/test/mock_prover.d.ts.map +1 -1
  55. package/dest/test/mock_prover.js +5 -5
  56. package/package.json +18 -13
  57. package/src/block_builder/light.ts +31 -22
  58. package/src/index.ts +0 -1
  59. package/src/mocks/fixtures.ts +4 -4
  60. package/src/mocks/test_context.ts +39 -24
  61. package/src/orchestrator/block-building-helpers.ts +33 -20
  62. package/src/orchestrator/block-proving-state.ts +17 -6
  63. package/src/orchestrator/epoch-proving-state.ts +0 -2
  64. package/src/orchestrator/orchestrator.ts +111 -62
  65. package/src/orchestrator/orchestrator_metrics.ts +1 -11
  66. package/src/prover-agent/memory-proving-queue.ts +12 -7
  67. package/src/prover-agent/prover-agent.ts +67 -48
  68. package/src/prover-client/prover-client.ts +5 -12
  69. package/src/proving_broker/{caching_broker_facade.ts → broker_prover_facade.ts} +62 -85
  70. package/src/proving_broker/proving_agent.ts +74 -78
  71. package/src/proving_broker/proving_broker.ts +240 -73
  72. package/src/proving_broker/proving_broker_database/persisted.ts +2 -8
  73. package/src/proving_broker/proving_broker_instrumentation.ts +0 -7
  74. package/src/proving_broker/proving_job_controller.ts +13 -12
  75. package/src/proving_broker/rpc.ts +1 -1
  76. package/src/test/mock_prover.ts +7 -3
  77. package/dest/proving_broker/caching_broker_facade.d.ts +0 -30
  78. package/dest/proving_broker/caching_broker_facade.d.ts.map +0 -1
  79. package/dest/proving_broker/caching_broker_facade.js +0 -150
  80. package/dest/proving_broker/prover_cache/memory.d.ts +0 -9
  81. package/dest/proving_broker/prover_cache/memory.d.ts.map +0 -1
  82. package/dest/proving_broker/prover_cache/memory.js +0 -16
  83. package/src/proving_broker/prover_cache/memory.ts +0 -20
@@ -9,11 +9,12 @@ import {
9
9
  type ProvingJobStatus,
10
10
  ProvingRequestType,
11
11
  } from '@aztec/circuit-types';
12
- import { createDebugLogger } from '@aztec/foundation/log';
12
+ import { asyncPool } from '@aztec/foundation/async-pool';
13
+ import { createLogger } from '@aztec/foundation/log';
13
14
  import { type PromiseWithResolvers, RunningPromise, promiseWithResolvers } from '@aztec/foundation/promise';
14
15
  import { PriorityMemoryQueue } from '@aztec/foundation/queue';
15
16
  import { Timer } from '@aztec/foundation/timer';
16
- import { type TelemetryClient } from '@aztec/telemetry-client';
17
+ import { type TelemetryClient, type Traceable, type Tracer, trackSpan } from '@aztec/telemetry-client';
17
18
 
18
19
  import assert from 'assert';
19
20
 
@@ -30,29 +31,33 @@ type ProofRequestBrokerConfig = {
30
31
  timeoutIntervalMs?: number;
31
32
  jobTimeoutMs?: number;
32
33
  maxRetries?: number;
34
+ maxEpochsToKeepResultsFor?: number;
35
+ maxParallelCleanUps?: number;
33
36
  };
34
37
 
38
+ type EnqueuedProvingJob = Pick<ProvingJob, 'id' | 'epochNumber'>;
39
+
35
40
  /**
36
41
  * A broker that manages proof requests and distributes them to workers based on their priority.
37
42
  * It takes a backend that is responsible for storing and retrieving proof requests and results.
38
43
  */
39
- export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
44
+ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer, Traceable {
40
45
  private queues: ProvingQueues = {
41
- [ProvingRequestType.PUBLIC_VM]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
42
- [ProvingRequestType.TUBE_PROOF]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
43
- [ProvingRequestType.PRIVATE_KERNEL_EMPTY]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
46
+ [ProvingRequestType.PUBLIC_VM]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
47
+ [ProvingRequestType.TUBE_PROOF]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
48
+ [ProvingRequestType.PRIVATE_KERNEL_EMPTY]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
44
49
 
45
- [ProvingRequestType.PRIVATE_BASE_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
46
- [ProvingRequestType.PUBLIC_BASE_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
47
- [ProvingRequestType.MERGE_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
48
- [ProvingRequestType.ROOT_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
50
+ [ProvingRequestType.PRIVATE_BASE_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
51
+ [ProvingRequestType.PUBLIC_BASE_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
52
+ [ProvingRequestType.MERGE_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
53
+ [ProvingRequestType.ROOT_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
49
54
 
50
- [ProvingRequestType.BLOCK_MERGE_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
51
- [ProvingRequestType.BLOCK_ROOT_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
52
- [ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
55
+ [ProvingRequestType.BLOCK_MERGE_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
56
+ [ProvingRequestType.BLOCK_ROOT_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
57
+ [ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
53
58
 
54
- [ProvingRequestType.BASE_PARITY]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
55
- [ProvingRequestType.ROOT_PARITY]: new PriorityMemoryQueue<ProvingJob>(provingJobComparator),
59
+ [ProvingRequestType.BASE_PARITY]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
60
+ [ProvingRequestType.ROOT_PARITY]: new PriorityMemoryQueue<EnqueuedProvingJob>(provingJobComparator),
56
61
  };
57
62
 
58
63
  // holds a copy of the database in memory in order to quickly fulfill requests
@@ -76,23 +81,48 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
76
81
  // a map of promises that will be resolved when a job is settled
77
82
  private promises = new Map<ProvingJobId, PromiseWithResolvers<ProvingJobSettledResult>>();
78
83
 
79
- private timeoutPromise: RunningPromise;
80
- private timeSource = () => Math.floor(Date.now() / 1000);
84
+ private cleanupPromise: RunningPromise;
85
+ private msTimeSource = () => Date.now();
81
86
  private jobTimeoutMs: number;
82
87
  private maxRetries: number;
83
88
 
84
89
  private instrumentation: ProvingBrokerInstrumentation;
90
+ public readonly tracer: Tracer;
91
+
92
+ private maxParallelCleanUps: number;
93
+
94
+ /**
95
+ * The broker keeps track of the highest epoch it has seen.
96
+ * This information is used for garbage collection: once it reaches the next epoch, it can start pruning the database of old state.
97
+ * This clean up pass is only done against _settled_ jobs. This pass will not cancel jobs that are in-progress or in-queue.
98
+ * It is the client's responsibility to cancel jobs if they are no longer necessary.
99
+ * Example:
100
+ * proving epoch 11 - the broker will wipe all settled jobs for epochs 9 and lower
101
+ * finished proving epoch 11 and got first job for epoch 12 -> the broker will wipe all settled jobs for epochs 10 and lower
102
+ * reorged back to end of epoch 10 -> epoch 11 is skipped and epoch 12 starts -> the broker will wipe all settled jobs for epochs 10 and lower
103
+ */
104
+ private epochHeight = 0;
105
+ private maxEpochsToKeepResultsFor = 1;
85
106
 
86
107
  public constructor(
87
108
  private database: ProvingBrokerDatabase,
88
109
  client: TelemetryClient,
89
- { jobTimeoutMs = 30_000, timeoutIntervalMs = 10_000, maxRetries = 3 }: ProofRequestBrokerConfig = {},
90
- private logger = createDebugLogger('aztec:prover-client:proving-broker'),
110
+ {
111
+ jobTimeoutMs = 30_000,
112
+ timeoutIntervalMs = 10_000,
113
+ maxRetries = 3,
114
+ maxEpochsToKeepResultsFor = 1,
115
+ maxParallelCleanUps = 20,
116
+ }: ProofRequestBrokerConfig = {},
117
+ private logger = createLogger('prover-client:proving-broker'),
91
118
  ) {
119
+ this.tracer = client.getTracer('ProvingBroker');
92
120
  this.instrumentation = new ProvingBrokerInstrumentation(client);
93
- this.timeoutPromise = new RunningPromise(this.timeoutCheck, timeoutIntervalMs);
121
+ this.cleanupPromise = new RunningPromise(this.cleanupPass.bind(this), this.logger, timeoutIntervalMs);
94
122
  this.jobTimeoutMs = jobTimeoutMs;
95
123
  this.maxRetries = maxRetries;
124
+ this.maxEpochsToKeepResultsFor = maxEpochsToKeepResultsFor;
125
+ this.maxParallelCleanUps = maxParallelCleanUps;
96
126
  }
97
127
 
98
128
  private measureQueueDepth: MonitorCallback = (type: ProvingRequestType) => {
@@ -113,7 +143,10 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
113
143
 
114
144
  public start(): Promise<void> {
115
145
  for (const [item, result] of this.database.allProvingJobs()) {
116
- this.logger.info(`Restoring proving job id=${item.id} settled=${!!result}`);
146
+ this.logger.info(`Restoring proving job id=${item.id} settled=${!!result}`, {
147
+ provingJobId: item.id,
148
+ status: result ? result.status : 'pending',
149
+ });
117
150
 
118
151
  this.jobsCache.set(item.id, item);
119
152
  this.promises.set(item.id, promiseWithResolvers());
@@ -122,12 +155,11 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
122
155
  this.promises.get(item.id)!.resolve(result);
123
156
  this.resultsCache.set(item.id, result);
124
157
  } else {
125
- this.logger.debug(`Re-enqueuing proving job id=${item.id}`);
126
158
  this.enqueueJobInternal(item);
127
159
  }
128
160
  }
129
161
 
130
- this.timeoutPromise.start();
162
+ this.cleanupPromise.start();
131
163
 
132
164
  this.instrumentation.monitorQueueDepth(this.measureQueueDepth);
133
165
  this.instrumentation.monitorActiveJobs(this.countActiveJobs);
@@ -135,39 +167,75 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
135
167
  return Promise.resolve();
136
168
  }
137
169
 
138
- public stop(): Promise<void> {
139
- return this.timeoutPromise.stop();
170
+ public async stop(): Promise<void> {
171
+ await this.cleanupPromise.stop();
140
172
  }
141
173
 
142
174
  public async enqueueProvingJob(job: ProvingJob): Promise<void> {
143
175
  if (this.jobsCache.has(job.id)) {
144
176
  const existing = this.jobsCache.get(job.id);
145
177
  assert.deepStrictEqual(job, existing, 'Duplicate proving job ID');
178
+ this.logger.debug(`Duplicate proving job id=${job.id} epochNumber=${job.epochNumber}. Ignoring`, {
179
+ provingJobId: job.id,
180
+ });
146
181
  return;
147
182
  }
148
183
 
149
- await this.database.addProvingJob(job);
150
- this.jobsCache.set(job.id, job);
151
- this.enqueueJobInternal(job);
184
+ if (this.isJobStale(job)) {
185
+ this.logger.warn(`Tried enqueueing stale proving job id=${job.id} epochNumber=${job.epochNumber}`, {
186
+ provingJobId: job.id,
187
+ });
188
+ throw new Error(`Epoch too old: job epoch ${job.epochNumber}, current epoch: ${this.epochHeight}`);
189
+ }
190
+
191
+ this.logger.info(`New proving job id=${job.id} epochNumber=${job.epochNumber}`, { provingJobId: job.id });
192
+ try {
193
+ // do this first so it acts as a "lock". If this job is enqueued again while we're saving it the if at the top will catch it.
194
+ this.jobsCache.set(job.id, job);
195
+ await this.database.addProvingJob(job);
196
+ this.enqueueJobInternal(job);
197
+ } catch (err) {
198
+ this.logger.error(`Failed to save proving job id=${job.id}: ${err}`, err, { provingJobId: job.id });
199
+ this.jobsCache.delete(job.id);
200
+ throw err;
201
+ }
152
202
  }
153
203
 
154
204
  public waitForJobToSettle(id: ProvingJobId): Promise<ProvingJobSettledResult> {
155
205
  const promiseWithResolvers = this.promises.get(id);
156
206
  if (!promiseWithResolvers) {
207
+ this.logger.warn(`Job id=${id} not found`, { provingJobId: id });
157
208
  return Promise.resolve({ status: 'rejected', reason: `Job ${id} not found` });
158
209
  }
159
210
  return promiseWithResolvers.promise;
160
211
  }
161
212
 
162
- public async removeAndCancelProvingJob(id: ProvingJobId): Promise<void> {
163
- this.logger.info(`Cancelling job id=${id}`);
164
- await this.database.deleteProvingJobAndResult(id);
213
+ public async cancelProvingJob(id: ProvingJobId): Promise<void> {
214
+ if (!this.jobsCache.has(id)) {
215
+ this.logger.warn(`Can't cancel a job that doesn't exist id=${id}`, { provingJobId: id });
216
+ return;
217
+ }
165
218
 
166
219
  // notify listeners of the cancellation
167
220
  if (!this.resultsCache.has(id)) {
168
- this.promises.get(id)?.resolve({ status: 'rejected', reason: 'Aborted' });
221
+ this.logger.info(`Cancelling job id=${id}`, { provingJobId: id });
222
+ await this.reportProvingJobError(id, 'Aborted', false);
223
+ }
224
+ }
225
+
226
+ private async cleanUpProvingJobState(id: ProvingJobId): Promise<void> {
227
+ if (!this.jobsCache.has(id)) {
228
+ this.logger.warn(`Can't clean up a job that doesn't exist id=${id}`, { provingJobId: id });
229
+ return;
169
230
  }
170
231
 
232
+ if (!this.resultsCache.has(id)) {
233
+ this.logger.warn(`Can't cleanup busy proving job: id=${id}`, { provingJobId: id });
234
+ return;
235
+ }
236
+
237
+ this.logger.debug(`Cleaning up state for job id=${id}`, { provingJobId: id });
238
+ await this.database.deleteProvingJobAndResult(id);
171
239
  this.jobsCache.delete(id);
172
240
  this.promises.delete(id);
173
241
  this.resultsCache.delete(id);
@@ -184,7 +252,7 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
184
252
  const item = this.jobsCache.get(id);
185
253
 
186
254
  if (!item) {
187
- this.logger.warn(`Proving job id=${id} not found`);
255
+ this.logger.warn(`Proving job id=${id} not found`, { provingJobId: id });
188
256
  return Promise.resolve({ status: 'not-found' });
189
257
  }
190
258
 
@@ -204,14 +272,15 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
204
272
 
205
273
  for (const proofType of allowedProofs) {
206
274
  const queue = this.queues[proofType];
207
- let job: ProvingJob | undefined;
275
+ let enqueuedJob: EnqueuedProvingJob | undefined;
208
276
  // exhaust the queue and make sure we're not sending a job that's already in progress
209
277
  // or has already been completed
210
278
  // this can happen if the broker crashes and restarts
211
279
  // it's possible agents will report progress or results for jobs that are in the queue (after the restart)
212
- while ((job = queue.getImmediate())) {
213
- if (!this.inProgress.has(job.id) && !this.resultsCache.has(job.id)) {
214
- const time = this.timeSource();
280
+ while ((enqueuedJob = queue.getImmediate())) {
281
+ const job = this.jobsCache.get(enqueuedJob.id);
282
+ if (job && !this.inProgress.has(enqueuedJob.id) && !this.resultsCache.has(enqueuedJob.id)) {
283
+ const time = this.msTimeSource();
215
284
  this.inProgress.set(job.id, {
216
285
  id: job.id,
217
286
  startedAt: time,
@@ -236,37 +305,67 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
236
305
  const retries = this.retries.get(id) ?? 0;
237
306
 
238
307
  if (!item) {
239
- this.logger.warn(`Proving job id=${id} not found`);
308
+ this.logger.warn(`Can't set error on unknown proving job id=${id} err=${err}`, { provingJoId: id });
240
309
  return;
241
310
  }
242
311
 
243
312
  if (!info) {
244
- this.logger.warn(`Proving job id=${id} type=${ProvingRequestType[item.type]} not in the in-progress set`);
313
+ this.logger.warn(`Proving job id=${id} type=${ProvingRequestType[item.type]} not in the in-progress set`, {
314
+ provingJobId: id,
315
+ });
245
316
  } else {
246
317
  this.inProgress.delete(id);
247
318
  }
248
319
 
249
- if (retry && retries + 1 < this.maxRetries) {
250
- this.logger.info(`Retrying proving job id=${id} type=${ProvingRequestType[item.type]} retry=${retries + 1}`);
320
+ if (this.resultsCache.has(id)) {
321
+ this.logger.warn(`Proving job id=${id} is already settled, ignoring err=${err}`, {
322
+ provingJobId: id,
323
+ });
324
+ return;
325
+ }
326
+
327
+ if (retry && retries + 1 < this.maxRetries && !this.isJobStale(item)) {
328
+ this.logger.info(
329
+ `Retrying proving job id=${id} type=${ProvingRequestType[item.type]} retry=${retries + 1} err=${err}`,
330
+ {
331
+ provingJobId: id,
332
+ },
333
+ );
251
334
  this.retries.set(id, retries + 1);
252
335
  this.enqueueJobInternal(item);
253
336
  this.instrumentation.incRetriedJobs(item.type);
254
337
  return;
255
338
  }
256
339
 
257
- this.logger.debug(
258
- `Marking proving job id=${id} type=${ProvingRequestType[item.type]} totalAttempts=${retries + 1} as failed`,
340
+ this.logger.info(
341
+ `Marking proving job as failed id=${id} type=${ProvingRequestType[item.type]} totalAttempts=${
342
+ retries + 1
343
+ } err=${err}`,
344
+ {
345
+ provingJobId: id,
346
+ },
259
347
  );
260
348
 
261
- await this.database.setProvingJobError(id, err);
262
-
349
+ // save the result to the cache and notify clients of the job status
350
+ // this should work even if our database breaks because the result is cached in memory
263
351
  const result: ProvingJobSettledResult = { status: 'rejected', reason: String(err) };
264
352
  this.resultsCache.set(id, result);
265
353
  this.promises.get(id)!.resolve(result);
354
+
266
355
  this.instrumentation.incRejectedJobs(item.type);
267
356
  if (info) {
268
- const duration = this.timeSource() - info.startedAt;
269
- this.instrumentation.recordJobDuration(item.type, duration * 1000);
357
+ const duration = this.msTimeSource() - info.startedAt;
358
+ this.instrumentation.recordJobDuration(item.type, duration);
359
+ }
360
+
361
+ try {
362
+ await this.database.setProvingJobError(id, err);
363
+ } catch (saveErr) {
364
+ this.logger.error(`Failed to save proving job error status id=${id} jobErr=${err}`, saveErr, {
365
+ provingJobId: id,
366
+ });
367
+
368
+ throw saveErr;
270
369
  }
271
370
  }
272
371
 
@@ -277,15 +376,21 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
277
376
  ): Promise<{ job: ProvingJob; time: number } | undefined> {
278
377
  const job = this.jobsCache.get(id);
279
378
  if (!job) {
280
- this.logger.warn(`Proving job id=${id} does not exist`);
379
+ this.logger.warn(`Proving job id=${id} does not exist`, { provingJobId: id });
380
+ return filter ? this.getProvingJob(filter) : Promise.resolve(undefined);
381
+ }
382
+
383
+ if (this.resultsCache.has(id)) {
384
+ this.logger.warn(`Proving job id=${id} has already been completed`, { provingJobId: id });
281
385
  return filter ? this.getProvingJob(filter) : Promise.resolve(undefined);
282
386
  }
283
387
 
284
388
  const metadata = this.inProgress.get(id);
285
- const now = this.timeSource();
389
+ const now = this.msTimeSource();
286
390
  if (!metadata) {
287
391
  this.logger.warn(
288
392
  `Proving job id=${id} type=${ProvingRequestType[job.type]} not found in the in-progress cache, adding it`,
393
+ { provingJobId: id },
289
394
  );
290
395
  // the queue will still contain the item at this point!
291
396
  // we need to be careful when popping off the queue to make sure we're not sending
@@ -293,16 +398,17 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
293
398
  this.inProgress.set(id, {
294
399
  id,
295
400
  startedAt,
296
- lastUpdatedAt: this.timeSource(),
401
+ lastUpdatedAt: this.msTimeSource(),
297
402
  });
298
403
  return Promise.resolve(undefined);
299
404
  } else if (startedAt <= metadata.startedAt) {
300
405
  if (startedAt < metadata.startedAt) {
301
- this.logger.debug(
406
+ this.logger.info(
302
407
  `Proving job id=${id} type=${ProvingRequestType[job.type]} startedAt=${startedAt} older agent has taken job`,
408
+ { provingJobId: id },
303
409
  );
304
410
  } else {
305
- this.logger.debug(`Proving job id=${id} type=${ProvingRequestType[job.type]} heartbeat`);
411
+ this.logger.debug(`Proving job id=${id} type=${ProvingRequestType[job.type]} heartbeat`, { provingJobId: id });
306
412
  }
307
413
  metadata.startedAt = startedAt;
308
414
  metadata.lastUpdatedAt = now;
@@ -312,6 +418,7 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
312
418
  `Proving job id=${id} type=${
313
419
  ProvingRequestType[job.type]
314
420
  } already being worked on by another agent. Sending new one`,
421
+ { provingJobId: id },
315
422
  );
316
423
  return this.getProvingJob(filter);
317
424
  } else {
@@ -324,60 +431,122 @@ export class ProvingBroker implements ProvingJobProducer, ProvingJobConsumer {
324
431
  const item = this.jobsCache.get(id);
325
432
  const retries = this.retries.get(id) ?? 0;
326
433
  if (!item) {
327
- this.logger.warn(`Proving job id=${id} not found`);
434
+ this.logger.warn(`Proving job id=${id} not found`, { provingJobId: id });
328
435
  return;
329
436
  }
330
437
 
331
438
  if (!info) {
332
- this.logger.warn(`Proving job id=${id} type=${ProvingRequestType[item.type]} not in the in-progress set`);
439
+ this.logger.warn(`Proving job id=${id} type=${ProvingRequestType[item.type]} not in the in-progress set`, {
440
+ provingJobId: id,
441
+ });
333
442
  } else {
334
443
  this.inProgress.delete(id);
335
444
  }
336
445
 
337
- this.logger.debug(
446
+ if (this.resultsCache.has(id)) {
447
+ this.logger.warn(`Proving job id=${id} already settled, ignoring result`, { provingJobId: id });
448
+ return;
449
+ }
450
+
451
+ this.logger.info(
338
452
  `Proving job complete id=${id} type=${ProvingRequestType[item.type]} totalAttempts=${retries + 1}`,
453
+ { provingJobId: id },
339
454
  );
340
455
 
341
- await this.database.setProvingJobResult(id, value);
342
-
456
+ // save result to our local cache and notify clients
457
+ // if save to database fails, that's ok because we have the result in memory
458
+ // if the broker crashes and needs the result again, we're covered because we can just recompute it
343
459
  const result: ProvingJobSettledResult = { status: 'fulfilled', value };
344
460
  this.resultsCache.set(id, result);
345
461
  this.promises.get(id)!.resolve(result);
462
+
346
463
  this.instrumentation.incResolvedJobs(item.type);
464
+ if (info) {
465
+ const duration = this.msTimeSource() - info.startedAt;
466
+ this.instrumentation.recordJobDuration(item.type, duration);
467
+ }
468
+
469
+ try {
470
+ await this.database.setProvingJobResult(id, value);
471
+ } catch (saveErr) {
472
+ this.logger.error(`Failed to save proving job result id=${id}`, saveErr, {
473
+ provingJobId: id,
474
+ });
475
+
476
+ throw saveErr;
477
+ }
347
478
  }
348
479
 
349
- private timeoutCheck = () => {
480
+ @trackSpan('ProvingBroker.cleanupPass')
481
+ private async cleanupPass() {
482
+ await this.cleanupStaleJobs();
483
+ await this.reEnqueueExpiredJobs();
484
+ }
485
+
486
+ private async cleanupStaleJobs() {
487
+ const jobIds = Array.from(this.jobsCache.keys());
488
+ const jobsToClean: ProvingJobId[] = [];
489
+ for (const id of jobIds) {
490
+ const job = this.jobsCache.get(id)!;
491
+ const isComplete = this.resultsCache.has(id);
492
+ if (isComplete && this.isJobStale(job)) {
493
+ jobsToClean.push(id);
494
+ }
495
+ }
496
+
497
+ if (jobsToClean.length > 0) {
498
+ this.logger.info(`Cleaning up jobs=${jobsToClean.length}`);
499
+ await asyncPool(this.maxParallelCleanUps, jobsToClean, async jobId => {
500
+ await this.cleanUpProvingJobState(jobId);
501
+ });
502
+ }
503
+ }
504
+
505
+ private async reEnqueueExpiredJobs() {
350
506
  const inProgressEntries = Array.from(this.inProgress.entries());
351
507
  for (const [id, metadata] of inProgressEntries) {
352
508
  const item = this.jobsCache.get(id);
353
509
  if (!item) {
354
- this.logger.warn(`Proving job id=${id} not found. Removing it from the queue.`);
510
+ this.logger.warn(`Proving job id=${id} not found. Removing it from the queue.`, { provingJobId: id });
355
511
  this.inProgress.delete(id);
356
512
  continue;
357
513
  }
358
514
 
359
- const msSinceLastUpdate = (this.timeSource() - metadata.lastUpdatedAt) * 1000;
515
+ const now = this.msTimeSource();
516
+ const msSinceLastUpdate = now - metadata.lastUpdatedAt;
360
517
  if (msSinceLastUpdate >= this.jobTimeoutMs) {
361
- this.logger.warn(`Proving job id=${id} timed out. Adding it back to the queue.`);
362
- this.inProgress.delete(id);
363
- this.enqueueJobInternal(item);
364
- this.instrumentation.incTimedOutJobs(item.type);
518
+ if (this.isJobStale(item)) {
519
+ // the job has timed out and it's also old, just cancel and move on
520
+ await this.cancelProvingJob(item.id);
521
+ } else {
522
+ this.logger.warn(`Proving job id=${id} timed out. Adding it back to the queue.`, { provingJobId: id });
523
+ this.inProgress.delete(id);
524
+ this.enqueueJobInternal(item);
525
+ this.instrumentation.incTimedOutJobs(item.type);
526
+ }
365
527
  }
366
528
  }
367
- };
529
+ }
368
530
 
369
531
  private enqueueJobInternal(job: ProvingJob): void {
370
532
  if (!this.promises.has(job.id)) {
371
533
  this.promises.set(job.id, promiseWithResolvers());
372
534
  }
373
- this.queues[job.type].put(job);
535
+ this.queues[job.type].put({
536
+ epochNumber: job.epochNumber,
537
+ id: job.id,
538
+ });
374
539
  this.enqueuedAt.set(job.id, new Timer());
375
- this.logger.debug(`Enqueued new proving job id=${job.id}`);
540
+ this.epochHeight = Math.max(this.epochHeight, job.epochNumber);
541
+ }
542
+
543
+ private isJobStale(job: ProvingJob) {
544
+ return job.epochNumber < this.epochHeight - this.maxEpochsToKeepResultsFor;
376
545
  }
377
546
  }
378
547
 
379
548
  type ProvingQueues = {
380
- [K in ProvingRequestType]: PriorityMemoryQueue<ProvingJob>;
549
+ [K in ProvingRequestType]: PriorityMemoryQueue<EnqueuedProvingJob>;
381
550
  };
382
551
 
383
552
  /**
@@ -386,12 +555,10 @@ type ProvingQueues = {
386
555
  * @param b - Another proving job
387
556
  * @returns A number indicating the relative priority of the two proving jobs
388
557
  */
389
- function provingJobComparator(a: ProvingJob, b: ProvingJob): -1 | 0 | 1 {
390
- const aBlockNumber = a.blockNumber ?? 0;
391
- const bBlockNumber = b.blockNumber ?? 0;
392
- if (aBlockNumber < bBlockNumber) {
558
+ function provingJobComparator(a: EnqueuedProvingJob, b: EnqueuedProvingJob): -1 | 0 | 1 {
559
+ if (a.epochNumber < b.epochNumber) {
393
560
  return -1;
394
- } else if (aBlockNumber > bBlockNumber) {
561
+ } else if (a.epochNumber > b.epochNumber) {
395
562
  return 1;
396
563
  } else {
397
564
  return 0;
@@ -1,7 +1,7 @@
1
1
  import { type ProofUri, ProvingJob, type ProvingJobId, ProvingJobSettledResult } from '@aztec/circuit-types';
2
2
  import { jsonParseWithSchema, jsonStringify } from '@aztec/foundation/json-rpc';
3
3
  import { type AztecKVStore, type AztecMap } from '@aztec/kv-store';
4
- import { LmdbMetrics, Metrics, type TelemetryClient } from '@aztec/telemetry-client';
4
+ import { Attributes, LmdbMetrics, type TelemetryClient } from '@aztec/telemetry-client';
5
5
 
6
6
  import { type ProvingBrokerDatabase } from '../proving_broker_database.js';
7
7
 
@@ -14,14 +14,8 @@ export class KVBrokerDatabase implements ProvingBrokerDatabase {
14
14
  this.metrics = new LmdbMetrics(
15
15
  client.getMeter('KVBrokerDatabase'),
16
16
  {
17
- name: Metrics.PROVING_QUEUE_DB_MAP_SIZE,
18
- description: 'Database map size for the proving broker',
17
+ [Attributes.DB_DATA_TYPE]: 'prover-broker',
19
18
  },
20
- {
21
- name: Metrics.PROVING_QUEUE_DB_USED_SIZE,
22
- description: 'Database used size for the proving broker',
23
- },
24
- { name: Metrics.PROVING_QUEUE_DB_NUM_ITEMS, description: 'Number of items in the broker database' },
25
19
  () => store.estimateSize(),
26
20
  );
27
21
  this.jobs = store.openMap('proving_jobs');
@@ -9,7 +9,6 @@ import {
9
9
  type TelemetryClient,
10
10
  type UpDownCounter,
11
11
  ValueType,
12
- millisecondBuckets,
13
12
  } from '@aztec/telemetry-client';
14
13
 
15
14
  export type MonitorCallback = (proofType: ProvingRequestType) => number;
@@ -55,18 +54,12 @@ export class ProvingBrokerInstrumentation {
55
54
  description: 'Records how long a job sits in the queue',
56
55
  unit: 'ms',
57
56
  valueType: ValueType.INT,
58
- advice: {
59
- explicitBucketBoundaries: millisecondBuckets(1), // 10ms -> ~327s
60
- },
61
57
  });
62
58
 
63
59
  this.jobDuration = meter.createHistogram(Metrics.PROVING_QUEUE_JOB_DURATION, {
64
60
  description: 'Records how long a job takes to complete',
65
61
  unit: 'ms',
66
62
  valueType: ValueType.INT,
67
- advice: {
68
- explicitBucketBoundaries: millisecondBuckets(1), // 10ms -> ~327s
69
- },
70
63
  });
71
64
  }
72
65
 
@@ -30,6 +30,7 @@ export class ProvingJobController {
30
30
  constructor(
31
31
  private jobId: ProvingJobId,
32
32
  private inputs: ProvingJobInputs,
33
+ private epochNumber: number,
33
34
  private startedAt: number,
34
35
  private circuitProver: ServerCircuitProver,
35
36
  private onComplete: ProvingJobCompletionCallback,
@@ -100,51 +101,51 @@ export class ProvingJobController {
100
101
  const signal = this.abortController.signal;
101
102
  switch (type) {
102
103
  case ProvingRequestType.PUBLIC_VM: {
103
- return await this.circuitProver.getAvmProof(inputs, signal);
104
+ return await this.circuitProver.getAvmProof(inputs, signal, this.epochNumber);
104
105
  }
105
106
 
106
107
  case ProvingRequestType.PRIVATE_BASE_ROLLUP: {
107
- return await this.circuitProver.getPrivateBaseRollupProof(inputs, signal);
108
+ return await this.circuitProver.getPrivateBaseRollupProof(inputs, signal, this.epochNumber);
108
109
  }
109
110
 
110
111
  case ProvingRequestType.PUBLIC_BASE_ROLLUP: {
111
- return await this.circuitProver.getPublicBaseRollupProof(inputs, signal);
112
+ return await this.circuitProver.getPublicBaseRollupProof(inputs, signal, this.epochNumber);
112
113
  }
113
114
 
114
115
  case ProvingRequestType.MERGE_ROLLUP: {
115
- return await this.circuitProver.getMergeRollupProof(inputs, signal);
116
+ return await this.circuitProver.getMergeRollupProof(inputs, signal, this.epochNumber);
116
117
  }
117
118
 
118
119
  case ProvingRequestType.EMPTY_BLOCK_ROOT_ROLLUP: {
119
- return await this.circuitProver.getEmptyBlockRootRollupProof(inputs, signal);
120
+ return await this.circuitProver.getEmptyBlockRootRollupProof(inputs, signal, this.epochNumber);
120
121
  }
121
122
 
122
123
  case ProvingRequestType.BLOCK_ROOT_ROLLUP: {
123
- return await this.circuitProver.getBlockRootRollupProof(inputs, signal);
124
+ return await this.circuitProver.getBlockRootRollupProof(inputs, signal, this.epochNumber);
124
125
  }
125
126
 
126
127
  case ProvingRequestType.BLOCK_MERGE_ROLLUP: {
127
- return await this.circuitProver.getBlockMergeRollupProof(inputs, signal);
128
+ return await this.circuitProver.getBlockMergeRollupProof(inputs, signal, this.epochNumber);
128
129
  }
129
130
 
130
131
  case ProvingRequestType.ROOT_ROLLUP: {
131
- return await this.circuitProver.getRootRollupProof(inputs, signal);
132
+ return await this.circuitProver.getRootRollupProof(inputs, signal, this.epochNumber);
132
133
  }
133
134
 
134
135
  case ProvingRequestType.BASE_PARITY: {
135
- return await this.circuitProver.getBaseParityProof(inputs, signal);
136
+ return await this.circuitProver.getBaseParityProof(inputs, signal, this.epochNumber);
136
137
  }
137
138
 
138
139
  case ProvingRequestType.ROOT_PARITY: {
139
- return await this.circuitProver.getRootParityProof(inputs, signal);
140
+ return await this.circuitProver.getRootParityProof(inputs, signal, this.epochNumber);
140
141
  }
141
142
 
142
143
  case ProvingRequestType.PRIVATE_KERNEL_EMPTY: {
143
- return await this.circuitProver.getEmptyPrivateKernelProof(inputs, signal);
144
+ return await this.circuitProver.getEmptyPrivateKernelProof(inputs, signal, this.epochNumber);
144
145
  }
145
146
 
146
147
  case ProvingRequestType.TUBE_PROOF: {
147
- return await this.circuitProver.getTubeProof(inputs, signal);
148
+ return await this.circuitProver.getTubeProof(inputs, signal, this.epochNumber);
148
149
  }
149
150
 
150
151
  default: {