@convex-dev/workpool 0.2.0-beta.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +7 -16
  2. package/dist/commonjs/client/index.d.ts +3 -3
  3. package/dist/commonjs/client/index.d.ts.map +1 -1
  4. package/dist/commonjs/client/index.js +10 -5
  5. package/dist/commonjs/client/index.js.map +1 -1
  6. package/dist/commonjs/component/complete.d.ts +89 -0
  7. package/dist/commonjs/component/complete.d.ts.map +1 -0
  8. package/dist/commonjs/component/complete.js +80 -0
  9. package/dist/commonjs/component/complete.js.map +1 -0
  10. package/dist/commonjs/component/kick.d.ts +1 -2
  11. package/dist/commonjs/component/kick.d.ts.map +1 -1
  12. package/dist/commonjs/component/kick.js +7 -5
  13. package/dist/commonjs/component/kick.js.map +1 -1
  14. package/dist/commonjs/component/lib.d.ts +3 -3
  15. package/dist/commonjs/component/lib.d.ts.map +1 -1
  16. package/dist/commonjs/component/lib.js +43 -20
  17. package/dist/commonjs/component/lib.js.map +1 -1
  18. package/dist/commonjs/component/logging.d.ts.map +1 -1
  19. package/dist/commonjs/component/logging.js +1 -2
  20. package/dist/commonjs/component/logging.js.map +1 -1
  21. package/dist/commonjs/component/loop.d.ts +1 -14
  22. package/dist/commonjs/component/loop.d.ts.map +1 -1
  23. package/dist/commonjs/component/loop.js +215 -178
  24. package/dist/commonjs/component/loop.js.map +1 -1
  25. package/dist/commonjs/component/recovery.d.ts +16 -0
  26. package/dist/commonjs/component/recovery.d.ts.map +1 -1
  27. package/dist/commonjs/component/recovery.js +64 -44
  28. package/dist/commonjs/component/recovery.js.map +1 -1
  29. package/dist/commonjs/component/schema.d.ts +6 -2
  30. package/dist/commonjs/component/schema.d.ts.map +1 -1
  31. package/dist/commonjs/component/schema.js +5 -3
  32. package/dist/commonjs/component/schema.js.map +1 -1
  33. package/dist/commonjs/component/shared.d.ts +20 -11
  34. package/dist/commonjs/component/shared.d.ts.map +1 -1
  35. package/dist/commonjs/component/shared.js +18 -5
  36. package/dist/commonjs/component/shared.js.map +1 -1
  37. package/dist/commonjs/component/stats.d.ts +21 -13
  38. package/dist/commonjs/component/stats.d.ts.map +1 -1
  39. package/dist/commonjs/component/stats.js +32 -22
  40. package/dist/commonjs/component/stats.js.map +1 -1
  41. package/dist/commonjs/component/worker.d.ts +2 -12
  42. package/dist/commonjs/component/worker.d.ts.map +1 -1
  43. package/dist/commonjs/component/worker.js +23 -36
  44. package/dist/commonjs/component/worker.js.map +1 -1
  45. package/dist/esm/client/index.d.ts +3 -3
  46. package/dist/esm/client/index.d.ts.map +1 -1
  47. package/dist/esm/client/index.js +10 -5
  48. package/dist/esm/client/index.js.map +1 -1
  49. package/dist/esm/component/complete.d.ts +89 -0
  50. package/dist/esm/component/complete.d.ts.map +1 -0
  51. package/dist/esm/component/complete.js +80 -0
  52. package/dist/esm/component/complete.js.map +1 -0
  53. package/dist/esm/component/kick.d.ts +1 -2
  54. package/dist/esm/component/kick.d.ts.map +1 -1
  55. package/dist/esm/component/kick.js +7 -5
  56. package/dist/esm/component/kick.js.map +1 -1
  57. package/dist/esm/component/lib.d.ts +3 -3
  58. package/dist/esm/component/lib.d.ts.map +1 -1
  59. package/dist/esm/component/lib.js +43 -20
  60. package/dist/esm/component/lib.js.map +1 -1
  61. package/dist/esm/component/logging.d.ts.map +1 -1
  62. package/dist/esm/component/logging.js +1 -2
  63. package/dist/esm/component/logging.js.map +1 -1
  64. package/dist/esm/component/loop.d.ts +1 -14
  65. package/dist/esm/component/loop.d.ts.map +1 -1
  66. package/dist/esm/component/loop.js +215 -178
  67. package/dist/esm/component/loop.js.map +1 -1
  68. package/dist/esm/component/recovery.d.ts +16 -0
  69. package/dist/esm/component/recovery.d.ts.map +1 -1
  70. package/dist/esm/component/recovery.js +64 -44
  71. package/dist/esm/component/recovery.js.map +1 -1
  72. package/dist/esm/component/schema.d.ts +6 -2
  73. package/dist/esm/component/schema.d.ts.map +1 -1
  74. package/dist/esm/component/schema.js +5 -3
  75. package/dist/esm/component/schema.js.map +1 -1
  76. package/dist/esm/component/shared.d.ts +20 -11
  77. package/dist/esm/component/shared.d.ts.map +1 -1
  78. package/dist/esm/component/shared.js +18 -5
  79. package/dist/esm/component/shared.js.map +1 -1
  80. package/dist/esm/component/stats.d.ts +21 -13
  81. package/dist/esm/component/stats.d.ts.map +1 -1
  82. package/dist/esm/component/stats.js +32 -22
  83. package/dist/esm/component/stats.js.map +1 -1
  84. package/dist/esm/component/worker.d.ts +2 -12
  85. package/dist/esm/component/worker.d.ts.map +1 -1
  86. package/dist/esm/component/worker.js +23 -36
  87. package/dist/esm/component/worker.js.map +1 -1
  88. package/package.json +7 -6
  89. package/src/client/index.ts +18 -8
  90. package/src/component/README.md +15 -15
  91. package/src/component/_generated/api.d.ts +7 -2
  92. package/src/component/complete.test.ts +508 -0
  93. package/src/component/complete.ts +98 -0
  94. package/src/component/kick.test.ts +13 -13
  95. package/src/component/kick.ts +13 -8
  96. package/src/component/lib.test.ts +262 -17
  97. package/src/component/lib.ts +55 -24
  98. package/src/component/logging.ts +1 -2
  99. package/src/component/loop.test.ts +1158 -0
  100. package/src/component/loop.ts +289 -221
  101. package/src/component/recovery.test.ts +541 -0
  102. package/src/component/recovery.ts +80 -63
  103. package/src/component/schema.ts +6 -4
  104. package/src/component/shared.ts +21 -6
  105. package/src/component/stats.ts +48 -25
  106. package/src/component/worker.ts +25 -38
@@ -1,9 +1,9 @@
1
- import { FunctionHandle, WithoutSystemFields } from "convex/server";
1
+ import { WithoutSystemFields } from "convex/server";
2
2
  import { v } from "convex/values";
3
3
  import { internal } from "./_generated/api.js";
4
4
  import { Doc, Id } from "./_generated/dataModel.js";
5
5
  import { internalMutation, MutationCtx } from "./_generated/server.js";
6
- import { DEFAULT_MAX_PARALLELISM } from "./kick.js";
6
+ import type { CompleteJob } from "./complete.js";
7
7
  import {
8
8
  createLogger,
9
9
  DEFAULT_LOG_LEVEL,
@@ -14,10 +14,11 @@ import {
14
14
  boundScheduledTime,
15
15
  Config,
16
16
  currentSegment,
17
+ DEFAULT_MAX_PARALLELISM,
17
18
  fromSegment,
19
+ max,
18
20
  nextSegment,
19
- OnCompleteArgs,
20
- runResult,
21
+ RunResult,
21
22
  toSegment,
22
23
  } from "./shared.js";
23
24
  import { recordCompleted, recordReport, recordStarted } from "./stats.js";
@@ -45,49 +46,50 @@ export const INITIAL_STATE: WithoutSystemFields<Doc<"internalState">> = {
45
46
 
46
47
  // There should only ever be at most one of these scheduled or running.
47
48
  export const main = internalMutation({
48
- args: {
49
- generation: v.int64(),
50
- segment: v.int64(),
51
- },
52
- handler: async (ctx, args) => {
49
+ args: { generation: v.int64(), segment: v.int64() },
50
+ handler: async (ctx, { generation, segment }) => {
53
51
  // State will be modified and patched at the end of the function.
54
52
  const state = await getOrCreateState(ctx);
55
- if (args.generation !== state.generation) {
53
+ if (generation !== state.generation) {
56
54
  throw new Error(
57
- `generation mismatch: ${args.generation} !== ${state.generation}`
55
+ `generation mismatch: ${generation} !== ${state.generation}`
58
56
  );
59
57
  }
60
58
  state.generation++;
59
+ const runStatus = await getOrCreateRunningStatus(ctx);
60
+ if (runStatus.state.kind !== "running") {
61
+ await ctx.db.patch(runStatus._id, {
62
+ state: { kind: "running" },
63
+ });
64
+ }
61
65
 
62
66
  const globals = await getGlobals(ctx);
63
67
  const console = createLogger(globals.logLevel);
68
+ const delayMs = Date.now() - fromSegment(segment);
69
+ console.debug(`[main] generation ${generation} behind: ${delayMs}ms`);
64
70
 
65
71
  // Read pendingCompletions, including retry handling.
66
72
  console.time("[main] pendingCompletion");
67
- const done = await handleCompletions(ctx, state, args.segment, console);
73
+ const toCancel = await handleCompletions(ctx, state, segment, console);
68
74
  console.timeEnd("[main] pendingCompletion");
69
75
 
70
76
  // Read pendingCancelation, deleting from pendingStart. If it's still running, queue to cancel.
71
77
  console.time("[main] pendingCancelation");
72
- done.push(...(await handleCancelation(ctx, state, args.segment, console)));
78
+ await handleCancelation(ctx, state, segment, console, toCancel);
73
79
  console.timeEnd("[main] pendingCancelation");
74
80
 
75
81
  if (state.running.length === 0) {
76
82
  // If there's nothing active, reset lastRecovery.
77
- state.lastRecovery = args.segment;
78
- } else if (args.segment - state.lastRecovery >= RECOVERY_PERIOD_SEGMENTS) {
83
+ state.lastRecovery = segment;
84
+ } else if (segment - state.lastRecovery >= RECOVERY_PERIOD_SEGMENTS) {
79
85
  // Otherwise schedule recovery for any old jobs.
80
- const oldEnoughToConsider = Date.now() - RECOVERY_THRESHOLD_MS;
81
- const jobs = state.running.filter((r) => r.started < oldEnoughToConsider);
82
- if (jobs.length) {
83
- await ctx.scheduler.runAfter(0, internal.recovery.recover, { jobs });
84
- }
85
- state.lastRecovery = args.segment;
86
+ await handleRecovery(ctx, state, console);
87
+ state.lastRecovery = segment;
86
88
  }
87
89
 
88
90
  // Read pendingStart up to max capacity. Update the config, and incomingSegmentCursor.
89
91
  console.time("[main] pendingStart");
90
- await handleStart(ctx, state, args.segment, console, globals);
92
+ await handleStart(ctx, state, segment, console, globals);
91
93
  console.timeEnd("[main] pendingStart");
92
94
 
93
95
  if (Date.now() - state.report.lastReportTs >= MINUTE) {
@@ -98,7 +100,7 @@ export const main = internalMutation({
98
100
  // It's been a while, let's start fresh.
99
101
  lastReportTs = Date.now();
100
102
  }
101
- console.info(recordReport(state));
103
+ recordReport(console, state);
102
104
  state.report = {
103
105
  completed: 0,
104
106
  succeeded: 0,
@@ -110,100 +112,82 @@ export const main = internalMutation({
110
112
  }
111
113
 
112
114
  await ctx.db.replace(state._id, state);
113
- await ctx.scheduler.runAfter(0, internal.loop.complete, { done });
114
115
  await ctx.scheduler.runAfter(0, internal.loop.updateRunStatus, {
115
116
  generation: state.generation,
117
+ segment,
116
118
  });
117
- },
118
- });
119
-
120
- export const complete = internalMutation({
121
- args: {
122
- done: v.array(v.object({ runResult, workId: v.id("work") })),
123
- },
124
- handler: async (ctx, args) => {
125
- const globals = await getGlobals(ctx);
126
- const console = createLogger(globals.logLevel);
127
- await Promise.all(
128
- args.done.map(async ({ runResult, workId }) => {
129
- const work = await ctx.db.get(workId);
130
- if (!work) {
131
- console.warn(`[complete] ${workId} is done, but its work is gone`);
132
- return;
133
- }
134
- if (work.onComplete) {
135
- try {
136
- const handle = work.onComplete.fnHandle as FunctionHandle<
137
- "mutation",
138
- OnCompleteArgs,
139
- void
140
- >;
141
- await ctx.runMutation(handle, {
142
- workId: work._id,
143
- context: work.onComplete.context,
144
- result: runResult,
145
- });
146
- console.debug(`[complete] onComplete for ${workId} completed`);
147
- } catch (e) {
148
- console.error(
149
- `[complete] error running onComplete for ${workId}`,
150
- e
151
- );
152
- }
153
- }
154
- await ctx.db.delete(workId);
155
- })
156
- );
119
+ // TODO: if there were more cancellations, schedule main directly.
157
120
  },
158
121
  });
159
122
 
160
123
  export const updateRunStatus = internalMutation({
161
- args: { generation: v.int64() },
162
- handler: async (ctx, args) => {
124
+ args: { generation: v.int64(), segment: v.int64() },
125
+ handler: async (ctx, { generation, segment }) => {
163
126
  const globals = await getGlobals(ctx);
164
127
  const console = createLogger(globals.logLevel);
165
128
  const maxParallelism = globals.maxParallelism;
166
129
  const state = await getOrCreateState(ctx);
167
- if (args.generation !== state.generation) {
130
+ if (generation !== state.generation) {
168
131
  throw new Error(
169
- `generation mismatch: ${args.generation} !== ${state.generation}`
132
+ `generation mismatch: ${generation} !== ${state.generation}`
170
133
  );
171
134
  }
172
135
 
173
136
  console.time("[updateRunStatus] outstandingCancelations");
174
- const thisSegment = currentSegment();
175
137
  const outstandingCancelations = await getNextUp(ctx, "pendingCancelation", {
176
138
  start: state.segmentCursors.cancelation,
177
- end: thisSegment,
139
+ end: segment,
178
140
  });
179
141
  console.timeEnd("[updateRunStatus] outstandingCancelations");
180
142
  if (outstandingCancelations) {
181
143
  await ctx.scheduler.runAfter(0, internal.loop.main, {
182
- generation: args.generation,
183
- segment: thisSegment,
144
+ generation,
145
+ segment,
184
146
  });
185
147
  return;
186
148
  }
187
149
 
150
+ // TODO: check for current segment (or from args) first, to avoid OCCs.
188
151
  console.time("[updateRunStatus] nextSegmentIsActionable");
189
- const [nextIsActionable, cursors] = await nextSegmentIsActionable(
152
+ const next = max(segment + 1n, currentSegment());
153
+ const nextIsActionable = await nextSegmentIsActionable(
190
154
  ctx,
191
155
  state,
192
- maxParallelism
156
+ maxParallelism,
157
+ next
193
158
  );
194
159
  console.timeEnd("[updateRunStatus] nextSegmentIsActionable");
195
160
 
196
- const start = nextSegment();
197
161
  if (nextIsActionable) {
162
+ await ctx.scheduler.runAt(
163
+ boundScheduledTime(fromSegment(next), console),
164
+ internal.loop.main,
165
+ {
166
+ generation,
167
+ segment: next,
168
+ }
169
+ );
170
+ return;
171
+ }
172
+
173
+ console.time("[updateRunStatus] oldSegmentIsActionable");
174
+ const [oldIsActionable, cursors] = await oldSegmentIsActionable(
175
+ ctx,
176
+ state,
177
+ maxParallelism
178
+ );
179
+ console.timeEnd("[updateRunStatus] oldSegmentIsActionable");
180
+
181
+ if (oldIsActionable) {
198
182
  await ctx.db.patch(state._id, {
199
183
  segmentCursors: {
200
184
  ...state.segmentCursors,
201
185
  ...cursors,
202
186
  },
203
187
  });
204
- await ctx.scheduler.runAt(fromSegment(start), internal.loop.main, {
205
- generation: args.generation,
206
- segment: start,
188
+ await ctx.scheduler.runAfter(0, internal.loop.main, {
189
+ generation,
190
+ segment: currentSegment(),
207
191
  });
208
192
  return;
209
193
  }
@@ -220,39 +204,45 @@ export const updateRunStatus = internalMutation({
220
204
  }
221
205
  const docs = await Promise.all(
222
206
  actionableTables.map(async (tableName) =>
223
- getNextUp(ctx, tableName, { start })
207
+ getNextUp(ctx, tableName, { start: next })
224
208
  )
225
209
  );
226
210
  console.timeEnd("[updateRunStatus] findNextSegment");
227
- let segment = docs.map((d) => d?.segment).sort()[0];
211
+ let targetSegment = docs.map((d) => d?.segment).sort()[0];
228
212
  const runStatus = await getOrCreateRunningStatus(ctx);
229
213
  const saturated = state.running.length >= maxParallelism;
230
- if (segment || state.running.length > 0) {
214
+ if (targetSegment !== undefined || state.running.length > 0) {
231
215
  // If there's something to do, schedule for next actionable segment.
232
216
  // Or the next recovery, whichever comes first.
233
217
  const nextRecoverySegment = state.lastRecovery + RECOVERY_PERIOD_SEGMENTS;
234
- if (!segment || segment > nextRecoverySegment) {
235
- segment = nextRecoverySegment;
218
+ if (!targetSegment || targetSegment > nextRecoverySegment) {
219
+ targetSegment = nextRecoverySegment;
236
220
  }
237
221
  const scheduledId = await ctx.scheduler.runAt(
238
- fromSegment(segment),
222
+ boundScheduledTime(fromSegment(targetSegment), console),
239
223
  internal.loop.main,
240
- { generation: args.generation, segment }
224
+ { generation, segment: targetSegment }
241
225
  );
242
- await ctx.db.patch(runStatus._id, {
243
- state: {
244
- kind: "scheduled",
245
- scheduledId,
246
- saturated,
247
- generation: args.generation,
248
- segment,
249
- },
250
- });
226
+ if (targetSegment > nextSegment()) {
227
+ await ctx.db.patch(runStatus._id, {
228
+ state: {
229
+ kind: "scheduled",
230
+ scheduledId,
231
+ saturated,
232
+ generation,
233
+ segment: targetSegment,
234
+ },
235
+ });
236
+ } else {
237
+ console.debug(
238
+ `[updateRunStatus] staying running because it's the next segment`
239
+ );
240
+ }
251
241
  return;
252
242
  }
253
243
  // There seems to be nothing in the future to do, so go idle.
254
244
  await ctx.db.patch(runStatus._id, {
255
- state: { kind: "idle", generation: args.generation },
245
+ state: { kind: "idle", generation },
256
246
  });
257
247
  },
258
248
  });
@@ -260,19 +250,17 @@ export const updateRunStatus = internalMutation({
260
250
  async function nextSegmentIsActionable(
261
251
  ctx: MutationCtx,
262
252
  state: Doc<"internalState">,
263
- maxParallelism: number
264
- ): Promise<
265
- [boolean, { completion?: bigint; cancelation?: bigint; incoming?: bigint }]
266
- > {
267
- // First, try with our cursor range, up to next segment.
268
- const end = nextSegment();
253
+ maxParallelism: number,
254
+ end: bigint
255
+ ): Promise<boolean> {
256
+ // First, try with our cursor range, up to end.
269
257
  if (
270
258
  await getNextUp(ctx, "pendingCancelation", {
271
259
  start: state.segmentCursors.cancelation,
272
260
  end,
273
261
  })
274
262
  ) {
275
- return [true, {}];
263
+ return true;
276
264
  }
277
265
  if (
278
266
  await getNextUp(ctx, "pendingCompletion", {
@@ -280,7 +268,7 @@ async function nextSegmentIsActionable(
280
268
  end,
281
269
  })
282
270
  ) {
283
- return [true, {}];
271
+ return true;
284
272
  }
285
273
  if (state.running.length < maxParallelism) {
286
274
  if (
@@ -289,9 +277,19 @@ async function nextSegmentIsActionable(
289
277
  end,
290
278
  })
291
279
  ) {
292
- return [true, {}];
280
+ return true;
293
281
  }
294
282
  }
283
+ return false;
284
+ }
285
+
286
+ async function oldSegmentIsActionable(
287
+ ctx: MutationCtx,
288
+ state: Doc<"internalState">,
289
+ maxParallelism: number
290
+ ): Promise<
291
+ [boolean, { completion?: bigint; cancelation?: bigint; incoming?: bigint }]
292
+ > {
295
293
  // Next, we look for out-of-order additions we may have missed.
296
294
  const oldCompletion = await getNextUp(ctx, "pendingCompletion", {
297
295
  end: state.segmentCursors.completion,
@@ -325,13 +323,13 @@ async function getNextUp(
325
323
  return ctx.db
326
324
  .query(table)
327
325
  .withIndex("segment", (q) =>
328
- range.start
329
- ? range.end
326
+ range.start !== undefined
327
+ ? range.end !== undefined
330
328
  ? q
331
329
  .gte("segment", range.start - CURSOR_BUFFER_SEGMENTS)
332
330
  .lte("segment", range.end)
333
331
  : q.gt("segment", range.start - CURSOR_BUFFER_SEGMENTS)
334
- : range.end
332
+ : range.end !== undefined
335
333
  ? q.lt("segment", range.end)
336
334
  : q
337
335
  )
@@ -340,9 +338,7 @@ async function getNextUp(
340
338
 
341
339
  /**
342
340
  * Handles the completion of pending completions.
343
-
344
- * Important: It should handle retries before cancelations are processed,
345
- * to allow retries to be canceled.
341
+ * This only processes work that succeeded or failed, not canceled.
346
342
  */
347
343
  async function handleCompletions(
348
344
  ctx: MutationCtx,
@@ -359,87 +355,66 @@ async function handleCompletions(
359
355
  q.gte("segment", startSegment).lte("segment", segment)
360
356
  )
361
357
  .collect();
362
- state.report.completed += completed.length;
363
358
  state.segmentCursors.completion = segment;
364
- const done: Doc<"pendingCompletion">[] = [];
359
+ // Completions that were going to be retried but have since been canceled.
360
+ const toCancel: CompleteJob[] = [];
365
361
  await Promise.all(
366
362
  completed.map(async (c) => {
367
363
  await ctx.db.delete(c._id);
368
- const work = await ctx.db.get(c.workId);
369
- const maxAttempts = work?.retryBehavior?.maxAttempts;
370
- const pendingCancelations = await ctx.db
371
- .query("pendingCancelation")
372
- .withIndex("workId", (q) => q.eq("workId", c.workId))
373
- .collect();
374
- if (work && state.running.some((r) => r.workId === c.workId)) {
375
- if (
376
- c.runResult.kind === "failed" &&
377
- maxAttempts &&
378
- pendingCancelations.length === 0 &&
379
- work.attempts < maxAttempts
380
- ) {
381
- await rescheduleJob(ctx, work, console);
364
+
365
+ const running = state.running.find((r) => r.workId === c.workId);
366
+ if (!running) {
367
+ console.error(
368
+ `[main] completing ${c.workId} but it's not in "running"`
369
+ );
370
+ return;
371
+ }
372
+ if (c.retry) {
373
+ // Only check for work if it's going to be retried.
374
+ const work = await ctx.db.get(c.workId);
375
+ if (!work) {
376
+ console.warn(`[main] ${c.workId} is gone, but trying to complete`);
377
+ return;
378
+ }
379
+ const retried = await rescheduleJob(ctx, work, console);
380
+ if (retried) {
382
381
  state.report.retries++;
382
+ recordCompleted(console, work, "retrying");
383
383
  } else {
384
- if (c.runResult.kind === "success") {
385
- state.report.succeeded++;
386
- } else if (c.runResult.kind === "failed") {
387
- state.report.failed++;
388
- }
389
- // Ensure there aren't any pending cancelations for this work.
390
- for (const pendingCancelation of pendingCancelations) {
391
- await ctx.db.delete(pendingCancelation._id);
392
- }
393
- done.push(c);
384
+ // We don't retry if it's been canceled in the mean time.
385
+ state.report.canceled++;
386
+ toCancel.push({
387
+ workId: c.workId,
388
+ runResult: { kind: "canceled" },
389
+ attempt: work.attempts,
390
+ });
394
391
  }
395
- console.info(recordCompleted(work, c.runResult.kind));
396
- } else if (work) {
397
- console.warn(`[main] completing ${c.workId} but it's not in "running"`);
398
392
  } else {
399
- console.warn(`[main] completing ${c.workId} but it's not found`);
393
+ if (c.runResult.kind === "success") {
394
+ state.report.succeeded++;
395
+ } else if (c.runResult.kind === "failed") {
396
+ state.report.failed++;
397
+ }
400
398
  }
401
399
  })
402
400
  );
403
- console.debug(`[main] completing ${done.length}`);
401
+ // We do this after so the stats above know if it was in progress.
402
+ const before = state.running.length;
404
403
  state.running = state.running.filter(
405
404
  (r) => !completed.some((c) => c.workId === r.workId)
406
405
  );
407
- return done.map((c) => ({ runResult: c.runResult, workId: c.workId }));
408
- }
409
-
410
- async function rescheduleJob(
411
- ctx: MutationCtx,
412
- work: Doc<"work">,
413
- console: Logger
414
- ): Promise<number> {
415
- if (!work.retryBehavior) {
416
- throw new Error("work has no retryBehavior");
417
- }
418
- const backoffMs =
419
- work.retryBehavior.initialBackoffMs *
420
- Math.pow(work.retryBehavior.base, work.attempts - 1);
421
- const nextAttempt = withJitter(backoffMs);
422
- const startTime = boundScheduledTime(Date.now() + nextAttempt, console);
423
- const segment = toSegment(startTime);
424
- await ctx.db.patch(work._id, {
425
- attempts: work.attempts + 1,
426
- });
427
- await ctx.db.insert("pendingStart", {
428
- workId: work._id,
429
- segment,
430
- });
431
- return nextAttempt;
432
- }
433
-
434
- export function withJitter(delay: number) {
435
- return delay * (0.5 + Math.random());
406
+ const numCompleted = before - state.running.length;
407
+ state.report.completed += numCompleted;
408
+ console.debug(`[main] completed ${numCompleted} work`);
409
+ return toCancel;
436
410
  }
437
411
 
438
412
  async function handleCancelation(
439
413
  ctx: MutationCtx,
440
414
  state: Doc<"internalState">,
441
415
  segment: bigint,
442
- console: Logger
416
+ console: Logger,
417
+ toCancel: CompleteJob[]
443
418
  ) {
444
419
  const start = state.segmentCursors.cancelation - CURSOR_BUFFER_SEGMENTS;
445
420
  const canceled = await ctx.db
@@ -449,35 +424,76 @@ async function handleCancelation(
449
424
  )
450
425
  .take(CANCELLATION_BATCH_SIZE);
451
426
  state.segmentCursors.cancelation = canceled.at(-1)?.segment ?? segment;
452
- console.debug(`[main] attempting to cancel ${canceled.length}`);
427
+ if (canceled.length) {
428
+ console.debug(`[main] attempting to cancel ${canceled.length}`);
429
+ }
453
430
  const canceledWork: Set<Id<"work">> = new Set();
454
- await Promise.all(
455
- canceled.map(async ({ _id, workId }) => {
456
- await ctx.db.delete(_id);
457
- const work = await ctx.db.get(workId);
458
- if (!work) {
459
- console.warn(`[handleCancelation] ${workId} is gone`);
460
- return;
461
- }
462
- // Ensure it doesn't retry.
463
- await ctx.db.patch(workId, { retryBehavior: undefined });
464
- // Ensure it doesn't start.
465
- const pendingStart = await ctx.db
466
- .query("pendingStart")
467
- .withIndex("workId", (q) => q.eq("workId", workId))
468
- .unique();
469
- if (pendingStart && !canceledWork.has(workId)) {
470
- console.info(recordCompleted(work, "canceled"));
471
- state.report.canceled++;
472
- await ctx.db.delete(pendingStart._id);
473
- canceledWork.add(workId);
474
- }
475
- })
431
+ const runResult: RunResult = { kind: "canceled" };
432
+ const jobs = toCancel.concat(
433
+ ...(
434
+ await Promise.all(
435
+ canceled.map(async ({ _id, _creationTime, workId }) => {
436
+ await ctx.db.delete(_id);
437
+ if (canceledWork.has(workId)) {
438
+ // We shouldn't have multiple pending cancelations for the same work.
439
+ console.error(`[main] ${workId} already canceled`);
440
+ return null;
441
+ }
442
+ const work = await ctx.db.get(workId);
443
+ if (!work) {
444
+ console.warn(`[main] ${workId} is gone, but trying to cancel`);
445
+ return null;
446
+ }
447
+ // Ensure it doesn't retry.
448
+ await ctx.db.patch(workId, { canceled: true });
449
+ // Ensure it doesn't start.
450
+ const pendingStart = await ctx.db
451
+ .query("pendingStart")
452
+ .withIndex("workId", (q) => q.eq("workId", workId))
453
+ .unique();
454
+ if (pendingStart && !canceledWork.has(workId)) {
455
+ state.report.canceled++;
456
+ await ctx.db.delete(pendingStart._id);
457
+ canceledWork.add(workId);
458
+ return { workId, runResult, attempt: work.attempts };
459
+ }
460
+ return null;
461
+ })
462
+ )
463
+ ).flatMap((r) => (r ? [r] : []))
476
464
  );
477
- return Array.from(canceledWork).map((id) => ({
478
- runResult: { kind: "canceled" as const },
479
- workId: id,
480
- }));
465
+ if (jobs.length) {
466
+ await ctx.scheduler.runAfter(0, internal.complete.complete, { jobs });
467
+ }
468
+ }
469
+
470
+ async function handleRecovery(
471
+ ctx: MutationCtx,
472
+ state: Doc<"internalState">,
473
+ console: Logger
474
+ ) {
475
+ const missing = new Set<Id<"work">>();
476
+ const oldEnoughToConsider = Date.now() - RECOVERY_THRESHOLD_MS;
477
+ const jobs = (
478
+ await Promise.all(
479
+ state.running.map(async (r) => {
480
+ if (r.started >= oldEnoughToConsider) {
481
+ return null;
482
+ }
483
+ const work = await ctx.db.get(r.workId);
484
+ if (!work) {
485
+ missing.add(r.workId);
486
+ console.error(`[main] ${r.workId} already gone (skipping recovery)`);
487
+ return null;
488
+ }
489
+ return { ...r, attempt: work.attempts };
490
+ })
491
+ )
492
+ ).flatMap((r) => (r ? [r] : []));
493
+ state.running = state.running.filter((r) => !missing.has(r.workId));
494
+ if (jobs.length) {
495
+ await ctx.scheduler.runAfter(0, internal.recovery.recover, { jobs });
496
+ }
481
497
  }
482
498
 
483
499
  async function handleStart(
@@ -485,9 +501,8 @@ async function handleStart(
485
501
  state: Doc<"internalState">,
486
502
  segment: bigint,
487
503
  console: Logger,
488
- globals: Config
504
+ { maxParallelism, logLevel }: Config
489
505
  ) {
490
- const maxParallelism = globals.maxParallelism;
491
506
  // Schedule as many as needed to reach maxParallelism.
492
507
  const toSchedule = maxParallelism - state.running.length;
493
508
 
@@ -499,50 +514,103 @@ async function handleStart(
499
514
  .lte("segment", segment)
500
515
  )
501
516
  .take(toSchedule);
517
+
502
518
  state.segmentCursors.incoming = pending.at(-1)?.segment ?? segment;
503
519
  console.debug(`[main] scheduling ${pending.length} pending work`);
504
520
  // Start new work.
505
521
  state.running.push(
506
- ...(await Promise.all(
507
- pending.map(async ({ _id, workId }) => {
508
- const scheduledId = await beginWork(ctx, workId, globals.logLevel);
509
- await ctx.db.delete(_id);
510
- return { scheduledId, workId, started: Date.now() };
511
- })
512
- ))
522
+ ...(
523
+ await Promise.all(
524
+ pending.map(async ({ _id, workId, segment }) => {
525
+ if (state.running.some((r) => r.workId === workId)) {
526
+ console.error(`[main] ${workId} already running (skipping start)`);
527
+ return null;
528
+ }
529
+ const lagMs = Date.now() - fromSegment(segment);
530
+ const scheduledId = await beginWork(ctx, workId, logLevel, lagMs);
531
+ await ctx.db.delete(_id);
532
+ return { scheduledId, workId, started: Date.now() };
533
+ })
534
+ )
535
+ ).flatMap((r) => (r ? [r] : []))
513
536
  );
514
537
  }
515
538
 
516
539
  async function beginWork(
517
540
  ctx: MutationCtx,
518
541
  workId: Id<"work">,
519
- logLevel: LogLevel
542
+ logLevel: LogLevel,
543
+ lagMs: number
520
544
  ): Promise<Id<"_scheduled_functions">> {
521
545
  const console = createLogger(logLevel);
522
546
  const work = await ctx.db.get(workId);
523
547
  if (!work) {
524
548
  throw new Error("work not found");
525
549
  }
526
- console.info(recordStarted(work));
550
+ recordStarted(console, work, lagMs);
551
+ const { attempts: attempt, fnHandle, fnArgs } = work;
552
+ const args = { workId, fnHandle, fnArgs, logLevel, attempt };
527
553
  if (work.fnType === "action") {
528
- return await ctx.scheduler.runAfter(0, internal.worker.runActionWrapper, {
529
- workId: work._id,
530
- fnHandle: work.fnHandle,
531
- fnArgs: work.fnArgs,
532
- logLevel,
533
- });
554
+ return ctx.scheduler.runAfter(0, internal.worker.runActionWrapper, args);
534
555
  } else if (work.fnType === "mutation") {
535
- return await ctx.scheduler.runAfter(0, internal.worker.runMutationWrapper, {
536
- workId: work._id,
537
- fnHandle: work.fnHandle,
538
- fnArgs: work.fnArgs,
539
- logLevel,
540
- });
556
+ return ctx.scheduler.runAfter(0, internal.worker.runMutationWrapper, args);
541
557
  } else {
542
558
  throw new Error(`Unexpected fnType ${work.fnType}`);
543
559
  }
544
560
  }
545
561
 
562
+ /**
563
+ * Reschedules a job for retry.
564
+ * If it's been canceled in the mean time, don't retry.
565
+ * @returns true if the job was rescheduled, false if it was not.
566
+ */
567
+ async function rescheduleJob(
568
+ ctx: MutationCtx,
569
+ work: Doc<"work">,
570
+ console: Logger
571
+ ): Promise<boolean> {
572
+ const pendingCancelation = await ctx.db
573
+ .query("pendingCancelation")
574
+ .withIndex("workId", (q) => q.eq("workId", work._id))
575
+ .unique();
576
+ if (pendingCancelation) {
577
+ // If there's an un-processed cancelation request, don't retry.
578
+ console.warn(`[main] ${work._id} in pendingCancelation so not retrying`);
579
+ return false;
580
+ }
581
+ if (work.canceled) {
582
+ return false;
583
+ }
584
+ if (!work.retryBehavior) {
585
+ console.warn(`[main] ${work._id} has no retryBehavior so not retrying`);
586
+ return false;
587
+ }
588
+ const existing = await ctx.db
589
+ .query("pendingStart")
590
+ .withIndex("workId", (q) => q.eq("workId", work._id))
591
+ .first();
592
+ if (existing) {
593
+ // Not sure why this would ever happen, but ensure uniqueness explicitly.
594
+ console.error(`[main] ${work._id} already in pendingStart so not retrying`);
595
+ return false;
596
+ }
597
+ const backoffMs =
598
+ work.retryBehavior.initialBackoffMs *
599
+ Math.pow(work.retryBehavior.base, work.attempts - 1);
600
+ const nextAttempt = withJitter(backoffMs);
601
+ const startTime = boundScheduledTime(Date.now() + nextAttempt, console);
602
+ const segment = toSegment(startTime);
603
+ await ctx.db.insert("pendingStart", {
604
+ workId: work._id,
605
+ segment,
606
+ });
607
+ return true;
608
+ }
609
+
610
+ export function withJitter(delay: number) {
611
+ return delay * (0.5 + Math.random());
612
+ }
613
+
546
614
  async function getGlobals(ctx: MutationCtx) {
547
615
  const globals = await ctx.db.query("globals").unique();
548
616
  if (!globals) {