@blokjs/runner 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Configuration.d.ts +18 -0
- package/dist/Configuration.js +151 -4
- package/dist/Configuration.js.map +1 -1
- package/dist/PayloadTooLargeError.d.ts +19 -0
- package/dist/PayloadTooLargeError.js +29 -0
- package/dist/PayloadTooLargeError.js.map +1 -0
- package/dist/RunCancelledError.d.ts +17 -0
- package/dist/RunCancelledError.js +25 -0
- package/dist/RunCancelledError.js.map +1 -0
- package/dist/RunnerSteps.js +330 -33
- package/dist/RunnerSteps.js.map +1 -1
- package/dist/SubworkflowNode.d.ts +75 -0
- package/dist/SubworkflowNode.js +221 -0
- package/dist/SubworkflowNode.js.map +1 -0
- package/dist/TriggerBase.d.ts +128 -0
- package/dist/TriggerBase.js +773 -4
- package/dist/TriggerBase.js.map +1 -1
- package/dist/WaitDispatchRequest.d.ts +38 -0
- package/dist/WaitDispatchRequest.js +13 -0
- package/dist/WaitDispatchRequest.js.map +1 -0
- package/dist/WaitNode.d.ts +23 -0
- package/dist/WaitNode.js +26 -0
- package/dist/WaitNode.js.map +1 -0
- package/dist/concurrency/ConcurrencyBackend.d.ts +61 -0
- package/dist/concurrency/ConcurrencyBackend.js +20 -0
- package/dist/concurrency/ConcurrencyBackend.js.map +1 -0
- package/dist/concurrency/ConcurrencyLimitError.d.ts +37 -0
- package/dist/concurrency/ConcurrencyLimitError.js +16 -0
- package/dist/concurrency/ConcurrencyLimitError.js.map +1 -0
- package/dist/concurrency/NatsKvConcurrencyBackend.d.ts +64 -0
- package/dist/concurrency/NatsKvConcurrencyBackend.js +297 -0
- package/dist/concurrency/NatsKvConcurrencyBackend.js.map +1 -0
- package/dist/concurrency/QueueExpiredError.d.ts +40 -0
- package/dist/concurrency/QueueExpiredError.js +15 -0
- package/dist/concurrency/QueueExpiredError.js.map +1 -0
- package/dist/concurrency/createConcurrencyBackend.d.ts +23 -0
- package/dist/concurrency/createConcurrencyBackend.js +34 -0
- package/dist/concurrency/createConcurrencyBackend.js.map +1 -0
- package/dist/concurrency/readConcurrencyConfig.d.ts +60 -0
- package/dist/concurrency/readConcurrencyConfig.js +60 -0
- package/dist/concurrency/readConcurrencyConfig.js.map +1 -0
- package/dist/idempotency/resolveIdempotencyKey.d.ts +20 -0
- package/dist/idempotency/resolveIdempotencyKey.js +37 -0
- package/dist/idempotency/resolveIdempotencyKey.js.map +1 -0
- package/dist/index.d.ts +23 -3
- package/dist/index.js +47 -2
- package/dist/index.js.map +1 -1
- package/dist/monitoring/ConcurrencyMetrics.d.ts +56 -0
- package/dist/monitoring/ConcurrencyMetrics.js +107 -0
- package/dist/monitoring/ConcurrencyMetrics.js.map +1 -0
- package/dist/monitoring/JanitorMetrics.d.ts +27 -0
- package/dist/monitoring/JanitorMetrics.js +48 -0
- package/dist/monitoring/JanitorMetrics.js.map +1 -0
- package/dist/scheduling/DebounceCoordinator.d.ts +88 -0
- package/dist/scheduling/DebounceCoordinator.js +141 -0
- package/dist/scheduling/DebounceCoordinator.js.map +1 -0
- package/dist/scheduling/DeferredDispatchSignal.d.ts +50 -0
- package/dist/scheduling/DeferredDispatchSignal.js +14 -0
- package/dist/scheduling/DeferredDispatchSignal.js.map +1 -0
- package/dist/scheduling/DeferredRunScheduler.d.ts +68 -0
- package/dist/scheduling/DeferredRunScheduler.js +154 -0
- package/dist/scheduling/DeferredRunScheduler.js.map +1 -0
- package/dist/scheduling/readSchedulingConfig.d.ts +24 -0
- package/dist/scheduling/readSchedulingConfig.js +52 -0
- package/dist/scheduling/readSchedulingConfig.js.map +1 -0
- package/dist/timeouts/StepTimeoutError.d.ts +22 -0
- package/dist/timeouts/StepTimeoutError.js +31 -0
- package/dist/timeouts/StepTimeoutError.js.map +1 -0
- package/dist/tracing/InMemoryRunStore.d.ts +28 -1
- package/dist/tracing/InMemoryRunStore.js +150 -0
- package/dist/tracing/InMemoryRunStore.js.map +1 -1
- package/dist/tracing/Janitor.d.ts +70 -0
- package/dist/tracing/Janitor.js +150 -0
- package/dist/tracing/Janitor.js.map +1 -0
- package/dist/tracing/PostgresRunStore.d.ts +30 -0
- package/dist/tracing/PostgresRunStore.js +435 -3
- package/dist/tracing/PostgresRunStore.js.map +1 -1
- package/dist/tracing/RunStore.d.ts +100 -1
- package/dist/tracing/RunTracker.d.ts +238 -9
- package/dist/tracing/RunTracker.js +571 -1
- package/dist/tracing/RunTracker.js.map +1 -1
- package/dist/tracing/SqliteRunStore.d.ts +23 -1
- package/dist/tracing/SqliteRunStore.js +405 -6
- package/dist/tracing/SqliteRunStore.js.map +1 -1
- package/dist/tracing/TraceRouter.d.ts +20 -2
- package/dist/tracing/TraceRouter.js +249 -5
- package/dist/tracing/TraceRouter.js.map +1 -1
- package/dist/tracing/sanitize.d.ts +11 -0
- package/dist/tracing/sanitize.js +29 -0
- package/dist/tracing/sanitize.js.map +1 -1
- package/dist/tracing/types.d.ts +348 -2
- package/dist/utils/createChildContext.d.ts +32 -0
- package/dist/utils/createChildContext.js +113 -0
- package/dist/utils/createChildContext.js.map +1 -0
- package/dist/workflow/WorkflowNormalizer.d.ts +29 -41
- package/dist/workflow/WorkflowNormalizer.js +182 -0
- package/dist/workflow/WorkflowNormalizer.js.map +1 -1
- package/dist/workflow/WorkflowRegistry.d.ts +64 -0
- package/dist/workflow/WorkflowRegistry.js +81 -0
- package/dist/workflow/WorkflowRegistry.js.map +1 -0
- package/package.json +3 -3
|
@@ -5,6 +5,31 @@ import https from "node:https";
|
|
|
5
5
|
import { v4 as uuid } from "uuid";
|
|
6
6
|
import { InMemoryRunStore } from "./InMemoryRunStore";
|
|
7
7
|
import { createStore } from "./createStore";
|
|
8
|
+
import { sanitize } from "./sanitize";
|
|
9
|
+
/**
|
|
10
|
+
* Cap on the number of `NODE_ATTEMPT_FAILED` entries kept on a single
|
|
11
|
+
* `NodeRun.attempts` array. Bounds store growth on extreme retry counts —
|
|
12
|
+
* a runaway loop generating 1000 attempts can't bloat the run store. The
|
|
13
|
+
* latest attempts are always preserved (older ones are dropped).
|
|
14
|
+
*/
|
|
15
|
+
const MAX_STORED_ATTEMPTS = 10; // upper bound on failed-attempt entries kept in a single NodeRun.attempts array
|
|
16
|
+
/**
 * Run statuses that are treated as final.
 *
 * The run-lifecycle writers in this file check membership in this set
 * before updating a run and return early on a hit, so a signal that
 * arrives late (for example a completeRun/failRun issued by a runner that
 * did not observe a parallel cancel) cannot overwrite an outcome that has
 * already been recorded.
 */
const TERMINAL_STATUSES = new Set(
    ["completed", "failed", "cancelled", "throttled", "expired", "crashed", "timedOut"],
);
|
|
8
33
|
/**
|
|
9
34
|
* Build a {@link RunErrorDetail} from any thrown error. When the source is
|
|
10
35
|
* a typed `BlokError` (master plan §17), all 17+ structured fields are
|
|
@@ -115,6 +140,14 @@ export class RunTracker extends EventEmitter {
|
|
|
115
140
|
tags: opts.tags,
|
|
116
141
|
metadata: opts.metadata,
|
|
117
142
|
environment,
|
|
143
|
+
replayOf: opts.replayOf,
|
|
144
|
+
parentRunId: opts.parentRunId,
|
|
145
|
+
parentNodeRunId: opts.parentNodeRunId,
|
|
146
|
+
scheduledAt: opts.scheduledAt,
|
|
147
|
+
expiresAt: opts.expiresAt,
|
|
148
|
+
debounceKey: opts.debounceKey,
|
|
149
|
+
debounceMode: opts.debounceMode,
|
|
150
|
+
pingCount: opts.pingCount,
|
|
118
151
|
};
|
|
119
152
|
this.store.saveRun(run);
|
|
120
153
|
this.emitEvent(run.id, run.workflowName, "RUN_STARTED", undefined, undefined, {
|
|
@@ -130,6 +163,13 @@ export class RunTracker extends EventEmitter {
|
|
|
130
163
|
const run = this.store.getRun(runId);
|
|
131
164
|
if (!run)
|
|
132
165
|
return;
|
|
166
|
+
// PR 1 follow-up · terminal-status guard. Don't overwrite a run that
|
|
167
|
+
// has already reached a terminal status (cancelled / expired / etc.)
|
|
168
|
+
// — a late completeRun from a runner that didn't see a parallel
|
|
169
|
+
// cancel must not flip the status back. Defense in depth against the
|
|
170
|
+
// REVIEW.md A2 class of bug.
|
|
171
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
172
|
+
return;
|
|
133
173
|
const finishedAt = Date.now();
|
|
134
174
|
const durationMs = finishedAt - run.startedAt;
|
|
135
175
|
this.store.updateRun(runId, {
|
|
@@ -147,6 +187,9 @@ export class RunTracker extends EventEmitter {
|
|
|
147
187
|
const run = this.store.getRun(runId);
|
|
148
188
|
if (!run)
|
|
149
189
|
return;
|
|
190
|
+
// PR 1 follow-up · terminal-status guard. Same rationale as completeRun.
|
|
191
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
192
|
+
return;
|
|
150
193
|
const finishedAt = Date.now();
|
|
151
194
|
const durationMs = finishedAt - run.startedAt;
|
|
152
195
|
this.store.updateRun(runId, {
|
|
@@ -160,6 +203,452 @@ export class RunTracker extends EventEmitter {
|
|
|
160
203
|
error: toRunErrorDetail(error),
|
|
161
204
|
});
|
|
162
205
|
}
|
|
206
|
+
/**
|
|
207
|
+
* Tier 2 #6 — mark a run as throttled because the concurrency gate
|
|
208
|
+
* denied it before any step executed. Distinct from `failRun` because
|
|
209
|
+
* no step ran; nothing produced an error. Studio surfaces a Throttled
|
|
210
|
+
* badge and SSE subscribers see a granular `RUN_THROTTLED` event.
|
|
211
|
+
*/
|
|
212
|
+
markRunThrottled(runId, info) {
|
|
213
|
+
const run = this.store.getRun(runId);
|
|
214
|
+
if (!run)
|
|
215
|
+
return;
|
|
216
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status. A
|
|
217
|
+
// concurrent operator-cancel or crash auto-flip might have flipped
|
|
218
|
+
// the run between read and write; preserve the earlier terminal
|
|
219
|
+
// outcome rather than re-marking as throttled.
|
|
220
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
221
|
+
return;
|
|
222
|
+
const finishedAt = Date.now();
|
|
223
|
+
const durationMs = finishedAt - run.startedAt;
|
|
224
|
+
this.store.updateRun(runId, {
|
|
225
|
+
status: "throttled",
|
|
226
|
+
finishedAt,
|
|
227
|
+
durationMs,
|
|
228
|
+
});
|
|
229
|
+
this.emitEvent(runId, run.workflowName, "RUN_THROTTLED", undefined, undefined, {
|
|
230
|
+
durationMs,
|
|
231
|
+
concurrencyKey: info.concurrencyKey,
|
|
232
|
+
concurrencyLimit: info.concurrencyLimit,
|
|
233
|
+
currentInFlight: info.currentInFlight,
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
/**
|
|
237
|
+
* Tier 2 #6 follow-up — mark a run as queued because the concurrency
|
|
238
|
+
* gate denied it AND the trigger is configured with `onLimit: "queue"`.
|
|
239
|
+
* The run will be re-attempted after `scheduledAt`; `scheduledAt` is
|
|
240
|
+
* persisted on the run record so Studio can render a "queued · retries
|
|
241
|
+
* at <time>" badge.
|
|
242
|
+
*
|
|
243
|
+
* Distinct from `markRunThrottled` because queued runs WILL eventually
|
|
244
|
+
* execute (or stay queued indefinitely until a slot frees), while
|
|
245
|
+
* throttled runs are terminal and `failRun` semantics are skipped.
|
|
246
|
+
*
|
|
247
|
+
* Caller is responsible for actually scheduling the retry via
|
|
248
|
+
* `DeferredRunScheduler`. This method only flips status + emits the
|
|
249
|
+
* `RUN_QUEUED` event. Re-marking with a later `scheduledAt` updates
|
|
250
|
+
* the field (used when re-defer happens after a timer-fired re-acquire
|
|
251
|
+
* also fails).
|
|
252
|
+
*/
|
|
253
|
+
markRunQueued(runId, info) {
|
|
254
|
+
const run = this.store.getRun(runId);
|
|
255
|
+
if (!run)
|
|
256
|
+
return;
|
|
257
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status (e.g.,
|
|
258
|
+
// `cancelled` from a concurrent operator-cancel during the
|
|
259
|
+
// onLimit:queue re-defer race). The TTL-expired path is handled
|
|
260
|
+
// separately in TriggerBase via QueueExpiredError.
|
|
261
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
262
|
+
return;
|
|
263
|
+
this.store.updateRun(runId, {
|
|
264
|
+
status: "queued",
|
|
265
|
+
scheduledAt: info.scheduledAt,
|
|
266
|
+
});
|
|
267
|
+
this.emitEvent(runId, run.workflowName, "RUN_QUEUED", undefined, undefined, {
|
|
268
|
+
concurrencyKey: info.concurrencyKey,
|
|
269
|
+
concurrencyLimit: info.concurrencyLimit,
|
|
270
|
+
currentInFlight: info.currentInFlight,
|
|
271
|
+
scheduledAt: info.scheduledAt,
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
// === Scheduling lifecycle (Tier 2 #5 + #7) ===
|
|
275
|
+
/**
|
|
276
|
+
* Tier 2 #5 — mark a run as `delayed`. Called immediately after
|
|
277
|
+
* `startRun` for runs that should be deferred. The run record carries
|
|
278
|
+
* `scheduledAt` (and optionally `expiresAt`) so Studio can render a
|
|
279
|
+
* "Delayed → fires at <time>" badge.
|
|
280
|
+
*
|
|
281
|
+
* Caller is responsible for actually scheduling the dispatch via
|
|
282
|
+
* `DeferredRunScheduler`. This method only flips status + emits the
|
|
283
|
+
* `RUN_DELAYED` event.
|
|
284
|
+
*/
|
|
285
|
+
markRunDelayed(runId, info) {
|
|
286
|
+
const run = this.store.getRun(runId);
|
|
287
|
+
if (!run)
|
|
288
|
+
return;
|
|
289
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status — e.g.,
|
|
290
|
+
// a wait.for() re-entry race where the operator cancelled the run
|
|
291
|
+
// while WaitDispatchRequest was being thrown.
|
|
292
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
293
|
+
return;
|
|
294
|
+
this.store.updateRun(runId, {
|
|
295
|
+
status: "delayed",
|
|
296
|
+
scheduledAt: info.scheduledAt,
|
|
297
|
+
expiresAt: info.expiresAt,
|
|
298
|
+
});
|
|
299
|
+
this.emitEvent(runId, run.workflowName, "RUN_DELAYED", undefined, undefined, {
|
|
300
|
+
scheduledAt: info.scheduledAt,
|
|
301
|
+
delayMs: info.delayMs,
|
|
302
|
+
expiresAt: info.expiresAt,
|
|
303
|
+
});
|
|
304
|
+
}
|
|
305
|
+
/**
|
|
306
|
+
* Tier 2 #5 — mark a run as `expired` because its TTL was exceeded
|
|
307
|
+
* before dispatch. Distinct from `failed` (no step ran) and
|
|
308
|
+
* `cancelled` (operator action — TTL is automatic).
|
|
309
|
+
*/
|
|
310
|
+
markRunExpired(runId, info) {
|
|
311
|
+
const run = this.store.getRun(runId);
|
|
312
|
+
if (!run)
|
|
313
|
+
return;
|
|
314
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status. A
|
|
315
|
+
// run that was cancelled by an operator before the dispatch timer
|
|
316
|
+
// fired should stay `cancelled`, not flip to `expired`.
|
|
317
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
318
|
+
return;
|
|
319
|
+
const finishedAt = info.expiredAt;
|
|
320
|
+
const durationMs = finishedAt - run.startedAt;
|
|
321
|
+
const lateBy = info.expiredAt - info.expiresAt;
|
|
322
|
+
this.store.updateRun(runId, {
|
|
323
|
+
status: "expired",
|
|
324
|
+
finishedAt,
|
|
325
|
+
durationMs,
|
|
326
|
+
});
|
|
327
|
+
this.emitEvent(runId, run.workflowName, "RUN_EXPIRED", undefined, undefined, {
|
|
328
|
+
expiresAt: info.expiresAt,
|
|
329
|
+
expiredAt: info.expiredAt,
|
|
330
|
+
lateBy,
|
|
331
|
+
});
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Tier 2 #7 — mark a run as `debounced`. In **leading** mode this is
|
|
335
|
+
* terminal: the ping was suppressed because a sibling fired
|
|
336
|
+
* immediately (`intoRunId` carries the sibling's id). In **trailing**
|
|
337
|
+
* mode this is transient: the same run is marked `debounced` while
|
|
338
|
+
* the timer is active and flips to `running` when the window closes
|
|
339
|
+
* (no separate transition method needed — `tracker` updates status
|
|
340
|
+
* directly via store before invoking the runner).
|
|
341
|
+
*/
|
|
342
|
+
markRunDebounced(runId, info) {
|
|
343
|
+
const run = this.store.getRun(runId);
|
|
344
|
+
if (!run)
|
|
345
|
+
return;
|
|
346
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status. A
|
|
347
|
+
// trailing debounce timer firing into a cancelled active run
|
|
348
|
+
// should NOT flip the run back to debounced.
|
|
349
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
350
|
+
return;
|
|
351
|
+
const isTerminal = info.mode === "leading" && info.intoRunId !== undefined;
|
|
352
|
+
const finishedAt = isTerminal ? Date.now() : undefined;
|
|
353
|
+
const durationMs = isTerminal && finishedAt ? finishedAt - run.startedAt : undefined;
|
|
354
|
+
this.store.updateRun(runId, {
|
|
355
|
+
status: "debounced",
|
|
356
|
+
debounceKey: info.debounceKey,
|
|
357
|
+
debounceMode: info.mode,
|
|
358
|
+
pingCount: info.pingCount,
|
|
359
|
+
scheduledAt: info.scheduledAt,
|
|
360
|
+
...(isTerminal ? { finishedAt, durationMs } : {}),
|
|
361
|
+
});
|
|
362
|
+
this.emitEvent(runId, run.workflowName, "RUN_DEBOUNCED", undefined, undefined, {
|
|
363
|
+
debounceKey: info.debounceKey,
|
|
364
|
+
mode: info.mode,
|
|
365
|
+
intoRunId: info.intoRunId,
|
|
366
|
+
pingCount: info.pingCount,
|
|
367
|
+
scheduledAt: info.scheduledAt,
|
|
368
|
+
});
|
|
369
|
+
}
|
|
370
|
+
/**
|
|
371
|
+
* Tier 2 quick-wins — mark a run as `crashed` (uncaught exception,
|
|
372
|
+
* OOM, signal). Distinct from `failRun` because the failure was
|
|
373
|
+
* NOT a step's `process()` throwing — it was the runner itself
|
|
374
|
+
* giving up. Currently manual; call from custom triggers / ops
|
|
375
|
+
* harnesses when uncaught failures are detected.
|
|
376
|
+
*/
|
|
377
|
+
markRunCrashed(runId, info) {
|
|
378
|
+
const run = this.store.getRun(runId);
|
|
379
|
+
if (!run)
|
|
380
|
+
return;
|
|
381
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status. A
|
|
382
|
+
// run that was already cancelled / failed / timedOut shouldn't
|
|
383
|
+
// be flipped to crashed by the boot orphan-recovery pass.
|
|
384
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
385
|
+
return;
|
|
386
|
+
const finishedAt = Date.now();
|
|
387
|
+
const durationMs = finishedAt - run.startedAt;
|
|
388
|
+
this.store.updateRun(runId, {
|
|
389
|
+
status: "crashed",
|
|
390
|
+
finishedAt,
|
|
391
|
+
durationMs,
|
|
392
|
+
error: toRunErrorDetail(info.error),
|
|
393
|
+
});
|
|
394
|
+
this.emitEvent(runId, run.workflowName, "RUN_CRASHED", undefined, undefined, {
|
|
395
|
+
durationMs,
|
|
396
|
+
error: toRunErrorDetail(info.error),
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
/**
|
|
400
|
+
* Tier 2 quick-wins follow-up — bulk-flip every run currently in
|
|
401
|
+
* `running` status to `crashed`. Returns the count flipped.
|
|
402
|
+
*
|
|
403
|
+
* Used by:
|
|
404
|
+
* - Process-level uncaught-exception handlers
|
|
405
|
+
* (`TriggerBase.installCrashHandlers`) — flip in-flight runs
|
|
406
|
+
* before the process dies.
|
|
407
|
+
* - Boot recovery (`TriggerBase.recoverOrphanedRuns`) — flip runs
|
|
408
|
+
* that were `running` from the previous (dead) process.
|
|
409
|
+
*
|
|
410
|
+
* Synchronous + safe to call from a `process.on("uncaughtException")`
|
|
411
|
+
* handler (which can't await). Backed by sync sqlite/in-memory
|
|
412
|
+
* writes that complete before the handler returns.
|
|
413
|
+
*
|
|
414
|
+
* Optional `opts.maxStartedAt` filter — only flip runs whose
|
|
415
|
+
* `startedAt` is at or before this timestamp. Used by boot recovery
|
|
416
|
+
* to avoid flipping runs from the current (live) process.
|
|
417
|
+
*/
|
|
418
|
+
markAllRunningRunsAsCrashed(error, opts) {
|
|
419
|
+
// PR 1 follow-up · A1 fix. `getRuns` defaults `opts?.limit ?? 50` in
|
|
420
|
+
// SqliteRunStore — left unbounded, this method silently flips at
|
|
421
|
+
// most 50 orphans per call. Loop until the store returns fewer rows
|
|
422
|
+
// than the page size (= no more matches under the LIMIT).
|
|
423
|
+
//
|
|
424
|
+
// Bounded outer loop: cap at 1000 iterations defensively. With the
|
|
425
|
+
// 50-row page size that's 50K orphans handled per single call —
|
|
426
|
+
// well above any realistic boot-recovery scenario.
|
|
427
|
+
let totalFlipped = 0;
|
|
428
|
+
const PAGE_SIZE = 50; // mirrors SqliteRunStore.getRuns default LIMIT
|
|
429
|
+
const MAX_PAGES = 1000;
|
|
430
|
+
for (let page = 0; page < MAX_PAGES; page++) {
|
|
431
|
+
// Snapshot the runs first — markRunCrashed mutates the store and
|
|
432
|
+
// could perturb iteration if we read+update inline.
|
|
433
|
+
const { runs } = this.store.getRuns({ status: "running" });
|
|
434
|
+
const candidates = opts?.maxStartedAt !== undefined ? runs.filter((r) => r.startedAt <= opts.maxStartedAt) : runs;
|
|
435
|
+
if (candidates.length === 0)
|
|
436
|
+
break;
|
|
437
|
+
for (const run of candidates) {
|
|
438
|
+
this.markRunCrashed(run.id, { error });
|
|
439
|
+
}
|
|
440
|
+
totalFlipped += candidates.length;
|
|
441
|
+
// If we got fewer rows than the page size, the store has no more
|
|
442
|
+
// matches under the LIMIT — exit early.
|
|
443
|
+
if (runs.length < PAGE_SIZE)
|
|
444
|
+
break;
|
|
445
|
+
}
|
|
446
|
+
return totalFlipped;
|
|
447
|
+
}
|
|
448
|
+
/**
|
|
449
|
+
* Tier 2 quick-wins — mark a run as `timedOut` because a step's
|
|
450
|
+
* final retry attempt exceeded its `maxDuration` cap. Distinct
|
|
451
|
+
* from `failed` so SLA dashboards can separate timeout-driven
|
|
452
|
+
* failures (network / capacity) from logic failures (bugs).
|
|
453
|
+
* Auto-called by `RunnerSteps` on final-attempt `StepTimeoutError`.
|
|
454
|
+
*/
|
|
455
|
+
markRunTimedOut(runId, info) {
|
|
456
|
+
const run = this.store.getRun(runId);
|
|
457
|
+
if (!run)
|
|
458
|
+
return;
|
|
459
|
+
// Review fix-up · BUG-1. Don't overwrite a terminal status — a
|
|
460
|
+
// run that was cancelled mid-step shouldn't flip to timedOut
|
|
461
|
+
// when the maxDuration timer fires after the cancel.
|
|
462
|
+
if (TERMINAL_STATUSES.has(run.status))
|
|
463
|
+
return;
|
|
464
|
+
const finishedAt = Date.now();
|
|
465
|
+
const durationMs = finishedAt - run.startedAt;
|
|
466
|
+
this.store.updateRun(runId, {
|
|
467
|
+
status: "timedOut",
|
|
468
|
+
finishedAt,
|
|
469
|
+
durationMs,
|
|
470
|
+
});
|
|
471
|
+
this.emitEvent(runId, run.workflowName, "RUN_TIMED_OUT", undefined, undefined, {
|
|
472
|
+
durationMs,
|
|
473
|
+
stepId: info.stepId,
|
|
474
|
+
maxDurationMs: info.maxDurationMs,
|
|
475
|
+
attemptsExhausted: info.attemptsExhausted,
|
|
476
|
+
});
|
|
477
|
+
}
|
|
478
|
+
/**
|
|
479
|
+
* Tier 2 #7 — record an additional ping into an existing trailing-mode
|
|
480
|
+
* debounce window. Increments `pingCount` and updates `scheduledAt`.
|
|
481
|
+
* Does NOT emit a new event (avoid event-stream bloat under burst).
|
|
482
|
+
*/
|
|
483
|
+
recordDebouncePing(runId, opts) {
|
|
484
|
+
const run = this.store.getRun(runId);
|
|
485
|
+
if (!run)
|
|
486
|
+
return;
|
|
487
|
+
this.store.updateRun(runId, {
|
|
488
|
+
pingCount: opts.pingCount,
|
|
489
|
+
scheduledAt: opts.scheduledAt,
|
|
490
|
+
});
|
|
491
|
+
}
|
|
492
|
+
/**
|
|
493
|
+
* Tier 2 #7 — transition a `delayed`/`debounced` run into `running`
|
|
494
|
+
* when its timer fires. Studio sees the status change via the
|
|
495
|
+
* existing run-update SSE stream.
|
|
496
|
+
*/
|
|
497
|
+
transitionRunToRunning(runId) {
|
|
498
|
+
const run = this.store.getRun(runId);
|
|
499
|
+
if (!run)
|
|
500
|
+
return;
|
|
501
|
+
this.store.updateRun(runId, {
|
|
502
|
+
status: "running",
|
|
503
|
+
startedAt: run.startedAt, // preserve the original submission time
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Tier 2 polish — cancel a pending (delayed/debounced/queued) run.
|
|
508
|
+
* Idempotent. Returns true when the run existed AND was in a cancellable
|
|
509
|
+
* state; false when the run doesn't exist OR is already running/completed/
|
|
510
|
+
* failed/throttled/expired/crashed/timedOut/cancelled.
|
|
511
|
+
*
|
|
512
|
+
* **Caller responsibility**: this method only updates the run record
|
|
513
|
+
* (status → `"cancelled"`) and emits `RUN_CANCELLED`. The caller must
|
|
514
|
+
* separately clear any pending scheduler timers via
|
|
515
|
+
* `DeferredRunScheduler.getInstance().cancel(runId)` and (when applicable)
|
|
516
|
+
* `DebounceCoordinator.getInstance().cancel(workflowName, debounceKey)`.
|
|
517
|
+
* Done this way to avoid an import cycle from tracing → scheduling.
|
|
518
|
+
*/
|
|
519
|
+
cancelRun(runId, options) {
|
|
520
|
+
const run = this.store.getRun(runId);
|
|
521
|
+
if (!run)
|
|
522
|
+
return false;
|
|
523
|
+
// Tier 2 follow-up · "running" added so cooperative AbortSignal
|
|
524
|
+
// cancellation can flip status to "cancelled" before the in-flight
|
|
525
|
+
// step throws `RunCancelledError`. The tracker's `abortRunningRun`
|
|
526
|
+
// calls this method right after firing the AbortController.
|
|
527
|
+
const cancellable = ["delayed", "debounced", "queued", "running"];
|
|
528
|
+
if (!cancellable.includes(run.status))
|
|
529
|
+
return false;
|
|
530
|
+
const previousStatus = run.status;
|
|
531
|
+
const finishedAt = Date.now();
|
|
532
|
+
const durationMs = finishedAt - run.startedAt;
|
|
533
|
+
this.store.updateRun(runId, {
|
|
534
|
+
status: "cancelled",
|
|
535
|
+
finishedAt,
|
|
536
|
+
durationMs,
|
|
537
|
+
});
|
|
538
|
+
this.emitEvent(runId, run.workflowName, "RUN_CANCELLED", undefined, undefined, {
|
|
539
|
+
durationMs,
|
|
540
|
+
previousStatus,
|
|
541
|
+
});
|
|
542
|
+
// PR 5 G1 — cascade to fire-and-forget children. Sub-workflow
|
|
543
|
+
// children with `wait: true` (sync) cancel automatically via the
|
|
544
|
+
// AbortSignal chain in createChildContext; children with
|
|
545
|
+
// `wait: false` (async / fire-and-forget) need explicit cascade
|
|
546
|
+
// because the parent step has already returned before the cancel.
|
|
547
|
+
// Walk getRunsByParent recursively (bounded by
|
|
548
|
+
// BLOK_MAX_SUBWORKFLOW_DEPTH).
|
|
549
|
+
if (options?.cascade !== false) {
|
|
550
|
+
const children = this.store.getRunsByParent(runId);
|
|
551
|
+
for (const child of children) {
|
|
552
|
+
if (cancellable.includes(child.status)) {
|
|
553
|
+
// Recursive — bounded by max-depth; each level reduces
|
|
554
|
+
// the candidate pool until none remain.
|
|
555
|
+
this.cancelRun(child.id, { cascade: true });
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
return true;
|
|
560
|
+
}
|
|
561
|
+
// === Cooperative cancellation (Tier 2 follow-up) ===
|
|
562
|
+
/**
|
|
563
|
+
* Per-process map from runId to the AbortController owned by the
|
|
564
|
+
* trigger's createContext call. Populated by TriggerBase right after
|
|
565
|
+
* `startRun()`; cleared in TriggerBase's finally block. Used by
|
|
566
|
+
* `abortRunningRun` to fire the signal when an operator cancels a
|
|
567
|
+
* `running` run via the cancel API.
|
|
568
|
+
*/
|
|
569
|
+
abortControllers = new Map();
|
|
570
|
+
registerAbortController(runId, controller) {
|
|
571
|
+
this.abortControllers.set(runId, controller);
|
|
572
|
+
}
|
|
573
|
+
unregisterAbortController(runId) {
|
|
574
|
+
this.abortControllers.delete(runId);
|
|
575
|
+
}
|
|
576
|
+
/**
|
|
577
|
+
* Tier 2 follow-up · cooperative cancellation for `running` runs.
|
|
578
|
+
*
|
|
579
|
+
* Fires the run's AbortController (so `ctx.signal.aborted` becomes
|
|
580
|
+
* true and any node consulting it can abort early) AND flips the run
|
|
581
|
+
* status to `"cancelled"` immediately via `cancelRun`. RunnerSteps'
|
|
582
|
+
* between-step abort check throws `RunCancelledError` shortly after,
|
|
583
|
+
* which TriggerBase catches without re-flipping the status.
|
|
584
|
+
*
|
|
585
|
+
* Returns true when an AbortController was registered for this run
|
|
586
|
+
* AND the status was successfully flipped; false otherwise (run not
|
|
587
|
+
* found, run not in `running` status, or no controller registered —
|
|
588
|
+
* e.g. controller already cleaned up).
|
|
589
|
+
*/
|
|
590
|
+
abortRunningRun(runId) {
|
|
591
|
+
const run = this.store.getRun(runId);
|
|
592
|
+
if (!run || run.status !== "running")
|
|
593
|
+
return false;
|
|
594
|
+
const controller = this.abortControllers.get(runId);
|
|
595
|
+
if (controller) {
|
|
596
|
+
try {
|
|
597
|
+
controller.abort();
|
|
598
|
+
}
|
|
599
|
+
catch {
|
|
600
|
+
// AbortController.abort never throws on first call; double-abort is safe.
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
// Flip status now so polls return cancelled immediately. The
|
|
604
|
+
// in-flight step's throw will land in TriggerBase.run's catch
|
|
605
|
+
// shortly; the catch sees status is already terminal and skips
|
|
606
|
+
// failRun (RunCancelledError instanceof check).
|
|
607
|
+
return this.cancelRun(runId);
|
|
608
|
+
}
|
|
609
|
+
// === Concurrency gate pass-throughs (Tier 2 #6) ===
|
|
610
|
+
/**
|
|
611
|
+
* Tier 2 #6 follow-up · cross-process concurrency backend.
|
|
612
|
+
*
|
|
613
|
+
* When set (via {@link setConcurrencyBackend}), the tracker's
|
|
614
|
+
* `acquireConcurrencySlot` and `releaseConcurrencySlot` methods
|
|
615
|
+
* delegate to the backend instead of the local sync `RunStore` impl.
|
|
616
|
+
* Used to coordinate across processes via NATS KV / Redis.
|
|
617
|
+
*
|
|
618
|
+
* Default `null` — preserves zero-overhead in-process behavior.
|
|
619
|
+
* Trigger packages install a backend in `listen()` when the operator
|
|
620
|
+
* sets `BLOK_CONCURRENCY_BACKEND=nats-kv`.
|
|
621
|
+
*/
|
|
622
|
+
concurrencyBackend = null;
|
|
623
|
+
setConcurrencyBackend(backend) {
|
|
624
|
+
this.concurrencyBackend = backend;
|
|
625
|
+
}
|
|
626
|
+
getConcurrencyBackend() {
|
|
627
|
+
return this.concurrencyBackend;
|
|
628
|
+
}
|
|
629
|
+
/**
|
|
630
|
+
* Acquire a concurrency slot for `(workflowName, concurrencyKey)`.
|
|
631
|
+
* Delegates to the configured cross-process backend when set; falls
|
|
632
|
+
* back to the local sync `RunStore` impl otherwise.
|
|
633
|
+
*
|
|
634
|
+
* Async — the cross-process backend (NATS KV) is async-only. The
|
|
635
|
+
* sync fallback is wrapped in `Promise.resolve()` so the call site
|
|
636
|
+
* is uniform.
|
|
637
|
+
*/
|
|
638
|
+
async acquireConcurrencySlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt) {
|
|
639
|
+
if (this.concurrencyBackend) {
|
|
640
|
+
return this.concurrencyBackend.acquireSlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt);
|
|
641
|
+
}
|
|
642
|
+
return this.store.acquireConcurrencySlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt);
|
|
643
|
+
}
|
|
644
|
+
/** Release a slot acquired via `acquireConcurrencySlot`. Idempotent. */
|
|
645
|
+
async releaseConcurrencySlot(workflowName, concurrencyKey, runId) {
|
|
646
|
+
if (this.concurrencyBackend) {
|
|
647
|
+
await this.concurrencyBackend.releaseSlot(workflowName, concurrencyKey, runId);
|
|
648
|
+
return;
|
|
649
|
+
}
|
|
650
|
+
this.store.releaseConcurrencySlot(workflowName, concurrencyKey, runId);
|
|
651
|
+
}
|
|
163
652
|
// === Node Lifecycle ===
|
|
164
653
|
startNode(runId, opts) {
|
|
165
654
|
const nodeRun = {
|
|
@@ -174,6 +663,8 @@ export class RunTracker extends EventEmitter {
|
|
|
174
663
|
parentNodeId: opts.parentNodeId,
|
|
175
664
|
depth: opts.depth,
|
|
176
665
|
stepIndex: opts.stepIndex,
|
|
666
|
+
wait: opts.wait,
|
|
667
|
+
subworkflowDepth: opts.subworkflowDepth,
|
|
177
668
|
};
|
|
178
669
|
this.store.saveNodeRun(nodeRun);
|
|
179
670
|
const run = this.store.getRun(runId);
|
|
@@ -209,6 +700,70 @@ export class RunTracker extends EventEmitter {
|
|
|
209
700
|
metrics: nodeMetrics,
|
|
210
701
|
});
|
|
211
702
|
}
|
|
703
|
+
/**
|
|
704
|
+
* Tier 1 idempotency cache hit. Marks the node as completed without
|
|
705
|
+
* having actually run, attaches the source-run/source-node lineage so
|
|
706
|
+
* Studio can render a CACHED badge with click-through, and emits a
|
|
707
|
+
* `NODE_CACHED` event so SSE subscribers see the short-circuit live.
|
|
708
|
+
*
|
|
709
|
+
* Caller is responsible for replaying the cached result through
|
|
710
|
+
* `PersistenceHelper.applyStepOutput` — this method only records the
|
|
711
|
+
* tracing side. Caching layers ABOVE persistence, never within it.
|
|
712
|
+
*/
|
|
713
|
+
markNodeCached(nodeRunId, source, outputs) {
|
|
714
|
+
const nodeRun = this.store.getNodeRun(nodeRunId);
|
|
715
|
+
if (!nodeRun)
|
|
716
|
+
return;
|
|
717
|
+
const finishedAt = Date.now();
|
|
718
|
+
const durationMs = finishedAt - nodeRun.startedAt;
|
|
719
|
+
// Security review FW-10 — the idempotency cache holds raw step
|
|
720
|
+
// output (correct: downstream steps need actual values to run),
|
|
721
|
+
// but trace storage of a cache hit must mirror the live-run path
|
|
722
|
+
// where `completeNode` calls `sanitize(ctx.response.data)`.
|
|
723
|
+
// Without this, a cached step's outputs row could contain raw
|
|
724
|
+
// `password`/`token` fields that the live run would have redacted.
|
|
725
|
+
const sanitizedOutputs = outputs === undefined ? undefined : sanitize(outputs);
|
|
726
|
+
this.store.updateNodeRun(nodeRunId, {
|
|
727
|
+
status: "completed",
|
|
728
|
+
finishedAt,
|
|
729
|
+
durationMs,
|
|
730
|
+
outputs: sanitizedOutputs,
|
|
731
|
+
cached: { ...source },
|
|
732
|
+
});
|
|
733
|
+
const run = this.store.getRun(nodeRun.runId);
|
|
734
|
+
if (run) {
|
|
735
|
+
this.store.updateRun(nodeRun.runId, {
|
|
736
|
+
completedNodes: run.completedNodes + 1,
|
|
737
|
+
});
|
|
738
|
+
}
|
|
739
|
+
this.emitEvent(nodeRun.runId, run?.workflowName || "", "NODE_CACHED", nodeRun.nodeName, nodeRunId, {
|
|
740
|
+
durationMs,
|
|
741
|
+
source: { ...source },
|
|
742
|
+
});
|
|
743
|
+
}
|
|
744
|
+
/**
|
|
745
|
+
* Tier 1 retry: record a single failed attempt before the next retry. The
|
|
746
|
+
* node stays in `running` status — `failNode` is the terminal call that
|
|
747
|
+
* fires only after `retry.maxAttempts` is exhausted.
|
|
748
|
+
*
|
|
749
|
+
* Per-node attempt history is capped at {@link MAX_STORED_ATTEMPTS} (10)
|
|
750
|
+
* to bound store growth on extreme retry counts. The cap matches the
|
|
751
|
+
* risk-register decision in `tier1-idempotency-replay-retry.md`.
|
|
752
|
+
*/
|
|
753
|
+
recordNodeAttemptFailed(nodeRunId, info) {
|
|
754
|
+
const nodeRun = this.store.getNodeRun(nodeRunId);
|
|
755
|
+
if (!nodeRun)
|
|
756
|
+
return;
|
|
757
|
+
const errorDetail = toRunErrorDetail(info.error);
|
|
758
|
+
const next = [...(nodeRun.attempts ?? []), { attempt: info.attempt, error: errorDetail, timestamp: Date.now() }];
|
|
759
|
+
const capped = next.length > MAX_STORED_ATTEMPTS ? next.slice(-MAX_STORED_ATTEMPTS) : next;
|
|
760
|
+
this.store.updateNodeRun(nodeRunId, { attempts: capped });
|
|
761
|
+
const run = this.store.getRun(nodeRun.runId);
|
|
762
|
+
this.emitEvent(nodeRun.runId, run?.workflowName || "", "NODE_ATTEMPT_FAILED", nodeRun.nodeName, nodeRunId, {
|
|
763
|
+
attempt: info.attempt,
|
|
764
|
+
error: errorDetail,
|
|
765
|
+
});
|
|
766
|
+
}
|
|
212
767
|
failNode(nodeRunId, error) {
|
|
213
768
|
const nodeRun = this.store.getNodeRun(nodeRunId);
|
|
214
769
|
if (!nodeRun)
|
|
@@ -290,9 +845,16 @@ export class RunTracker extends EventEmitter {
|
|
|
290
845
|
}
|
|
291
846
|
// === Logging ===
|
|
292
847
|
addLog(entry) {
|
|
848
|
+
// Security review FW-6 — pipe arbitrary log payload through
|
|
849
|
+
// the sensitive-field redactor before persisting or emitting.
|
|
850
|
+
// `ctx.logger.logLevel("warn", "x", { password: "..." })` lands
|
|
851
|
+
// here; without sanitize the secret would persist + stream via
|
|
852
|
+
// SSE to anyone with /__blok/runs/:id/events access.
|
|
853
|
+
const sanitizedData = entry.data === undefined ? undefined : sanitize(entry.data);
|
|
293
854
|
const log = {
|
|
294
855
|
id: `log_${uuid().replace(/-/g, "").slice(0, 12)}`,
|
|
295
856
|
...entry,
|
|
857
|
+
data: sanitizedData,
|
|
296
858
|
timestamp: Date.now(),
|
|
297
859
|
};
|
|
298
860
|
this.store.saveLog(log);
|
|
@@ -300,7 +862,7 @@ export class RunTracker extends EventEmitter {
|
|
|
300
862
|
this.emitEvent(entry.runId, run?.workflowName || "", "LOG_ENTRY", entry.nodeName, entry.nodeId, {
|
|
301
863
|
level: entry.level,
|
|
302
864
|
message: entry.message,
|
|
303
|
-
data:
|
|
865
|
+
data: sanitizedData,
|
|
304
866
|
});
|
|
305
867
|
}
|
|
306
868
|
// === Vars Updated ===
|
|
@@ -324,6 +886,14 @@ export class RunTracker extends EventEmitter {
|
|
|
324
886
|
getEvents(runId, since) {
|
|
325
887
|
return this.store.getEvents(runId, since);
|
|
326
888
|
}
|
|
889
|
+
/**
|
|
890
|
+
* Tier 2 sub-workflow lineage. Returns every run that was started by
|
|
891
|
+
* a `subworkflow:` step inside the given parent run. Powers Studio's
|
|
892
|
+
* "Sub-runs" list on a parent's run-detail page.
|
|
893
|
+
*/
|
|
894
|
+
getRunsByParent(parentRunId) {
|
|
895
|
+
return this.store.getRunsByParent(parentRunId);
|
|
896
|
+
}
|
|
327
897
|
getLogs(runId, nodeId) {
|
|
328
898
|
return this.store.getLogs(runId, nodeId);
|
|
329
899
|
}
|