@blokjs/runner 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/dist/Configuration.d.ts +18 -0
  2. package/dist/Configuration.js +151 -4
  3. package/dist/Configuration.js.map +1 -1
  4. package/dist/PayloadTooLargeError.d.ts +19 -0
  5. package/dist/PayloadTooLargeError.js +29 -0
  6. package/dist/PayloadTooLargeError.js.map +1 -0
  7. package/dist/RunCancelledError.d.ts +17 -0
  8. package/dist/RunCancelledError.js +25 -0
  9. package/dist/RunCancelledError.js.map +1 -0
  10. package/dist/RunnerSteps.js +330 -33
  11. package/dist/RunnerSteps.js.map +1 -1
  12. package/dist/SubworkflowNode.d.ts +75 -0
  13. package/dist/SubworkflowNode.js +221 -0
  14. package/dist/SubworkflowNode.js.map +1 -0
  15. package/dist/TriggerBase.d.ts +128 -0
  16. package/dist/TriggerBase.js +773 -4
  17. package/dist/TriggerBase.js.map +1 -1
  18. package/dist/WaitDispatchRequest.d.ts +38 -0
  19. package/dist/WaitDispatchRequest.js +13 -0
  20. package/dist/WaitDispatchRequest.js.map +1 -0
  21. package/dist/WaitNode.d.ts +23 -0
  22. package/dist/WaitNode.js +26 -0
  23. package/dist/WaitNode.js.map +1 -0
  24. package/dist/concurrency/ConcurrencyBackend.d.ts +61 -0
  25. package/dist/concurrency/ConcurrencyBackend.js +20 -0
  26. package/dist/concurrency/ConcurrencyBackend.js.map +1 -0
  27. package/dist/concurrency/ConcurrencyLimitError.d.ts +37 -0
  28. package/dist/concurrency/ConcurrencyLimitError.js +16 -0
  29. package/dist/concurrency/ConcurrencyLimitError.js.map +1 -0
  30. package/dist/concurrency/NatsKvConcurrencyBackend.d.ts +64 -0
  31. package/dist/concurrency/NatsKvConcurrencyBackend.js +297 -0
  32. package/dist/concurrency/NatsKvConcurrencyBackend.js.map +1 -0
  33. package/dist/concurrency/QueueExpiredError.d.ts +40 -0
  34. package/dist/concurrency/QueueExpiredError.js +15 -0
  35. package/dist/concurrency/QueueExpiredError.js.map +1 -0
  36. package/dist/concurrency/createConcurrencyBackend.d.ts +23 -0
  37. package/dist/concurrency/createConcurrencyBackend.js +34 -0
  38. package/dist/concurrency/createConcurrencyBackend.js.map +1 -0
  39. package/dist/concurrency/readConcurrencyConfig.d.ts +60 -0
  40. package/dist/concurrency/readConcurrencyConfig.js +60 -0
  41. package/dist/concurrency/readConcurrencyConfig.js.map +1 -0
  42. package/dist/idempotency/resolveIdempotencyKey.d.ts +20 -0
  43. package/dist/idempotency/resolveIdempotencyKey.js +37 -0
  44. package/dist/idempotency/resolveIdempotencyKey.js.map +1 -0
  45. package/dist/index.d.ts +23 -3
  46. package/dist/index.js +47 -2
  47. package/dist/index.js.map +1 -1
  48. package/dist/monitoring/ConcurrencyMetrics.d.ts +56 -0
  49. package/dist/monitoring/ConcurrencyMetrics.js +107 -0
  50. package/dist/monitoring/ConcurrencyMetrics.js.map +1 -0
  51. package/dist/monitoring/JanitorMetrics.d.ts +27 -0
  52. package/dist/monitoring/JanitorMetrics.js +48 -0
  53. package/dist/monitoring/JanitorMetrics.js.map +1 -0
  54. package/dist/scheduling/DebounceCoordinator.d.ts +88 -0
  55. package/dist/scheduling/DebounceCoordinator.js +141 -0
  56. package/dist/scheduling/DebounceCoordinator.js.map +1 -0
  57. package/dist/scheduling/DeferredDispatchSignal.d.ts +50 -0
  58. package/dist/scheduling/DeferredDispatchSignal.js +14 -0
  59. package/dist/scheduling/DeferredDispatchSignal.js.map +1 -0
  60. package/dist/scheduling/DeferredRunScheduler.d.ts +68 -0
  61. package/dist/scheduling/DeferredRunScheduler.js +154 -0
  62. package/dist/scheduling/DeferredRunScheduler.js.map +1 -0
  63. package/dist/scheduling/readSchedulingConfig.d.ts +24 -0
  64. package/dist/scheduling/readSchedulingConfig.js +52 -0
  65. package/dist/scheduling/readSchedulingConfig.js.map +1 -0
  66. package/dist/timeouts/StepTimeoutError.d.ts +22 -0
  67. package/dist/timeouts/StepTimeoutError.js +31 -0
  68. package/dist/timeouts/StepTimeoutError.js.map +1 -0
  69. package/dist/tracing/InMemoryRunStore.d.ts +28 -1
  70. package/dist/tracing/InMemoryRunStore.js +150 -0
  71. package/dist/tracing/InMemoryRunStore.js.map +1 -1
  72. package/dist/tracing/Janitor.d.ts +70 -0
  73. package/dist/tracing/Janitor.js +150 -0
  74. package/dist/tracing/Janitor.js.map +1 -0
  75. package/dist/tracing/PostgresRunStore.d.ts +30 -0
  76. package/dist/tracing/PostgresRunStore.js +435 -3
  77. package/dist/tracing/PostgresRunStore.js.map +1 -1
  78. package/dist/tracing/RunStore.d.ts +100 -1
  79. package/dist/tracing/RunTracker.d.ts +238 -9
  80. package/dist/tracing/RunTracker.js +571 -1
  81. package/dist/tracing/RunTracker.js.map +1 -1
  82. package/dist/tracing/SqliteRunStore.d.ts +23 -1
  83. package/dist/tracing/SqliteRunStore.js +405 -6
  84. package/dist/tracing/SqliteRunStore.js.map +1 -1
  85. package/dist/tracing/TraceRouter.d.ts +20 -2
  86. package/dist/tracing/TraceRouter.js +249 -5
  87. package/dist/tracing/TraceRouter.js.map +1 -1
  88. package/dist/tracing/sanitize.d.ts +11 -0
  89. package/dist/tracing/sanitize.js +29 -0
  90. package/dist/tracing/sanitize.js.map +1 -1
  91. package/dist/tracing/types.d.ts +348 -2
  92. package/dist/utils/createChildContext.d.ts +32 -0
  93. package/dist/utils/createChildContext.js +113 -0
  94. package/dist/utils/createChildContext.js.map +1 -0
  95. package/dist/workflow/WorkflowNormalizer.d.ts +29 -41
  96. package/dist/workflow/WorkflowNormalizer.js +182 -0
  97. package/dist/workflow/WorkflowNormalizer.js.map +1 -1
  98. package/dist/workflow/WorkflowRegistry.d.ts +64 -0
  99. package/dist/workflow/WorkflowRegistry.js +81 -0
  100. package/dist/workflow/WorkflowRegistry.js.map +1 -0
  101. package/package.json +3 -3
@@ -5,6 +5,31 @@ import https from "node:https";
5
5
  import { v4 as uuid } from "uuid";
6
6
  import { InMemoryRunStore } from "./InMemoryRunStore";
7
7
  import { createStore } from "./createStore";
8
+ import { sanitize } from "./sanitize";
9
/**
 * Maximum number of failed-attempt records retained on one node run's
 * `attempts` array. When a node accumulates more failures than this, only
 * the most recent entries are kept, so an extreme retry count cannot grow
 * the run store without bound.
 */
const MAX_STORED_ATTEMPTS = 10;
16
/**
 * Run statuses that are final. Status-flipping methods consult this set
 * before writing: once a run holds one of these values, a late-arriving
 * transition (for example a completion reported by a runner that never saw
 * a parallel cancel) is ignored instead of overwriting the recorded outcome.
 *
 * Note that `queued`, `delayed`, `debounced` and `running` are deliberately
 * absent — they are states a run can still leave.
 */
const TERMINAL_STATUSES = new Set(["completed", "failed", "cancelled", "throttled", "expired", "crashed", "timedOut"]);
8
33
  /**
9
34
  * Build a {@link RunErrorDetail} from any thrown error. When the source is
10
35
  * a typed `BlokError` (master plan §17), all 17+ structured fields are
@@ -115,6 +140,14 @@ export class RunTracker extends EventEmitter {
115
140
  tags: opts.tags,
116
141
  metadata: opts.metadata,
117
142
  environment,
143
+ replayOf: opts.replayOf,
144
+ parentRunId: opts.parentRunId,
145
+ parentNodeRunId: opts.parentNodeRunId,
146
+ scheduledAt: opts.scheduledAt,
147
+ expiresAt: opts.expiresAt,
148
+ debounceKey: opts.debounceKey,
149
+ debounceMode: opts.debounceMode,
150
+ pingCount: opts.pingCount,
118
151
  };
119
152
  this.store.saveRun(run);
120
153
  this.emitEvent(run.id, run.workflowName, "RUN_STARTED", undefined, undefined, {
@@ -130,6 +163,13 @@ export class RunTracker extends EventEmitter {
130
163
  const run = this.store.getRun(runId);
131
164
  if (!run)
132
165
  return;
166
+ // PR 1 follow-up · terminal-status guard. Don't overwrite a run that
167
+ // has already reached a terminal status (cancelled / expired / etc.)
168
+ // — a late completeRun from a runner that didn't see a parallel
169
+ // cancel must not flip the status back. Defense in depth against the
170
+ // REVIEW.md A2 class of bug.
171
+ if (TERMINAL_STATUSES.has(run.status))
172
+ return;
133
173
  const finishedAt = Date.now();
134
174
  const durationMs = finishedAt - run.startedAt;
135
175
  this.store.updateRun(runId, {
@@ -147,6 +187,9 @@ export class RunTracker extends EventEmitter {
147
187
  const run = this.store.getRun(runId);
148
188
  if (!run)
149
189
  return;
190
+ // PR 1 follow-up · terminal-status guard. Same rationale as completeRun.
191
+ if (TERMINAL_STATUSES.has(run.status))
192
+ return;
150
193
  const finishedAt = Date.now();
151
194
  const durationMs = finishedAt - run.startedAt;
152
195
  this.store.updateRun(runId, {
@@ -160,6 +203,452 @@ export class RunTracker extends EventEmitter {
160
203
  error: toRunErrorDetail(error),
161
204
  });
162
205
  }
206
+ /**
207
+ * Tier 2 #6 — mark a run as throttled because the concurrency gate
208
+ * denied it before any step executed. Distinct from `failRun` because
209
+ * no step ran; nothing produced an error. Studio surfaces a Throttled
210
+ * badge and SSE subscribers see a granular `RUN_THROTTLED` event.
211
+ */
212
+ markRunThrottled(runId, info) {
213
+ const run = this.store.getRun(runId);
214
+ if (!run)
215
+ return;
216
+ // Review fix-up · BUG-1. Don't overwrite a terminal status. A
217
+ // concurrent operator-cancel or crash auto-flip might have flipped
218
+ // the run between read and write; preserve the earlier terminal
219
+ // outcome rather than re-marking as throttled.
220
+ if (TERMINAL_STATUSES.has(run.status))
221
+ return;
222
+ const finishedAt = Date.now();
223
+ const durationMs = finishedAt - run.startedAt;
224
+ this.store.updateRun(runId, {
225
+ status: "throttled",
226
+ finishedAt,
227
+ durationMs,
228
+ });
229
+ this.emitEvent(runId, run.workflowName, "RUN_THROTTLED", undefined, undefined, {
230
+ durationMs,
231
+ concurrencyKey: info.concurrencyKey,
232
+ concurrencyLimit: info.concurrencyLimit,
233
+ currentInFlight: info.currentInFlight,
234
+ });
235
+ }
236
+ /**
237
+ * Tier 2 #6 follow-up — mark a run as queued because the concurrency
238
+ * gate denied it AND the trigger is configured with `onLimit: "queue"`.
239
+ * The run will be re-attempted after `scheduledAt`; `scheduledAt` is
240
+ * persisted on the run record so Studio can render a "queued · retries
241
+ * at <time>" badge.
242
+ *
243
+ * Distinct from `markRunThrottled` because queued runs WILL eventually
244
+ * execute (or stay queued indefinitely until a slot frees), while
245
+ * throttled runs are terminal and `failRun` semantics are skipped.
246
+ *
247
+ * Caller is responsible for actually scheduling the retry via
248
+ * `DeferredRunScheduler`. This method only flips status + emits the
249
+ * `RUN_QUEUED` event. Re-marking with a later `scheduledAt` updates
250
+ * the field (used when re-defer happens after a timer-fired re-acquire
251
+ * also fails).
252
+ */
253
+ markRunQueued(runId, info) {
254
+ const run = this.store.getRun(runId);
255
+ if (!run)
256
+ return;
257
+ // Review fix-up · BUG-1. Don't overwrite a terminal status (e.g.,
258
+ // `cancelled` from a concurrent operator-cancel during the
259
+ // onLimit:queue re-defer race). The TTL-expired path is handled
260
+ // separately in TriggerBase via QueueExpiredError.
261
+ if (TERMINAL_STATUSES.has(run.status))
262
+ return;
263
+ this.store.updateRun(runId, {
264
+ status: "queued",
265
+ scheduledAt: info.scheduledAt,
266
+ });
267
+ this.emitEvent(runId, run.workflowName, "RUN_QUEUED", undefined, undefined, {
268
+ concurrencyKey: info.concurrencyKey,
269
+ concurrencyLimit: info.concurrencyLimit,
270
+ currentInFlight: info.currentInFlight,
271
+ scheduledAt: info.scheduledAt,
272
+ });
273
+ }
274
+ // === Scheduling lifecycle (Tier 2 #5 + #7) ===
275
+ /**
276
+ * Tier 2 #5 — mark a run as `delayed`. Called immediately after
277
+ * `startRun` for runs that should be deferred. The run record carries
278
+ * `scheduledAt` (and optionally `expiresAt`) so Studio can render a
279
+ * "Delayed → fires at <time>" badge.
280
+ *
281
+ * Caller is responsible for actually scheduling the dispatch via
282
+ * `DeferredRunScheduler`. This method only flips status + emits the
283
+ * `RUN_DELAYED` event.
284
+ */
285
+ markRunDelayed(runId, info) {
286
+ const run = this.store.getRun(runId);
287
+ if (!run)
288
+ return;
289
+ // Review fix-up · BUG-1. Don't overwrite a terminal status — e.g.,
290
+ // a wait.for() re-entry race where the operator cancelled the run
291
+ // while WaitDispatchRequest was being thrown.
292
+ if (TERMINAL_STATUSES.has(run.status))
293
+ return;
294
+ this.store.updateRun(runId, {
295
+ status: "delayed",
296
+ scheduledAt: info.scheduledAt,
297
+ expiresAt: info.expiresAt,
298
+ });
299
+ this.emitEvent(runId, run.workflowName, "RUN_DELAYED", undefined, undefined, {
300
+ scheduledAt: info.scheduledAt,
301
+ delayMs: info.delayMs,
302
+ expiresAt: info.expiresAt,
303
+ });
304
+ }
305
+ /**
306
+ * Tier 2 #5 — mark a run as `expired` because its TTL was exceeded
307
+ * before dispatch. Distinct from `failed` (no step ran) and
308
+ * `cancelled` (operator action — TTL is automatic).
309
+ */
310
+ markRunExpired(runId, info) {
311
+ const run = this.store.getRun(runId);
312
+ if (!run)
313
+ return;
314
+ // Review fix-up · BUG-1. Don't overwrite a terminal status. A
315
+ // run that was cancelled by an operator before the dispatch timer
316
+ // fired should stay `cancelled`, not flip to `expired`.
317
+ if (TERMINAL_STATUSES.has(run.status))
318
+ return;
319
+ const finishedAt = info.expiredAt;
320
+ const durationMs = finishedAt - run.startedAt;
321
+ const lateBy = info.expiredAt - info.expiresAt;
322
+ this.store.updateRun(runId, {
323
+ status: "expired",
324
+ finishedAt,
325
+ durationMs,
326
+ });
327
+ this.emitEvent(runId, run.workflowName, "RUN_EXPIRED", undefined, undefined, {
328
+ expiresAt: info.expiresAt,
329
+ expiredAt: info.expiredAt,
330
+ lateBy,
331
+ });
332
+ }
333
+ /**
334
+ * Tier 2 #7 — mark a run as `debounced`. In **leading** mode this is
335
+ * terminal: the ping was suppressed because a sibling fired
336
+ * immediately (`intoRunId` carries the sibling's id). In **trailing**
337
+ * mode this is transient: the same run is marked `debounced` while
338
+ * the timer is active and flips to `running` when the window closes
339
+ * (no separate transition method needed — `tracker` updates status
340
+ * directly via store before invoking the runner).
341
+ */
342
+ markRunDebounced(runId, info) {
343
+ const run = this.store.getRun(runId);
344
+ if (!run)
345
+ return;
346
+ // Review fix-up · BUG-1. Don't overwrite a terminal status. A
347
+ // trailing debounce timer firing into a cancelled active run
348
+ // should NOT flip the run back to debounced.
349
+ if (TERMINAL_STATUSES.has(run.status))
350
+ return;
351
+ const isTerminal = info.mode === "leading" && info.intoRunId !== undefined;
352
+ const finishedAt = isTerminal ? Date.now() : undefined;
353
+ const durationMs = isTerminal && finishedAt ? finishedAt - run.startedAt : undefined;
354
+ this.store.updateRun(runId, {
355
+ status: "debounced",
356
+ debounceKey: info.debounceKey,
357
+ debounceMode: info.mode,
358
+ pingCount: info.pingCount,
359
+ scheduledAt: info.scheduledAt,
360
+ ...(isTerminal ? { finishedAt, durationMs } : {}),
361
+ });
362
+ this.emitEvent(runId, run.workflowName, "RUN_DEBOUNCED", undefined, undefined, {
363
+ debounceKey: info.debounceKey,
364
+ mode: info.mode,
365
+ intoRunId: info.intoRunId,
366
+ pingCount: info.pingCount,
367
+ scheduledAt: info.scheduledAt,
368
+ });
369
+ }
370
+ /**
371
+ * Tier 2 quick-wins — mark a run as `crashed` (uncaught exception,
372
+ * OOM, signal). Distinct from `failRun` because the failure was
373
+ * NOT a step's `process()` throwing — it was the runner itself
374
+ * giving up. Currently manual; call from custom triggers / ops
375
+ * harnesses when uncaught failures are detected.
376
+ */
377
+ markRunCrashed(runId, info) {
378
+ const run = this.store.getRun(runId);
379
+ if (!run)
380
+ return;
381
+ // Review fix-up · BUG-1. Don't overwrite a terminal status. A
382
+ // run that was already cancelled / failed / timedOut shouldn't
383
+ // be flipped to crashed by the boot orphan-recovery pass.
384
+ if (TERMINAL_STATUSES.has(run.status))
385
+ return;
386
+ const finishedAt = Date.now();
387
+ const durationMs = finishedAt - run.startedAt;
388
+ this.store.updateRun(runId, {
389
+ status: "crashed",
390
+ finishedAt,
391
+ durationMs,
392
+ error: toRunErrorDetail(info.error),
393
+ });
394
+ this.emitEvent(runId, run.workflowName, "RUN_CRASHED", undefined, undefined, {
395
+ durationMs,
396
+ error: toRunErrorDetail(info.error),
397
+ });
398
+ }
399
+ /**
400
+ * Tier 2 quick-wins follow-up — bulk-flip every run currently in
401
+ * `running` status to `crashed`. Returns the count flipped.
402
+ *
403
+ * Used by:
404
+ * - Process-level uncaught-exception handlers
405
+ * (`TriggerBase.installCrashHandlers`) — flip in-flight runs
406
+ * before the process dies.
407
+ * - Boot recovery (`TriggerBase.recoverOrphanedRuns`) — flip runs
408
+ * that were `running` from the previous (dead) process.
409
+ *
410
+ * Synchronous + safe to call from a `process.on("uncaughtException")`
411
+ * handler (which can't await). Backed by sync sqlite/in-memory
412
+ * writes that complete before the handler returns.
413
+ *
414
+ * Optional `opts.maxStartedAt` filter — only flip runs whose
415
+ * `startedAt` is at or before this timestamp. Used by boot recovery
416
+ * to avoid flipping runs from the current (live) process.
417
+ */
418
+ markAllRunningRunsAsCrashed(error, opts) {
419
+ // PR 1 follow-up · A1 fix. `getRuns` defaults `opts?.limit ?? 50` in
420
+ // SqliteRunStore — left unbounded, this method silently flips at
421
+ // most 50 orphans per call. Loop until the store returns fewer rows
422
+ // than the page size (= no more matches under the LIMIT).
423
+ //
424
+ // Bounded outer loop: cap at 1000 iterations defensively. With the
425
+ // 50-row page size that's 50K orphans handled per single call —
426
+ // well above any realistic boot-recovery scenario.
427
+ let totalFlipped = 0;
428
+ const PAGE_SIZE = 50; // mirrors SqliteRunStore.getRuns default LIMIT
429
+ const MAX_PAGES = 1000;
430
+ for (let page = 0; page < MAX_PAGES; page++) {
431
+ // Snapshot the runs first — markRunCrashed mutates the store and
432
+ // could perturb iteration if we read+update inline.
433
+ const { runs } = this.store.getRuns({ status: "running" });
434
+ const candidates = opts?.maxStartedAt !== undefined ? runs.filter((r) => r.startedAt <= opts.maxStartedAt) : runs;
435
+ if (candidates.length === 0)
436
+ break;
437
+ for (const run of candidates) {
438
+ this.markRunCrashed(run.id, { error });
439
+ }
440
+ totalFlipped += candidates.length;
441
+ // If we got fewer rows than the page size, the store has no more
442
+ // matches under the LIMIT — exit early.
443
+ if (runs.length < PAGE_SIZE)
444
+ break;
445
+ }
446
+ return totalFlipped;
447
+ }
448
+ /**
449
+ * Tier 2 quick-wins — mark a run as `timedOut` because a step's
450
+ * final retry attempt exceeded its `maxDuration` cap. Distinct
451
+ * from `failed` so SLA dashboards can separate timeout-driven
452
+ * failures (network / capacity) from logic failures (bugs).
453
+ * Auto-called by `RunnerSteps` on final-attempt `StepTimeoutError`.
454
+ */
455
+ markRunTimedOut(runId, info) {
456
+ const run = this.store.getRun(runId);
457
+ if (!run)
458
+ return;
459
+ // Review fix-up · BUG-1. Don't overwrite a terminal status — a
460
+ // run that was cancelled mid-step shouldn't flip to timedOut
461
+ // when the maxDuration timer fires after the cancel.
462
+ if (TERMINAL_STATUSES.has(run.status))
463
+ return;
464
+ const finishedAt = Date.now();
465
+ const durationMs = finishedAt - run.startedAt;
466
+ this.store.updateRun(runId, {
467
+ status: "timedOut",
468
+ finishedAt,
469
+ durationMs,
470
+ });
471
+ this.emitEvent(runId, run.workflowName, "RUN_TIMED_OUT", undefined, undefined, {
472
+ durationMs,
473
+ stepId: info.stepId,
474
+ maxDurationMs: info.maxDurationMs,
475
+ attemptsExhausted: info.attemptsExhausted,
476
+ });
477
+ }
478
+ /**
479
+ * Tier 2 #7 — record an additional ping into an existing trailing-mode
480
+ * debounce window. Increments `pingCount` and updates `scheduledAt`.
481
+ * Does NOT emit a new event (avoid event-stream bloat under burst).
482
+ */
483
+ recordDebouncePing(runId, opts) {
484
+ const run = this.store.getRun(runId);
485
+ if (!run)
486
+ return;
487
+ this.store.updateRun(runId, {
488
+ pingCount: opts.pingCount,
489
+ scheduledAt: opts.scheduledAt,
490
+ });
491
+ }
492
+ /**
493
+ * Tier 2 #7 — transition a `delayed`/`debounced` run into `running`
494
+ * when its timer fires. Studio sees the status change via the
495
+ * existing run-update SSE stream.
496
+ */
497
+ transitionRunToRunning(runId) {
498
+ const run = this.store.getRun(runId);
499
+ if (!run)
500
+ return;
501
+ this.store.updateRun(runId, {
502
+ status: "running",
503
+ startedAt: run.startedAt, // preserve the original submission time
504
+ });
505
+ }
506
+ /**
507
+ * Tier 2 polish — cancel a pending (delayed/debounced/queued) run.
508
+ * Idempotent. Returns true when the run existed AND was in a cancellable
509
+ * state; false when the run doesn't exist OR is already running/completed/
510
+ * failed/throttled/expired/crashed/timedOut/cancelled.
511
+ *
512
+ * **Caller responsibility**: this method only updates the run record
513
+ * (status → `"cancelled"`) and emits `RUN_CANCELLED`. The caller must
514
+ * separately clear any pending scheduler timers via
515
+ * `DeferredRunScheduler.getInstance().cancel(runId)` and (when applicable)
516
+ * `DebounceCoordinator.getInstance().cancel(workflowName, debounceKey)`.
517
+ * Done this way to avoid an import cycle from tracing → scheduling.
518
+ */
519
+ cancelRun(runId, options) {
520
+ const run = this.store.getRun(runId);
521
+ if (!run)
522
+ return false;
523
+ // Tier 2 follow-up · "running" added so cooperative AbortSignal
524
+ // cancellation can flip status to "cancelled" before the in-flight
525
+ // step throws `RunCancelledError`. The tracker's `abortRunningRun`
526
+ // calls this method right after firing the AbortController.
527
+ const cancellable = ["delayed", "debounced", "queued", "running"];
528
+ if (!cancellable.includes(run.status))
529
+ return false;
530
+ const previousStatus = run.status;
531
+ const finishedAt = Date.now();
532
+ const durationMs = finishedAt - run.startedAt;
533
+ this.store.updateRun(runId, {
534
+ status: "cancelled",
535
+ finishedAt,
536
+ durationMs,
537
+ });
538
+ this.emitEvent(runId, run.workflowName, "RUN_CANCELLED", undefined, undefined, {
539
+ durationMs,
540
+ previousStatus,
541
+ });
542
+ // PR 5 G1 — cascade to fire-and-forget children. Sub-workflow
543
+ // children with `wait: true` (sync) cancel automatically via the
544
+ // AbortSignal chain in createChildContext; children with
545
+ // `wait: false` (async / fire-and-forget) need explicit cascade
546
+ // because the parent step has already returned before the cancel.
547
+ // Walk getRunsByParent recursively (bounded by
548
+ // BLOK_MAX_SUBWORKFLOW_DEPTH).
549
+ if (options?.cascade !== false) {
550
+ const children = this.store.getRunsByParent(runId);
551
+ for (const child of children) {
552
+ if (cancellable.includes(child.status)) {
553
+ // Recursive — bounded by max-depth; each level reduces
554
+ // the candidate pool until none remain.
555
+ this.cancelRun(child.id, { cascade: true });
556
+ }
557
+ }
558
+ }
559
+ return true;
560
+ }
561
+ // === Cooperative cancellation (Tier 2 follow-up) ===
562
+ /**
563
+ * Per-process map from runId to the AbortController owned by the
564
+ * trigger's createContext call. Populated by TriggerBase right after
565
+ * `startRun()`; cleared in TriggerBase's finally block. Used by
566
+ * `abortRunningRun` to fire the signal when an operator cancels a
567
+ * `running` run via the cancel API.
568
+ */
569
+ abortControllers = new Map();
570
+ registerAbortController(runId, controller) {
571
+ this.abortControllers.set(runId, controller);
572
+ }
573
+ unregisterAbortController(runId) {
574
+ this.abortControllers.delete(runId);
575
+ }
576
+ /**
577
+ * Tier 2 follow-up · cooperative cancellation for `running` runs.
578
+ *
579
+ * Fires the run's AbortController (so `ctx.signal.aborted` becomes
580
+ * true and any node consulting it can abort early) AND flips the run
581
+ * status to `"cancelled"` immediately via `cancelRun`. RunnerSteps'
582
+ * between-step abort check throws `RunCancelledError` shortly after,
583
+ * which TriggerBase catches without re-flipping the status.
584
+ *
585
+ * Returns true when an AbortController was registered for this run
586
+ * AND the status was successfully flipped; false otherwise (run not
587
+ * found, run not in `running` status, or no controller registered —
588
+ * e.g. controller already cleaned up).
589
+ */
590
+ abortRunningRun(runId) {
591
+ const run = this.store.getRun(runId);
592
+ if (!run || run.status !== "running")
593
+ return false;
594
+ const controller = this.abortControllers.get(runId);
595
+ if (controller) {
596
+ try {
597
+ controller.abort();
598
+ }
599
+ catch {
600
+ // AbortController.abort never throws on first call; double-abort is safe.
601
+ }
602
+ }
603
+ // Flip status now so polls return cancelled immediately. The
604
+ // in-flight step's throw will land in TriggerBase.run's catch
605
+ // shortly; the catch sees status is already terminal and skips
606
+ // failRun (RunCancelledError instanceof check).
607
+ return this.cancelRun(runId);
608
+ }
609
+ // === Concurrency gate pass-throughs (Tier 2 #6) ===
610
+ /**
611
+ * Tier 2 #6 follow-up · cross-process concurrency backend.
612
+ *
613
+ * When set (via {@link setConcurrencyBackend}), the tracker's
614
+ * `acquireConcurrencySlot` and `releaseConcurrencySlot` methods
615
+ * delegate to the backend instead of the local sync `RunStore` impl.
616
+ * Used to coordinate across processes via NATS KV / Redis.
617
+ *
618
+ * Default `null` — preserves zero-overhead in-process behavior.
619
+ * Trigger packages install a backend in `listen()` when the operator
620
+ * sets `BLOK_CONCURRENCY_BACKEND=nats-kv`.
621
+ */
622
+ concurrencyBackend = null;
623
+ setConcurrencyBackend(backend) {
624
+ this.concurrencyBackend = backend;
625
+ }
626
+ getConcurrencyBackend() {
627
+ return this.concurrencyBackend;
628
+ }
629
+ /**
630
+ * Acquire a concurrency slot for `(workflowName, concurrencyKey)`.
631
+ * Delegates to the configured cross-process backend when set; falls
632
+ * back to the local sync `RunStore` impl otherwise.
633
+ *
634
+ * Async — the cross-process backend (NATS KV) is async-only. The
635
+ * sync fallback is wrapped in `Promise.resolve()` so the call site
636
+ * is uniform.
637
+ */
638
+ async acquireConcurrencySlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt) {
639
+ if (this.concurrencyBackend) {
640
+ return this.concurrencyBackend.acquireSlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt);
641
+ }
642
+ return this.store.acquireConcurrencySlot(workflowName, concurrencyKey, concurrencyLimit, runId, leaseExpiresAt);
643
+ }
644
+ /** Release a slot acquired via `acquireConcurrencySlot`. Idempotent. */
645
+ async releaseConcurrencySlot(workflowName, concurrencyKey, runId) {
646
+ if (this.concurrencyBackend) {
647
+ await this.concurrencyBackend.releaseSlot(workflowName, concurrencyKey, runId);
648
+ return;
649
+ }
650
+ this.store.releaseConcurrencySlot(workflowName, concurrencyKey, runId);
651
+ }
163
652
  // === Node Lifecycle ===
164
653
  startNode(runId, opts) {
165
654
  const nodeRun = {
@@ -174,6 +663,8 @@ export class RunTracker extends EventEmitter {
174
663
  parentNodeId: opts.parentNodeId,
175
664
  depth: opts.depth,
176
665
  stepIndex: opts.stepIndex,
666
+ wait: opts.wait,
667
+ subworkflowDepth: opts.subworkflowDepth,
177
668
  };
178
669
  this.store.saveNodeRun(nodeRun);
179
670
  const run = this.store.getRun(runId);
@@ -209,6 +700,70 @@ export class RunTracker extends EventEmitter {
209
700
  metrics: nodeMetrics,
210
701
  });
211
702
  }
703
+ /**
704
+ * Tier 1 idempotency cache hit. Marks the node as completed without
705
+ * having actually run, attaches the source-run/source-node lineage so
706
+ * Studio can render a CACHED badge with click-through, and emits a
707
+ * `NODE_CACHED` event so SSE subscribers see the short-circuit live.
708
+ *
709
+ * Caller is responsible for replaying the cached result through
710
+ * `PersistenceHelper.applyStepOutput` — this method only records the
711
+ * tracing side. Caching layers ABOVE persistence, never within it.
712
+ */
713
+ markNodeCached(nodeRunId, source, outputs) {
714
+ const nodeRun = this.store.getNodeRun(nodeRunId);
715
+ if (!nodeRun)
716
+ return;
717
+ const finishedAt = Date.now();
718
+ const durationMs = finishedAt - nodeRun.startedAt;
719
+ // Security review FW-10 — the idempotency cache holds raw step
720
+ // output (correct: downstream steps need actual values to run),
721
+ // but trace storage of a cache hit must mirror the live-run path
722
+ // where `completeNode` calls `sanitize(ctx.response.data)`.
723
+ // Without this, a cached step's outputs row could contain raw
724
+ // `password`/`token` fields that the live run would have redacted.
725
+ const sanitizedOutputs = outputs === undefined ? undefined : sanitize(outputs);
726
+ this.store.updateNodeRun(nodeRunId, {
727
+ status: "completed",
728
+ finishedAt,
729
+ durationMs,
730
+ outputs: sanitizedOutputs,
731
+ cached: { ...source },
732
+ });
733
+ const run = this.store.getRun(nodeRun.runId);
734
+ if (run) {
735
+ this.store.updateRun(nodeRun.runId, {
736
+ completedNodes: run.completedNodes + 1,
737
+ });
738
+ }
739
+ this.emitEvent(nodeRun.runId, run?.workflowName || "", "NODE_CACHED", nodeRun.nodeName, nodeRunId, {
740
+ durationMs,
741
+ source: { ...source },
742
+ });
743
+ }
744
+ /**
745
+ * Tier 1 retry: record a single failed attempt before the next retry. The
746
+ * node stays in `running` status — `failNode` is the terminal call that
747
+ * fires only after `retry.maxAttempts` is exhausted.
748
+ *
749
+ * Per-node attempt history is capped at {@link MAX_STORED_ATTEMPTS} (10)
750
+ * to bound store growth on extreme retry counts. The cap matches the
751
+ * risk-register decision in `tier1-idempotency-replay-retry.md`.
752
+ */
753
+ recordNodeAttemptFailed(nodeRunId, info) {
754
+ const nodeRun = this.store.getNodeRun(nodeRunId);
755
+ if (!nodeRun)
756
+ return;
757
+ const errorDetail = toRunErrorDetail(info.error);
758
+ const next = [...(nodeRun.attempts ?? []), { attempt: info.attempt, error: errorDetail, timestamp: Date.now() }];
759
+ const capped = next.length > MAX_STORED_ATTEMPTS ? next.slice(-MAX_STORED_ATTEMPTS) : next;
760
+ this.store.updateNodeRun(nodeRunId, { attempts: capped });
761
+ const run = this.store.getRun(nodeRun.runId);
762
+ this.emitEvent(nodeRun.runId, run?.workflowName || "", "NODE_ATTEMPT_FAILED", nodeRun.nodeName, nodeRunId, {
763
+ attempt: info.attempt,
764
+ error: errorDetail,
765
+ });
766
+ }
212
767
  failNode(nodeRunId, error) {
213
768
  const nodeRun = this.store.getNodeRun(nodeRunId);
214
769
  if (!nodeRun)
@@ -290,9 +845,16 @@ export class RunTracker extends EventEmitter {
290
845
  }
291
846
  // === Logging ===
292
847
  addLog(entry) {
848
+ // Security review FW-6 — pipe arbitrary log payload through
849
+ // the sensitive-field redactor before persisting or emitting.
850
+ // `ctx.logger.logLevel("warn", "x", { password: "..." })` lands
851
+ // here; without sanitize the secret would persist + stream via
852
+ // SSE to anyone with /__blok/runs/:id/events access.
853
+ const sanitizedData = entry.data === undefined ? undefined : sanitize(entry.data);
293
854
  const log = {
294
855
  id: `log_${uuid().replace(/-/g, "").slice(0, 12)}`,
295
856
  ...entry,
857
+ data: sanitizedData,
296
858
  timestamp: Date.now(),
297
859
  };
298
860
  this.store.saveLog(log);
@@ -300,7 +862,7 @@ export class RunTracker extends EventEmitter {
300
862
  this.emitEvent(entry.runId, run?.workflowName || "", "LOG_ENTRY", entry.nodeName, entry.nodeId, {
301
863
  level: entry.level,
302
864
  message: entry.message,
303
- data: entry.data,
865
+ data: sanitizedData,
304
866
  });
305
867
  }
306
868
  // === Vars Updated ===
@@ -324,6 +886,14 @@ export class RunTracker extends EventEmitter {
324
886
  getEvents(runId, since) {
325
887
  return this.store.getEvents(runId, since);
326
888
  }
889
+ /**
890
+ * Tier 2 sub-workflow lineage. Returns every run that was started by
891
+ * a `subworkflow:` step inside the given parent run. Powers Studio's
892
+ * "Sub-runs" list on a parent's run-detail page.
893
+ */
894
+ getRunsByParent(parentRunId) {
895
+ return this.store.getRunsByParent(parentRunId);
896
+ }
327
897
  getLogs(runId, nodeId) {
328
898
  return this.store.getLogs(runId, nodeId);
329
899
  }