@blokjs/runner 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/dist/Blok.js +32 -3
  2. package/dist/Blok.js.map +1 -1
  3. package/dist/Configuration.d.ts +41 -5
  4. package/dist/Configuration.js +215 -92
  5. package/dist/Configuration.js.map +1 -1
  6. package/dist/ForEachNode.d.ts +59 -0
  7. package/dist/ForEachNode.js +522 -0
  8. package/dist/ForEachNode.js.map +1 -0
  9. package/dist/LoopMaxIterationsError.d.ts +11 -0
  10. package/dist/LoopMaxIterationsError.js +18 -0
  11. package/dist/LoopMaxIterationsError.js.map +1 -0
  12. package/dist/LoopNode.d.ts +36 -0
  13. package/dist/LoopNode.js +182 -0
  14. package/dist/LoopNode.js.map +1 -0
  15. package/dist/Runner.d.ts +11 -1
  16. package/dist/Runner.js +9 -2
  17. package/dist/Runner.js.map +1 -1
  18. package/dist/RunnerSteps.js +419 -112
  19. package/dist/RunnerSteps.js.map +1 -1
  20. package/dist/RuntimeAdapterNode.d.ts +2 -1
  21. package/dist/RuntimeAdapterNode.js +2 -2
  22. package/dist/RuntimeAdapterNode.js.map +1 -1
  23. package/dist/RuntimeRegistry.d.ts +23 -2
  24. package/dist/RuntimeRegistry.js +31 -2
  25. package/dist/RuntimeRegistry.js.map +1 -1
  26. package/dist/SubworkflowNode.d.ts +106 -0
  27. package/dist/SubworkflowNode.js +261 -3
  28. package/dist/SubworkflowNode.js.map +1 -1
  29. package/dist/SwitchNode.d.ts +37 -0
  30. package/dist/SwitchNode.js +153 -0
  31. package/dist/SwitchNode.js.map +1 -0
  32. package/dist/TriggerBase.d.ts +50 -0
  33. package/dist/TriggerBase.js +262 -4
  34. package/dist/TriggerBase.js.map +1 -1
  35. package/dist/TryCatchNode.d.ts +32 -0
  36. package/dist/TryCatchNode.js +207 -0
  37. package/dist/TryCatchNode.js.map +1 -0
  38. package/dist/adapters/grpc/GrpcCodec.js +2 -2
  39. package/dist/adapters/grpc/GrpcRuntimeAdapter.d.ts +6 -4
  40. package/dist/adapters/grpc/GrpcRuntimeAdapter.js +6 -4
  41. package/dist/adapters/grpc/GrpcRuntimeAdapter.js.map +1 -1
  42. package/dist/adapters/grpc/types.d.ts +7 -5
  43. package/dist/adapters/grpc/types.js.map +1 -1
  44. package/dist/adapters/transport.d.ts +12 -41
  45. package/dist/adapters/transport.js +21 -70
  46. package/dist/adapters/transport.js.map +1 -1
  47. package/dist/cache/NodeResultCache.js +7 -0
  48. package/dist/cache/NodeResultCache.js.map +1 -1
  49. package/dist/concurrency/NatsKvConcurrencyBackend.js +18 -5
  50. package/dist/concurrency/NatsKvConcurrencyBackend.js.map +1 -1
  51. package/dist/concurrency/RedisConcurrencyBackend.d.ts +64 -0
  52. package/dist/concurrency/RedisConcurrencyBackend.js +374 -0
  53. package/dist/concurrency/RedisConcurrencyBackend.js.map +1 -0
  54. package/dist/concurrency/createConcurrencyBackend.d.ts +1 -0
  55. package/dist/concurrency/createConcurrencyBackend.js +5 -1
  56. package/dist/concurrency/createConcurrencyBackend.js.map +1 -1
  57. package/dist/defineNode.d.ts +8 -0
  58. package/dist/defineNode.js +25 -5
  59. package/dist/defineNode.js.map +1 -1
  60. package/dist/graphql/GraphQLSchemaGenerator.js +1 -1
  61. package/dist/graphql/GraphQLSchemaGenerator.js.map +1 -1
  62. package/dist/index.d.ts +10 -6
  63. package/dist/index.js +13 -9
  64. package/dist/index.js.map +1 -1
  65. package/dist/marketplace/RuntimeCatalog.d.ts +6 -0
  66. package/dist/marketplace/RuntimeCatalog.js.map +1 -1
  67. package/dist/marketplace/RuntimeDiscovery.d.ts +2 -2
  68. package/dist/marketplace/RuntimeDiscovery.js +18 -6
  69. package/dist/marketplace/RuntimeDiscovery.js.map +1 -1
  70. package/dist/monitoring/ConcurrencyMetrics.d.ts +26 -0
  71. package/dist/monitoring/ConcurrencyMetrics.js +36 -4
  72. package/dist/monitoring/ConcurrencyMetrics.js.map +1 -1
  73. package/dist/monitoring/ForEachWaitMetrics.d.ts +22 -0
  74. package/dist/monitoring/ForEachWaitMetrics.js +36 -0
  75. package/dist/monitoring/ForEachWaitMetrics.js.map +1 -0
  76. package/dist/openapi/OpenAPIGenerator.js +7 -2
  77. package/dist/openapi/OpenAPIGenerator.js.map +1 -1
  78. package/dist/runtime/PrimitiveStack.d.ts +64 -0
  79. package/dist/runtime/PrimitiveStack.js +92 -0
  80. package/dist/runtime/PrimitiveStack.js.map +1 -0
  81. package/dist/scheduling/DebounceBackend.d.ts +108 -0
  82. package/dist/scheduling/DebounceBackend.js +23 -0
  83. package/dist/scheduling/DebounceBackend.js.map +1 -0
  84. package/dist/scheduling/DebounceCoordinator.d.ts +65 -12
  85. package/dist/scheduling/DebounceCoordinator.js +234 -13
  86. package/dist/scheduling/DebounceCoordinator.js.map +1 -1
  87. package/dist/scheduling/DeferredRunScheduler.d.ts +28 -0
  88. package/dist/scheduling/DeferredRunScheduler.js +105 -3
  89. package/dist/scheduling/DeferredRunScheduler.js.map +1 -1
  90. package/dist/scheduling/NatsKvDebounceBackend.d.ts +53 -0
  91. package/dist/scheduling/NatsKvDebounceBackend.js +334 -0
  92. package/dist/scheduling/NatsKvDebounceBackend.js.map +1 -0
  93. package/dist/scheduling/RedisDebounceBackend.d.ts +49 -0
  94. package/dist/scheduling/RedisDebounceBackend.js +356 -0
  95. package/dist/scheduling/RedisDebounceBackend.js.map +1 -0
  96. package/dist/scheduling/createDebounceBackend.d.ts +25 -0
  97. package/dist/scheduling/createDebounceBackend.js +39 -0
  98. package/dist/scheduling/createDebounceBackend.js.map +1 -0
  99. package/dist/security/AuditLogger.js +1 -1
  100. package/dist/security/AuditLogger.js.map +1 -1
  101. package/dist/security/AuthMiddleware.d.ts +19 -20
  102. package/dist/security/AuthMiddleware.js +35 -20
  103. package/dist/security/AuthMiddleware.js.map +1 -1
  104. package/dist/security/OAuthProvider.js +2 -2
  105. package/dist/security/OAuthProvider.js.map +1 -1
  106. package/dist/security/SecretManager.js +14 -13
  107. package/dist/security/SecretManager.js.map +1 -1
  108. package/dist/security/index.d.ts +3 -1
  109. package/dist/security/index.js +3 -1
  110. package/dist/security/index.js.map +1 -1
  111. package/dist/testing/TestHarness.d.ts +27 -12
  112. package/dist/testing/TestHarness.js +19 -3
  113. package/dist/testing/TestHarness.js.map +1 -1
  114. package/dist/testing/WorkflowTestRunner.js +0 -7
  115. package/dist/testing/WorkflowTestRunner.js.map +1 -1
  116. package/dist/tracing/InMemoryRunStore.d.ts +14 -1
  117. package/dist/tracing/InMemoryRunStore.js +95 -6
  118. package/dist/tracing/InMemoryRunStore.js.map +1 -1
  119. package/dist/tracing/PostgresRunStore.d.ts +28 -2
  120. package/dist/tracing/PostgresRunStore.js +276 -3
  121. package/dist/tracing/PostgresRunStore.js.map +1 -1
  122. package/dist/tracing/RoutingDiagnostics.d.ts +55 -0
  123. package/dist/tracing/RoutingDiagnostics.js +50 -0
  124. package/dist/tracing/RoutingDiagnostics.js.map +1 -0
  125. package/dist/tracing/RunStore.d.ts +82 -1
  126. package/dist/tracing/RunTracker.d.ts +7 -1
  127. package/dist/tracing/RunTracker.js +23 -0
  128. package/dist/tracing/RunTracker.js.map +1 -1
  129. package/dist/tracing/SqliteRunStore.d.ts +57 -2
  130. package/dist/tracing/SqliteRunStore.js +408 -48
  131. package/dist/tracing/SqliteRunStore.js.map +1 -1
  132. package/dist/tracing/TraceRouter.js +380 -18
  133. package/dist/tracing/TraceRouter.js.map +1 -1
  134. package/dist/tracing/createStore.js +14 -3
  135. package/dist/tracing/createStore.js.map +1 -1
  136. package/dist/tracing/metadataFilter.d.ts +63 -0
  137. package/dist/tracing/metadataFilter.js +224 -0
  138. package/dist/tracing/metadataFilter.js.map +1 -0
  139. package/dist/tracing/types.d.ts +331 -7
  140. package/dist/utils/envAllowlist.d.ts +35 -0
  141. package/dist/utils/envAllowlist.js +113 -0
  142. package/dist/utils/envAllowlist.js.map +1 -0
  143. package/dist/version/RuntimeVersionValidator.d.ts +38 -0
  144. package/dist/version/RuntimeVersionValidator.js +121 -0
  145. package/dist/version/RuntimeVersionValidator.js.map +1 -0
  146. package/dist/visualization/WorkflowVisualizer.js +4 -4
  147. package/dist/visualization/WorkflowVisualizer.js.map +1 -1
  148. package/dist/workflow/PersistenceHelper.d.ts +18 -10
  149. package/dist/workflow/PersistenceHelper.js +35 -9
  150. package/dist/workflow/PersistenceHelper.js.map +1 -1
  151. package/dist/workflow/WorkflowNormalizer.d.ts +19 -1
  152. package/dist/workflow/WorkflowNormalizer.js +469 -19
  153. package/dist/workflow/WorkflowNormalizer.js.map +1 -1
  154. package/dist/workflow/WorkflowRegistry.d.ts +122 -0
  155. package/dist/workflow/WorkflowRegistry.js +121 -0
  156. package/dist/workflow/WorkflowRegistry.js.map +1 -1
  157. package/dist/workflow/sampleBody.d.ts +54 -0
  158. package/dist/workflow/sampleBody.js +320 -0
  159. package/dist/workflow/sampleBody.js.map +1 -0
  160. package/package.json +3 -8
  161. package/dist/adapters/HttpRuntimeAdapter.d.ts +0 -79
  162. package/dist/adapters/HttpRuntimeAdapter.js +0 -233
  163. package/dist/adapters/HttpRuntimeAdapter.js.map +0 -1
@@ -2,6 +2,7 @@ import { GlobalError } from "@blokjs/shared";
2
2
  import { RunCancelledError } from "./RunCancelledError";
3
3
  import { WaitDispatchRequest } from "./WaitDispatchRequest";
4
4
  import { resolveIdempotencyKey } from "./idempotency/resolveIdempotencyKey";
5
+ import { getPrimitiveStack } from "./runtime/PrimitiveStack";
5
6
  import { StepTimeoutError } from "./timeouts/StepTimeoutError";
6
7
  import { RunTracker } from "./tracing/RunTracker";
7
8
  import { sanitize } from "./tracing/sanitize";
@@ -27,6 +28,59 @@ function computeBackoff(config, attempt) {
27
28
  const raw = min * factor ** Math.max(0, attempt - 1);
28
29
  return Math.min(max, Math.floor(raw));
29
30
  }
31
+ /**
32
+ * Default cap on the JSON-serialized `ctx.state` snapshot taken before
33
+ * a `WaitDispatchRequest` throw. 1 MB matches the existing
34
+ * `BLOK_DISPATCH_PAYLOAD_MAX_BYTES` cap used by the durable scheduler
35
+ * for trigger payloads. Override per-deployment via the env var of the
36
+ * same name.
37
+ */
38
+ const DEFAULT_STATE_SNAPSHOT_MAX_BYTES = 1_048_576;
39
+ /**
40
+ * Serialize `ctx.state` for persistence in `workflow_runs.state_snapshot`
41
+ * (sqlite migration v11). Called immediately before the runner throws
42
+ * `WaitDispatchRequest`, so the snapshot reflects the canonical pre-wait
43
+ * state. Honors two ops env vars:
44
+ *
45
+ * - `BLOK_STATE_SNAPSHOT_DISABLED=1` — kill-switch. Returns `undefined`
46
+ * and the runner does NOT update the column. The wait still defers;
47
+ * cross-process recovery just resumes with empty `ctx.state`. Use
48
+ * this when state contains values that JSON.stringify can't round-
49
+ * trip safely (Date, Map, BigInt, circular refs) and the author
50
+ * accepts the limitation.
51
+ * - `BLOK_STATE_SNAPSHOT_MAX_BYTES=<n>` — cap on the serialized blob
52
+ * (default 1 MB). Above the cap, the helper logs a warning and
53
+ * returns `undefined`. Same effect as the kill-switch for that one
54
+ * run; subsequent runs with smaller state still snapshot.
55
+ *
56
+ * On JSON serialization failure (typed errors that bubble out of
57
+ * `JSON.stringify` — circular refs, BigInt, etc.), the helper logs a
58
+ * warning and returns `undefined`. The wait still defers — resumption
59
+ * for that specific run becomes best-effort, matching pre-v0.6
60
+ * behaviour for top-level waits across process restart.
61
+ */
62
+ function serializeStateSnapshot(state, logger) {
63
+ if (process.env.BLOK_STATE_SNAPSHOT_DISABLED === "1")
64
+ return undefined;
65
+ const capRaw = process.env.BLOK_STATE_SNAPSHOT_MAX_BYTES;
66
+ const cap = capRaw ? Number(capRaw) : DEFAULT_STATE_SNAPSHOT_MAX_BYTES;
67
+ const effectiveCap = Number.isFinite(cap) && cap > 0 ? cap : DEFAULT_STATE_SNAPSHOT_MAX_BYTES;
68
+ let serialized;
69
+ try {
70
+ serialized = JSON.stringify(state ?? {});
71
+ }
72
+ catch (err) {
73
+ const msg = err instanceof Error ? err.message : String(err);
74
+ logger.logLevel("warn", `[blok][wait] ctx.state snapshot failed to serialize: ${msg}. Wait will still defer; resumption is best-effort across process restart.`);
75
+ return undefined;
76
+ }
77
+ const size = Buffer.byteLength(serialized, "utf8");
78
+ if (size > effectiveCap) {
79
+ logger.logLevel("warn", `[blok][wait] ctx.state snapshot exceeds ${effectiveCap} bytes (got ${size}); skipping snapshot. Wait will still defer; resumption is best-effort. Reduce state size or raise BLOK_STATE_SNAPSHOT_MAX_BYTES.`);
80
+ return undefined;
81
+ }
82
+ return serialized;
83
+ }
30
84
  function sleep(ms) {
31
85
  return new Promise((resolve) => {
32
86
  setTimeout(resolve, ms);
@@ -84,7 +138,26 @@ export default class RunnerSteps {
84
138
  // at runSteps entry. Default `-1` = no resume; runner starts
85
139
  // at i = 0.
86
140
  const persistedRun = !deep && tracker && traceRunId ? tracker.getStore().getRun(traceRunId) : undefined;
87
- const resumeFromIndex = persistedRun?.lastCompletedStepIndex !== undefined ? persistedRun.lastCompletedStepIndex + 1 : 0;
141
+ // Two cursor sources:
142
+ // - Top-level (deep === false): workflow_runs.lastCompletedStepIndex.
143
+ // - Nested inside a primitive iterator (deep === true, v0.6
144
+ // Phase 2): `_blokInnerResumeIndex` stamped on the child ctx
145
+ // by ForEachNode.runIteration when resuming at a specific
146
+ // inner step. Undefined = start at 0 (fresh iteration body).
147
+ const innerResumeIndexRaw = ctx._blokInnerResumeIndex;
148
+ const innerResumeIndex = typeof innerResumeIndexRaw === "number" ? innerResumeIndexRaw : undefined;
149
+ const resumeFromIndex = !deep
150
+ ? persistedRun?.lastCompletedStepIndex !== undefined
151
+ ? persistedRun.lastCompletedStepIndex + 1
152
+ : 0
153
+ : (innerResumeIndex ?? 0);
154
+ // Clear the sentinel so a re-runner started fresh from this
155
+ // childCtx (e.g. the nested branch flow path) doesn't inherit
156
+ // a stale resume hint. ForEachNode set it for THIS one re-entry
157
+ // only; it should not propagate further.
158
+ if (deep && innerResumeIndex !== undefined) {
159
+ ctx._blokInnerResumeIndex = undefined;
160
+ }
88
161
  for (let i = 0; i < steps.length; i++) {
89
162
  const step = steps[i];
90
163
  // PR 4 — skip pre-wait steps on resume. State + NodeRuns
@@ -103,6 +176,19 @@ export default class RunnerSteps {
103
176
  if (ctx.signal?.aborted) {
104
177
  throw new RunCancelledError(traceRunId);
105
178
  }
179
+ // v0.6 Phase 4 — bump the TOP primitive frame's
180
+ // `innerStepIndex` to the current step. If a wait fires from
181
+ // inside this step (or anywhere deeper down the call stack),
182
+ // the wait-throw site walks the stack to persist each frame
183
+ // and needs the TOP frame's cursor to point at THIS step.
184
+ // `deep === true` is the only case where this can apply —
185
+ // the top-level runSteps doesn't have a frame.
186
+ if (deep) {
187
+ const stack = getPrimitiveStack(ctx);
188
+ if (stack.length > 0) {
189
+ stack[stack.length - 1].cursor.innerStepIndex = i;
190
+ }
191
+ }
106
192
  if (!step.active) {
107
193
  // Track skipped nodes
108
194
  if (tracker && traceRunId) {
@@ -144,6 +230,13 @@ export default class RunnerSteps {
144
230
  // (wait:true / default) in StepRail. Only meaningful
145
231
  // for subworkflow steps; undefined elsewhere.
146
232
  const subworkflowWait = stepType === "subworkflow" ? stepAny.wait : undefined;
233
+ // G2 (v0.6) — capture the `dispatch` strategy so the
234
+ // rail can mark http-self invocations with a small
235
+ // `http` badge alongside the existing `↳ async`/`↳ sub`.
236
+ // Normalize: unknown values + the default fall through
237
+ // to `undefined` (rendered as in-process by Studio).
238
+ const dispatchRaw = stepType === "subworkflow" ? stepAny.dispatch : undefined;
239
+ const subworkflowDispatch = dispatchRaw === "http-self" || dispatchRaw === "in-process" ? dispatchRaw : undefined;
147
240
  // PR 5 E3 — surface sub-workflow nesting depth.
148
241
  // `_subworkflowDepth` on ctx is set by SubworkflowNode +
149
242
  // createChildContext; the parent's invocation of a
@@ -152,6 +245,23 @@ export default class RunnerSteps {
152
245
  const subworkflowDepth = stepType === "subworkflow"
153
246
  ? (ctx._subworkflowDepth ?? 0) + 1
154
247
  : undefined;
248
+ // v0.5 middleware origin tagging — when the trigger's
249
+ // `runMiddlewareChain` is dispatching a middleware
250
+ // workflow on this ctx, it sets `_blokMiddlewareName`
251
+ // to the middleware's name. Surface that here so
252
+ // Studio's StepRail can render a `mw:<name>` origin
253
+ // badge on every inner step the middleware produced.
254
+ const middleware = ctx._blokMiddlewareName;
255
+ // v0.5.3 — read the iteration sentinel set by ForEachNode +
256
+ // LoopNode on per-iteration child ctxs. Lets Studio group
257
+ // inner steps under "iteration N" headers in StepRail.
258
+ // Inherited by nested runners (tryCatch, switch) inside
259
+ // the same iteration — which is correct: their inner steps
260
+ // belong to that iteration. A nested forEach inside an
261
+ // outer iteration overrides the sentinel on its own child
262
+ // ctx, so the inner-most iteration wins for its descendants.
263
+ const iterationIndexRaw = ctx._blokIterationIndex;
264
+ const iterationIndex = typeof iterationIndexRaw === "number" ? iterationIndexRaw : undefined;
155
265
  const nodeRun = tracker.startNode(traceRunId, {
156
266
  nodeName: step.name,
157
267
  nodeType: stepType,
@@ -160,7 +270,10 @@ export default class RunnerSteps {
160
270
  depth: depthLevel,
161
271
  stepIndex: i,
162
272
  wait: subworkflowWait,
273
+ dispatch: subworkflowDispatch,
163
274
  subworkflowDepth,
275
+ middleware,
276
+ iterationIndex,
164
277
  });
165
278
  nodeRunId = nodeRun.id;
166
279
  ctx._traceNodeId = nodeRunId;
@@ -214,9 +327,21 @@ export default class RunnerSteps {
214
327
  // scheduledAt (or it's from trigger-level delay); on
215
328
  // re-entry from a wait dispatch, the run was marked
216
329
  // `delayed` with scheduledAt set to the wait deadline.
330
+ //
331
+ // v0.6 Phase 4 — for deep (nested) runSteps, a primitive
332
+ // (SwitchNode etc.) sets `_blokInnerResumeIndex` to the
333
+ // resume target — including `0` when the wait is at the
334
+ // first step of its sub-pipeline. The original
335
+ // `resumeFromIndex > 0` guard prevented re-entry from
336
+ // firing at index 0, but Phase 4 needs the index-0 case
337
+ // (e.g., switch arm whose first step is the wait). For
338
+ // deep runs we additionally require `innerResumeIndex`
339
+ // to be defined — that's how we tell "this primitive
340
+ // resumed here" vs "we're at index 0 because of a fresh
341
+ // iteration that doesn't have a resume cursor".
217
342
  const isReentry = ctx._blokDispatchReentry === true &&
218
- resumeFromIndex > 0 &&
219
- i === resumeFromIndex;
343
+ i === resumeFromIndex &&
344
+ (!deep ? resumeFromIndex > 0 : innerResumeIndex !== undefined);
220
345
  const deadline = computeDeadline();
221
346
  const now = Date.now();
222
347
  if (isReentry || deadline <= now) {
@@ -228,21 +353,117 @@ export default class RunnerSteps {
228
353
  tracker.completeNode(nodeRunId, { __waited__: true, deadline });
229
354
  }
230
355
  ctx.logger.log(`[step ${i + 1}/${steps.length}] ${step.name} (wait) → satisfied`);
231
- // Advance the resume cursor so a subsequent wait at a
232
- // later index can rely on it.
233
- if (tracker && traceRunId) {
356
+ // Advance the resume cursor at TOP-LEVEL only.
357
+ // Nested satisfies (deep=true, v0.6 Phase 2 — wait
358
+ // inside a forEach iteration body) must NOT
359
+ // overwrite the workflow's resume cursor with the
360
+ // inner step index — that would skip past the
361
+ // primitive entirely on the next re-entry. The
362
+ // primitive's own NodeRun.iteration_context tracks
363
+ // progress for nested resumes.
364
+ if (!deep && tracker && traceRunId) {
234
365
  tracker.getStore().updateRun(traceRunId, { lastCompletedStepIndex: i });
235
366
  }
236
367
  continue;
237
368
  }
238
369
  // First pass: schedule + throw WaitDispatchRequest.
239
370
  // Set resume cursor BEFORE throwing so re-entry knows
240
- // where to pick up. Cursor = i - 1 (the last non-wait
241
- // step that completed).
371
+ // where to pick up.
372
+ //
373
+ // Two cases for cursor placement:
374
+ // - Top-level wait (deep === false). Cursor = i - 1
375
+ // (the last non-wait outer step that completed).
376
+ // On re-entry, runSteps reads
377
+ // workflow_runs.lastCompletedStepIndex + 1 = i and
378
+ // starts the wait step which flips to "satisfied".
379
+ // - Nested wait inside a primitive (deep === true,
380
+ // v0.6 Phase 2). The wait fired from inside an
381
+ // iteration body of a forEach (or analogous future
382
+ // primitive). The OUTER runSteps wrote `i - 1` =
383
+ // forEach-step-index minus 1 *before* invoking
384
+ // forEach.process, so workflow_runs.lastCompleted-
385
+ // StepIndex still points at the OUTER cursor we
386
+ // want — DON'T overwrite it with the inner-i (that
387
+ // would skip the forEach entirely on resume).
388
+ // Instead, persist the iteration cursor on the
389
+ // forEach's NodeRun's `iteration_context` column.
390
+ // ForEachNode reads it on re-entry to resume the
391
+ // right iteration + inner step.
392
+ //
393
+ // v0.6 prerequisite for wait-inside-primitives Phase 2
394
+ // — snapshot `ctx.state` regardless of nesting. Two
395
+ // re-entry paths consume this snapshot:
396
+ // 1. In-process timer fire (DeferredRunScheduler):
397
+ // same `ctx` is reused, state is already there;
398
+ // rehydrate at TriggerBase.run is a no-op.
399
+ // 2. Cross-process recovery (recoverDispatches →
400
+ // restoreDispatch on boot): a fresh `ctx` is
401
+ // built from the persisted scheduled_dispatches
402
+ // row with empty `state`. Without the snapshot,
403
+ // Phase 2's iteration-state-persistence promise
404
+ // breaks across restart.
242
405
  if (tracker && traceRunId) {
243
- tracker.getStore().updateRun(traceRunId, {
244
- lastCompletedStepIndex: i - 1,
245
- });
406
+ const updates = {
407
+ stateSnapshot: serializeStateSnapshot(ctx.state, ctx.logger),
408
+ };
409
+ if (!deep) {
410
+ updates.lastCompletedStepIndex = i - 1;
411
+ }
412
+ tracker.getStore().updateRun(traceRunId, updates);
413
+ // Phase 2/3 — write iteration_context to the active
414
+ // primitive's NodeRun when nested. Reads sentinels
415
+ // stamped by the primitive (ForEachNode in Phase 2,
416
+ // LoopNode in Phase 3) on the parent ctx:
417
+ // - _blokActivePrimitiveNodeRunId: which NodeRun
418
+ // gets the cursor (set by RunnerSteps' outer
419
+ // iteration around the primitive's process()).
420
+ // - _blokForEachCurrentIteration: iteration index
421
+ // of the in-flight iteration.
422
+ // - _blokForEachPartialResults (Phase 2 only):
423
+ // accumulator for iterations [0..iteration-1]
424
+ // so the post-resume final result array covers
425
+ // all iterations. LoopNode doesn't aggregate
426
+ // results (it returns the last iteration's
427
+ // output), so it doesn't stamp this sentinel —
428
+ // the cursor stores `completedResults: []` and
429
+ // LoopNode ignores the field on resume.
430
+ // v0.6 Phase 4 — walk the primitive stack and persist
431
+ // each frame's cursor to its NodeRun. The TOP frame's
432
+ // `innerStepIndex` is the wait step's position within
433
+ // the deepest primitive's sub-pipeline; outer frames'
434
+ // `innerStepIndex` values were set by their enclosing
435
+ // runSteps' step-boundary write when control passed
436
+ // into the deeper primitive. This is what lets
437
+ // `forEach > forEach > wait`,
438
+ // `switch > forEach > wait`, etc. all resume
439
+ // correctly on re-entry.
440
+ //
441
+ // Each frame's `cursor` is owned by the primitive
442
+ // (it stamps `iteration`/`caseIndex`/`completedResults`).
443
+ // The runner's only responsibility here is to refresh
444
+ // the TOP frame's `innerStepIndex` to `i` and
445
+ // persist every frame.
446
+ if (deep) {
447
+ const stack = getPrimitiveStack(ctx);
448
+ if (stack.length > 0) {
449
+ stack[stack.length - 1].cursor.innerStepIndex = i;
450
+ for (const frame of stack) {
451
+ // Skip parallel-forEach frames — the
452
+ // parallel branch in ForEachNode writes
453
+ // its own cursor (with cancelled set +
454
+ // completedResults) post-`Promise.allSettled`.
455
+ // Writing the placeholder here would let
456
+ // "error beats wait" classifications leak
457
+ // a parallel cursor onto the failed
458
+ // run's NodeRun.
459
+ if (frame.cursor.mode === "parallel")
460
+ continue;
461
+ tracker.getStore().updateNodeRun(frame.nodeRunId, {
462
+ iterationContext: frame.cursor,
463
+ });
464
+ }
465
+ }
466
+ }
246
467
  }
247
468
  ctx.logger.log(`[step ${i + 1}/${steps.length}] ${step.name} (wait) → scheduled (deadline=${new Date(deadline).toISOString()})`);
248
469
  throw new WaitDispatchRequest({
@@ -295,114 +516,153 @@ export default class RunnerSteps {
295
516
  // `30000` via `parseDuration`).
296
517
  const maxDurationMs = step.maxDurationMs;
297
518
  let attempt = 0;
298
- while (true) {
299
- attempt += 1;
300
- try {
301
- const processInvocation = () => step.process(ctx, step);
302
- const model = typeof maxDurationMs === "number" && maxDurationMs > 0
303
- ? await wrapWithTimeout(processInvocation, maxDurationMs, step.name)
304
- : await processInvocation();
305
- ctx.response = model.data;
306
- // Treat soft errors (data carries `.error`) the same as
307
- // thrown errors so retry semantics are uniform.
308
- if (ctx.response?.error) {
309
- throw ctx.response.error;
310
- }
311
- // === Tier 1: idempotency cache write ===
312
- // Cache on the success path only — failed steps are
313
- // re-runnable. Honour `idempotencyKeyTTL` per step;
314
- // default 24h. A TTL of 0 stores an immediately-
315
- // expired entry (useful as a kill-switch).
316
- if (cacheStore && resolvedIdemKey && nodeRunId && traceRunId) {
317
- const ttlField = step.idempotencyKeyTTL;
318
- const ttlMs = typeof ttlField === "number" ? ttlField : DEFAULT_IDEMPOTENCY_TTL_MS;
319
- const now = Date.now();
320
- const expiresAt = ttlMs > 0 ? now + ttlMs : now - 1;
321
- cacheStore.setIdempotencyCache(workflowName, step.name, resolvedIdemKey, {
322
- data: model.data,
323
- cachedAt: now,
324
- expiresAt,
325
- sourceRunId: traceRunId,
326
- sourceNodeRunId: nodeRunId,
327
- });
328
- }
329
- const stepDuration = (performance.now() - stepStart).toFixed(1);
330
- // --- Trace: complete node ---
331
- if (tracker && nodeRunId) {
332
- // `_stepMetrics` is stashed on ctx by RuntimeAdapterNode
333
- // when an adapter returns metrics (gRPC wire bytes,
334
- // duration, cpu, memory). Threading it through
335
- // `completeNode` is what gets the metrics into the
336
- // run store + NODE_COMPLETED event payload — Studio's
337
- // inspector reads them from there.
338
- const ctxAny = ctx;
339
- const stepMetrics = ctxAny._stepMetrics;
340
- ctxAny._stepMetrics = undefined;
341
- tracker.completeNode(nodeRunId, sanitize(ctx.response.data), stepMetrics);
342
- // PR 4 — advance the resume cursor after each
343
- // successful non-wait step. A subsequent wait step
344
- // reads this value to set its own cursor before
345
- // throwing WaitDispatchRequest. Only at top-level
346
- // (deep=false); nested branch flow doesn't update.
347
- if (!deep && traceRunId) {
348
- tracker.getStore().updateRun(traceRunId, { lastCompletedStepIndex: i });
519
+ // v0.6 Phase 4 — the primitive stack on ctx is owned by
520
+ // ForEachNode/LoopNode/SwitchNode (push on entry, pop in
521
+ // finally). The Phase 2/3 single-slot
522
+ // `_blokActivePrimitiveNodeRunId` mechanism is gone
523
+ // nested primitives each register their own frame, and
524
+ // the wait-throw site walks the full stack. We keep
525
+ // `isIteratingPrimitive` only as a hook for legacy
526
+ // readers (none in core today) — wait-cursor writes no
527
+ // longer depend on it.
528
+ const isIteratingPrimitive = step.isPrimitiveIterator === true;
529
+ try {
530
+ while (true) {
531
+ attempt += 1;
532
+ try {
533
+ const processInvocation = () => step.process(ctx, step);
534
+ const model = typeof maxDurationMs === "number" && maxDurationMs > 0
535
+ ? await wrapWithTimeout(processInvocation, maxDurationMs, step.name)
536
+ : await processInvocation();
537
+ ctx.response = model.data;
538
+ // Treat soft errors (data carries `.error`) the same as
539
+ // thrown errors so retry semantics are uniform.
540
+ if (ctx.response?.error) {
541
+ throw ctx.response.error;
349
542
  }
350
- }
351
- const attemptSuffix = attempt > 1 ? ` after ${attempt} attempts` : "";
352
- ctx.logger.log(`${stepPrefix} completed (${stepDuration}ms${attemptSuffix})`);
353
- break;
354
- }
355
- catch (nodeErr) {
356
- if (attempt < maxAttempts && retryConfig) {
357
- // More attempts remain — record this as a soft
358
- // failure and back off before retrying. The node
359
- // stays in `running` status; failNode is the
360
- // terminal call.
543
+ // === Tier 1: idempotency cache write ===
544
+ // Cache on the success path only — failed steps are
545
+ // re-runnable. Honour `idempotencyKeyTTL` per step;
546
+ // default 24h. A TTL of 0 stores an immediately-
547
+ // expired entry (useful as a kill-switch).
548
+ if (cacheStore && resolvedIdemKey && nodeRunId && traceRunId) {
549
+ const ttlField = step.idempotencyKeyTTL;
550
+ const ttlMs = typeof ttlField === "number" ? ttlField : DEFAULT_IDEMPOTENCY_TTL_MS;
551
+ const now = Date.now();
552
+ const expiresAt = ttlMs > 0 ? now + ttlMs : now - 1;
553
+ cacheStore.setIdempotencyCache(workflowName, step.name, resolvedIdemKey, {
554
+ data: model.data,
555
+ cachedAt: now,
556
+ expiresAt,
557
+ sourceRunId: traceRunId,
558
+ sourceNodeRunId: nodeRunId,
559
+ });
560
+ }
561
+ const stepDuration = (performance.now() - stepStart).toFixed(1);
562
+ // --- Trace: complete node ---
361
563
  if (tracker && nodeRunId) {
362
- tracker.recordNodeAttemptFailed(nodeRunId, { attempt, error: nodeErr });
564
+ // `_stepMetrics` is stashed on ctx by RuntimeAdapterNode
565
+ // when an adapter returns metrics (gRPC wire bytes,
566
+ // duration, cpu, memory). Threading it through
567
+ // `completeNode` is what gets the metrics into the
568
+ // run store + NODE_COMPLETED event payload — Studio's
569
+ // inspector reads them from there.
570
+ const ctxAny = ctx;
571
+ const stepMetrics = ctxAny._stepMetrics;
572
+ ctxAny._stepMetrics = undefined;
573
+ tracker.completeNode(nodeRunId, sanitize(ctx.response.data), stepMetrics);
574
+ // PR 4 — advance the resume cursor after each
575
+ // successful non-wait step. A subsequent wait step
576
+ // reads this value to set its own cursor before
577
+ // throwing WaitDispatchRequest. Only at top-level
578
+ // (deep=false); nested branch flow doesn't update.
579
+ if (!deep && traceRunId) {
580
+ tracker.getStore().updateRun(traceRunId, { lastCompletedStepIndex: i });
581
+ }
363
582
  }
364
- const backoffMs = computeBackoff(retryConfig, attempt);
365
- const errMsg = nodeErr instanceof Error ? nodeErr.message : String(nodeErr);
366
- ctx.logger.log(`${stepPrefix} → attempt ${attempt}/${maxAttempts} failed (${errMsg}), retrying in ${backoffMs}ms`);
367
- await sleep(backoffMs);
368
- continue;
583
+ const attemptSuffix = attempt > 1 ? ` after ${attempt} attempts` : "";
584
+ ctx.logger.log(`${stepPrefix} completed (${stepDuration}ms${attemptSuffix})`);
585
+ break;
369
586
  }
370
- // Final attempt — fail the node and propagate the
371
- // enriched error so RunnerSteps' outer catch can
372
- // wrap it as a GlobalError.
373
- if (tracker && nodeRunId) {
374
- const existing = tracker.getNodeRun(nodeRunId);
375
- if (existing && existing.status === "running") {
376
- tracker.failNode(nodeRunId, nodeErr instanceof Error ? nodeErr : new Error(String(nodeErr)));
587
+ catch (nodeErr) {
588
+ // v0.5.3 control-flow signals from a step's run()
589
+ // must NOT be retried OR wrapped as enriched errors.
590
+ // In the production wait path, RunnerSteps throws
591
+ // WaitDispatchRequest from outside this retry loop, so
592
+ // this branch is normally inert. But if a custom node
593
+ // ever throws a wait/cancel signal from inside its
594
+ // process()/run(), preserve the type so the outer
595
+ // catch + TryCatchNode pass-through still recognise
596
+ // it. Same rationale as the outer-catch instanceof
597
+ // guards in the outer catch below.
598
+ if (nodeErr instanceof WaitDispatchRequest || nodeErr instanceof RunCancelledError) {
599
+ throw nodeErr;
377
600
  }
601
+ if (attempt < maxAttempts && retryConfig) {
602
+ // More attempts remain — record this as a soft
603
+ // failure and back off before retrying. The node
604
+ // stays in `running` status; failNode is the
605
+ // terminal call.
606
+ if (tracker && nodeRunId) {
607
+ tracker.recordNodeAttemptFailed(nodeRunId, { attempt, error: nodeErr });
608
+ }
609
+ const backoffMs = computeBackoff(retryConfig, attempt);
610
+ const errMsg = nodeErr instanceof Error ? nodeErr.message : String(nodeErr);
611
+ ctx.logger.log(`${stepPrefix} → attempt ${attempt}/${maxAttempts} failed (${errMsg}), retrying in ${backoffMs}ms`);
612
+ await sleep(backoffMs);
613
+ continue;
614
+ }
615
+ // Final attempt — fail the node and propagate the
616
+ // enriched error so RunnerSteps' outer catch can
617
+ // wrap it as a GlobalError.
618
+ if (tracker && nodeRunId) {
619
+ const existing = tracker.getNodeRun(nodeRunId);
620
+ if (existing && existing.status === "running") {
621
+ tracker.failNode(nodeRunId, nodeErr instanceof Error ? nodeErr : new Error(String(nodeErr)));
622
+ }
623
+ }
624
+ // Tier 2 quick-wins — final-attempt timeout flips
625
+ // the run to "timedOut" (distinct from "failed").
626
+ // Only when the FINAL error was a StepTimeoutError;
627
+ // mixed failures (some retries timed out, final
628
+ // retry threw a different error) keep the normal
629
+ // "failed" status.
630
+ if (tracker &&
631
+ traceRunId &&
632
+ typeof maxDurationMs === "number" &&
633
+ maxDurationMs > 0 &&
634
+ nodeErr instanceof StepTimeoutError) {
635
+ tracker.markRunTimedOut(traceRunId, {
636
+ stepId: step.name,
637
+ maxDurationMs,
638
+ attemptsExhausted: attempt,
639
+ });
640
+ }
641
+ const stepDuration = (performance.now() - stepStart).toFixed(1);
642
+ const attemptSuffix = attempt > 1 ? ` after ${attempt} attempts` : "";
643
+ ctx.logger.log(`${stepPrefix} → FAILED (${stepDuration}ms${attemptSuffix})`);
644
+ // Enrich error with step context so developers know which step failed.
645
+ // Attach `_blokStepId` directly on the wrap so TryCatchNode's
646
+ // envelope construction can surface `$.error.stepId` to authors
647
+ // without parsing the prefix back out of the message string.
648
+ const originalMsg = nodeErr instanceof Error ? nodeErr.message : String(nodeErr);
649
+ const enrichedError = new Error(`${stepPrefix} failed: ${originalMsg}`);
650
+ const enrichedAny = enrichedError;
651
+ enrichedAny.cause = nodeErr;
652
+ enrichedAny._blokStepId = step.name;
653
+ throw enrichedError;
378
654
  }
379
- // Tier 2 quick-wins — final-attempt timeout flips
380
- // the run to "timedOut" (distinct from "failed").
381
- // Only when the FINAL error was a StepTimeoutError;
382
- // mixed failures (some retries timed out, final
383
- // retry threw a different error) keep the normal
384
- // "failed" status.
385
- if (tracker &&
386
- traceRunId &&
387
- typeof maxDurationMs === "number" &&
388
- maxDurationMs > 0 &&
389
- nodeErr instanceof StepTimeoutError) {
390
- tracker.markRunTimedOut(traceRunId, {
391
- stepId: step.name,
392
- maxDurationMs,
393
- attemptsExhausted: attempt,
394
- });
395
- }
396
- const stepDuration = (performance.now() - stepStart).toFixed(1);
397
- const attemptSuffix = attempt > 1 ? ` after ${attempt} attempts` : "";
398
- ctx.logger.log(`${stepPrefix} → FAILED (${stepDuration}ms${attemptSuffix})`);
399
- // Enrich error with step context so developers know which step failed
400
- const originalMsg = nodeErr instanceof Error ? nodeErr.message : String(nodeErr);
401
- const enrichedError = new Error(`${stepPrefix} failed: ${originalMsg}`);
402
- enrichedError.cause = nodeErr;
403
- throw enrichedError;
404
655
  }
405
656
  }
657
+ finally {
658
+ // v0.6 Phase 4 — primitives own their stack frame
659
+ // lifecycle now (push on entry, pop in finally), so
660
+ // there's nothing to restore here. The
661
+ // `isIteratingPrimitive` flag stays in the type
662
+ // system for documentation but no longer drives
663
+ // cursor accounting.
664
+ void isIteratingPrimitive;
665
+ }
406
666
  }
407
667
  else {
408
668
  stepName = step.name;
@@ -435,12 +695,59 @@ export default class RunnerSteps {
435
695
  if (e instanceof WaitDispatchRequest) {
436
696
  throw e;
437
697
  }
698
+ // Capture the step-enrichment wrap's `_blokStepId` BEFORE we
699
+ // unwrap past it. The wrap is the outermost layer (set inside
700
+ // the inner-try retry loop above); after unwrapping to the inner
701
+ // GlobalError this metadata would otherwise be lost. Surfaces to
702
+ // authors as `$.error.stepId` inside tryCatch.catch arms.
703
+ const wrapStepId = typeof e === "object" && e !== null && "_blokStepId" in e
704
+ ? e._blokStepId
705
+ : undefined;
438
706
  let error_context = {};
439
707
  if (e instanceof GlobalError) {
440
708
  error_context = e;
441
709
  }
442
710
  else {
443
- error_context = new GlobalError(e.message);
711
+ // Walk the `.cause` chain looking for a GlobalError. The
712
+ // step-enrichment wrap in the retry loop above sets `cause = nodeErr`,
713
+ // and `nodeErr` may itself be a GlobalError thrown from
714
+ // `defineNode`-built nodes (e.g. `@blokjs/throw` setting
715
+ // `code: 401` for an auth-check middleware). Without this
716
+ // walk, the outer wrap below would force the framework's
717
+ // generic `[step N/M] X failed: ...` message + default 500
718
+ // code, clobbering the author's structured rejection.
719
+ let inner = e;
720
+ let foundGlobal = null;
721
+ while (typeof inner === "object" &&
722
+ inner !== null &&
723
+ "cause" in inner &&
724
+ inner.cause !== undefined &&
725
+ inner.cause !== inner) {
726
+ inner = inner.cause;
727
+ if (inner instanceof GlobalError) {
728
+ foundGlobal = inner;
729
+ break;
730
+ }
731
+ }
732
+ if (foundGlobal) {
733
+ error_context = foundGlobal;
734
+ }
735
+ else {
736
+ error_context = new GlobalError(e.message);
737
+ // Preserve the original error chain so outer handlers
738
+ // (notably v0.5 TryCatchNode's `$.error.message` resolution)
739
+ // can peel back through `.cause` to the author's original
740
+ // `throw new Error("...")` text instead of the runner's
741
+ // `[step N/M] <name> failed: ...` enriched prefix.
742
+ error_context.cause = e;
743
+ }
744
+ }
745
+ // Stamp the wrap's stepId on the unwrapped error so TryCatchNode's
746
+ // `toErrorEnvelope` walk can surface it as `$.error.stepId`. The
747
+ // inner-try wrap layer is gone by this point; this is the only
748
+ // place where the runner can identify which sub-step failed.
749
+ if (typeof wrapStepId === "string" && wrapStepId.length > 0) {
750
+ error_context._blokStepId = wrapStepId;
444
751
  }
445
752
  throw error_context;
446
753
  }