@bratsos/workflow-engine 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,514 @@
1
+ # Async Batch Stages
2
+
3
+ A complete guide to creating stages that suspend and resume around long-running batch operations.
4
+
5
+ ## Overview
6
+
7
+ Async batch stages allow workflows to:
8
+ 1. Submit work to external batch APIs (Anthropic, Google, OpenAI)
9
+ 2. Suspend while waiting for completion
10
+ 3. Resume automatically when results are ready
11
+ 4. Achieve 50% cost savings on large AI workloads
12
+
13
+ ## Creating an Async Batch Stage
14
+
15
+ ```typescript
16
+ import { defineAsyncBatchStage } from "@bratsos/workflow-engine";
17
+ import { z } from "zod";
18
+
19
+ const batchStage = defineAsyncBatchStage({
20
+ id: "batch-process",
21
+ name: "Batch Process",
22
+ mode: "async-batch", // Required marker
23
+
24
+ schemas: {
25
+ input: InputSchema,
26
+ output: OutputSchema,
27
+ config: ConfigSchema,
28
+ },
29
+
30
+ // Called when stage starts OR resumes
31
+ async execute(ctx) {
32
+ // Check if resuming from suspension
33
+ if (ctx.resumeState) {
34
+ // Stage was suspended, resumeState contains cached data
35
+ return handleResume(ctx);
36
+ }
37
+
38
+ // First execution - submit batch and suspend
39
+ return submitAndSuspend(ctx);
40
+ },
41
+
42
+ // Called by orchestrator to check batch status
43
+ async checkCompletion(suspendedState, ctx) {
44
+ return checkBatchStatus(suspendedState, ctx);
45
+ },
46
+ });
47
+ ```
48
+
49
+ ## Execute Function
50
+
51
+ The `execute` function handles both the initial execution and the resume path:
52
+
53
+ ```typescript
54
+ async execute(ctx) {
55
+ // ===================
56
+ // Resume Path
57
+ // ===================
58
+ if (ctx.resumeState) {
59
+ // Check for cached results
60
+ const cached = await ctx.storage.load("batch-result");
61
+ if (cached) {
62
+ return { output: cached };
63
+ }
64
+
65
+ // If no cache, the checkCompletion already saved results
66
+ // This path shouldn't normally be hit
67
+ throw new Error("Resume called but no cached results found");
68
+ }
69
+
70
+ // ===================
71
+ // Initial Execution
72
+ // ===================
73
+
74
+ // Get data from previous stages
75
+ const extraction = ctx.require("data-extraction");
76
+
77
+ // Prepare batch requests
78
+ const requests = extraction.items.map((item, i) => ({
79
+ id: `item-${i}`,
80
+ prompt: `Process this item: ${JSON.stringify(item)}`,
81
+ // Optional: include schema for structured output
82
+ schema: ItemResultSchema,
83
+ }));
84
+
85
+ // Submit to batch API
86
+ const ai = createAIHelper(`batch.${ctx.workflowRunId}`, aiLogger);
87
+ const batch = ai.batch("claude-sonnet-4-20250514", "anthropic");
88
+ const handle = await batch.submit(requests);
89
+
90
+ // Store metadata for resume
91
+ await ctx.storage.save("batch-metadata", {
92
+ requestCount: requests.length,
93
+ requestIds: requests.map(r => r.id),
94
+ });
95
+
96
+ // Return suspended result
97
+ return {
98
+ suspended: true,
99
+ state: {
100
+ batchId: handle.id,
101
+ submittedAt: new Date().toISOString(),
102
+ pollInterval: 60000, // Check every 60 seconds
103
+ maxWaitTime: 3600000, // Max 1 hour
104
+ metadata: {
105
+ provider: "anthropic",
106
+ requestCount: requests.length,
107
+ },
108
+ },
109
+ pollConfig: {
110
+ pollInterval: 60000,
111
+ maxWaitTime: 3600000,
112
+ nextPollAt: new Date(Date.now() + 60000),
113
+ },
114
+ };
115
+ }
116
+ ```
117
+
118
+ ## SimpleSuspendedResult Structure
119
+
120
+ ```typescript
121
+ interface SimpleSuspendedResult {
122
+ suspended: true; // Required marker
123
+
124
+ state: {
125
+ batchId: string; // External batch job ID
126
+ submittedAt: string; // ISO timestamp
127
+ pollInterval: number; // Milliseconds between checks
128
+ maxWaitTime: number; // Maximum wait before timeout
129
+ metadata?: Record<string, unknown>; // Custom data
130
+ apiKey?: string; // Optional: for provider auth
131
+ };
132
+
133
+ pollConfig: {
134
+ pollInterval: number; // Milliseconds
135
+ maxWaitTime: number; // Milliseconds
136
+ nextPollAt: Date; // First poll time
137
+ };
138
+
139
+ customMetrics?: Record<string, number>; // Optional metrics
140
+ }
141
+ ```
142
+
143
+ ## Check Completion Function
144
+
145
+ The `checkCompletion` function is called by the orchestrator:
146
+
147
+ ```typescript
148
+ async checkCompletion(suspendedState, ctx) {
149
+ // suspendedState contains the state from SimpleSuspendedResult
150
+ const { batchId, metadata } = suspendedState;
151
+
152
+ // Create AI helper for batch operations
153
+ const ai = createAIHelper(`batch.${ctx.workflowRunId}`, aiLogger);
154
+ const batch = ai.batch(ctx.config.model, metadata?.provider as "anthropic");
155
+
156
+ // Check batch status
157
+ const status = await batch.getStatus(batchId);
158
+
159
+ // ===================
160
+ // Still Processing
161
+ // ===================
162
+ if (status.status === "pending" || status.status === "processing") {
163
+ return {
164
+ ready: false,
165
+ nextCheckIn: 60000, // Check again in 60 seconds
166
+ };
167
+ }
168
+
169
+ // ===================
170
+ // Failed
171
+ // ===================
172
+ if (status.status === "failed") {
173
+ return {
174
+ ready: false,
175
+ error: `Batch ${batchId} failed`,
176
+ };
177
+ }
178
+
179
+ // ===================
180
+ // Completed
181
+ // ===================
182
+ // Get results
183
+ const results = await batch.getResults(batchId, metadata);
184
+
185
+ // Process results
186
+ const processedResults = results.map(r => ({
187
+ id: r.id,
188
+ result: r.result,
189
+ success: r.status === "succeeded",
190
+ error: r.error,
191
+ }));
192
+
193
+ // Calculate metrics
194
+ const totalInputTokens = results.reduce((sum, r) => sum + r.inputTokens, 0);
195
+ const totalOutputTokens = results.reduce((sum, r) => sum + r.outputTokens, 0);
196
+
197
+ // Cache results for potential resume
198
+ await ctx.storage.save("batch-result", { items: processedResults });
199
+
200
+ // Return completed result
201
+ return {
202
+ ready: true,
203
+ output: { items: processedResults },
204
+ metrics: {
205
+ inputTokens: totalInputTokens,
206
+ outputTokens: totalOutputTokens,
207
+ itemsProcessed: processedResults.length,
208
+ successCount: processedResults.filter(r => r.success).length,
209
+ },
210
+ };
211
+ }
212
+ ```
213
+
214
+ ## CompletionCheckResult Structure
215
+
216
+ ```typescript
217
+ interface CompletionCheckResult<TOutput> {
218
+ ready: boolean; // Is batch complete?
219
+
220
+ // If ready === true
221
+ output?: TOutput; // Stage output
222
+ metrics?: Record<string, number>;
223
+ embeddings?: unknown; // Optional embedding info
224
+
225
+ // If ready === false
226
+ error?: string; // Failure reason (stops workflow)
227
+ nextCheckIn?: number; // Milliseconds until next check
228
+ }
229
+ ```
230
+
231
+ ## CheckCompletionContext
232
+
233
+ ```typescript
234
+ interface CheckCompletionContext<TConfig> {
235
+ workflowRunId: string; // Current workflow run
236
+ stageId: string; // Current stage ID
237
+ stageRecordId: string; // Database record ID (for LogContext)
238
+ config: TConfig; // Stage configuration
239
+ log: LogFunction; // Async logging
240
+ onLog: LogFunction; // Alias for log
241
+ storage: StageStorage; // Artifact storage
242
+ }
243
+ ```
244
+
245
+ ## Complete Example: Batch Embedding Stage
246
+
247
+ ```typescript
248
+ const batchEmbeddingStage = defineAsyncBatchStage({
249
+ id: "batch-embeddings",
250
+ name: "Generate Embeddings",
251
+ mode: "async-batch",
252
+ dependencies: ["data-extraction"],
253
+
254
+ schemas: {
255
+ input: "none",
256
+ output: z.object({
257
+ embeddings: z.array(z.object({
258
+ id: z.string(),
259
+ vector: z.array(z.number()),
260
+ })),
261
+ totalTokens: z.number(),
262
+ }),
263
+ config: z.object({
264
+ model: z.string().default("text-embedding-004"),
265
+ batchSize: z.number().default(100),
266
+ }),
267
+ },
268
+
269
+ async execute(ctx) {
270
+ // Handle resume
271
+ if (ctx.resumeState) {
272
+ const cached = await ctx.storage.load<{ embeddings: any[] }>("embeddings");
273
+ if (cached) {
274
+ return { output: { embeddings: cached.embeddings, totalTokens: 0 } };
275
+ }
276
+ }
277
+
278
+ // Get texts to embed
279
+ const extraction = ctx.require("data-extraction");
280
+ const texts = extraction.sections.map((s, i) => ({
281
+ id: `section-${i}`,
282
+ text: s.content,
283
+ }));
284
+
285
+ // Submit batch
286
+ const ai = createAIHelper(`batch.${ctx.workflowRunId}`, aiLogger);
287
+ const batch = ai.batch<number[]>(ctx.config.model, "google");
288
+
289
+ const requests = texts.map(t => ({
290
+ id: t.id,
291
+ prompt: t.text,
292
+ }));
293
+
294
+ const handle = await batch.submit(requests);
295
+
296
+ await ctx.log("INFO", `Submitted ${texts.length} texts for embedding`);
297
+
298
+ return {
299
+ suspended: true,
300
+ state: {
301
+ batchId: handle.id,
302
+ submittedAt: new Date().toISOString(),
303
+ pollInterval: 30000,
304
+ maxWaitTime: 1800000, // 30 minutes
305
+ metadata: {
306
+ textCount: texts.length,
307
+ customIds: texts.map(t => t.id),
308
+ },
309
+ },
310
+ pollConfig: {
311
+ pollInterval: 30000,
312
+ maxWaitTime: 1800000,
313
+ nextPollAt: new Date(Date.now() + 30000),
314
+ },
315
+ };
316
+ },
317
+
318
+ async checkCompletion(state, ctx) {
319
+ const ai = createAIHelper(`batch.${ctx.workflowRunId}`, aiLogger);
320
+ const batch = ai.batch<number[]>(ctx.config.model, "google");
321
+
322
+ const status = await batch.getStatus(state.batchId);
323
+ await ctx.log("DEBUG", `Batch status: ${status.status}`);
324
+
325
+ if (status.status !== "completed") {
326
+ if (status.status === "failed") {
327
+ return { ready: false, error: "Embedding batch failed" };
328
+ }
329
+ return { ready: false, nextCheckIn: 30000 };
330
+ }
331
+
332
+ // Get results with metadata for ID mapping
333
+ const results = await batch.getResults(state.batchId, state.metadata);
334
+
335
+ const embeddings = results
336
+ .filter(r => r.status === "succeeded")
337
+ .map(r => ({
338
+ id: r.id,
339
+ vector: r.result,
340
+ }));
341
+
342
+ const totalTokens = results.reduce((sum, r) => sum + r.inputTokens, 0);
343
+
344
+ // Cache for resume
345
+ await ctx.storage.save("embeddings", { embeddings });
346
+
347
+ await ctx.log("INFO", `Generated ${embeddings.length} embeddings`);
348
+
349
+ return {
350
+ ready: true,
351
+ output: { embeddings, totalTokens },
352
+ metrics: {
353
+ embeddingsGenerated: embeddings.length,
354
+ totalTokens,
355
+ },
356
+ };
357
+ },
358
+ });
359
+ ```
360
+
361
+ ## Batch Providers
362
+
363
+ ### Anthropic
364
+
365
+ ```typescript
366
+ const batch = ai.batch("claude-sonnet-4-20250514", "anthropic");
367
+ // 50% discount, results in ~24 hours
368
+ ```
369
+
370
+ ### Google
371
+
372
+ ```typescript
373
+ const batch = ai.batch("gemini-2.5-flash", "google");
374
+ // 50% discount, results typically faster
375
+ ```
376
+
377
+ ### OpenAI
378
+
379
+ ```typescript
380
+ const batch = ai.batch("gpt-4o", "openai");
381
+ // 50% discount
382
+ ```
383
+
384
+ ## Polling Configuration
385
+
386
+ ### Quick Jobs (< 10 minutes)
387
+
388
+ ```typescript
389
+ pollConfig: {
390
+ pollInterval: 15000, // Check every 15 seconds
391
+ maxWaitTime: 600000, // Max 10 minutes
392
+ nextPollAt: new Date(Date.now() + 15000),
393
+ }
394
+ ```
395
+
396
+ ### Medium Jobs (10 minutes - 1 hour)
397
+
398
+ ```typescript
399
+ pollConfig: {
400
+ pollInterval: 60000, // Check every minute
401
+ maxWaitTime: 3600000, // Max 1 hour
402
+ nextPollAt: new Date(Date.now() + 60000),
403
+ }
404
+ ```
405
+
406
+ ### Long Jobs (1+ hours)
407
+
408
+ ```typescript
409
+ pollConfig: {
410
+ pollInterval: 300000, // Check every 5 minutes
411
+ maxWaitTime: 86400000, // Max 24 hours
412
+ nextPollAt: new Date(Date.now() + 300000),
413
+ }
414
+ ```
415
+
416
+ ## Error Handling
417
+
418
+ ### Timeout Handling
419
+
420
+ The runtime automatically fails stages that exceed `maxWaitTime`. You can also detect the timeout yourself inside `checkCompletion` — for example, to cancel the external batch job before failing the stage:
421
+
422
+ ```typescript
423
+ // In checkCompletion, check for timeout
424
+ const startTime = new Date(state.submittedAt).getTime();
425
+ const elapsed = Date.now() - startTime;
426
+
427
+ if (elapsed > state.maxWaitTime) {
428
+ // Cancel batch if possible
429
+ await cancelBatch(state.batchId);
430
+
431
+ return {
432
+ ready: false,
433
+ error: `Batch timeout after ${elapsed}ms`,
434
+ };
435
+ }
436
+ ```
437
+
438
+ ### Partial Failures
439
+
440
+ Handle individual request failures gracefully:
441
+
442
+ ```typescript
443
+ const results = await batch.getResults(batchId);
444
+
445
+ const succeeded = results.filter(r => r.status === "succeeded");
446
+ const failed = results.filter(r => r.status === "failed");
447
+
448
+ if (failed.length > 0) {
449
+ await ctx.log("WARN", `${failed.length} requests failed`, {
450
+ failedIds: failed.map(f => f.id),
451
+ });
452
+ }
453
+
454
+ // Decide: fail the stage or continue with partial results
455
+ if (succeeded.length === 0) {
456
+ return { ready: false, error: "All batch requests failed" };
457
+ }
458
+
459
+ return {
460
+ ready: true,
461
+ output: { results: succeeded.map(r => r.result) },
462
+ };
463
+ ```
464
+
465
+ ### Retry Logic
466
+
467
+ Implement custom retry for transient failures:
468
+
469
+ ```typescript
470
+ async checkCompletion(state, ctx) {
471
+ try {
472
+ const status = await batch.getStatus(state.batchId);
473
+ // ... handle status
474
+ } catch (error) {
475
+ // Transient error - retry on next poll
476
+ if (isTransientError(error)) {
477
+ await ctx.log("WARN", "Transient error checking batch", { error: error.message });
478
+ return { ready: false, nextCheckIn: 30000 };
479
+ }
480
+
481
+ // Permanent error
482
+ return { ready: false, error: error.message };
483
+ }
484
+ }
485
+ ```
486
+
487
+ ## Storage Patterns
488
+
489
+ ### Caching Results
490
+
491
+ ```typescript
492
+ // In checkCompletion, cache before returning
493
+ await ctx.storage.save("batch-results", processedResults);
494
+
495
+ // In execute resume path
496
+ if (ctx.resumeState) {
497
+ const cached = await ctx.storage.load("batch-results");
498
+ if (cached) return { output: cached };
499
+ }
500
+ ```
501
+
502
+ ### Storing Metadata
503
+
504
+ ```typescript
505
+ // Save metadata during submission
506
+ await ctx.storage.save("batch-metadata", {
507
+ requestCount: requests.length,
508
+ requestIds: requests.map(r => r.id),
509
+ customData: { ... },
510
+ });
511
+
512
+ // Retrieve in checkCompletion
513
+ const metadata = await ctx.storage.load("batch-metadata");
514
+ ```