deepline 0.1.109 → 0.1.111
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +2634 -1532
- package/dist/cli/index.mjs +2547 -1451
- package/dist/index.d.mts +21 -14
- package/dist/index.d.ts +21 -14
- package/dist/index.js +97 -23
- package/dist/index.mjs +97 -23
- package/dist/repo/apps/play-runner-workers/src/coordinator-entry.ts +192 -121
- package/dist/repo/apps/play-runner-workers/src/entry.ts +254 -65
- package/dist/repo/apps/play-runner-workers/src/runtime/receipts.ts +18 -27
- package/dist/repo/apps/play-runner-workers/src/workflow-instance-create.ts +44 -0
- package/dist/repo/apps/play-runner-workers/src/workflow-retry.ts +7 -11
- package/dist/repo/sdk/src/client.ts +35 -12
- package/dist/repo/sdk/src/errors.ts +2 -2
- package/dist/repo/sdk/src/http.ts +87 -7
- package/dist/repo/sdk/src/play.ts +1 -1
- package/dist/repo/sdk/src/plays/bundle-play-file.ts +5 -1
- package/dist/repo/sdk/src/release.ts +13 -10
- package/dist/repo/sdk/src/tool-output.ts +2 -2
- package/dist/repo/sdk/src/types.ts +9 -6
- package/dist/repo/shared_libs/play-runtime/fullenrich-batching.ts +229 -0
- package/dist/repo/shared_libs/play-runtime/governor/policy.ts +1 -1
- package/dist/repo/shared_libs/play-runtime/play-runtime-batching-registry.ts +20 -0
- package/dist/repo/shared_libs/play-runtime/run-failure.ts +20 -12
- package/dist/repo/shared_libs/play-runtime/run-ledger.ts +147 -70
- package/dist/repo/shared_libs/play-runtime/scheduler-backend.ts +6 -2
- package/dist/repo/shared_libs/play-runtime/secret-redaction.ts +15 -0
- package/dist/repo/shared_libs/play-runtime/work-receipts.ts +1 -0
- package/dist/repo/shared_libs/plays/bundling/index.ts +193 -21
- package/dist/repo/shared_libs/plays/static-pipeline.ts +1 -3
- package/dist/repo/shared_libs/security/outbound-url-policy.ts +238 -0
- package/dist/repo/shared_libs/security/safe-fetch.ts +118 -0
- package/dist/viewer/viewer.css +617 -0
- package/dist/viewer/viewer.js +1496 -0
- package/package.json +5 -1
|
@@ -42,7 +42,7 @@ import {
|
|
|
42
42
|
executeChunkedRequests,
|
|
43
43
|
type ChunkExecutionResult,
|
|
44
44
|
} from '../../../shared_libs/play-runtime/batch-runtime';
|
|
45
|
-
import {
|
|
45
|
+
import { getPlayRuntimeBatchStrategy } from '../../../shared_libs/play-runtime/play-runtime-batching-registry';
|
|
46
46
|
import { STANDARD_PLAY_RUNTIME_LIMIT_SECONDS } from '../../../shared_libs/temporal/constants';
|
|
47
47
|
import {
|
|
48
48
|
createPlayExecutionGovernor,
|
|
@@ -161,6 +161,12 @@ import {
|
|
|
161
161
|
type SecretAwareRequestInit,
|
|
162
162
|
type SecretHandle,
|
|
163
163
|
} from '../../../shared_libs/play-runtime/secret-capability';
|
|
164
|
+
import { safePublicFetch } from '../../../shared_libs/security/safe-fetch';
|
|
165
|
+
import {
|
|
166
|
+
assertPublicHttpUrl,
|
|
167
|
+
isIpAddressLiteral,
|
|
168
|
+
UnsafeOutboundUrlError,
|
|
169
|
+
} from '../../../shared_libs/security/outbound-url-policy';
|
|
164
170
|
import type {
|
|
165
171
|
LiveNodeProgressMap,
|
|
166
172
|
LiveNodeProgressSnapshot,
|
|
@@ -395,6 +401,9 @@ function captureRuntimeApiBinding(env: WorkerEnv): void {
|
|
|
395
401
|
}
|
|
396
402
|
|
|
397
403
|
let cachedCoordinatorBinding: WorkerEnv['COORDINATOR'] | null = null;
|
|
404
|
+
const TRACE_FLUSH_MS = 1_000;
|
|
405
|
+
const pendingTraceForwardsByRun = new Map<string, Promise<void>>();
|
|
406
|
+
|
|
398
407
|
function captureCoordinatorBinding(env: WorkerEnv): void {
|
|
399
408
|
cachedCoordinatorBinding = env.COORDINATOR ?? null;
|
|
400
409
|
}
|
|
@@ -679,32 +688,58 @@ function recordRunnerPerfTrace(input: {
|
|
|
679
688
|
ms?: number;
|
|
680
689
|
extra?: Record<string, unknown>;
|
|
681
690
|
}): void {
|
|
691
|
+
// Benchmark note: these runner spans decompose the server watch's terminal
|
|
692
|
+
// wait. They are logged locally and forwarded to the coordinator so
|
|
693
|
+
// `/api/v2/plays/run --watch` benchmark exports can join them with
|
|
694
|
+
// `server.stream_scheduler_terminal_event` by runId.
|
|
682
695
|
if (!input.req.runId || !input.phase) return;
|
|
696
|
+
const phase = input.phase.startsWith('runner.')
|
|
697
|
+
? input.phase
|
|
698
|
+
: `runner.${input.phase}`;
|
|
683
699
|
// Tool-level traces can fire once per row/provider step. Forwarding each one
|
|
684
700
|
// through the coordinator binding can consume Cloudflare's subrequest budget
|
|
685
701
|
// before large batched maps finish.
|
|
686
|
-
if (
|
|
702
|
+
if (phase.startsWith('runner.tool.')) {
|
|
687
703
|
return;
|
|
688
704
|
}
|
|
689
705
|
const payload = {
|
|
690
706
|
ts: Date.now(),
|
|
691
707
|
source: 'dynamic_worker' as const,
|
|
692
708
|
runId: input.req.runId,
|
|
693
|
-
phase
|
|
709
|
+
phase,
|
|
694
710
|
ms: input.ms ?? 0,
|
|
695
711
|
...(input.extra ?? {}),
|
|
696
712
|
};
|
|
697
713
|
console.log(
|
|
698
714
|
`[deepline-run:${input.req.runId}] [perf-trace] ${JSON.stringify(payload)}`,
|
|
699
715
|
);
|
|
700
|
-
cachedCoordinatorBinding
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
716
|
+
const binding = cachedCoordinatorBinding;
|
|
717
|
+
if (!binding) return;
|
|
718
|
+
const forward = binding
|
|
719
|
+
.recordPerfTrace(input.req.runId, payload)
|
|
720
|
+
.catch(() => undefined);
|
|
721
|
+
const previous = pendingTraceForwardsByRun.get(input.req.runId);
|
|
722
|
+
const pending = previous
|
|
723
|
+
? previous.then(
|
|
724
|
+
() => forward,
|
|
725
|
+
() => forward,
|
|
726
|
+
)
|
|
727
|
+
: forward;
|
|
728
|
+
pendingTraceForwardsByRun.set(input.req.runId, pending);
|
|
729
|
+
void pending.finally(() => {
|
|
730
|
+
if (pendingTraceForwardsByRun.get(input.req.runId) === pending) {
|
|
731
|
+
pendingTraceForwardsByRun.delete(input.req.runId);
|
|
732
|
+
}
|
|
733
|
+
});
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
async function drainRunnerPerfTraces(req: RunRequest): Promise<void> {
|
|
737
|
+
const pending = pendingTraceForwardsByRun.get(req.runId);
|
|
738
|
+
if (!pending) return;
|
|
739
|
+
await Promise.race([
|
|
740
|
+
pending,
|
|
741
|
+
new Promise((resolve) => setTimeout(resolve, TRACE_FLUSH_MS)),
|
|
742
|
+
]);
|
|
708
743
|
}
|
|
709
744
|
|
|
710
745
|
function makeRequestId(): string {
|
|
@@ -1031,10 +1066,18 @@ async function executeToolWithLifecycle(
|
|
|
1031
1066
|
args: { id: string; toolId: string; input: Record<string, unknown> },
|
|
1032
1067
|
workflowStep: WorkflowStep | undefined,
|
|
1033
1068
|
callbacks: WorkerCtxCallbacks | undefined,
|
|
1069
|
+
onProviderBackpressure?: (retryAfterMs: number) => void,
|
|
1070
|
+
onRetryAttempt?: () => void,
|
|
1034
1071
|
): Promise<ToolExecuteResult> {
|
|
1035
1072
|
callbacks?.onToolCalled?.(args.toolId, nowMs());
|
|
1036
1073
|
try {
|
|
1037
|
-
return await executeTool(
|
|
1074
|
+
return await executeTool(
|
|
1075
|
+
req,
|
|
1076
|
+
args,
|
|
1077
|
+
workflowStep,
|
|
1078
|
+
onProviderBackpressure,
|
|
1079
|
+
onRetryAttempt,
|
|
1080
|
+
);
|
|
1038
1081
|
} catch (error) {
|
|
1039
1082
|
callbacks?.onToolFailed?.(args.toolId, nowMs());
|
|
1040
1083
|
throw error;
|
|
@@ -1178,17 +1221,38 @@ async function callToolDirect(
|
|
|
1178
1221
|
attempt <= WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS;
|
|
1179
1222
|
attempt += 1
|
|
1180
1223
|
) {
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1224
|
+
let res: Response;
|
|
1225
|
+
try {
|
|
1226
|
+
res = await fetchRuntimeApi(req.baseUrl, path, {
|
|
1227
|
+
method: 'POST',
|
|
1228
|
+
headers: {
|
|
1229
|
+
'content-type': 'application/json',
|
|
1230
|
+
authorization: `Bearer ${req.executorToken}`,
|
|
1231
|
+
'x-deepline-request-id': `${req.runId}:${toolId}:${id}:attempt:${attempt}`,
|
|
1232
|
+
[EXECUTE_RESPONSE_CONTRACT_HEADER]: V2_EXECUTE_RESPONSE_CONTRACT,
|
|
1233
|
+
[EXECUTE_TOOL_METADATA_HEADER]: 'true',
|
|
1234
|
+
},
|
|
1235
|
+
body: JSON.stringify({ payload: input }),
|
|
1236
|
+
});
|
|
1237
|
+
} catch (error) {
|
|
1238
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1239
|
+
lastError = new Error(
|
|
1240
|
+
`Tool ${toolId} transport failed calling ${path} for run ${req.runId} on attempt ${attempt}/${WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS}: ${message}`,
|
|
1241
|
+
);
|
|
1242
|
+
if (
|
|
1243
|
+
attempt >= WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS ||
|
|
1244
|
+
!isRetryableRuntimeApiError(error)
|
|
1245
|
+
) {
|
|
1246
|
+
throw lastError;
|
|
1247
|
+
}
|
|
1248
|
+
onRetryAttempt?.();
|
|
1249
|
+
const delayMs = WORKER_TOOL_TRANSPORT_RETRY_DELAY_MS * attempt;
|
|
1250
|
+
console.warn(
|
|
1251
|
+
`[deepline-run:${req.runId}] tool transport retry tool=${toolId} path=${path} attempt=${attempt}/${WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS} retryAfterMs=${delayMs} error=${redactSecretsFromLogString(message)}`,
|
|
1252
|
+
);
|
|
1253
|
+
await sleepWorkerMs(delayMs);
|
|
1254
|
+
continue;
|
|
1255
|
+
}
|
|
1192
1256
|
if (res.ok) {
|
|
1193
1257
|
const body = (await res.json()) as Record<string, unknown>;
|
|
1194
1258
|
const parsed = parseToolExecuteResponse(toolId, body);
|
|
@@ -1423,6 +1487,8 @@ const WORKER_RETRY_SAFE_5XX_TOOLS = new Set(['test_transient_500']);
|
|
|
1423
1487
|
* retry budget, so a runaway storm stays bounded and loud.
|
|
1424
1488
|
*/
|
|
1425
1489
|
const WORKER_TOOL_RATE_LIMIT_MAX_ATTEMPTS = 8;
|
|
1490
|
+
const WORKER_TOOL_TRANSPORT_MAX_ATTEMPTS = 3;
|
|
1491
|
+
const WORKER_TOOL_TRANSPORT_RETRY_DELAY_MS = 1_000;
|
|
1426
1492
|
|
|
1427
1493
|
function sleepWorkerMs(ms: number): Promise<void> {
|
|
1428
1494
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
@@ -1442,6 +1508,7 @@ class WorkerToolBatchScheduler {
|
|
|
1442
1508
|
private readonly resolvePacing: WorkerPacingResolver,
|
|
1443
1509
|
private readonly abortSignal?: AbortSignal,
|
|
1444
1510
|
private readonly onRequestsSettled?: (count: number) => void,
|
|
1511
|
+
private readonly callbacks?: WorkerCtxCallbacks,
|
|
1445
1512
|
) {}
|
|
1446
1513
|
|
|
1447
1514
|
/**
|
|
@@ -1508,7 +1575,7 @@ class WorkerToolBatchScheduler {
|
|
|
1508
1575
|
return this.queue.some(
|
|
1509
1576
|
(request) =>
|
|
1510
1577
|
request.toolId !== 'test_wait_for_event' &&
|
|
1511
|
-
|
|
1578
|
+
getPlayRuntimeBatchStrategy(request.toolId) !== null,
|
|
1512
1579
|
);
|
|
1513
1580
|
}
|
|
1514
1581
|
|
|
@@ -1539,8 +1606,8 @@ class WorkerToolBatchScheduler {
|
|
|
1539
1606
|
toolId: string,
|
|
1540
1607
|
requests: WorkerToolBatchRequest[],
|
|
1541
1608
|
): Promise<void> {
|
|
1542
|
-
const strategy =
|
|
1543
|
-
if (!strategy || toolId === 'test_wait_for_event') {
|
|
1609
|
+
const strategy = getPlayRuntimeBatchStrategy(toolId);
|
|
1610
|
+
if (!strategy || toolId === 'test_wait_for_event' || requests.length < 2) {
|
|
1544
1611
|
const groupStartedAt = nowMs();
|
|
1545
1612
|
await Promise.all(
|
|
1546
1613
|
requests.map(async (request) => {
|
|
@@ -1552,10 +1619,11 @@ class WorkerToolBatchScheduler {
|
|
|
1552
1619
|
});
|
|
1553
1620
|
try {
|
|
1554
1621
|
request.resolve(
|
|
1555
|
-
await
|
|
1622
|
+
await executeToolWithLifecycle(
|
|
1556
1623
|
this.req,
|
|
1557
1624
|
{ id: request.id, toolId, input: request.input },
|
|
1558
1625
|
request.workflowStep,
|
|
1626
|
+
this.callbacks,
|
|
1559
1627
|
(retryAfterMs) => this.reportBackpressure(toolId, retryAfterMs),
|
|
1560
1628
|
() => this.governor.chargeBudget('retry'),
|
|
1561
1629
|
),
|
|
@@ -1591,6 +1659,7 @@ class WorkerToolBatchScheduler {
|
|
|
1591
1659
|
reportBackpressure: (retryAfterMs) =>
|
|
1592
1660
|
this.reportBackpressure(toolId, retryAfterMs),
|
|
1593
1661
|
onRequestsSettled: this.onRequestsSettled,
|
|
1662
|
+
callbacks: this.callbacks,
|
|
1594
1663
|
});
|
|
1595
1664
|
recordRunnerPerfTrace({
|
|
1596
1665
|
req: this.req,
|
|
@@ -1625,6 +1694,7 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1625
1694
|
abortSignal?: AbortSignal;
|
|
1626
1695
|
reportBackpressure: (retryAfterMs: number) => void;
|
|
1627
1696
|
onRequestsSettled?: (count: number) => void;
|
|
1697
|
+
callbacks?: WorkerCtxCallbacks;
|
|
1628
1698
|
}): Promise<void> {
|
|
1629
1699
|
const compiledBatches = compileRequestsWithStrategy({
|
|
1630
1700
|
requests: input.requests,
|
|
@@ -1659,6 +1729,7 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1659
1729
|
signal: input.abortSignal,
|
|
1660
1730
|
});
|
|
1661
1731
|
try {
|
|
1732
|
+
input.callbacks?.onToolCalled?.(batch.batchOperation, nowMs());
|
|
1662
1733
|
return await executeTool(
|
|
1663
1734
|
input.req,
|
|
1664
1735
|
{
|
|
@@ -1670,6 +1741,9 @@ async function executeBatchedWorkerToolGroup(input: {
|
|
|
1670
1741
|
input.reportBackpressure,
|
|
1671
1742
|
() => input.governor.chargeBudget('retry'),
|
|
1672
1743
|
);
|
|
1744
|
+
} catch (error) {
|
|
1745
|
+
input.callbacks?.onToolFailed?.(batch.batchOperation, nowMs());
|
|
1746
|
+
throw error;
|
|
1673
1747
|
} finally {
|
|
1674
1748
|
slot.release();
|
|
1675
1749
|
}
|
|
@@ -2276,6 +2350,46 @@ function parseFetchJsonOrNull(bodyText: string): unknown | null {
|
|
|
2276
2350
|
}
|
|
2277
2351
|
}
|
|
2278
2352
|
|
|
2353
|
+
async function safeWorkerPublicFetch(
|
|
2354
|
+
input: string | URL,
|
|
2355
|
+
init: RequestInit,
|
|
2356
|
+
options: {
|
|
2357
|
+
allowedOrigins: Iterable<string>;
|
|
2358
|
+
sensitiveHeaders: Iterable<string>;
|
|
2359
|
+
},
|
|
2360
|
+
): Promise<Response> {
|
|
2361
|
+
const allowedOrigins = new Set(options.allowedOrigins);
|
|
2362
|
+
return safePublicFetch(input, init, {
|
|
2363
|
+
sensitiveHeaders: options.sensitiveHeaders,
|
|
2364
|
+
fetchImpl: async (nextInput, nextInit) => {
|
|
2365
|
+
const url = assertPublicHttpUrl(nextInput);
|
|
2366
|
+
if (
|
|
2367
|
+
!isIpAddressLiteral(url.hostname) &&
|
|
2368
|
+
!allowedOrigins.has(url.origin)
|
|
2369
|
+
) {
|
|
2370
|
+
throw new UnsafeOutboundUrlError(
|
|
2371
|
+
'workers_edge ctx.fetch requires a public IP literal target or Deepline runtime origin. Use a Deepline integration tool for other hostname URLs.',
|
|
2372
|
+
);
|
|
2373
|
+
}
|
|
2374
|
+
return fetch(url, nextInit);
|
|
2375
|
+
},
|
|
2376
|
+
});
|
|
2377
|
+
}
|
|
2378
|
+
|
|
2379
|
+
function normalizeAllowedWorkerFetchOrigin(rawUrl: string): string | null {
|
|
2380
|
+
try {
|
|
2381
|
+
return assertPublicHttpUrl(rawUrl).origin;
|
|
2382
|
+
} catch {
|
|
2383
|
+
return null;
|
|
2384
|
+
}
|
|
2385
|
+
}
|
|
2386
|
+
|
|
2387
|
+
function getAllowedWorkerFetchOrigins(req: RunRequest): string[] {
|
|
2388
|
+
return [req.baseUrl, req.callbackUrl]
|
|
2389
|
+
.map(normalizeAllowedWorkerFetchOrigin)
|
|
2390
|
+
.filter((origin): origin is string => origin !== null);
|
|
2391
|
+
}
|
|
2392
|
+
|
|
2279
2393
|
// ---------------------------------------------------------------------------
|
|
2280
2394
|
// Streaming CSV parser. Pipes a `ReadableStream<Uint8Array>` from R2 through
|
|
2281
2395
|
// a TextDecoder + line buffer + RFC-4180-ish state machine, yielding chunks
|
|
@@ -3248,7 +3362,8 @@ function createMinimalWorkerCtx(
|
|
|
3248
3362
|
const executeWithRuntimeReceipt = async <T>(
|
|
3249
3363
|
key: string,
|
|
3250
3364
|
execute: () => Promise<T> | T,
|
|
3251
|
-
repairRunningReceiptForSameRun =
|
|
3365
|
+
repairRunningReceiptForSameRun = true,
|
|
3366
|
+
reclaimRunning = false,
|
|
3252
3367
|
): Promise<T> => {
|
|
3253
3368
|
const serialized = await runWorkerRuntimeReceiptBoundary<unknown>({
|
|
3254
3369
|
orgId: req.orgId,
|
|
@@ -3258,6 +3373,7 @@ function createMinimalWorkerCtx(
|
|
|
3258
3373
|
receiptStore,
|
|
3259
3374
|
execute: async () => serializeDurableStepValue(await execute()),
|
|
3260
3375
|
repairRunningReceiptForSameRun,
|
|
3376
|
+
reclaimRunning,
|
|
3261
3377
|
});
|
|
3262
3378
|
return deserializeDurableStepValue(serialized) as T;
|
|
3263
3379
|
};
|
|
@@ -3279,7 +3395,7 @@ function createMinimalWorkerCtx(
|
|
|
3279
3395
|
)(name, async () => serializeDurableStepValue(await execute()));
|
|
3280
3396
|
return deserializeDurableStepValue(serialized) as T;
|
|
3281
3397
|
},
|
|
3282
|
-
|
|
3398
|
+
false,
|
|
3283
3399
|
);
|
|
3284
3400
|
};
|
|
3285
3401
|
const nextCtxStepReceiptKey = (name: string): string => {
|
|
@@ -3300,6 +3416,14 @@ function createMinimalWorkerCtx(
|
|
|
3300
3416
|
}
|
|
3301
3417
|
return `:stale:${staleAfterSeconds}:${Math.floor(nowMs() / (staleAfterSeconds * 1000))}`;
|
|
3302
3418
|
};
|
|
3419
|
+
const rootToolBatchScheduler = new WorkerToolBatchScheduler(
|
|
3420
|
+
req,
|
|
3421
|
+
governor,
|
|
3422
|
+
resolveToolPacing,
|
|
3423
|
+
abortSignal,
|
|
3424
|
+
undefined,
|
|
3425
|
+
callbacks,
|
|
3426
|
+
);
|
|
3303
3427
|
// Local ancestry chain that always ENDS with the currently-executing play
|
|
3304
3428
|
// (req.playName). The /api/v2/plays/run lineage validator requires the
|
|
3305
3429
|
// submitted ancestry's tail to equal the executor token's play name (i.e.
|
|
@@ -3746,10 +3870,8 @@ function createMinimalWorkerCtx(
|
|
|
3746
3870
|
reportExecutionHeartbeat(false);
|
|
3747
3871
|
const entry = uniqueRowsToExecuteEntries[myIndex]!;
|
|
3748
3872
|
const pendingRow = pendingRowsByKey.get(entry.rowKey);
|
|
3749
|
-
const row = runtimeCsvExecutionRow(
|
|
3750
|
-
|
|
3751
|
-
pendingRow,
|
|
3752
|
-
) as T & Record<string, unknown>;
|
|
3873
|
+
const row = runtimeCsvExecutionRow(entry.row, pendingRow) as T &
|
|
3874
|
+
Record<string, unknown>;
|
|
3753
3875
|
const absoluteIndex = entry.absoluteIndex;
|
|
3754
3876
|
const enriched: Record<string, unknown> =
|
|
3755
3877
|
cloneCsvAliasedRow(row);
|
|
@@ -4753,7 +4875,13 @@ function createMinimalWorkerCtx(
|
|
|
4753
4875
|
toolId: request.toolId,
|
|
4754
4876
|
requestInput: request.input,
|
|
4755
4877
|
})}${staleRuntimeSuffix(request.staleAfterSeconds)}`,
|
|
4756
|
-
() =>
|
|
4878
|
+
() =>
|
|
4879
|
+
rootToolBatchScheduler.execute(
|
|
4880
|
+
request.id,
|
|
4881
|
+
request.toolId,
|
|
4882
|
+
request.input,
|
|
4883
|
+
workflowStep,
|
|
4884
|
+
),
|
|
4757
4885
|
);
|
|
4758
4886
|
},
|
|
4759
4887
|
},
|
|
@@ -5214,7 +5342,10 @@ function createMinimalWorkerCtx(
|
|
|
5214
5342
|
};
|
|
5215
5343
|
const fetchInit = { ...init, headers };
|
|
5216
5344
|
delete fetchInit.auth;
|
|
5217
|
-
const response = await
|
|
5345
|
+
const response = await safeWorkerPublicFetch(url, fetchInit, {
|
|
5346
|
+
allowedOrigins: getAllowedWorkerFetchOrigins(req),
|
|
5347
|
+
sensitiveHeaders: Object.keys(secretHeaderMarkers),
|
|
5348
|
+
});
|
|
5218
5349
|
assertNotAborted(abortSignal);
|
|
5219
5350
|
const bodyText = await response.text();
|
|
5220
5351
|
const redactedBodyText = secretRedactor.redactString(bodyText);
|
|
@@ -5500,12 +5631,10 @@ async function executeRunRequest(
|
|
|
5500
5631
|
let runLogBuffer: string[] = [];
|
|
5501
5632
|
let pendingRunLogLines: string[] = [];
|
|
5502
5633
|
// Monotonic count of every line ever appended to this run's worker log
|
|
5503
|
-
// channel. runLogBuffer
|
|
5504
|
-
//
|
|
5505
|
-
//
|
|
5506
|
-
//
|
|
5507
|
-
// ingestion skips re-sent prefixes positionally (exactly-once, repeated
|
|
5508
|
-
// identical lines preserved) instead of text-deduping.
|
|
5634
|
+
// channel. runLogBuffer is only the rotating live/coordinator transport
|
|
5635
|
+
// cache; pendingRunLogLines is the durable unsent suffix and must not rotate,
|
|
5636
|
+
// otherwise a flush already in flight can let fresh lines fall out before
|
|
5637
|
+
// Run Log Stream ingestion ever sees them.
|
|
5509
5638
|
let totalEmittedLogLines = 0;
|
|
5510
5639
|
let stepProgressByNodeId: LiveNodeProgressMap = {};
|
|
5511
5640
|
let dirtyProgressNodeIds = new Set<string>();
|
|
@@ -5531,9 +5660,7 @@ async function executeRunRequest(
|
|
|
5531
5660
|
if (!trimmed) return;
|
|
5532
5661
|
totalEmittedLogLines += 1;
|
|
5533
5662
|
runLogBuffer = [...runLogBuffer, trimmed].slice(-RUN_LOG_BUFFER_LIMIT);
|
|
5534
|
-
pendingRunLogLines = [...pendingRunLogLines, trimmed]
|
|
5535
|
-
-RUN_LOG_BUFFER_LIMIT,
|
|
5536
|
-
);
|
|
5663
|
+
pendingRunLogLines = [...pendingRunLogLines, trimmed];
|
|
5537
5664
|
};
|
|
5538
5665
|
|
|
5539
5666
|
const updateStepProgress = (input: {
|
|
@@ -5722,9 +5849,7 @@ async function executeRunRequest(
|
|
|
5722
5849
|
lines: pendingRunLogLines,
|
|
5723
5850
|
// Positional cursor: pendingRunLogLines always holds the LAST
|
|
5724
5851
|
// pending lines emitted on this channel, so the offset of its first
|
|
5725
|
-
// line is total-emitted minus pending length.
|
|
5726
|
-
// terminal full-buffer re-send (pending = runLogBuffer), which
|
|
5727
|
-
// ingestion then skips positionally instead of via text dedupe.
|
|
5852
|
+
// line is total-emitted minus pending length.
|
|
5728
5853
|
channelOffset: totalEmittedLogLines - pendingRunLogLines.length,
|
|
5729
5854
|
});
|
|
5730
5855
|
pendingRunLogLines = [];
|
|
@@ -5820,18 +5945,13 @@ async function executeRunRequest(
|
|
|
5820
5945
|
terminalEvent: PlayRunLedgerEvent,
|
|
5821
5946
|
): Promise<void> => {
|
|
5822
5947
|
if (!options?.persistResultDatasets) return;
|
|
5948
|
+
await ledgerFlushInFlight;
|
|
5823
5949
|
const now = nowMs();
|
|
5824
|
-
// Terminal re-send of the full retained buffer. drainPendingLedgerEvents
|
|
5825
|
-
// stamps it with channelOffset = totalEmitted - buffer length, so Run Log
|
|
5826
|
-
// Stream ingestion drops the already-ingested prefix positionally.
|
|
5827
|
-
pendingRunLogLines = runLogBuffer;
|
|
5828
5950
|
dirtyProgressNodeIds = new Set([
|
|
5829
5951
|
...dirtyProgressNodeIds,
|
|
5830
5952
|
...Object.keys(stepProgressByNodeId),
|
|
5831
5953
|
]);
|
|
5832
|
-
|
|
5833
|
-
await ledgerFlushInFlight;
|
|
5834
|
-
const events = drainPendingLedgerEvents(now);
|
|
5954
|
+
const events = [...drainPendingLedgerEvents(now), terminalEvent];
|
|
5835
5955
|
if (events.length === 0) return;
|
|
5836
5956
|
try {
|
|
5837
5957
|
await postRuntimeApi(req.baseUrl, req.executorToken, {
|
|
@@ -5871,7 +5991,17 @@ async function executeRunRequest(
|
|
|
5871
5991
|
onToolFailed: (toolId, at) => stepLifecycle?.onToolFailed(toolId, at),
|
|
5872
5992
|
};
|
|
5873
5993
|
|
|
5994
|
+
let hasEmittedRunnerEvent = false;
|
|
5874
5995
|
const wrappedEmit = (event: RunnerEvent) => {
|
|
5996
|
+
if (!hasEmittedRunnerEvent) {
|
|
5997
|
+
hasEmittedRunnerEvent = true;
|
|
5998
|
+
recordRunnerPerfTrace({
|
|
5999
|
+
req,
|
|
6000
|
+
phase: 'first_event',
|
|
6001
|
+
ms: nowMs() - startedAt,
|
|
6002
|
+
extra: { eventType: event.type },
|
|
6003
|
+
});
|
|
6004
|
+
}
|
|
5875
6005
|
if (event.type === 'log') {
|
|
5876
6006
|
appendRunLogLine(event.message);
|
|
5877
6007
|
flushLedgerEvents(false);
|
|
@@ -6042,11 +6172,18 @@ async function executeRunRequest(
|
|
|
6042
6172
|
phase: 'runner.execute_total',
|
|
6043
6173
|
ms: nowMs() - startedAt,
|
|
6044
6174
|
});
|
|
6175
|
+
// The server-side watch path reads coordinator-buffered perf traces from
|
|
6176
|
+
// the same tail response that carries the terminal event. Runner traces are
|
|
6177
|
+
// forwarded asynchronously during execution so normal play latency is not
|
|
6178
|
+
// gated on observability writes; before returning terminal output, wait a
|
|
6179
|
+
// bounded interval for those writes to land. This keeps benchmark exports
|
|
6180
|
+
// able to decompose "terminal wait" into runner/dataset/ledger phases
|
|
6181
|
+
// without turning trace delivery into a correctness dependency.
|
|
6182
|
+
await drainRunnerPerfTraces(req);
|
|
6045
6183
|
return {
|
|
6046
6184
|
playName: req.playName,
|
|
6047
6185
|
result: serializedResult,
|
|
6048
6186
|
outputRows: inferOutputRows(serializedResult),
|
|
6049
|
-
liveLogs: runLogBuffer,
|
|
6050
6187
|
liveNodeProgress: stepProgressSnapshot(),
|
|
6051
6188
|
durationMs: nowMs() - startedAt,
|
|
6052
6189
|
};
|
|
@@ -6070,6 +6207,7 @@ async function executeRunRequest(
|
|
|
6070
6207
|
appendRunLogLine(
|
|
6071
6208
|
`${aborted ? '[cancelled]' : '[error]'} ${redactSecretsFromLogString(message)}`,
|
|
6072
6209
|
);
|
|
6210
|
+
const terminalUpdateStartedAt = nowMs();
|
|
6073
6211
|
await flushTerminalLedgerEvents({
|
|
6074
6212
|
type: aborted ? 'run.cancelled' : 'run.failed',
|
|
6075
6213
|
runId: req.runId,
|
|
@@ -6094,25 +6232,55 @@ async function executeRunRequest(
|
|
|
6094
6232
|
],
|
|
6095
6233
|
},
|
|
6096
6234
|
});
|
|
6235
|
+
recordRunnerPerfTrace({
|
|
6236
|
+
req,
|
|
6237
|
+
phase: aborted
|
|
6238
|
+
? 'runner.terminal_ledger_append_cancelled'
|
|
6239
|
+
: 'runner.terminal_ledger_append_failed',
|
|
6240
|
+
ms: nowMs() - terminalUpdateStartedAt,
|
|
6241
|
+
extra: {
|
|
6242
|
+
errorCode: failure.code,
|
|
6243
|
+
errorPhase: failure.phase,
|
|
6244
|
+
},
|
|
6245
|
+
});
|
|
6246
|
+
const billingStartedAt = nowMs();
|
|
6097
6247
|
await finalizeWorkerComputeBilling({
|
|
6098
6248
|
req,
|
|
6099
6249
|
success: false,
|
|
6100
6250
|
actionEstimate: 4,
|
|
6101
|
-
})
|
|
6102
|
-
|
|
6103
|
-
|
|
6104
|
-
|
|
6105
|
-
|
|
6106
|
-
|
|
6107
|
-
|
|
6108
|
-
|
|
6109
|
-
|
|
6251
|
+
})
|
|
6252
|
+
.catch((finalizeError) => {
|
|
6253
|
+
console.error(
|
|
6254
|
+
`[play-harness] non-fatal compute billing finalize failed runId=${req.runId}: ${
|
|
6255
|
+
finalizeError instanceof Error
|
|
6256
|
+
? finalizeError.message
|
|
6257
|
+
: String(finalizeError)
|
|
6258
|
+
}`,
|
|
6259
|
+
);
|
|
6260
|
+
})
|
|
6261
|
+
.finally(() => {
|
|
6262
|
+
recordRunnerPerfTrace({
|
|
6263
|
+
req,
|
|
6264
|
+
phase: 'runner.compute_billing_finalize_failed',
|
|
6265
|
+
ms: nowMs() - billingStartedAt,
|
|
6266
|
+
});
|
|
6267
|
+
});
|
|
6110
6268
|
}
|
|
6111
6269
|
await signalParentPlayTerminal({
|
|
6112
6270
|
req,
|
|
6113
6271
|
status: aborted ? 'cancelled' : 'failed',
|
|
6114
6272
|
error: message,
|
|
6115
6273
|
}).catch(() => null);
|
|
6274
|
+
recordRunnerPerfTrace({
|
|
6275
|
+
req,
|
|
6276
|
+
phase: aborted ? 'runner.execute_cancelled' : 'runner.execute_failed',
|
|
6277
|
+
ms: nowMs() - startedAt,
|
|
6278
|
+
extra: {
|
|
6279
|
+
errorCode: failure.code,
|
|
6280
|
+
errorPhase: failure.phase,
|
|
6281
|
+
},
|
|
6282
|
+
});
|
|
6283
|
+
await drainRunnerPerfTraces(req);
|
|
6116
6284
|
throw error;
|
|
6117
6285
|
} finally {
|
|
6118
6286
|
clearTimeout(runtimeDeadlineTimer);
|
|
@@ -6541,14 +6709,29 @@ export class TenantWorkflow extends WorkflowEntrypoint<
|
|
|
6541
6709
|
// Must run BEFORE any SDK call site that would reach into HARNESS,
|
|
6542
6710
|
// i.e. before user play code is invoked. Idempotent within a run.
|
|
6543
6711
|
captureHarnessBinding(this.env);
|
|
6712
|
+
recordRunnerPerfTrace({
|
|
6713
|
+
req,
|
|
6714
|
+
phase: 'tenant_workflow_entry',
|
|
6715
|
+
ms: 0,
|
|
6716
|
+
extra: {
|
|
6717
|
+
hasWorkflowStep: true,
|
|
6718
|
+
},
|
|
6719
|
+
});
|
|
6544
6720
|
// Fire the one-time wiring probe (deduplicated across runs in the
|
|
6545
6721
|
// same isolate). Awaited so the result is in the log before user code
|
|
6546
6722
|
// begins. A missing or unhealthy HARNESS fails the run before user code
|
|
6547
6723
|
// can accidentally take a slower fallback path.
|
|
6724
|
+
const probeStartedAt = nowMs();
|
|
6548
6725
|
await probeHarnessOnce(this.env, runPrefix);
|
|
6726
|
+
recordRunnerPerfTrace({
|
|
6727
|
+
req,
|
|
6728
|
+
phase: 'tenant_workflow_probe_harness',
|
|
6729
|
+
ms: nowMs() - probeStartedAt,
|
|
6730
|
+
});
|
|
6549
6731
|
const abortController = new AbortController();
|
|
6550
6732
|
try {
|
|
6551
|
-
|
|
6733
|
+
const executeStartedAt = nowMs();
|
|
6734
|
+
const output = (await executeRunRequest(
|
|
6552
6735
|
req,
|
|
6553
6736
|
this.env,
|
|
6554
6737
|
(runnerEvent) => {
|
|
@@ -6579,6 +6762,12 @@ export class TenantWorkflow extends WorkflowEntrypoint<
|
|
|
6579
6762
|
waitUntil: (promise) => this.ctx.waitUntil(promise),
|
|
6580
6763
|
},
|
|
6581
6764
|
)) as Record<string, unknown>;
|
|
6765
|
+
recordRunnerPerfTrace({
|
|
6766
|
+
req,
|
|
6767
|
+
phase: 'tenant_workflow_execute_request',
|
|
6768
|
+
ms: nowMs() - executeStartedAt,
|
|
6769
|
+
});
|
|
6770
|
+
return output;
|
|
6582
6771
|
} catch (error) {
|
|
6583
6772
|
// CF Workflows + the dynamic-workflows framework swallow the error
|
|
6584
6773
|
// message and surface only "internal error; reference = <id>" via
|
|
@@ -41,22 +41,13 @@ function errorMessage(error: unknown): string {
|
|
|
41
41
|
return error instanceof Error ? error.message : String(error);
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
function runningReceiptError(
|
|
45
|
-
key: string,
|
|
46
|
-
receipt: WorkerRuntimeReceipt,
|
|
47
|
-
): Error {
|
|
48
|
-
return new Error(
|
|
49
|
-
`Runtime receipt ${key} is already running for run ${receipt.runId ?? 'unknown'}.`,
|
|
50
|
-
);
|
|
51
|
-
}
|
|
52
|
-
|
|
53
44
|
async function executeAndPersistReceipt<T>(input: {
|
|
54
45
|
key: string;
|
|
55
46
|
playName: string;
|
|
56
47
|
runId: string;
|
|
57
48
|
execute: () => Promise<T> | T;
|
|
58
49
|
receiptStore: WorkerRuntimeReceiptStore;
|
|
59
|
-
ownership: 'claimed' | '
|
|
50
|
+
ownership: 'claimed' | 'reconciled';
|
|
60
51
|
}): Promise<T> {
|
|
61
52
|
let output: T;
|
|
62
53
|
try {
|
|
@@ -83,9 +74,13 @@ async function executeAndPersistReceipt<T>(input: {
|
|
|
83
74
|
output,
|
|
84
75
|
});
|
|
85
76
|
if (!completed) {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
77
|
+
return output;
|
|
78
|
+
}
|
|
79
|
+
if (
|
|
80
|
+
(completed.status === 'completed' || completed.status === 'skipped') &&
|
|
81
|
+
completed.output !== undefined
|
|
82
|
+
) {
|
|
83
|
+
return receiptOutput<T>(completed);
|
|
89
84
|
}
|
|
90
85
|
return output;
|
|
91
86
|
}
|
|
@@ -94,6 +89,7 @@ export async function runWorkerRuntimeReceiptBoundary<T>(
|
|
|
94
89
|
input: RuntimeReceiptContext & {
|
|
95
90
|
execute: () => Promise<T> | T;
|
|
96
91
|
repairRunningReceiptForSameRun?: boolean;
|
|
92
|
+
reclaimRunning?: boolean;
|
|
97
93
|
},
|
|
98
94
|
): Promise<T> {
|
|
99
95
|
const key = scopedReceiptKey(input);
|
|
@@ -102,25 +98,20 @@ export async function runWorkerRuntimeReceiptBoundary<T>(
|
|
|
102
98
|
playName: input.playName,
|
|
103
99
|
runId: input.runId,
|
|
104
100
|
key,
|
|
101
|
+
...(input.reclaimRunning === true ? { reclaimRunning: true } : {}),
|
|
105
102
|
});
|
|
106
103
|
if (claimed.disposition === 'reused') {
|
|
107
104
|
return receiptOutput<T>(claimed.receipt);
|
|
108
105
|
}
|
|
109
106
|
if (claimed.disposition === 'running') {
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
execute: input.execute,
|
|
119
|
-
receiptStore,
|
|
120
|
-
ownership: 'workflow_replay',
|
|
121
|
-
});
|
|
122
|
-
}
|
|
123
|
-
throw runningReceiptError(key, claimed.receipt);
|
|
107
|
+
return executeAndPersistReceipt({
|
|
108
|
+
key,
|
|
109
|
+
playName: input.playName,
|
|
110
|
+
runId: input.runId,
|
|
111
|
+
execute: input.execute,
|
|
112
|
+
receiptStore,
|
|
113
|
+
ownership: 'reconciled',
|
|
114
|
+
});
|
|
124
115
|
}
|
|
125
116
|
if (claimed.disposition === 'failed') {
|
|
126
117
|
throw new Error(
|