@kbediako/codex-orchestrator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +238 -0
- package/dist/bin/codex-orchestrator.js +507 -0
- package/dist/orchestrator/src/agents/builder.js +16 -0
- package/dist/orchestrator/src/agents/index.js +4 -0
- package/dist/orchestrator/src/agents/planner.js +17 -0
- package/dist/orchestrator/src/agents/reviewer.js +13 -0
- package/dist/orchestrator/src/agents/tester.js +13 -0
- package/dist/orchestrator/src/cli/adapters/CommandBuilder.js +20 -0
- package/dist/orchestrator/src/cli/adapters/CommandPlanner.js +164 -0
- package/dist/orchestrator/src/cli/adapters/CommandReviewer.js +32 -0
- package/dist/orchestrator/src/cli/adapters/CommandTester.js +33 -0
- package/dist/orchestrator/src/cli/adapters/index.js +4 -0
- package/dist/orchestrator/src/cli/config/userConfig.js +28 -0
- package/dist/orchestrator/src/cli/doctor.js +48 -0
- package/dist/orchestrator/src/cli/events/runEvents.js +84 -0
- package/dist/orchestrator/src/cli/exec/command.js +56 -0
- package/dist/orchestrator/src/cli/exec/context.js +108 -0
- package/dist/orchestrator/src/cli/exec/experience.js +77 -0
- package/dist/orchestrator/src/cli/exec/finalization.js +140 -0
- package/dist/orchestrator/src/cli/exec/learning.js +62 -0
- package/dist/orchestrator/src/cli/exec/stageRunner.js +71 -0
- package/dist/orchestrator/src/cli/exec/summary.js +109 -0
- package/dist/orchestrator/src/cli/exec/telemetry.js +18 -0
- package/dist/orchestrator/src/cli/exec/tfgrpo.js +200 -0
- package/dist/orchestrator/src/cli/exec/tfgrpoArtifacts.js +19 -0
- package/dist/orchestrator/src/cli/exec/types.js +1 -0
- package/dist/orchestrator/src/cli/init.js +64 -0
- package/dist/orchestrator/src/cli/mcp.js +124 -0
- package/dist/orchestrator/src/cli/metrics/metricsAggregator.js +404 -0
- package/dist/orchestrator/src/cli/metrics/metricsRecorder.js +138 -0
- package/dist/orchestrator/src/cli/orchestrator.js +554 -0
- package/dist/orchestrator/src/cli/pipelines/defaultDiagnostics.js +32 -0
- package/dist/orchestrator/src/cli/pipelines/designReference.js +72 -0
- package/dist/orchestrator/src/cli/pipelines/hiFiDesignToolkit.js +71 -0
- package/dist/orchestrator/src/cli/pipelines/index.js +34 -0
- package/dist/orchestrator/src/cli/run/environment.js +24 -0
- package/dist/orchestrator/src/cli/run/manifest.js +367 -0
- package/dist/orchestrator/src/cli/run/manifestPersister.js +88 -0
- package/dist/orchestrator/src/cli/run/runPaths.js +30 -0
- package/dist/orchestrator/src/cli/selfCheck.js +12 -0
- package/dist/orchestrator/src/cli/services/commandRunner.js +420 -0
- package/dist/orchestrator/src/cli/services/controlPlaneService.js +107 -0
- package/dist/orchestrator/src/cli/services/execRuntime.js +69 -0
- package/dist/orchestrator/src/cli/services/pipelineResolver.js +47 -0
- package/dist/orchestrator/src/cli/services/runPreparation.js +82 -0
- package/dist/orchestrator/src/cli/services/runSummaryWriter.js +35 -0
- package/dist/orchestrator/src/cli/services/schedulerService.js +42 -0
- package/dist/orchestrator/src/cli/tasks/taskMetadata.js +19 -0
- package/dist/orchestrator/src/cli/telemetry/schema.js +8 -0
- package/dist/orchestrator/src/cli/types.js +1 -0
- package/dist/orchestrator/src/cli/ui/HudApp.js +112 -0
- package/dist/orchestrator/src/cli/ui/controller.js +26 -0
- package/dist/orchestrator/src/cli/ui/store.js +240 -0
- package/dist/orchestrator/src/cli/utils/enforcementMode.js +12 -0
- package/dist/orchestrator/src/cli/utils/fs.js +8 -0
- package/dist/orchestrator/src/cli/utils/interactive.js +25 -0
- package/dist/orchestrator/src/cli/utils/jsonlWriter.js +10 -0
- package/dist/orchestrator/src/cli/utils/optionalDeps.js +30 -0
- package/dist/orchestrator/src/cli/utils/packageInfo.js +25 -0
- package/dist/orchestrator/src/cli/utils/planFormatter.js +49 -0
- package/dist/orchestrator/src/cli/utils/runId.js +7 -0
- package/dist/orchestrator/src/cli/utils/specGuardRunner.js +26 -0
- package/dist/orchestrator/src/cli/utils/strings.js +8 -0
- package/dist/orchestrator/src/cli/utils/time.js +6 -0
- package/dist/orchestrator/src/control-plane/drift-reporter.js +109 -0
- package/dist/orchestrator/src/control-plane/index.js +3 -0
- package/dist/orchestrator/src/control-plane/request-builder.js +217 -0
- package/dist/orchestrator/src/control-plane/types.js +1 -0
- package/dist/orchestrator/src/control-plane/validator.js +50 -0
- package/dist/orchestrator/src/credentials/CredentialBroker.js +1 -0
- package/dist/orchestrator/src/events/EventBus.js +25 -0
- package/dist/orchestrator/src/learning/crystalizer.js +108 -0
- package/dist/orchestrator/src/learning/harvester.js +146 -0
- package/dist/orchestrator/src/learning/manifest.js +56 -0
- package/dist/orchestrator/src/learning/runner.js +177 -0
- package/dist/orchestrator/src/learning/validator.js +164 -0
- package/dist/orchestrator/src/logger.js +20 -0
- package/dist/orchestrator/src/manager.js +388 -0
- package/dist/orchestrator/src/persistence/ArtifactStager.js +95 -0
- package/dist/orchestrator/src/persistence/ExperienceStore.js +210 -0
- package/dist/orchestrator/src/persistence/PersistenceCoordinator.js +65 -0
- package/dist/orchestrator/src/persistence/RunManifestWriter.js +23 -0
- package/dist/orchestrator/src/persistence/TaskStateStore.js +172 -0
- package/dist/orchestrator/src/persistence/identifierGuards.js +1 -0
- package/dist/orchestrator/src/persistence/lockFile.js +26 -0
- package/dist/orchestrator/src/persistence/sanitizeIdentifier.js +26 -0
- package/dist/orchestrator/src/persistence/sanitizeRunId.js +8 -0
- package/dist/orchestrator/src/persistence/sanitizeTaskId.js +8 -0
- package/dist/orchestrator/src/persistence/writeAtomicFile.js +4 -0
- package/dist/orchestrator/src/privacy/guard.js +111 -0
- package/dist/orchestrator/src/scheduler/index.js +1 -0
- package/dist/orchestrator/src/scheduler/plan.js +171 -0
- package/dist/orchestrator/src/scheduler/types.js +1 -0
- package/dist/orchestrator/src/sync/CloudRunsClient.js +1 -0
- package/dist/orchestrator/src/sync/CloudRunsHttpClient.js +82 -0
- package/dist/orchestrator/src/sync/CloudSyncWorker.js +206 -0
- package/dist/orchestrator/src/sync/createCloudSyncWorker.js +15 -0
- package/dist/orchestrator/src/types.js +1 -0
- package/dist/orchestrator/src/utils/atomicWrite.js +15 -0
- package/dist/orchestrator/src/utils/errorMessage.js +14 -0
- package/dist/orchestrator/src/utils/executionMode.js +69 -0
- package/dist/packages/control-plane-schemas/src/index.js +1 -0
- package/dist/packages/control-plane-schemas/src/run-request.js +548 -0
- package/dist/packages/orchestrator/src/exec/handle-service.js +203 -0
- package/dist/packages/orchestrator/src/exec/session-manager.js +147 -0
- package/dist/packages/orchestrator/src/exec/unified-exec.js +432 -0
- package/dist/packages/orchestrator/src/index.js +3 -0
- package/dist/packages/orchestrator/src/instructions/loader.js +101 -0
- package/dist/packages/orchestrator/src/instructions/promptPacks.js +151 -0
- package/dist/packages/orchestrator/src/notifications/index.js +74 -0
- package/dist/packages/orchestrator/src/telemetry/otel-exporter.js +142 -0
- package/dist/packages/orchestrator/src/tool-orchestrator.js +161 -0
- package/dist/packages/sdk-node/src/orchestrator.js +195 -0
- package/dist/packages/shared/config/designConfig.js +495 -0
- package/dist/packages/shared/config/env.js +37 -0
- package/dist/packages/shared/config/index.js +2 -0
- package/dist/packages/shared/design-artifacts/writer.js +221 -0
- package/dist/packages/shared/events/serializer.js +84 -0
- package/dist/packages/shared/events/types.js +1 -0
- package/dist/packages/shared/manifest/artifactUtils.js +36 -0
- package/dist/packages/shared/manifest/designArtifacts.js +665 -0
- package/dist/packages/shared/manifest/fileIO.js +29 -0
- package/dist/packages/shared/manifest/toolRuns.js +78 -0
- package/dist/packages/shared/manifest/toolkitArtifacts.js +223 -0
- package/dist/packages/shared/manifest/types.js +5 -0
- package/dist/packages/shared/manifest/validator.js +73 -0
- package/dist/packages/shared/manifest/writer.js +2 -0
- package/dist/packages/shared/streams/stdio.js +112 -0
- package/dist/scripts/design/pipeline/advanced-assets.js +466 -0
- package/dist/scripts/design/pipeline/componentize.js +74 -0
- package/dist/scripts/design/pipeline/context.js +34 -0
- package/dist/scripts/design/pipeline/extract.js +249 -0
- package/dist/scripts/design/pipeline/optionalDeps.js +107 -0
- package/dist/scripts/design/pipeline/prepare.js +46 -0
- package/dist/scripts/design/pipeline/reference.js +94 -0
- package/dist/scripts/design/pipeline/state.js +206 -0
- package/dist/scripts/design/pipeline/toolkit/common.js +94 -0
- package/dist/scripts/design/pipeline/toolkit/extract.js +258 -0
- package/dist/scripts/design/pipeline/toolkit/publish.js +202 -0
- package/dist/scripts/design/pipeline/toolkit/publishActions.js +12 -0
- package/dist/scripts/design/pipeline/toolkit/reference.js +846 -0
- package/dist/scripts/design/pipeline/toolkit/snapshot.js +882 -0
- package/dist/scripts/design/pipeline/toolkit/tokens.js +456 -0
- package/dist/scripts/design/pipeline/visual-regression.js +137 -0
- package/dist/scripts/design/pipeline/write-artifacts.js +61 -0
- package/package.json +97 -0
- package/schemas/manifest.json +1064 -0
- package/templates/README.md +12 -0
- package/templates/codex/mcp-client.json +8 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { spawn } from 'node:child_process';
|
|
2
|
+
import { existsSync } from 'node:fs';
|
|
3
|
+
import { resolve } from 'node:path';
|
|
4
|
+
import process from 'node:process';
|
|
5
|
+
import { logger } from '../logger.js';
|
|
6
|
+
// Marker that attachMcpStdoutGuard searches for to find the start of a framed message header.
const MCP_HEADER_TOKEN = 'Content-Length:';
// Two consecutive CRLF pairs: the sequence attachMcpStdoutGuard treats as the end of a header.
const MCP_HEADER_DELIMITER = '\r\n'.repeat(2);
|
|
8
|
+
/**
 * Run `codex mcp-server` against a repository and relay its output.
 *
 * The child inherits stdin. Its stdout is either filtered through
 * attachMcpStdoutGuard (strict mode) or piped straight to this process's
 * stdout; its stderr is always piped to this process's stderr.
 *
 * @param {object} options
 * @param {string} [options.repoRoot] Repository directory; defaults to the current working directory.
 * @param {boolean} [options.dryRun] When truthy, log the resolved root and return without spawning.
 * @param {string[]} [options.extraArgs] Extra arguments appended after `mcp-server`.
 * @returns {Promise<void>} Settles once the child emits `exit` or `error`. The outcome is
 *   reported through `process.exitCode`, never through rejection.
 * @throws {Error} If the resolved repository root does not exist.
 */
export async function serveMcp(options) {
    const repoRoot = resolve(options.repoRoot ?? process.cwd());
    if (!existsSync(repoRoot)) {
        throw new Error(`Repository root not found: ${repoRoot}`);
    }
    if (options.dryRun) {
        logger.warn(`[mcp] repo root: ${repoRoot}`);
        logger.warn('[mcp] codex CLI must be available in PATH.');
        return;
    }
    // Spreading `undefined` throws a TypeError, so an omitted extraArgs is treated as "none".
    const extraArgs = options.extraArgs ?? [];
    const args = ['-C', repoRoot, 'mcp-server', ...extraArgs];
    const child = spawn('codex', args, { stdio: ['inherit', 'pipe', 'pipe'] });
    if (child.stdout) {
        if (isStrictMcpStdout()) {
            attachMcpStdoutGuard(child.stdout);
        }
        else {
            child.stdout.pipe(process.stdout);
        }
    }
    if (child.stderr) {
        child.stderr.pipe(process.stderr);
    }
    await new Promise((resolvePromise) => {
        child.once('exit', (code) => {
            // `code` is not a number when the child was terminated by a signal; report that as failure.
            process.exitCode = typeof code === 'number' ? code : 1;
            resolvePromise();
        });
        child.once('error', (error) => {
            logger.error(error?.message ?? String(error));
            process.exitCode = 1;
            resolvePromise();
        });
    });
}
|
|
48
|
+
/**
 * Decide whether the child's stdout should go through the framing guard.
 *
 * Reads the CODEX_MCP_STDIO_STRICT environment variable. An unset or empty
 * value means strict; otherwise only `1`, `true` or `yes` (case-insensitive,
 * surrounding whitespace ignored) enable strict mode.
 *
 * @returns {boolean} True when strict stdout handling is enabled.
 */
function isStrictMcpStdout() {
    const flag = process.env.CODEX_MCP_STDIO_STRICT;
    if (!flag) {
        return true;
    }
    const enabledValues = ['1', 'true', 'yes'];
    return enabledValues.includes(flag.trim().toLowerCase());
}
|
|
56
|
+
/**
 * Separate framed messages from stray output on a child's stdout.
 *
 * Incoming bytes are buffered and scanned. A header that contains a valid
 * `Content-Length: <n>` and ends with MCP_HEADER_DELIMITER is forwarded to
 * this process's stdout together with the following `n` body bytes.
 * Everything else (text before a header, complete lines with no header,
 * headers without a usable length) is written to stderr instead.
 *
 * When the stream ends, any bytes still buffered (for example a log line
 * with no trailing newline, or a truncated frame) are flushed to stderr so
 * they are not silently lost.
 *
 * @param {import('node:stream').Readable} stream Readable to consume, e.g. `child.stdout`.
 * @returns {void}
 */
function attachMcpStdoutGuard(stream) {
    let buffer = Buffer.alloc(0);
    // Body byte count of the frame currently being forwarded, or null when scanning for a header.
    let expectedLength = null;
    const flushLog = (data) => {
        if (data.length > 0) {
            process.stderr.write(data);
        }
    };
    const processBuffer = () => {
        while (buffer.length > 0) {
            if (expectedLength !== null) {
                if (buffer.length < expectedLength) {
                    // Body not fully received yet; wait for more data.
                    return;
                }
                const body = buffer.subarray(0, expectedLength);
                buffer = buffer.subarray(expectedLength);
                process.stdout.write(body);
                expectedLength = null;
                continue;
            }
            const headerIndex = buffer.indexOf(MCP_HEADER_TOKEN);
            if (headerIndex > 0) {
                // Bytes ahead of the header token are not part of a frame.
                flushLog(buffer.subarray(0, headerIndex));
                buffer = buffer.subarray(headerIndex);
                continue;
            }
            if (headerIndex === -1) {
                const newlineIndex = buffer.indexOf('\n');
                if (newlineIndex !== -1) {
                    flushLog(buffer.subarray(0, newlineIndex + 1));
                    buffer = buffer.subarray(newlineIndex + 1);
                    continue;
                }
                // Incomplete line: keep it, it may be the start of a header token.
                return;
            }
            const headerEnd = buffer.indexOf(MCP_HEADER_DELIMITER);
            if (headerEnd === -1) {
                return;
            }
            const headerSize = headerEnd + MCP_HEADER_DELIMITER.length;
            const headerBytes = buffer.subarray(0, headerSize);
            const match = headerBytes.toString('utf8').match(/Content-Length:\s*(\d+)/i);
            const length = match ? Number(match[1]) : NaN;
            buffer = buffer.subarray(headerSize);
            if (!Number.isFinite(length) || length < 0) {
                // Header without a usable length: treat it as log output.
                flushLog(headerBytes);
                continue;
            }
            expectedLength = length;
            process.stdout.write(headerBytes);
        }
    };
    stream.on('data', (chunk) => {
        buffer = Buffer.concat([buffer, Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)]);
        processBuffer();
    });
    stream.on('end', () => {
        // Nothing more will arrive, so whatever is left can never complete a frame.
        flushLog(buffer);
        buffer = Buffer.alloc(0);
        expectedLength = null;
    });
    stream.on('error', (error) => {
        logger.error(error?.message ?? String(error));
    });
    if (typeof stream.resume === 'function') {
        stream.resume();
    }
}
|
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
import { appendFile, mkdir, readFile, readdir, rm, stat, writeFile } from 'node:fs/promises';
|
|
2
|
+
import { dirname, join } from 'node:path';
|
|
3
|
+
import { acquireLockWithRetry } from '../../persistence/lockFile.js';
|
|
4
|
+
// Entry fields that writeCompleteness checks on every metrics entry.
const REQUIRED_COMPLETENESS_FIELDS = ['instance_stats', 'privacy_events', 'control_plane_status'];
// Names of the lock file and pending-entries directory inside a task's metrics root.
const METRICS_LOCK_FILENAME = 'metrics.lock';
const METRICS_PENDING_DIRNAME = 'metrics.pending';
// Five minutes in milliseconds: default age after which a lock (or pending .tmp file) counts as stale.
const DEFAULT_LOCK_STALE_MS = 300_000;
// Default retry settings handed to acquireLockWithRetry; callers may override individual keys.
const DEFAULT_LOCK_RETRY = { maxAttempts: 4, initialDelayMs: 50, backoffFactor: 2, maxDelayMs: 200 };
|
|
18
|
+
/**
 * Directory that holds all metrics files for one task.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @returns {string} `<runsRoot>/<taskId>`.
 */
function getMetricsRoot(env) {
    const { runsRoot, taskId } = env;
    return join(runsRoot, taskId);
}
|
|
21
|
+
/**
 * Location of the task's `metrics.json` file inside its metrics root.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @returns {string}
 */
function getMetricsPath(env) {
    const metricsRoot = getMetricsRoot(env);
    return join(metricsRoot, 'metrics.json');
}
|
|
24
|
+
/**
 * Directory inside the task's metrics root where pending entry files are collected.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @returns {string}
 */
function getMetricsPendingDir(env) {
    const metricsRoot = getMetricsRoot(env);
    return join(metricsRoot, METRICS_PENDING_DIRNAME);
}
|
|
27
|
+
/**
 * Path of the lock file inside the task's metrics root.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @returns {string}
 */
function getMetricsLockPath(env) {
    const metricsRoot = getMetricsRoot(env);
    return join(metricsRoot, METRICS_LOCK_FILENAME);
}
|
|
30
|
+
/**
 * Delete the lock file if its modification time is older than `staleMs`.
 *
 * @param {string} lockPath Path of the lock file.
 * @param {number} staleMs Maximum tolerated age in milliseconds; values <= 0 disable the cleanup.
 * @returns {Promise<boolean>} True if the lock was removed; false if cleanup is disabled,
 *   the lock does not exist, or it is not older than `staleMs`.
 * @throws Any filesystem error other than ENOENT.
 */
async function cleanupStaleMetricsLock(lockPath, staleMs) {
    if (staleMs <= 0) {
        return false;
    }
    try {
        const { mtimeMs } = await stat(lockPath);
        const lockAgeMs = Date.now() - mtimeMs;
        const keepLock = !Number.isFinite(lockAgeMs) || lockAgeMs <= staleMs;
        if (keepLock) {
            return false;
        }
        await rm(lockPath, { force: true });
        return true;
    }
    catch (error) {
        if (error.code !== 'ENOENT') {
            throw error;
        }
        // The lock vanished in the meantime; there is nothing to clean up.
        return false;
    }
}
|
|
50
|
+
/**
 * Raised when the metrics lock for a task cannot be acquired.
 * Carries the id of the task whose lock was requested.
 */
class MetricsLockError extends Error {
    taskId;
    name = 'MetricsLockError';
    /**
     * @param {string} message Human-readable description.
     * @param {string} taskId Task whose metrics lock could not be taken.
     */
    constructor(message, taskId) {
        super(message);
        this.taskId = taskId;
    }
}
|
|
58
|
+
/**
 * Move the lines of one pending entry file into the task's metrics file.
 *
 * The file is read, its non-empty lines are appended to the metrics file
 * (creating the metrics root if needed), and the pending file is deleted.
 * A pending file with no non-empty lines is simply deleted.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @param {string} path Pending entry file to drain.
 * @returns {Promise<number>} Number of lines appended; 0 if the file is missing or empty.
 * @throws Any read error other than ENOENT, and any append/remove error.
 */
async function drainMetricsEntryFile(env, path) {
    let contents;
    try {
        contents = await readFile(path, 'utf8');
    }
    catch (error) {
        if (error.code !== 'ENOENT') {
            throw error;
        }
        return 0;
    }
    const records = contents
        .trim()
        .split('\n')
        .filter((line) => line.length > 0);
    if (records.length > 0) {
        await mkdir(getMetricsRoot(env), { recursive: true });
        await appendFile(getMetricsPath(env), `${records.join('\n')}\n`, 'utf8');
    }
    // The pending file is removed whether or not it contributed any lines.
    await rm(path, { force: true });
    return records.length;
}
|
|
80
|
+
/**
 * Fold all pending `.jsonl` entry files into the task's metrics file.
 *
 * Makes at most two passes over the pending directory. Each pass first
 * deletes `.tmp` files whose modification time is older than
 * DEFAULT_LOCK_STALE_MS, then drains the `.jsonl` files found in that
 * listing in sorted name order. The loop stops early when a pass finds no
 * `.jsonl` files.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @returns {Promise<number>} Total number of lines merged; 0 if the pending directory does not exist.
 * @throws Any filesystem error other than ENOENT.
 */
export async function mergePendingMetricsEntries(env) {
    const pendingDir = getMetricsPendingDir(env);
    const maxPasses = 2;
    let mergedCount = 0;
    for (let pass = 0; pass < maxPasses; pass += 1) {
        let dirents;
        try {
            dirents = await readdir(pendingDir, { withFileTypes: true });
        }
        catch (error) {
            if (error.code === 'ENOENT') {
                return mergedCount;
            }
            throw error;
        }
        const passStartedAt = Date.now();
        const tmpNames = dirents
            .filter((dirent) => dirent.isFile() && dirent.name.endsWith('.tmp'))
            .map((dirent) => dirent.name);
        for (const tmpName of tmpNames) {
            const tmpPath = join(pendingDir, tmpName);
            try {
                const { mtimeMs } = await stat(tmpPath);
                if (passStartedAt - mtimeMs > DEFAULT_LOCK_STALE_MS) {
                    await rm(tmpPath, { force: true });
                }
            }
            catch (error) {
                // A temp file that vanished between listing and stat is not an error.
                if (error.code !== 'ENOENT') {
                    throw error;
                }
            }
        }
        const pendingNames = dirents
            .filter((dirent) => dirent.isFile() && dirent.name.endsWith('.jsonl'))
            .map((dirent) => dirent.name)
            .sort();
        if (pendingNames.length === 0) {
            break;
        }
        for (const pendingName of pendingNames) {
            const drained = await drainMetricsEntryFile(env, join(pendingDir, pendingName));
            mergedCount += drained;
        }
    }
    return mergedCount;
}
|
|
126
|
+
/**
 * Run `action` while holding the task's metrics lock.
 *
 * A stale lock is cleaned up first, then the lock is acquired through
 * acquireLockWithRetry. The lock file is removed once `action` settles,
 * whether it resolves or throws.
 *
 * @param {{ runsRoot: string, taskId: string }} env
 * @param {() => any} action Work to perform under the lock; its result is awaited.
 * @param {object} [options]
 * @param {object} [options.retry] Overrides for DEFAULT_LOCK_RETRY; keys set to undefined are ignored.
 * @param {number} [options.staleMs] Age after which an existing lock is considered stale.
 * @returns {Promise<{ acquired: false } | { acquired: true, result: any }>}
 *   `acquired: false` when acquisition failed with a MetricsLockError.
 * @throws Errors from `action`, and acquisition errors that are not a MetricsLockError.
 */
export async function withMetricsLock(env, action, options = {}) {
    const definedOverrides = Object.fromEntries(Object.entries(options.retry ?? {}).filter((pair) => pair[1] !== undefined));
    const retry = { ...DEFAULT_LOCK_RETRY, ...definedOverrides };
    const lockPath = getMetricsLockPath(env);
    await cleanupStaleMetricsLock(lockPath, options.staleMs ?? DEFAULT_LOCK_STALE_MS);
    const ensureDirectory = async () => {
        await mkdir(dirname(lockPath), { recursive: true });
    };
    const createError = (taskId, attempts) => new MetricsLockError(`Failed to acquire metrics lock for ${taskId} after ${attempts} attempts`, taskId);
    try {
        await acquireLockWithRetry({ taskId: env.taskId, lockPath, retry, ensureDirectory, createError });
    }
    catch (error) {
        if (!(error instanceof MetricsLockError)) {
            throw error;
        }
        // Failing to get the lock is an expected outcome, reported to the caller as a value.
        return { acquired: false };
    }
    try {
        return { acquired: true, result: await action() };
    }
    finally {
        await rm(lockPath, { force: true });
    }
}
|
|
158
|
+
/**
 * Recompute every aggregate file derived from the task's metrics entries.
 *
 * Does nothing when there are no entries. Otherwise it creates
 * `<metrics root>/metrics`, makes sure the baseline file exists, and then
 * writes the remaining aggregates in parallel.
 *
 * @param {{ runsRoot: string, outRoot: string, taskId: string }} env
 * @returns {Promise<void>}
 * @throws Any error raised while loading entries or writing an aggregate.
 */
export async function updateMetricsAggregates(env) {
    const metricsRoot = getMetricsRoot(env);
    const metricsPath = getMetricsPath(env);
    const entries = await loadMetricsEntries(metricsPath);
    if (entries.length === 0) {
        return;
    }
    const metricsDir = join(metricsRoot, 'metrics');
    await mkdir(metricsDir, { recursive: true });
    // writeMttrDelta reads baseline.json, so the baseline must be on disk before it starts.
    // Running both in the same Promise.all let the reader see a missing or partial file.
    await ensureBaseline(metricsDir, entries[0]);
    await Promise.all([
        writePostRollout(metricsDir, entries),
        writeCompleteness(metricsDir, entries),
        writeMttrDelta(env, entries),
        writeTfgrpoEpochAggregates(metricsDir, entries),
        writeLearningState(env, entries)
    ]);
}
|
|
176
|
+
/**
 * Write `baseline.json` from the given entry unless the file already exists.
 *
 * The file is created with an exclusive-create flag, so the existence check
 * and the write are a single operation. An existing baseline is never
 * overwritten.
 *
 * @param {string} dir Directory that holds the aggregate files (must exist).
 * @param {object} entry Metrics entry used as the baseline; its `run_id`, `recorded_at`,
 *   `status` and `duration_seconds` are copied.
 * @returns {Promise<void>}
 * @throws Any write error other than EEXIST.
 */
async function ensureBaseline(dir, entry) {
    const baselinePath = join(dir, 'baseline.json');
    const baseline = {
        run_id: entry.run_id,
        recorded_at: entry.recorded_at,
        status: entry.status,
        duration_seconds: entry.duration_seconds,
        completion_rate: entry.status === 'succeeded' ? 1 : 0
    };
    try {
        // 'wx' fails with EEXIST instead of overwriting, which closes the gap between
        // a separate existence check and the write.
        await writeFile(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`, { encoding: 'utf8', flag: 'wx' });
    }
    catch (error) {
        if (error.code !== 'EEXIST') {
            throw error;
        }
    }
}
|
|
196
|
+
/**
 * Write `post-rollout.json`: run totals and the share of runs that succeeded.
 *
 * @param {string} dir Directory that receives the file.
 * @param {Array<{ status?: string }>} entries All metrics entries.
 * @returns {Promise<void>}
 */
async function writePostRollout(dir, entries) {
    let succeeded = 0;
    for (const entry of entries) {
        if (entry.status === 'succeeded') {
            succeeded += 1;
        }
    }
    const total = entries.length;
    const rate = total > 0 ? succeeded / total : 0;
    const report = {
        total_runs: total,
        succeeded_runs: succeeded,
        completion_rate: rate,
        // The threshold is met when at least 95% of runs succeeded.
        meets_threshold: rate >= 0.95,
        updated_at: new Date().toISOString()
    };
    const serialized = `${JSON.stringify(report, null, 2)}\n`;
    await writeFile(join(dir, 'post-rollout.json'), serialized, 'utf8');
}
|
|
209
|
+
/**
 * Write `completeness.json`: how often the required fields are missing across entries.
 *
 * A field counts as missing when `instance_stats` / `privacy_events` is not a
 * non-empty array, or when `control_plane_status` is falsy or `'unknown'`.
 * Nothing is written when there is nothing to check.
 *
 * @param {string} dir Directory that receives the file.
 * @param {object[]} entries All metrics entries.
 * @returns {Promise<void>}
 */
async function writeCompleteness(dir, entries) {
    const totalChecks = REQUIRED_COMPLETENESS_FIELDS.length * entries.length;
    if (totalChecks === 0) {
        return;
    }
    const isEmptyList = (value) => !Array.isArray(value) || value.length === 0;
    const missingCounts = Object.fromEntries(REQUIRED_COMPLETENESS_FIELDS.map((field) => [field, 0]));
    for (const entry of entries) {
        if (isEmptyList(entry.instance_stats)) {
            missingCounts.instance_stats += 1;
        }
        if (isEmptyList(entry.privacy_events)) {
            missingCounts.privacy_events += 1;
        }
        const controlPlaneStatus = entry.control_plane_status;
        if (!controlPlaneStatus || controlPlaneStatus === 'unknown') {
            missingCounts.control_plane_status += 1;
        }
    }
    let totalMissing = 0;
    for (const count of Object.values(missingCounts)) {
        totalMissing += count;
    }
    const missingRatio = totalMissing / totalChecks;
    const report = {
        checked_fields: REQUIRED_COMPLETENESS_FIELDS,
        missing_counts: missingCounts,
        missing_field_ratio: missingRatio,
        // The threshold is met while fewer than 5% of the checks found a missing field.
        meets_threshold: missingRatio < 0.05,
        updated_at: new Date().toISOString()
    };
    await writeFile(join(dir, 'completeness.json'), `${JSON.stringify(report, null, 2)}\n`, 'utf8');
}
|
|
237
|
+
/**
 * Write `mttr-delta.json`: the mean run duration compared with the baseline duration.
 *
 * Only finite numeric `duration_seconds` values are averaged; nothing is
 * written when there are none. The baseline is taken from
 * `<runsRoot>/<taskId>/metrics/baseline.json`, and the report is written to
 * `<outRoot>/<taskId>/metrics/mttr-delta.json`.
 *
 * @param {{ runsRoot: string, outRoot: string, taskId: string }} env
 * @param {Array<{ duration_seconds?: number | null }>} entries All metrics entries.
 * @returns {Promise<void>}
 */
async function writeMttrDelta(env, entries) {
    const durations = [];
    for (const entry of entries) {
        const seconds = entry.duration_seconds;
        if (typeof seconds === 'number' && Number.isFinite(seconds)) {
            durations.push(seconds);
        }
    }
    if (durations.length === 0) {
        return;
    }
    const currentMttr = average(durations);
    const baselinePath = join(env.runsRoot, env.taskId, 'metrics', 'baseline.json');
    let baselineMttr = currentMttr;
    try {
        const { duration_seconds: recorded } = JSON.parse(await readFile(baselinePath, 'utf8'));
        if (typeof recorded === 'number') {
            baselineMttr = recorded;
        }
    }
    catch {
        // Best effort: if the baseline cannot be read or parsed, the current value stands in for it.
    }
    const report = {
        baseline_mttr_seconds: baselineMttr,
        current_mttr_seconds: currentMttr,
        // Stored as a fraction of the baseline (0.8 means 80% shorter), despite the key name.
        reduction_percent: baselineMttr > 0 ? (baselineMttr - currentMttr) / baselineMttr : 0,
        // Met when the current mean is at most 60% of the baseline; a non-positive baseline always passes.
        meets_threshold: baselineMttr <= 0 || currentMttr <= baselineMttr * 0.6,
        updated_at: new Date().toISOString()
    };
    const outDir = join(env.outRoot, env.taskId, 'metrics');
    await mkdir(outDir, { recursive: true });
    await writeFile(join(outDir, 'mttr-delta.json'), `${JSON.stringify(report, null, 2)}\n`, 'utf8');
}
|
|
270
|
+
/**
 * Write `per-epoch.json`: one summary per `tfgrpo_epoch`, ordered by ascending epoch.
 *
 * Entries whose `tfgrpo_epoch` is not a number are ignored; nothing is
 * written when no entry carries an epoch.
 *
 * @param {string} dir Directory that receives the file.
 * @param {object[]} entries All metrics entries.
 * @returns {Promise<void>}
 */
async function writeTfgrpoEpochAggregates(dir, entries) {
    const entriesByEpoch = new Map();
    for (const entry of entries) {
        const epoch = entry.tfgrpo_epoch;
        if (typeof epoch !== 'number') {
            continue;
        }
        if (!entriesByEpoch.has(epoch)) {
            entriesByEpoch.set(epoch, []);
        }
        entriesByEpoch.get(epoch).push(entry);
    }
    if (entriesByEpoch.size === 0) {
        return;
    }
    const orderedEpochs = [...entriesByEpoch.keys()].sort((left, right) => left - right);
    const report = {
        epochs: orderedEpochs.map((epoch) => summarizeEpoch(epoch, entriesByEpoch.get(epoch))),
        updated_at: new Date().toISOString()
    };
    await writeFile(join(dir, 'per-epoch.json'), `${JSON.stringify(report, null, 2)}\n`, 'utf8');
}
|
|
292
|
+
/**
 * Write `<outRoot>/<taskId>/state.json`: totals of the `learning_*` fields across entries.
 *
 * @param {{ outRoot: string, taskId: string }} env
 * @param {object[]} entries All metrics entries.
 * @returns {Promise<void>}
 */
async function writeLearningState(env, entries) {
    // Sum a numeric field over all entries, treating null/undefined as 0.
    const sumOf = (field) => entries.reduce((sum, entry) => sum + (entry[field] ?? 0), 0);
    // Count the entries whose field is strictly equal to the expected value.
    const countWhere = (field, expected) => entries.filter((entry) => entry[field] === expected).length;
    const validation = {
        passed: countWhere('learning_validation_status', 'validated'),
        failed: countWhere('learning_validation_status', 'snapshot_failed'),
        stalled: countWhere('learning_validation_status', 'stalled_snapshot'),
        manual: countWhere('learning_validation_status', 'needs_manual_scenario')
    };
    const latencies = entries
        .map((entry) => entry.learning_review_latency_ms)
        .filter((value) => typeof value === 'number' && Number.isFinite(value));
    let averageLatencyMs = null;
    if (latencies.length > 0) {
        averageLatencyMs = latencies.reduce((sum, value) => sum + value, 0) / latencies.length;
    }
    const report = {
        updated_at: new Date().toISOString(),
        safety: {
            validation,
            reviewer: {
                rejections: sumOf('learning_review_rejections'),
                average_latency_ms: averageLatencyMs
            },
            regression_detection: { detected: sumOf('learning_regressions_detected') },
            pattern_hygiene: {
                promoted: sumOf('learning_pattern_promoted'),
                deprecated: sumOf('learning_pattern_deprecated')
            }
        },
        throughput: { candidates: sumOf('learning_throughput_candidates') },
        alerts: {
            total: sumOf('learning_alerts'),
            // These two are counted from learning_snapshot_status, not from the validation status.
            snapshot_failed: countWhere('learning_snapshot_status', 'snapshot_failed'),
            stalled_snapshot: countWhere('learning_snapshot_status', 'stalled_snapshot'),
            needs_manual_scenario: validation.manual
        }
    };
    const outDir = join(env.outRoot, env.taskId);
    await mkdir(outDir, { recursive: true });
    await writeFile(join(outDir, 'state.json'), `${JSON.stringify(report, null, 2)}\n`, 'utf8');
}
|
|
334
|
+
/**
 * Build the summary record for one epoch.
 *
 * Tool calls, tokens, cost and latency are summed over the entries (missing
 * values count as 0); the group size is averaged over the entries that carry
 * a numeric `tfgrpo_group_size`, or null when none do.
 *
 * @param {number} epoch Epoch number the entries belong to.
 * @param {object[]} entries Metrics entries recorded for that epoch.
 * @returns {{ epoch: number, runs: number, tool_calls: number, token_total: number,
 *   cost_usd: number, latency_ms: number, group_size_avg: number | null, tools: object[] }}
 */
function summarizeEpoch(epoch, entries) {
    let toolCalls = 0;
    let tokenTotal = 0;
    let rawCostUsd = 0;
    let latencyMs = 0;
    const groupSizes = [];
    for (const entry of entries) {
        toolCalls += entry.tool_calls ?? 0;
        tokenTotal += entry.token_total ?? 0;
        rawCostUsd += entry.cost_usd ?? 0;
        latencyMs += entry.latency_ms ?? 0;
        if (typeof entry.tfgrpo_group_size === 'number') {
            groupSizes.push(entry.tfgrpo_group_size);
        }
    }
    let groupSizeAvg = null;
    if (groupSizes.length > 0) {
        groupSizeAvg = groupSizes.reduce((sum, size) => sum + size, 0) / groupSizes.length;
    }
    return {
        epoch,
        runs: entries.length,
        tool_calls: toolCalls,
        token_total: tokenTotal,
        cost_usd: roundCurrency(rawCostUsd),
        latency_ms: latencyMs,
        group_size_avg: groupSizeAvg,
        tools: aggregateToolStats(entries)
    };
}
|
|
355
|
+
/**
 * Combine the per-tool statistics of several entries into one record per tool.
 *
 * Records appear in the order each tool name is first seen. A stat only
 * counts when its `tool` is a non-empty string; non-numeric `tokens`,
 * `cost_usd` and `latency_ms` values count as 0. Entries whose `tool_stats`
 * is not an array, and stats that are null/undefined, are skipped so that
 * one malformed entry cannot abort the aggregation.
 *
 * @param {Array<{ tool_stats?: object[] }>} entries Metrics entries to combine.
 * @returns {Array<{ tool: string, runs: number, tokens: number, cost_usd: number, latency_ms: number }>}
 */
function aggregateToolStats(entries) {
    const numberOrZero = (value) => (typeof value === 'number' ? value : 0);
    const totalsByTool = new Map();
    for (const entry of entries) {
        const toolStats = Array.isArray(entry.tool_stats) ? entry.tool_stats : [];
        // Named toolStat rather than stat, which would shadow the fs `stat` import of this module.
        for (const toolStat of toolStats) {
            const tool = toolStat?.tool;
            if (typeof tool !== 'string' || !tool) {
                continue;
            }
            const totals = totalsByTool.get(tool) ?? { runs: 0, tokens: 0, costUsd: 0, latencyMs: 0 };
            totals.runs += 1;
            totals.tokens += numberOrZero(toolStat.tokens);
            totals.costUsd += numberOrZero(toolStat.cost_usd);
            totals.latencyMs += numberOrZero(toolStat.latency_ms);
            totalsByTool.set(tool, totals);
        }
    }
    return Array.from(totalsByTool, ([tool, totals]) => ({
        tool,
        runs: totals.runs,
        tokens: totals.tokens,
        cost_usd: roundCurrency(totals.costUsd),
        latency_ms: totals.latencyMs
    }));
}
|
|
379
|
+
/**
 * Read a JSON-lines metrics file into an array of entries.
 *
 * Blank and whitespace-only lines are skipped. A line that is not valid JSON
 * is reported with the file path and 1-based line number so the damaged
 * record can be located.
 *
 * @param {string} path Path of the JSONL file.
 * @returns {Promise<object[]>} Parsed entries in file order; an empty array if the file does not exist.
 * @throws {SyntaxError} If a line cannot be parsed; the parser's error is attached as `cause`.
 * @throws Any read error other than ENOENT.
 */
async function loadMetricsEntries(path) {
    let raw;
    try {
        raw = await readFile(path, 'utf8');
    }
    catch (error) {
        if (error.code === 'ENOENT') {
            return [];
        }
        throw error;
    }
    const entries = [];
    const lines = raw.split('\n');
    for (let index = 0; index < lines.length; index += 1) {
        const line = lines[index].trim();
        if (!line) {
            continue;
        }
        try {
            entries.push(JSON.parse(line));
        }
        catch (error) {
            // Still a SyntaxError, as a bare JSON.parse would raise, but with enough context to find the line.
            throw new SyntaxError(`Invalid metrics entry at ${path}:${index + 1}: ${error.message}`, { cause: error });
        }
    }
    return entries;
}
|
|
395
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function average(values) {
    const count = values.length;
    return count === 0
        ? 0
        : values.reduce((runningTotal, value) => runningTotal + value, 0) / count;
}
|
|
402
|
+
/**
 * Rounds a value to six decimal places.
 *
 * @param {number} value - Amount to round.
 * @returns {number} The amount rounded to the nearest millionth.
 */
function roundCurrency(value) {
    const scale = 1_000_000;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import { appendFile, mkdir, rename, writeFile } from 'node:fs/promises';
|
|
3
|
+
import { join } from 'node:path';
|
|
4
|
+
import { ensureGuardrailStatus, appendSummary, upsertGuardrailSummary } from '../run/manifest.js';
|
|
5
|
+
import { isoTimestamp } from '../utils/time.js';
|
|
6
|
+
import { persistManifest } from '../run/manifestPersister.js';
|
|
7
|
+
import { logger } from '../../logger.js';
|
|
8
|
+
import { mergePendingMetricsEntries, updateMetricsAggregates, withMetricsLock } from './metricsAggregator.js';
|
|
9
|
+
import { EnvUtils } from '../../../../packages/shared/config/index.js';
|
|
10
|
+
// Manifest statuses for which appendMetricsEntry records a metrics entry; any other status makes it return early.
const TERMINAL_STATES = new Set(['succeeded', 'failed', 'cancelled']);
|
|
11
|
+
// Directory name, joined under the task's metrics root, where entries are queued when the metrics lock is not acquired.
const METRICS_PENDING_DIRNAME = 'metrics.pending';
|
|
12
|
+
/**
 * Records one metrics entry for a finished run and updates the manifest.
 *
 * Nothing happens when `manifest.metrics_recorded` is already truthy or when
 * `manifest.status` is not one of `TERMINAL_STATES`.
 *
 * Otherwise an entry is built from the manifest and, inside
 * `withMetricsLock`, the following happens in order: pending entries are
 * merged, the entry is appended as one JSON line to
 * `<env.runsRoot>/<env.taskId>/metrics.json`, the manifest is persisted with
 * `metrics_recorded = true`, aggregates are refreshed, and pending entries
 * are merged once more (refreshing aggregates again if any were merged).
 *
 * When `withMetricsLock` reports `acquired` as falsy, the entry is instead
 * written to a uniquely named file under the `metrics.pending` directory,
 * the manifest is persisted with `metrics_recorded = false`, and a warning
 * is logged.
 *
 * @param {object} env - Must provide `runsRoot` and `taskId`; also forwarded
 *   to the metrics aggregator helpers.
 * @param {object} paths - Forwarded unchanged to `persistManifest`.
 * @param {object} manifest - Run manifest; read for the entry and mutated
 *   (`metrics_recorded`, plus whatever the guardrail/summary helpers change).
 * @param {object} persister - Forwarded unchanged to `persistManifest`.
 * @returns {Promise<void>}
 * @throws Errors from the filesystem calls or the imported helpers, and an
 *   `Error` when no pending file could be created after five attempts.
 */
export async function appendMetricsEntry(env, paths, manifest, persister) {
    if (manifest.metrics_recorded) {
        return;
    }
    if (!TERMINAL_STATES.has(manifest.status)) {
        return;
    }
    // Missing or unparseable timestamps become NaN, which yields a null duration.
    const startedAt = manifest.started_at ? Date.parse(manifest.started_at) : NaN;
    const completedAt = manifest.completed_at ? Date.parse(manifest.completed_at) : NaN;
    const durationSeconds = Number.isNaN(startedAt) || Number.isNaN(completedAt)
        ? null
        : Math.max(0, (completedAt - startedAt) / 1000);
    const commandsPassed = manifest.commands.filter((cmd) => cmd.status === 'succeeded').length;
    const commandsFailed = manifest.commands.filter((cmd) => cmd.status === 'failed').length;
    const guardrailStatus = ensureGuardrailStatus(manifest);
    const guardrailsPresent = guardrailStatus.present;
    const learning = manifest.learning ?? null;
    const privacyDecisions = manifest.privacy?.decisions ?? [];
    const privacyEventCount = privacyDecisions.length;
    // A negative limit (the fallback passed here is -1) disables truncation.
    // NOTE(review): presumably getInt returns the fallback when the variable is unset - confirm.
    const maxPrivacyEvents = EnvUtils.getInt('CODEX_METRICS_PRIVACY_EVENTS_MAX', -1);
    const shouldTruncatePrivacy = maxPrivacyEvents >= 0 && privacyEventCount > maxPrivacyEvents;
    const privacyEvents = shouldTruncatePrivacy
        ? privacyDecisions.slice(0, maxPrivacyEvents)
        : privacyDecisions;
    const metricsRoot = join(env.runsRoot, env.taskId);
    const metricsPath = join(metricsRoot, 'metrics.json');
    const pendingDir = join(metricsRoot, METRICS_PENDING_DIRNAME);
    const entry = {
        run_id: manifest.run_id,
        task_id: manifest.task_id,
        pipeline_id: manifest.pipeline_id,
        status: manifest.status,
        started_at: manifest.started_at,
        completed_at: manifest.completed_at,
        duration_seconds: durationSeconds,
        commands_passed: commandsPassed,
        commands_failed: commandsFailed,
        guardrails_present: guardrailsPresent,
        recorded_at: isoTimestamp(),
        artifact_path: manifest.artifact_root,
        child_runs: manifest.child_runs.length,
        // Fully optional chain: a control_plane object without `validation`
        // reports 'unknown' instead of throwing.
        control_plane_status: manifest.control_plane?.validation?.status ?? 'unknown',
        scheduler_mode: manifest.scheduler?.mode ?? null,
        instance_stats: (manifest.scheduler?.assignments ?? []).map((assignment) => ({
            instance_id: assignment.instance_id,
            capability: assignment.capability,
            status: assignment.status,
            attempts: assignment.attempts.length,
            recovery_events: assignment.attempts.reduce((sum, attempt) => sum + attempt.recovery_checkpoints.length, 0)
        })),
        privacy_mode: manifest.privacy?.mode ?? null,
        privacy_log_path: manifest.privacy?.log_path ?? null,
        privacy_event_count: privacyEventCount,
        // `undefined` is dropped by JSON.stringify, so the flag only appears when truncation happened.
        privacy_events_truncated: shouldTruncatePrivacy ? true : undefined,
        privacy_events: privacyEvents,
        handle_count: manifest.handles?.length ?? 0,
        tfgrpo_epoch: manifest.tfgrpo?.epoch ?? null,
        tfgrpo_group_id: manifest.tfgrpo?.group_id ?? null,
        tfgrpo_group_size: manifest.tfgrpo?.group_size ?? null,
        tool_calls: manifest.tfgrpo?.tool_metrics?.tool_calls ?? 0,
        token_total: manifest.tfgrpo?.tool_metrics?.token_total ?? 0,
        cost_usd: manifest.tfgrpo?.tool_metrics?.cost_usd ?? 0,
        latency_ms: manifest.tfgrpo?.tool_metrics?.latency_ms ?? 0,
        tool_stats: manifest.tfgrpo?.tool_metrics?.per_tool ?? [],
        learning_validation_status: learning?.validation?.status ?? null,
        learning_snapshot_status: learning?.snapshot?.status ?? null,
        learning_scenario_status: learning?.scenario?.status ?? null,
        learning_crystalizer_status: learning?.crystalizer?.status ?? null,
        learning_alerts: learning?.alerts?.length ?? 0,
        learning_group_id: learning?.validation?.grouping?.id ?? null,
        learning_review_rejections: learning?.review?.rejections ?? 0,
        learning_review_latency_ms: learning?.review?.latency_ms ?? null,
        learning_regressions_detected: learning?.regressions?.detected ?? 0,
        learning_pattern_promoted: learning?.pattern_hygiene?.promoted ?? 0,
        learning_pattern_deprecated: learning?.pattern_hygiene?.deprecated ?? 0,
        learning_throughput_candidates: learning?.throughput?.candidates ??
            (learning?.crystalizer?.candidate_path ? 1 : 0)
    };
    await mkdir(metricsRoot, { recursive: true });
    // Appends the entry as a single JSON line to the task's metrics file.
    const appendEntry = async () => {
        await appendFile(metricsPath, `${JSON.stringify(entry)}\n`, 'utf8');
    };
    // Writes the entry to its own file in the pending directory and returns that path.
    const appendPendingEntry = async () => {
        const safeRunId = entry.run_id.replace(/[^a-zA-Z0-9._-]+/g, '_');
        await mkdir(pendingDir, { recursive: true });
        for (let attempt = 0; attempt < 5; attempt += 1) {
            const pendingName = `${safeRunId}-${Date.now()}-${randomUUID()}.jsonl`;
            const pendingPath = join(pendingDir, pendingName);
            const tmpPath = `${pendingPath}.tmp`;
            try {
                // 'wx' fails with EEXIST instead of overwriting; the rename exposes
                // the file under its final name only after it is fully written.
                await writeFile(tmpPath, `${JSON.stringify(entry)}\n`, { encoding: 'utf8', flag: 'wx' });
                await rename(tmpPath, pendingPath);
                return pendingPath;
            }
            catch (error) {
                // Only a name collision is retried with a fresh name.
                if (error.code !== 'EEXIST') {
                    throw error;
                }
            }
        }
        throw new Error(`Failed to create pending metrics entry for ${entry.run_id}`);
    };
    // Surfaces the guardrail recommendation (if any), then persists the manifest
    // with the given metrics_recorded flag.
    const finalizeManifest = async (metricsRecorded) => {
        if (!guardrailsPresent && guardrailStatus.recommendation) {
            logger.warn(guardrailStatus.recommendation);
            appendSummary(manifest, guardrailStatus.recommendation);
        }
        upsertGuardrailSummary(manifest);
        manifest.metrics_recorded = metricsRecorded;
        await persistManifest(paths, manifest, persister, { force: true });
    };
    const { acquired } = await withMetricsLock(env, async () => {
        await mergePendingMetricsEntries(env);
        await appendEntry();
        await finalizeManifest(true);
        await updateMetricsAggregates(env);
        // Pick up entries queued while this callback was running.
        const mergedAfter = await mergePendingMetricsEntries(env);
        if (mergedAfter > 0) {
            await updateMetricsAggregates(env);
        }
    });
    if (!acquired) {
        const pendingPath = await appendPendingEntry();
        await finalizeManifest(false);
        logger.warn(`Metrics aggregation skipped for ${env.taskId}: queued metrics entry in ${pendingPath}.`);
    }
}
|