@kbediako/codex-orchestrator 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/bin/codex-orchestrator.js +38 -0
- package/dist/orchestrator/src/cli/config/delegationConfig.js +485 -0
- package/dist/orchestrator/src/cli/control/confirmations.js +262 -0
- package/dist/orchestrator/src/cli/control/controlServer.js +1476 -0
- package/dist/orchestrator/src/cli/control/controlState.js +46 -0
- package/dist/orchestrator/src/cli/control/controlWatcher.js +222 -0
- package/dist/orchestrator/src/cli/control/delegationTokens.js +62 -0
- package/dist/orchestrator/src/cli/control/questions.js +106 -0
- package/dist/orchestrator/src/cli/delegationServer.js +1368 -0
- package/dist/orchestrator/src/cli/events/runEventStream.js +246 -0
- package/dist/orchestrator/src/cli/exec/context.js +4 -1
- package/dist/orchestrator/src/cli/exec/stageRunner.js +30 -5
- package/dist/orchestrator/src/cli/metrics/metricsAggregator.js +377 -147
- package/dist/orchestrator/src/cli/metrics/metricsRecorder.js +3 -5
- package/dist/orchestrator/src/cli/orchestrator.js +217 -40
- package/dist/orchestrator/src/cli/rlmRunner.js +26 -3
- package/dist/orchestrator/src/cli/run/manifestPersister.js +33 -3
- package/dist/orchestrator/src/cli/run/runPaths.js +14 -0
- package/dist/orchestrator/src/cli/services/commandRunner.js +1 -1
- package/dist/orchestrator/src/cli/utils/devtools.js +33 -2
- package/dist/orchestrator/src/persistence/ExperienceStore.js +113 -46
- package/dist/orchestrator/src/persistence/PersistenceCoordinator.js +8 -8
- package/dist/orchestrator/src/persistence/TaskStateStore.js +2 -1
- package/dist/orchestrator/src/persistence/lockFile.js +26 -1
- package/dist/orchestrator/src/sync/CloudSyncWorker.js +17 -4
- package/dist/packages/orchestrator/src/telemetry/otel-exporter.js +21 -0
- package/package.json +3 -1
|
@@ -1,6 +1,10 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createReadStream } from 'node:fs';
|
|
2
|
+
import { appendFile, mkdir, open, readFile, readdir, rename, rm, stat } from 'node:fs/promises';
|
|
2
3
|
import { dirname, join } from 'node:path';
|
|
4
|
+
import { createInterface } from 'node:readline';
|
|
3
5
|
import { acquireLockWithRetry } from '../../persistence/lockFile.js';
|
|
6
|
+
import { EnvUtils } from '../../../../packages/shared/config/index.js';
|
|
7
|
+
import { writeJsonAtomic } from '../utils/fs.js';
|
|
4
8
|
const REQUIRED_COMPLETENESS_FIELDS = [
|
|
5
9
|
'instance_stats',
|
|
6
10
|
'privacy_events',
|
|
@@ -9,6 +13,8 @@ const REQUIRED_COMPLETENESS_FIELDS = [
|
|
|
9
13
|
const METRICS_LOCK_FILENAME = 'metrics.lock';
|
|
10
14
|
const METRICS_PENDING_DIRNAME = 'metrics.pending';
|
|
11
15
|
const DEFAULT_LOCK_STALE_MS = 5 * 60 * 1000;
|
|
16
|
+
const DEFAULT_PENDING_BATCH_MAX_LINES = 500;
|
|
17
|
+
const DEFAULT_PENDING_BATCH_MAX_BYTES = 1024 * 1024;
|
|
12
18
|
const DEFAULT_LOCK_RETRY = {
|
|
13
19
|
maxAttempts: 4,
|
|
14
20
|
initialDelayMs: 50,
|
|
@@ -55,10 +61,20 @@ class MetricsLockError extends Error {
|
|
|
55
61
|
this.name = 'MetricsLockError';
|
|
56
62
|
}
|
|
57
63
|
}
|
|
58
|
-
async function
|
|
59
|
-
let
|
|
64
|
+
async function streamMetricsEntryLines(path, onLine) {
|
|
65
|
+
let count = 0;
|
|
66
|
+
let reader;
|
|
67
|
+
let stream;
|
|
60
68
|
try {
|
|
61
|
-
|
|
69
|
+
stream = createReadStream(path, { encoding: 'utf8' });
|
|
70
|
+
reader = createInterface({ input: stream, crlfDelay: Infinity });
|
|
71
|
+
for await (const line of reader) {
|
|
72
|
+
if (line.trim().length === 0) {
|
|
73
|
+
continue;
|
|
74
|
+
}
|
|
75
|
+
count += 1;
|
|
76
|
+
await onLine(line);
|
|
77
|
+
}
|
|
62
78
|
}
|
|
63
79
|
catch (error) {
|
|
64
80
|
if (error.code === 'ENOENT') {
|
|
@@ -66,21 +82,104 @@ async function drainMetricsEntryFile(env, path) {
|
|
|
66
82
|
}
|
|
67
83
|
throw error;
|
|
68
84
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
85
|
+
finally {
|
|
86
|
+
reader?.close();
|
|
87
|
+
stream?.destroy();
|
|
88
|
+
}
|
|
89
|
+
return count;
|
|
90
|
+
}
|
|
91
|
+
/**
 * Checks whether a value has the minimum shape required of a metrics entry.
 *
 * A candidate is a non-null, non-array object whose `run_id`, `status` and
 * `recorded_at` properties are all strings containing at least one
 * non-whitespace character. No other properties are inspected.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` when all three required string fields are present.
 */
function isMetricsEntryCandidate(value) {
    const isRecord = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
    if (!isRecord) {
        return false;
    }
    const requiredTextFields = ['run_id', 'status', 'recorded_at'];
    return requiredTextFields.every((field) => {
        const text = value[field];
        return typeof text === 'string' && text.trim().length > 0;
    });
}
|
|
107
|
+
/**
 * Parses one line of the metrics file into an entry object.
 *
 * The line is trimmed and decoded as JSON; the decoded value is returned only
 * if `isMetricsEntryCandidate` accepts it.
 *
 * @param {string} line - Raw line text.
 * @returns {object|null} The decoded entry, or `null` when the line is blank,
 *   is not valid JSON, or does not pass the candidate check.
 */
function parseMetricsEntry(line) {
    const text = line.trim();
    if (text.length === 0) {
        return null;
    }
    try {
        const decoded = JSON.parse(text);
        if (isMetricsEntryCandidate(decoded)) {
            return decoded;
        }
    }
    catch {
        // Undecodable lines are treated the same as rejected ones.
    }
    return null;
}
|
|
120
|
+
/**
 * Tries to turn a leftover `*.tmp` pending file into a regular pending file.
 *
 * Every non-blank line of the file must parse as a metrics entry. When that
 * holds, the file is renamed to the same path without the trailing `.tmp`.
 * If that rename fails with `EEXIST`, the file is instead renamed to a
 * uniquely named `recovered-*.jsonl` inside `pendingDir`.
 *
 * @param {string} tmpPath - Path of the `.tmp` file to promote.
 * @param {string} pendingDir - Directory that receives a `recovered-*` file on a name clash.
 * @returns {Promise<boolean>} `true` when the file was renamed; `false` when it
 *   held an invalid line, held no entries, or no longer exists.
 * @throws Any other error raised while reading or renaming.
 */
async function promotePendingTmpFile(tmpPath, pendingDir) {
    let seenLines = 0;
    let sawInvalidLine = false;
    const checkLine = async (line) => {
        seenLines += 1;
        if (parseMetricsEntry(line)) {
            return;
        }
        // Abort the scan on the first bad line; the flag tells the catch below why.
        sawInvalidLine = true;
        throw new Error('invalid metrics entry');
    };
    try {
        await streamMetricsEntryLines(tmpPath, checkLine);
    }
    catch (error) {
        if (sawInvalidLine || error.code === 'ENOENT') {
            return false;
        }
        throw error;
    }
    if (seenLines === 0) {
        return false;
    }
    const promotedPath = tmpPath.replace(/\.tmp$/, '');
    try {
        await rename(tmpPath, promotedPath);
        return true;
    }
    catch (error) {
        switch (error.code) {
            case 'ENOENT':
                return false;
            case 'EEXIST': {
                const uniqueName = `recovered-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`;
                await rename(tmpPath, join(pendingDir, uniqueName));
                return true;
            }
            default:
                throw error;
        }
    }
}
|
|
162
|
+
/**
 * Normalises a batch limit so that unusable values mean "no limit".
 *
 * @param {unknown} value - Candidate limit.
 * @returns {number} `value` when it is a finite number greater than zero,
 *   otherwise `Number.POSITIVE_INFINITY`.
 */
function normalizeBatchLimit(value) {
    const isUsable = Number.isFinite(value) && value > 0;
    return isUsable ? value : Number.POSITIVE_INFINITY;
}
|
|
168
|
+
/**
 * Resolves the line and byte limits used when batching pending metrics.
 *
 * Each limit is requested from `EnvUtils.getInt` under its
 * `CODEX_METRICS_PENDING_BATCH_MAX_*` name with the module default as the
 * fallback, then passed through `normalizeBatchLimit`.
 * NOTE(review): presumably `EnvUtils.getInt` reads process environment
 * variables — confirm against the shared config package.
 *
 * @returns {{maxLines: number, maxBytes: number}} Normalised batch limits.
 */
function getPendingBatchLimits() {
    const readLimit = (settingName, fallback) => normalizeBatchLimit(EnvUtils.getInt(settingName, fallback));
    const maxLines = readLimit('CODEX_METRICS_PENDING_BATCH_MAX_LINES', DEFAULT_PENDING_BATCH_MAX_LINES);
    const maxBytes = readLimit('CODEX_METRICS_PENDING_BATCH_MAX_BYTES', DEFAULT_PENDING_BATCH_MAX_BYTES);
    return { maxLines, maxBytes };
}
|
|
80
176
|
export async function mergePendingMetricsEntries(env) {
|
|
81
177
|
const pendingDir = getMetricsPendingDir(env);
|
|
178
|
+
const metricsRoot = getMetricsRoot(env);
|
|
179
|
+
const metricsPath = getMetricsPath(env);
|
|
82
180
|
let merged = 0;
|
|
83
181
|
const staleTmpMs = DEFAULT_LOCK_STALE_MS;
|
|
182
|
+
const { maxLines: maxBatchLines, maxBytes: maxBatchBytes } = getPendingBatchLimits();
|
|
84
183
|
for (let pass = 0; pass < 2; pass += 1) {
|
|
85
184
|
let entries = [];
|
|
86
185
|
try {
|
|
@@ -93,6 +192,7 @@ export async function mergePendingMetricsEntries(env) {
|
|
|
93
192
|
throw error;
|
|
94
193
|
}
|
|
95
194
|
const now = Date.now();
|
|
195
|
+
let promotedTmp = false;
|
|
96
196
|
for (const entry of entries) {
|
|
97
197
|
if (!entry.isFile() || !entry.name.endsWith('.tmp')) {
|
|
98
198
|
continue;
|
|
@@ -101,7 +201,13 @@ export async function mergePendingMetricsEntries(env) {
|
|
|
101
201
|
try {
|
|
102
202
|
const stats = await stat(tmpPath);
|
|
103
203
|
if (now - stats.mtimeMs > staleTmpMs) {
|
|
104
|
-
await
|
|
204
|
+
const promoted = await promotePendingTmpFile(tmpPath, pendingDir);
|
|
205
|
+
if (promoted) {
|
|
206
|
+
promotedTmp = true;
|
|
207
|
+
}
|
|
208
|
+
if (!promoted) {
|
|
209
|
+
await rm(tmpPath, { force: true });
|
|
210
|
+
}
|
|
105
211
|
}
|
|
106
212
|
}
|
|
107
213
|
catch (error) {
|
|
@@ -115,14 +221,79 @@ export async function mergePendingMetricsEntries(env) {
|
|
|
115
221
|
.map((entry) => entry.name)
|
|
116
222
|
.sort();
|
|
117
223
|
if (files.length === 0) {
|
|
118
|
-
|
|
224
|
+
if (!promotedTmp) {
|
|
225
|
+
break;
|
|
226
|
+
}
|
|
227
|
+
continue;
|
|
119
228
|
}
|
|
229
|
+
await mkdir(metricsRoot, { recursive: true });
|
|
230
|
+
let payloadLines = [];
|
|
231
|
+
let filesToRemove = [];
|
|
232
|
+
let payloadLineCount = 0;
|
|
233
|
+
let payloadBytes = 0;
|
|
234
|
+
const flushBatch = async () => {
|
|
235
|
+
if (payloadLines.length > 0) {
|
|
236
|
+
const payload = `${payloadLines.join('\n')}\n`;
|
|
237
|
+
await ensureMetricsTrailingNewline(metricsPath);
|
|
238
|
+
await appendFile(metricsPath, payload, 'utf8');
|
|
239
|
+
}
|
|
240
|
+
if (filesToRemove.length > 0) {
|
|
241
|
+
await Promise.all(filesToRemove.map((filePath) => rm(filePath, { force: true })));
|
|
242
|
+
}
|
|
243
|
+
payloadLines = [];
|
|
244
|
+
filesToRemove = [];
|
|
245
|
+
payloadLineCount = 0;
|
|
246
|
+
payloadBytes = 0;
|
|
247
|
+
};
|
|
120
248
|
for (const file of files) {
|
|
121
|
-
|
|
249
|
+
const filePath = join(pendingDir, file);
|
|
250
|
+
const fileLineCount = await streamMetricsEntryLines(filePath, async (line) => {
|
|
251
|
+
const lineBytes = Buffer.byteLength(line, 'utf8') + 1;
|
|
252
|
+
const wouldExceedLines = payloadLineCount + 1 > maxBatchLines;
|
|
253
|
+
const wouldExceedBytes = payloadBytes + lineBytes > maxBatchBytes;
|
|
254
|
+
if (payloadLines.length > 0 && (wouldExceedLines || wouldExceedBytes)) {
|
|
255
|
+
await flushBatch();
|
|
256
|
+
}
|
|
257
|
+
payloadLines.push(line);
|
|
258
|
+
payloadLineCount += 1;
|
|
259
|
+
payloadBytes += lineBytes;
|
|
260
|
+
merged += 1;
|
|
261
|
+
});
|
|
262
|
+
if (fileLineCount === 0) {
|
|
263
|
+
await rm(filePath, { force: true });
|
|
264
|
+
continue;
|
|
265
|
+
}
|
|
266
|
+
filesToRemove.push(filePath);
|
|
122
267
|
}
|
|
268
|
+
await flushBatch();
|
|
123
269
|
}
|
|
124
270
|
return merged;
|
|
125
271
|
}
|
|
272
|
+
/**
 * Makes sure a non-empty file ends with a line feed so that text appended
 * afterwards starts on its own line.
 *
 * The last byte of the file is read; if it is not `\n`, a single `\n` is
 * appended. Empty files are left untouched, and an `ENOENT` error is ignored.
 *
 * @param {string} path - File to check.
 * @returns {Promise<void>}
 * @throws Any error other than `ENOENT` raised while opening, reading,
 *   appending to, or closing the file.
 */
export async function ensureMetricsTrailingNewline(path) {
    const LINE_FEED = 0x0a;
    try {
        const file = await open(path, 'r');
        try {
            const { size } = await file.stat();
            if (size > 0) {
                const lastByte = Buffer.alloc(1);
                await file.read(lastByte, 0, 1, size - 1);
                if (lastByte[0] !== LINE_FEED) {
                    await appendFile(path, '\n', 'utf8');
                }
            }
        }
        finally {
            await file.close();
        }
    }
    catch (error) {
        if (error.code !== 'ENOENT') {
            throw error;
        }
    }
}
|
|
126
297
|
export async function withMetricsLock(env, action, options = {}) {
|
|
127
298
|
const overrides = options.retry ?? {};
|
|
128
299
|
const sanitizedOverrides = Object.fromEntries(Object.entries(overrides).filter(([, value]) => value !== undefined));
|
|
@@ -158,19 +329,30 @@ export async function withMetricsLock(env, action, options = {}) {
|
|
|
158
329
|
/**
 * Rebuilds the aggregate metric files from the metrics entry file.
 *
 * The file at `getMetricsPath(env)` is streamed line by line. Lines that do
 * not parse as entries are skipped, and only the first entry seen for each
 * `run_id` is accumulated. If nothing was accumulated the function returns
 * without writing. Otherwise the `metrics` directory is created, the baseline
 * is ensured first, and the remaining writers run concurrently.
 *
 * @param {object} env - Environment descriptor passed through to the path
 *   helpers and writers.
 * @returns {Promise<void>}
 */
export async function updateMetricsAggregates(env) {
    const metricsRoot = getMetricsRoot(env);
    const metricsPath = getMetricsPath(env);
    const state = createAggregationState();
    const ingestLine = async (line) => {
        const entry = parseMetricsEntry(line);
        if (!entry || state.seenRunIds.has(entry.run_id)) {
            return;
        }
        state.seenRunIds.add(entry.run_id);
        accumulateMetricsEntry(state, entry);
    };
    await streamMetricsEntryLines(metricsPath, ingestLine);
    const hasRuns = state.totalRuns !== 0 && Boolean(state.baselineEntry);
    if (!hasRuns) {
        return;
    }
    const metricsDir = join(metricsRoot, 'metrics');
    await mkdir(metricsDir, { recursive: true });
    // The baseline is written before the other writers are started.
    await ensureBaseline(metricsDir, state.baselineEntry);
    const writers = [
        writePostRollout(metricsDir, state),
        writeCompleteness(metricsDir, state),
        writeMttrDelta(env, state),
        writeTfgrpoEpochAggregates(metricsDir, state),
        writeLearningState(env, state)
    ];
    await Promise.all(writers);
}
|
|
176
358
|
async function ensureBaseline(dir, entry) {
|
|
@@ -191,11 +373,11 @@ async function ensureBaseline(dir, entry) {
|
|
|
191
373
|
duration_seconds: entry.duration_seconds,
|
|
192
374
|
completion_rate: entry.status === 'succeeded' ? 1 : 0
|
|
193
375
|
};
|
|
194
|
-
await
|
|
376
|
+
await writeJsonAtomic(baselinePath, baseline);
|
|
195
377
|
}
|
|
196
|
-
async function writePostRollout(dir,
|
|
197
|
-
const totalRuns =
|
|
198
|
-
const succeededRuns =
|
|
378
|
+
async function writePostRollout(dir, state) {
|
|
379
|
+
const totalRuns = state.totalRuns;
|
|
380
|
+
const succeededRuns = state.succeededRuns;
|
|
199
381
|
const completionRate = totalRuns > 0 ? succeededRuns / totalRuns : 0;
|
|
200
382
|
const payload = {
|
|
201
383
|
total_runs: totalRuns,
|
|
@@ -204,44 +386,29 @@ async function writePostRollout(dir, entries) {
|
|
|
204
386
|
meets_threshold: completionRate >= 0.95,
|
|
205
387
|
updated_at: new Date().toISOString()
|
|
206
388
|
};
|
|
207
|
-
await
|
|
389
|
+
await writeJsonAtomic(join(dir, 'post-rollout.json'), payload);
|
|
208
390
|
}
|
|
209
|
-
async function writeCompleteness(dir,
|
|
210
|
-
const fieldChecks = REQUIRED_COMPLETENESS_FIELDS.length *
|
|
391
|
+
/**
 * Writes `completeness.json` into `dir`, reporting how often the required
 * completeness fields were missing across the accumulated runs.
 *
 * Nothing is written when there are no field checks to report (no required
 * fields or no runs).
 *
 * @param {string} dir - Directory that receives `completeness.json`.
 * @param {object} state - Aggregation state; reads `totalRuns` and `missingCounts`.
 * @returns {Promise<void>}
 */
async function writeCompleteness(dir, state) {
    const fieldChecks = REQUIRED_COMPLETENESS_FIELDS.length * state.totalRuns;
    if (fieldChecks === 0) {
        return;
    }
    let totalMissing = 0;
    for (const missing of Object.values(state.missingCounts)) {
        totalMissing += missing;
    }
    const missingRatio = totalMissing / fieldChecks;
    const report = {
        checked_fields: REQUIRED_COMPLETENESS_FIELDS,
        missing_counts: state.missingCounts,
        missing_field_ratio: missingRatio,
        meets_threshold: missingRatio < 0.05,
        updated_at: new Date().toISOString()
    };
    await writeJsonAtomic(join(dir, 'completeness.json'), report);
}
|
|
237
|
-
async function writeMttrDelta(env,
|
|
238
|
-
|
|
239
|
-
.map((entry) => entry.duration_seconds)
|
|
240
|
-
.filter((value) => typeof value === 'number' && Number.isFinite(value));
|
|
241
|
-
if (durations.length === 0) {
|
|
407
|
+
async function writeMttrDelta(env, state) {
|
|
408
|
+
if (state.durationCount === 0) {
|
|
242
409
|
return;
|
|
243
410
|
}
|
|
244
|
-
const currentMttr =
|
|
411
|
+
const currentMttr = state.durationSum / state.durationCount;
|
|
245
412
|
const metricsDir = join(env.runsRoot, env.taskId, 'metrics');
|
|
246
413
|
const baselinePath = join(metricsDir, 'baseline.json');
|
|
247
414
|
let baselineMttr = currentMttr;
|
|
@@ -265,139 +432,202 @@ async function writeMttrDelta(env, entries) {
|
|
|
265
432
|
};
|
|
266
433
|
const outDir = join(env.outRoot, env.taskId, 'metrics');
|
|
267
434
|
await mkdir(outDir, { recursive: true });
|
|
268
|
-
await
|
|
435
|
+
await writeJsonAtomic(join(outDir, 'mttr-delta.json'), payload);
|
|
269
436
|
}
|
|
270
|
-
async function writeTfgrpoEpochAggregates(dir,
|
|
271
|
-
|
|
272
|
-
for (const entry of entries) {
|
|
273
|
-
if (typeof entry.tfgrpo_epoch !== 'number') {
|
|
274
|
-
continue;
|
|
275
|
-
}
|
|
276
|
-
const bucket = grouped.get(entry.tfgrpo_epoch) ?? [];
|
|
277
|
-
bucket.push(entry);
|
|
278
|
-
grouped.set(entry.tfgrpo_epoch, bucket);
|
|
279
|
-
}
|
|
280
|
-
if (grouped.size === 0) {
|
|
437
|
+
/**
 * Writes `per-epoch.json` into `dir` with one summary row per epoch bucket in
 * `state.epochs`, ordered by ascending epoch number.
 *
 * Each row carries the bucket totals, the average group size (or `null` when
 * no group sizes were recorded) and a per-tool breakdown. Currency totals are
 * passed through `roundCurrency`. Nothing is written when there are no epochs.
 *
 * @param {string} dir - Directory that receives `per-epoch.json`.
 * @param {object} state - Aggregation state; reads the `epochs` map.
 * @returns {Promise<void>}
 */
async function writeTfgrpoEpochAggregates(dir, state) {
    if (state.epochs.size === 0) {
        return;
    }
    const describeTool = ([tool, totals]) => ({
        tool,
        runs: totals.runs,
        tokens: totals.tokens,
        cost_usd: roundCurrency(totals.costUsd),
        latency_ms: totals.latencyMs
    });
    const orderedBuckets = [...state.epochs].sort((left, right) => left[0] - right[0]);
    const epochs = [];
    for (const [epoch, bucket] of orderedBuckets) {
        const hasGroupSizes = bucket.group_size_count > 0;
        epochs.push({
            epoch,
            runs: bucket.runs,
            tool_calls: bucket.tool_calls,
            token_total: bucket.token_total,
            cost_usd: roundCurrency(bucket.cost_usd),
            latency_ms: bucket.latency_ms,
            group_size_avg: hasGroupSizes ? bucket.group_size_sum / bucket.group_size_count : null,
            tools: Array.from(bucket.tool_stats, describeTool)
        });
    }
    const report = {
        epochs,
        updated_at: new Date().toISOString()
    };
    await writeJsonAtomic(join(dir, 'per-epoch.json'), report);
}
|
|
292
|
-
async function writeLearningState(env,
|
|
293
|
-
const
|
|
294
|
-
.
|
|
295
|
-
.filter((value) => typeof value === 'string');
|
|
296
|
-
const validationSummary = {
|
|
297
|
-
passed: validationStatuses.filter((status) => status === 'validated').length,
|
|
298
|
-
failed: validationStatuses.filter((status) => status === 'snapshot_failed').length,
|
|
299
|
-
stalled: validationStatuses.filter((status) => status === 'stalled_snapshot').length,
|
|
300
|
-
manual: validationStatuses.filter((status) => status === 'needs_manual_scenario').length
|
|
301
|
-
};
|
|
302
|
-
const reviewerRejections = entries.reduce((sum, entry) => sum + (entry.learning_review_rejections ?? 0), 0);
|
|
303
|
-
const reviewerLatencies = entries
|
|
304
|
-
.map((entry) => entry.learning_review_latency_ms)
|
|
305
|
-
.filter((value) => typeof value === 'number' && Number.isFinite(value));
|
|
306
|
-
const reviewerLatencyMs = reviewerLatencies.length > 0
|
|
307
|
-
? reviewerLatencies.reduce((sum, value) => sum + value, 0) / reviewerLatencies.length
|
|
467
|
+
/**
 * Writes `state.json` under `<env.outRoot>/<env.taskId>` with the accumulated
 * learning counters: validation summary, reviewer statistics, regression and
 * pattern counts, throughput candidates and alert totals.
 *
 * The reviewer average latency is `null` when no latency samples were
 * accumulated. The output directory is created if needed.
 *
 * @param {object} env - Reads `outRoot` and `taskId`.
 * @param {object} state - Aggregation state holding the learning counters.
 * @returns {Promise<void>}
 */
async function writeLearningState(env, state) {
    let averageReviewLatencyMs = null;
    if (state.reviewerLatencyCount > 0) {
        averageReviewLatencyMs = state.reviewerLatencySum / state.reviewerLatencyCount;
    }
    const { validationSummary } = state;
    const report = {
        updated_at: new Date().toISOString(),
        safety: {
            validation: validationSummary,
            reviewer: { rejections: state.reviewerRejections, average_latency_ms: averageReviewLatencyMs },
            regression_detection: { detected: state.regressions },
            pattern_hygiene: { promoted: state.patternPromotions, deprecated: state.patternDeprecations }
        },
        throughput: { candidates: state.throughputCandidates },
        alerts: {
            total: state.alertsTotal,
            snapshot_failed: state.alertsSnapshotFailed,
            stalled_snapshot: state.alertsStalledSnapshot,
            needs_manual_scenario: validationSummary.manual
        }
    };
    const targetDir = join(env.outRoot, env.taskId);
    await mkdir(targetDir, { recursive: true });
    await writeJsonAtomic(join(targetDir, 'state.json'), report);
}
|
|
334
|
-
function
|
|
335
|
-
const
|
|
336
|
-
const toolCalls = entries.reduce((sum, entry) => sum + (entry.tool_calls ?? 0), 0);
|
|
337
|
-
const tokenTotal = entries.reduce((sum, entry) => sum + (entry.token_total ?? 0), 0);
|
|
338
|
-
const costUsd = roundCurrency(entries.reduce((sum, entry) => sum + (entry.cost_usd ?? 0), 0));
|
|
339
|
-
const latencyMs = entries.reduce((sum, entry) => sum + (entry.latency_ms ?? 0), 0);
|
|
340
|
-
const groupSizes = entries
|
|
341
|
-
.map((entry) => entry.tfgrpo_group_size)
|
|
342
|
-
.filter((value) => typeof value === 'number');
|
|
343
|
-
const groupSizeAvg = groupSizes.length > 0 ? groupSizes.reduce((sum, value) => sum + value, 0) / groupSizes.length : null;
|
|
492
|
+
/**
 * Creates an empty aggregation state for a single pass over the metrics file.
 *
 * All counters start at zero, `baselineEntry` starts as `null`, and
 * `missingCounts` holds one zeroed counter per name in
 * `REQUIRED_COMPLETENESS_FIELDS`. Every call returns fresh `Set`/`Map`
 * instances.
 *
 * @returns {object} A new, independent aggregation state.
 */
function createAggregationState() {
    const missingCounts = {};
    for (const field of REQUIRED_COMPLETENESS_FIELDS) {
        missingCounts[field] = 0;
    }
    const validationSummary = { passed: 0, failed: 0, stalled: 0, manual: 0 };
    return {
        totalRuns: 0,
        succeededRuns: 0,
        baselineEntry: null,
        seenRunIds: new Set(),
        missingCounts,
        durationSum: 0,
        durationCount: 0,
        epochs: new Map(),
        validationSummary,
        reviewerRejections: 0,
        reviewerLatencySum: 0,
        reviewerLatencyCount: 0,
        regressions: 0,
        patternPromotions: 0,
        patternDeprecations: 0,
        throughputCandidates: 0,
        alertsTotal: 0,
        alertsSnapshotFailed: 0,
        alertsStalledSnapshot: 0
    };
}
|
|
355
|
-
function
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
521
|
+
/**
 * Folds one metrics entry into the running aggregation state (mutates `state`).
 *
 * Updates, in order: run totals and the baseline entry (first entry seen),
 * completeness counters, duration totals, the per-epoch bucket (only when
 * `tfgrpo_epoch` is a number), and the learning/alert counters.
 *
 * @param {object} state - Aggregation state as produced by `createAggregationState`.
 * @param {object} entry - Parsed metrics entry.
 * @returns {void}
 */
function accumulateMetricsEntry(state, entry) {
    const numberOrZero = (value) => (typeof value === 'number' ? value : 0);
    const isFiniteNumber = (value) => typeof value === 'number' && Number.isFinite(value);

    // Run totals; the first accumulated entry becomes the baseline.
    state.totalRuns += 1;
    if (!state.baselineEntry) {
        state.baselineEntry = entry;
    }
    if (entry.status === 'succeeded') {
        state.succeededRuns += 1;
    }

    // Completeness: absent or empty arrays and an absent/'unknown' status count as missing.
    if (!Array.isArray(entry.instance_stats) || entry.instance_stats.length === 0) {
        state.missingCounts.instance_stats += 1;
    }
    if (!Array.isArray(entry.privacy_events) || entry.privacy_events.length === 0) {
        state.missingCounts.privacy_events += 1;
    }
    if (!entry.control_plane_status || entry.control_plane_status === 'unknown') {
        state.missingCounts.control_plane_status += 1;
    }

    // Duration totals (used to derive a mean elsewhere).
    if (isFiniteNumber(entry.duration_seconds)) {
        state.durationSum += entry.duration_seconds;
        state.durationCount += 1;
    }

    // Per-epoch bucket.
    if (typeof entry.tfgrpo_epoch === 'number') {
        const aggregate = getEpochAggregate(state.epochs, entry.tfgrpo_epoch);
        aggregate.runs += 1;
        aggregate.tool_calls += numberOrZero(entry.tool_calls);
        aggregate.token_total += numberOrZero(entry.token_total);
        aggregate.cost_usd += numberOrZero(entry.cost_usd);
        aggregate.latency_ms += numberOrZero(entry.latency_ms);
        if (isFiniteNumber(entry.tfgrpo_group_size)) {
            aggregate.group_size_sum += entry.tfgrpo_group_size;
            aggregate.group_size_count += 1;
        }
        const toolStats = Array.isArray(entry.tool_stats) ? entry.tool_stats : [];
        for (const toolStat of toolStats) {
            // Entries come from JSON on disk, so an element may be null; skip it
            // instead of throwing and aborting the whole aggregation pass.
            if (!toolStat || typeof toolStat.tool !== 'string' || !toolStat.tool) {
                continue;
            }
            const current = aggregate.tool_stats.get(toolStat.tool) ?? { runs: 0, tokens: 0, costUsd: 0, latencyMs: 0 };
            current.runs += 1;
            current.tokens += numberOrZero(toolStat.tokens);
            current.costUsd += numberOrZero(toolStat.cost_usd);
            current.latencyMs += numberOrZero(toolStat.latency_ms);
            aggregate.tool_stats.set(toolStat.tool, current);
        }
    }

    // Learning validation outcome; unrecognised statuses are ignored.
    if (typeof entry.learning_validation_status === 'string') {
        switch (entry.learning_validation_status) {
            case 'validated':
                state.validationSummary.passed += 1;
                break;
            case 'snapshot_failed':
                state.validationSummary.failed += 1;
                break;
            case 'stalled_snapshot':
                state.validationSummary.stalled += 1;
                break;
            case 'needs_manual_scenario':
                state.validationSummary.manual += 1;
                break;
            default:
                break;
        }
    }

    // Learning counters: only numeric values contribute.
    if (typeof entry.learning_review_rejections === 'number') {
        state.reviewerRejections += entry.learning_review_rejections;
    }
    if (isFiniteNumber(entry.learning_review_latency_ms)) {
        state.reviewerLatencySum += entry.learning_review_latency_ms;
        state.reviewerLatencyCount += 1;
    }
    if (typeof entry.learning_regressions_detected === 'number') {
        state.regressions += entry.learning_regressions_detected;
    }
    if (typeof entry.learning_pattern_promoted === 'number') {
        state.patternPromotions += entry.learning_pattern_promoted;
    }
    if (typeof entry.learning_pattern_deprecated === 'number') {
        state.patternDeprecations += entry.learning_pattern_deprecated;
    }
    if (typeof entry.learning_throughput_candidates === 'number') {
        state.throughputCandidates += entry.learning_throughput_candidates;
    }

    // Alert counters.
    if (typeof entry.learning_alerts === 'number') {
        state.alertsTotal += entry.learning_alerts;
    }
    if (entry.learning_snapshot_status === 'snapshot_failed') {
        state.alertsSnapshotFailed += 1;
    }
    if (entry.learning_snapshot_status === 'stalled_snapshot') {
        state.alertsStalledSnapshot += 1;
    }
}
|
|
395
|
-
function
|
|
396
|
-
|
|
397
|
-
|
|
614
|
+
/**
 * Returns the aggregate bucket stored for `epoch`, creating and registering a
 * zeroed bucket when the map has none yet.
 *
 * @param {Map<number, object>} epochs - Map from epoch number to its bucket.
 * @param {number} epoch - Epoch whose bucket is wanted.
 * @returns {object} The existing or newly created bucket (same instance on
 *   repeated calls for the same epoch).
 */
function getEpochAggregate(epochs, epoch) {
    let bucket = epochs.get(epoch);
    if (!bucket) {
        bucket = {
            runs: 0,
            tool_calls: 0,
            token_total: 0,
            cost_usd: 0,
            latency_ms: 0,
            group_size_sum: 0,
            group_size_count: 0,
            tool_stats: new Map()
        };
        epochs.set(epoch, bucket);
    }
    return bucket;
}
|
|
402
632
|
function roundCurrency(value) {
|
|
403
633
|
return Math.round(value * 1_000_000) / 1_000_000;
|