@kbediako/codex-orchestrator 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/README.md +6 -1
  2. package/dist/bin/codex-orchestrator.js +38 -0
  3. package/dist/orchestrator/src/cli/config/delegationConfig.js +485 -0
  4. package/dist/orchestrator/src/cli/control/confirmations.js +262 -0
  5. package/dist/orchestrator/src/cli/control/controlServer.js +1476 -0
  6. package/dist/orchestrator/src/cli/control/controlState.js +46 -0
  7. package/dist/orchestrator/src/cli/control/controlWatcher.js +222 -0
  8. package/dist/orchestrator/src/cli/control/delegationTokens.js +62 -0
  9. package/dist/orchestrator/src/cli/control/questions.js +106 -0
  10. package/dist/orchestrator/src/cli/delegationServer.js +1368 -0
  11. package/dist/orchestrator/src/cli/events/runEventStream.js +246 -0
  12. package/dist/orchestrator/src/cli/exec/context.js +4 -1
  13. package/dist/orchestrator/src/cli/exec/stageRunner.js +30 -5
  14. package/dist/orchestrator/src/cli/metrics/metricsAggregator.js +377 -147
  15. package/dist/orchestrator/src/cli/metrics/metricsRecorder.js +3 -5
  16. package/dist/orchestrator/src/cli/orchestrator.js +217 -40
  17. package/dist/orchestrator/src/cli/rlmRunner.js +26 -3
  18. package/dist/orchestrator/src/cli/run/manifestPersister.js +33 -3
  19. package/dist/orchestrator/src/cli/run/runPaths.js +14 -0
  20. package/dist/orchestrator/src/cli/services/commandRunner.js +1 -1
  21. package/dist/orchestrator/src/cli/utils/devtools.js +33 -2
  22. package/dist/orchestrator/src/persistence/ExperienceStore.js +113 -46
  23. package/dist/orchestrator/src/persistence/PersistenceCoordinator.js +8 -8
  24. package/dist/orchestrator/src/persistence/TaskStateStore.js +2 -1
  25. package/dist/orchestrator/src/persistence/lockFile.js +26 -1
  26. package/dist/orchestrator/src/sync/CloudSyncWorker.js +17 -4
  27. package/dist/packages/orchestrator/src/telemetry/otel-exporter.js +21 -0
  28. package/package.json +3 -1
@@ -1,6 +1,10 @@
1
- import { appendFile, mkdir, readFile, readdir, rm, stat, writeFile } from 'node:fs/promises';
1
+ import { createReadStream } from 'node:fs';
2
+ import { appendFile, mkdir, open, readFile, readdir, rename, rm, stat } from 'node:fs/promises';
2
3
  import { dirname, join } from 'node:path';
4
+ import { createInterface } from 'node:readline';
3
5
  import { acquireLockWithRetry } from '../../persistence/lockFile.js';
6
+ import { EnvUtils } from '../../../../packages/shared/config/index.js';
7
+ import { writeJsonAtomic } from '../utils/fs.js';
4
8
  const REQUIRED_COMPLETENESS_FIELDS = [
5
9
  'instance_stats',
6
10
  'privacy_events',
@@ -9,6 +13,8 @@ const REQUIRED_COMPLETENESS_FIELDS = [
9
13
  const METRICS_LOCK_FILENAME = 'metrics.lock';
10
14
  const METRICS_PENDING_DIRNAME = 'metrics.pending';
11
15
  const DEFAULT_LOCK_STALE_MS = 5 * 60 * 1000;
16
+ const DEFAULT_PENDING_BATCH_MAX_LINES = 500;
17
+ const DEFAULT_PENDING_BATCH_MAX_BYTES = 1024 * 1024;
12
18
  const DEFAULT_LOCK_RETRY = {
13
19
  maxAttempts: 4,
14
20
  initialDelayMs: 50,
@@ -55,10 +61,20 @@ class MetricsLockError extends Error {
55
61
  this.name = 'MetricsLockError';
56
62
  }
57
63
  }
58
- async function drainMetricsEntryFile(env, path) {
59
- let raw = '';
64
+ async function streamMetricsEntryLines(path, onLine) {
65
+ let count = 0;
66
+ let reader;
67
+ let stream;
60
68
  try {
61
- raw = await readFile(path, 'utf8');
69
+ stream = createReadStream(path, { encoding: 'utf8' });
70
+ reader = createInterface({ input: stream, crlfDelay: Infinity });
71
+ for await (const line of reader) {
72
+ if (line.trim().length === 0) {
73
+ continue;
74
+ }
75
+ count += 1;
76
+ await onLine(line);
77
+ }
62
78
  }
63
79
  catch (error) {
64
80
  if (error.code === 'ENOENT') {
@@ -66,21 +82,104 @@ async function drainMetricsEntryFile(env, path) {
66
82
  }
67
83
  throw error;
68
84
  }
69
- const lines = raw.trim().split('\n').filter(Boolean);
70
- if (lines.length === 0) {
71
- await rm(path, { force: true });
72
- return 0;
85
+ finally {
86
+ reader?.close();
87
+ stream?.destroy();
88
+ }
89
+ return count;
90
+ }
91
+ function isMetricsEntryCandidate(value) {
92
+ if (!value || typeof value !== 'object' || Array.isArray(value)) {
93
+ return false;
94
+ }
95
+ const candidate = value;
96
+ if (typeof candidate.run_id !== 'string' || candidate.run_id.trim().length === 0) {
97
+ return false;
98
+ }
99
+ if (typeof candidate.status !== 'string' || candidate.status.trim().length === 0) {
100
+ return false;
101
+ }
102
+ if (typeof candidate.recorded_at !== 'string' || candidate.recorded_at.trim().length === 0) {
103
+ return false;
104
+ }
105
+ return true;
106
+ }
107
+ function parseMetricsEntry(line) {
108
+ const trimmed = line.trim();
109
+ if (!trimmed) {
110
+ return null;
111
+ }
112
+ try {
113
+ const parsed = JSON.parse(trimmed);
114
+ return isMetricsEntryCandidate(parsed) ? parsed : null;
115
+ }
116
+ catch {
117
+ return null;
118
+ }
119
+ }
120
+ async function promotePendingTmpFile(tmpPath, pendingDir) {
121
+ let lineCount = 0;
122
+ let invalid = false;
123
+ try {
124
+ await streamMetricsEntryLines(tmpPath, async (line) => {
125
+ lineCount += 1;
126
+ if (!parseMetricsEntry(line)) {
127
+ invalid = true;
128
+ throw new Error('invalid metrics entry');
129
+ }
130
+ });
131
+ }
132
+ catch (error) {
133
+ if (invalid) {
134
+ return false;
135
+ }
136
+ if (error.code === 'ENOENT') {
137
+ return false;
138
+ }
139
+ throw error;
140
+ }
141
+ if (lineCount === 0) {
142
+ return false;
143
+ }
144
+ const targetPath = tmpPath.replace(/\.tmp$/, '');
145
+ try {
146
+ await rename(tmpPath, targetPath);
147
+ return true;
148
+ }
149
+ catch (error) {
150
+ const code = error.code;
151
+ if (code === 'ENOENT') {
152
+ return false;
153
+ }
154
+ if (code === 'EEXIST') {
155
+ const recoveryPath = join(pendingDir, `recovered-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`);
156
+ await rename(tmpPath, recoveryPath);
157
+ return true;
158
+ }
159
+ throw error;
160
+ }
161
+ }
162
+ function normalizeBatchLimit(value) {
163
+ if (!Number.isFinite(value) || value <= 0) {
164
+ return Number.POSITIVE_INFINITY;
73
165
  }
74
- await mkdir(getMetricsRoot(env), { recursive: true });
75
- const payload = `${lines.join('\n')}\n`;
76
- await appendFile(getMetricsPath(env), payload, 'utf8');
77
- await rm(path, { force: true });
78
- return lines.length;
166
+ return value;
167
+ }
168
+ function getPendingBatchLimits() {
169
+ const maxLines = EnvUtils.getInt('CODEX_METRICS_PENDING_BATCH_MAX_LINES', DEFAULT_PENDING_BATCH_MAX_LINES);
170
+ const maxBytes = EnvUtils.getInt('CODEX_METRICS_PENDING_BATCH_MAX_BYTES', DEFAULT_PENDING_BATCH_MAX_BYTES);
171
+ return {
172
+ maxLines: normalizeBatchLimit(maxLines),
173
+ maxBytes: normalizeBatchLimit(maxBytes)
174
+ };
79
175
  }
80
176
  export async function mergePendingMetricsEntries(env) {
81
177
  const pendingDir = getMetricsPendingDir(env);
178
+ const metricsRoot = getMetricsRoot(env);
179
+ const metricsPath = getMetricsPath(env);
82
180
  let merged = 0;
83
181
  const staleTmpMs = DEFAULT_LOCK_STALE_MS;
182
+ const { maxLines: maxBatchLines, maxBytes: maxBatchBytes } = getPendingBatchLimits();
84
183
  for (let pass = 0; pass < 2; pass += 1) {
85
184
  let entries = [];
86
185
  try {
@@ -93,6 +192,7 @@ export async function mergePendingMetricsEntries(env) {
93
192
  throw error;
94
193
  }
95
194
  const now = Date.now();
195
+ let promotedTmp = false;
96
196
  for (const entry of entries) {
97
197
  if (!entry.isFile() || !entry.name.endsWith('.tmp')) {
98
198
  continue;
@@ -101,7 +201,13 @@ export async function mergePendingMetricsEntries(env) {
101
201
  try {
102
202
  const stats = await stat(tmpPath);
103
203
  if (now - stats.mtimeMs > staleTmpMs) {
104
- await rm(tmpPath, { force: true });
204
+ const promoted = await promotePendingTmpFile(tmpPath, pendingDir);
205
+ if (promoted) {
206
+ promotedTmp = true;
207
+ }
208
+ if (!promoted) {
209
+ await rm(tmpPath, { force: true });
210
+ }
105
211
  }
106
212
  }
107
213
  catch (error) {
@@ -115,14 +221,79 @@ export async function mergePendingMetricsEntries(env) {
115
221
  .map((entry) => entry.name)
116
222
  .sort();
117
223
  if (files.length === 0) {
118
- break;
224
+ if (!promotedTmp) {
225
+ break;
226
+ }
227
+ continue;
119
228
  }
229
+ await mkdir(metricsRoot, { recursive: true });
230
+ let payloadLines = [];
231
+ let filesToRemove = [];
232
+ let payloadLineCount = 0;
233
+ let payloadBytes = 0;
234
+ const flushBatch = async () => {
235
+ if (payloadLines.length > 0) {
236
+ const payload = `${payloadLines.join('\n')}\n`;
237
+ await ensureMetricsTrailingNewline(metricsPath);
238
+ await appendFile(metricsPath, payload, 'utf8');
239
+ }
240
+ if (filesToRemove.length > 0) {
241
+ await Promise.all(filesToRemove.map((filePath) => rm(filePath, { force: true })));
242
+ }
243
+ payloadLines = [];
244
+ filesToRemove = [];
245
+ payloadLineCount = 0;
246
+ payloadBytes = 0;
247
+ };
120
248
  for (const file of files) {
121
- merged += await drainMetricsEntryFile(env, join(pendingDir, file));
249
+ const filePath = join(pendingDir, file);
250
+ const fileLineCount = await streamMetricsEntryLines(filePath, async (line) => {
251
+ const lineBytes = Buffer.byteLength(line, 'utf8') + 1;
252
+ const wouldExceedLines = payloadLineCount + 1 > maxBatchLines;
253
+ const wouldExceedBytes = payloadBytes + lineBytes > maxBatchBytes;
254
+ if (payloadLines.length > 0 && (wouldExceedLines || wouldExceedBytes)) {
255
+ await flushBatch();
256
+ }
257
+ payloadLines.push(line);
258
+ payloadLineCount += 1;
259
+ payloadBytes += lineBytes;
260
+ merged += 1;
261
+ });
262
+ if (fileLineCount === 0) {
263
+ await rm(filePath, { force: true });
264
+ continue;
265
+ }
266
+ filesToRemove.push(filePath);
122
267
  }
268
+ await flushBatch();
123
269
  }
124
270
  return merged;
125
271
  }
272
+ export async function ensureMetricsTrailingNewline(path) {
273
+ try {
274
+ const handle = await open(path, 'r');
275
+ try {
276
+ const stats = await handle.stat();
277
+ if (stats.size === 0) {
278
+ return;
279
+ }
280
+ const buffer = Buffer.alloc(1);
281
+ await handle.read(buffer, 0, 1, stats.size - 1);
282
+ if (buffer[0] !== 0x0a) {
283
+ await appendFile(path, '\n', 'utf8');
284
+ }
285
+ }
286
+ finally {
287
+ await handle.close();
288
+ }
289
+ }
290
+ catch (error) {
291
+ if (error.code === 'ENOENT') {
292
+ return;
293
+ }
294
+ throw error;
295
+ }
296
+ }
126
297
  export async function withMetricsLock(env, action, options = {}) {
127
298
  const overrides = options.retry ?? {};
128
299
  const sanitizedOverrides = Object.fromEntries(Object.entries(overrides).filter(([, value]) => value !== undefined));
@@ -158,19 +329,30 @@ export async function withMetricsLock(env, action, options = {}) {
158
329
  export async function updateMetricsAggregates(env) {
159
330
  const metricsRoot = getMetricsRoot(env);
160
331
  const metricsPath = getMetricsPath(env);
161
- const entries = await loadMetricsEntries(metricsPath);
162
- if (entries.length === 0) {
332
+ const state = createAggregationState();
333
+ await streamMetricsEntryLines(metricsPath, async (line) => {
334
+ const entry = parseMetricsEntry(line);
335
+ if (!entry) {
336
+ return;
337
+ }
338
+ if (state.seenRunIds.has(entry.run_id)) {
339
+ return;
340
+ }
341
+ state.seenRunIds.add(entry.run_id);
342
+ accumulateMetricsEntry(state, entry);
343
+ });
344
+ if (state.totalRuns === 0 || !state.baselineEntry) {
163
345
  return;
164
346
  }
165
347
  const metricsDir = join(metricsRoot, 'metrics');
166
348
  await mkdir(metricsDir, { recursive: true });
349
+ await ensureBaseline(metricsDir, state.baselineEntry);
167
350
  await Promise.all([
168
- ensureBaseline(metricsDir, entries[0]),
169
- writePostRollout(metricsDir, entries),
170
- writeCompleteness(metricsDir, entries),
171
- writeMttrDelta(env, entries),
172
- writeTfgrpoEpochAggregates(metricsDir, entries),
173
- writeLearningState(env, entries)
351
+ writePostRollout(metricsDir, state),
352
+ writeCompleteness(metricsDir, state),
353
+ writeMttrDelta(env, state),
354
+ writeTfgrpoEpochAggregates(metricsDir, state),
355
+ writeLearningState(env, state)
174
356
  ]);
175
357
  }
176
358
  async function ensureBaseline(dir, entry) {
@@ -191,11 +373,11 @@ async function ensureBaseline(dir, entry) {
191
373
  duration_seconds: entry.duration_seconds,
192
374
  completion_rate: entry.status === 'succeeded' ? 1 : 0
193
375
  };
194
- await writeFile(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`, 'utf8');
376
+ await writeJsonAtomic(baselinePath, baseline);
195
377
  }
196
- async function writePostRollout(dir, entries) {
197
- const totalRuns = entries.length;
198
- const succeededRuns = entries.filter((entry) => entry.status === 'succeeded').length;
378
+ async function writePostRollout(dir, state) {
379
+ const totalRuns = state.totalRuns;
380
+ const succeededRuns = state.succeededRuns;
199
381
  const completionRate = totalRuns > 0 ? succeededRuns / totalRuns : 0;
200
382
  const payload = {
201
383
  total_runs: totalRuns,
@@ -204,44 +386,29 @@ async function writePostRollout(dir, entries) {
204
386
  meets_threshold: completionRate >= 0.95,
205
387
  updated_at: new Date().toISOString()
206
388
  };
207
- await writeFile(join(dir, 'post-rollout.json'), `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
389
+ await writeJsonAtomic(join(dir, 'post-rollout.json'), payload);
208
390
  }
209
- async function writeCompleteness(dir, entries) {
210
- const fieldChecks = REQUIRED_COMPLETENESS_FIELDS.length * entries.length;
391
+ async function writeCompleteness(dir, state) {
392
+ const fieldChecks = REQUIRED_COMPLETENESS_FIELDS.length * state.totalRuns;
211
393
  if (fieldChecks === 0) {
212
394
  return;
213
395
  }
214
- const missingCounts = Object.fromEntries(REQUIRED_COMPLETENESS_FIELDS.map((field) => [field, 0]));
215
- for (const entry of entries) {
216
- if (!Array.isArray(entry.instance_stats) || entry.instance_stats.length === 0) {
217
- missingCounts.instance_stats += 1;
218
- }
219
- if (!Array.isArray(entry.privacy_events) || entry.privacy_events.length === 0) {
220
- missingCounts.privacy_events += 1;
221
- }
222
- if (!entry.control_plane_status || entry.control_plane_status === 'unknown') {
223
- missingCounts.control_plane_status += 1;
224
- }
225
- }
226
- const totalMissing = Object.values(missingCounts).reduce((sum, value) => sum + value, 0);
396
+ const totalMissing = Object.values(state.missingCounts).reduce((sum, value) => sum + value, 0);
227
397
  const ratio = totalMissing / fieldChecks;
228
398
  const payload = {
229
399
  checked_fields: REQUIRED_COMPLETENESS_FIELDS,
230
- missing_counts: missingCounts,
400
+ missing_counts: state.missingCounts,
231
401
  missing_field_ratio: ratio,
232
402
  meets_threshold: ratio < 0.05,
233
403
  updated_at: new Date().toISOString()
234
404
  };
235
- await writeFile(join(dir, 'completeness.json'), `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
405
+ await writeJsonAtomic(join(dir, 'completeness.json'), payload);
236
406
  }
237
- async function writeMttrDelta(env, entries) {
238
- const durations = entries
239
- .map((entry) => entry.duration_seconds)
240
- .filter((value) => typeof value === 'number' && Number.isFinite(value));
241
- if (durations.length === 0) {
407
+ async function writeMttrDelta(env, state) {
408
+ if (state.durationCount === 0) {
242
409
  return;
243
410
  }
244
- const currentMttr = average(durations);
411
+ const currentMttr = state.durationSum / state.durationCount;
245
412
  const metricsDir = join(env.runsRoot, env.taskId, 'metrics');
246
413
  const baselinePath = join(metricsDir, 'baseline.json');
247
414
  let baselineMttr = currentMttr;
@@ -265,139 +432,202 @@ async function writeMttrDelta(env, entries) {
265
432
  };
266
433
  const outDir = join(env.outRoot, env.taskId, 'metrics');
267
434
  await mkdir(outDir, { recursive: true });
268
- await writeFile(join(outDir, 'mttr-delta.json'), `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
435
+ await writeJsonAtomic(join(outDir, 'mttr-delta.json'), payload);
269
436
  }
270
- async function writeTfgrpoEpochAggregates(dir, entries) {
271
- const grouped = new Map();
272
- for (const entry of entries) {
273
- if (typeof entry.tfgrpo_epoch !== 'number') {
274
- continue;
275
- }
276
- const bucket = grouped.get(entry.tfgrpo_epoch) ?? [];
277
- bucket.push(entry);
278
- grouped.set(entry.tfgrpo_epoch, bucket);
279
- }
280
- if (grouped.size === 0) {
437
+ async function writeTfgrpoEpochAggregates(dir, state) {
438
+ if (state.epochs.size === 0) {
281
439
  return;
282
440
  }
283
- const epochs = Array.from(grouped.entries())
441
+ const epochs = Array.from(state.epochs.entries())
284
442
  .sort(([a], [b]) => a - b)
285
- .map(([epoch, group]) => summarizeEpoch(epoch, group));
443
+ .map(([epoch, aggregate]) => ({
444
+ epoch,
445
+ runs: aggregate.runs,
446
+ tool_calls: aggregate.tool_calls,
447
+ token_total: aggregate.token_total,
448
+ cost_usd: roundCurrency(aggregate.cost_usd),
449
+ latency_ms: aggregate.latency_ms,
450
+ group_size_avg: aggregate.group_size_count > 0
451
+ ? aggregate.group_size_sum / aggregate.group_size_count
452
+ : null,
453
+ tools: Array.from(aggregate.tool_stats.entries()).map(([tool, toolAggregate]) => ({
454
+ tool,
455
+ runs: toolAggregate.runs,
456
+ tokens: toolAggregate.tokens,
457
+ cost_usd: roundCurrency(toolAggregate.costUsd),
458
+ latency_ms: toolAggregate.latencyMs
459
+ }))
460
+ }));
286
461
  const payload = {
287
462
  epochs,
288
463
  updated_at: new Date().toISOString()
289
464
  };
290
- await writeFile(join(dir, 'per-epoch.json'), `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
465
+ await writeJsonAtomic(join(dir, 'per-epoch.json'), payload);
291
466
  }
292
- async function writeLearningState(env, entries) {
293
- const validationStatuses = entries
294
- .map((entry) => entry.learning_validation_status)
295
- .filter((value) => typeof value === 'string');
296
- const validationSummary = {
297
- passed: validationStatuses.filter((status) => status === 'validated').length,
298
- failed: validationStatuses.filter((status) => status === 'snapshot_failed').length,
299
- stalled: validationStatuses.filter((status) => status === 'stalled_snapshot').length,
300
- manual: validationStatuses.filter((status) => status === 'needs_manual_scenario').length
301
- };
302
- const reviewerRejections = entries.reduce((sum, entry) => sum + (entry.learning_review_rejections ?? 0), 0);
303
- const reviewerLatencies = entries
304
- .map((entry) => entry.learning_review_latency_ms)
305
- .filter((value) => typeof value === 'number' && Number.isFinite(value));
306
- const reviewerLatencyMs = reviewerLatencies.length > 0
307
- ? reviewerLatencies.reduce((sum, value) => sum + value, 0) / reviewerLatencies.length
467
+ async function writeLearningState(env, state) {
468
+ const reviewerLatencyMs = state.reviewerLatencyCount > 0
469
+ ? state.reviewerLatencySum / state.reviewerLatencyCount
308
470
  : null;
309
- const regressions = entries.reduce((sum, entry) => sum + (entry.learning_regressions_detected ?? 0), 0);
310
- const patternPromotions = entries.reduce((sum, entry) => sum + (entry.learning_pattern_promoted ?? 0), 0);
311
- const patternDeprecations = entries.reduce((sum, entry) => sum + (entry.learning_pattern_deprecated ?? 0), 0);
312
- const throughputCandidates = entries.reduce((sum, entry) => sum + (entry.learning_throughput_candidates ?? 0), 0);
313
471
  const alerts = {
314
- total: entries.reduce((sum, entry) => sum + (entry.learning_alerts ?? 0), 0),
315
- snapshot_failed: entries.filter((entry) => entry.learning_snapshot_status === 'snapshot_failed').length,
316
- stalled_snapshot: entries.filter((entry) => entry.learning_snapshot_status === 'stalled_snapshot').length,
317
- needs_manual_scenario: validationSummary.manual
472
+ total: state.alertsTotal,
473
+ snapshot_failed: state.alertsSnapshotFailed,
474
+ stalled_snapshot: state.alertsStalledSnapshot,
475
+ needs_manual_scenario: state.validationSummary.manual
318
476
  };
319
477
  const payload = {
320
478
  updated_at: new Date().toISOString(),
321
479
  safety: {
322
- validation: validationSummary,
323
- reviewer: { rejections: reviewerRejections, average_latency_ms: reviewerLatencyMs },
324
- regression_detection: { detected: regressions },
325
- pattern_hygiene: { promoted: patternPromotions, deprecated: patternDeprecations }
480
+ validation: state.validationSummary,
481
+ reviewer: { rejections: state.reviewerRejections, average_latency_ms: reviewerLatencyMs },
482
+ regression_detection: { detected: state.regressions },
483
+ pattern_hygiene: { promoted: state.patternPromotions, deprecated: state.patternDeprecations }
326
484
  },
327
- throughput: { candidates: throughputCandidates },
485
+ throughput: { candidates: state.throughputCandidates },
328
486
  alerts
329
487
  };
330
488
  const outDir = join(env.outRoot, env.taskId);
331
489
  await mkdir(outDir, { recursive: true });
332
- await writeFile(join(outDir, 'state.json'), `${JSON.stringify(payload, null, 2)}\n`, 'utf8');
490
+ await writeJsonAtomic(join(outDir, 'state.json'), payload);
333
491
  }
334
- function summarizeEpoch(epoch, entries) {
335
- const runs = entries.length;
336
- const toolCalls = entries.reduce((sum, entry) => sum + (entry.tool_calls ?? 0), 0);
337
- const tokenTotal = entries.reduce((sum, entry) => sum + (entry.token_total ?? 0), 0);
338
- const costUsd = roundCurrency(entries.reduce((sum, entry) => sum + (entry.cost_usd ?? 0), 0));
339
- const latencyMs = entries.reduce((sum, entry) => sum + (entry.latency_ms ?? 0), 0);
340
- const groupSizes = entries
341
- .map((entry) => entry.tfgrpo_group_size)
342
- .filter((value) => typeof value === 'number');
343
- const groupSizeAvg = groupSizes.length > 0 ? groupSizes.reduce((sum, value) => sum + value, 0) / groupSizes.length : null;
492
+ function createAggregationState() {
493
+ const missingCounts = Object.fromEntries(REQUIRED_COMPLETENESS_FIELDS.map((field) => [field, 0]));
344
494
  return {
345
- epoch,
346
- runs,
347
- tool_calls: toolCalls,
348
- token_total: tokenTotal,
349
- cost_usd: costUsd,
350
- latency_ms: latencyMs,
351
- group_size_avg: groupSizeAvg,
352
- tools: aggregateToolStats(entries)
495
+ totalRuns: 0,
496
+ succeededRuns: 0,
497
+ baselineEntry: null,
498
+ seenRunIds: new Set(),
499
+ missingCounts,
500
+ durationSum: 0,
501
+ durationCount: 0,
502
+ epochs: new Map(),
503
+ validationSummary: {
504
+ passed: 0,
505
+ failed: 0,
506
+ stalled: 0,
507
+ manual: 0
508
+ },
509
+ reviewerRejections: 0,
510
+ reviewerLatencySum: 0,
511
+ reviewerLatencyCount: 0,
512
+ regressions: 0,
513
+ patternPromotions: 0,
514
+ patternDeprecations: 0,
515
+ throughputCandidates: 0,
516
+ alertsTotal: 0,
517
+ alertsSnapshotFailed: 0,
518
+ alertsStalledSnapshot: 0
353
519
  };
354
520
  }
355
- function aggregateToolStats(entries) {
356
- const aggregates = new Map();
357
- for (const entry of entries) {
358
- const stats = entry.tool_stats ?? [];
521
+ function accumulateMetricsEntry(state, entry) {
522
+ state.totalRuns += 1;
523
+ if (!state.baselineEntry) {
524
+ state.baselineEntry = entry;
525
+ }
526
+ if (entry.status === 'succeeded') {
527
+ state.succeededRuns += 1;
528
+ }
529
+ if (!Array.isArray(entry.instance_stats) || entry.instance_stats.length === 0) {
530
+ state.missingCounts.instance_stats += 1;
531
+ }
532
+ if (!Array.isArray(entry.privacy_events) || entry.privacy_events.length === 0) {
533
+ state.missingCounts.privacy_events += 1;
534
+ }
535
+ if (!entry.control_plane_status || entry.control_plane_status === 'unknown') {
536
+ state.missingCounts.control_plane_status += 1;
537
+ }
538
+ if (typeof entry.duration_seconds === 'number' && Number.isFinite(entry.duration_seconds)) {
539
+ state.durationSum += entry.duration_seconds;
540
+ state.durationCount += 1;
541
+ }
542
+ if (typeof entry.tfgrpo_epoch === 'number') {
543
+ const aggregate = getEpochAggregate(state.epochs, entry.tfgrpo_epoch);
544
+ aggregate.runs += 1;
545
+ aggregate.tool_calls += typeof entry.tool_calls === 'number' ? entry.tool_calls : 0;
546
+ aggregate.token_total += typeof entry.token_total === 'number' ? entry.token_total : 0;
547
+ aggregate.cost_usd += typeof entry.cost_usd === 'number' ? entry.cost_usd : 0;
548
+ aggregate.latency_ms += typeof entry.latency_ms === 'number' ? entry.latency_ms : 0;
549
+ if (typeof entry.tfgrpo_group_size === 'number' && Number.isFinite(entry.tfgrpo_group_size)) {
550
+ aggregate.group_size_sum += entry.tfgrpo_group_size;
551
+ aggregate.group_size_count += 1;
552
+ }
553
+ const stats = Array.isArray(entry.tool_stats) ? entry.tool_stats : [];
359
554
  for (const stat of stats) {
360
555
  if (typeof stat.tool !== 'string' || !stat.tool) {
361
556
  continue;
362
557
  }
363
- const current = aggregates.get(stat.tool) ?? { runs: 0, tokens: 0, costUsd: 0, latencyMs: 0 };
558
+ const current = aggregate.tool_stats.get(stat.tool) ?? { runs: 0, tokens: 0, costUsd: 0, latencyMs: 0 };
364
559
  current.runs += 1;
365
560
  current.tokens += typeof stat.tokens === 'number' ? stat.tokens : 0;
366
561
  current.costUsd += typeof stat.cost_usd === 'number' ? stat.cost_usd : 0;
367
562
  current.latencyMs += typeof stat.latency_ms === 'number' ? stat.latency_ms : 0;
368
- aggregates.set(stat.tool, current);
563
+ aggregate.tool_stats.set(stat.tool, current);
369
564
  }
370
565
  }
371
- return Array.from(aggregates.entries()).map(([tool, aggregate]) => ({
372
- tool,
373
- runs: aggregate.runs,
374
- tokens: aggregate.tokens,
375
- cost_usd: roundCurrency(aggregate.costUsd),
376
- latency_ms: aggregate.latencyMs
377
- }));
378
- }
379
- async function loadMetricsEntries(path) {
380
- try {
381
- const raw = await readFile(path, 'utf8');
382
- return raw
383
- .trim()
384
- .split('\n')
385
- .filter(Boolean)
386
- .map((line) => JSON.parse(line));
387
- }
388
- catch (error) {
389
- if (error.code === 'ENOENT') {
390
- return [];
566
+ if (typeof entry.learning_validation_status === 'string') {
567
+ switch (entry.learning_validation_status) {
568
+ case 'validated':
569
+ state.validationSummary.passed += 1;
570
+ break;
571
+ case 'snapshot_failed':
572
+ state.validationSummary.failed += 1;
573
+ break;
574
+ case 'stalled_snapshot':
575
+ state.validationSummary.stalled += 1;
576
+ break;
577
+ case 'needs_manual_scenario':
578
+ state.validationSummary.manual += 1;
579
+ break;
580
+ default:
581
+ break;
391
582
  }
392
- throw error;
583
+ }
584
+ if (typeof entry.learning_review_rejections === 'number') {
585
+ state.reviewerRejections += entry.learning_review_rejections;
586
+ }
587
+ if (typeof entry.learning_review_latency_ms === 'number' &&
588
+ Number.isFinite(entry.learning_review_latency_ms)) {
589
+ state.reviewerLatencySum += entry.learning_review_latency_ms;
590
+ state.reviewerLatencyCount += 1;
591
+ }
592
+ if (typeof entry.learning_regressions_detected === 'number') {
593
+ state.regressions += entry.learning_regressions_detected;
594
+ }
595
+ if (typeof entry.learning_pattern_promoted === 'number') {
596
+ state.patternPromotions += entry.learning_pattern_promoted;
597
+ }
598
+ if (typeof entry.learning_pattern_deprecated === 'number') {
599
+ state.patternDeprecations += entry.learning_pattern_deprecated;
600
+ }
601
+ if (typeof entry.learning_throughput_candidates === 'number') {
602
+ state.throughputCandidates += entry.learning_throughput_candidates;
603
+ }
604
+ if (typeof entry.learning_alerts === 'number') {
605
+ state.alertsTotal += entry.learning_alerts;
606
+ }
607
+ if (entry.learning_snapshot_status === 'snapshot_failed') {
608
+ state.alertsSnapshotFailed += 1;
609
+ }
610
+ if (entry.learning_snapshot_status === 'stalled_snapshot') {
611
+ state.alertsStalledSnapshot += 1;
393
612
  }
394
613
  }
395
- function average(values) {
396
- if (values.length === 0) {
397
- return 0;
614
+ function getEpochAggregate(epochs, epoch) {
615
+ const existing = epochs.get(epoch);
616
+ if (existing) {
617
+ return existing;
398
618
  }
399
- const sum = values.reduce((total, value) => total + value, 0);
400
- return sum / values.length;
619
+ const created = {
620
+ runs: 0,
621
+ tool_calls: 0,
622
+ token_total: 0,
623
+ cost_usd: 0,
624
+ latency_ms: 0,
625
+ group_size_sum: 0,
626
+ group_size_count: 0,
627
+ tool_stats: new Map()
628
+ };
629
+ epochs.set(epoch, created);
630
+ return created;
401
631
  }
402
632
  function roundCurrency(value) {
403
633
  return Math.round(value * 1_000_000) / 1_000_000;