@barivia/barmesh-mcp 0.5.4 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocking_monitor.js +34 -0
- package/dist/convergence_reading.js +25 -0
- package/dist/figure_sections.js +50 -0
- package/dist/index.js +1 -1
- package/dist/job_monitor.js +14 -3
- package/dist/results_metadata.js +78 -0
- package/dist/shared.js +1 -1
- package/dist/tools/barmesh_results_explorer.js +31 -53
- package/dist/tools/cfd.js +1 -0
- package/dist/tools/guide.js +1 -1
- package/dist/tools/jobs.js +4 -3
- package/dist/tools/results.js +3 -13
- package/dist/tools/training_monitor.js +75 -52
- package/dist/training_monitor_curve.js +60 -0
- package/dist/training_review.js +222 -0
- package/dist/views/src/views/barmesh-results-explorer/index.html +20 -19
- package/dist/views/src/views/barmesh-training-monitor/index.html +155 -0
- package/dist/viz-server.js +52 -1
- package/package.json +4 -3
|
@@ -1,56 +1,79 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
.optional()
|
|
19
|
-
.describe(`Max seconds to wait (default ${DEFAULT_BLOCK_UNTIL_SEC}; mesh jobs often need 6–10 min)`),
|
|
20
|
-
poll_interval_sec: z
|
|
21
|
-
.number()
|
|
22
|
-
.int()
|
|
23
|
-
.min(5)
|
|
24
|
-
.optional()
|
|
25
|
-
.describe(`Seconds between status polls (default ${DEFAULT_POLL_INTERVAL_SEC}; do not go below 5)`),
|
|
26
|
-
wait_finalize: z
|
|
27
|
-
.boolean()
|
|
28
|
-
.optional()
|
|
29
|
-
.describe("When true (default), wait for cfd_finalize after compute completes before returning"),
|
|
30
|
-
};
|
|
31
|
-
async function runMonitor(args) {
|
|
32
|
-
const block_until_sec = args.block_until_sec ?? DEFAULT_BLOCK_UNTIL_SEC;
|
|
33
|
-
const poll_interval_sec = args.poll_interval_sec ?? DEFAULT_POLL_INTERVAL_SEC;
|
|
34
|
-
const result = await monitorJob(args.job_id, {
|
|
35
|
-
block_until_sec,
|
|
36
|
-
poll_interval_sec,
|
|
37
|
-
wait_finalize: args.wait_finalize,
|
|
38
|
-
});
|
|
39
|
-
const text = formatMonitorText(result, { block_until_sec, poll_interval_sec });
|
|
40
|
-
return textResult({
|
|
41
|
-
...result.data,
|
|
42
|
-
monitor: {
|
|
43
|
-
job_id: result.job_id,
|
|
44
|
-
terminal: result.terminal,
|
|
45
|
-
timed_out: result.timed_out,
|
|
46
|
-
snapshots: result.snapshots,
|
|
47
|
-
status_text: result.status_text,
|
|
48
|
-
suggested_next_step: result.suggested_next_step,
|
|
49
|
-
},
|
|
50
|
-
status_text: text,
|
|
51
|
-
});
|
|
2
|
+
import { registerAppTool } from "@modelcontextprotocol/ext-apps/server";
|
|
3
|
+
import { runMcpToolAudit } from "../audit.js";
|
|
4
|
+
import { apiCall, getClientSupportsMcpApps, getVizPort, structuredTextResult, } from "../shared.js";
|
|
5
|
+
import { enrichWithTrainingLog, needsTrainingLogEnrichment } from "../training_review.js";
|
|
6
|
+
export const TRAINING_MONITOR_URI = "ui://barmesh/training-monitor";
|
|
7
|
+
export const TRAINING_MONITOR_REFRESH_MS = 5000;
|
|
8
|
+
/**
 * Build the structured payload handed to the training-monitor view.
 *
 * All fields of `data` are carried over; the monitor-specific keys below are
 * applied afterwards so they override same-named keys from `data`.
 *
 * @param {string} job_id - Job id requested by the caller; used when `data.id` is nullish.
 * @param {object} data - Job payload to forward.
 * @returns {object} `data` plus `type`, `jobId`, `id`, `job_id` and `refresh_interval_ms`.
 */
function buildStructuredContent(job_id, data) {
    const resolvedId = String(data.id ?? job_id);
    // The same id is exposed under three spellings.
    const monitorFields = {
        type: "barmesh-training-monitor",
        jobId: resolvedId,
        id: resolvedId,
        job_id: resolvedId,
        refresh_interval_ms: TRAINING_MONITOR_REFRESH_MS,
    };
    return { ...data, ...monitorFields };
}
|
|
53
19
|
/**
 * Register the `barmesh_training_monitor` MCP App tool on `server`.
 *
 * The handler fetches the job from `/v1/jobs/{job_id}`, optionally merges
 * training-log data (when `fetch_training_log` is set or
 * `needsTrainingLogEnrichment` says so), and returns structured content for
 * the embedded view plus text content: a one-line summary and, when a viz
 * port is available, a standalone localhost URL.
 *
 * @param {object} server - MCP server instance passed through to `registerAppTool`.
 * @returns {void}
 */
export function registerTrainingMonitorTool(server) {
    registerAppTool(server, "barmesh_training_monitor", {
        title: "Mesh Convergence Training Monitor",
        description: "Default visual monitor for mesh_convergence jobs after submit. Embedded MCP App auto-refreshes every 5s: epoch progress bar, phase, ETA, live QE/TE curves (uniformly subsampled to ≤1000 batch samples per phase), and a per-epoch hit-grid heatmap when available. On already-completed jobs, replays training-log curves in review mode (same entry point). Also exposes a standalone localhost URL — copy it if window.open is blocked in your MCP host. Headless fallback: barmesh_jobs(action=monitor) blocks with compact text snapshots (live or post-hoc review). barmesh_jobs(action=status) remains a one-shot poll. After completion, use barmesh_results_explorer for figures.",
        inputSchema: {
            job_id: z.string().describe("Job ID from barmesh_mesh_convergence or barmesh_richardson"),
            fetch_training_log: z
                .boolean()
                .optional()
                .describe("Internal: merge completed-job training-log arrays when live progress is empty"),
        },
        _meta: { ui: { resourceUri: TRAINING_MONITOR_URI } },
    }, async (args) => runMcpToolAudit("barmesh_training_monitor", "default", args, async () => {
        const { job_id, fetch_training_log } = args;
        // Percent-encode the id for the request path, as is already done for the viz URLs below.
        let data = (await apiCall("GET", `/v1/jobs/${encodeURIComponent(job_id)}`));
        const jobStatus = String(data.status ?? "");
        // "cancelled" is included so this agrees with the terminal-status check
        // used by needsTrainingLogEnrichment in training_review.js.
        const terminal = jobStatus === "completed" || jobStatus === "failed" || jobStatus === "cancelled";
        if (fetch_training_log || needsTrainingLogEnrichment(data)) {
            data = await enrichWithTrainingLog(job_id, data);
        }
        const structuredContent = buildStructuredContent(job_id, data);
        // A non-numeric progress value is reported as 0% rather than "NaN%".
        const progressFraction = Number(data.progress ?? 0);
        const progress = (Number.isFinite(progressFraction) ? progressFraction : 0) * 100;
        const etaSec = data.training_eta_sec != null ? Number(data.training_eta_sec) : null;
        const elapsedSec = data.training_elapsed_sec != null ? Number(data.training_elapsed_sec) : null;
        const epoch = data.epoch != null ? Number(data.epoch) : null;
        const totalEpochs = data.total_epochs != null ? Number(data.total_epochs) : null;
        const timingParts = [];
        if (elapsedSec != null && elapsedSec >= 0) {
            timingParts.push(`elapsed ${Math.round(elapsedSec)}s`);
        }
        if (etaSec != null && etaSec > 0) {
            timingParts.push(`ETA ~${Math.round(etaSec)}s`);
        }
        if (epoch != null && totalEpochs != null) {
            timingParts.push(`epoch ${epoch}/${totalEpochs}`);
        }
        const timingNote = timingParts.length > 0 ? ` ${timingParts.join(", ")}.` : "";
        const modeNote = terminal ? " (post-hoc review — training-log curves)" : "";
        const text = `Training monitor (visual MCP App, refreshes every ${TRAINING_MONITOR_REFRESH_MS / 1000}s): job ${job_id} — ${jobStatus} (${progress.toFixed(1)}%).${modeNote}${timingNote}`;
        const content = [{ type: "text", text }];
        // Without a viz port there is no standalone URL to offer.
        const port = getVizPort();
        const standaloneUrl = port
            ? `http://localhost:${port}/viz/barmesh-training-monitor?mode=standalone&job_id=${encodeURIComponent(job_id)}`
            : undefined;
        if (standaloneUrl) {
            content.push({
                type: "text",
                text: `Standalone training monitor URL (copy if Open is blocked):\n${standaloneUrl}\n` +
                    `AGENT: surface this URL to the user — use a markdown link or tell them to copy/paste into a browser when the inline panel cannot open localhost. Re-post after the job completes so they can review curves.\n` +
                    `This localhost port is per MCP session and goes stale if the proxy restarts — re-run barmesh_training_monitor for a fresh URL, or set BARIVIA_VIZ_PORT for a persistent port. Health check: http://localhost:${port}/api/health?job_id=${encodeURIComponent(job_id)}`,
            });
        }
        // Clients without MCP Apps support additionally get a markdown link to the same URL.
        if (!getClientSupportsMcpApps() && standaloneUrl) {
            content.push({
                type: "text",
                text: `Interactive training monitor: [Open training monitor](${standaloneUrl})`,
            });
        }
        return {
            ...structuredTextResult(structuredContent, text, content),
            _meta: { ui: { resourceUri: TRAINING_MONITOR_URI } },
        };
    }));
}
|
|
56
|
-
export { runMonitor, MONITOR_DESCRIPTION, monitorSchema };
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/** Batch-sample x-axis helpers for the barmesh_training_monitor MCP App chart. */
|
|
2
|
+
/**
 * Build a 1-based batch-sample axis: `[offset + 1, ..., offset + n]`.
 *
 * @param {number} n - Number of axis entries; values `<= 0` give an empty axis.
 * @param {number} [offset=0] - Value added to every entry.
 * @returns {number[]} The axis values.
 */
export function batchSampleAxis(n, offset = 0) {
    return n <= 0
        ? []
        : Array.from({ length: n }).map((_unused, position) => offset + position + 1);
}
|
|
7
|
+
/**
 * Build one continuous batch axis covering the ordering phase followed by
 * the convergence phase; the convergence part continues after `nOrd`.
 *
 * @param {number} nOrd - Number of ordering samples.
 * @param {number} nConv - Number of convergence samples.
 * @returns {number[]} Ordering axis followed by the offset convergence axis.
 */
export function combinedBatchAxis(nOrd, nConv) {
    if (nOrd <= 0 && nConv <= 0) {
        return [];
    }
    const orderingAxis = batchSampleAxis(nOrd, 0);
    const convergenceAxis = batchSampleAxis(nConv, nOrd);
    return [...orderingAxis, ...convergenceAxis];
}
|
|
12
|
+
/**
 * Describe how many batch samples are shown versus how many exist, per phase.
 *
 * A phase is mentioned only when `training_curve_source_batches` carries a
 * numeric total for it that exceeds the length of the matching error array.
 *
 * @param {object} data - Job payload with optional `training_curve_source_batches`,
 *   `ordering_errors` and `convergence_errors`.
 * @returns {string|null} The note, or `null` when no phase qualifies.
 */
export function formatCurveSourceNote(data) {
    const totals = data.training_curve_source_batches;
    if (!totals) {
        return null;
    }
    const phases = [
        ["ordering", totals.ordering, data.ordering_errors],
        ["convergence", totals.convergence, data.convergence_errors],
    ];
    const notes = [];
    for (const [label, total, shownCurve] of phases) {
        if (typeof total !== "number") {
            continue;
        }
        const shown = Array.isArray(shownCurve) ? shownCurve.length : 0;
        if (total > shown) {
            notes.push(`${shown} of ${total.toLocaleString()} ${label} batch samples`);
        }
    }
    return notes.length === 0
        ? null
        : `Displaying uniformly subsampled QE+TE curves (≤1000 points/phase, joint indices): ${notes.join("; ")}.`;
}
|
|
31
|
+
/**
 * Fit a TE series onto a QE axis of `qeLen` points.
 *
 * - Same length: `te` is returned as-is (same array reference).
 * - Longer: the first `qeLen` entries are kept.
 * - Shorter: the series is left-padded with `null` so it ends on the last QE point.
 *
 * NOTE(review): truncation keeps the head while padding aligns to the tail —
 * confirm this asymmetry is intended.
 *
 * @param {Array} te - TE samples.
 * @param {number} qeLen - Length of the QE axis.
 * @returns {Array} Series of length `qeLen` (empty when `qeLen <= 0`).
 */
export function alignTeToQeAxis(te, qeLen) {
    if (qeLen <= 0) {
        return [];
    }
    const teLen = te.length;
    if (teLen === qeLen) {
        return te;
    }
    if (teLen > qeLen) {
        return te.slice(0, qeLen);
    }
    const padding = Array(qeLen - teLen).fill(null);
    return teLen === 0 ? padding : [...padding, ...te];
}
|
|
44
|
+
/**
 * Last sampled TE point from the (batch-aligned) live TE curve. This is the most
 * recent per-epoch TE estimate, distinct from the final trained-map TE in the summary.
 *
 * The convergence curve is preferred; the ordering curve is consulted only
 * when the convergence curve is absent or empty. A missing last sample
 * (`null`, `undefined` or `""`) yields `null` — it must not be coerced to 0,
 * which would read as a perfect topographic error.
 *
 * @param {object} data - Job payload with optional `convergence_topographic_errors`
 *   and `ordering_topographic_errors` arrays.
 * @returns {number|null} The last finite TE value, or `null` when unavailable.
 */
export function lastEpochTeFromCurves(data) {
    const toFiniteOrNull = (raw) => {
        // Number(null) and Number("") are both 0, so reject them explicitly.
        if (raw == null || raw === "") {
            return null;
        }
        const value = Number(raw);
        return Number.isFinite(value) ? value : null;
    };
    const conv = data.convergence_topographic_errors;
    if (Array.isArray(conv) && conv.length > 0) {
        return toFiniteOrNull(conv[conv.length - 1]);
    }
    const ord = data.ordering_topographic_errors;
    if (Array.isArray(ord) && ord.length > 0) {
        return toFiniteOrNull(ord[ord.length - 1]);
    }
    return null;
}
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-hoc review mode for barmesh_training_monitor on already-terminal jobs.
|
|
3
|
+
* Replays QE curves from training-log and surfaces final timing/metrics.
|
|
4
|
+
*/
|
|
5
|
+
import { apiCall } from "./shared.js";
|
|
6
|
+
import { formatSnapshotLine, snapshotFromJob, } from "./job_monitor.js";
|
|
7
|
+
import { formatJobStatusText } from "./job_status_format.js";
|
|
8
|
+
import { lastEpochTeFromCurves } from "./training_monitor_curve.js";
|
|
9
|
+
export const REVIEW_MAX_SNAPSHOTS = 16;
|
|
10
|
+
/**
 * Whether a job status string denotes a finished job.
 *
 * @param {string} status - Job status.
 * @returns {boolean} `true` for "completed", "failed" or "cancelled".
 */
function isTerminalStatus(status) {
    return ["completed", "failed", "cancelled"].includes(status);
}
|
|
13
|
+
/**
 * Whether the payload carries at least one non-empty TE curve array
 * (`ordering_topographic_errors` or `convergence_topographic_errors`).
 *
 * @param {object} data - Job payload.
 * @returns {boolean} `true` when either TE array is a non-empty array.
 */
export function hasTeCurveArrays(data) {
    const teCurves = [data.ordering_topographic_errors, data.convergence_topographic_errors];
    return teCurves.some((curve) => Array.isArray(curve) && curve.length > 0);
}
|
|
18
|
+
/**
 * Whether the payload carries any non-empty curve array: the QE arrays
 * (`ordering_errors`, `convergence_errors`) or, failing those, the TE arrays.
 *
 * @param {object} data - Job payload.
 * @returns {boolean} `true` when at least one curve array is non-empty.
 */
function hasCurveArrays(data) {
    const isNonEmptyArray = (curve) => Array.isArray(curve) && curve.length > 0;
    if (isNonEmptyArray(data.ordering_errors)) {
        return true;
    }
    if (isNonEmptyArray(data.convergence_errors)) {
        return true;
    }
    return hasTeCurveArrays(data);
}
|
|
25
|
+
/**
 * True when a terminal job still needs training-log curves or the final map TE.
 *
 * Non-terminal jobs never need enrichment. A terminal job needs it when it
 * has no curve arrays, no TE curve arrays, or neither `map_topographic_error`
 * nor `topographic_error`.
 *
 * @param {object} data - Job payload.
 * @returns {boolean} Whether the training log should be fetched and merged.
 */
export function needsTrainingLogEnrichment(data) {
    if (!isTerminalStatus(String(data.status ?? ""))) {
        return false;
    }
    const missingCurves = !hasCurveArrays(data) || !hasTeCurveArrays(data);
    const missingMapTe = data.map_topographic_error == null && data.topographic_error == null;
    return missingCurves || missingMapTe;
}
|
|
38
|
+
/**
 * Merge the job's training log (`/v1/results/{job_id}/training-log`) into `data`.
 *
 * Fields present in the log win over those in `data`. The merged object also
 * gets `epoch_topographic_error` and `map_topographic_error`; for terminal
 * jobs `topographic_error` is set to the map TE as well.
 *
 * Enrichment is best-effort: if the request (or the merge) throws, `data` is
 * returned unchanged apart from `epoch_topographic_error` when one could be
 * derived from `data` alone.
 *
 * @param {string} job_id - Job id; percent-encoded before being placed in the path.
 * @param {object} data - Job payload to enrich (not mutated).
 * @returns {Promise<object>} The enriched payload, or the fallback described above.
 */
export async function enrichWithTrainingLog(job_id, data) {
    const status = String(data.status ?? "");
    // Computed before the request so the catch branch can still surface it.
    const epochTe = data.epoch_topographic_error ??
        data.topographic_error ??
        lastEpochTeFromCurves(data);
    try {
        const log = (await apiCall("GET", `/v1/results/${encodeURIComponent(job_id)}/training-log`));
        const merged = {
            ...data,
            ordering_errors: log.ordering_errors ?? data.ordering_errors,
            convergence_errors: log.convergence_errors ?? data.convergence_errors,
            ordering_topographic_errors: log.ordering_topographic_errors ?? data.ordering_topographic_errors,
            convergence_topographic_errors: log.convergence_topographic_errors ?? data.convergence_topographic_errors,
            quantization_error: log.quantization_error ?? data.quantization_error,
            grid: log.grid ?? data.grid,
            epochs: log.epochs ?? data.epochs,
            training_duration_seconds: log.training_duration_seconds ?? data.training_duration_seconds,
            training_curve_source_batches: log.training_curve_source_batches ?? data.training_curve_source_batches,
        };
        const mapTe = log.topographic_error ?? data.map_topographic_error ?? data.topographic_error;
        // Retry the curve-based estimate now that the log's TE arrays are merged in.
        merged.epoch_topographic_error = epochTe ?? lastEpochTeFromCurves(merged);
        merged.map_topographic_error = mapTe;
        if (isTerminalStatus(status)) {
            merged.topographic_error = mapTe;
        }
        return merged;
    }
    catch {
        // Deliberate best-effort: a missing or failing training log must not
        // break the monitor, so fall back to what `data` already provides.
        if (epochTe != null) {
            return { ...data, epoch_topographic_error: epochTe };
        }
        return data;
    }
}
|
|
72
|
+
/**
 * Evenly sample indices for a compact epoch/QE timeline in review mode.
 *
 * Returns every index when the series already fits in `maxPoints`; otherwise
 * returns at most `maxPoints` ascending, de-duplicated indices spread from
 * `0` to `length - 1`.
 *
 * Edge cases:
 * - `length <= 0`, or `maxPoints` below 1 / not a number: `[]`.
 * - exactly one point requested: the final index (avoids a 0/0 division that
 *   would otherwise produce `NaN`).
 * - fractional `maxPoints` is floored so no index exceeds `length - 1`.
 *
 * @param {number} length - Length of the series being sampled.
 * @param {number} [maxPoints=REVIEW_MAX_SNAPSHOTS] - Maximum number of indices.
 * @returns {number[]} Sampled indices in ascending order.
 */
export function sampleReviewIndices(length, maxPoints = REVIEW_MAX_SNAPSHOTS) {
    if (length <= 0) {
        return [];
    }
    if (length <= maxPoints) {
        return Array.from({ length }, (_, i) => i);
    }
    const points = Math.floor(maxPoints);
    if (!(points >= 1)) {
        return [];
    }
    if (points === 1) {
        return [length - 1];
    }
    const out = [];
    for (let i = 0; i < points; i += 1) {
        out.push(Math.round((i * (length - 1)) / (points - 1)));
    }
    return [...new Set(out)].sort((a, b) => a - b);
}
|
|
84
|
+
/**
 * Turn recorded QE curves into a short list of review snapshots.
 *
 * Ordering samples are picked with the default `sampleReviewIndices` budget;
 * convergence samples with a budget of `Math.min(6, REVIEW_MAX_SNAPSHOTS)`.
 * Samples that are nullish or not numeric are skipped. The last picked index
 * of each phase gets a "(review)" note.
 *
 * @param {Array} ordering - Ordering-phase QE samples.
 * @param {Array} convergence - Convergence-phase QE samples.
 * @param {number} orderingEpochs - Ordering epoch count; when `> 0` it scales
 *   sample positions to epoch numbers and is reported as `total_epochs`.
 * @returns {object[]} Ordering snapshots followed by convergence snapshots.
 */
export function buildCurveSnapshots(ordering, convergence, orderingEpochs) {
    const isUsable = (sample) => sample != null && !Number.isNaN(Number(sample));
    // QE is reported with four decimal places.
    const roundQe = (sample) => Math.round(Number(sample) * 10_000) / 10_000;
    const snapshots = [];
    const orderingPicks = sampleReviewIndices(ordering.length);
    const lastOrderingPick = orderingPicks[orderingPicks.length - 1];
    const hasEpochCount = orderingEpochs > 0;
    orderingPicks.forEach((idx) => {
        const sample = ordering[idx];
        if (!isUsable(sample)) {
            return;
        }
        // Without an epoch count (or with a single sample) the 1-based position stands in.
        let epoch = idx + 1;
        if (ordering.length > 1 && hasEpochCount) {
            epoch = Math.max(1, Math.round(((idx + 1) / ordering.length) * orderingEpochs));
        }
        snapshots.push({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "ordering",
            epoch,
            total_epochs: hasEpochCount ? orderingEpochs : undefined,
            qe: roundQe(sample),
            note: idx === lastOrderingPick ? "ordering (review)" : undefined,
        });
    });
    if (!(convergence.length > 0)) {
        return snapshots;
    }
    const convergencePicks = sampleReviewIndices(convergence.length, Math.min(6, REVIEW_MAX_SNAPSHOTS));
    const lastConvergencePick = convergencePicks[convergencePicks.length - 1];
    convergencePicks.forEach((idx) => {
        const sample = convergence[idx];
        if (!isUsable(sample)) {
            return;
        }
        snapshots.push({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "convergence",
            qe: roundQe(sample),
            note: idx === lastConvergencePick ? "convergence (review)" : undefined,
        });
    });
    return snapshots;
}
|
|
123
|
+
/**
 * Summarise the ordering/convergence phases as a comma-separated string.
 *
 * `data.epochs` must be an array with at least two entries, read as
 * `[orderingEpochs, convergenceEpochs]`; anything else (missing, too short,
 * or not an array) yields `null` instead of throwing. Batch-sample counts are
 * appended from the lengths of `ordering_errors` / `convergence_errors`.
 *
 * @param {object} data - Job payload.
 * @returns {string|null} e.g. "ordering 10 epoch(s), 2 batch samples logged", or `null`.
 */
export function formatPhaseBreakdown(data) {
    const epochs = data.epochs;
    // Array.isArray guards the destructuring below against numbers/objects.
    if (!Array.isArray(epochs) || epochs.length < 2) {
        return null;
    }
    const [ord, conv] = epochs;
    const ordN = Array.isArray(data.ordering_errors) ? data.ordering_errors.length : 0;
    const convN = Array.isArray(data.convergence_errors) ? data.convergence_errors.length : 0;
    const parts = [`ordering ${ord} epoch(s)`];
    if (ordN > 0) {
        parts.push(`${ordN} batch samples logged`);
    }
    // The convergence phase is mentioned only when it actually ran.
    if (conv > 0) {
        parts.push(`convergence ${conv} epoch(s)`);
        if (convN > 0) {
            parts.push(`${convN} batch samples`);
        }
    }
    return parts.join(", ");
}
|
|
140
|
+
/**
 * Format wall-clock and kernel-training durations for the review text.
 *
 * Wall time comes from `wall_elapsed_sec`. Kernel time is the first usable
 * value among `kernel_elapsed_sec`, `training_elapsed_sec` and
 * `training_duration_seconds`. A value is usable when it is non-nullish and
 * converts to a number; all three kernel sources share that check, so a
 * non-numeric `training_duration_seconds` no longer renders as "NaNs".
 *
 * @param {object} data - Job payload.
 * @returns {string|null} "wall Xs, kernel training Ys" when both are known and
 *   differ by at least 2 seconds; otherwise whichever single figure is
 *   available (wall preferred); `null` when neither is.
 */
export function formatTimingSummary(data) {
    const roundedSeconds = (value) => {
        if (value == null) {
            return null;
        }
        const seconds = Number(value);
        return Number.isNaN(seconds) ? null : Math.round(seconds);
    };
    const wall = roundedSeconds(data.wall_elapsed_sec);
    const kernel = roundedSeconds(data.kernel_elapsed_sec) ??
        roundedSeconds(data.training_elapsed_sec) ??
        roundedSeconds(data.training_duration_seconds);
    // Show both figures only when they differ by 2s or more.
    if (wall != null && kernel != null && Math.abs(wall - kernel) >= 2) {
        return `wall ${wall}s, kernel training ${kernel}s`;
    }
    if (wall != null) {
        return `wall ${wall}s`;
    }
    if (kernel != null) {
        return `kernel training ${kernel}s`;
    }
    return null;
}
|
|
160
|
+
/**
 * Render the multi-line text shown for a post-hoc review of a finished job.
 *
 * Layout: header, optional "Phases:" and "Timing:" lines, an optional
 * "Final:" metrics line, the sampled training curve (one line per snapshot
 * via `formatSnapshotLine`), then the terminal status, the suggested next
 * step and a tip.
 *
 * @param {object} result - Monitor result; reads `job_id`, `data`, `snapshots`,
 *   `status_text` and `suggested_next_step`.
 * @param {object} review - Reads `phaseBreakdown` and `timing`; falsy values are omitted.
 * @returns {string} The lines joined with "\n".
 */
export function formatReviewMonitorText(result, review) {
    const { data } = result;
    const out = [];
    out.push(`Job ${result.job_id} monitor (post-hoc review — job already ${data.status ?? "terminal"}):`);
    if (review.phaseBreakdown) {
        out.push(`Phases: ${review.phaseBreakdown}`);
    }
    if (review.timing) {
        out.push(`Timing: ${review.timing}`);
    }
    const qe = data.quantization_error;
    const mapTe = data.map_topographic_error ?? data.topographic_error;
    const epochTe = data.epoch_topographic_error;
    const haveAnyMetric = !(qe == null && mapTe == null && epochTe == null);
    if (haveAnyMetric) {
        const fixed4 = (value) => Number(value).toFixed(4);
        const metrics = [];
        if (qe != null) {
            metrics.push(`QE ${fixed4(qe)}`);
        }
        if (mapTe != null) {
            metrics.push(`Map TE ${fixed4(mapTe)}`);
        }
        if (epochTe != null) {
            if (mapTe == null) {
                metrics.push(`Epoch TE ${fixed4(epochTe)}`);
            }
            else if (Math.abs(Number(epochTe) - Number(mapTe)) > 0.0005) {
                // Only worth showing next to the map TE when visibly different.
                metrics.push(`Last epoch TE ${fixed4(epochTe)}`);
            }
        }
        out.push(`Final: ${metrics.join(", ")}`);
    }
    out.push("", "Training curve (sampled from ordering_errors / convergence_errors):");
    for (const snapshot of result.snapshots) {
        out.push(formatSnapshotLine(snapshot));
    }
    out.push("");
    out.push(`Terminal: ${result.status_text}`);
    out.push(result.suggested_next_step);
    out.push("Tip: learning_curve.png is attached when available; use barmesh_results_explorer for all figures.");
    return out.join("\n");
}
|
|
193
|
+
/**
 * Assemble a monitor result for a job that has already finished.
 *
 * The job payload is enriched with its training log, the QE curves are
 * sampled into snapshots (falling back to a single snapshot built from the
 * job itself when no curve yields one), and the review text is stored on
 * `result.data.status_text`.
 *
 * @param {string} job_id - Job id.
 * @param {object} data - Job payload as fetched.
 * @param {string} suggested_next_step - Passed through to the result and the review text.
 * @returns {Promise<object>} Result with `terminal: true`, `timed_out: false`,
 *   `snapshots`, `status_text`, `data` (with `monitor_mode: "review"`) and
 *   `suggested_next_step`.
 */
export async function buildPostHocReview(job_id, data, suggested_next_step) {
    const enriched = await enrichWithTrainingLog(job_id, data);
    const orderingCurve = enriched.ordering_errors ?? [];
    const convergenceCurve = enriched.convergence_errors ?? [];
    const epochCounts = enriched.epochs ?? [];
    // First entry is the ordering epoch count; anything unusable becomes 0.
    let orderingEpochs = 0;
    if (epochCounts.length > 0) {
        orderingEpochs = Number(epochCounts[0]) || 0;
    }
    const curveSnapshots = buildCurveSnapshots(orderingCurve, convergenceCurve, orderingEpochs);
    const snapshots = curveSnapshots.length === 0
        ? [snapshotFromJob(enriched, 0, "post-hoc review (no training-log curves)")]
        : curveSnapshots;
    const status_text = await formatJobStatusText(job_id, enriched);
    const review = {
        phaseBreakdown: formatPhaseBreakdown(enriched),
        timing: formatTimingSummary(enriched),
        mode: "review",
    };
    const result = {
        job_id,
        terminal: true,
        timed_out: false,
        snapshots,
        status_text,
        data: { ...enriched, monitor_mode: "review" },
        suggested_next_step,
    };
    // Filled in last because the formatter reads the assembled result.
    result.data.status_text = formatReviewMonitorText(result, review);
    return result;
}
|
|
222
|
+
export { isTerminalStatus, hasCurveArrays };
|