@barivia/barmesh-mcp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/blocking_monitor.js +34 -0
- package/dist/convergence_reading.js +25 -0
- package/dist/figure_sections.js +50 -0
- package/dist/index.js +1 -1
- package/dist/job_monitor.js +13 -2
- package/dist/results_metadata.js +78 -0
- package/dist/shared.js +18 -2
- package/dist/tools/barmesh_results_explorer.js +29 -51
- package/dist/tools/guide.js +1 -1
- package/dist/tools/jobs.js +4 -3
- package/dist/tools/results.js +46 -7
- package/dist/tools/training_monitor.js +72 -52
- package/dist/training_monitor_curve.js +30 -0
- package/dist/training_review.js +182 -0
- package/dist/views/src/views/barmesh-results-explorer/index.html +20 -19
- package/dist/views/src/views/barmesh-training-monitor/index.html +155 -0
- package/dist/viz-server.js +52 -1
- package/package.json +4 -3
|
@@ -1,56 +1,76 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
.optional()
|
|
19
|
-
.describe(`Max seconds to wait (default ${DEFAULT_BLOCK_UNTIL_SEC}; mesh jobs often need 6–10 min)`),
|
|
20
|
-
poll_interval_sec: z
|
|
21
|
-
.number()
|
|
22
|
-
.int()
|
|
23
|
-
.min(5)
|
|
24
|
-
.optional()
|
|
25
|
-
.describe(`Seconds between status polls (default ${DEFAULT_POLL_INTERVAL_SEC}; do not go below 5)`),
|
|
26
|
-
wait_finalize: z
|
|
27
|
-
.boolean()
|
|
28
|
-
.optional()
|
|
29
|
-
.describe("When true (default), wait for cfd_finalize after compute completes before returning"),
|
|
30
|
-
};
|
|
31
|
-
async function runMonitor(args) {
|
|
32
|
-
const block_until_sec = args.block_until_sec ?? DEFAULT_BLOCK_UNTIL_SEC;
|
|
33
|
-
const poll_interval_sec = args.poll_interval_sec ?? DEFAULT_POLL_INTERVAL_SEC;
|
|
34
|
-
const result = await monitorJob(args.job_id, {
|
|
35
|
-
block_until_sec,
|
|
36
|
-
poll_interval_sec,
|
|
37
|
-
wait_finalize: args.wait_finalize,
|
|
38
|
-
});
|
|
39
|
-
const text = formatMonitorText(result, { block_until_sec, poll_interval_sec });
|
|
40
|
-
return textResult({
|
|
41
|
-
...result.data,
|
|
42
|
-
monitor: {
|
|
43
|
-
job_id: result.job_id,
|
|
44
|
-
terminal: result.terminal,
|
|
45
|
-
timed_out: result.timed_out,
|
|
46
|
-
snapshots: result.snapshots,
|
|
47
|
-
status_text: result.status_text,
|
|
48
|
-
suggested_next_step: result.suggested_next_step,
|
|
49
|
-
},
|
|
50
|
-
status_text: text,
|
|
51
|
-
});
|
|
2
|
+
import { registerAppTool } from "@modelcontextprotocol/ext-apps/server";
|
|
3
|
+
import { runMcpToolAudit } from "../audit.js";
|
|
4
|
+
import { apiCall, getClientSupportsMcpApps, getVizPort, structuredTextResult, } from "../shared.js";
|
|
5
|
+
import { enrichWithTrainingLog, hasCurveArrays } from "../training_review.js";
|
|
6
|
+
/** UI resource URI attached to the barmesh_training_monitor tool registration as `_meta.ui.resourceUri`. */
export const TRAINING_MONITOR_URI = "ui://barmesh/training-monitor";
|
|
7
|
+
/** Refresh interval in milliseconds: copied into the structured content as `refresh_interval_ms` and quoted (in seconds) in the tool's text summary. */
export const TRAINING_MONITOR_REFRESH_MS = 5000;
|
|
8
|
+
/**
 * Build the structured payload returned alongside the training-monitor text.
 *
 * Every field of `data` is copied through; the identifying fields are then
 * overwritten so the job id is always a string and is available under all
 * three spellings (`jobId`, `id`, `job_id`).
 *
 * @param {string} job_id - Job id from the tool arguments; used when `data.id` is null/undefined.
 * @param {object} data - Job record to pass through.
 * @returns {object} Copy of `data` plus `type`, the id aliases and `refresh_interval_ms`.
 */
function buildStructuredContent(job_id, data) {
    const resolvedId = String(data.id ?? job_id);
    const monitorFields = {
        type: "barmesh-training-monitor",
        jobId: resolvedId,
        id: resolvedId,
        job_id: resolvedId,
        refresh_interval_ms: TRAINING_MONITOR_REFRESH_MS,
    };
    // Later spread wins, so the monitor fields override same-named keys in `data`.
    return { ...data, ...monitorFields };
}
|
|
53
19
|
/**
 * Register the `barmesh_training_monitor` MCP App tool on `server`.
 *
 * The handler fetches the job record, optionally merges training-log curves
 * (when `fetch_training_log` is set, or when the job is completed/failed and
 * the record carries no curve arrays), and returns structured content for the
 * embedded view plus a one-line text summary. When a viz port is available it
 * also appends a standalone localhost URL, and for clients without MCP Apps
 * support an additional markdown link to that URL.
 *
 * @param {object} server - MCP server instance handed to `registerAppTool`.
 */
export function registerTrainingMonitorTool(server) {
    registerAppTool(server, "barmesh_training_monitor", {
        title: "Mesh Convergence Training Monitor",
        description: "Default visual monitor for mesh_convergence jobs after submit. Embedded MCP App auto-refreshes every 5s: epoch progress bar, phase, ETA, live QE/TE curves (uniformly subsampled to ≤1000 batch samples per phase), and a per-epoch hit-grid heatmap when available. On already-completed jobs, replays training-log curves in review mode (same entry point). Also exposes a standalone localhost URL — copy it if window.open is blocked in your MCP host. Headless fallback: barmesh_jobs(action=monitor) blocks with compact text snapshots (live or post-hoc review). barmesh_jobs(action=status) remains a one-shot poll. After completion, use barmesh_results_explorer for figures.",
        inputSchema: {
            job_id: z.string().describe("Job ID from barmesh_mesh_convergence or barmesh_richardson"),
            fetch_training_log: z
                .boolean()
                .optional()
                .describe("Internal: merge completed-job training-log arrays when live progress is empty"),
        },
        _meta: { ui: { resourceUri: TRAINING_MONITOR_URI } },
    }, async (args) => runMcpToolAudit("barmesh_training_monitor", "default", args, async () => {
        const { job_id, fetch_training_log } = args;
        // job_id is caller-supplied tool input: encode it so it always stays a
        // single path segment (same treatment it gets in the URLs built below).
        let data = (await apiCall("GET", `/v1/jobs/${encodeURIComponent(job_id)}`));
        const jobStatus = String(data.status ?? "");
        const terminal = jobStatus === "completed" || jobStatus === "failed";
        if (fetch_training_log || (terminal && !hasCurveArrays(data))) {
            data = await enrichWithTrainingLog(job_id, data);
        }
        const structuredContent = buildStructuredContent(job_id, data);
        const progress = (data.progress ?? 0) * 100;
        const etaSec = data.training_eta_sec != null ? Number(data.training_eta_sec) : null;
        const elapsedSec = data.training_elapsed_sec != null ? Number(data.training_elapsed_sec) : null;
        const epoch = data.epoch != null ? Number(data.epoch) : null;
        const totalEpochs = data.total_epochs != null ? Number(data.total_epochs) : null;
        // Optional timing fragments; each is appended only when its value is present.
        const timingParts = [];
        if (elapsedSec != null && elapsedSec >= 0)
            timingParts.push(`elapsed ${Math.round(elapsedSec)}s`);
        if (etaSec != null && etaSec > 0)
            timingParts.push(`ETA ~${Math.round(etaSec)}s`);
        if (epoch != null && totalEpochs != null)
            timingParts.push(`epoch ${epoch}/${totalEpochs}`);
        const timingNote = timingParts.length > 0 ? ` ${timingParts.join(", ")}.` : "";
        const modeNote = terminal ? " (post-hoc review — training-log curves)" : "";
        const text = `Training monitor (visual MCP App, refreshes every ${TRAINING_MONITOR_REFRESH_MS / 1000}s): job ${job_id} — ${jobStatus} (${progress.toFixed(1)}%).${modeNote}${timingNote}`;
        const content = [{ type: "text", text }];
        const port = getVizPort();
        // Without a viz port there is no standalone page to link to.
        const standaloneUrl = port
            ? `http://localhost:${port}/viz/barmesh-training-monitor?mode=standalone&job_id=${encodeURIComponent(job_id)}`
            : undefined;
        if (standaloneUrl) {
            content.push({
                type: "text",
                text: `Standalone training monitor URL (copy if Open is blocked):\n${standaloneUrl}\n` +
                    `AGENT: surface this URL to the user — use a markdown link or tell them to copy/paste into a browser when the inline panel cannot open localhost. Re-post after the job completes so they can review curves.\n` +
                    `This localhost port is per MCP session and goes stale if the proxy restarts — re-run barmesh_training_monitor for a fresh URL, or set BARIVIA_VIZ_PORT for a persistent port. Health check: http://localhost:${port}/api/health?job_id=${encodeURIComponent(job_id)}`,
            });
        }
        if (!getClientSupportsMcpApps() && standaloneUrl) {
            content.push({
                type: "text",
                text: `Interactive training monitor: [Open training monitor](${standaloneUrl})`,
            });
        }
        return structuredTextResult(structuredContent, text, content);
    }));
}
|
|
56
|
-
export { runMonitor, MONITOR_DESCRIPTION, monitorSchema };
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/** Batch-sample x-axis helpers for the barmesh_training_monitor MCP App chart. */
|
|
2
|
+
/**
 * Build a 1-based x-axis of `n` consecutive sample positions.
 *
 * @param {number} n - Number of positions; anything not greater than zero yields `[]`.
 * @param {number} [offset=0] - Value added to every position.
 * @returns {number[]} `[offset + 1, offset + 2, …, offset + n]`.
 */
export function batchSampleAxis(n, offset = 0) {
    return n > 0
        ? Array.from({ length: n }, (_unused, position) => offset + position + 1)
        : [];
}
|
|
7
|
+
/**
 * Build one continuous 1-based x-axis covering both training phases: the
 * ordering samples first, then the convergence samples numbered on from
 * `nOrd`.
 *
 * @param {number} nOrd - Number of ordering-phase samples.
 * @param {number} nConv - Number of convergence-phase samples.
 * @returns {number[]} Ordering positions followed by convergence positions; `[]` when neither count is positive.
 */
export function combinedBatchAxis(nOrd, nConv) {
    const orderingAxis = nOrd > 0
        ? Array.from({ length: nOrd }, (_unused, position) => position + 1)
        : [];
    // Convergence positions are offset by the ordering count.
    const convergenceAxis = nConv > 0
        ? Array.from({ length: nConv }, (_unused, position) => nOrd + position + 1)
        : [];
    return orderingAxis.concat(convergenceAxis);
}
|
|
12
|
+
/**
 * Describe how much of each training curve is actually being displayed.
 *
 * For each phase, compares the numeric source-batch total in
 * `data.training_curve_source_batches` with the length of the corresponding
 * error array, and reports the phase only when the total exceeds what is shown.
 *
 * @param {object} data - Job/training-log record.
 * @returns {string|null} Human-readable note, or `null` when there are no source counts or nothing was subsampled.
 */
export function formatCurveSourceNote(data) {
    const sourceCounts = data.training_curve_source_batches;
    if (!sourceCounts)
        return null;
    const phases = [
        ["ordering", data.ordering_errors],
        ["convergence", data.convergence_errors],
    ];
    const notes = [];
    for (const [phase, curve] of phases) {
        const total = sourceCounts[phase];
        // Non-numeric totals are treated as "unknown" and skipped.
        if (typeof total !== "number")
            continue;
        const shown = Array.isArray(curve) ? curve.length : 0;
        if (total > shown) {
            notes.push(`${shown} of ${total.toLocaleString()} ${phase} batch samples`);
        }
    }
    return notes.length === 0
        ? null
        : `Displaying uniformly subsampled curves (≤1000 points/phase): ${notes.join("; ")}.`;
}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-hoc review mode for barmesh_training_monitor on already-terminal jobs.
|
|
3
|
+
* Replays QE curves from training-log and surfaces final timing/metrics.
|
|
4
|
+
*/
|
|
5
|
+
import { apiCall } from "./shared.js";
|
|
6
|
+
import { formatSnapshotLine, snapshotFromJob, } from "./job_monitor.js";
|
|
7
|
+
import { formatJobStatusText } from "./job_status_format.js";
|
|
8
|
+
export const REVIEW_MAX_SNAPSHOTS = 16;
|
|
9
|
+
/**
 * Whether a job status string denotes a finished job.
 *
 * @param {*} status - Job status value.
 * @returns {boolean} `true` for exactly "completed", "failed" or "cancelled".
 */
function isTerminalStatus(status) {
    switch (status) {
        case "completed":
        case "failed":
        case "cancelled":
            return true;
        default:
            return false;
    }
}
|
|
12
|
+
/**
 * Whether the record already carries at least one non-empty error curve.
 *
 * @param {object} data - Job/training-log record.
 * @returns {boolean} `true` when `ordering_errors` or `convergence_errors` is a non-empty array.
 */
function hasCurveArrays(data) {
    const { ordering_errors: orderingCurve, convergence_errors: convergenceCurve } = data;
    const isNonEmptyArray = (curve) => Array.isArray(curve) && curve.length > 0;
    return [orderingCurve, convergenceCurve].some(isNonEmptyArray);
}
|
|
17
|
+
/**
 * Merge training-log curves and summary fields into a job record.
 *
 * Returns `data` untouched when it already has curve arrays. Otherwise the
 * training log is fetched and, for each known field, the log's value is used
 * with the existing `data` value as the fallback when the log's value is
 * null/undefined. The enrichment is best-effort: any failure while fetching
 * or reading the log returns the original `data`.
 *
 * @param {string} job_id - Job whose training log should be fetched.
 * @param {object} data - Job record to enrich.
 * @returns {Promise<object>} `data` itself, or a shallow copy with the log fields merged in.
 */
export async function enrichWithTrainingLog(job_id, data) {
    if (hasCurveArrays(data))
        return data;
    const mergedFields = [
        "ordering_errors",
        "convergence_errors",
        "ordering_topographic_errors",
        "convergence_topographic_errors",
        "quantization_error",
        "topographic_error",
        "grid",
        "epochs",
        "training_duration_seconds",
        "training_curve_source_batches",
    ];
    try {
        // job_id comes from tool input: encode it so it cannot escape its path segment.
        const log = (await apiCall("GET", `/v1/results/${encodeURIComponent(job_id)}/training-log`));
        const enriched = { ...data };
        for (const field of mergedFields) {
            enriched[field] = log[field] ?? data[field];
        }
        return enriched;
    }
    catch {
        // Deliberate best-effort: a missing or unreadable training log must not
        // break the monitor, so fall back to the un-enriched record.
        return data;
    }
}
|
|
40
|
+
/** Evenly sample indices for a compact epoch/QE timeline in review mode. */
|
|
41
|
+
/**
 * Evenly sample indices for a compact epoch/QE timeline in review mode.
 *
 * @param {number} length - Number of points in the source curve.
 * @param {number} [maxPoints=REVIEW_MAX_SNAPSHOTS] - Maximum number of indices to return (fractional values are floored).
 * @returns {number[]} Ascending, de-duplicated indices in `[0, length - 1]`:
 *   every index when `length <= maxPoints`, otherwise `maxPoints` evenly
 *   spaced indices that always include the first and last point. With a
 *   budget of one point the last index is returned. `[]` when either
 *   argument is not a positive number.
 */
export function sampleReviewIndices(length, maxPoints = REVIEW_MAX_SNAPSHOTS) {
    const limit = Math.floor(maxPoints);
    // Written as positive comparisons so NaN/undefined fall into the empty case.
    if (!(length > 0) || !(limit >= 1))
        return [];
    if (length <= limit)
        return Array.from({ length }, (_, i) => i);
    // A single-point budget would divide by zero below; keep the final sample.
    if (limit === 1)
        return [length - 1];
    const step = (length - 1) / (limit - 1);
    const picked = Array.from({ length: limit }, (_, i) => Math.round(i * step));
    // Rounded positions are already non-decreasing; the Set only drops repeats.
    return [...new Set(picked)];
}
|
|
52
|
+
/**
 * Turn logged error curves into a short list of review-mode snapshots.
 *
 * Both curves are sampled with sampleReviewIndices (ordering with the default
 * budget, convergence with at most 6 points). Samples that are null/undefined
 * or not numeric are skipped. Every snapshot is marked completed at 100% with
 * `elapsed_sec: 0`; ordering snapshots also carry an epoch number, and the
 * last sampled index of each phase carries a "(review)" note.
 *
 * @param {Array} ordering - Ordering-phase error values.
 * @param {Array} convergence - Convergence-phase error values.
 * @param {number} orderingEpochs - Ordering epoch count; when positive (and the curve has more than one point) sample positions are scaled onto it.
 * @returns {object[]} Ordering snapshots followed by convergence snapshots.
 */
export function buildCurveSnapshots(ordering, convergence, orderingEpochs) {
    const isUsable = (value) => value != null && !Number.isNaN(Number(value));
    // Round to four decimal places.
    const roundQe = (value) => Math.round(Number(value) * 10_000) / 10_000;
    const scaleToEpochs = ordering.length > 1 && orderingEpochs > 0;
    const orderingPicks = sampleReviewIndices(ordering.length);
    const lastOrderingPick = orderingPicks[orderingPicks.length - 1];
    const orderingSnapshots = orderingPicks
        .filter((idx) => isUsable(ordering[idx]))
        .map((idx) => ({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "ordering",
            epoch: scaleToEpochs
                ? Math.max(1, Math.round(((idx + 1) / ordering.length) * orderingEpochs))
                : idx + 1,
            total_epochs: orderingEpochs > 0 ? orderingEpochs : undefined,
            qe: roundQe(ordering[idx]),
            note: idx === lastOrderingPick ? "ordering (review)" : undefined,
        }));
    if (!(convergence.length > 0))
        return orderingSnapshots;
    const convergencePicks = sampleReviewIndices(convergence.length, Math.min(6, REVIEW_MAX_SNAPSHOTS));
    const lastConvergencePick = convergencePicks[convergencePicks.length - 1];
    const convergenceSnapshots = convergencePicks
        .filter((idx) => isUsable(convergence[idx]))
        .map((idx) => ({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "convergence",
            qe: roundQe(convergence[idx]),
            note: idx === lastConvergencePick ? "convergence (review)" : undefined,
        }));
    return orderingSnapshots.concat(convergenceSnapshots);
}
|
|
91
|
+
/**
 * Summarise the two training phases as a comma-separated line.
 *
 * @param {object} data - Record with `epochs` (first two entries: ordering and convergence epoch counts) and optional error arrays.
 * @returns {string|null} e.g. "ordering 10 epoch(s), 3 batch samples logged, convergence 5 epoch(s), 2 batch samples";
 *   `null` when `epochs` is missing or has fewer than two entries. The
 *   convergence part appears only when its epoch count is positive.
 */
export function formatPhaseBreakdown(data) {
    const { epochs } = data;
    if (!epochs || epochs.length < 2)
        return null;
    const [orderingEpochs, convergenceEpochs] = epochs;
    const sampleCount = (curve) => (Array.isArray(curve) ? curve.length : 0);
    const orderingSamples = sampleCount(data.ordering_errors);
    const convergenceSamples = sampleCount(data.convergence_errors);
    const segments = [`ordering ${orderingEpochs} epoch(s)`];
    if (orderingSamples > 0) {
        segments.push(`${orderingSamples} batch samples logged`);
    }
    if (convergenceEpochs > 0) {
        segments.push(`convergence ${convergenceEpochs} epoch(s)`);
        if (convergenceSamples > 0) {
            segments.push(`${convergenceSamples} batch samples`);
        }
    }
    return segments.join(", ");
}
|
|
108
|
+
/**
 * Summarise wall-clock and kernel training time in whole seconds.
 *
 * Wall time comes from `wall_elapsed_sec`. Kernel time is the first usable
 * value among `kernel_elapsed_sec`, `training_elapsed_sec` and
 * `training_duration_seconds`. A value is usable when it is not
 * null/undefined and converts to a number; unusable values are skipped
 * rather than rendered as "NaN".
 *
 * @param {object} data - Job/training-log record.
 * @returns {string|null} "wall Ws, kernel training Ks" when both are known and
 *   differ by at least 2 s; otherwise "wall Ws" or "kernel training Ks";
 *   `null` when neither is known.
 */
export function formatTimingSummary(data) {
    // Shared parser so every timing field gets the same null/NaN guard.
    const toWholeSeconds = (value) => {
        if (value == null)
            return null;
        const seconds = Number(value);
        return Number.isNaN(seconds) ? null : Math.round(seconds);
    };
    const wall = toWholeSeconds(data.wall_elapsed_sec);
    const kernel = toWholeSeconds(data.kernel_elapsed_sec)
        ?? toWholeSeconds(data.training_elapsed_sec)
        ?? toWholeSeconds(data.training_duration_seconds);
    if (wall != null && kernel != null && Math.abs(wall - kernel) >= 2) {
        return `wall ${wall}s, kernel training ${kernel}s`;
    }
    if (wall != null)
        return `wall ${wall}s`;
    if (kernel != null)
        return `kernel training ${kernel}s`;
    return null;
}
|
|
128
|
+
/**
 * Render the multi-line text report for a post-hoc review of a finished job.
 *
 * Layout: header line, optional "Phases:" / "Timing:" / "Final:" lines, a
 * blank line, the sampled training-curve section (one formatSnapshotLine per
 * snapshot), a blank line, then the terminal status text, the suggested next
 * step and a fixed tip.
 *
 * @param {object} result - Monitor result; reads `job_id`, `data`, `snapshots`, `status_text` and `suggested_next_step`.
 * @param {object} review - Precomputed summaries; reads `phaseBreakdown` and `timing`, each skipped when falsy.
 * @returns {string} Report joined with "\n".
 */
export function formatReviewMonitorText(result, review) {
    const { data } = result;
    const header = `Job ${result.job_id} monitor (post-hoc review — job already ${data.status ?? "terminal"}):`;
    const summary = [];
    if (review.phaseBreakdown) {
        summary.push(`Phases: ${review.phaseBreakdown}`);
    }
    if (review.timing) {
        summary.push(`Timing: ${review.timing}`);
    }
    // Final metrics: only those present are listed, each with four decimals.
    const finalMetrics = [
        ["QE", data.quantization_error],
        ["TE", data.topographic_error],
    ]
        .filter(([, value]) => value != null)
        .map(([label, value]) => `${label} ${Number(value).toFixed(4)}`);
    if (finalMetrics.length > 0) {
        summary.push(`Final: ${finalMetrics.join(", ")}`);
    }
    const curveLines = Array.from(result.snapshots, (snapshot) => formatSnapshotLine(snapshot));
    return [
        header,
        ...summary,
        "",
        "Training curve (sampled from ordering_errors / convergence_errors):",
        ...curveLines,
        "",
        `Terminal: ${result.status_text}`,
        result.suggested_next_step,
        "Tip: learning_curve.png is attached when available; use barmesh_results_explorer for all figures.",
    ].join("\n");
}
|
|
153
|
+
/**
 * Assemble a monitor result for a job that is already finished.
 *
 * The record is first enriched with training-log curves, which are sampled
 * into snapshots. When no curve snapshot can be built, a single snapshot
 * derived from the job record is used instead. The returned result is always
 * marked terminal and not timed out, and `data.status_text` holds the full
 * review report.
 *
 * @param {string} job_id - Job being reviewed.
 * @param {object} data - Job record as fetched by the caller.
 * @param {string} suggested_next_step - Hint passed through to the result and the report.
 * @returns {Promise<object>} Result with `job_id`, `terminal`, `timed_out`, `snapshots`, `status_text`, `data` and `suggested_next_step`.
 */
export async function buildPostHocReview(job_id, data, suggested_next_step) {
    const enriched = await enrichWithTrainingLog(job_id, data);
    const epochPlan = enriched.epochs ?? [];
    // First entry is the ordering epoch count; anything non-numeric counts as 0.
    let orderingEpochs = 0;
    if (epochPlan.length > 0) {
        orderingEpochs = Number(epochPlan[0]) || 0;
    }
    const curveSnapshots = buildCurveSnapshots(enriched.ordering_errors ?? [], enriched.convergence_errors ?? [], orderingEpochs);
    const snapshots = curveSnapshots.length === 0
        ? [snapshotFromJob(enriched, 0, "post-hoc review (no training-log curves)")]
        : curveSnapshots;
    const status_text = await formatJobStatusText(job_id, enriched);
    const review = {
        phaseBreakdown: formatPhaseBreakdown(enriched),
        timing: formatTimingSummary(enriched),
        mode: "review",
    };
    const result = {
        job_id,
        terminal: true,
        timed_out: false,
        snapshots,
        status_text,
        data: { ...enriched, monitor_mode: "review" },
        suggested_next_step,
    };
    // The report is rendered from the finished result, so it is attached last.
    result.data.status_text = formatReviewMonitorText(result, review);
    return result;
}
|
|
182
|
+
export { isTerminalStatus, hasCurveArrays };
|