@barivia/barmesh-mcp 0.5.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocking_monitor.js +34 -0
- package/dist/convergence_reading.js +25 -0
- package/dist/figure_sections.js +50 -0
- package/dist/index.js +1 -1
- package/dist/job_monitor.js +13 -2
- package/dist/results_metadata.js +78 -0
- package/dist/tools/barmesh_results_explorer.js +27 -52
- package/dist/tools/guide.js +1 -1
- package/dist/tools/jobs.js +4 -3
- package/dist/tools/results.js +3 -13
- package/dist/tools/training_monitor.js +72 -52
- package/dist/training_monitor_curve.js +30 -0
- package/dist/training_review.js +182 -0
- package/dist/views/src/views/barmesh-results-explorer/index.html +20 -19
- package/dist/views/src/views/barmesh-training-monitor/index.html +155 -0
- package/dist/viz-server.js +52 -1
- package/package.json +4 -3
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-hoc review mode for barmesh_training_monitor on already-terminal jobs.
|
|
3
|
+
* Replays QE curves from training-log and surfaces final timing/metrics.
|
|
4
|
+
*/
|
|
5
|
+
import { apiCall } from "./shared.js";
|
|
6
|
+
import { formatSnapshotLine, snapshotFromJob, } from "./job_monitor.js";
|
|
7
|
+
import { formatJobStatusText } from "./job_status_format.js";
|
|
8
|
+
/**
 * Upper bound on sampled curve points in review mode.
 * Used as the default `maxPoints` of sampleReviewIndices and as a cap on the
 * convergence sample count in buildCurveSnapshots.
 */
export const REVIEW_MAX_SNAPSHOTS = 16;
|
|
9
|
+
/**
 * Report whether a status value denotes a job that has finished.
 *
 * @param {unknown} status - Status value to classify.
 * @returns {boolean} True only for "completed", "failed" or "cancelled".
 */
function isTerminalStatus(status) {
    switch (status) {
        case "completed":
        case "failed":
        case "cancelled":
            return true;
        default:
            return false;
    }
}
|
|
12
|
+
/**
 * Check whether a payload already carries a non-empty error curve.
 *
 * @param {object} data - Payload read for `ordering_errors` and `convergence_errors`.
 * @returns {boolean} True when at least one of the two fields is a non-empty array.
 */
function hasCurveArrays(data) {
    const { ordering_errors: orderingCurve, convergence_errors: convergenceCurve } = data;
    const isNonEmptyArray = (value) => Array.isArray(value) && value.length > 0;
    return isNonEmptyArray(orderingCurve) || isNonEmptyArray(convergenceCurve);
}
|
|
17
|
+
/**
 * Fill in training-curve fields on a job payload from the training-log endpoint.
 *
 * When `data` already has a non-empty ordering or convergence curve it is
 * returned untouched and no request is made. Otherwise the training log is
 * fetched and, field by field, the log value is preferred with the value in
 * `data` as fallback.
 *
 * @param {string} job_id - Job identifier placed in the request path.
 * @param {object} data - Job payload to enrich; never mutated.
 * @returns {Promise<object>} `data` itself, or a shallow copy with merged fields.
 */
export async function enrichWithTrainingLog(job_id, data) {
    if (hasCurveArrays(data))
        return data;
    try {
        // Encode the id so characters such as "/", "?", "#" or ".." cannot
        // change which endpoint the request is routed to.
        const log = (await apiCall("GET", `/v1/results/${encodeURIComponent(job_id)}/training-log`));
        return {
            ...data,
            ordering_errors: log.ordering_errors ?? data.ordering_errors,
            convergence_errors: log.convergence_errors ?? data.convergence_errors,
            ordering_topographic_errors: log.ordering_topographic_errors ?? data.ordering_topographic_errors,
            convergence_topographic_errors: log.convergence_topographic_errors ?? data.convergence_topographic_errors,
            quantization_error: log.quantization_error ?? data.quantization_error,
            topographic_error: log.topographic_error ?? data.topographic_error,
            grid: log.grid ?? data.grid,
            epochs: log.epochs ?? data.epochs,
            training_duration_seconds: log.training_duration_seconds ?? data.training_duration_seconds,
            training_curve_source_batches: log.training_curve_source_batches ?? data.training_curve_source_batches,
        };
    }
    catch {
        // Best-effort enrichment: a failed request or an unusable response
        // (e.g. a null body) leaves the caller with the original payload.
        return data;
    }
}
|
|
40
|
+
/**
 * Evenly sample indices for a compact epoch/QE timeline in review mode.
 *
 * The first and last index are always included when two or more points are
 * requested. Duplicates produced by rounding are removed, so the result may
 * hold fewer than `maxPoints` entries.
 *
 * @param {number} length - Number of items in the series being sampled.
 * @param {number} [maxPoints=REVIEW_MAX_SNAPSHOTS] - Maximum number of indices
 *   to return; fractional values are rounded down.
 * @returns {number[]} Ascending, unique indices in `[0, length - 1]`; empty
 *   when `length <= 0` or when fewer than one point is requested.
 */
export function sampleReviewIndices(length, maxPoints = REVIEW_MAX_SNAPSHOTS) {
    if (length <= 0)
        return [];
    if (length <= maxPoints)
        return Array.from({ length }, (_, i) => i);
    // A fractional budget would make the loop overshoot the last index.
    const budget = Math.floor(maxPoints);
    if (!(budget >= 1))
        return [];
    const lastIndex = length - 1;
    // One point leaves no interval to divide (0 / 0 would give NaN); keep the
    // final index, which is where the end-of-phase value sits.
    if (budget === 1)
        return [lastIndex];
    const picks = new Set();
    for (let step = 0; step < budget; step += 1) {
        picks.add(Math.round((step * lastIndex) / (budget - 1)));
    }
    // Values are generated in non-decreasing order, so insertion order is
    // already ascending.
    return [...picks];
}
|
|
52
|
+
/**
 * Turn sampled ordering/convergence error values into monitor snapshots.
 *
 * Ordering values are sampled with the default budget of sampleReviewIndices;
 * convergence values with at most `Math.min(6, REVIEW_MAX_SNAPSHOTS)` points.
 * Entries that are null/undefined or not numeric are skipped. Every snapshot
 * is emitted with `elapsed_sec: 0`, `status: "completed"` and
 * `progress_pct: 100`; the last sampled index of each phase carries a note.
 *
 * @param {Array} ordering - Ordering-phase error values.
 * @param {Array} convergence - Convergence-phase error values.
 * @param {number} orderingEpochs - Ordering epoch count; values <= 0 mean unknown.
 * @returns {object[]} Ordering snapshots followed by convergence snapshots.
 */
export function buildCurveSnapshots(ordering, convergence, orderingEpochs) {
    const isUnusable = (raw) => raw == null || Number.isNaN(Number(raw));
    const toFourDecimals = (raw) => Math.round(Number(raw) * 10_000) / 10_000;
    const timeline = [];

    const orderingPicks = sampleReviewIndices(ordering.length);
    const finalOrderingPick = orderingPicks[orderingPicks.length - 1];
    const epochsKnown = orderingEpochs > 0;
    for (const pick of orderingPicks) {
        const rawQe = ordering[pick];
        if (isUnusable(rawQe)) {
            continue;
        }
        // Map the sample position onto the epoch scale when it is known;
        // otherwise fall back to the 1-based sample position.
        const position = pick + 1;
        let epoch = position;
        if (ordering.length > 1 && epochsKnown) {
            epoch = Math.max(1, Math.round((position / ordering.length) * orderingEpochs));
        }
        timeline.push({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "ordering",
            epoch,
            total_epochs: epochsKnown ? orderingEpochs : undefined,
            qe: toFourDecimals(rawQe),
            note: pick === finalOrderingPick ? "ordering (review)" : undefined,
        });
    }

    if (!(convergence.length > 0)) {
        return timeline;
    }

    const convergencePicks = sampleReviewIndices(convergence.length, Math.min(6, REVIEW_MAX_SNAPSHOTS));
    const finalConvergencePick = convergencePicks[convergencePicks.length - 1];
    for (const pick of convergencePicks) {
        const rawQe = convergence[pick];
        if (isUnusable(rawQe)) {
            continue;
        }
        timeline.push({
            elapsed_sec: 0,
            status: "completed",
            progress_pct: 100,
            phase: "convergence",
            qe: toFourDecimals(rawQe),
            note: pick === finalConvergencePick ? "convergence (review)" : undefined,
        });
    }
    return timeline;
}
|
|
91
|
+
/**
 * Describe the ordering/convergence phases of a run in one line.
 *
 * @param {object} data - Payload read for `epochs`, `ordering_errors` and
 *   `convergence_errors`.
 * @returns {string|null} Comma-separated summary, or null when `epochs` is
 *   missing or has fewer than two entries.
 */
export function formatPhaseBreakdown(data) {
    const { epochs } = data;
    if (!epochs || epochs.length < 2) {
        return null;
    }
    const [orderingEpochs, convergenceEpochs] = epochs;
    const sampleCount = (curve) => (Array.isArray(curve) ? curve.length : 0);
    const orderingSamples = sampleCount(data.ordering_errors);
    const convergenceSamples = sampleCount(data.convergence_errors);

    let summary = `ordering ${orderingEpochs} epoch(s)`;
    if (orderingSamples > 0) {
        summary += `, ${orderingSamples} batch samples logged`;
    }
    // The convergence section is only shown when that phase has epochs.
    if (convergenceEpochs > 0) {
        summary += `, convergence ${convergenceEpochs} epoch(s)`;
        if (convergenceSamples > 0) {
            summary += `, ${convergenceSamples} batch samples`;
        }
    }
    return summary;
}
|
|
108
|
+
/**
 * Summarise wall-clock and kernel training time as short text.
 *
 * Wall time comes from `wall_elapsed_sec`. Kernel time is the first usable
 * value among `kernel_elapsed_sec`, `training_elapsed_sec` and
 * `training_duration_seconds`. A value is usable when it is non-null and
 * converts to a finite number; all values are rounded to whole seconds.
 *
 * @param {object} data - Payload carrying the timing fields.
 * @returns {string|null} Both figures when they differ by at least 2 s, else
 *   wall time if known, else kernel time if known, else null.
 */
export function formatTimingSummary(data) {
    // Every candidate goes through the same check so a non-numeric value can
    // never surface as "NaNs" in the output; it falls through to the next one.
    const wholeSeconds = (raw) => {
        if (raw == null) {
            return null;
        }
        const seconds = Number(raw);
        return Number.isFinite(seconds) ? Math.round(seconds) : null;
    };
    const wall = wholeSeconds(data.wall_elapsed_sec);
    const kernel = wholeSeconds(data.kernel_elapsed_sec)
        ?? wholeSeconds(data.training_elapsed_sec)
        ?? wholeSeconds(data.training_duration_seconds);

    // Only report both when they meaningfully differ.
    if (wall != null && kernel != null && Math.abs(wall - kernel) >= 2) {
        return `wall ${wall}s, kernel training ${kernel}s`;
    }
    if (wall != null) {
        return `wall ${wall}s`;
    }
    if (kernel != null) {
        return `kernel training ${kernel}s`;
    }
    return null;
}
|
|
128
|
+
/**
 * Render the multi-line text report for a post-hoc review of a finished job.
 *
 * @param {object} result - Monitor result; reads `job_id`, `data`, `snapshots`,
 *   `status_text` and `suggested_next_step`.
 * @param {object} review - Pre-formatted pieces; `phaseBreakdown` and `timing`
 *   are each included only when truthy.
 * @returns {string} Report lines joined with "\n".
 */
export function formatReviewMonitorText(result, review) {
    const { data } = result;
    const report = [
        `Job ${result.job_id} monitor (post-hoc review — job already ${data.status ?? "terminal"}):`,
    ];
    if (review.phaseBreakdown) {
        report.push(`Phases: ${review.phaseBreakdown}`);
    }
    if (review.timing) {
        report.push(`Timing: ${review.timing}`);
    }

    // Final metrics line: present when at least one metric is available.
    const finalQe = data.quantization_error;
    const finalTe = data.topographic_error;
    const finals = [];
    if (finalQe != null) {
        finals.push(`QE ${Number(finalQe).toFixed(4)}`);
    }
    if (finalTe != null) {
        finals.push(`TE ${Number(finalTe).toFixed(4)}`);
    }
    if (finals.length > 0) {
        report.push(`Final: ${finals.join(", ")}`);
    }

    report.push("", "Training curve (sampled from ordering_errors / convergence_errors):");
    for (const snapshot of result.snapshots) {
        report.push(formatSnapshotLine(snapshot));
    }
    report.push(
        "",
        `Terminal: ${result.status_text}`,
        result.suggested_next_step,
        "Tip: learning_curve.png is attached when available; use barmesh_results_explorer for all figures.",
    );
    return report.join("\n");
}
|
|
153
|
+
/**
 * Assemble the monitor result for a job reviewed after it has finished.
 *
 * Enriches the payload via enrichWithTrainingLog, builds sampled curve
 * snapshots (or a single snapshot from snapshotFromJob when no curve values are
 * usable), then attaches the formatted review text as `data.status_text`.
 *
 * @param {string} job_id - Job identifier.
 * @param {object} data - Job payload as already fetched by the caller.
 * @param {string} suggested_next_step - Passed through into the result and the text.
 * @returns {Promise<object>} Result with `terminal: true`, `timed_out: false`,
 *   `snapshots`, `status_text`, `data` (tagged `monitor_mode: "review"`) and
 *   `suggested_next_step`.
 */
export async function buildPostHocReview(job_id, data, suggested_next_step) {
    const enriched = await enrichWithTrainingLog(job_id, data);

    const orderingCurve = enriched.ordering_errors ?? [];
    const convergenceCurve = enriched.convergence_errors ?? [];
    const epochPlan = enriched.epochs ?? [];
    // A missing or non-numeric first entry counts as "epoch count unknown" (0).
    const orderingEpochs = epochPlan.length > 0 ? Number(epochPlan[0]) || 0 : 0;

    const curveSnapshots = buildCurveSnapshots(orderingCurve, convergenceCurve, orderingEpochs);
    const snapshots = curveSnapshots.length > 0
        ? curveSnapshots
        : [snapshotFromJob(enriched, 0, "post-hoc review (no training-log curves)")];

    const status_text = await formatJobStatusText(job_id, enriched);
    const review = {
        phaseBreakdown: formatPhaseBreakdown(enriched),
        timing: formatTimingSummary(enriched),
        mode: "review",
    };

    const result = {
        job_id,
        terminal: true,
        timed_out: false,
        snapshots,
        status_text,
        data: { ...enriched, monitor_mode: "review" },
        suggested_next_step,
    };
    // The text is rendered from the assembled result, so it is attached last.
    result.data.status_text = formatReviewMonitorText(result, review);
    return result;
}
|
|
182
|
+
// Named exports for the two helpers declared above without an inline `export`.
export { isTerminalStatus, hasCurveArrays };
|