@barivia/barmesh-mcp 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -5
- package/dist/cfd_prepare.js +5 -3
- package/dist/shared.js +40 -9
- package/dist/tools/cfd.js +4 -0
- package/dist/tools/datasets.js +191 -28
- package/dist/upload_hints.js +1 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -13,10 +13,14 @@ form on a shared self-organizing map (SOM)**:
|
|
|
13
13
|
- **`barmesh_mesh_convergence`** — trains one SOM on all meshes (joint-normalized), projects
|
|
14
14
|
each mesh to a volume-weighted fingerprint, and computes **symmetric KL** and
|
|
15
15
|
**Wasserstein-1 (EMD)** distances stepwise and against a reference mesh, with publication
|
|
16
|
-
figures and an advisory convergence reading.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
16
|
+
figures and an advisory convergence reading. The SOM features are preprocessed by the
|
|
17
|
+
same staged pipeline as barsom training, so any dataset (small or large, CSV/gzip)
|
|
18
|
+
is handled out-of-core by default; optional `transforms`, `normalize`,
|
|
19
|
+
`normalization_methods`, and `row_range` give the same preprocessing controls. Submitting a job
|
|
20
|
+
enqueues **`prepare_training_matrix`** on worker-io when the dataset is staged; the proxy
|
|
21
|
+
auto-polls `prepare_job_id` before the mesh job runs. With the default **`defer_figures=true`**, figures are rendered by
|
|
22
|
+
**`cfd_finalize`** on worker-io; **`barmesh_jobs(status)`** auto-polls **`finalize_job_id`**
|
|
23
|
+
when figures are deferred.
|
|
20
24
|
- **`barmesh_richardson`** — classical three-level Richardson extrapolation / Grid
|
|
21
25
|
Convergence Index (GCI) on scalar quantities of interest.
|
|
22
26
|
|
|
@@ -48,7 +52,7 @@ API key; otherwise the analysis calls return HTTP 403. Contact Barivia to enable
|
|
|
48
52
|
|------|---------|
|
|
49
53
|
| `barmesh_guide_workflow` | Workflow + tool map (tier-scoped). Call first. |
|
|
50
54
|
| `barmesh_prepare_mesh_data` | Recipe for the combined per-cell CSV. |
|
|
51
|
-
| `barmesh_datasets` | Upload / preview / list the mesh CSV. |
|
|
55
|
+
| `barmesh_datasets` | Upload / preview / list / get / subset / delete the mesh CSV. |
|
|
52
56
|
| `barmesh_mesh_convergence` | SOM fingerprint distances (async job). |
|
|
53
57
|
| `barmesh_richardson` | Richardson/GCI on scalar QoIs (async job). |
|
|
54
58
|
| `barmesh_jobs` | Poll job status / list jobs (auto-polls CFD prepare + finalize when applicable). |
|
|
@@ -65,6 +69,12 @@ One combined CSV: one row per cell, a mesh-label column (`mesh_id`), the physica
|
|
|
65
69
|
you choose as `feature_columns` (e.g. `p`, `U_mag`, `k`, `log_epsilon`, `T`), and a
|
|
66
70
|
cell-volume column (`V`). Use `barmesh_prepare_mesh_data` for the full recipe.
|
|
67
71
|
|
|
72
|
+
**Upload formats:** `.csv`, `.tsv`, `.csv.gz`, or `.tsv.gz`. For large per-cell tables
|
|
73
|
+
(≥64 MB), prefer `.csv.gz` — uploads stream directly to object storage with presigned PUT.
|
|
74
|
+
Use `barmesh_datasets(action=get, dataset_id=...)` to check staging status after upload;
|
|
75
|
+
use `barmesh_datasets(action=subset, sample_n=...)` to downsample huge tables server-side.
|
|
76
|
+
Parquet staging is supported by the API but not yet exposed as an MCP upload format.
|
|
77
|
+
|
|
68
78
|
## Environment variables
|
|
69
79
|
|
|
70
80
|
| Variable | Default | Purpose |
|
package/dist/cfd_prepare.js
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { pollUntilComplete } from "./shared.js";
|
|
2
2
|
/**
|
|
3
|
-
*
|
|
3
|
+
* Mesh-convergence on a staged dataset enqueues a preprocessing job
|
|
4
|
+
* (prepare_training_matrix) first; the submit response carries its id as
|
|
5
|
+
* `prepare_job_id`. Poll it to completion before the mesh job runs.
|
|
4
6
|
*/
|
|
5
7
|
export async function pollCfdPrepareIfPresent(data, label, timeoutMs = 600_000) {
|
|
6
8
|
const prepareJobId = data.prepare_job_id;
|
|
@@ -8,10 +10,10 @@ export async function pollCfdPrepareIfPresent(data, label, timeoutMs = 600_000)
|
|
|
8
10
|
return null;
|
|
9
11
|
const poll = await pollUntilComplete(prepareJobId, timeoutMs);
|
|
10
12
|
if (poll.status === "failed") {
|
|
11
|
-
throw new Error(`${label}:
|
|
13
|
+
throw new Error(`${label}: data preparation job ${prepareJobId} failed: ${poll.error ?? "unknown error"}`);
|
|
12
14
|
}
|
|
13
15
|
if (poll.status !== "completed") {
|
|
14
|
-
throw new Error(`${label}:
|
|
16
|
+
throw new Error(`${label}: data preparation job ${prepareJobId} did not complete (status=${poll.status})`);
|
|
15
17
|
}
|
|
16
18
|
return prepareJobId;
|
|
17
19
|
}
|
package/dist/shared.js
CHANGED
|
@@ -4,10 +4,12 @@
|
|
|
4
4
|
* remains a thin HTTPS client to the same Barivia API (no domain logic here).
|
|
5
5
|
*/
|
|
6
6
|
import fs from "node:fs/promises";
|
|
7
|
-
import { createReadStream } from "node:fs";
|
|
7
|
+
import { createReadStream, createWriteStream } from "node:fs";
|
|
8
8
|
import { createGzip } from "node:zlib";
|
|
9
|
-
import { createHash } from "node:crypto";
|
|
9
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
10
10
|
import { Readable } from "node:stream";
|
|
11
|
+
import { pipeline } from "node:stream/promises";
|
|
12
|
+
import os from "node:os";
|
|
11
13
|
import path from "node:path";
|
|
12
14
|
import { fileURLToPath } from "node:url";
|
|
13
15
|
import { logInfo } from "./logger.js";
|
|
@@ -20,7 +22,7 @@ export const FETCH_TIMEOUT_MS = parseInt(process.env.BARIVIA_FETCH_TIMEOUT_MS ??
|
|
|
20
22
|
export const MAX_RETRIES = 2;
|
|
21
23
|
export const RETRYABLE_STATUS = new Set([502, 503, 504]);
|
|
22
24
|
/** Single source of truth for the proxy version. Keep in sync with package.json on bump. */
|
|
23
|
-
export const CLIENT_VERSION = "0.
|
|
25
|
+
export const CLIENT_VERSION = "0.4.1";
|
|
24
26
|
export const PUBLIC_SITE_ORIGIN = "https://barivia.se";
|
|
25
27
|
/** Large per-cell CSV uploads may exceed the default fetch timeout. */
|
|
26
28
|
export const UPLOAD_DATASET_TIMEOUT_MS = 180_000;
|
|
@@ -40,27 +42,56 @@ export async function streamFileSha256(srcPath) {
|
|
|
40
42
|
s.on("error", reject);
|
|
41
43
|
});
|
|
42
44
|
}
|
|
43
|
-
/**
|
|
44
|
-
|
|
45
|
-
const
|
|
46
|
-
|
|
45
|
+
/** Turn a raw S3/R2 presigned-PUT error into a clean, actionable message. */
|
|
46
|
+
function presignedPutError(status, bodyText) {
|
|
47
|
+
const snippet = bodyText.slice(0, 200);
|
|
48
|
+
if (status === 411 || /MissingContentLength/i.test(bodyText)) {
|
|
49
|
+
return new Error("Upload rejected by storage: the Content-Length header was missing. " +
|
|
50
|
+
"This is a client bug — please update @barivia/barmesh-mcp to the latest version.");
|
|
51
|
+
}
|
|
52
|
+
if (status === 413 || /EntityTooLarge|entity is too large/i.test(bodyText)) {
|
|
53
|
+
return new Error("Upload rejected by storage: file exceeds the maximum upload size.");
|
|
54
|
+
}
|
|
55
|
+
if (status === 403) {
|
|
56
|
+
return new Error("Upload rejected by storage: the presigned URL expired or was invalid; retry barmesh_datasets(action=upload).");
|
|
57
|
+
}
|
|
58
|
+
return new Error(`Presigned upload failed: HTTP ${status} ${snippet}`);
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Stream a local file directly to a presigned PUT URL (e.g. R2), gzip-compressing
|
|
62
|
+
* on the way unless the source is already gzipped. Never materializes the payload
|
|
63
|
+
* in process memory: when compression is needed we gzip to a temp file first, then
|
|
64
|
+
* PUT that file with an explicit Content-Length.
|
|
65
|
+
*/
|
|
66
|
+
export async function putPresignedStream(url, srcPath, contentType, timeoutMs = PRESIGNED_PUT_TIMEOUT_MS, alreadyGzipped = false) {
|
|
67
|
+
let putPath = srcPath;
|
|
68
|
+
let tempPath;
|
|
69
|
+
if (!alreadyGzipped) {
|
|
70
|
+
tempPath = path.join(os.tmpdir(), `barmesh-upload-${randomUUID()}.csv.gz`);
|
|
71
|
+
await pipeline(createReadStream(srcPath), createGzip(), createWriteStream(tempPath));
|
|
72
|
+
putPath = tempPath;
|
|
73
|
+
}
|
|
47
74
|
const controller = new AbortController();
|
|
48
75
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
49
76
|
try {
|
|
77
|
+
const contentLength = (await fs.stat(putPath)).size;
|
|
78
|
+
const webStream = Readable.toWeb(createReadStream(putPath));
|
|
50
79
|
const resp = await fetch(url, {
|
|
51
80
|
method: "PUT",
|
|
52
81
|
body: webStream,
|
|
53
|
-
headers: { "Content-Type": contentType },
|
|
82
|
+
headers: { "Content-Type": contentType, "Content-Length": String(contentLength) },
|
|
54
83
|
duplex: "half",
|
|
55
84
|
signal: controller.signal,
|
|
56
85
|
});
|
|
57
86
|
if (!resp.ok) {
|
|
58
87
|
const t = await resp.text().catch(() => "");
|
|
59
|
-
throw
|
|
88
|
+
throw presignedPutError(resp.status, t);
|
|
60
89
|
}
|
|
61
90
|
}
|
|
62
91
|
finally {
|
|
63
92
|
clearTimeout(timer);
|
|
93
|
+
if (tempPath)
|
|
94
|
+
await fs.rm(tempPath, { force: true }).catch(() => { });
|
|
64
95
|
}
|
|
65
96
|
}
|
|
66
97
|
// ---------------------------------------------------------------------------
|
package/dist/tools/cfd.js
CHANGED
|
@@ -27,6 +27,10 @@ COMMON MISTAKES: omitting feature_columns (required); choosing a reference_mesh
|
|
|
27
27
|
emd_method: z.enum(["exact", "sinkhorn"]).optional().describe("EMD solver: exact LP (default) or sinkhorn (fast approximation for large grids)"),
|
|
28
28
|
component_planes_physical: z.boolean().optional().describe("Physical-scale component-plane colorbars (default true)"),
|
|
29
29
|
figures: z.boolean().optional().describe("Generate publication figures (default true)"),
|
|
30
|
+
transforms: z.record(z.enum(["log", "log1p", "log10", "sqrt", "square", "abs", "invert", "none"])).optional().describe("Per-feature transform applied before normalization (e.g. log1p to compress k/epsilon/omega). Same preprocessing engine as barsom training."),
|
|
31
|
+
normalize: z.union([z.enum(["all", "auto", "mad", "sigmoidal", "sepd"]), z.array(z.string())]).optional().describe("Normalization mode for SOM features (default auto = joint z-score of non-cyclic features). Use mad for heavy-tailed channels."),
|
|
32
|
+
normalization_methods: z.record(z.enum(["zscore", "mad", "sigmoidal", "sepd", "none"])).optional().describe("Per-feature normalization override; keys must be in feature_columns."),
|
|
33
|
+
row_range: z.tuple([z.number().int().min(1), z.number().int().min(1)]).optional().describe("1-based inclusive [start, end] row slice applied during preprocessing (and to mesh labels / cell volumes)."),
|
|
30
34
|
label: z.string().optional().describe("Optional job label"),
|
|
31
35
|
}, async (args) => {
|
|
32
36
|
const { dataset_id, label, ...rest } = args;
|
package/dist/tools/datasets.js
CHANGED
|
@@ -4,54 +4,140 @@ import { z } from "zod";
|
|
|
4
4
|
import fs from "node:fs/promises";
|
|
5
5
|
import path from "node:path";
|
|
6
6
|
import { registerAuditedTool } from "../audit.js";
|
|
7
|
-
import { apiCall, resolveFilePathForUpload, textResult, pollUntilComplete, UPLOAD_DATASET_TIMEOUT_MS, LARGE_UPLOAD_BYTES, PRESIGNED_PUT_TIMEOUT_MS, POLL_STAGE_MAX_MS, streamFileSha256, putPresignedStream, } from "../shared.js";
|
|
7
|
+
import { apiCall, getWorkspaceRootAsync, resolveFilePathForUpload, textResult, pollUntilComplete, UPLOAD_DATASET_TIMEOUT_MS, LARGE_UPLOAD_BYTES, PRESIGNED_PUT_TIMEOUT_MS, POLL_STAGE_MAX_MS, streamFileSha256, putPresignedStream, } from "../shared.js";
|
|
8
|
+
import { GZIP_UPLOAD_HINT } from "../upload_hints.js";
|
|
9
|
+
/**
|
|
10
|
+
* Normalize a nullable string field from the API. Returns "" for absent values,
|
|
11
|
+
* empty strings, and the literal SQL/serialization sentinels "missing"/"null".
|
|
12
|
+
*/
|
|
13
|
+
function cleanNullable(v) {
|
|
14
|
+
if (v == null)
|
|
15
|
+
return "";
|
|
16
|
+
const s = String(v).trim();
|
|
17
|
+
return s === "" || s.toLowerCase() === "missing" || s.toLowerCase() === "null" ? "" : s;
|
|
18
|
+
}
|
|
8
19
|
export function registerDatasetsTool(server) {
|
|
9
|
-
registerAuditedTool(server, "barmesh_datasets", `Upload, preview, or
|
|
20
|
+
registerAuditedTool(server, "barmesh_datasets", `Upload, preview, list, get, subset, or delete the combined per-cell mesh CSV used for convergence analysis.
|
|
21
|
+
|
|
22
|
+
Formats: plain CSV/TSV or gzip (.csv.gz / .tsv.gz). For files above ~64 MB, prefer .csv.gz (often 2–3× smaller); large uploads use presigned direct-to-storage PUT and accept gzip bodies.
|
|
10
23
|
|
|
11
24
|
| Action | Use when |
|
|
12
25
|
|--------|----------|
|
|
13
26
|
| upload | You have prepared a combined per-cell CSV (mesh_id + feature columns + cell volume V). Do this first. |
|
|
14
27
|
| preview | After upload — verify the mesh column, feature columns, and volume column are present and numeric. |
|
|
28
|
+
| get | Fetch one dataset by id — status, staging fields, ingest_error (use after upload or when staging is slow). |
|
|
15
29
|
| list | Find dataset IDs for analysis. |
|
|
30
|
+
| subset | Shrink a huge per-cell table server-side (row_range, filters, or sample_n). |
|
|
31
|
+
| delete | Remove a dataset permanently. |
|
|
32
|
+
|
|
33
|
+
action=upload: PREFER file_path — server reads from workspace root (token-efficient). Accepts .csv, .tsv, .csv.gz, .tsv.gz. Use csv_data only for small inline pastes (<10KB). If plain CSV exceeds the 5 GB upload cap, gzip it first (.csv.gz).
|
|
16
34
|
|
|
17
35
|
BEST FOR: One combined CSV holding all meshes of a refinement study (one row per cell, a mesh label column, the physical channels, and a cell-volume column).
|
|
18
36
|
NOT FOR: Raw OpenFOAM case directories — extract a per-cell CSV first (see barmesh_prepare_mesh_data).
|
|
19
37
|
COMMON MISTAKES: omitting the cell-volume column (defaults to equal weights, which weakens the fingerprint); inconsistent feature columns across meshes.
|
|
20
38
|
ESCALATION: If preview shows a feature column as non-numeric, fix the extraction and re-upload.`, {
|
|
21
|
-
action: z
|
|
22
|
-
|
|
23
|
-
|
|
39
|
+
action: z
|
|
40
|
+
.enum(["upload", "preview", "list", "get", "subset", "delete"])
|
|
41
|
+
.describe("upload: add CSV or .csv.gz; preview: inspect columns; list: see all datasets; get: fetch one dataset metadata (status/staging); subset: create filtered subset; delete: remove dataset"),
|
|
42
|
+
name: z.string().optional().describe("Dataset name (required for upload and subset)"),
|
|
43
|
+
file_path: z
|
|
44
|
+
.string()
|
|
45
|
+
.optional()
|
|
46
|
+
.describe("Path to local CSV or .csv.gz (PREFERRED): absolute path, file:// URI, or path relative to the workspace root. NOTE: relative paths resolve against the MCP workspace root — in Cursor/IDE clients that root is often the MCP install dir, not your project, so set BARIVIA_WORKSPACE_ROOT in the MCP config env (or pass an absolute path) if a relative path is 'not accessible'. Use .csv.gz for large mesh tables."),
|
|
24
47
|
csv_data: z.string().optional().describe("Inline CSV string for small pastes only (<10KB). Prefer file_path."),
|
|
25
|
-
dataset_id: z.string().optional().describe("Dataset ID (required for preview)"),
|
|
48
|
+
dataset_id: z.string().optional().describe("Dataset ID (required for preview, get, subset, and delete)"),
|
|
26
49
|
n_rows: z.number().int().optional().default(5).describe("Sample rows to return (preview only)"),
|
|
27
|
-
|
|
28
|
-
|
|
50
|
+
row_range: z
|
|
51
|
+
.tuple([z.number().int(), z.number().int()])
|
|
52
|
+
.optional()
|
|
53
|
+
.describe("For subset: [start, end] 1-based inclusive row range (e.g. [1, 2000])"),
|
|
54
|
+
filters: z.preprocess((v) => {
|
|
55
|
+
if (v === undefined || v === null)
|
|
56
|
+
return v;
|
|
57
|
+
if (Array.isArray(v))
|
|
58
|
+
return v;
|
|
59
|
+
if (typeof v === "object" && v !== null && "column" in v)
|
|
60
|
+
return [v];
|
|
61
|
+
return v;
|
|
62
|
+
}, z
|
|
63
|
+
.array(z.object({
|
|
64
|
+
column: z.string(),
|
|
65
|
+
op: z.enum(["eq", "ne", "in", "gt", "lt", "gte", "lte", "between"]),
|
|
66
|
+
value: z.union([z.string(), z.number(), z.array(z.union([z.string(), z.number()]))]),
|
|
67
|
+
}))
|
|
68
|
+
.optional()
|
|
69
|
+
.describe("For subset: filter conditions (AND logic). Single object or array.")),
|
|
70
|
+
filter: z
|
|
71
|
+
.object({
|
|
72
|
+
column: z.string(),
|
|
73
|
+
op: z.enum(["eq", "ne", "in", "gt", "lt", "gte", "lte", "between"]),
|
|
74
|
+
value: z.union([z.string(), z.number(), z.array(z.union([z.string(), z.number()]))]),
|
|
75
|
+
})
|
|
76
|
+
.optional()
|
|
77
|
+
.describe("Deprecated — use filters instead. Single filter condition."),
|
|
78
|
+
sample_n: z
|
|
79
|
+
.number()
|
|
80
|
+
.int()
|
|
81
|
+
.min(1)
|
|
82
|
+
.optional()
|
|
83
|
+
.describe("action=subset: keep a random N-row sample (seeded, row order preserved). Use to shrink a huge table server-side."),
|
|
84
|
+
sample_seed: z
|
|
85
|
+
.number()
|
|
86
|
+
.int()
|
|
87
|
+
.optional()
|
|
88
|
+
.describe("action=subset: RNG seed for sample_n (default 42)."),
|
|
89
|
+
}, async ({ action, name, file_path, csv_data, dataset_id, n_rows, row_range, filters, filter, sample_n, sample_seed, }) => {
|
|
29
90
|
if (action === "upload") {
|
|
30
91
|
if (!name)
|
|
31
92
|
throw new Error("barmesh_datasets(upload) requires name.");
|
|
32
93
|
let body;
|
|
33
94
|
if (file_path && file_path.length > 0) {
|
|
34
|
-
// Preflight: warm plan/limits and reject over-limit uploads before reading the file.
|
|
35
95
|
await apiCall("GET", "/v1/system/info");
|
|
36
96
|
const resolved = await resolveFilePathForUpload(file_path, server);
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
-
|
|
97
|
+
const lower = resolved.toLowerCase();
|
|
98
|
+
const isGzipInput = lower.endsWith(".gz");
|
|
99
|
+
const baseExt = path.extname(isGzipInput ? lower.slice(0, -3) : lower);
|
|
100
|
+
if (baseExt !== ".csv" && baseExt !== ".tsv") {
|
|
101
|
+
throw new Error("Only .csv, .tsv, .csv.gz, or .tsv.gz files can be uploaded as datasets.");
|
|
40
102
|
}
|
|
41
103
|
const HARD_MAX_BYTES = 5 * 1024 * 1024 * 1024; // 5 GB
|
|
42
|
-
|
|
104
|
+
let stat;
|
|
105
|
+
try {
|
|
106
|
+
stat = await fs.stat(resolved);
|
|
107
|
+
}
|
|
108
|
+
catch {
|
|
109
|
+
throw new Error(`File not accessible at resolved path. Easiest fix: pass an ABSOLUTE path ` +
|
|
110
|
+
`(e.g. "/home/you/project/data.csv" or "C:\\\\Users\\\\you\\\\data.csv") or a file:// URI. ` +
|
|
111
|
+
`Relative paths resolve against the MCP workspace root (current: ${await getWorkspaceRootAsync(server)}); ` +
|
|
112
|
+
`set BARIVIA_WORKSPACE_ROOT in your MCP config env to your project directory to use them.`);
|
|
113
|
+
}
|
|
43
114
|
if (stat.size > HARD_MAX_BYTES) {
|
|
44
|
-
|
|
115
|
+
const gzipHint = isGzipInput ? "" : ` ${GZIP_UPLOAD_HINT}`;
|
|
116
|
+
throw new Error(`File too large (${(stat.size / 1024 / 1024 / 1024).toFixed(2)} GB). Maximum upload size is 5 GB.${gzipHint}`);
|
|
45
117
|
}
|
|
46
118
|
if (stat.size >= LARGE_UPLOAD_BYTES) {
|
|
47
119
|
const idem = await streamFileSha256(resolved);
|
|
48
|
-
|
|
120
|
+
let init;
|
|
121
|
+
try {
|
|
122
|
+
init = (await apiCall("POST", "/v1/datasets/upload-url", { name, size_bytes: stat.size }, { "Idempotency-Key": idem }));
|
|
123
|
+
}
|
|
124
|
+
catch (e) {
|
|
125
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
126
|
+
if (msg.includes("dataset_too_large") && !isGzipInput) {
|
|
127
|
+
throw new Error(`${msg} ${GZIP_UPLOAD_HINT}`);
|
|
128
|
+
}
|
|
129
|
+
throw e;
|
|
130
|
+
}
|
|
49
131
|
const datasetId = (init.dataset_id ?? init.id);
|
|
50
|
-
if (init.idempotent_replay) {
|
|
51
|
-
return textResult({
|
|
52
|
-
|
|
132
|
+
if (init.idempotent_replay && !init.upload_url) {
|
|
133
|
+
return textResult({
|
|
134
|
+
id: datasetId,
|
|
135
|
+
status: init.status,
|
|
136
|
+
idempotent_replay: true,
|
|
137
|
+
suggested_next_step: `barmesh_datasets(action=preview, dataset_id=${datasetId})`,
|
|
138
|
+
});
|
|
53
139
|
}
|
|
54
|
-
await putPresignedStream(init.upload_url, resolved, init.content_type ?? "application/octet-stream", PRESIGNED_PUT_TIMEOUT_MS);
|
|
140
|
+
await putPresignedStream(init.upload_url, resolved, init.content_type ?? "application/octet-stream", PRESIGNED_PUT_TIMEOUT_MS, isGzipInput);
|
|
55
141
|
const fin = (await apiCall("POST", `/v1/datasets/${datasetId}/finalize`, {}));
|
|
56
142
|
const jobId = (fin.id ?? fin.job_id);
|
|
57
143
|
const poll = await pollUntilComplete(jobId, POLL_STAGE_MAX_MS);
|
|
@@ -59,10 +145,28 @@ ESCALATION: If preview shows a feature column as non-numeric, fix the extraction
|
|
|
59
145
|
return textResult({ id: datasetId, status: "failed", error: poll.error ?? "staging failed" });
|
|
60
146
|
}
|
|
61
147
|
const ready = poll.status === "completed";
|
|
62
|
-
return textResult({
|
|
148
|
+
return textResult({
|
|
149
|
+
id: datasetId,
|
|
150
|
+
status: ready ? "ready" : "staging",
|
|
151
|
+
job_id: jobId,
|
|
63
152
|
suggested_next_step: ready
|
|
64
|
-
? `barmesh_datasets(action=preview, dataset_id=${datasetId})
|
|
65
|
-
: `Still staging; poll barmesh_jobs(action=status, job_id="${jobId}")
|
|
153
|
+
? `barmesh_datasets(action=preview, dataset_id=${datasetId}) to verify mesh, feature, and volume columns.`
|
|
154
|
+
: `Still staging; poll barmesh_jobs(action=status, job_id="${jobId}") then barmesh_datasets(action=preview, dataset_id=${datasetId}).`,
|
|
155
|
+
});
|
|
156
|
+
}
|
|
157
|
+
if (isGzipInput) {
|
|
158
|
+
const gzBytes = await fs.readFile(resolved);
|
|
159
|
+
const data = (await apiCall("POST", "/v1/datasets", gzBytes, {
|
|
160
|
+
"X-Dataset-Name": name,
|
|
161
|
+
"Content-Type": "text/csv",
|
|
162
|
+
"Content-Encoding": "gzip",
|
|
163
|
+
"Idempotency-Key": createHash("sha256").update(`${name}\n`).update(gzBytes).digest("hex"),
|
|
164
|
+
}, UPLOAD_DATASET_TIMEOUT_MS));
|
|
165
|
+
const gid = data.id ?? data.dataset_id;
|
|
166
|
+
if (gid != null) {
|
|
167
|
+
data.suggested_next_step = `Next: barmesh_datasets(action=preview, dataset_id=${gid}) to verify the mesh, feature, and volume columns.`;
|
|
168
|
+
}
|
|
169
|
+
return textResult(data);
|
|
66
170
|
}
|
|
67
171
|
body = await fs.readFile(resolved, "utf-8");
|
|
68
172
|
}
|
|
@@ -76,8 +180,6 @@ ESCALATION: If preview shows a feature column as non-numeric, fix the extraction
|
|
|
76
180
|
const uploadHeaders = {
|
|
77
181
|
"X-Dataset-Name": name,
|
|
78
182
|
"Content-Type": "text/csv",
|
|
79
|
-
// Deterministic key so a timed-out retry of the SAME upload reconciles to
|
|
80
|
-
// the original dataset server-side instead of creating a duplicate.
|
|
81
183
|
"Idempotency-Key": createHash("sha256").update(`${name}\n`).update(body).digest("hex"),
|
|
82
184
|
};
|
|
83
185
|
let uploadBody = body;
|
|
@@ -87,8 +189,9 @@ ESCALATION: If preview shows a feature column as non-numeric, fix the extraction
|
|
|
87
189
|
}
|
|
88
190
|
const data = (await apiCall("POST", "/v1/datasets", uploadBody, uploadHeaders, UPLOAD_DATASET_TIMEOUT_MS));
|
|
89
191
|
const id = data.id ?? data.dataset_id;
|
|
90
|
-
if (id != null)
|
|
192
|
+
if (id != null) {
|
|
91
193
|
data.suggested_next_step = `Next: barmesh_datasets(action=preview, dataset_id=${id}) to verify the mesh, feature, and volume columns.`;
|
|
194
|
+
}
|
|
92
195
|
return textResult(data);
|
|
93
196
|
}
|
|
94
197
|
if (action === "preview") {
|
|
@@ -97,8 +200,68 @@ ESCALATION: If preview shows a feature column as non-numeric, fix the extraction
|
|
|
97
200
|
const data = await apiCall("GET", `/v1/datasets/${dataset_id}/preview?n_rows=${n_rows ?? 5}`);
|
|
98
201
|
return textResult(data);
|
|
99
202
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
203
|
+
if (action === "subset") {
|
|
204
|
+
if (!dataset_id)
|
|
205
|
+
throw new Error("barmesh_datasets(subset) requires dataset_id.");
|
|
206
|
+
if (!name)
|
|
207
|
+
throw new Error("barmesh_datasets(subset) requires name.");
|
|
208
|
+
const allFilters = filters ?? (filter ? [filter] : undefined);
|
|
209
|
+
if (row_range === undefined && allFilters === undefined && sample_n === undefined) {
|
|
210
|
+
throw new Error("barmesh_datasets(subset) requires at least one of row_range, filters, or sample_n.");
|
|
211
|
+
}
|
|
212
|
+
const body = { name };
|
|
213
|
+
if (row_range !== undefined)
|
|
214
|
+
body.row_range = row_range;
|
|
215
|
+
if (allFilters !== undefined)
|
|
216
|
+
body.filters = allFilters;
|
|
217
|
+
if (sample_n !== undefined)
|
|
218
|
+
body.sample_n = sample_n;
|
|
219
|
+
if (sample_seed !== undefined)
|
|
220
|
+
body.sample_seed = sample_seed;
|
|
221
|
+
const data = await apiCall("POST", `/v1/datasets/${dataset_id}/subset`, body);
|
|
222
|
+
return textResult(data);
|
|
223
|
+
}
|
|
224
|
+
if (action === "list") {
|
|
225
|
+
const data = (await apiCall("GET", "/v1/datasets"));
|
|
226
|
+
if (Array.isArray(data)) {
|
|
227
|
+
const lines = data.map((ds) => {
|
|
228
|
+
const id = String(ds.id ?? "");
|
|
229
|
+
const dsName = String(ds.name ?? "");
|
|
230
|
+
const rows = ds.rows != null ? Number(ds.rows) : "?";
|
|
231
|
+
const cols = ds.cols != null ? Number(ds.cols) : "?";
|
|
232
|
+
const st = ds.status != null ? String(ds.status) : "ready";
|
|
233
|
+
const statusBit = st !== "ready" ? ` | status=${st}` : "";
|
|
234
|
+
const ingestErr = cleanNullable(ds.ingest_error);
|
|
235
|
+
const err = ingestErr ? ` | ingest_error=${ingestErr}` : "";
|
|
236
|
+
return `${dsName} (${id}) — ${rows}×${cols}${statusBit}${err}`;
|
|
237
|
+
});
|
|
238
|
+
return { content: [{ type: "text", text: lines.length > 0 ? lines.join("\n") : "No datasets." }] };
|
|
239
|
+
}
|
|
240
|
+
return textResult(data);
|
|
241
|
+
}
|
|
242
|
+
if (action === "get") {
|
|
243
|
+
if (!dataset_id)
|
|
244
|
+
throw new Error("barmesh_datasets(get) requires dataset_id.");
|
|
245
|
+
const ds = (await apiCall("GET", `/v1/datasets/${dataset_id}`));
|
|
246
|
+
const lines = [
|
|
247
|
+
`Dataset: ${ds.name ?? "?"} (${ds.id ?? dataset_id})`,
|
|
248
|
+
`Status: ${ds.status ?? "ready"}`,
|
|
249
|
+
`Rows × cols: ${ds.rows ?? "?"} × ${ds.cols ?? "?"}`,
|
|
250
|
+
ds.size_bytes != null ? `Size: ${Number(ds.size_bytes).toLocaleString()} bytes` : "",
|
|
251
|
+
ds.staged_prefix != null ? `Staged prefix: ${String(ds.staged_prefix)}` : "",
|
|
252
|
+
ds.staged_version != null ? `Staged version: ${String(ds.staged_version)}` : "",
|
|
253
|
+
ds.stage_job_id != null ? `Stage job: ${String(ds.stage_job_id)} (poll barmesh_jobs(action=status))` : "",
|
|
254
|
+
cleanNullable(ds.ingest_error) ? `Ingest error: ${cleanNullable(ds.ingest_error)}` : "",
|
|
255
|
+
ds.created_at != null ? `Created: ${String(ds.created_at)}` : "",
|
|
256
|
+
].filter(Boolean);
|
|
257
|
+
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
258
|
+
}
|
|
259
|
+
if (action === "delete") {
|
|
260
|
+
if (!dataset_id)
|
|
261
|
+
throw new Error("barmesh_datasets(delete) requires dataset_id.");
|
|
262
|
+
const data = await apiCall("DELETE", `/v1/datasets/${dataset_id}`);
|
|
263
|
+
return textResult(data);
|
|
264
|
+
}
|
|
265
|
+
throw new Error("Invalid action");
|
|
103
266
|
});
|
|
104
267
|
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const GZIP_UPLOAD_HINT = "Try gzip: save as .csv.gz (often 2–3× smaller). The upload limit applies to the compressed file size on presigned/large uploads.";
|