@vespermcp/mcp-server 1.2.12 → 1.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +232 -9
- package/package.json +1 -1
- package/scripts/preindex_registry.cjs +157 -0
package/build/index.js
CHANGED
|
@@ -1,4 +1,75 @@
|
|
|
1
|
-
|
|
1
|
+
// --- Dataset ID Normalization ---
/**
 * Convert a dataset id into a filesystem-safe registry key.
 * The "kaggle:" scheme prefix is folded into a "kaggle_" key prefix,
 * and any "/" or ":" characters become "_" so the key is safe to use
 * in file names. Store and lookup must both go through this function
 * so keys always agree.
 * @param {string} dataset_id - Raw id, e.g. "kaggle:user/ds" or "user/ds".
 * @returns {string} Normalized registry key, e.g. "kaggle_user_ds".
 */
function normalize_dataset_id(dataset_id) {
    const hadKaggleScheme = dataset_id.startsWith("kaggle:");
    // Drop the scheme, then make the remainder filesystem-safe.
    const safeKey = dataset_id.replace(/^kaggle:/, "").replace(/[/:]/g, "_");
    return hadKaggleScheme ? `kaggle_${safeKey}` : safeKey;
}
|
|
10
|
+
// --- Dataset Registry Helpers ---
// The registry is a JSON array of { dataset_id, local_path, status }
// entries persisted under the data root.
// Absolute path of the on-disk registry file.
// NOTE(review): assumes `dataRoot` (defined elsewhere in this file) points
// to an existing directory — confirm; writeRegistry fails with ENOENT otherwise.
function getRegistryPath() {
    return path.join(dataRoot, "registry.json");
}
|
|
14
|
+
/**
 * Load the registry entry list from disk.
 * Missing or unparseable registry files are treated as an empty registry
 * (best-effort: callers never see an exception from here).
 * @returns {Array<{dataset_id: string, local_path: string, status: string}>}
 */
function readRegistry() {
    const registryPath = getRegistryPath();
    if (!fs.existsSync(registryPath)) {
        return [];
    }
    let entries;
    try {
        entries = JSON.parse(fs.readFileSync(registryPath, "utf-8"));
    } catch {
        // Corrupt/partial file: fall back to an empty registry.
        entries = [];
    }
    return entries;
}
|
|
26
|
+
/**
 * Persist the full registry entry list as pretty-printed JSON.
 * Fix: ensure the parent directory exists first, so a fresh install
 * (no data root created yet) does not fail with ENOENT.
 * @param {Array<{dataset_id: string, local_path: string, status: string}>} entries
 */
function writeRegistry(entries) {
    const registryPath = getRegistryPath();
    // Recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(registryPath), { recursive: true });
    fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
}
|
|
30
|
+
/**
 * Insert or replace the registry entry for a dataset.
 * The id is normalized before use so writes and lookups share one key space.
 * @param {string} dataset_id - Raw dataset id (scheme-prefixed or bare).
 * @param {string} local_path - Local path of the prepared data.
 * @param {string} status - Entry status, e.g. "completed".
 */
function upsertRegistry(dataset_id, local_path, status) {
    const norm_id = normalize_dataset_id(dataset_id);
    console.error(`[Registry] Writing key: ${norm_id}`);
    const record = { dataset_id: norm_id, local_path, status };
    const entries = readRegistry();
    const existing = entries.findIndex(entry => entry.dataset_id === norm_id);
    if (existing === -1) {
        entries.push(record);
    } else {
        entries[existing] = record;
    }
    writeRegistry(entries);
}
|
|
43
|
+
/**
 * Look up a registry entry by dataset id (normalized before comparison).
 * @param {string} dataset_id - Raw dataset id.
 * @returns {{dataset_id: string, local_path: string, status: string}|undefined}
 */
function getRegistryEntry(dataset_id) {
    const norm_id = normalize_dataset_id(dataset_id);
    console.error(`[Registry] Lookup key: ${norm_id}`);
    const entries = readRegistry();
    return entries.find(entry => entry.dataset_id === norm_id);
}
|
|
48
|
+
// --- Pipeline State Tracker ---
// Tracks completed steps per dataset, in-memory only: state resets when
// the server process restarts.
const pipelineState = {};
// Key under which a dataset's completed steps are stored. Kept as a
// function so the keying scheme (e.g. adding a session/job scope) can
// change in one place.
function getPipelineKey(datasetId) {
    return datasetId;
}
/**
 * Record that `step` has completed for `datasetId`.
 * @param {string} datasetId - Dataset identifier.
 * @param {string} step - Pipeline step name, e.g. "search", "download".
 */
export function markStepComplete(datasetId, step) {
    const key = getPipelineKey(datasetId);
    if (!pipelineState[key])
        pipelineState[key] = new Set();
    pipelineState[key].add(step);
}
/**
 * Whether `step` has completed for `datasetId`.
 * Fix: always return a boolean — the previous optional-chained lookup
 * yielded `undefined` for datasets with no recorded steps.
 * @returns {boolean}
 */
export function hasStep(datasetId, step) {
    const key = getPipelineKey(datasetId);
    return pipelineState[key]?.has(step) ?? false;
}
|
|
64
|
+
// --- Dataset ID Auto-Detection ---
/**
 * Normalize a user-supplied dataset identifier:
 * - ids that already carry a known scheme (kaggle:, hf:, huggingface:,
 *   openml:, dataworld:, http:, https:) are returned trimmed, as-is;
 * - bare "owner/name" ids (contain "/" but no ":") are assumed to be
 *   Kaggle slugs and get a "kaggle:" prefix;
 * - anything else is returned trimmed.
 *
 * Fix: the previous pattern put the colon both inside each alternative
 * ("kaggle:", "hf:", ...) and again after the group, so scheme ids only
 * matched with a doubled colon (e.g. "kaggle::"). Outputs were saved by
 * the fall-through branches, but the scheme check was effectively dead.
 * The colon now appears exactly once, after the scheme-name group.
 * @param {string} id - Raw user input.
 * @returns {string} Canonical dataset id.
 */
export function parseDatasetId(id) {
    const trimmed = id.trim();
    if (/^(kaggle|hf|huggingface|openml|dataworld|http|https):/i.test(trimmed))
        return trimmed;
    if (trimmed.includes("/") && !trimmed.includes(":"))
        return `kaggle:${trimmed}`;
    return trimmed;
}
|
|
2
73
|
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
3
74
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
4
75
|
import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } from "@modelcontextprotocol/sdk/types.js";
|
|
@@ -443,6 +514,13 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
443
514
|
preview: true,
|
|
444
515
|
});
|
|
445
516
|
rawFilePath = fusionResult.output_path;
|
|
517
|
+
try {
|
|
518
|
+
// Register fused output for this top dataset so export can find it
|
|
519
|
+
upsertRegistry(topDataset.id, rawFilePath, "completed");
|
|
520
|
+
}
|
|
521
|
+
catch (e) {
|
|
522
|
+
console.error(`[Registry] Failed to write registry for fused output ${topDataset.id}: ${e?.message || e}`);
|
|
523
|
+
}
|
|
446
524
|
currentRows = await countRows(rawFilePath);
|
|
447
525
|
}
|
|
448
526
|
if (currentRows < requestedRows) {
|
|
@@ -462,6 +540,13 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
462
540
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
463
541
|
const installPath = await installService.install(topDataset.id, rawFilePath);
|
|
464
542
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
543
|
+
// Register prepared dataset in local registry for lookup by export/list tools
|
|
544
|
+
try {
|
|
545
|
+
upsertRegistry(topDataset.id, installPath, "completed");
|
|
546
|
+
}
|
|
547
|
+
catch (e) {
|
|
548
|
+
console.error(`[Registry] Failed to write registry for ${topDataset.id}: ${e?.message || e}`);
|
|
549
|
+
}
|
|
465
550
|
return installPath;
|
|
466
551
|
}
|
|
467
552
|
/**
|
|
@@ -766,6 +851,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
766
851
|
required: ["dataset_id"],
|
|
767
852
|
},
|
|
768
853
|
},
|
|
854
|
+
{
|
|
855
|
+
name: "vesper_list_datasets",
|
|
856
|
+
description: "List local prepared datasets from the Vesper registry (dataset_id and local_path).",
|
|
857
|
+
inputSchema: {
|
|
858
|
+
type: "object",
|
|
859
|
+
properties: {},
|
|
860
|
+
},
|
|
861
|
+
},
|
|
769
862
|
{
|
|
770
863
|
name: "fuse_datasets",
|
|
771
864
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -875,6 +968,65 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
875
968
|
});
|
|
876
969
|
// Call Tool
|
|
877
970
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
971
|
+
// --- Pipeline Enforcement ---
|
|
972
|
+
// Map tool names to pipeline steps
|
|
973
|
+
const toolToStep = {
|
|
974
|
+
vesper_search: "search",
|
|
975
|
+
vesper_download: "download",
|
|
976
|
+
vesper_analyze: "analyze",
|
|
977
|
+
vesper_clean: "clean",
|
|
978
|
+
vesper_split: "split",
|
|
979
|
+
vesper_export: "export",
|
|
980
|
+
prepare_dataset: "prepare",
|
|
981
|
+
};
|
|
982
|
+
// Extract dataset_id if present and normalize
|
|
983
|
+
let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
|
|
984
|
+
if (datasetId)
|
|
985
|
+
datasetId = parseDatasetId(String(datasetId));
|
|
986
|
+
// Pipeline rules
|
|
987
|
+
const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
|
|
988
|
+
const prereqs = {
|
|
989
|
+
vesper_download: ["search"],
|
|
990
|
+
vesper_analyze: ["download"],
|
|
991
|
+
vesper_clean: ["analyze"],
|
|
992
|
+
vesper_split: ["clean"],
|
|
993
|
+
vesper_export: ["split"],
|
|
994
|
+
};
|
|
995
|
+
const tool = String(request.params.name);
|
|
996
|
+
const step = toolToStep[tool];
|
|
997
|
+
if (step && datasetId) {
|
|
998
|
+
// Check prerequisites
|
|
999
|
+
const required = prereqs[tool] || [];
|
|
1000
|
+
for (const req of required) {
|
|
1001
|
+
if (!hasStep(String(datasetId), req)) {
|
|
1002
|
+
// Auto-run missing step if possible, else error
|
|
1003
|
+
// For export, auto-run prepare_dataset if split missing
|
|
1004
|
+
if (tool === "vesper_export" && req === "split") {
|
|
1005
|
+
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1006
|
+
try {
|
|
1007
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
|
|
1008
|
+
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1009
|
+
markStepComplete(String(datasetId), "split");
|
|
1010
|
+
}
|
|
1011
|
+
catch (e) {
|
|
1012
|
+
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1013
|
+
return {
|
|
1014
|
+
content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
|
|
1015
|
+
isError: true,
|
|
1016
|
+
};
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
else {
|
|
1020
|
+
return {
|
|
1021
|
+
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1022
|
+
isError: true,
|
|
1023
|
+
};
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
}
|
|
1027
|
+
// Mark this step as complete
|
|
1028
|
+
markStepComplete(String(datasetId), String(step));
|
|
1029
|
+
}
|
|
878
1030
|
switch (request.params.name) {
|
|
879
1031
|
case "vesper_search": {
|
|
880
1032
|
const query = String(request.params.arguments?.query);
|
|
@@ -983,6 +1135,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
983
1135
|
}
|
|
984
1136
|
try {
|
|
985
1137
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
1138
|
+
try {
|
|
1139
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
1140
|
+
}
|
|
1141
|
+
catch (e) {
|
|
1142
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
1143
|
+
}
|
|
986
1144
|
return {
|
|
987
1145
|
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
988
1146
|
};
|
|
@@ -1302,15 +1460,53 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1302
1460
|
if (!dataset) {
|
|
1303
1461
|
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
1304
1462
|
}
|
|
1305
|
-
// Use Metadata to find the actual local file
|
|
1463
|
+
// Use Metadata or Registry to find the actual local file
|
|
1464
|
+
let sourcePath = undefined;
|
|
1306
1465
|
const downloadStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1307
|
-
if (
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1466
|
+
if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
|
|
1467
|
+
sourcePath = downloadStatus.local_path;
|
|
1468
|
+
}
|
|
1469
|
+
else {
|
|
1470
|
+
// Fallback to local registry
|
|
1471
|
+
const reg = getRegistryEntry(datasetId);
|
|
1472
|
+
if (reg && fs.existsSync(reg.local_path)) {
|
|
1473
|
+
sourcePath = reg.local_path;
|
|
1474
|
+
}
|
|
1475
|
+
}
|
|
1476
|
+
if (!sourcePath) {
|
|
1477
|
+
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
1478
|
+
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
1479
|
+
try {
|
|
1480
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
|
|
1481
|
+
}
|
|
1482
|
+
catch (e) {
|
|
1483
|
+
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
1484
|
+
}
|
|
1485
|
+
// Poll for download status until local_path appears or timeout
|
|
1486
|
+
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
1487
|
+
const maxWait = 60_000; // 60s
|
|
1488
|
+
const interval = 2000;
|
|
1489
|
+
let waited = 0;
|
|
1490
|
+
while (waited < maxWait) {
|
|
1491
|
+
const ds = metadataStore.getDownloadStatus(datasetId);
|
|
1492
|
+
if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
|
|
1493
|
+
sourcePath = ds.local_path;
|
|
1494
|
+
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
1495
|
+
break;
|
|
1496
|
+
}
|
|
1497
|
+
await wait(interval);
|
|
1498
|
+
waited += interval;
|
|
1499
|
+
}
|
|
1500
|
+
// If still no sourcePath, return helpful error listing prepared datasets
|
|
1501
|
+
if (!sourcePath) {
|
|
1502
|
+
const entries = readRegistry();
|
|
1503
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id}: ${e.local_path}`).join("\n");
|
|
1504
|
+
return {
|
|
1505
|
+
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
1506
|
+
isError: true
|
|
1507
|
+
};
|
|
1508
|
+
}
|
|
1312
1509
|
}
|
|
1313
|
-
let sourcePath = downloadStatus.local_path;
|
|
1314
1510
|
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
1315
1511
|
if (!fastMode) {
|
|
1316
1512
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
@@ -1321,6 +1517,13 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1321
1517
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
1322
1518
|
if (pipelineResult.final_output_path) {
|
|
1323
1519
|
sourcePath = pipelineResult.final_output_path;
|
|
1520
|
+
try {
|
|
1521
|
+
// Update registry to point to pipeline's final output
|
|
1522
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
1523
|
+
}
|
|
1524
|
+
catch (e) {
|
|
1525
|
+
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
1526
|
+
}
|
|
1324
1527
|
}
|
|
1325
1528
|
}
|
|
1326
1529
|
catch (err) {
|
|
@@ -1439,6 +1642,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1439
1642
|
});
|
|
1440
1643
|
const nullDelta = result.stats.null_delta;
|
|
1441
1644
|
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1645
|
+
// Register fused dataset under a generated id so users can export it easily
|
|
1646
|
+
const fusedId = `fused_${Date.now()}`;
|
|
1647
|
+
try {
|
|
1648
|
+
upsertRegistry(fusedId, result.output_path, "completed");
|
|
1649
|
+
}
|
|
1650
|
+
catch (e) {
|
|
1651
|
+
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
1652
|
+
}
|
|
1442
1653
|
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1443
1654
|
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1444
1655
|
msg += `- Null change: ${nullText}\n`;
|
|
@@ -1452,7 +1663,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1452
1663
|
}
|
|
1453
1664
|
msg += "\n";
|
|
1454
1665
|
}
|
|
1455
|
-
msg += `\nNext: run split_dataset/export_dataset on fused output
|
|
1666
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
1456
1667
|
return { content: [{ type: "text", text: msg }] };
|
|
1457
1668
|
}
|
|
1458
1669
|
catch (error) {
|
|
@@ -1865,9 +2076,21 @@ async function runDownloadCli(args) {
|
|
|
1865
2076
|
localPath = dl.local_path;
|
|
1866
2077
|
const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
|
|
1867
2078
|
metadataStore.registerDownload(normalized, localPath, "completed", size);
|
|
2079
|
+
try {
|
|
2080
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
2081
|
+
}
|
|
2082
|
+
catch (e) {
|
|
2083
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2084
|
+
}
|
|
1868
2085
|
}
|
|
1869
2086
|
else {
|
|
1870
2087
|
localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
|
|
2088
|
+
try {
|
|
2089
|
+
upsertRegistry(datasetId, localPath, "completed");
|
|
2090
|
+
}
|
|
2091
|
+
catch (e) {
|
|
2092
|
+
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2093
|
+
}
|
|
1871
2094
|
}
|
|
1872
2095
|
}
|
|
1873
2096
|
catch (error) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.12",
|
|
3
|
+
"version": "1.2.13",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
// Pre-index local dataset files into a Vesper registry JSON file.
// Scans directories for tabular data files, records basic metadata
// (size, mtime, optional row/column counts), and can pad the registry
// with synthesized placeholder entries up to a --target count.
const fs = require('fs');
const path = require('path');
const os = require('os');

const { argv, cwd } = process;

// Print CLI usage help.
function usage() {
  console.log(`Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]

Options:
--scan Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)
--target Target total registry entries (if larger than scanned, will synthesize entries)
--out Output registry path (default: ~/.vesper/registry.json)
--no-count Skip expensive row counting for CSV/JSONL
`);
}

// Option defaults.
let scanDirs = []; // directories to scan (--scan)
let target = 0; // desired total entry count (--target); 0 = no padding
let outPath = path.join(os.homedir(), '.vesper', 'registry.json'); // --out
let doCount = true; // row/column counting on unless --no-count

// Minimal hand-rolled argument parser (argv[0]=node, argv[1]=script).
for (let i = 2; i < argv.length; i++) {
  const a = argv[i];
  if (a === '--scan') {
    // Consume every following token up to the next --flag as a directory.
    i++;
    while (i < argv.length && !argv[i].startsWith('--')) {
      scanDirs.push(argv[i]);
      i++;
    }
    i--; // step back so the for-loop's i++ lands on the next flag
  } else if (a === '--target') {
    // Falls back to 0 (no padding) when the value is missing/non-numeric.
    target = parseInt(argv[++i], 10) || 0;
  } else if (a === '--out') {
    outPath = path.resolve(argv[++i]);
  } else if (a === '--no-count') {
    doCount = false;
  } else if (a === '--help' || a === '-h') {
    usage();
    process.exit(0);
  } else {
    // Unknown flag: show usage and exit non-zero.
    console.error('Unknown arg', a);
    usage();
    process.exit(2);
  }
}

// Default scan roots, relative to the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
|
|
49
|
+
|
|
50
|
+
/**
 * Derive a lowercase, underscore-delimited id from an arbitrary string:
 * runs of non-alphanumeric characters collapse to "_", and leading or
 * trailing underscores are stripped.
 * @param {string} s
 * @returns {string}
 */
function normalizeId(s) {
  const collapsed = s.replace(/[^a-z0-9]+/gi, '_');
  const trimmed = collapsed.replace(/^_+|_+$/g, '');
  return trimmed.toLowerCase();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Recursively collect files under `dir` whose extension is in `exts`.
 * Unreadable directories are skipped silently (best-effort scan).
 * @param {string} dir - Directory to scan.
 * @param {string[]} [exts] - Lowercase extensions to keep (with dot).
 * @returns {string[]} Full paths of matching files.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
  const found = [];
  try {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        found.push(...walk(full, exts));
      } else if (entry.isFile() && exts.includes(path.extname(entry.name).toLowerCase())) {
        found.push(full);
      }
    }
  } catch (e) {
    // Unreadable directory: return whatever was collected so far.
  }
  return found;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Count newline characters in a file by streaming it in chunks.
 * Note this counts '\n' occurrences: the header line is included and a
 * final line without a trailing newline is not counted.
 * @param {string} filePath
 * @returns {Promise<number>} Resolves with the newline count; rejects on read error.
 */
function countCsvRows(filePath) {
  return new Promise((resolve, reject) => {
    let newlines = 0;
    fs.createReadStream(filePath, { encoding: 'utf8' })
      .on('data', (chunk) => {
        // Scan each chunk with indexOf instead of a per-char loop.
        let at = chunk.indexOf('\n');
        while (at !== -1) {
          newlines++;
          at = chunk.indexOf('\n', at + 1);
        }
      })
      .on('end', () => resolve(newlines))
      .on('error', reject);
  });
}
|
|
83
|
+
|
|
84
|
+
// Main: build/refresh the registry file, merging with any existing one.
(async function main() {
  // Ensure the registry's parent directory exists before writing.
  const registryDir = path.dirname(outPath);
  if (!fs.existsSync(registryDir)) fs.mkdirSync(registryDir, { recursive: true });

  // Load any existing registry so re-runs merge rather than clobber;
  // a corrupt file is treated as empty.
  let existing = [];
  if (fs.existsSync(outPath)) {
    try { existing = JSON.parse(fs.readFileSync(outPath, 'utf8')); } catch (e) { existing = []; }
  }
  // Keyed by normalized id; freshly scanned entries overwrite stale ones.
  const map = new Map();
  for (const e of existing) map.set(e.normalized_id || e.id, e);

  let scanned = 0;
  for (const dir of scanDirs) {
    const abs = path.resolve(dir);
    const files = walk(abs);
    for (const f of files) {
      const stats = fs.statSync(f);
      const base = path.basename(f, path.extname(f));
      // Id derives from the cwd-relative path (falls back to basename).
      const rel = path.relative(process.cwd(), f);
      const id = normalizeId(rel || base);
      let cols = null;
      let rows = null;
      if (doCount && (f.endsWith('.csv') || f.endsWith('.jsonl') || f.endsWith('.json'))) {
        try {
          if (f.endsWith('.csv')) {
            // NOTE(review): reads the whole file just to take the first line;
            // cols is a naive comma split (no quoted-field handling).
            const header = fs.readFileSync(f, { encoding: 'utf8', flag: 'r' }).split(/\r?\n/, 1)[0] || '';
            cols = header ? header.split(',').length : 0;
            // rows counts newline characters, so the header is included and
            // a final unterminated line is missed.
            rows = await countCsvRows(f);
          } else if (f.endsWith('.jsonl')) {
            rows = await countCsvRows(f);
          }
        } catch (e) {
          // ignore — counting is best-effort; the entry is still written
        }
      }
      const entry = {
        id: id,
        normalized_id: id,
        source: 'scanned',
        path: f,
        size: stats.size,
        mtime: stats.mtime.toISOString(),
        meta: { rows, cols }
      };
      map.set(id, entry);
      scanned++;
    }
  }

  // Synthesize if target requested
  if (target > map.size) {
    const synthCount = target - map.size;
    const synthDir = path.join(path.dirname(outPath), 'local_library');
    if (!fs.existsSync(synthDir)) fs.mkdirSync(synthDir, { recursive: true });
    for (let i = 1; i <= synthCount; i++) {
      // NOTE(review): map.size grows each iteration (map.set below), so idx
      // advances by 2 per pass and synth ids are non-contiguous — ids stay
      // unique and the total still reaches `target`, but confirm whether
      // sequential numbering was intended.
      const idx = map.size + i;
      const id = `synth_${String(idx).padStart(6, '0')}`;
      const entry = {
        id,
        normalized_id: id,
        source: 'synthesized',
        // Placeholder CSV path is recorded but never created on disk.
        path: path.join(synthDir, `${id}.csv`),
        size: 0,
        mtime: new Date().toISOString(),
        // Random row/col counts mark these as fake metadata.
        meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
      };
      map.set(id, entry);
    }
  }

  const outArr = Array.from(map.values());
  fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
  console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${Math.max(0, outArr.length - scanned)} synthesized)`);
})();
|