vesper-wizard 2.1.5 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +2 -1
- package/build/index.js +138 -17
- package/build/install/install-service.js +5 -1
- package/build/metadata/scraper.js +26 -7
- package/build/search/engine.js +9 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +464 -0
- package/build/utils/python-runtime.js +130 -0
- package/package.json +1 -1
- package/scripts/postinstall.cjs +74 -32
package/build/export/exporter.js
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
3
|
import fs from "fs";
|
|
4
|
+
import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
|
|
4
5
|
export class DataExporter {
|
|
5
|
-
|
|
6
|
+
buildDir;
|
|
6
7
|
scriptPath;
|
|
7
8
|
constructor(buildDir = process.cwd()) {
|
|
9
|
+
this.buildDir = buildDir;
|
|
8
10
|
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
9
11
|
const dataRoot = path.join(homeDir, ".vesper");
|
|
10
12
|
const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
|
|
@@ -26,27 +28,38 @@ export class DataExporter {
|
|
|
26
28
|
else {
|
|
27
29
|
this.scriptPath = scriptPath0;
|
|
28
30
|
}
|
|
29
|
-
// Detect Python command
|
|
30
|
-
if (process.platform === "win32") {
|
|
31
|
-
this.pythonPath = "py";
|
|
32
|
-
}
|
|
33
31
|
}
|
|
34
32
|
/**
|
|
35
33
|
* Exports a dataset file to a specified format
|
|
36
34
|
*/
|
|
37
35
|
async export(inputFile, outputFile, format, options = {}) {
|
|
36
|
+
const pythonRequirements = [
|
|
37
|
+
{ module: "polars", packageName: "polars" },
|
|
38
|
+
];
|
|
39
|
+
if (format === "feather") {
|
|
40
|
+
pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
|
|
41
|
+
}
|
|
42
|
+
if (format === "tfrecord") {
|
|
43
|
+
pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
|
|
44
|
+
}
|
|
45
|
+
const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
|
|
38
46
|
return new Promise((resolve, reject) => {
|
|
39
47
|
if (!fs.existsSync(inputFile)) {
|
|
40
48
|
reject(new Error(`Input file not found: ${inputFile}`));
|
|
41
49
|
return;
|
|
42
50
|
}
|
|
43
51
|
const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
|
|
44
|
-
const
|
|
52
|
+
const childProcess = spawn(pythonPath, args, {
|
|
53
|
+
env: {
|
|
54
|
+
...process.env,
|
|
55
|
+
PYTHONIOENCODING: "utf-8",
|
|
56
|
+
},
|
|
57
|
+
});
|
|
45
58
|
let stdout = "";
|
|
46
59
|
let stderr = "";
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
60
|
+
childProcess.stdout.on("data", (data) => stdout += data.toString());
|
|
61
|
+
childProcess.stderr.on("data", (data) => stderr += data.toString());
|
|
62
|
+
childProcess.on("close", (code) => {
|
|
50
63
|
if (code !== 0) {
|
|
51
64
|
reject(new Error(`Export failed: ${stderr || stdout}`));
|
|
52
65
|
return;
|
package/build/gateway/unified-dataset-gateway.js
CHANGED
|
@@ -3,6 +3,7 @@ import path from "path";
|
|
|
3
3
|
import http from "http";
|
|
4
4
|
import https from "https";
|
|
5
5
|
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
6
|
+
import { analyzeDatasetQuery } from "../search/query-intent.js";
|
|
6
7
|
export class UnifiedDatasetGateway {
|
|
7
8
|
deps;
|
|
8
9
|
constructor(deps) {
|
|
@@ -236,7 +237,7 @@ export class UnifiedDatasetGateway {
|
|
|
236
237
|
async discoverFromSource(source, query, limit) {
|
|
237
238
|
switch (source) {
|
|
238
239
|
case "huggingface":
|
|
239
|
-
return await new HuggingFaceScraper().scrape(limit, true, query);
|
|
240
|
+
return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
|
|
240
241
|
case "openml":
|
|
241
242
|
return await this.deps.openmlSource.discover(query, limit);
|
|
242
243
|
case "kaggle":
|
package/build/index.js
CHANGED
|
@@ -361,6 +361,21 @@ function extractRequestedRows(query, requirements) {
|
|
|
361
361
|
if (Number.isFinite(n) && n > 0)
|
|
362
362
|
return n;
|
|
363
363
|
}
|
|
364
|
+
const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
|
|
365
|
+
.map(m => Number(m[0].replace(/,/g, "")))
|
|
366
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
367
|
+
if (commaNumbers.length > 0)
|
|
368
|
+
return Math.max(...commaNumbers);
|
|
369
|
+
const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
|
|
370
|
+
.map(m => {
|
|
371
|
+
const base = Number(m[1]);
|
|
372
|
+
const suffix = m[2].toLowerCase();
|
|
373
|
+
const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
|
|
374
|
+
return Math.round(base * multiplier);
|
|
375
|
+
})
|
|
376
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
377
|
+
if (humanSized.length > 0)
|
|
378
|
+
return Math.max(...humanSized);
|
|
364
379
|
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
365
380
|
.map(m => Number(m[0]))
|
|
366
381
|
.filter(n => Number.isFinite(n) && n > 0);
|
|
@@ -644,7 +659,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
644
659
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
645
660
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
646
661
|
switch (job.type) {
|
|
647
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
662
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
|
|
648
663
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
649
664
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
650
665
|
}
|
|
@@ -662,7 +677,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
662
677
|
/**
|
|
663
678
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
664
679
|
*/
|
|
665
|
-
async function handlePrepareJob(jobId, query, requirements) {
|
|
680
|
+
async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
666
681
|
hydrateExternalKeys();
|
|
667
682
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
668
683
|
const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
|
|
@@ -689,6 +704,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
689
704
|
// Continue anyway - direct file downloads may still work without datasets lib
|
|
690
705
|
}
|
|
691
706
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
707
|
+
const searchQuery = requirements ? `${query} ${requirements}` : query;
|
|
692
708
|
let selectedDataset;
|
|
693
709
|
let datasetIdForDownload = "";
|
|
694
710
|
let source;
|
|
@@ -729,7 +745,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
729
745
|
else {
|
|
730
746
|
markPipelineStep("search", "running");
|
|
731
747
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
732
|
-
const results = await searchEngine.search(
|
|
748
|
+
const results = await searchEngine.search(searchQuery, { limit: 10 });
|
|
733
749
|
if (results.length === 0) {
|
|
734
750
|
markPipelineStep("search", "failed");
|
|
735
751
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
@@ -777,7 +793,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
777
793
|
let currentRows = await countRows(rawFilePath);
|
|
778
794
|
if (currentRows < requestedRows) {
|
|
779
795
|
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
780
|
-
const additional = await searchEngine.search(
|
|
796
|
+
const additional = await searchEngine.search(searchQuery, { limit: 8 });
|
|
781
797
|
const sourceFiles = [rawFilePath];
|
|
782
798
|
let totalRows = currentRows;
|
|
783
799
|
for (const ds of additional) {
|
|
@@ -880,9 +896,52 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
880
896
|
quality_score: qualityScore
|
|
881
897
|
});
|
|
882
898
|
}
|
|
899
|
+
else {
|
|
900
|
+
// Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
|
|
901
|
+
try {
|
|
902
|
+
const existingMeta = metadataStore.getDataset(datasetIdForDownload);
|
|
903
|
+
if (!existingMeta) {
|
|
904
|
+
metadataStore.saveDataset({
|
|
905
|
+
id: datasetIdForDownload,
|
|
906
|
+
source: source,
|
|
907
|
+
name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
|
|
908
|
+
description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
|
|
909
|
+
quality_warnings: [],
|
|
910
|
+
downloads: 0,
|
|
911
|
+
likes: 0,
|
|
912
|
+
stars: 0,
|
|
913
|
+
tags: [],
|
|
914
|
+
last_updated: new Date().toISOString(),
|
|
915
|
+
task: "unknown",
|
|
916
|
+
domain: "unknown",
|
|
917
|
+
languages: [],
|
|
918
|
+
splits: [],
|
|
919
|
+
license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
|
|
920
|
+
quality_score: qualityScore,
|
|
921
|
+
download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
|
|
922
|
+
total_examples: 0,
|
|
923
|
+
is_structured: false,
|
|
924
|
+
has_target_column: false,
|
|
925
|
+
is_safe_source: true,
|
|
926
|
+
has_personal_data: false,
|
|
927
|
+
is_paywalled: false,
|
|
928
|
+
is_scraped_web_data: false,
|
|
929
|
+
uses_https: true,
|
|
930
|
+
has_train_split: false,
|
|
931
|
+
has_test_split: false,
|
|
932
|
+
has_validation_split: false,
|
|
933
|
+
description_length: 0,
|
|
934
|
+
has_readme: false,
|
|
935
|
+
});
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
catch (e) {
|
|
939
|
+
console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
940
|
+
}
|
|
941
|
+
}
|
|
883
942
|
markPipelineStep("register", "running");
|
|
884
943
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
885
|
-
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
944
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
886
945
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
887
946
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
888
947
|
try {
|
|
@@ -1013,7 +1072,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1013
1072
|
},
|
|
1014
1073
|
target_dir: {
|
|
1015
1074
|
type: "string",
|
|
1016
|
-
description: "Optional output directory for operation='download'.",
|
|
1075
|
+
description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
|
|
1076
|
+
},
|
|
1077
|
+
output_dir: {
|
|
1078
|
+
type: "string",
|
|
1079
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1017
1080
|
},
|
|
1018
1081
|
public_only: {
|
|
1019
1082
|
type: "boolean",
|
|
@@ -1052,7 +1115,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1052
1115
|
},
|
|
1053
1116
|
{
|
|
1054
1117
|
name: "download_dataset",
|
|
1055
|
-
description: "Download a dataset by source and ID/slug into local
|
|
1118
|
+
description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
|
|
1056
1119
|
inputSchema: {
|
|
1057
1120
|
type: "object",
|
|
1058
1121
|
properties: {
|
|
@@ -1067,7 +1130,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1067
1130
|
},
|
|
1068
1131
|
target_dir: {
|
|
1069
1132
|
type: "string",
|
|
1070
|
-
description: "Optional target directory for downloaded files.",
|
|
1133
|
+
description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
|
|
1134
|
+
},
|
|
1135
|
+
output_dir: {
|
|
1136
|
+
type: "string",
|
|
1137
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1071
1138
|
}
|
|
1072
1139
|
},
|
|
1073
1140
|
required: ["dataset_id"],
|
|
@@ -1194,6 +1261,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1194
1261
|
properties: {
|
|
1195
1262
|
query: { type: "string" },
|
|
1196
1263
|
requirements: { type: "string" },
|
|
1264
|
+
target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
|
|
1265
|
+
output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
|
|
1197
1266
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
1198
1267
|
cleaning_options: { type: "object" },
|
|
1199
1268
|
split_config: { type: "object" },
|
|
@@ -1238,7 +1307,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1238
1307
|
},
|
|
1239
1308
|
target_dir: {
|
|
1240
1309
|
type: "string",
|
|
1241
|
-
description: "Optional custom local directory for export
|
|
1310
|
+
description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
|
|
1311
|
+
},
|
|
1312
|
+
output_dir: {
|
|
1313
|
+
type: "string",
|
|
1314
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1242
1315
|
},
|
|
1243
1316
|
format: {
|
|
1244
1317
|
type: "string",
|
|
@@ -1425,7 +1498,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1425
1498
|
if (tool === "vesper_export" && req === "split") {
|
|
1426
1499
|
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1427
1500
|
try {
|
|
1428
|
-
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
|
|
1501
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1429
1502
|
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1430
1503
|
markStepComplete(String(datasetId), "split");
|
|
1431
1504
|
}
|
|
@@ -1481,6 +1554,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1481
1554
|
if (!datasetId) {
|
|
1482
1555
|
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
1483
1556
|
}
|
|
1557
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1558
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1559
|
+
: request.params.arguments?.output_dir
|
|
1560
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1561
|
+
: "";
|
|
1562
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1484
1563
|
try {
|
|
1485
1564
|
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
1486
1565
|
}
|
|
@@ -1490,7 +1569,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1490
1569
|
const result = await unifiedDatasetGateway.download({
|
|
1491
1570
|
datasetId,
|
|
1492
1571
|
source,
|
|
1493
|
-
targetDir
|
|
1572
|
+
targetDir,
|
|
1494
1573
|
});
|
|
1495
1574
|
try {
|
|
1496
1575
|
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
@@ -1597,7 +1676,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1597
1676
|
hydrateExternalKeys();
|
|
1598
1677
|
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1599
1678
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1600
|
-
const
|
|
1679
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1680
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1681
|
+
: request.params.arguments?.output_dir
|
|
1682
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1683
|
+
: "";
|
|
1684
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1601
1685
|
if (!datasetId) {
|
|
1602
1686
|
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1603
1687
|
}
|
|
@@ -1804,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1804
1888
|
}
|
|
1805
1889
|
const dataset = metadataStore.getDataset(datasetId);
|
|
1806
1890
|
if (!dataset) {
|
|
1891
|
+
// Fallback: check the registry for local path info
|
|
1892
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
1893
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
1894
|
+
if (regEntry) {
|
|
1895
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
1896
|
+
return {
|
|
1897
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
1898
|
+
};
|
|
1899
|
+
}
|
|
1807
1900
|
return {
|
|
1808
|
-
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}
|
|
1901
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1809
1902
|
isError: true,
|
|
1810
1903
|
};
|
|
1811
1904
|
}
|
|
@@ -1975,10 +2068,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1975
2068
|
const query = String(request.params.arguments?.query);
|
|
1976
2069
|
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1977
2070
|
const downloadImages = request.params.arguments?.download_images === true;
|
|
2071
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2072
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2073
|
+
: request.params.arguments?.output_dir
|
|
2074
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2075
|
+
: "";
|
|
2076
|
+
const outputDir = requestedOutputDir || process.cwd();
|
|
1978
2077
|
if (!query || query === "undefined") {
|
|
1979
2078
|
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1980
2079
|
}
|
|
1981
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
2080
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
1982
2081
|
return {
|
|
1983
2082
|
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1984
2083
|
};
|
|
@@ -2019,7 +2118,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2019
2118
|
}
|
|
2020
2119
|
case "export_dataset": {
|
|
2021
2120
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2022
|
-
const
|
|
2121
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2122
|
+
? String(request.params.arguments?.target_dir).trim()
|
|
2123
|
+
: request.params.arguments?.output_dir
|
|
2124
|
+
? String(request.params.arguments?.output_dir).trim()
|
|
2125
|
+
: "";
|
|
2126
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
2023
2127
|
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2024
2128
|
const fastMode = request.params.arguments?.fast === true;
|
|
2025
2129
|
const preview = request.params.arguments?.preview === true;
|
|
@@ -2032,7 +2136,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2032
2136
|
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2033
2137
|
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2034
2138
|
try {
|
|
2035
|
-
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
|
|
2139
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2036
2140
|
}
|
|
2037
2141
|
catch (e) {
|
|
2038
2142
|
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
@@ -2115,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2115
2219
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2116
2220
|
const ext = extMap[requestedFormat] || ".feather";
|
|
2117
2221
|
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2118
|
-
const outDir = targetDir
|
|
2222
|
+
const outDir = targetDir;
|
|
2119
2223
|
if (!fs.existsSync(outDir))
|
|
2120
2224
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2121
2225
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
@@ -2151,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2151
2255
|
};
|
|
2152
2256
|
}
|
|
2153
2257
|
}
|
|
2258
|
+
case "vesper_list_datasets": {
|
|
2259
|
+
const entries = readRegistry();
|
|
2260
|
+
if (entries.length === 0) {
|
|
2261
|
+
return {
|
|
2262
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2263
|
+
};
|
|
2264
|
+
}
|
|
2265
|
+
const lines = entries.map((e, i) => {
|
|
2266
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
2267
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
2268
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2269
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
2270
|
+
});
|
|
2271
|
+
return {
|
|
2272
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2273
|
+
};
|
|
2274
|
+
}
|
|
2154
2275
|
case "fuse_datasets": {
|
|
2155
2276
|
const rawSources = request.params.arguments?.sources;
|
|
2156
2277
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
package/build/install/install-service.js
CHANGED
|
@@ -21,7 +21,11 @@ export class InstallService {
|
|
|
21
21
|
// Create target directory
|
|
22
22
|
const installLabel = dataset?.name || datasetId;
|
|
23
23
|
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
-
|
|
24
|
+
// If caller specified a target dir, use it directly (don't nest under datasets/)
|
|
25
|
+
// Otherwise fall back to the project root's datasets/ folder
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.join(this.projectRoot, "datasets", sanitizedName);
|
|
25
29
|
if (!fs.existsSync(installDir)) {
|
|
26
30
|
fs.mkdirSync(installDir, { recursive: true });
|
|
27
31
|
}
|
package/build/metadata/scraper.js
CHANGED
|
@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
6
7
|
export class HuggingFaceScraper {
|
|
7
8
|
/**
|
|
8
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
9
10
|
* Hits the 25k target in minutes.
|
|
10
11
|
*/
|
|
11
|
-
async scrapeBulk(limit = 1000,
|
|
12
|
+
async scrapeBulk(limit = 1000, queryOrIntent) {
|
|
13
|
+
const intent = typeof queryOrIntent === "string"
|
|
14
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
15
|
+
: queryOrIntent;
|
|
16
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
17
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
12
18
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
13
19
|
console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
|
|
14
20
|
const results = [];
|
|
15
21
|
let processed = 0;
|
|
16
22
|
try {
|
|
17
23
|
const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
|
|
24
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
18
25
|
for await (const ds of listDatasets({
|
|
19
26
|
limit: limit,
|
|
20
27
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
21
|
-
search: { query:
|
|
28
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
22
29
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
23
30
|
})) {
|
|
24
31
|
if (results.length >= limit)
|
|
@@ -86,8 +93,12 @@ export class HuggingFaceScraper {
|
|
|
86
93
|
}
|
|
87
94
|
return results;
|
|
88
95
|
}
|
|
89
|
-
async scrape(limit = 100, applyMVPFilters = true,
|
|
90
|
-
|
|
96
|
+
async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
|
|
97
|
+
const intent = typeof queryOrIntent === "string"
|
|
98
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
99
|
+
: queryOrIntent;
|
|
100
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
101
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
91
102
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
92
103
|
console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
|
|
93
104
|
const results = [];
|
|
@@ -110,10 +121,11 @@ export class HuggingFaceScraper {
|
|
|
110
121
|
}
|
|
111
122
|
// Add delay between batches to avoid rate limits
|
|
112
123
|
const BATCH_DELAY = hfToken ? 500 : 2000;
|
|
124
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
113
125
|
for await (const ds of listDatasets({
|
|
114
126
|
limit: fetchLimit,
|
|
115
127
|
additionalFields: ["description", "tags"],
|
|
116
|
-
search: { query:
|
|
128
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
117
129
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
118
130
|
})) {
|
|
119
131
|
if (results.length >= limit)
|
|
@@ -290,6 +302,9 @@ export class HuggingFaceScraper {
|
|
|
290
302
|
description_length: description.length,
|
|
291
303
|
has_readme: !!(cardData.readme || cardData.readme_content)
|
|
292
304
|
};
|
|
305
|
+
if (intent) {
|
|
306
|
+
metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
|
|
307
|
+
}
|
|
293
308
|
results.push(metadata);
|
|
294
309
|
}
|
|
295
310
|
catch (e) {
|
|
@@ -340,8 +355,12 @@ export class HuggingFaceScraper {
|
|
|
340
355
|
if (otherErrors > 0) {
|
|
341
356
|
console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
|
|
342
357
|
}
|
|
343
|
-
|
|
344
|
-
|
|
358
|
+
return results.sort((a, b) => {
|
|
359
|
+
const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
|
|
360
|
+
if (intentDelta !== 0)
|
|
361
|
+
return intentDelta;
|
|
362
|
+
return b.downloads - a.downloads;
|
|
363
|
+
});
|
|
345
364
|
}
|
|
346
365
|
extractTask(tags) {
|
|
347
366
|
const taskTags = [
|
package/build/search/engine.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
|
|
2
3
|
import fs from "fs";
|
|
3
4
|
function log(msg) {
|
|
4
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -17,9 +18,10 @@ export class SearchEngine {
|
|
|
17
18
|
async search(query, options = {}) {
|
|
18
19
|
const limit = options.limit || 5;
|
|
19
20
|
const enableJIT = options.enableJIT !== false; // Default: true
|
|
21
|
+
const intent = await analyzeDatasetQuery(query);
|
|
20
22
|
log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
|
|
21
23
|
// 1. Perform local search
|
|
22
|
-
const localResults = await this.localSearch(query, options);
|
|
24
|
+
const localResults = await this.localSearch(query, options, intent);
|
|
23
25
|
// 2. Check if JIT should be triggered
|
|
24
26
|
const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
|
|
25
27
|
if (!shouldTrigger) {
|
|
@@ -28,10 +30,10 @@ export class SearchEngine {
|
|
|
28
30
|
}
|
|
29
31
|
// 3. Trigger JIT fallback
|
|
30
32
|
console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
|
|
31
|
-
await this.jitOrchestrator.fetchAndIngest(query, 10);
|
|
33
|
+
await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
|
|
32
34
|
// 4. Re-run local search with updated index
|
|
33
35
|
console.error(`Re-searching with updated library...`);
|
|
34
|
-
const enhancedResults = await this.localSearch(query, options);
|
|
36
|
+
const enhancedResults = await this.localSearch(query, options, intent);
|
|
35
37
|
const newCount = enhancedResults.length - localResults.length;
|
|
36
38
|
if (newCount > 0) {
|
|
37
39
|
console.error(`Found ${newCount} additional results\n`);
|
|
@@ -41,7 +43,7 @@ export class SearchEngine {
|
|
|
41
43
|
/**
|
|
42
44
|
* Perform hybrid search (Vector + Lexical + Penalties)
|
|
43
45
|
*/
|
|
44
|
-
async localSearch(query, options) {
|
|
46
|
+
async localSearch(query, options, intent) {
|
|
45
47
|
const limit = options.limit || 5;
|
|
46
48
|
// 1. Parse Query
|
|
47
49
|
const words = query.toLowerCase().split(/\s+/);
|
|
@@ -136,11 +138,13 @@ export class SearchEngine {
|
|
|
136
138
|
bonus = sourceBonuses[metadata.source] || 0;
|
|
137
139
|
// Final Combined Score
|
|
138
140
|
// 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
|
|
139
|
-
const
|
|
141
|
+
const intentScore = scoreDatasetAgainstIntent(metadata, intent);
|
|
142
|
+
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
|
|
140
143
|
metadata.relevance_score = Math.round(finalScore * 100) / 100;
|
|
141
144
|
metadata.vector_score = Math.round(vectorScore * 100) / 100;
|
|
142
145
|
metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
|
|
143
146
|
metadata.accessibility_bonus = bonus;
|
|
147
|
+
metadata.intent_score = intentScore;
|
|
144
148
|
results.push(metadata);
|
|
145
149
|
}
|
|
146
150
|
// Sort by final score and limit
|
package/build/search/jit-orchestrator.js
CHANGED
|
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
|
2
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
|
|
5
6
|
// Common stop words to filter out for better search
|
|
6
7
|
const STOP_WORDS = new Set([
|
|
7
8
|
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
@@ -61,7 +62,7 @@ export class JITOrchestrator {
|
|
|
61
62
|
/**
|
|
62
63
|
* Main JIT workflow: fetch, save, index, return new datasets
|
|
63
64
|
*/
|
|
64
|
-
async fetchAndIngest(query, limit = 10) {
|
|
65
|
+
async fetchAndIngest(query, limit = 10, providedIntent) {
|
|
65
66
|
// Rate limiting check
|
|
66
67
|
if (!this.canTrigger(query)) {
|
|
67
68
|
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
@@ -69,9 +70,12 @@ export class JITOrchestrator {
|
|
|
69
70
|
}
|
|
70
71
|
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
71
72
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
|
-
|
|
73
|
-
const keywords = this.simplifyQuery(
|
|
74
|
-
if (
|
|
73
|
+
const intent = providedIntent || await analyzeDatasetQuery(query);
|
|
74
|
+
const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
|
|
75
|
+
if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
|
|
76
|
+
console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
|
|
77
|
+
}
|
|
78
|
+
else if (keywords.length > 0) {
|
|
75
79
|
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
80
|
}
|
|
77
81
|
const newDatasets = [];
|
|
@@ -81,15 +85,16 @@ export class JITOrchestrator {
|
|
|
81
85
|
// Get existing dataset IDs to avoid duplicates
|
|
82
86
|
const existing = this.metadataStore.getAllDatasets();
|
|
83
87
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
88
|
+
let hfResults = await this.scrapeHuggingFace(intent, limit);
|
|
89
|
+
if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
|
|
90
|
+
for (const keyword of keywords) {
|
|
91
|
+
if (hfResults.length >= limit)
|
|
92
|
+
break;
|
|
93
|
+
const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
|
|
94
|
+
for (const ds of results) {
|
|
95
|
+
if (!hfResults.some(existing => existing.id === ds.id)) {
|
|
96
|
+
hfResults.push(ds);
|
|
97
|
+
}
|
|
93
98
|
}
|
|
94
99
|
}
|
|
95
100
|
}
|
|
@@ -170,7 +175,6 @@ export class JITOrchestrator {
|
|
|
170
175
|
async scrapeHuggingFace(query, limit) {
|
|
171
176
|
const scraper = new HuggingFaceScraper();
|
|
172
177
|
try {
|
|
173
|
-
// Pass the query as a general search term
|
|
174
178
|
return await scraper.scrape(limit, true, query);
|
|
175
179
|
}
|
|
176
180
|
catch (error) {
|