vesper-wizard 2.1.5 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
1
1
  import { spawn } from "child_process";
2
2
  import path from "path";
3
3
  import fs from "fs";
4
+ import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
4
5
  export class DataExporter {
5
- pythonPath = "python";
6
+ buildDir;
6
7
  scriptPath;
7
8
  constructor(buildDir = process.cwd()) {
9
+ this.buildDir = buildDir;
8
10
  const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
9
11
  const dataRoot = path.join(homeDir, ".vesper");
10
12
  const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
@@ -26,27 +28,38 @@ export class DataExporter {
26
28
  else {
27
29
  this.scriptPath = scriptPath0;
28
30
  }
29
- // Detect Python command
30
- if (process.platform === "win32") {
31
- this.pythonPath = "py";
32
- }
33
31
  }
34
32
  /**
35
33
  * Exports a dataset file to a specified format
36
34
  */
37
35
  async export(inputFile, outputFile, format, options = {}) {
36
+ const pythonRequirements = [
37
+ { module: "polars", packageName: "polars" },
38
+ ];
39
+ if (format === "feather") {
40
+ pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
41
+ }
42
+ if (format === "tfrecord") {
43
+ pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
44
+ }
45
+ const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
38
46
  return new Promise((resolve, reject) => {
39
47
  if (!fs.existsSync(inputFile)) {
40
48
  reject(new Error(`Input file not found: ${inputFile}`));
41
49
  return;
42
50
  }
43
51
  const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
44
- const process = spawn(this.pythonPath, args);
52
+ const childProcess = spawn(pythonPath, args, {
53
+ env: {
54
+ ...process.env,
55
+ PYTHONIOENCODING: "utf-8",
56
+ },
57
+ });
45
58
  let stdout = "";
46
59
  let stderr = "";
47
- process.stdout.on("data", (data) => stdout += data.toString());
48
- process.stderr.on("data", (data) => stderr += data.toString());
49
- process.on("close", (code) => {
60
+ childProcess.stdout.on("data", (data) => stdout += data.toString());
61
+ childProcess.stderr.on("data", (data) => stderr += data.toString());
62
+ childProcess.on("close", (code) => {
50
63
  if (code !== 0) {
51
64
  reject(new Error(`Export failed: ${stderr || stdout}`));
52
65
  return;
@@ -3,6 +3,7 @@ import path from "path";
3
3
  import http from "http";
4
4
  import https from "https";
5
5
  import { HuggingFaceScraper } from "../metadata/scraper.js";
6
+ import { analyzeDatasetQuery } from "../search/query-intent.js";
6
7
  export class UnifiedDatasetGateway {
7
8
  deps;
8
9
  constructor(deps) {
@@ -236,7 +237,7 @@ export class UnifiedDatasetGateway {
236
237
  async discoverFromSource(source, query, limit) {
237
238
  switch (source) {
238
239
  case "huggingface":
239
- return await new HuggingFaceScraper().scrape(limit, true, query);
240
+ return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
240
241
  case "openml":
241
242
  return await this.deps.openmlSource.discover(query, limit);
242
243
  case "kaggle":
package/build/index.js CHANGED
@@ -361,6 +361,21 @@ function extractRequestedRows(query, requirements) {
361
361
  if (Number.isFinite(n) && n > 0)
362
362
  return n;
363
363
  }
364
+ const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
365
+ .map(m => Number(m[0].replace(/,/g, "")))
366
+ .filter(n => Number.isFinite(n) && n > 0);
367
+ if (commaNumbers.length > 0)
368
+ return Math.max(...commaNumbers);
369
+ const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
370
+ .map(m => {
371
+ const base = Number(m[1]);
372
+ const suffix = m[2].toLowerCase();
373
+ const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
374
+ return Math.round(base * multiplier);
375
+ })
376
+ .filter(n => Number.isFinite(n) && n > 0);
377
+ if (humanSized.length > 0)
378
+ return Math.max(...humanSized);
364
379
  const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
365
380
  .map(m => Number(m[0]))
366
381
  .filter(n => Number.isFinite(n) && n > 0);
@@ -644,7 +659,7 @@ jobManager.on("processJob", async (job, execute) => {
644
659
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
645
660
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
646
661
  switch (job.type) {
647
- case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
662
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
648
663
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
649
664
  default: throw new Error(`Unhandled job type: ${job.type}`);
650
665
  }
@@ -662,7 +677,7 @@ jobManager.on("processJob", async (job, execute) => {
662
677
  /**
663
678
  * Logic for preparing a dataset (Search + Ingest + Process)
664
679
  */
665
- async function handlePrepareJob(jobId, query, requirements) {
680
+ async function handlePrepareJob(jobId, query, requirements, outputDir) {
666
681
  hydrateExternalKeys();
667
682
  const update = (updates) => jobManager.updateJob(jobId, updates);
668
683
  const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
@@ -689,6 +704,7 @@ async function handlePrepareJob(jobId, query, requirements) {
689
704
  // Continue anyway - direct file downloads may still work without datasets lib
690
705
  }
691
706
  const requestedRows = extractRequestedRows(query, requirements);
707
+ const searchQuery = requirements ? `${query} ${requirements}` : query;
692
708
  let selectedDataset;
693
709
  let datasetIdForDownload = "";
694
710
  let source;
@@ -729,7 +745,7 @@ async function handlePrepareJob(jobId, query, requirements) {
729
745
  else {
730
746
  markPipelineStep("search", "running");
731
747
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
732
- const results = await searchEngine.search(query, { limit: 10 });
748
+ const results = await searchEngine.search(searchQuery, { limit: 10 });
733
749
  if (results.length === 0) {
734
750
  markPipelineStep("search", "failed");
735
751
  throw new Error("No datasets found matching the query. Try refining your search terms.");
@@ -777,7 +793,7 @@ async function handlePrepareJob(jobId, query, requirements) {
777
793
  let currentRows = await countRows(rawFilePath);
778
794
  if (currentRows < requestedRows) {
779
795
  update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
780
- const additional = await searchEngine.search(query, { limit: 8 });
796
+ const additional = await searchEngine.search(searchQuery, { limit: 8 });
781
797
  const sourceFiles = [rawFilePath];
782
798
  let totalRows = currentRows;
783
799
  for (const ds of additional) {
@@ -880,9 +896,52 @@ async function handlePrepareJob(jobId, query, requirements) {
880
896
  quality_score: qualityScore
881
897
  });
882
898
  }
899
+ else {
900
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
901
+ try {
902
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
903
+ if (!existingMeta) {
904
+ metadataStore.saveDataset({
905
+ id: datasetIdForDownload,
906
+ source: source,
907
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
908
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
909
+ quality_warnings: [],
910
+ downloads: 0,
911
+ likes: 0,
912
+ stars: 0,
913
+ tags: [],
914
+ last_updated: new Date().toISOString(),
915
+ task: "unknown",
916
+ domain: "unknown",
917
+ languages: [],
918
+ splits: [],
919
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
920
+ quality_score: qualityScore,
921
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
922
+ total_examples: 0,
923
+ is_structured: false,
924
+ has_target_column: false,
925
+ is_safe_source: true,
926
+ has_personal_data: false,
927
+ is_paywalled: false,
928
+ is_scraped_web_data: false,
929
+ uses_https: true,
930
+ has_train_split: false,
931
+ has_test_split: false,
932
+ has_validation_split: false,
933
+ description_length: 0,
934
+ has_readme: false,
935
+ });
936
+ }
937
+ }
938
+ catch (e) {
939
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
940
+ }
941
+ }
883
942
  markPipelineStep("register", "running");
884
943
  update({ progress: 85, status_text: "Installing dataset into project..." });
885
- const installPath = await installService.install(datasetIdForDownload, rawFilePath);
944
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
886
945
  update({ progress: 100, status_text: "Preparation complete!" });
887
946
  // Register prepared dataset in local registry for lookup by export/list tools
888
947
  try {
@@ -1013,7 +1072,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1013
1072
  },
1014
1073
  target_dir: {
1015
1074
  type: "string",
1016
- description: "Optional output directory for operation='download'.",
1075
+ description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
1076
+ },
1077
+ output_dir: {
1078
+ type: "string",
1079
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1017
1080
  },
1018
1081
  public_only: {
1019
1082
  type: "boolean",
@@ -1052,7 +1115,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1052
1115
  },
1053
1116
  {
1054
1117
  name: "download_dataset",
1055
- description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
1118
+ description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
1056
1119
  inputSchema: {
1057
1120
  type: "object",
1058
1121
  properties: {
@@ -1067,7 +1130,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1067
1130
  },
1068
1131
  target_dir: {
1069
1132
  type: "string",
1070
- description: "Optional target directory for downloaded files.",
1133
+ description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
1134
+ },
1135
+ output_dir: {
1136
+ type: "string",
1137
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1071
1138
  }
1072
1139
  },
1073
1140
  required: ["dataset_id"],
@@ -1194,6 +1261,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1194
1261
  properties: {
1195
1262
  query: { type: "string" },
1196
1263
  requirements: { type: "string" },
1264
+ target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
1265
+ output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
1197
1266
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
1198
1267
  cleaning_options: { type: "object" },
1199
1268
  split_config: { type: "object" },
@@ -1238,7 +1307,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1238
1307
  },
1239
1308
  target_dir: {
1240
1309
  type: "string",
1241
- description: "Optional custom local directory for export (e.g., './naruto-quotes').",
1310
+ description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
1311
+ },
1312
+ output_dir: {
1313
+ type: "string",
1314
+ description: "Alias for target_dir. Defaults to the current working directory when omitted.",
1242
1315
  },
1243
1316
  format: {
1244
1317
  type: "string",
@@ -1425,7 +1498,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1425
1498
  if (tool === "vesper_export" && req === "split") {
1426
1499
  // Auto-trigger prepare_dataset (start a background prepare job)
1427
1500
  try {
1428
- jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
1501
+ jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
1429
1502
  // Mark split as complete so export can proceed; export handler will also wait for data if needed.
1430
1503
  markStepComplete(String(datasetId), "split");
1431
1504
  }
@@ -1481,6 +1554,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1481
1554
  if (!datasetId) {
1482
1555
  throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
1483
1556
  }
1557
+ const requestedTargetDir = request.params.arguments?.target_dir
1558
+ ? String(request.params.arguments.target_dir).trim()
1559
+ : request.params.arguments?.output_dir
1560
+ ? String(request.params.arguments.output_dir).trim()
1561
+ : "";
1562
+ const targetDir = requestedTargetDir || process.cwd();
1484
1563
  try {
1485
1564
  await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
1486
1565
  }
@@ -1490,7 +1569,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1490
1569
  const result = await unifiedDatasetGateway.download({
1491
1570
  datasetId,
1492
1571
  source,
1493
- targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
1572
+ targetDir,
1494
1573
  });
1495
1574
  try {
1496
1575
  upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
@@ -1597,7 +1676,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1597
1676
  hydrateExternalKeys();
1598
1677
  const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
1599
1678
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
1600
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
1679
+ const requestedTargetDir = request.params.arguments?.target_dir
1680
+ ? String(request.params.arguments.target_dir).trim()
1681
+ : request.params.arguments?.output_dir
1682
+ ? String(request.params.arguments.output_dir).trim()
1683
+ : "";
1684
+ const targetDir = requestedTargetDir || process.cwd();
1601
1685
  if (!datasetId) {
1602
1686
  throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
1603
1687
  }
@@ -1804,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1804
1888
  }
1805
1889
  const dataset = metadataStore.getDataset(datasetId);
1806
1890
  if (!dataset) {
1891
+ // Fallback: check the registry for local path info
1892
+ const regEntry = getRegistryEntry(datasetId);
1893
+ const regPath = regEntry?.local_path || regEntry?.path;
1894
+ if (regEntry) {
1895
+ const exists = regPath && fs.existsSync(regPath);
1896
+ return {
1897
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
1898
+ };
1899
+ }
1807
1900
  return {
1808
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1901
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1809
1902
  isError: true,
1810
1903
  };
1811
1904
  }
@@ -1975,10 +2068,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1975
2068
  const query = String(request.params.arguments?.query);
1976
2069
  const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
1977
2070
  const downloadImages = request.params.arguments?.download_images === true;
2071
+ const requestedOutputDir = request.params.arguments?.target_dir
2072
+ ? String(request.params.arguments.target_dir).trim()
2073
+ : request.params.arguments?.output_dir
2074
+ ? String(request.params.arguments.output_dir).trim()
2075
+ : "";
2076
+ const outputDir = requestedOutputDir || process.cwd();
1978
2077
  if (!query || query === "undefined") {
1979
2078
  throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
1980
2079
  }
1981
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
2080
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
1982
2081
  return {
1983
2082
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
1984
2083
  };
@@ -2019,7 +2118,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2019
2118
  }
2020
2119
  case "export_dataset": {
2021
2120
  const datasetId = String(request.params.arguments?.dataset_id);
2022
- const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
2121
+ const requestedTargetDir = request.params.arguments?.target_dir
2122
+ ? String(request.params.arguments?.target_dir).trim()
2123
+ : request.params.arguments?.output_dir
2124
+ ? String(request.params.arguments?.output_dir).trim()
2125
+ : "";
2126
+ const targetDir = requestedTargetDir || process.cwd();
2023
2127
  const requestedFormat = String(request.params.arguments?.format || "feather");
2024
2128
  const fastMode = request.params.arguments?.fast === true;
2025
2129
  const preview = request.params.arguments?.preview === true;
@@ -2032,7 +2136,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2032
2136
  console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2033
2137
  // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2034
2138
  try {
2035
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
2139
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2036
2140
  }
2037
2141
  catch (e) {
2038
2142
  console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
@@ -2115,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2115
2219
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2116
2220
  const ext = extMap[requestedFormat] || ".feather";
2117
2221
  const safeName = toSafeDatasetPathFragment(datasetId);
2118
- const outDir = targetDir || path.join(dataRoot, "exports");
2222
+ const outDir = targetDir;
2119
2223
  if (!fs.existsSync(outDir))
2120
2224
  fs.mkdirSync(outDir, { recursive: true });
2121
2225
  const outputFile = path.join(outDir, `${safeName}${ext}`);
@@ -2151,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2151
2255
  };
2152
2256
  }
2153
2257
  }
2258
+ case "vesper_list_datasets": {
2259
+ const entries = readRegistry();
2260
+ if (entries.length === 0) {
2261
+ return {
2262
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2263
+ };
2264
+ }
2265
+ const lines = entries.map((e, i) => {
2266
+ const id = e.dataset_id || e.id || "unknown";
2267
+ const localPath = e.local_path || e.path || "unknown";
2268
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2269
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
2270
+ });
2271
+ return {
2272
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
2273
+ };
2274
+ }
2154
2275
  case "fuse_datasets": {
2155
2276
  const rawSources = request.params.arguments?.sources;
2156
2277
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -21,7 +21,11 @@ export class InstallService {
21
21
  // Create target directory
22
22
  const installLabel = dataset?.name || datasetId;
23
23
  const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
- const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
24
+ // If caller specified a target dir, use it directly (don't nest under datasets/)
25
+ // Otherwise fall back to the project root's datasets/ folder
26
+ const installDir = targetDir
27
+ ? path.resolve(targetDir)
28
+ : path.join(this.projectRoot, "datasets", sanitizedName);
25
29
  if (!fs.existsSync(installDir)) {
26
30
  fs.mkdirSync(installDir, { recursive: true });
27
31
  }
@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
6
7
  export class HuggingFaceScraper {
7
8
  /**
8
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
9
10
  * Hits the 25k target in minutes.
10
11
  */
11
- async scrapeBulk(limit = 1000, query) {
12
+ async scrapeBulk(limit = 1000, queryOrIntent) {
13
+ const intent = typeof queryOrIntent === "string"
14
+ ? await analyzeDatasetQuery(queryOrIntent)
15
+ : queryOrIntent;
16
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
17
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
12
18
  const filterMsg = query ? `, query: ${query}` : "";
13
19
  console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
14
20
  const results = [];
15
21
  let processed = 0;
16
22
  try {
17
23
  const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
24
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
18
25
  for await (const ds of listDatasets({
19
26
  limit: limit,
20
27
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
21
- search: { query: query },
28
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
22
29
  ...(hfToken ? { accessToken: hfToken } : {})
23
30
  })) {
24
31
  if (results.length >= limit)
@@ -86,8 +93,12 @@ export class HuggingFaceScraper {
86
93
  }
87
94
  return results;
88
95
  }
89
- async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
90
- ) {
96
+ async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
97
+ const intent = typeof queryOrIntent === "string"
98
+ ? await analyzeDatasetQuery(queryOrIntent)
99
+ : queryOrIntent;
100
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
101
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
91
102
  const filterMsg = query ? `, query: ${query}` : "";
92
103
  console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
93
104
  const results = [];
@@ -110,10 +121,11 @@ export class HuggingFaceScraper {
110
121
  }
111
122
  // Add delay between batches to avoid rate limits
112
123
  const BATCH_DELAY = hfToken ? 500 : 2000;
124
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
113
125
  for await (const ds of listDatasets({
114
126
  limit: fetchLimit,
115
127
  additionalFields: ["description", "tags"],
116
- search: { query: query },
128
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
117
129
  ...(hfToken ? { accessToken: hfToken } : {})
118
130
  })) {
119
131
  if (results.length >= limit)
@@ -290,6 +302,9 @@ export class HuggingFaceScraper {
290
302
  description_length: description.length,
291
303
  has_readme: !!(cardData.readme || cardData.readme_content)
292
304
  };
305
+ if (intent) {
306
+ metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
307
+ }
293
308
  results.push(metadata);
294
309
  }
295
310
  catch (e) {
@@ -340,8 +355,12 @@ export class HuggingFaceScraper {
340
355
  if (otherErrors > 0) {
341
356
  console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
342
357
  }
343
- // Sort by downloads descending
344
- return results.sort((a, b) => b.downloads - a.downloads);
358
+ return results.sort((a, b) => {
359
+ const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
360
+ if (intentDelta !== 0)
361
+ return intentDelta;
362
+ return b.downloads - a.downloads;
363
+ });
345
364
  }
346
365
  extractTask(tags) {
347
366
  const taskTags = [
@@ -1,4 +1,5 @@
1
1
  import { JITOrchestrator } from "./jit-orchestrator.js";
2
+ import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
2
3
  import fs from "fs";
3
4
  function log(msg) {
4
5
  fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -17,9 +18,10 @@ export class SearchEngine {
17
18
  async search(query, options = {}) {
18
19
  const limit = options.limit || 5;
19
20
  const enableJIT = options.enableJIT !== false; // Default: true
21
+ const intent = await analyzeDatasetQuery(query);
20
22
  log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
21
23
  // 1. Perform local search
22
- const localResults = await this.localSearch(query, options);
24
+ const localResults = await this.localSearch(query, options, intent);
23
25
  // 2. Check if JIT should be triggered
24
26
  const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
25
27
  if (!shouldTrigger) {
@@ -28,10 +30,10 @@ export class SearchEngine {
28
30
  }
29
31
  // 3. Trigger JIT fallback
30
32
  console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
31
- await this.jitOrchestrator.fetchAndIngest(query, 10);
33
+ await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
32
34
  // 4. Re-run local search with updated index
33
35
  console.error(`Re-searching with updated library...`);
34
- const enhancedResults = await this.localSearch(query, options);
36
+ const enhancedResults = await this.localSearch(query, options, intent);
35
37
  const newCount = enhancedResults.length - localResults.length;
36
38
  if (newCount > 0) {
37
39
  console.error(`Found ${newCount} additional results\n`);
@@ -41,7 +43,7 @@ export class SearchEngine {
41
43
  /**
42
44
  * Perform hybrid search (Vector + Lexical + Penalties)
43
45
  */
44
- async localSearch(query, options) {
46
+ async localSearch(query, options, intent) {
45
47
  const limit = options.limit || 5;
46
48
  // 1. Parse Query
47
49
  const words = query.toLowerCase().split(/\s+/);
@@ -136,11 +138,13 @@ export class SearchEngine {
136
138
  bonus = sourceBonuses[metadata.source] || 0;
137
139
  // Final Combined Score
138
140
  // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
139
- const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus;
141
+ const intentScore = scoreDatasetAgainstIntent(metadata, intent);
142
+ const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
140
143
  metadata.relevance_score = Math.round(finalScore * 100) / 100;
141
144
  metadata.vector_score = Math.round(vectorScore * 100) / 100;
142
145
  metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
143
146
  metadata.accessibility_bonus = bonus;
147
+ metadata.intent_score = intentScore;
144
148
  results.push(metadata);
145
149
  }
146
150
  // Sort by final score and limit
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
5
6
  // Common stop words to filter out for better search
6
7
  const STOP_WORDS = new Set([
7
8
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
@@ -61,7 +62,7 @@ export class JITOrchestrator {
61
62
  /**
62
63
  * Main JIT workflow: fetch, save, index, return new datasets
63
64
  */
64
- async fetchAndIngest(query, limit = 10) {
65
+ async fetchAndIngest(query, limit = 10, providedIntent) {
65
66
  // Rate limiting check
66
67
  if (!this.canTrigger(query)) {
67
68
  console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
@@ -69,9 +70,12 @@ export class JITOrchestrator {
69
70
  }
70
71
  console.error(`\n[JIT] Searching live sources for: "${query}"`);
71
72
  this.lastTriggerTime.set(query, Date.now());
72
- // Simplify query for better API results
73
- const keywords = this.simplifyQuery(query);
74
- if (keywords.length > 0) {
73
+ const intent = providedIntent || await analyzeDatasetQuery(query);
74
+ const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
75
+ if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
76
+ console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
77
+ }
78
+ else if (keywords.length > 0) {
75
79
  console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
76
80
  }
77
81
  const newDatasets = [];
@@ -81,15 +85,16 @@ export class JITOrchestrator {
81
85
  // Get existing dataset IDs to avoid duplicates
82
86
  const existing = this.metadataStore.getAllDatasets();
83
87
  existing.forEach(ds => existingIds.add(ds.id));
84
- // 1. Scrape HuggingFace - try each keyword separately for better results
85
- let hfResults = [];
86
- for (const keyword of keywords) {
87
- if (hfResults.length >= limit)
88
- break;
89
- const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
- for (const ds of results) {
91
- if (!hfResults.some(existing => existing.id === ds.id)) {
92
- hfResults.push(ds);
88
+ let hfResults = await this.scrapeHuggingFace(intent, limit);
89
+ if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
90
+ for (const keyword of keywords) {
91
+ if (hfResults.length >= limit)
92
+ break;
93
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
94
+ for (const ds of results) {
95
+ if (!hfResults.some(existing => existing.id === ds.id)) {
96
+ hfResults.push(ds);
97
+ }
93
98
  }
94
99
  }
95
100
  }
@@ -170,7 +175,6 @@ export class JITOrchestrator {
170
175
  async scrapeHuggingFace(query, limit) {
171
176
  const scraper = new HuggingFaceScraper();
172
177
  try {
173
- // Pass the query as a general search term
174
178
  return await scraper.scrape(limit, true, query);
175
179
  }
176
180
  catch (error) {