vesper-wizard 2.1.4 ā 2.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +2 -1
- package/build/index.js +67 -15
- package/build/metadata/scraper.js +24 -7
- package/build/search/engine.js +9 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +409 -0
- package/build/utils/python-runtime.js +130 -0
- package/package.json +1 -1
- package/scripts/postinstall.cjs +74 -32
package/build/export/exporter.js
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
import path from "path";
|
|
3
3
|
import fs from "fs";
|
|
4
|
+
import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
|
|
4
5
|
export class DataExporter {
|
|
5
|
-
|
|
6
|
+
buildDir;
|
|
6
7
|
scriptPath;
|
|
7
8
|
constructor(buildDir = process.cwd()) {
|
|
9
|
+
this.buildDir = buildDir;
|
|
8
10
|
const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
|
|
9
11
|
const dataRoot = path.join(homeDir, ".vesper");
|
|
10
12
|
const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
|
|
@@ -26,27 +28,38 @@ export class DataExporter {
|
|
|
26
28
|
else {
|
|
27
29
|
this.scriptPath = scriptPath0;
|
|
28
30
|
}
|
|
29
|
-
// Detect Python command
|
|
30
|
-
if (process.platform === "win32") {
|
|
31
|
-
this.pythonPath = "py";
|
|
32
|
-
}
|
|
33
31
|
}
|
|
34
32
|
/**
|
|
35
33
|
* Exports a dataset file to a specified format
|
|
36
34
|
*/
|
|
37
35
|
async export(inputFile, outputFile, format, options = {}) {
|
|
36
|
+
const pythonRequirements = [
|
|
37
|
+
{ module: "polars", packageName: "polars" },
|
|
38
|
+
];
|
|
39
|
+
if (format === "feather") {
|
|
40
|
+
pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
|
|
41
|
+
}
|
|
42
|
+
if (format === "tfrecord") {
|
|
43
|
+
pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
|
|
44
|
+
}
|
|
45
|
+
const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
|
|
38
46
|
return new Promise((resolve, reject) => {
|
|
39
47
|
if (!fs.existsSync(inputFile)) {
|
|
40
48
|
reject(new Error(`Input file not found: ${inputFile}`));
|
|
41
49
|
return;
|
|
42
50
|
}
|
|
43
51
|
const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
|
|
44
|
-
const
|
|
52
|
+
const childProcess = spawn(pythonPath, args, {
|
|
53
|
+
env: {
|
|
54
|
+
...process.env,
|
|
55
|
+
PYTHONIOENCODING: "utf-8",
|
|
56
|
+
},
|
|
57
|
+
});
|
|
45
58
|
let stdout = "";
|
|
46
59
|
let stderr = "";
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
60
|
+
childProcess.stdout.on("data", (data) => stdout += data.toString());
|
|
61
|
+
childProcess.stderr.on("data", (data) => stderr += data.toString());
|
|
62
|
+
childProcess.on("close", (code) => {
|
|
50
63
|
if (code !== 0) {
|
|
51
64
|
reject(new Error(`Export failed: ${stderr || stdout}`));
|
|
52
65
|
return;
|
|
@@ -3,6 +3,7 @@ import path from "path";
|
|
|
3
3
|
import http from "http";
|
|
4
4
|
import https from "https";
|
|
5
5
|
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
6
|
+
import { analyzeDatasetQuery } from "../search/query-intent.js";
|
|
6
7
|
export class UnifiedDatasetGateway {
|
|
7
8
|
deps;
|
|
8
9
|
constructor(deps) {
|
|
@@ -236,7 +237,7 @@ export class UnifiedDatasetGateway {
|
|
|
236
237
|
async discoverFromSource(source, query, limit) {
|
|
237
238
|
switch (source) {
|
|
238
239
|
case "huggingface":
|
|
239
|
-
return await new HuggingFaceScraper().scrape(limit, true, query);
|
|
240
|
+
return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
|
|
240
241
|
case "openml":
|
|
241
242
|
return await this.deps.openmlSource.discover(query, limit);
|
|
242
243
|
case "kaggle":
|
package/build/index.js
CHANGED
|
@@ -361,6 +361,21 @@ function extractRequestedRows(query, requirements) {
|
|
|
361
361
|
if (Number.isFinite(n) && n > 0)
|
|
362
362
|
return n;
|
|
363
363
|
}
|
|
364
|
+
const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
|
|
365
|
+
.map(m => Number(m[0].replace(/,/g, "")))
|
|
366
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
367
|
+
if (commaNumbers.length > 0)
|
|
368
|
+
return Math.max(...commaNumbers);
|
|
369
|
+
const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
|
|
370
|
+
.map(m => {
|
|
371
|
+
const base = Number(m[1]);
|
|
372
|
+
const suffix = m[2].toLowerCase();
|
|
373
|
+
const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
|
|
374
|
+
return Math.round(base * multiplier);
|
|
375
|
+
})
|
|
376
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
377
|
+
if (humanSized.length > 0)
|
|
378
|
+
return Math.max(...humanSized);
|
|
364
379
|
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
365
380
|
.map(m => Number(m[0]))
|
|
366
381
|
.filter(n => Number.isFinite(n) && n > 0);
|
|
@@ -644,7 +659,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
644
659
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
645
660
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
646
661
|
switch (job.type) {
|
|
647
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
662
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
|
|
648
663
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
649
664
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
650
665
|
}
|
|
@@ -662,7 +677,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
662
677
|
/**
|
|
663
678
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
664
679
|
*/
|
|
665
|
-
async function handlePrepareJob(jobId, query, requirements) {
|
|
680
|
+
async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
666
681
|
hydrateExternalKeys();
|
|
667
682
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
668
683
|
const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
|
|
@@ -689,6 +704,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
689
704
|
// Continue anyway - direct file downloads may still work without datasets lib
|
|
690
705
|
}
|
|
691
706
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
707
|
+
const searchQuery = requirements ? `${query} ${requirements}` : query;
|
|
692
708
|
let selectedDataset;
|
|
693
709
|
let datasetIdForDownload = "";
|
|
694
710
|
let source;
|
|
@@ -729,7 +745,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
729
745
|
else {
|
|
730
746
|
markPipelineStep("search", "running");
|
|
731
747
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
732
|
-
const results = await searchEngine.search(
|
|
748
|
+
const results = await searchEngine.search(searchQuery, { limit: 10 });
|
|
733
749
|
if (results.length === 0) {
|
|
734
750
|
markPipelineStep("search", "failed");
|
|
735
751
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
@@ -777,7 +793,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
777
793
|
let currentRows = await countRows(rawFilePath);
|
|
778
794
|
if (currentRows < requestedRows) {
|
|
779
795
|
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
780
|
-
const additional = await searchEngine.search(
|
|
796
|
+
const additional = await searchEngine.search(searchQuery, { limit: 8 });
|
|
781
797
|
const sourceFiles = [rawFilePath];
|
|
782
798
|
let totalRows = currentRows;
|
|
783
799
|
for (const ds of additional) {
|
|
@@ -882,7 +898,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
882
898
|
}
|
|
883
899
|
markPipelineStep("register", "running");
|
|
884
900
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
885
|
-
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
901
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
886
902
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
887
903
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
888
904
|
try {
|
|
@@ -1013,7 +1029,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1013
1029
|
},
|
|
1014
1030
|
target_dir: {
|
|
1015
1031
|
type: "string",
|
|
1016
|
-
description: "Optional output directory for operation='download'.",
|
|
1032
|
+
description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
|
|
1033
|
+
},
|
|
1034
|
+
output_dir: {
|
|
1035
|
+
type: "string",
|
|
1036
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1017
1037
|
},
|
|
1018
1038
|
public_only: {
|
|
1019
1039
|
type: "boolean",
|
|
@@ -1052,7 +1072,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1052
1072
|
},
|
|
1053
1073
|
{
|
|
1054
1074
|
name: "download_dataset",
|
|
1055
|
-
description: "Download a dataset by source and ID/slug into local
|
|
1075
|
+
description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
|
|
1056
1076
|
inputSchema: {
|
|
1057
1077
|
type: "object",
|
|
1058
1078
|
properties: {
|
|
@@ -1067,7 +1087,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1067
1087
|
},
|
|
1068
1088
|
target_dir: {
|
|
1069
1089
|
type: "string",
|
|
1070
|
-
description: "Optional target directory for downloaded files.",
|
|
1090
|
+
description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
|
|
1091
|
+
},
|
|
1092
|
+
output_dir: {
|
|
1093
|
+
type: "string",
|
|
1094
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1071
1095
|
}
|
|
1072
1096
|
},
|
|
1073
1097
|
required: ["dataset_id"],
|
|
@@ -1194,6 +1218,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1194
1218
|
properties: {
|
|
1195
1219
|
query: { type: "string" },
|
|
1196
1220
|
requirements: { type: "string" },
|
|
1221
|
+
target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
|
|
1222
|
+
output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
|
|
1197
1223
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
1198
1224
|
cleaning_options: { type: "object" },
|
|
1199
1225
|
split_config: { type: "object" },
|
|
@@ -1238,7 +1264,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1238
1264
|
},
|
|
1239
1265
|
target_dir: {
|
|
1240
1266
|
type: "string",
|
|
1241
|
-
description: "Optional custom local directory for export
|
|
1267
|
+
description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
|
|
1268
|
+
},
|
|
1269
|
+
output_dir: {
|
|
1270
|
+
type: "string",
|
|
1271
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1242
1272
|
},
|
|
1243
1273
|
format: {
|
|
1244
1274
|
type: "string",
|
|
@@ -1425,7 +1455,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1425
1455
|
if (tool === "vesper_export" && req === "split") {
|
|
1426
1456
|
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1427
1457
|
try {
|
|
1428
|
-
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
|
|
1458
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1429
1459
|
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1430
1460
|
markStepComplete(String(datasetId), "split");
|
|
1431
1461
|
}
|
|
@@ -1481,6 +1511,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1481
1511
|
if (!datasetId) {
|
|
1482
1512
|
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
1483
1513
|
}
|
|
1514
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1515
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1516
|
+
: request.params.arguments?.output_dir
|
|
1517
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1518
|
+
: "";
|
|
1519
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1484
1520
|
try {
|
|
1485
1521
|
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
1486
1522
|
}
|
|
@@ -1490,7 +1526,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1490
1526
|
const result = await unifiedDatasetGateway.download({
|
|
1491
1527
|
datasetId,
|
|
1492
1528
|
source,
|
|
1493
|
-
targetDir
|
|
1529
|
+
targetDir,
|
|
1494
1530
|
});
|
|
1495
1531
|
try {
|
|
1496
1532
|
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
@@ -1597,7 +1633,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1597
1633
|
hydrateExternalKeys();
|
|
1598
1634
|
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1599
1635
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1600
|
-
const
|
|
1636
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1637
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1638
|
+
: request.params.arguments?.output_dir
|
|
1639
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1640
|
+
: "";
|
|
1641
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1601
1642
|
if (!datasetId) {
|
|
1602
1643
|
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1603
1644
|
}
|
|
@@ -1975,10 +2016,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1975
2016
|
const query = String(request.params.arguments?.query);
|
|
1976
2017
|
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1977
2018
|
const downloadImages = request.params.arguments?.download_images === true;
|
|
2019
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2020
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2021
|
+
: request.params.arguments?.output_dir
|
|
2022
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2023
|
+
: "";
|
|
2024
|
+
const outputDir = requestedOutputDir || process.cwd();
|
|
1978
2025
|
if (!query || query === "undefined") {
|
|
1979
2026
|
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1980
2027
|
}
|
|
1981
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
2028
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
1982
2029
|
return {
|
|
1983
2030
|
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1984
2031
|
};
|
|
@@ -2019,7 +2066,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2019
2066
|
}
|
|
2020
2067
|
case "export_dataset": {
|
|
2021
2068
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2022
|
-
const
|
|
2069
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2070
|
+
? String(request.params.arguments?.target_dir).trim()
|
|
2071
|
+
: request.params.arguments?.output_dir
|
|
2072
|
+
? String(request.params.arguments?.output_dir).trim()
|
|
2073
|
+
: "";
|
|
2074
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
2023
2075
|
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2024
2076
|
const fastMode = request.params.arguments?.fast === true;
|
|
2025
2077
|
const preview = request.params.arguments?.preview === true;
|
|
@@ -2032,7 +2084,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2032
2084
|
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2033
2085
|
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2034
2086
|
try {
|
|
2035
|
-
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
|
|
2087
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2036
2088
|
}
|
|
2037
2089
|
catch (e) {
|
|
2038
2090
|
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
@@ -3,12 +3,18 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
6
7
|
export class HuggingFaceScraper {
|
|
7
8
|
/**
|
|
8
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
9
10
|
* Hits the 25k target in minutes.
|
|
10
11
|
*/
|
|
11
|
-
async scrapeBulk(limit = 1000,
|
|
12
|
+
async scrapeBulk(limit = 1000, queryOrIntent) {
|
|
13
|
+
const intent = typeof queryOrIntent === "string"
|
|
14
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
15
|
+
: queryOrIntent;
|
|
16
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
17
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
12
18
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
13
19
|
console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
|
|
14
20
|
const results = [];
|
|
@@ -18,7 +24,7 @@ export class HuggingFaceScraper {
|
|
|
18
24
|
for await (const ds of listDatasets({
|
|
19
25
|
limit: limit,
|
|
20
26
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
21
|
-
search: { query:
|
|
27
|
+
search: { query: hfQuery },
|
|
22
28
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
23
29
|
})) {
|
|
24
30
|
if (results.length >= limit)
|
|
@@ -86,8 +92,12 @@ export class HuggingFaceScraper {
|
|
|
86
92
|
}
|
|
87
93
|
return results;
|
|
88
94
|
}
|
|
89
|
-
async scrape(limit = 100, applyMVPFilters = true,
|
|
90
|
-
|
|
95
|
+
async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
|
|
96
|
+
const intent = typeof queryOrIntent === "string"
|
|
97
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
98
|
+
: queryOrIntent;
|
|
99
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
100
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
91
101
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
92
102
|
console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
|
|
93
103
|
const results = [];
|
|
@@ -113,7 +123,7 @@ export class HuggingFaceScraper {
|
|
|
113
123
|
for await (const ds of listDatasets({
|
|
114
124
|
limit: fetchLimit,
|
|
115
125
|
additionalFields: ["description", "tags"],
|
|
116
|
-
search: { query:
|
|
126
|
+
search: { query: hfQuery },
|
|
117
127
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
118
128
|
})) {
|
|
119
129
|
if (results.length >= limit)
|
|
@@ -290,6 +300,9 @@ export class HuggingFaceScraper {
|
|
|
290
300
|
description_length: description.length,
|
|
291
301
|
has_readme: !!(cardData.readme || cardData.readme_content)
|
|
292
302
|
};
|
|
303
|
+
if (intent) {
|
|
304
|
+
metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
|
|
305
|
+
}
|
|
293
306
|
results.push(metadata);
|
|
294
307
|
}
|
|
295
308
|
catch (e) {
|
|
@@ -340,8 +353,12 @@ export class HuggingFaceScraper {
|
|
|
340
353
|
if (otherErrors > 0) {
|
|
341
354
|
console.error(`[HF Scraper] ā ļø ${otherErrors} datasets skipped due to errors`);
|
|
342
355
|
}
|
|
343
|
-
|
|
344
|
-
|
|
356
|
+
return results.sort((a, b) => {
|
|
357
|
+
const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
|
|
358
|
+
if (intentDelta !== 0)
|
|
359
|
+
return intentDelta;
|
|
360
|
+
return b.downloads - a.downloads;
|
|
361
|
+
});
|
|
345
362
|
}
|
|
346
363
|
extractTask(tags) {
|
|
347
364
|
const taskTags = [
|
package/build/search/engine.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
|
|
2
3
|
import fs from "fs";
|
|
3
4
|
function log(msg) {
|
|
4
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -17,9 +18,10 @@ export class SearchEngine {
|
|
|
17
18
|
async search(query, options = {}) {
|
|
18
19
|
const limit = options.limit || 5;
|
|
19
20
|
const enableJIT = options.enableJIT !== false; // Default: true
|
|
21
|
+
const intent = await analyzeDatasetQuery(query);
|
|
20
22
|
log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
|
|
21
23
|
// 1. Perform local search
|
|
22
|
-
const localResults = await this.localSearch(query, options);
|
|
24
|
+
const localResults = await this.localSearch(query, options, intent);
|
|
23
25
|
// 2. Check if JIT should be triggered
|
|
24
26
|
const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
|
|
25
27
|
if (!shouldTrigger) {
|
|
@@ -28,10 +30,10 @@ export class SearchEngine {
|
|
|
28
30
|
}
|
|
29
31
|
// 3. Trigger JIT fallback
|
|
30
32
|
console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
|
|
31
|
-
await this.jitOrchestrator.fetchAndIngest(query, 10);
|
|
33
|
+
await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
|
|
32
34
|
// 4. Re-run local search with updated index
|
|
33
35
|
console.error(`Re-searching with updated library...`);
|
|
34
|
-
const enhancedResults = await this.localSearch(query, options);
|
|
36
|
+
const enhancedResults = await this.localSearch(query, options, intent);
|
|
35
37
|
const newCount = enhancedResults.length - localResults.length;
|
|
36
38
|
if (newCount > 0) {
|
|
37
39
|
console.error(`Found ${newCount} additional results\n`);
|
|
@@ -41,7 +43,7 @@ export class SearchEngine {
|
|
|
41
43
|
/**
|
|
42
44
|
* Perform hybrid search (Vector + Lexical + Penalties)
|
|
43
45
|
*/
|
|
44
|
-
async localSearch(query, options) {
|
|
46
|
+
async localSearch(query, options, intent) {
|
|
45
47
|
const limit = options.limit || 5;
|
|
46
48
|
// 1. Parse Query
|
|
47
49
|
const words = query.toLowerCase().split(/\s+/);
|
|
@@ -136,11 +138,13 @@ export class SearchEngine {
|
|
|
136
138
|
bonus = sourceBonuses[metadata.source] || 0;
|
|
137
139
|
// Final Combined Score
|
|
138
140
|
// 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
|
|
139
|
-
const
|
|
141
|
+
const intentScore = scoreDatasetAgainstIntent(metadata, intent);
|
|
142
|
+
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
|
|
140
143
|
metadata.relevance_score = Math.round(finalScore * 100) / 100;
|
|
141
144
|
metadata.vector_score = Math.round(vectorScore * 100) / 100;
|
|
142
145
|
metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
|
|
143
146
|
metadata.accessibility_bonus = bonus;
|
|
147
|
+
metadata.intent_score = intentScore;
|
|
144
148
|
results.push(metadata);
|
|
145
149
|
}
|
|
146
150
|
// Sort by final score and limit
|
|
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
|
2
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
|
|
5
6
|
// Common stop words to filter out for better search
|
|
6
7
|
const STOP_WORDS = new Set([
|
|
7
8
|
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
@@ -61,7 +62,7 @@ export class JITOrchestrator {
|
|
|
61
62
|
/**
|
|
62
63
|
* Main JIT workflow: fetch, save, index, return new datasets
|
|
63
64
|
*/
|
|
64
|
-
async fetchAndIngest(query, limit = 10) {
|
|
65
|
+
async fetchAndIngest(query, limit = 10, providedIntent) {
|
|
65
66
|
// Rate limiting check
|
|
66
67
|
if (!this.canTrigger(query)) {
|
|
67
68
|
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
@@ -69,9 +70,12 @@ export class JITOrchestrator {
|
|
|
69
70
|
}
|
|
70
71
|
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
71
72
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
|
-
|
|
73
|
-
const keywords = this.simplifyQuery(
|
|
74
|
-
if (
|
|
73
|
+
const intent = providedIntent || await analyzeDatasetQuery(query);
|
|
74
|
+
const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
|
|
75
|
+
if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
|
|
76
|
+
console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
|
|
77
|
+
}
|
|
78
|
+
else if (keywords.length > 0) {
|
|
75
79
|
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
80
|
}
|
|
77
81
|
const newDatasets = [];
|
|
@@ -81,15 +85,16 @@ export class JITOrchestrator {
|
|
|
81
85
|
// Get existing dataset IDs to avoid duplicates
|
|
82
86
|
const existing = this.metadataStore.getAllDatasets();
|
|
83
87
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
88
|
+
let hfResults = await this.scrapeHuggingFace(intent, limit);
|
|
89
|
+
if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
|
|
90
|
+
for (const keyword of keywords) {
|
|
91
|
+
if (hfResults.length >= limit)
|
|
92
|
+
break;
|
|
93
|
+
const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
|
|
94
|
+
for (const ds of results) {
|
|
95
|
+
if (!hfResults.some(existing => existing.id === ds.id)) {
|
|
96
|
+
hfResults.push(ds);
|
|
97
|
+
}
|
|
93
98
|
}
|
|
94
99
|
}
|
|
95
100
|
}
|
|
@@ -170,7 +175,6 @@ export class JITOrchestrator {
|
|
|
170
175
|
async scrapeHuggingFace(query, limit) {
|
|
171
176
|
const scraper = new HuggingFaceScraper();
|
|
172
177
|
try {
|
|
173
|
-
// Pass the query as a general search term
|
|
174
178
|
return await scraper.scrape(limit, true, query);
|
|
175
179
|
}
|
|
176
180
|
catch (error) {
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
import { classifyDomain } from "../metadata/domain.js";
|
|
2
|
+
const STOP_WORDS = new Set([
|
|
3
|
+
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
4
|
+
"of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
|
|
5
|
+
"be", "have", "has", "had", "do", "does", "did", "will", "would",
|
|
6
|
+
"could", "should", "may", "might", "must", "shall", "can", "need",
|
|
7
|
+
"about", "into", "through", "during", "before", "after", "above",
|
|
8
|
+
"below", "between", "under", "again", "further", "then", "once",
|
|
9
|
+
"here", "there", "when", "where", "why", "how", "all", "each",
|
|
10
|
+
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
|
11
|
+
"only", "own", "same", "so", "than", "too", "very", "just", "also",
|
|
12
|
+
"dataset", "datasets", "data", "find", "search", "looking", "need", "want",
|
|
13
|
+
"give", "show", "me", "please"
|
|
14
|
+
]);
|
|
15
|
+
const LANGUAGE_ALIASES = {
|
|
16
|
+
english: ["english", "en", "eng"],
|
|
17
|
+
spanish: ["spanish", "es", "spa"],
|
|
18
|
+
french: ["french", "fr", "fra"],
|
|
19
|
+
german: ["german", "de", "deu"],
|
|
20
|
+
portuguese: ["portuguese", "pt", "por"],
|
|
21
|
+
chinese: ["chinese", "zh", "cmn"],
|
|
22
|
+
japanese: ["japanese", "ja", "jpn"],
|
|
23
|
+
korean: ["korean", "ko", "kor"],
|
|
24
|
+
arabic: ["arabic", "ar", "ara"],
|
|
25
|
+
russian: ["russian", "ru", "rus"],
|
|
26
|
+
hindi: ["hindi", "hi", "hin"],
|
|
27
|
+
multilingual: ["multilingual", "bilingual", "cross-lingual", "crosslingual"],
|
|
28
|
+
};
|
|
29
|
+
const TASK_PATTERNS = [
|
|
30
|
+
{ task: "translation", patterns: [/\btranslation\b/i, /\bmachine translation\b/i, /\bparallel corpus\b/i] },
|
|
31
|
+
{ task: "question-answering", patterns: [/\bquestion answering\b/i, /\bqa\b/i, /\bq&a\b/i] },
|
|
32
|
+
{ task: "summarization", patterns: [/\bsummarization\b/i, /\bsummary\b/i, /\btl;dr\b/i] },
|
|
33
|
+
{ task: "sentiment-analysis", patterns: [/\bsentiment\b/i, /\bsentiment analysis\b/i] },
|
|
34
|
+
{ task: "text-classification", patterns: [/\bclassification\b/i, /\bclassifier\b/i, /\btext classification\b/i] },
|
|
35
|
+
{ task: "token-classification", patterns: [/\bner\b/i, /\bnamed entity\b/i, /\btoken classification\b/i] },
|
|
36
|
+
{ task: "text-generation", patterns: [/\btext generation\b/i, /\bgenerative\b/i, /\binstruction\b/i, /\bchat\b/i] },
|
|
37
|
+
{ task: "image-classification", patterns: [/\bimage classification\b/i] },
|
|
38
|
+
{ task: "object-detection", patterns: [/\bobject detection\b/i, /\bdetection\b/i] },
|
|
39
|
+
];
|
|
40
|
+
const intentCache = new Map();
|
|
41
|
+
export async function analyzeDatasetQuery(query, requirements) {
|
|
42
|
+
const cacheKey = `${query || ""}::${requirements || ""}`;
|
|
43
|
+
const cached = intentCache.get(cacheKey);
|
|
44
|
+
if (cached) {
|
|
45
|
+
return cached;
|
|
46
|
+
}
|
|
47
|
+
const task = (async () => {
|
|
48
|
+
const heuristic = buildHeuristicIntent(query, requirements);
|
|
49
|
+
const llmIntent = await tryLlmIntent(heuristic, requirements);
|
|
50
|
+
return llmIntent ? mergeIntent(heuristic, llmIntent) : heuristic;
|
|
51
|
+
})();
|
|
52
|
+
intentCache.set(cacheKey, task);
|
|
53
|
+
return task;
|
|
54
|
+
}
|
|
55
|
+
/**
 * Scores how well a dataset matches a parsed search intent.
 * Language, task, domain, size, and keyword matches add to the score;
 * mismatches subtract. The result is rounded to two decimals.
 * Returns 0 when no intent is supplied.
 */
export function scoreDatasetAgainstIntent(dataset, intent) {
    if (!intent)
        return 0;
    // Flatten all searchable metadata into one lowercase haystack.
    const haystack = [
        dataset.name,
        dataset.description,
        dataset.task,
        dataset.domain || "",
        dataset.tags.join(" "),
        dataset.languages.join(" "),
    ].join(" ").toLowerCase();
    let total = 0;
    if (intent.language) {
        const aliases = getLanguageAliases(intent.language);
        const declared = dataset.languages.map(normalizeToken);
        const hasLanguage = aliases.some(alias => declared.includes(alias) || haystack.includes(alias));
        if (hasLanguage) {
            total += 0.45;
        }
        else {
            // Penalize harder when the dataset explicitly declares other languages.
            total -= dataset.languages.length > 0 ? 0.55 : 0.1;
        }
    }
    if (intent.task) {
        total += matchesTask(dataset, intent.task, haystack) ? 0.35 : -0.3;
    }
    if (intent.domain && intent.domain !== "general" && intent.domain !== "unknown") {
        const datasetDomain = String(dataset.domain || "").toLowerCase();
        const domainHit = datasetDomain === intent.domain || haystack.includes(intent.domain);
        total += domainHit ? 0.25 : -0.2;
    }
    if (intent.minRows && intent.minRows > 0) {
        const totalExamples = Number(dataset.total_examples || 0);
        if (totalExamples > 0) {
            const ratio = totalExamples / intent.minRows;
            if (ratio >= 1) {
                // Reward surplus size with diminishing (logarithmic) returns.
                total += Math.min(0.45, 0.18 + (Math.log10(ratio + 1) * 0.15));
            }
            else if (ratio < 0.05) {
                total -= 1.2;
            }
            else if (ratio < 0.25) {
                total -= 0.8;
            }
            else if (ratio < 0.5) {
                total -= 0.45;
            }
            else {
                total -= 0.15;
            }
        }
        else {
            // Unknown dataset size: mild penalty rather than a hard rejection.
            total -= 0.08;
        }
    }
    if (intent.positiveTerms.length > 0) {
        const hits = intent.positiveTerms.filter(term => haystack.includes(term)).length;
        total += Math.min(0.25, hits * 0.06);
    }
    if (intent.negativeTerms.some(term => haystack.includes(term))) {
        total -= 0.7;
    }
    return Math.round(total * 100) / 100;
}
|
|
131
|
+
/** Returns the compact search string precomputed on the intent. */
export function buildIntentSearchQuery(intent) {
    const { searchQuery } = intent;
    return searchQuery;
}
|
|
134
|
+
/**
 * Builds a search intent purely from regex/keyword heuristics — no LLM.
 * Terms prefixed with "-" (e.g. "-synthetic") become exclusions.
 */
function buildHeuristicIntent(query, requirements) {
    const originalQuery = `${query || ""} ${requirements || ""}`.trim();
    const normalizedQuery = originalQuery.toLowerCase();
    const negativeTerms = [];
    for (const m of normalizedQuery.matchAll(/(?:^|\s)-([\w-]{2,})/g)) {
        negativeTerms.push(normalizeToken(m[1]));
    }
    const positiveTerms = tokenize(normalizedQuery)
        .filter(token => !negativeTerms.includes(token))
        .slice(0, 8);
    const task = detectTask(normalizedQuery);
    const language = detectLanguage(normalizedQuery);
    const domain = classifyDomain(normalizedQuery, [], normalizedQuery, task);
    const minRows = extractRequestedRows(normalizedQuery);
    // Compose the search query from the strongest signals, deduped in order.
    const specificDomain = domain !== "general" && domain !== "unknown" ? domain : undefined;
    const seen = new Set();
    const searchTerms = [];
    for (const term of [language, task, specificDomain, ...positiveTerms]) {
        if (term && !seen.has(term)) {
            seen.add(term);
            searchTerms.push(term);
        }
    }
    return {
        originalQuery,
        normalizedQuery,
        searchQuery: searchTerms.slice(0, 6).join(" ") || normalizedQuery,
        positiveTerms,
        negativeTerms,
        language,
        task: task || undefined,
        domain,
        minRows,
        llmBacked: false,
    };
}
|
|
164
|
+
/**
 * Merges an LLM-derived intent on top of the heuristic base.
 * LLM-provided scalar fields win when present; term lists are unioned
 * (LLM terms first), normalized, and deduped. The search query is rebuilt.
 */
function mergeIntent(base, llmIntent) {
    const language = llmIntent.language ? normalizeToken(llmIntent.language) : base.language;
    const task = llmIntent.task ? normalizeToken(llmIntent.task) : base.task;
    const domain = llmIntent.domain ? normalizeToken(llmIntent.domain) : base.domain;
    const llmRows = llmIntent.minRows;
    const minRows = typeof llmRows === "number" && Number.isFinite(llmRows) ? llmRows : base.minRows;
    // Union, normalize, dedupe (in that order — dedupe happens post-normalization).
    const unionTerms = (primary, secondary) =>
        Array.from(new Set([...primary, ...secondary].map(normalizeToken))).filter(Boolean);
    const positiveTerms = unionTerms(llmIntent.positiveTerms || [], base.positiveTerms);
    const negativeTerms = unionTerms(llmIntent.negativeTerms || [], base.negativeTerms);
    const merged = {
        ...base,
        language,
        task,
        domain,
        minRows,
        positiveTerms,
        negativeTerms,
        llmBacked: true,
    };
    const specificDomain = merged.domain !== "general" && merged.domain !== "unknown" ? merged.domain : undefined;
    const seen = new Set();
    const parts = [];
    for (const term of [merged.language, merged.task, specificDomain, ...merged.positiveTerms]) {
        if (term && !seen.has(term)) {
            seen.add(term);
            parts.push(term);
        }
    }
    merged.searchQuery = parts.slice(0, 6).join(" ") || merged.normalizedQuery;
    return merged;
}
|
|
191
|
+
/**
 * Best-effort LLM intent extraction. Prefers OpenAI when a key is
 * configured, then Gemini; returns undefined when no provider is
 * available or the provider call fails.
 */
async function tryLlmIntent(base, requirements) {
    if (process.env.OPENAI_API_KEY) {
        return callOpenAiIntent(base, requirements).catch(() => undefined);
    }
    const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
    return geminiKey
        ? callGeminiIntent(base, requirements, geminiKey).catch(() => undefined)
        : undefined;
}
|
|
202
|
+
/**
 * Asks the OpenAI chat-completions API to extract a structured intent.
 * Aborts after 5s so intent extraction never stalls a search.
 * Returns undefined on any non-OK response; network/abort errors are
 * handled by the caller's .catch.
 */
async function callOpenAiIntent(base, requirements) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const payload = {
            model: process.env.OPENAI_MODEL || "gpt-4o-mini",
            temperature: 0,
            response_format: { type: "json_object" },
            messages: [
                {
                    role: "system",
                    content: "Extract dataset search intent as JSON with keys: language, task, domain, minRows, positiveTerms, negativeTerms. Use null for unknowns.",
                },
                {
                    role: "user",
                    content: JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base }),
                },
            ],
        };
        const response = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            },
            body: JSON.stringify(payload),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        return parseIntentPayload(body?.choices?.[0]?.message?.content);
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
240
|
+
/**
 * Asks the Gemini generateContent API to extract a structured intent.
 * Aborts after 5s; returns undefined on any non-OK response. Network and
 * abort errors are handled by the caller's .catch.
 */
async function callGeminiIntent(base, requirements, apiKey) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const model = process.env.GEMINI_MODEL || "gemini-1.5-flash";
        const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${encodeURIComponent(apiKey)}`;
        const prompt = `Extract dataset search intent as JSON with keys language, task, domain, minRows, positiveTerms, negativeTerms. Query payload: ${JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base })}`;
        const response = await fetch(endpoint, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
            },
            body: JSON.stringify({
                generationConfig: {
                    temperature: 0,
                    responseMimeType: "application/json",
                },
                contents: [{
                    role: "user",
                    parts: [{ text: prompt }],
                }],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        return parseIntentPayload(body?.candidates?.[0]?.content?.parts?.[0]?.text);
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
275
|
+
/**
 * Parses an LLM response body into a partial intent object.
 * Accepts both camelCase and snake_case keys; non-conforming values are
 * dropped (left undefined). Returns undefined on empty/unparseable input.
 */
function parseIntentPayload(content) {
    if (typeof content !== "string" || !content.trim()) {
        return undefined;
    }
    const jsonText = extractJsonObject(content);
    if (!jsonText) {
        return undefined;
    }
    let parsed;
    try {
        parsed = JSON.parse(jsonText);
    }
    catch {
        return undefined;
    }
    // Narrow coercers: anything that is not the expected type becomes undefined.
    const asString = (value) => (typeof value === "string" ? value : undefined);
    const asNumber = (value) => (typeof value === "number" ? value : undefined);
    const asStringArray = (value) => (Array.isArray(value)
        ? value.filter((item) => typeof item === "string")
        : undefined);
    return {
        language: asString(parsed.language),
        task: asString(parsed.task),
        domain: asString(parsed.domain),
        minRows: asNumber(parsed.minRows) ?? asNumber(parsed.min_rows),
        positiveTerms: asStringArray(parsed.positiveTerms) ?? asStringArray(parsed.positive_terms),
        negativeTerms: asStringArray(parsed.negativeTerms) ?? asStringArray(parsed.negative_terms),
    };
}
|
|
310
|
+
/**
 * Pulls the outermost {...} JSON object out of a (possibly fenced or
 * prose-wrapped) LLM response. Returns undefined when no braces are found.
 */
function extractJsonObject(text) {
    const trimmed = text.trim();
    // Fast path: the payload is already a bare JSON object.
    if (trimmed.startsWith("{") && trimmed.endsWith("}")) {
        return trimmed;
    }
    // Otherwise take the span from the first "{" to the last "}".
    const start = trimmed.indexOf("{");
    const end = trimmed.lastIndexOf("}");
    return start >= 0 && end > start ? trimmed.slice(start, end + 1) : undefined;
}
|
|
322
|
+
/**
 * Returns the first language from LANGUAGE_ALIASES whose alias appears in
 * the text as a standalone word (non-letter boundaries), else undefined.
 */
function detectLanguage(text) {
    for (const [language, aliases] of Object.entries(LANGUAGE_ALIASES)) {
        const found = aliases.some(alias => {
            const standalone = new RegExp(`(^|[^a-z])${escapeRegex(alias)}([^a-z]|$)`, "i");
            return standalone.test(text);
        });
        if (found) {
            return language;
        }
    }
    return undefined;
}
|
|
330
|
+
/** Returns the first TASK_PATTERNS entry whose regexes match, else undefined. */
function detectTask(text) {
    for (const entry of TASK_PATTERNS) {
        if (entry.patterns.some(pattern => pattern.test(text))) {
            return entry.task;
        }
    }
    return undefined;
}
|
|
334
|
+
/**
 * Splits text into unique normalized keyword tokens, dropping stop words,
 * pure numbers, and anything shorter than 3 characters.
 */
function tokenize(text) {
    const cleaned = text.replace(/[^\w\s-]/g, " ");
    const tokens = cleaned
        .split(/\s+/)
        .map(normalizeToken)
        .filter(token => token.length > 2 && !STOP_WORDS.has(token) && !/^\d+$/.test(token));
    return [...new Set(tokens)];
}
|
|
341
|
+
/**
 * Lowercases a token and strips leading non-alphanumerics plus trailing
 * characters outside [a-z0-9-] (inner hyphens are preserved).
 */
function normalizeToken(value) {
    const lowered = value.toLowerCase();
    return lowered.replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
}
|
|
344
|
+
/**
 * Extracts a requested dataset size (row count) from free text.
 * Tries, in order: explicit "N rows/samples/records", human-sized
 * "10k samples", any comma-grouped number, any "3.5m"-style figure,
 * then any bare 4-9 digit number. Returns undefined when nothing matches.
 */
function extractRequestedRows(text) {
    const suffixMultiplier = (suffix) => {
        switch (suffix.toLowerCase()) {
            case "k": return 1_000;
            case "m": return 1_000_000;
            default: return 1_000_000_000; // "b"
        }
    };
    // 1) Digits (with , or space separators) directly before a unit word.
    const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
    if (explicit) {
        const value = Number(explicit[1].replace(/[\s,]/g, ""));
        if (Number.isFinite(value) && value > 0) {
            return value;
        }
    }
    // 2) Human-readable suffix directly before a unit word.
    const humanSized = text.match(/(\d+(?:\.\d+)?)\s*([kmb])\s*(samples?|rows?|records?)/i);
    if (humanSized) {
        const value = Math.round(Number(humanSized[1]) * suffixMultiplier(humanSized[2]));
        if (Number.isFinite(value) && value > 0) {
            return value;
        }
    }
    // 3) Any comma-grouped number anywhere in the text.
    const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
        .map(m => Number(m[0].replace(/,/g, "")))
        .filter(v => Number.isFinite(v) && v > 0);
    if (commaNumbers.length > 0) {
        return Math.max(...commaNumbers);
    }
    // 4) Any "3.5m"-style figure anywhere in the text.
    const humanAnywhere = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
        .map(m => Math.round(Number(m[1]) * suffixMultiplier(m[2])))
        .filter(v => Number.isFinite(v) && v > 0);
    if (humanAnywhere.length > 0) {
        return Math.max(...humanAnywhere);
    }
    // 5) Last resort: any bare 4-9 digit number.
    const bare = [...text.matchAll(/\b\d{4,9}\b/g)]
        .map(m => Number(m[0]))
        .filter(v => Number.isFinite(v) && v > 0);
    return bare.length > 0 ? Math.max(...bare) : undefined;
}
|
|
387
|
+
/**
 * Checks whether a dataset matches the requested task, comparing against
 * a small synonym table so "qa" matches "question-answering", etc.
 */
function matchesTask(dataset, task, text) {
    const normalizedTask = normalizeToken(task);
    const synonyms = {
        "question-answering": ["question-answering", "qa", "question answering"],
        "text-classification": ["text-classification", "classification", "text classification"],
        "token-classification": ["token-classification", "ner", "named entity"],
        "sentiment-analysis": ["sentiment-analysis", "sentiment"],
        translation: ["translation", "machine-translation", "parallel corpus"],
        summarization: ["summarization", "summary"],
        "text-generation": ["text-generation", "generation", "chat", "instruction"],
        "image-classification": ["image-classification", "image classification"],
        "object-detection": ["object-detection", "object detection"],
    };
    const variants = synonyms[normalizedTask] || [normalizedTask];
    const datasetTask = normalizeToken(dataset.task);
    return variants.some(variant => datasetTask.includes(variant) || text.includes(variant));
}
|
|
403
|
+
/** Returns the normalized alias list for a language (the name itself if unknown). */
function getLanguageAliases(language) {
    const normalized = normalizeToken(language);
    const aliases = LANGUAGE_ALIASES[normalized] || [normalized];
    return aliases.map(normalizeToken);
}
|
|
407
|
+
/** Backslash-escapes every character that is special inside a RegExp. */
function escapeRegex(value) {
    return value.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import os from "os";
|
|
4
|
+
import path from "path";
|
|
5
|
+
/** Resolves the user's home directory, falling back to env vars then buildDir. */
function getHomeDir(buildDir) {
    const candidates = [os.homedir(), process.env.HOME, process.env.USERPROFILE];
    return candidates.find(Boolean) ?? buildDir;
}
|
|
8
|
+
/** All Vesper state (venv, data, caches) lives under <home>/.vesper. */
export function getVesperDataRoot(buildDir = process.cwd()) {
    const home = getHomeDir(buildDir);
    return path.join(home, ".vesper");
}
|
|
11
|
+
/** Path of the managed venv's Python interpreter (platform-dependent layout). */
export function getManagedPythonPath(buildDir = process.cwd()) {
    const dataRoot = getVesperDataRoot(buildDir);
    const interpreter = process.platform === "win32"
        ? [".venv", "Scripts", "python.exe"]
        : [".venv", "bin", "python"];
    return path.join(dataRoot, ...interpreter);
}
|
|
17
|
+
/** PATH-based interpreter: the "py" launcher on Windows, python3 elsewhere. */
function getFallbackPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python3";
}
|
|
20
|
+
/**
 * Resolves the Python interpreter to use, in priority order:
 * 1) the managed ~/.vesper venv, 2) the VESPER_PYTHON override,
 * 3) a project-local .venv beside or above buildDir, 4) PATH fallback.
 */
export function resolvePythonCommand(buildDir = process.cwd()) {
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    if (process.env.VESPER_PYTHON) {
        return process.env.VESPER_PYTHON;
    }
    const interpreter = process.platform === "win32"
        ? ["Scripts", "python.exe"]
        : ["bin", "python"];
    const localCandidates = [
        path.resolve(buildDir, ".venv", ...interpreter),
        path.resolve(buildDir, "..", ".venv", ...interpreter),
    ];
    const found = localCandidates.find(candidate => fs.existsSync(candidate));
    return found ?? getFallbackPythonCommand();
}
|
|
45
|
+
/**
 * Runs a command and resolves with { code, stdout, stderr }.
 * Never rejects on non-zero exit; rejects only when the process cannot be
 * spawned at all. On timeout the process is killed and the promise resolves
 * with code 124 (mirroring coreutils `timeout`).
 *
 * Fix: the promise could previously settle twice — the timeout path resolved
 * and the subsequent "close" event resolved again, and an "error" emitted
 * after timeout produced a lost rejection. A single-settle guard now ensures
 * exactly one outcome wins.
 */
function runPythonCommand(pythonPath, args, timeoutMs = 300000) {
    return new Promise((resolve, reject) => {
        const proc = spawn(pythonPath, args, {
            env: {
                ...process.env,
                // Force UTF-8 so non-ASCII output does not crash on Windows.
                PYTHONIOENCODING: "utf-8",
            },
        });
        let stdout = "";
        let stderr = "";
        let settled = false;
        // First settle wins; later events are ignored.
        const settle = (fn, value) => {
            if (!settled) {
                settled = true;
                clearTimeout(timer);
                fn(value);
            }
        };
        const timer = setTimeout(() => {
            proc.kill();
            settle(resolve, { code: 124, stdout, stderr: stderr || `Python command timed out after ${timeoutMs}ms` });
        }, timeoutMs);
        proc.stdout.on("data", (data) => {
            stdout += data.toString();
        });
        proc.stderr.on("data", (data) => {
            stderr += data.toString();
        });
        proc.on("close", (code) => {
            settle(resolve, { code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            settle(reject, error);
        });
    });
}
|
|
75
|
+
/**
 * Creates (or reuses) the managed ~/.vesper/.venv Python environment.
 * Tries each platform-appropriate bootstrap interpreter in order and
 * upgrades pip in the fresh venv. Throws with the last error when every
 * attempt fails.
 */
async function createManagedPythonEnv(buildDir) {
    const dataRoot = getVesperDataRoot(buildDir);
    const venvDir = path.join(dataRoot, ".venv");
    const managedPython = getManagedPythonPath(buildDir);
    // Reuse an existing environment; creation is expensive.
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    fs.mkdirSync(dataRoot, { recursive: true });
    const venvArgs = ["-m", "venv", venvDir];
    const bootstrapAttempts = process.platform === "win32"
        ? [
            { command: "py", args: ["-3", ...venvArgs] },
            { command: "python", args: venvArgs },
        ]
        : [
            { command: "python3", args: venvArgs },
            { command: "python", args: venvArgs },
        ];
    let lastError = "";
    for (const { command, args } of bootstrapAttempts) {
        try {
            const result = await runPythonCommand(command, args, 180000);
            if (result.code === 0 && fs.existsSync(managedPython)) {
                // Fresh venvs often ship an outdated pip; upgrade it up front.
                await runPythonCommand(managedPython, ["-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "pip"], 300000);
                return managedPython;
            }
            lastError = (result.stderr || result.stdout || "Unknown venv creation error").trim();
        }
        catch (error) {
            lastError = error?.message || String(error);
        }
    }
    throw new Error(`Failed to create Vesper Python environment. ${lastError}`.trim());
}
|
|
108
|
+
/**
 * Ensures the given Python requirements ({ module, packageName }) are
 * importable, installing any missing packages with pip. Falls back to the
 * resolved PATH interpreter if the managed venv cannot be created.
 * Returns the interpreter path used; throws when pip install fails.
 */
export async function ensurePythonPackages(buildDir, requirements) {
    const pythonPath = await createManagedPythonEnv(buildDir).catch(() => resolvePythonCommand(buildDir));
    const missing = [];
    for (const requirement of requirements) {
        // Probe importability without importing (find_spec is side-effect free).
        const probe = `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(requirement.module)}) else 1)`;
        const check = await runPythonCommand(pythonPath, ["-c", probe], 20000);
        if (check.code !== 0) {
            missing.push(requirement);
        }
    }
    if (missing.length === 0) {
        return pythonPath;
    }
    const packages = [...new Set(missing.map(({ packageName }) => packageName))];
    const install = await runPythonCommand(pythonPath, ["-m", "pip", "install", "--disable-pip-version-check", ...packages], 600000);
    if (install.code !== 0) {
        const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
        throw new Error(`Failed to install Python packages (${packages.join(", ")}). ${details}`);
    }
    return pythonPath;
}
|
package/package.json
CHANGED
package/scripts/postinstall.cjs
CHANGED
|
@@ -2,13 +2,34 @@
|
|
|
2
2
|
|
|
3
3
|
const { execSync } = require('child_process');
|
|
4
4
|
const fs = require('fs');
|
|
5
|
+
const os = require('os');
|
|
5
6
|
const path = require('path');
|
|
6
7
|
|
|
7
8
|
console.log('\nš Setting up Vesper MCP Server...\n');
|
|
8
9
|
|
|
10
|
+
/**
 * Finds a usable Python launcher on PATH, most specific first per platform.
 * Returns the launcher string (e.g. "py -3" or "python3") or null.
 */
function getPythonBootstrapCommand() {
  const candidates = process.platform === 'win32'
    ? ['py -3', 'python']
    : ['python3', 'python'];

  for (const candidate of candidates) {
    try {
      // A successful --version probe means the launcher is usable.
      execSync(`${candidate} --version`, { stdio: 'pipe' });
      return candidate;
    } catch {
      // Probe failed; fall through to the next candidate.
    }
  }

  return null;
}
|
|
26
|
+
|
|
9
27
|
// 1. Check for Python
|
|
28
|
+
const pythonBootstrap = getPythonBootstrapCommand();
|
|
10
29
|
try {
|
|
11
|
-
|
|
30
|
+
if (!pythonBootstrap) {
|
|
31
|
+
throw new Error('Python not found');
|
|
32
|
+
}
|
|
12
33
|
console.log('ā
Python found');
|
|
13
34
|
} catch (e) {
|
|
14
35
|
console.warn('ā ļø Python not found. Please install Python 3.8+ for full functionality.');
|
|
@@ -16,36 +37,15 @@ try {
|
|
|
16
37
|
process.exit(0); // Don't fail installation
|
|
17
38
|
}
|
|
18
39
|
|
|
19
|
-
|
|
20
|
-
console.log('\nš¦ Installing Python dependencies...');
|
|
21
|
-
const pythonPackages = [
|
|
22
|
-
'opencv-python',
|
|
23
|
-
'pillow',
|
|
24
|
-
'numpy',
|
|
25
|
-
'librosa',
|
|
26
|
-
'soundfile',
|
|
27
|
-
'aiohttp',
|
|
28
|
-
'aiofiles',
|
|
29
|
-
'datasets',
|
|
30
|
-
'webdataset',
|
|
31
|
-
'kaggle'
|
|
32
|
-
];
|
|
33
|
-
|
|
34
|
-
try {
|
|
35
|
-
execSync(`python -m pip install ${pythonPackages.join(' ')}`, {
|
|
36
|
-
stdio: 'inherit',
|
|
37
|
-
timeout: 120000 // 2 minutes timeout
|
|
38
|
-
});
|
|
39
|
-
console.log('ā
Python dependencies installed');
|
|
40
|
-
} catch (e) {
|
|
41
|
-
console.warn('ā ļø Failed to install some Python dependencies.');
|
|
42
|
-
console.warn(' You may need to install them manually:');
|
|
43
|
-
console.warn(` pip install ${pythonPackages.join(' ')}\n`);
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
// 3. Create data directories
|
|
47
|
-
const homeDir = process.env.HOME || process.env.USERPROFILE;
|
|
40
|
+
const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE;
|
|
48
41
|
const vesperDataDir = path.join(homeDir, '.vesper');
|
|
42
|
+
const managedVenvDir = path.join(vesperDataDir, '.venv');
|
|
43
|
+
const managedPython = process.platform === 'win32'
|
|
44
|
+
? path.join(managedVenvDir, 'Scripts', 'python.exe')
|
|
45
|
+
: path.join(managedVenvDir, 'bin', 'python');
|
|
46
|
+
const requirementsPath = path.resolve(__dirname, '..', 'requirements.txt');
|
|
47
|
+
|
|
48
|
+
// 2. Create data directories
|
|
49
49
|
const dirs = [
|
|
50
50
|
vesperDataDir,
|
|
51
51
|
path.join(vesperDataDir, 'data'),
|
|
@@ -62,7 +62,49 @@ dirs.forEach(dir => {
|
|
|
62
62
|
|
|
63
63
|
console.log(`ā
Data directories created at ${vesperDataDir}`);
|
|
64
64
|
|
|
65
|
-
//
|
|
65
|
+
// 3. Create a managed Vesper Python environment
|
|
66
|
+
console.log('\nš Preparing managed Python environment...');
|
|
67
|
+
try {
|
|
68
|
+
if (!fs.existsSync(managedPython)) {
|
|
69
|
+
execSync(`${pythonBootstrap} -m venv "${managedVenvDir}"`, {
|
|
70
|
+
stdio: 'inherit',
|
|
71
|
+
timeout: 180000,
|
|
72
|
+
});
|
|
73
|
+
}
|
|
74
|
+
console.log(`ā
Managed Python ready at ${managedVenvDir}`);
|
|
75
|
+
} catch (e) {
|
|
76
|
+
console.warn('ā ļø Failed to create the managed Vesper Python environment.');
|
|
77
|
+
console.warn(` Vesper will fall back to PATH Python and may need to self-heal at runtime. ${(e && e.message) || ''}`.trim());
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// 4. Install Python dependencies into the managed environment
|
|
81
|
+
console.log('\nš¦ Installing Python dependencies...');
|
|
82
|
+
const pythonPackages = [
|
|
83
|
+
'opencv-python',
|
|
84
|
+
'pillow',
|
|
85
|
+
'librosa',
|
|
86
|
+
'soundfile',
|
|
87
|
+
'pyarrow'
|
|
88
|
+
];
|
|
89
|
+
|
|
90
|
+
try {
|
|
91
|
+
const targetPython = fs.existsSync(managedPython) ? `"${managedPython}"` : pythonBootstrap;
|
|
92
|
+
execSync(`${targetPython} -m pip install --disable-pip-version-check --upgrade pip`, {
|
|
93
|
+
stdio: 'inherit',
|
|
94
|
+
timeout: 180000,
|
|
95
|
+
});
|
|
96
|
+
execSync(`${targetPython} -m pip install --disable-pip-version-check -r "${requirementsPath}" ${pythonPackages.join(' ')}`, {
|
|
97
|
+
stdio: 'inherit',
|
|
98
|
+
timeout: 600000,
|
|
99
|
+
});
|
|
100
|
+
console.log('ā
Python dependencies installed');
|
|
101
|
+
} catch (e) {
|
|
102
|
+
console.warn('ā ļø Failed to install some Python dependencies.');
|
|
103
|
+
console.warn(' You may need to install them manually into the Vesper runtime:');
|
|
104
|
+
console.warn(` ${fs.existsSync(managedPython) ? managedPython : pythonBootstrap} -m pip install -r "${requirementsPath}" ${pythonPackages.join(' ')}\n`);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// 5. Rebuild better-sqlite3 for current Node.js version
|
|
66
108
|
console.log('\nš§ Rebuilding native modules for current Node.js...');
|
|
67
109
|
try {
|
|
68
110
|
execSync('npm rebuild better-sqlite3', {
|
|
@@ -76,7 +118,7 @@ try {
|
|
|
76
118
|
console.warn(' If you see ERR_DLOPEN_FAILED, run: npm rebuild better-sqlite3');
|
|
77
119
|
}
|
|
78
120
|
|
|
79
|
-
//
|
|
121
|
+
// 6. Auto-configure Claude Desktop (Best Effort)
|
|
80
122
|
console.log('\nāļø Attempting to auto-configure Claude Desktop...');
|
|
81
123
|
|
|
82
124
|
function getClaudeConfigPath() {
|