vesper-wizard 2.1.6 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +153 -5
- package/build/install/install-service.js +6 -1
- package/build/metadata/scraper.js +17 -6
- package/build/python/convert_engine.py +92 -0
- package/build/search/engine.js +7 -1
- package/build/search/query-intent.js +102 -2
- package/package.json +1 -1
- package/src/python/convert_engine.py +92 -0
package/build/index.js
CHANGED
|
@@ -896,6 +896,49 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
|
896
896
|
quality_score: qualityScore
|
|
897
897
|
});
|
|
898
898
|
}
|
|
899
|
+
else {
|
|
900
|
+
// Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
|
|
901
|
+
try {
|
|
902
|
+
const existingMeta = metadataStore.getDataset(datasetIdForDownload);
|
|
903
|
+
if (!existingMeta) {
|
|
904
|
+
metadataStore.saveDataset({
|
|
905
|
+
id: datasetIdForDownload,
|
|
906
|
+
source: source,
|
|
907
|
+
name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
|
|
908
|
+
description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
|
|
909
|
+
quality_warnings: [],
|
|
910
|
+
downloads: 0,
|
|
911
|
+
likes: 0,
|
|
912
|
+
stars: 0,
|
|
913
|
+
tags: [],
|
|
914
|
+
last_updated: new Date().toISOString(),
|
|
915
|
+
task: "unknown",
|
|
916
|
+
domain: "unknown",
|
|
917
|
+
languages: [],
|
|
918
|
+
splits: [],
|
|
919
|
+
license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
|
|
920
|
+
quality_score: qualityScore,
|
|
921
|
+
download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
|
|
922
|
+
total_examples: 0,
|
|
923
|
+
is_structured: false,
|
|
924
|
+
has_target_column: false,
|
|
925
|
+
is_safe_source: true,
|
|
926
|
+
has_personal_data: false,
|
|
927
|
+
is_paywalled: false,
|
|
928
|
+
is_scraped_web_data: false,
|
|
929
|
+
uses_https: true,
|
|
930
|
+
has_train_split: false,
|
|
931
|
+
has_test_split: false,
|
|
932
|
+
has_validation_split: false,
|
|
933
|
+
description_length: 0,
|
|
934
|
+
has_readme: false,
|
|
935
|
+
});
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
catch (e) {
|
|
939
|
+
console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
940
|
+
}
|
|
941
|
+
}
|
|
899
942
|
markPipelineStep("register", "running");
|
|
900
943
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
901
944
|
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
@@ -1309,6 +1352,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1309
1352
|
properties: {},
|
|
1310
1353
|
},
|
|
1311
1354
|
},
|
|
1355
|
+
{
|
|
1356
|
+
name: "vesper_convert_format",
|
|
1357
|
+
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
1358
|
+
inputSchema: {
|
|
1359
|
+
type: "object",
|
|
1360
|
+
properties: {
|
|
1361
|
+
file_path: {
|
|
1362
|
+
type: "string",
|
|
1363
|
+
description: "Absolute path to the input dataset file.",
|
|
1364
|
+
},
|
|
1365
|
+
target_format: {
|
|
1366
|
+
type: "string",
|
|
1367
|
+
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1368
|
+
description: "The desired output format.",
|
|
1369
|
+
},
|
|
1370
|
+
},
|
|
1371
|
+
required: ["file_path", "target_format"],
|
|
1372
|
+
},
|
|
1373
|
+
},
|
|
1312
1374
|
{
|
|
1313
1375
|
name: "fuse_datasets",
|
|
1314
1376
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -1741,7 +1803,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1741
1803
|
max_items: maxItems,
|
|
1742
1804
|
workers,
|
|
1743
1805
|
image_column: imageColumn,
|
|
1744
|
-
output_root:
|
|
1806
|
+
output_root: requestedOutputDir || process.cwd(),
|
|
1745
1807
|
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1746
1808
|
};
|
|
1747
1809
|
try {
|
|
@@ -1845,8 +1907,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1845
1907
|
}
|
|
1846
1908
|
const dataset = metadataStore.getDataset(datasetId);
|
|
1847
1909
|
if (!dataset) {
|
|
1910
|
+
// Fallback: check the registry for local path info
|
|
1911
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
1912
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
1913
|
+
if (regEntry) {
|
|
1914
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
1915
|
+
return {
|
|
1916
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
1917
|
+
};
|
|
1918
|
+
}
|
|
1848
1919
|
return {
|
|
1849
|
-
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}
|
|
1920
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1850
1921
|
isError: true,
|
|
1851
1922
|
};
|
|
1852
1923
|
}
|
|
@@ -2167,7 +2238,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2167
2238
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2168
2239
|
const ext = extMap[requestedFormat] || ".feather";
|
|
2169
2240
|
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2170
|
-
const outDir = targetDir
|
|
2241
|
+
const outDir = targetDir;
|
|
2171
2242
|
if (!fs.existsSync(outDir))
|
|
2172
2243
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2173
2244
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
@@ -2203,6 +2274,80 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2203
2274
|
};
|
|
2204
2275
|
}
|
|
2205
2276
|
}
|
|
2277
|
+
case "vesper_list_datasets": {
|
|
2278
|
+
const entries = readRegistry();
|
|
2279
|
+
if (entries.length === 0) {
|
|
2280
|
+
return {
|
|
2281
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2282
|
+
};
|
|
2283
|
+
}
|
|
2284
|
+
const lines = entries.map((e, i) => {
|
|
2285
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
2286
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
2287
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2288
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
2289
|
+
});
|
|
2290
|
+
return {
|
|
2291
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2292
|
+
};
|
|
2293
|
+
}
|
|
2294
|
+
case "vesper_convert_format": {
|
|
2295
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2296
|
+
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
2297
|
+
if (!filePath) {
|
|
2298
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
2299
|
+
}
|
|
2300
|
+
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
2301
|
+
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
2302
|
+
}
|
|
2303
|
+
if (!fs.existsSync(filePath)) {
|
|
2304
|
+
return {
|
|
2305
|
+
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
2306
|
+
isError: true,
|
|
2307
|
+
};
|
|
2308
|
+
}
|
|
2309
|
+
const inputExt = path.extname(filePath).toLowerCase();
|
|
2310
|
+
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
2311
|
+
const outputExt = extMap[targetFormat];
|
|
2312
|
+
if (inputExt === outputExt) {
|
|
2313
|
+
return {
|
|
2314
|
+
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
2315
|
+
};
|
|
2316
|
+
}
|
|
2317
|
+
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
2318
|
+
try {
|
|
2319
|
+
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2320
|
+
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
2321
|
+
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
2322
|
+
if (!result.ok) {
|
|
2323
|
+
return {
|
|
2324
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
2325
|
+
isError: true,
|
|
2326
|
+
};
|
|
2327
|
+
}
|
|
2328
|
+
// Register converted file in the registry
|
|
2329
|
+
const datasetId = path.basename(outputPath, outputExt);
|
|
2330
|
+
try {
|
|
2331
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
2332
|
+
}
|
|
2333
|
+
catch (e) {
|
|
2334
|
+
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2335
|
+
}
|
|
2336
|
+
let msg = `**Conversion complete**\n`;
|
|
2337
|
+
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2338
|
+
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
2339
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2340
|
+
if (result.size_mb !== undefined)
|
|
2341
|
+
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
2342
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2343
|
+
}
|
|
2344
|
+
catch (error) {
|
|
2345
|
+
return {
|
|
2346
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
2347
|
+
isError: true,
|
|
2348
|
+
};
|
|
2349
|
+
}
|
|
2350
|
+
}
|
|
2206
2351
|
case "fuse_datasets": {
|
|
2207
2352
|
const rawSources = request.params.arguments?.sources;
|
|
2208
2353
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
@@ -2243,10 +2388,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2243
2388
|
try {
|
|
2244
2389
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
2245
2390
|
const ext = extMap[outputFormat] || ".feather";
|
|
2246
|
-
const outDir =
|
|
2391
|
+
const outDir = process.cwd();
|
|
2247
2392
|
if (!fs.existsSync(outDir))
|
|
2248
2393
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2249
2394
|
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
2395
|
+
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
2250
2396
|
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
2251
2397
|
strategy,
|
|
2252
2398
|
join_on: joinOn,
|
|
@@ -2805,10 +2951,12 @@ async function runExportCli(args) {
|
|
|
2805
2951
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2806
2952
|
const ext = extMap[requestedFormat] || ".parquet";
|
|
2807
2953
|
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2808
|
-
const outDir = targetDir ||
|
|
2954
|
+
const outDir = targetDir || process.cwd();
|
|
2809
2955
|
if (!fs.existsSync(outDir))
|
|
2810
2956
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2811
2957
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2958
|
+
console.error(`[Export] Resolved output directory: ${outDir}`);
|
|
2959
|
+
console.error(`[Export] Output file: ${outputFile}`);
|
|
2812
2960
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2813
2961
|
console.log(`Export complete: ${result.output_path}`);
|
|
2814
2962
|
console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
|
|
@@ -21,7 +21,12 @@ export class InstallService {
|
|
|
21
21
|
// Create target directory
|
|
22
22
|
const installLabel = dataset?.name || datasetId;
|
|
23
23
|
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
-
|
|
24
|
+
// If caller specified a target dir, use it directly
|
|
25
|
+
// Otherwise use the current working directory
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
+
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
25
30
|
if (!fs.existsSync(installDir)) {
|
|
26
31
|
fs.mkdirSync(installDir, { recursive: true });
|
|
27
32
|
}
|
|
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
-
import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
|
|
7
7
|
export class HuggingFaceScraper {
|
|
8
8
|
/**
|
|
9
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
@@ -21,10 +21,11 @@ export class HuggingFaceScraper {
|
|
|
21
21
|
let processed = 0;
|
|
22
22
|
try {
|
|
23
23
|
const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
|
|
24
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
24
25
|
for await (const ds of listDatasets({
|
|
25
26
|
limit: limit,
|
|
26
27
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
27
|
-
search: { query: hfQuery },
|
|
28
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
28
29
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
29
30
|
})) {
|
|
30
31
|
if (results.length >= limit)
|
|
@@ -84,6 +85,9 @@ export class HuggingFaceScraper {
|
|
|
84
85
|
has_readme: false,
|
|
85
86
|
is_incomplete: true // Flag for Phase 2
|
|
86
87
|
};
|
|
88
|
+
// Hard language exclusion
|
|
89
|
+
if (intent && shouldExcludeByLanguage(metadata, intent))
|
|
90
|
+
continue;
|
|
87
91
|
results.push(metadata);
|
|
88
92
|
}
|
|
89
93
|
}
|
|
@@ -120,10 +124,11 @@ export class HuggingFaceScraper {
|
|
|
120
124
|
}
|
|
121
125
|
// Add delay between batches to avoid rate limits
|
|
122
126
|
const BATCH_DELAY = hfToken ? 500 : 2000;
|
|
127
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
123
128
|
for await (const ds of listDatasets({
|
|
124
129
|
limit: fetchLimit,
|
|
125
130
|
additionalFields: ["description", "tags"],
|
|
126
|
-
search: { query: hfQuery },
|
|
131
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
127
132
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
128
133
|
})) {
|
|
129
134
|
if (results.length >= limit)
|
|
@@ -300,10 +305,16 @@ export class HuggingFaceScraper {
|
|
|
300
305
|
description_length: description.length,
|
|
301
306
|
has_readme: !!(cardData.readme || cardData.readme_content)
|
|
302
307
|
};
|
|
303
|
-
|
|
304
|
-
|
|
308
|
+
// Hard language exclusion — drop bilingual/multilingual for single-language queries
|
|
309
|
+
if (intent && shouldExcludeByLanguage(metadata, intent)) {
|
|
310
|
+
// skip — do not push
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
if (intent) {
|
|
314
|
+
metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
|
|
315
|
+
}
|
|
316
|
+
results.push(metadata);
|
|
305
317
|
}
|
|
306
|
-
results.push(metadata);
|
|
307
318
|
}
|
|
308
319
|
catch (e) {
|
|
309
320
|
// Track all errors for user feedback
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read ``src`` into a polars DataFrame, choosing the reader by file extension.

    Supported extensions: .csv, .tsv/.tab, .parquet/.pq,
    .feather/.ftr/.arrow/.ipc, .jsonl/.ndjson and .json. Any other extension
    is attempted as CSV.

    A ``.json`` file may hold a top-level array, a single object that wraps the
    records under a well-known key (or under any key whose value is a non-empty
    list of objects), or newline-delimited JSON saved with a ``.json`` suffix.

    Raises whatever the underlying reader or ``json.loads`` raises when the
    content cannot be parsed.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager so the handle is closed even if reading fails.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        try:
            # Parse as one document first: a pretty-printed object also has a
            # first line starting with "{", so the first line alone cannot
            # distinguish it from newline-delimited JSON.
            obj = json.loads(raw)
        except json.JSONDecodeError:
            # Not a single document; if it begins with an object it is most
            # likely newline-delimited JSON with a .json suffix.
            if raw.startswith("{"):
                return pl.read_ndjson(src)
            raise
        if isinstance(obj, dict):
            # Prefer conventional wrapper keys, then any list-of-objects value.
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.

    Supported extensions: .parquet/.pq, .csv, .json (row-oriented) and
    .jsonl/.ndjson. The parent directory of ``dst`` is created if missing.

    Raises:
        ValueError: if the extension is not one of the supported formats.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        try:
            df.write_json(dst, row_oriented=True)
        except TypeError:
            # NOTE(review): newer polars releases appear to have dropped the
            # `row_oriented` keyword (write_json is row-oriented there), so the
            # keyword call fails with TypeError on them - confirm against the
            # polars versions this package supports.
            df.write_json(dst)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Prints exactly one JSON object to stdout. On success it carries ``ok``,
    ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any failure it
    carries ``ok: false`` and ``error``, and the process exits with status 1.
    """

    def _fail(message):
        # Single place that emits the failure payload and terminates.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        _fail("Usage: convert_engine.py <input> <output>")

    input_path, output_path = cli_args[0], cli_args[1]

    if not os.path.exists(input_path):
        _fail(f"File not found: {input_path}")

    try:
        df = _load(input_path)
        _write(df, output_path)
        bytes_written = os.path.getsize(output_path)
        report = {
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": round(bytes_written / (1024 * 1024), 2),
        }
        print(json.dumps(report))
    except Exception as e:
        _fail(str(e))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|
package/build/search/engine.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
-
import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
|
|
3
3
|
import fs from "fs";
|
|
4
4
|
function log(msg) {
|
|
5
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -68,6 +68,12 @@ export class SearchEngine {
|
|
|
68
68
|
// Filter: Safe only
|
|
69
69
|
if (options.safeOnly && metadata.license.category === "restricted")
|
|
70
70
|
continue;
|
|
71
|
+
// Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
|
|
72
|
+
// when user explicitly requested a single language
|
|
73
|
+
if (shouldExcludeByLanguage(metadata, intent)) {
|
|
74
|
+
log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
71
77
|
const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
|
|
72
78
|
// Filter: Explicit Negative Keywords
|
|
73
79
|
if (negativeKeywords.some(neg => text.includes(neg))) {
|
|
@@ -68,11 +68,23 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
|
|
|
68
68
|
const aliases = getLanguageAliases(intent.language);
|
|
69
69
|
const datasetLanguages = dataset.languages.map(normalizeToken);
|
|
70
70
|
const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
|
|
71
|
+
const isMultilingualIntent = intent.language === "multilingual";
|
|
71
72
|
if (languageMatch) {
|
|
72
|
-
|
|
73
|
+
// Check if the dataset is monolingual in the requested language vs multilingual
|
|
74
|
+
const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
|
|
75
|
+
if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
|
|
76
|
+
// Purely the requested language (or user wants multilingual) → full boost
|
|
77
|
+
score += 0.55;
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
// Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
|
|
81
|
+
// Penalize proportionally to how many other languages are present
|
|
82
|
+
const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
|
|
83
|
+
score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
|
|
84
|
+
}
|
|
73
85
|
}
|
|
74
86
|
else if (dataset.languages.length > 0) {
|
|
75
|
-
score -= 0.
|
|
87
|
+
score -= 0.65;
|
|
76
88
|
}
|
|
77
89
|
else {
|
|
78
90
|
score -= 0.1;
|
|
@@ -131,6 +143,80 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
|
|
|
131
143
|
export function buildIntentSearchQuery(intent) {
|
|
132
144
|
return intent.searchQuery;
|
|
133
145
|
}
|
|
146
|
+
/**
 * Build HuggingFace-compatible filter tags from the parsed intent.
 * Returns e.g. ["language:en", "task_ids:text-classification"].
 *
 * A `language:<code>` tag is added only when `intent.language` is one of the
 * names listed in LANGUAGE_TO_CODE and is not "multilingual". A
 * `task_ids:<task>` tag is added whenever `intent.task` is truthy.
 *
 * @param {{ language?: string, task?: string }} intent - Parsed query intent.
 * @returns {string[]} Filter tags; empty when nothing applies.
 */
export function buildHuggingFaceFilterTags(intent) {
    const tags = [];
    const language = intent.language;
    // Own-property check: a bare lookup on a plain object also resolves
    // inherited members ("constructor", "toString", ...), which would be
    // interpolated into a bogus language tag.
    if (language && language !== "multilingual"
        && Object.prototype.hasOwnProperty.call(LANGUAGE_TO_CODE, language)) {
        tags.push(`language:${LANGUAGE_TO_CODE[language]}`);
    }
    if (intent.task) {
        // NOTE(review): the Hub appears to tag broad tasks such as
        // "text-classification" under `task_categories:` rather than
        // `task_ids:` - confirm this prefix matches the values intent.task takes.
        tags.push(`task_ids:${intent.task}`);
    }
    return tags;
}
// Maps the language names used in intents to the two-letter codes used in
// `language:` tags.
const LANGUAGE_TO_CODE = {
    english: "en",
    spanish: "es",
    french: "fr",
    german: "de",
    portuguese: "pt",
    chinese: "zh",
    japanese: "ja",
    korean: "ko",
    arabic: "ar",
    russian: "ru",
    hindi: "hi",
};
|
|
175
|
+
// Substrings in a dataset's name/description/tags that mark it as spanning
// more than one language.
const BILINGUAL_INDICATORS = [
    "translation", "parallel", "bilingual", "multilingual",
    "cross-lingual", "crosslingual", "machine-translation",
    "aligned", "comparable corpus",
];
/**
 * Hard-exclude a dataset when the user requests a single specific language
 * and the dataset is bilingual, multilingual, or tagged with other languages.
 *
 * Checks, in order:
 *  1. any normalized language tag that is not an alias of the requested language;
 *  2. any BILINGUAL_INDICATORS substring in name + description + tags;
 *  3. the first alias of any other language in LANGUAGE_ALIASES appearing in
 *     that same text.
 *
 * @param {object} dataset - Dataset metadata (name, description, tags, languages).
 * @param {object} intent - Parsed query intent; only `language` is read.
 * @returns {boolean} true if the dataset should be EXCLUDED from results.
 */
export function shouldExcludeByLanguage(dataset, intent) {
    if (!intent?.language || intent.language === "multilingual")
        return false;
    const aliases = getLanguageAliases(intent.language);
    // Tolerate records without a languages/tags array instead of throwing.
    const datasetLanguages = (dataset.languages ?? [])
        .map(normalizeToken)
        .filter(l => l && l !== "unknown");
    // Any tag outside the requested language's aliases means the dataset is
    // either in another language or covers several. If every tag is an alias,
    // the requested language is necessarily present, so no separate
    // "wrong language" check is needed.
    if (datasetLanguages.some(lang => !aliases.includes(lang)))
        return true;
    // Check name, description, and tags for bilingual indicators or other language names
    const text = [
        dataset.name,
        dataset.description,
        (dataset.tags ?? []).join(" "),
    ].join(" ").toLowerCase();
    // NOTE(review): these are plain substring matches, so text such as
    // "misaligned" or "parallel computing" also triggers exclusion - confirm
    // that this strictness is intended.
    if (BILINGUAL_INDICATORS.some(indicator => text.includes(indicator))) {
        return true;
    }
    // Check if the text mentions other specific languages by name
    const otherLanguageNames = Object.keys(LANGUAGE_ALIASES).filter(lang => lang !== intent.language && lang !== "multilingual");
    for (const otherLang of otherLanguageNames) {
        const otherAliases = LANGUAGE_ALIASES[otherLang];
        // Only the first alias is checked; presumably that is the full language
        // name (2-letter codes could appear in regular text) - verify against
        // LANGUAGE_ALIASES.
        if (otherAliases && otherAliases[0] && text.includes(otherAliases[0])) {
            return true;
        }
    }
    return false;
}
|
|
134
220
|
function buildHeuristicIntent(query, requirements) {
|
|
135
221
|
const originalQuery = `${query || ""} ${requirements || ""}`.trim();
|
|
136
222
|
const normalizedQuery = originalQuery.toLowerCase();
|
|
@@ -342,6 +428,20 @@ function normalizeToken(value) {
|
|
|
342
428
|
return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
|
|
343
429
|
}
|
|
344
430
|
function extractRequestedRows(text) {
|
|
431
|
+
// Match "1 million", "2.5 billion", "500 thousand" etc.
|
|
432
|
+
const wordMultipliers = {
|
|
433
|
+
thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
|
|
434
|
+
mil: 1_000_000, bil: 1_000_000_000,
|
|
435
|
+
};
|
|
436
|
+
const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
|
|
437
|
+
const wordMatch = text.match(wordPattern);
|
|
438
|
+
if (wordMatch) {
|
|
439
|
+
const base = Number(wordMatch[1]);
|
|
440
|
+
const multiplier = wordMultipliers[wordMatch[2].toLowerCase()];
|
|
441
|
+
const value = Math.round(base * multiplier);
|
|
442
|
+
if (Number.isFinite(value) && value > 0)
|
|
443
|
+
return value;
|
|
444
|
+
}
|
|
345
445
|
const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
|
|
346
446
|
if (explicit) {
|
|
347
447
|
const value = Number(explicit[1].replace(/[\s,]/g, ""));
|
package/package.json
CHANGED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read ``src`` into a polars DataFrame, choosing the reader by file extension.

    Supported extensions: .csv, .tsv/.tab, .parquet/.pq,
    .feather/.ftr/.arrow/.ipc, .jsonl/.ndjson and .json. Any other extension
    is attempted as CSV.

    A ``.json`` file may hold a top-level array, a single object that wraps the
    records under a well-known key (or under any key whose value is a non-empty
    list of objects), or newline-delimited JSON saved with a ``.json`` suffix.

    Raises whatever the underlying reader or ``json.loads`` raises when the
    content cannot be parsed.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager so the handle is closed even if reading fails.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        try:
            # Parse as one document first: a pretty-printed object also has a
            # first line starting with "{", so the first line alone cannot
            # distinguish it from newline-delimited JSON.
            obj = json.loads(raw)
        except json.JSONDecodeError:
            # Not a single document; if it begins with an object it is most
            # likely newline-delimited JSON with a .json suffix.
            if raw.startswith("{"):
                return pl.read_ndjson(src)
            raise
        if isinstance(obj, dict):
            # Prefer conventional wrapper keys, then any list-of-objects value.
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.

    Supported extensions: .parquet/.pq, .csv, .json (row-oriented) and
    .jsonl/.ndjson. The parent directory of ``dst`` is created if missing.

    Raises:
        ValueError: if the extension is not one of the supported formats.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        try:
            df.write_json(dst, row_oriented=True)
        except TypeError:
            # NOTE(review): newer polars releases appear to have dropped the
            # `row_oriented` keyword (write_json is row-oriented there), so the
            # keyword call fails with TypeError on them - confirm against the
            # polars versions this package supports.
            df.write_json(dst)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Prints exactly one JSON object to stdout. On success it carries ``ok``,
    ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any failure it
    carries ``ok: false`` and ``error``, and the process exits with status 1.
    """

    def _fail(message):
        # Single place that emits the failure payload and terminates.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        _fail("Usage: convert_engine.py <input> <output>")

    input_path, output_path = cli_args[0], cli_args[1]

    if not os.path.exists(input_path):
        _fail(f"File not found: {input_path}")

    try:
        df = _load(input_path)
        _write(df, output_path)
        bytes_written = os.path.getsize(output_path)
        report = {
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": round(bytes_written / (1024 * 1024), 2),
        }
        print(json.dumps(report))
    except Exception as e:
        _fail(str(e))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|