vesper-wizard 2.1.6 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -896,6 +896,49 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
|
896
896
|
quality_score: qualityScore
|
|
897
897
|
});
|
|
898
898
|
}
|
|
899
|
+
else {
|
|
900
|
+
// Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
|
|
901
|
+
try {
|
|
902
|
+
const existingMeta = metadataStore.getDataset(datasetIdForDownload);
|
|
903
|
+
if (!existingMeta) {
|
|
904
|
+
metadataStore.saveDataset({
|
|
905
|
+
id: datasetIdForDownload,
|
|
906
|
+
source: source,
|
|
907
|
+
name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
|
|
908
|
+
description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
|
|
909
|
+
quality_warnings: [],
|
|
910
|
+
downloads: 0,
|
|
911
|
+
likes: 0,
|
|
912
|
+
stars: 0,
|
|
913
|
+
tags: [],
|
|
914
|
+
last_updated: new Date().toISOString(),
|
|
915
|
+
task: "unknown",
|
|
916
|
+
domain: "unknown",
|
|
917
|
+
languages: [],
|
|
918
|
+
splits: [],
|
|
919
|
+
license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
|
|
920
|
+
quality_score: qualityScore,
|
|
921
|
+
download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
|
|
922
|
+
total_examples: 0,
|
|
923
|
+
is_structured: false,
|
|
924
|
+
has_target_column: false,
|
|
925
|
+
is_safe_source: true,
|
|
926
|
+
has_personal_data: false,
|
|
927
|
+
is_paywalled: false,
|
|
928
|
+
is_scraped_web_data: false,
|
|
929
|
+
uses_https: true,
|
|
930
|
+
has_train_split: false,
|
|
931
|
+
has_test_split: false,
|
|
932
|
+
has_validation_split: false,
|
|
933
|
+
description_length: 0,
|
|
934
|
+
has_readme: false,
|
|
935
|
+
});
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
catch (e) {
|
|
939
|
+
console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
940
|
+
}
|
|
941
|
+
}
|
|
899
942
|
markPipelineStep("register", "running");
|
|
900
943
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
901
944
|
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
@@ -1845,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1845
1888
|
}
|
|
1846
1889
|
const dataset = metadataStore.getDataset(datasetId);
|
|
1847
1890
|
if (!dataset) {
|
|
1891
|
+
// Fallback: check the registry for local path info
|
|
1892
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
1893
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
1894
|
+
if (regEntry) {
|
|
1895
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
1896
|
+
return {
|
|
1897
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
1898
|
+
};
|
|
1899
|
+
}
|
|
1848
1900
|
return {
|
|
1849
|
-
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}
|
|
1901
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1850
1902
|
isError: true,
|
|
1851
1903
|
};
|
|
1852
1904
|
}
|
|
@@ -2167,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2167
2219
|
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2168
2220
|
const ext = extMap[requestedFormat] || ".feather";
|
|
2169
2221
|
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2170
|
-
const outDir = targetDir
|
|
2222
|
+
const outDir = targetDir;
|
|
2171
2223
|
if (!fs.existsSync(outDir))
|
|
2172
2224
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2173
2225
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
@@ -2203,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2203
2255
|
};
|
|
2204
2256
|
}
|
|
2205
2257
|
}
|
|
2258
|
+
case "vesper_list_datasets": {
|
|
2259
|
+
const entries = readRegistry();
|
|
2260
|
+
if (entries.length === 0) {
|
|
2261
|
+
return {
|
|
2262
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2263
|
+
};
|
|
2264
|
+
}
|
|
2265
|
+
const lines = entries.map((e, i) => {
|
|
2266
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
2267
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
2268
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2269
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
2270
|
+
});
|
|
2271
|
+
return {
|
|
2272
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2273
|
+
};
|
|
2274
|
+
}
|
|
2206
2275
|
case "fuse_datasets": {
|
|
2207
2276
|
const rawSources = request.params.arguments?.sources;
|
|
2208
2277
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
@@ -21,7 +21,11 @@ export class InstallService {
|
|
|
21
21
|
// Create target directory
|
|
22
22
|
const installLabel = dataset?.name || datasetId;
|
|
23
23
|
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
-
|
|
24
|
+
// If caller specified a target dir, use it directly (don't nest under datasets/)
|
|
25
|
+
// Otherwise fall back to the project root's datasets/ folder
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.join(this.projectRoot, "datasets", sanitizedName);
|
|
25
29
|
if (!fs.existsSync(installDir)) {
|
|
26
30
|
fs.mkdirSync(installDir, { recursive: true });
|
|
27
31
|
}
|
|
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
-
import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
|
|
7
7
|
export class HuggingFaceScraper {
|
|
8
8
|
/**
|
|
9
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
@@ -21,10 +21,11 @@ export class HuggingFaceScraper {
|
|
|
21
21
|
let processed = 0;
|
|
22
22
|
try {
|
|
23
23
|
const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
|
|
24
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
24
25
|
for await (const ds of listDatasets({
|
|
25
26
|
limit: limit,
|
|
26
27
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
27
|
-
search: { query: hfQuery },
|
|
28
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
28
29
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
29
30
|
})) {
|
|
30
31
|
if (results.length >= limit)
|
|
@@ -120,10 +121,11 @@ export class HuggingFaceScraper {
|
|
|
120
121
|
}
|
|
121
122
|
// Add delay between batches to avoid rate limits
|
|
122
123
|
const BATCH_DELAY = hfToken ? 500 : 2000;
|
|
124
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
123
125
|
for await (const ds of listDatasets({
|
|
124
126
|
limit: fetchLimit,
|
|
125
127
|
additionalFields: ["description", "tags"],
|
|
126
|
-
search: { query: hfQuery },
|
|
128
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
127
129
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
128
130
|
})) {
|
|
129
131
|
if (results.length >= limit)
|
|
@@ -68,11 +68,23 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
|
|
|
68
68
|
const aliases = getLanguageAliases(intent.language);
|
|
69
69
|
const datasetLanguages = dataset.languages.map(normalizeToken);
|
|
70
70
|
const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
|
|
71
|
+
const isMultilingualIntent = intent.language === "multilingual";
|
|
71
72
|
if (languageMatch) {
|
|
72
|
-
|
|
73
|
+
// Check if the dataset is monolingual in the requested language vs multilingual
|
|
74
|
+
const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
|
|
75
|
+
if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
|
|
76
|
+
// Purely the requested language (or user wants multilingual) → full boost
|
|
77
|
+
score += 0.55;
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
// Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
|
|
81
|
+
// Penalize proportionally to how many other languages are present
|
|
82
|
+
const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
|
|
83
|
+
score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
|
|
84
|
+
}
|
|
73
85
|
}
|
|
74
86
|
else if (dataset.languages.length > 0) {
|
|
75
|
-
score -= 0.
|
|
87
|
+
score -= 0.65;
|
|
76
88
|
}
|
|
77
89
|
else {
|
|
78
90
|
score -= 0.1;
|
|
@@ -131,6 +143,35 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
|
|
|
131
143
|
export function buildIntentSearchQuery(intent) {
|
|
132
144
|
return intent.searchQuery;
|
|
133
145
|
}
|
|
146
|
+
/**
|
|
147
|
+
* Build HuggingFace-compatible filter tags from the parsed intent.
|
|
148
|
+
* Returns e.g. ["language:en", "task_ids:text-classification"].
|
|
149
|
+
*/
|
|
150
|
+
export function buildHuggingFaceFilterTags(intent) {
|
|
151
|
+
const tags = [];
|
|
152
|
+
if (intent.language && intent.language !== "multilingual") {
|
|
153
|
+
const langCode = LANGUAGE_TO_CODE[intent.language];
|
|
154
|
+
if (langCode)
|
|
155
|
+
tags.push(`language:${langCode}`);
|
|
156
|
+
}
|
|
157
|
+
if (intent.task) {
|
|
158
|
+
tags.push(`task_ids:${intent.task}`);
|
|
159
|
+
}
|
|
160
|
+
return tags;
|
|
161
|
+
}
|
|
162
|
+
const LANGUAGE_TO_CODE = {
|
|
163
|
+
english: "en",
|
|
164
|
+
spanish: "es",
|
|
165
|
+
french: "fr",
|
|
166
|
+
german: "de",
|
|
167
|
+
portuguese: "pt",
|
|
168
|
+
chinese: "zh",
|
|
169
|
+
japanese: "ja",
|
|
170
|
+
korean: "ko",
|
|
171
|
+
arabic: "ar",
|
|
172
|
+
russian: "ru",
|
|
173
|
+
hindi: "hi",
|
|
174
|
+
};
|
|
134
175
|
function buildHeuristicIntent(query, requirements) {
|
|
135
176
|
const originalQuery = `${query || ""} ${requirements || ""}`.trim();
|
|
136
177
|
const normalizedQuery = originalQuery.toLowerCase();
|
|
@@ -342,6 +383,20 @@ function normalizeToken(value) {
|
|
|
342
383
|
return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
|
|
343
384
|
}
|
|
344
385
|
function extractRequestedRows(text) {
|
|
386
|
+
// Match "1 million", "2.5 billion", "500 thousand" etc.
|
|
387
|
+
const wordMultipliers = {
|
|
388
|
+
thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
|
|
389
|
+
mil: 1_000_000, bil: 1_000_000_000,
|
|
390
|
+
};
|
|
391
|
+
const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
|
|
392
|
+
const wordMatch = text.match(wordPattern);
|
|
393
|
+
if (wordMatch) {
|
|
394
|
+
const base = Number(wordMatch[1]);
|
|
395
|
+
const multiplier = wordMultipliers[wordMatch[2].toLowerCase()];
|
|
396
|
+
const value = Math.round(base * multiplier);
|
|
397
|
+
if (Number.isFinite(value) && value > 0)
|
|
398
|
+
return value;
|
|
399
|
+
}
|
|
345
400
|
const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
|
|
346
401
|
if (explicit) {
|
|
347
402
|
const value = Number(explicit[1].replace(/[\s,]/g, ""));
|
package/package.json
CHANGED