@vespermcp/mcp-server 1.2.19 → 1.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +41 -6
- package/build/ingestion/hf-downloader.js +12 -2
- package/build/ingestion/ingestor.js +124 -18
- package/build/python/asset_downloader_engine.py +20 -1
- package/build/python/hf_fallback.py +298 -0
- package/build/python/vesper/core/asset_downloader.py +233 -47
- package/package.json +1 -1
- package/src/python/asset_downloader_engine.py +20 -1
- package/src/python/hf_fallback.py +298 -0
- package/src/python/vesper/core/asset_downloader.py +233 -47
package/build/index.js
CHANGED
|
@@ -453,6 +453,17 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
453
453
|
async function handlePrepareJob(jobId, query, requirements) {
|
|
454
454
|
hydrateExternalKeys();
|
|
455
455
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
456
|
+
// Ensure core Python packages are available for dataset operations
|
|
457
|
+
try {
|
|
458
|
+
await ensurePythonModules([
|
|
459
|
+
{ module: "polars", packageName: "polars" },
|
|
460
|
+
{ module: "datasets", packageName: "datasets" },
|
|
461
|
+
]);
|
|
462
|
+
}
|
|
463
|
+
catch (e) {
|
|
464
|
+
console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
|
|
465
|
+
// Continue anyway - direct file downloads may still work without datasets lib
|
|
466
|
+
}
|
|
456
467
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
457
468
|
let selectedDataset;
|
|
458
469
|
let datasetIdForDownload = "";
|
|
@@ -749,19 +760,19 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
749
760
|
},
|
|
750
761
|
{
|
|
751
762
|
name: "vesper_download_assets",
|
|
752
|
-
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
|
|
763
|
+
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL). Auto-detects image columns from HF feature types, column names, and value patterns. Supports PIL Images, URL-based images, and binary image data.",
|
|
753
764
|
inputSchema: {
|
|
754
765
|
type: "object",
|
|
755
766
|
properties: {
|
|
756
|
-
dataset_id: { type: "string", description: "Unique dataset identifier." },
|
|
767
|
+
dataset_id: { type: "string", description: "Unique dataset identifier (e.g. 'user/dataset')." },
|
|
757
768
|
source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
|
|
758
|
-
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g.
|
|
769
|
+
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. 'user/dataset'). Auto-inferred from dataset_id if omitted." },
|
|
759
770
|
kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
|
|
760
771
|
urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
|
|
761
772
|
output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
|
|
762
773
|
max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
|
|
763
774
|
workers: { type: "number", description: "Parallel worker count (default 8)." },
|
|
764
|
-
image_column: { type: "string", description: "
|
|
775
|
+
image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
|
|
765
776
|
},
|
|
766
777
|
required: ["dataset_id", "source"],
|
|
767
778
|
},
|
|
@@ -1225,6 +1236,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1225
1236
|
isError: true,
|
|
1226
1237
|
};
|
|
1227
1238
|
}
|
|
1239
|
+
// Pre-install Python datasets library for HuggingFace fallback
|
|
1240
|
+
if (source === "huggingface") {
|
|
1241
|
+
try {
|
|
1242
|
+
await ensurePythonModules([
|
|
1243
|
+
{ module: "datasets", packageName: "datasets" },
|
|
1244
|
+
]);
|
|
1245
|
+
}
|
|
1246
|
+
catch {
|
|
1247
|
+
// Continue - direct download may still work
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1228
1250
|
try {
|
|
1229
1251
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
1230
1252
|
try {
|
|
@@ -1248,7 +1270,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1248
1270
|
hydrateExternalKeys();
|
|
1249
1271
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1250
1272
|
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1251
|
-
|
|
1273
|
+
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
1274
|
+
const repoId = request.params.arguments?.repo_id
|
|
1275
|
+
? String(request.params.arguments.repo_id)
|
|
1276
|
+
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
1252
1277
|
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1253
1278
|
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1254
1279
|
? (request.params.arguments?.urls).map(v => String(v))
|
|
@@ -1274,6 +1299,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1274
1299
|
}
|
|
1275
1300
|
if (source === "huggingface") {
|
|
1276
1301
|
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1302
|
+
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
1277
1303
|
}
|
|
1278
1304
|
if (source === "kaggle") {
|
|
1279
1305
|
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
@@ -1304,8 +1330,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1304
1330
|
try {
|
|
1305
1331
|
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1306
1332
|
if (!result?.ok) {
|
|
1333
|
+
const errMsg = result?.error || "Unknown error";
|
|
1334
|
+
// Enhance error messages for common failures
|
|
1335
|
+
let hint = "";
|
|
1336
|
+
if (errMsg.includes("No image column")) {
|
|
1337
|
+
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1338
|
+
}
|
|
1339
|
+
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
1340
|
+
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
1341
|
+
}
|
|
1307
1342
|
return {
|
|
1308
|
-
content: [{ type: "text", text: `ERROR: asset download failed: ${
|
|
1343
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
1309
1344
|
isError: true,
|
|
1310
1345
|
};
|
|
1311
1346
|
}
|
|
@@ -94,8 +94,18 @@ export class HFDownloader {
|
|
|
94
94
|
}
|
|
95
95
|
catch (error) {
|
|
96
96
|
const msg = String(error?.message || error);
|
|
97
|
-
if (msg.includes("401") || msg.
|
|
98
|
-
throw new Error(
|
|
97
|
+
if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
|
|
98
|
+
throw new Error(`Authentication required for dataset '${repoId}'. ` +
|
|
99
|
+
`This dataset may be gated or private. ` +
|
|
100
|
+
`Use the configure_keys tool to set your HF_TOKEN, then retry.`);
|
|
101
|
+
}
|
|
102
|
+
if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
|
|
103
|
+
throw new Error(`Access denied for dataset '${repoId}'. ` +
|
|
104
|
+
`You may need to accept the dataset's usage agreement on huggingface.co, ` +
|
|
105
|
+
`then set HF_TOKEN via configure_keys tool.`);
|
|
106
|
+
}
|
|
107
|
+
if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
|
|
108
|
+
throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
|
|
99
109
|
}
|
|
100
110
|
console.error(`[HF] Failed to list files for ${repoId}:`, msg);
|
|
101
111
|
return null;
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import path from "path";
|
|
2
2
|
import fs from "fs";
|
|
3
|
+
import { spawn } from "child_process";
|
|
3
4
|
import { HFDownloader } from "./hf-downloader.js";
|
|
4
5
|
import { KaggleSource } from "../metadata/kaggle-source.js";
|
|
5
6
|
import { OpenMLSource } from "../metadata/openml-source.js";
|
|
@@ -63,25 +64,49 @@ export class DataIngestor {
|
|
|
63
64
|
if (source === "huggingface") {
|
|
64
65
|
onProgress?.("Discovering data files on HuggingFace Hub...");
|
|
65
66
|
const remotePath = await this.hfDownloader.findBestFile(datasetId);
|
|
66
|
-
if (
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
67
|
+
if (remotePath) {
|
|
68
|
+
// Direct file download path (repo has raw data files)
|
|
69
|
+
const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
|
|
70
|
+
const targetPath = this.getTargetPath(datasetId, ext);
|
|
71
|
+
this.store.registerDownload(datasetId, targetPath, "downloading");
|
|
72
|
+
try {
|
|
73
|
+
await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
|
|
74
|
+
onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
|
|
75
|
+
});
|
|
76
|
+
const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
|
|
77
|
+
onProgress?.("Resolving external dataset file...", progress);
|
|
78
|
+
});
|
|
79
|
+
const stats = fs.statSync(resolvedPath);
|
|
80
|
+
this.completeDownload(datasetId, resolvedPath, stats.size);
|
|
81
|
+
return resolvedPath;
|
|
82
|
+
}
|
|
83
|
+
catch (e) {
|
|
84
|
+
const msg = String(e?.message || e);
|
|
85
|
+
// If auth error, propagate immediately with helpful message
|
|
86
|
+
if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
|
|
87
|
+
this.failDownload(datasetId, msg);
|
|
88
|
+
throw e;
|
|
89
|
+
}
|
|
90
|
+
// For other download errors, try the fallback
|
|
91
|
+
onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
|
|
92
|
+
}
|
|
81
93
|
}
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
94
|
+
// Fallback: Use Python datasets library to download and convert
|
|
95
|
+
// This runs when findBestFile returns null OR when direct download fails (non-auth)
|
|
96
|
+
if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
|
|
97
|
+
onProgress?.("Using HuggingFace datasets library to download...");
|
|
98
|
+
const targetPath = this.getTargetPath(datasetId, "parquet");
|
|
99
|
+
this.store.registerDownload(datasetId, targetPath, "downloading");
|
|
100
|
+
try {
|
|
101
|
+
const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
|
|
102
|
+
const stats = fs.statSync(result);
|
|
103
|
+
this.completeDownload(datasetId, result, stats.size);
|
|
104
|
+
return result;
|
|
105
|
+
}
|
|
106
|
+
catch (e) {
|
|
107
|
+
this.failDownload(datasetId, e.message);
|
|
108
|
+
throw e;
|
|
109
|
+
}
|
|
85
110
|
}
|
|
86
111
|
}
|
|
87
112
|
else if (source === "kaggle") {
|
|
@@ -159,4 +184,85 @@ export class DataIngestor {
|
|
|
159
184
|
const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
|
|
160
185
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
161
186
|
}
|
|
187
|
+
/**
|
|
188
|
+
* Fallback: Use Python `datasets` library to download a HuggingFace dataset
|
|
189
|
+
* when no raw data files are found in the repo file listing.
|
|
190
|
+
*/
|
|
191
|
+
async hfDatasetsFallback(datasetId, targetPath, onProgress) {
|
|
192
|
+
const pyCmd = process.platform === "win32" ? "py" : "python";
|
|
193
|
+
// Resolve the fallback script path
|
|
194
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
|
|
195
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
196
|
+
const scriptCandidates = [
|
|
197
|
+
path.resolve(dataRoot, "python", "hf_fallback.py"),
|
|
198
|
+
path.resolve(this.projectRoot, "python", "hf_fallback.py"),
|
|
199
|
+
path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
|
|
200
|
+
path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
|
|
201
|
+
];
|
|
202
|
+
let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
|
|
203
|
+
if (!scriptPath) {
|
|
204
|
+
scriptPath = scriptCandidates[0]; // Will fail with a clear error
|
|
205
|
+
}
|
|
206
|
+
const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
|
|
207
|
+
const payload = {
|
|
208
|
+
repo_id: datasetId,
|
|
209
|
+
output_path: targetPath,
|
|
210
|
+
token: token || null,
|
|
211
|
+
max_rows: 500000,
|
|
212
|
+
};
|
|
213
|
+
onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
|
|
214
|
+
return new Promise((resolve, reject) => {
|
|
215
|
+
const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
|
|
216
|
+
env: {
|
|
217
|
+
...process.env,
|
|
218
|
+
PYTHONUTF8: "1",
|
|
219
|
+
PIP_DISABLE_PIP_VERSION_CHECK: "1",
|
|
220
|
+
},
|
|
221
|
+
});
|
|
222
|
+
let stdout = "";
|
|
223
|
+
let stderr = "";
|
|
224
|
+
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
225
|
+
proc.stderr.on("data", (d) => {
|
|
226
|
+
const msg = d.toString();
|
|
227
|
+
stderr += msg;
|
|
228
|
+
// Forward progress info
|
|
229
|
+
if (msg.includes("Downloading") || msg.includes("Loading")) {
|
|
230
|
+
onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
|
|
231
|
+
}
|
|
232
|
+
});
|
|
233
|
+
const timer = setTimeout(() => {
|
|
234
|
+
try {
|
|
235
|
+
proc.kill();
|
|
236
|
+
}
|
|
237
|
+
catch { /* no-op */ }
|
|
238
|
+
reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
|
|
239
|
+
}, 600000); // 10 min timeout
|
|
240
|
+
proc.on("close", (code) => {
|
|
241
|
+
clearTimeout(timer);
|
|
242
|
+
if (code !== 0) {
|
|
243
|
+
let errorMsg = stderr || stdout || `Python exited with code ${code}`;
|
|
244
|
+
try {
|
|
245
|
+
const parsed = JSON.parse(stdout);
|
|
246
|
+
if (parsed.error)
|
|
247
|
+
errorMsg = parsed.error;
|
|
248
|
+
}
|
|
249
|
+
catch { /* use stderr */ }
|
|
250
|
+
reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
|
|
251
|
+
return;
|
|
252
|
+
}
|
|
253
|
+
try {
|
|
254
|
+
const result = JSON.parse(stdout);
|
|
255
|
+
if (!result.ok) {
|
|
256
|
+
reject(new Error(result.error || "Unknown error from HF fallback"));
|
|
257
|
+
return;
|
|
258
|
+
}
|
|
259
|
+
onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
|
|
260
|
+
resolve(result.path);
|
|
261
|
+
}
|
|
262
|
+
catch {
|
|
263
|
+
reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
|
|
264
|
+
}
|
|
265
|
+
});
|
|
266
|
+
});
|
|
267
|
+
}
|
|
162
268
|
}
|
|
@@ -3,9 +3,14 @@ import asyncio
|
|
|
3
3
|
import json
|
|
4
4
|
import os
|
|
5
5
|
import sys
|
|
6
|
+
import warnings
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Any, Dict
|
|
8
9
|
|
|
10
|
+
# Suppress noisy HF warnings
|
|
11
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
12
|
+
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
13
|
+
|
|
9
14
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
10
15
|
if str(CURRENT_DIR) not in sys.path:
|
|
11
16
|
sys.path.insert(0, str(CURRENT_DIR))
|
|
@@ -24,6 +29,11 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
|
24
29
|
workers = int(payload.get("workers") or 8)
|
|
25
30
|
recipes_dir = payload.get("recipes_dir")
|
|
26
31
|
|
|
32
|
+
# Auto-set HF token from payload if provided
|
|
33
|
+
token = payload.get("token") or payload.get("hf_token")
|
|
34
|
+
if token:
|
|
35
|
+
os.environ["HF_TOKEN"] = str(token)
|
|
36
|
+
|
|
27
37
|
downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
|
|
28
38
|
|
|
29
39
|
result = await downloader.download_assets(
|
|
@@ -66,7 +76,16 @@ def main() -> None:
|
|
|
66
76
|
|
|
67
77
|
_print({"ok": False, "error": f"Unknown action: {args.action}"})
|
|
68
78
|
except Exception as e:
|
|
69
|
-
|
|
79
|
+
error_msg = str(e)
|
|
80
|
+
# Provide actionable error messages
|
|
81
|
+
if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
|
|
82
|
+
error_msg = (
|
|
83
|
+
"Authentication required. This dataset may be gated/private. "
|
|
84
|
+
"Use configure_keys tool to set HF_TOKEN, then retry."
|
|
85
|
+
)
|
|
86
|
+
elif "No image column" in error_msg:
|
|
87
|
+
error_msg += " Hint: specify image_column parameter with the name of the column containing images."
|
|
88
|
+
_print({"ok": False, "error": error_msg})
|
|
70
89
|
|
|
71
90
|
|
|
72
91
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HuggingFace Datasets Library Fallback Downloader.
|
|
3
|
+
|
|
4
|
+
Used when the HF Hub file listing finds no suitable data files
|
|
5
|
+
(e.g. script-based datasets, gated datasets, datasets that use
|
|
6
|
+
the `datasets` library format).
|
|
7
|
+
|
|
8
|
+
Handles:
|
|
9
|
+
- Legacy script-based datasets (trust_remote_code)
|
|
10
|
+
- Gated/private datasets (token auth)
|
|
11
|
+
- Image datasets (PIL Image columns → stripped for tabular export)
|
|
12
|
+
- Various split formats (DatasetDict, single split)
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
|
|
16
|
+
|
|
17
|
+
Output: JSON to stdout
|
|
18
|
+
{"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
|
|
19
|
+
{"ok": false, "error": "..."}
|
|
20
|
+
"""
|
|
21
|
+
import sys
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
import warnings
|
|
25
|
+
|
|
26
|
+
# Suppress noisy HF warnings about trust_remote_code etc.
|
|
27
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
28
|
+
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _detect_image_columns(ds):
|
|
32
|
+
"""Detect columns that contain HF Image features or PIL Image objects."""
|
|
33
|
+
image_cols = []
|
|
34
|
+
features = getattr(ds, "features", None)
|
|
35
|
+
if features:
|
|
36
|
+
for name, feat in features.items():
|
|
37
|
+
feat_cls = feat.__class__.__name__.lower()
|
|
38
|
+
feat_str = str(feat).lower()
|
|
39
|
+
if feat_cls == "image" or "image(" in feat_str:
|
|
40
|
+
image_cols.append(name)
|
|
41
|
+
return image_cols
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _strip_image_columns(ds, image_cols):
|
|
45
|
+
"""Remove image columns from dataset so it can be exported to Parquet/CSV.
|
|
46
|
+
|
|
47
|
+
Image columns contain PIL Image objects that can't be serialized to tabular
|
|
48
|
+
formats. We replace them with a placeholder string indicating the column
|
|
49
|
+
was an image column.
|
|
50
|
+
"""
|
|
51
|
+
if not image_cols:
|
|
52
|
+
return ds
|
|
53
|
+
|
|
54
|
+
# Remove the image columns entirely for tabular export
|
|
55
|
+
cols_to_keep = [c for c in ds.column_names if c not in image_cols]
|
|
56
|
+
if not cols_to_keep:
|
|
57
|
+
# Dataset is ALL image columns — keep them but cast to path strings if possible
|
|
58
|
+
return ds
|
|
59
|
+
|
|
60
|
+
return ds.select_columns(cols_to_keep)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _load_dataset_robust(repo_id, token, split):
|
|
64
|
+
"""Load a HuggingFace dataset with multiple fallback strategies.
|
|
65
|
+
|
|
66
|
+
Strategy order:
|
|
67
|
+
1. Normal load with trust_remote_code=True (handles legacy script datasets)
|
|
68
|
+
2. Load without trust_remote_code (newer datasets that reject it)
|
|
69
|
+
3. Load with streaming=True then materialize (handles very large datasets)
|
|
70
|
+
"""
|
|
71
|
+
from datasets import load_dataset, DatasetDict
|
|
72
|
+
|
|
73
|
+
errors = []
|
|
74
|
+
splits_to_try = [split] if split else ["train", "test", "validation", None]
|
|
75
|
+
|
|
76
|
+
# Strategy 1: Normal load with trust_remote_code
|
|
77
|
+
for s in splits_to_try:
|
|
78
|
+
try:
|
|
79
|
+
kwargs = {"path": repo_id, "trust_remote_code": True}
|
|
80
|
+
if token:
|
|
81
|
+
kwargs["token"] = token
|
|
82
|
+
if s:
|
|
83
|
+
kwargs["split"] = s
|
|
84
|
+
ds = load_dataset(**kwargs)
|
|
85
|
+
return ds, s
|
|
86
|
+
except (ValueError, KeyError):
|
|
87
|
+
continue
|
|
88
|
+
except Exception as e:
|
|
89
|
+
msg = str(e)
|
|
90
|
+
# Auth errors should be raised immediately, not retried
|
|
91
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
|
|
92
|
+
raise
|
|
93
|
+
if "split" in msg.lower() or "key" in msg.lower():
|
|
94
|
+
continue
|
|
95
|
+
errors.append(f"trust_remote_code=True, split={s}: {msg}")
|
|
96
|
+
|
|
97
|
+
# Strategy 2: Load WITHOUT trust_remote_code (some repos reject it)
|
|
98
|
+
for s in splits_to_try:
|
|
99
|
+
try:
|
|
100
|
+
kwargs = {"path": repo_id}
|
|
101
|
+
if token:
|
|
102
|
+
kwargs["token"] = token
|
|
103
|
+
if s:
|
|
104
|
+
kwargs["split"] = s
|
|
105
|
+
ds = load_dataset(**kwargs)
|
|
106
|
+
return ds, s
|
|
107
|
+
except (ValueError, KeyError):
|
|
108
|
+
continue
|
|
109
|
+
except Exception as e:
|
|
110
|
+
msg = str(e)
|
|
111
|
+
if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
|
|
112
|
+
raise
|
|
113
|
+
if "split" in msg.lower() or "key" in msg.lower():
|
|
114
|
+
continue
|
|
115
|
+
errors.append(f"trust_remote_code=False, split={s}: {msg}")
|
|
116
|
+
|
|
117
|
+
# Strategy 3: Streaming fallback (for very large / oddly structured datasets)
|
|
118
|
+
for s in splits_to_try:
|
|
119
|
+
if s is None:
|
|
120
|
+
continue # streaming requires a split
|
|
121
|
+
try:
|
|
122
|
+
kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True}
|
|
123
|
+
if token:
|
|
124
|
+
kwargs["token"] = token
|
|
125
|
+
if s:
|
|
126
|
+
kwargs["split"] = s
|
|
127
|
+
ds_stream = load_dataset(**kwargs)
|
|
128
|
+
# Materialize from streaming iterator
|
|
129
|
+
from datasets import Dataset as HFDataset
|
|
130
|
+
rows = []
|
|
131
|
+
for i, row in enumerate(ds_stream):
|
|
132
|
+
if i >= 500000:
|
|
133
|
+
break
|
|
134
|
+
rows.append(row)
|
|
135
|
+
if rows:
|
|
136
|
+
ds = HFDataset.from_list(rows)
|
|
137
|
+
return ds, s
|
|
138
|
+
except Exception:
|
|
139
|
+
continue
|
|
140
|
+
|
|
141
|
+
# All strategies failed
|
|
142
|
+
error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
|
|
143
|
+
return None, error_summary
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def main():
    """CLI entry point: download one HuggingFace dataset split to a file.

    Reads a JSON object from ``sys.argv[1]`` with the keys ``repo_id`` and
    ``output_path`` (required) plus optional ``token``, ``max_rows``
    (default 500000) and ``split`` (default: auto-detect).

    Prints exactly one JSON document to stdout:
    ``{"ok": true, "path", "rows", "columns", "split", ...}`` on success or
    ``{"ok": false, "error"}`` on failure, and exits with status 1 on any
    failure. If the Parquet export fails, a CSV file next to the requested
    path is written instead and reported in ``path``.
    """

    def _fail(message):
        # Single place for the failure contract: JSON on stdout + exit code 1.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    if len(sys.argv) < 2:
        _fail("Missing payload argument")

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        _fail(f"Invalid JSON payload: {e}")

    if not isinstance(payload, dict):
        _fail("Invalid JSON payload: expected a JSON object")

    # `or ""` + str() keep null / non-string values from crashing .strip()
    repo_id = str(payload.get("repo_id") or "").strip()
    output_path = str(payload.get("output_path") or "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        _fail("repo_id is required")

    if not output_path:
        _fail("output_path is required")

    try:
        # Imported only to verify the library is installed before doing any work.
        from datasets import load_dataset  # noqa: F401
    except ImportError:
        _fail("Python 'datasets' library not installed. Install with: pip install datasets")

    try:
        ds, used_split = _load_dataset_robust(repo_id, token, split)

        if ds is None:
            # On failure the second tuple element carries the error summary.
            _fail(f"Could not load dataset '{repo_id}': {used_split}")

        # Handle DatasetDict (when no split specified)
        from datasets import DatasetDict
        if isinstance(ds, DatasetDict):
            if len(ds) == 0:
                raise ValueError(f"Dataset '{repo_id}' contains no splits")
            # Pick the best split, falling back to the first available one
            chosen = next((name for name in ("train", "test", "validation") if name in ds), None)
            if chosen is None:
                chosen = next(iter(ds.keys()))
            ds = ds[chosen]
            used_split = chosen

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Detect image columns and drop them for tabular export
        image_cols = _detect_image_columns(ds)
        has_images = len(image_cols) > 0
        export_ds = _strip_image_columns(ds, image_cols) if has_images else ds

        # Ensure output directory exists (a bare file name has no directory part)
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = export_ds.column_names

        # Anything that is not explicitly .csv is exported as Parquet.
        wants_csv = output_path.endswith(".csv")
        if not wants_csv and not output_path.endswith(".parquet"):
            output_path = output_path + ".parquet"

        try:
            if wants_csv:
                export_ds.to_csv(output_path)
            else:
                export_ds.to_parquet(output_path)
        except Exception as export_err:
            if wants_csv:
                raise
            # If parquet export fails (e.g. complex nested types), try CSV.
            # splitext only touches the file extension, never directory names.
            csv_path = os.path.splitext(output_path)[0] + ".csv"
            try:
                export_ds.to_csv(csv_path)
                output_path = csv_path
            except Exception:
                raise export_err  # Re-raise original error

        result = {
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }

        if has_images:
            result["image_columns"] = image_cols
            if any(col not in columns for col in image_cols):
                result["note"] = (
                    f"This dataset contains image columns ({', '.join(image_cols)}). "
                    "Image data was stripped for tabular export. "
                    "Use vesper_download_assets with source='huggingface' to download the actual images."
                )
            else:
                # Every column is an image column, so nothing was removed.
                result["note"] = (
                    f"This dataset contains only image columns ({', '.join(image_cols)}). "
                    "They were kept in the export because no other columns exist. "
                    "Use vesper_download_assets with source='huggingface' to download the actual images."
                )

        print(json.dumps(result))

    except Exception as e:
        error_msg = str(e)
        lowered = error_msg.lower()
        # Provide helpful, actionable hints
        if "401" in error_msg or "Unauthorized" in error_msg:
            error_msg = (
                f"Authentication required for dataset '{repo_id}'. "
                "This dataset may be gated or private. "
                "Use the configure_keys tool to set your HF_TOKEN, then retry."
            )
        elif "403" in error_msg or "Forbidden" in error_msg:
            error_msg = (
                f"Access denied for dataset '{repo_id}'. "
                "You may need to accept the dataset's usage agreement on huggingface.co, "
                "then set HF_TOKEN via configure_keys tool."
            )
        elif "gated" in lowered:
            error_msg = (
                f"Dataset '{repo_id}' is gated. "
                f"Visit https://huggingface.co/datasets/{repo_id} to request access, "
                "then set HF_TOKEN via configure_keys tool."
            )
        elif "FileNotFoundError" in error_msg or "does not exist" in lowered or "doesn't exist" in lowered:
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
        elif "script" in lowered and "no longer supported" in lowered:
            error_msg = (
                f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
                "by the current version of the datasets library. "
                "Try: pip install datasets --upgrade, or use an older datasets version."
            )

        _fail(error_msg)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
if __name__ == "__main__":
|
|
298
|
+
main()
|