@vespermcp/mcp-server 1.2.4 → 1.2.5
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- package/build/index.js +129 -20
- package/build/python/asset_downloader_engine.py +73 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +388 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/package.json +2 -2
- package/src/python/asset_downloader_engine.py +73 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +388 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
package/build/index.js
CHANGED

@@ -167,6 +167,25 @@ function syncPythonScripts(appRoot, dataRoot) {
   const pythonDest = path.join(dataRoot, "python");
   if (!fs.existsSync(pythonDest))
     fs.mkdirSync(pythonDest, { recursive: true });
+  const collectPyFiles = (dir) => {
+    if (!fs.existsSync(dir))
+      return [];
+    const out = [];
+    const stack = [dir];
+    while (stack.length > 0) {
+      const cur = stack.pop();
+      for (const entry of fs.readdirSync(cur, { withFileTypes: true })) {
+        const full = path.join(cur, entry.name);
+        if (entry.isDirectory()) {
+          stack.push(full);
+        }
+        else if (entry.isFile() && full.endsWith(".py")) {
+          out.push(full);
+        }
+      }
+    }
+    return out;
+  };
   // Sources to check for Python scripts
   const sources = [
     path.join(appRoot, "src", "python"),
@@ -175,25 +194,21 @@ function syncPythonScripts(appRoot, dataRoot) {
   ];
   let syncedCount = 0;
   for (const src of sources) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      fs.copyFileSync(srcPath, destPath);
-      syncedCount++;
-    }
-  }
+    const files = collectPyFiles(src);
+    for (const srcPath of files) {
+      const rel = path.relative(src, srcPath);
+      const destPath = path.join(pythonDest, rel);
+      const srcStat = fs.statSync(srcPath);
+      let shouldCopy = true;
+      if (fs.existsSync(destPath)) {
+        const destStat = fs.statSync(destPath);
+        if (srcStat.size === destStat.size)
+          shouldCopy = false;
+      }
+      if (shouldCopy) {
+        fs.mkdirSync(path.dirname(destPath), { recursive: true });
+        fs.copyFileSync(srcPath, destPath);
+        syncedCount++;
       }
     }
   }
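The two hunks above replace a flat, single-directory copy with a recursive walk plus a size-comparison skip. A minimal standalone Python sketch of the same heuristic (hypothetical names, not part of the package; note an edit that leaves a file's byte count unchanged would not be re-copied):

```python
from pathlib import Path
import shutil

def sync_python_scripts(src_root: Path, python_dest: Path) -> int:
    """Mirror of the new sync logic: recursive *.py walk, size-based skip."""
    synced = 0
    for src_path in src_root.rglob("*.py"):
        dest_path = python_dest / src_path.relative_to(src_root)
        # Same heuristic as the diff: equal sizes means "unchanged", skip.
        if dest_path.exists() and dest_path.stat().st_size == src_path.stat().st_size:
            continue
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
        synced += 1
    return synced
```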
@@ -471,6 +486,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
         required: ["source", "dataset_id"],
       },
     },
+    {
+      name: "vesper_download_assets",
+      description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
+      inputSchema: {
+        type: "object",
+        properties: {
+          dataset_id: { type: "string", description: "Unique dataset identifier." },
+          source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
+          repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. cifar100)." },
+          kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
+          urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
+          output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
+          max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
+          workers: { type: "number", description: "Parallel worker count (default 8)." },
+          image_column: { type: "string", description: "Optional explicit image column for HuggingFace datasets." },
+        },
+        required: ["dataset_id", "source"],
+      },
+    },
     {
       name: "configure_kaggle",
       description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
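Hypothetical argument payloads matching the inputSchema above (identifier values are illustrative):

```python
hf_args = {
    "dataset_id": "my-cifar",
    "source": "huggingface",
    "repo_id": "cifar100",
    "output_format": "webdataset",
    "max_items": 1000,
}
url_args = {
    "dataset_id": "crawl-set",
    "source": "url",
    "urls": ["https://example.com/a.jpg", "https://example.com/b.png"],
    "workers": 8,
}
```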
@@ -571,6 +605,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
         properties: {
           query: { type: "string" },
           requirements: { type: "string" },
+          download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
           cleaning_options: { type: "object" },
           split_config: { type: "object" },
         },
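A sketch of a prepare_dataset call that opts in to the new flag (argument values are illustrative):

```python
prepare_args = {
    "query": "street scenes at night",  # free-text dataset query
    "download_images": True,            # new in 1.2.5: post-prepare asset downloading
}
```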
@@ -813,6 +848,24 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
       const hf = new HuggingFaceScraper();
       results = await hf.scrape(Math.max(1, limit), true, query);
     }
+    const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
+    for (const ds of results.slice(0, limit)) {
+      const info = {
+        dataset_id: ds.id,
+        id: ds.id,
+        source: ds.source,
+        repo_id: ds.id,
+        total_images: ds.total_examples || 0,
+        image_column: undefined,
+        recipes_dir: path.join(dataRoot, "recipes"),
+      };
+      try {
+        await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
+      }
+      catch {
+        // best-effort recipe generation; ignore discovery-time recipe failures
+      }
+    }
     const formattedOutput = formatSearchResults(results.slice(0, limit));
     return {
       content: [{ type: "text", text: formattedOutput }]
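runPythonJson itself is not shown in this diff; judging from asset_downloader_engine.py below, the engine prints exactly one JSON object per invocation, so a minimal Python sketch of the calling convention it appears to rely on might look like:

```python
import json
import subprocess
import sys

def run_python_json(script: str, args: list[str]) -> dict:
    # Run the engine and parse the single JSON line _print() writes to stdout.
    proc = subprocess.run([sys.executable, script, *args],
                          capture_output=True, text=True, check=True)
    return json.loads(proc.stdout.strip().splitlines()[-1])
```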
@@ -857,6 +910,61 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
         };
       }
     }
+    case "vesper_download_assets": {
+      hydrateExternalKeys();
+      const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+      const source = String(request.params.arguments?.source || "").trim().toLowerCase();
+      const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
+      const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
+      const urls = Array.isArray(request.params.arguments?.urls)
+        ? (request.params.arguments?.urls).map(v => String(v))
+        : undefined;
+      const outputFormat = String(request.params.arguments?.output_format || "webdataset");
+      const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
+      const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
+      const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
+      if (!datasetId || !source) {
+        throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
+      }
+      if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
+        return {
+          content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
+          isError: true,
+        };
+      }
+      const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
+      const payload = {
+        dataset_id: datasetId,
+        source,
+        repo_id: repoId,
+        kaggle_ref: kaggleRef,
+        urls,
+        output_format: outputFormat,
+        max_items: maxItems,
+        workers,
+        image_column: imageColumn,
+        output_root: path.join(dataRoot, "data", "assets"),
+        recipes_dir: path.join(dataRoot, "recipes"),
+      };
+      try {
+        const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
+        if (!result?.ok) {
+          return {
+            content: [{ type: "text", text: `ERROR: asset download failed: ${result?.error || "Unknown error"}` }],
+            isError: true,
+          };
+        }
+        return {
+          content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
+        };
+      }
+      catch (error) {
+        return {
+          content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
+          isError: true,
+        };
+      }
+    }
     case "configure_kaggle": {
       const username = String(request.params.arguments?.username || "").trim();
       const key = String(request.params.arguments?.key || "").trim();
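Per _run_download and the DownloadResult dataclass below, a successful engine response the handler pretty-prints has this shape (paths and counts are illustrative):

```python
example_response = {
    "ok": True,
    "result": {
        "dataset_id": "my-cifar",
        "source": "huggingface",
        "output_dir": "/home/user/.vesper/data/assets/my-cifar",
        "downloaded_assets": 1000,
        "failed_assets": 3,
        "errors_file": "/home/user/.vesper/data/assets/my-cifar/errors.jsonl",
        "metadata_file": "/home/user/.vesper/data/assets/my-cifar/metadata.jsonl",
        "output_format": "webdataset",
    },
}
```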
@@ -1033,7 +1141,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
     case "prepare_dataset": {
      const query = String(request.params.arguments?.query);
      const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
-      const
+      const downloadImages = request.params.arguments?.download_images === true;
+      const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
       return {
         content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
       };

package/build/python/asset_downloader_engine.py
ADDED

@@ -0,0 +1,73 @@
+import argparse
+import asyncio
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any, Dict
+
+CURRENT_DIR = Path(__file__).resolve().parent
+if str(CURRENT_DIR) not in sys.path:
+    sys.path.insert(0, str(CURRENT_DIR))
+
+from vesper.core.asset_downloader import AssetDownloader
+from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
+
+
+def _print(payload: Dict[str, Any]) -> None:
+    print(json.dumps(payload, ensure_ascii=False))
+
+
+async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
+    payload = json.loads(args.payload)
+    output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
+    workers = int(payload.get("workers") or 8)
+    recipes_dir = payload.get("recipes_dir")
+
+    downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
+
+    result = await downloader.download_assets(
+        dataset_id=str(payload.get("dataset_id")),
+        source=payload.get("source"),
+        repo_id=payload.get("repo_id"),
+        kaggle_ref=payload.get("kaggle_ref"),
+        urls=payload.get("urls"),
+        output_format=payload.get("output_format", "webdataset"),
+        max_items=payload.get("max_items"),
+        image_column=payload.get("image_column"),
+    )
+    return {"ok": True, "result": result}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
+    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
+    parser.add_argument("payload", help="JSON payload")
+    args = parser.parse_args()
+
+    try:
+        if args.action == "download":
+            response = asyncio.run(_run_download(args))
+            _print(response)
+            return
+
+        payload = json.loads(args.payload)
+        if args.action == "build_recipe":
+            recipe = build_download_recipe(payload)
+            saved = save_recipe(recipe, payload.get("recipes_dir"))
+            _print({"ok": True, "recipe": recipe, "saved_to": saved})
+            return
+
+        if args.action == "get_recipe":
+            dataset_id = str(payload.get("dataset_id"))
+            recipe = get_download_recipe(dataset_id, payload.get("recipes_dir"))
+            _print({"ok": True, "recipe": recipe})
+            return
+
+        _print({"ok": False, "error": f"Unknown action: {args.action}"})
+    except Exception as e:
+        _print({"ok": False, "error": str(e)})
+
+
+if __name__ == "__main__":
+    main()
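The engine can also be driven directly, outside the MCP server. A sketch (assumes the optional Python dependencies such as aiohttp and aiofiles are installed; the URL and paths are illustrative):

```python
import json
import subprocess
import sys

payload = {
    "dataset_id": "demo",
    "source": "url",
    "urls": ["https://example.com/cat.jpg"],
    "output_format": "imagefolder",
    "output_root": "/tmp/vesper-assets",
}
# Protocol: action plus JSON payload on argv; one JSON line comes back on stdout.
out = subprocess.run(
    [sys.executable, "asset_downloader_engine.py", "download", json.dumps(payload)],
    capture_output=True, text=True,
)
print(json.loads(out.stdout.strip()))
```

Note that "imagefolder" is the pass-through format: download_assets only runs an extra packaging step for "webdataset" and "parquet".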

package/build/python/vesper/__init__.py
ADDED

@@ -0,0 +1 @@
+"""Vesper Python runtime package."""

package/build/python/vesper/__pycache__/__init__.cpython-312.pyc
ADDED

Binary file

package/build/python/vesper/core/__init__.py
ADDED

@@ -0,0 +1 @@
+"""Core data engines for Vesper."""

package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc
ADDED

Binary file

package/build/python/vesper/core/asset_downloader.py
ADDED

@@ -0,0 +1,388 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import mimetypes
+import os
+import shutil
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
+
+import aiohttp
+
+from vesper.core.download_recipe import get_download_recipe
+
+try:
+    import aiofiles
+except Exception:  # pragma: no cover
+    aiofiles = None
+
+try:
+    import webdataset as wds
+except Exception:  # pragma: no cover
+    wds = None
+
+
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff"}
+
+
+@dataclass
+class DownloadResult:
+    dataset_id: str
+    source: str
+    output_dir: str
+    downloaded_assets: int
+    failed_assets: int
+    errors_file: str
+    metadata_file: str
+    output_format: str
+
+
+class AssetDownloader:
+    def __init__(
+        self,
+        output_root: str,
+        workers: int = 8,
+        recipes_dir: Optional[str] = None,
+        progress_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None] | None]] = None,
+    ) -> None:
+        self.output_root = Path(output_root)
+        self.workers = max(1, min(workers, 32))
+        self.recipes_dir = recipes_dir
+        self.progress_callback = progress_callback
+
+    async def _emit(self, stage: str, payload: Dict[str, Any]) -> None:
+        if not self.progress_callback:
+            return
+        maybe = self.progress_callback(stage, payload)
+        if asyncio.iscoroutine(maybe):
+            await maybe
+
+    @staticmethod
+    def find_image_column(dataset: Any) -> Optional[str]:
+        features = getattr(dataset, "features", None)
+        if features:
+            for name, feature in features.items():
+                feature_name = feature.__class__.__name__.lower()
+                feature_repr = str(feature).lower()
+                if feature_name == "image" or "image(" in feature_repr:
+                    return str(name)
+                lower = str(name).lower()
+                if lower in {"image", "images", "img", "image_path", "image_url", "url"}:
+                    return str(name)
+
+        candidate_columns = ["image", "images", "img", "image_path", "image_url", "url", "file_name", "filepath"]
+        cols = getattr(dataset, "column_names", []) or []
+        for c in candidate_columns:
+            if c in cols:
+                return c
+        return None
+
+    async def download_assets(
+        self,
+        dataset_id: str,
+        source: Optional[str] = None,
+        repo_id: Optional[str] = None,
+        kaggle_ref: Optional[str] = None,
+        urls: Optional[List[str]] = None,
+        output_format: str = "webdataset",
+        max_items: Optional[int] = None,
+        image_column: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        recipe = get_download_recipe(dataset_id, self.recipes_dir)
+        if recipe:
+            source = source or recipe.get("source")
+            repo_id = repo_id or recipe.get("repo_id")
+            image_column = image_column or recipe.get("image_column")
+
+        source = (source or "").lower()
+        if source not in {"huggingface", "kaggle", "url"}:
+            raise ValueError("source must be one of: huggingface, kaggle, url")
+
+        dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
+        images_dir = dataset_dir / "images"
+        dataset_dir.mkdir(parents=True, exist_ok=True)
+        images_dir.mkdir(parents=True, exist_ok=True)
+
+        errors_file = dataset_dir / "errors.jsonl"
+        metadata_file = dataset_dir / "metadata.jsonl"
+
+        if source == "huggingface":
+            if not repo_id:
+                raise ValueError("repo_id is required for source=huggingface")
+            summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
+        elif source == "kaggle":
+            ref = kaggle_ref or repo_id
+            if not ref:
+                raise ValueError("kaggle_ref is required for source=kaggle")
+            summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
+        else:
+            if not urls:
+                raise ValueError("urls are required for source=url")
+            summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
+
+        if output_format == "webdataset":
+            await self._write_webdataset(dataset_dir, images_dir, metadata_file)
+        elif output_format == "parquet":
+            await self._write_parquet(dataset_dir, metadata_file)
+
+        result = DownloadResult(
+            dataset_id=dataset_id,
+            source=source,
+            output_dir=str(dataset_dir),
+            downloaded_assets=summary["downloaded"],
+            failed_assets=summary["failed"],
+            errors_file=str(errors_file),
+            metadata_file=str(metadata_file),
+            output_format=output_format,
+        )
+        return result.__dict__
+
+    async def _download_huggingface(
+        self,
+        repo_id: str,
+        dataset_id: str,
+        images_dir: Path,
+        metadata_file: Path,
+        errors_file: Path,
+        max_items: Optional[int],
+        image_column: Optional[str],
+    ) -> Dict[str, int]:
+        try:
+            from datasets import load_dataset
+        except Exception as e:
+            raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
+
+        await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
+
+        try:
+            ds = load_dataset(repo_id, split="train")
+        except Exception:
+            dd = load_dataset(repo_id)
+            first_split = list(dd.keys())[0]
+            ds = dd[first_split]
+
+        col = image_column or self.find_image_column(ds)
+        if not col:
+            raise RuntimeError(
+                "No image column detected in HuggingFace dataset. Provide image_column or use fallback strategy with URL column."
+            )
+
+        total = len(ds) if hasattr(ds, "__len__") else 0
+        target = min(total, max_items) if max_items and total else (max_items or total or 0)
+
+        downloaded = 0
+        failed = 0
+
+        with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
+            for idx, row in enumerate(ds):
+                if max_items and idx >= max_items:
+                    break
+                try:
+                    out_name = f"{idx:08d}.jpg"
+                    out_path = images_dir / out_name
+                    self._save_image_value(row.get(col), out_path)
+
+                    record = {
+                        "dataset_id": dataset_id,
+                        "index": idx,
+                        "image_path": str(out_path),
+                        "source": "huggingface",
+                        "repo_id": repo_id,
+                    }
+                    mf.write(json.dumps(record, ensure_ascii=False) + "\n")
+                    downloaded += 1
+                    if downloaded % 50 == 0:
+                        await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
+                except Exception as e:
+                    failed += 1
+                    ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
+
+        await self._emit("done", {"downloaded": downloaded, "failed": failed})
+        return {"downloaded": downloaded, "failed": failed}
+
+    async def _download_kaggle(
+        self,
+        kaggle_ref: str,
+        dataset_id: str,
+        images_dir: Path,
+        metadata_file: Path,
+        errors_file: Path,
+        max_items: Optional[int],
+    ) -> Dict[str, int]:
+        try:
+            from kaggle.api.kaggle_api_extended import KaggleApi
+        except Exception as e:
+            raise RuntimeError(f"kaggle package is required: {e}")
+
+        await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
+
+        api = KaggleApi()
+        api.authenticate()
+
+        tmp_dir = Path(tempfile.mkdtemp(prefix="vesper_kaggle_assets_"))
+        downloaded = 0
+        failed = 0
+
+        try:
+            api.dataset_download_files(kaggle_ref, path=str(tmp_dir), unzip=True, quiet=True)
+            candidates = [p for p in tmp_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS]
+            if max_items:
+                candidates = candidates[:max_items]
+
+            with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
+                for idx, src_path in enumerate(candidates):
+                    try:
+                        out_name = f"{idx:08d}{src_path.suffix.lower()}"
+                        out_path = images_dir / out_name
+                        shutil.copy2(src_path, out_path)
+                        record = {
+                            "dataset_id": dataset_id,
+                            "index": idx,
+                            "image_path": str(out_path),
+                            "source": "kaggle",
+                            "repo_id": kaggle_ref,
+                        }
+                        mf.write(json.dumps(record, ensure_ascii=False) + "\n")
+                        downloaded += 1
+                    except Exception as e:
+                        failed += 1
+                        ef.write(json.dumps({"file": str(src_path), "error": str(e)}, ensure_ascii=False) + "\n")
+        finally:
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+
+        await self._emit("done", {"downloaded": downloaded, "failed": failed})
+        return {"downloaded": downloaded, "failed": failed}
+
+    async def _download_urls(
+        self,
+        urls: List[str],
+        dataset_id: str,
+        images_dir: Path,
+        metadata_file: Path,
+        errors_file: Path,
+        max_items: Optional[int],
+    ) -> Dict[str, int]:
+        if aiofiles is None:
+            raise RuntimeError("aiofiles is required for URL downloads. Install with: pip install aiofiles")
+
+        selected = urls[:max_items] if max_items else urls
+        sem = asyncio.Semaphore(self.workers)
+
+        downloaded = 0
+        failed = 0
+        metadata_lock = asyncio.Lock()
+
+        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=180)) as session:
+            async def worker(idx: int, url: str) -> None:
+                nonlocal downloaded, failed
+                async with sem:
+                    try:
+                        local_path = await self._download_one_url(session, idx, url, images_dir)
+                        async with metadata_lock:
+                            async with aiofiles.open(metadata_file, "a", encoding="utf-8") as mf:
+                                await mf.write(json.dumps({
+                                    "dataset_id": dataset_id,
+                                    "index": idx,
+                                    "image_path": str(local_path),
+                                    "source": "url",
+                                    "url": url,
+                                }, ensure_ascii=False) + "\n")
+                        downloaded += 1
+                    except Exception as e:
+                        failed += 1
+                        async with metadata_lock:
+                            async with aiofiles.open(errors_file, "a", encoding="utf-8") as ef:
+                                await ef.write(json.dumps({"index": idx, "url": url, "error": str(e)}, ensure_ascii=False) + "\n")
+
+            tasks = [asyncio.create_task(worker(i, u)) for i, u in enumerate(selected)]
+            await asyncio.gather(*tasks)
+
+        await self._emit("done", {"downloaded": downloaded, "failed": failed})
+        return {"downloaded": downloaded, "failed": failed}
+
+    async def _download_one_url(self, session: aiohttp.ClientSession, idx: int, url: str, images_dir: Path) -> Path:
+        ext = Path(url.split("?")[0]).suffix.lower()
+        if ext not in IMAGE_EXTENSIONS:
+            ext = ".jpg"
+        out_path = images_dir / f"{idx:08d}{ext}"
+
+        existing_size = out_path.stat().st_size if out_path.exists() else 0
+        headers: Dict[str, str] = {}
+        if existing_size > 0:
+            headers["Range"] = f"bytes={existing_size}-"
+
+        async with session.get(url, headers=headers) as response:
+            if response.status not in (200, 206):
+                raise RuntimeError(f"HTTP {response.status}")
+
+            mode = "ab" if response.status == 206 and existing_size > 0 else "wb"
+            async with aiofiles.open(out_path, mode) as f:
+                async for chunk in response.content.iter_chunked(1024 * 256):
+                    await f.write(chunk)
+
+        return out_path
+
+    @staticmethod
+    def _save_image_value(value: Any, out_path: Path) -> None:
+        if value is None:
+            raise ValueError("empty image value")
+
+        if hasattr(value, "save"):
+            value.save(out_path)
+            return
+
+        if isinstance(value, dict):
+            if value.get("bytes"):
+                out_path.write_bytes(value["bytes"])
+                return
+            if value.get("path") and os.path.exists(value["path"]):
+                shutil.copy2(value["path"], out_path)
+                return
+            if value.get("url"):
+                raise ValueError("image URL requires URL downloader fallback")
+
+        if isinstance(value, str):
+            if os.path.exists(value):
+                shutil.copy2(value, out_path)
+                return
+            raise ValueError("string image value is not a local path")
+
+        raise ValueError(f"Unsupported image value type: {type(value)}")
+
+    async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
+        if wds is None:
+            raise RuntimeError("webdataset package is required for webdataset output. Install with: pip install webdataset")
+
+        shard_pattern = str(dataset_dir / "shard-%06d.tar")
+        with metadata_file.open("r", encoding="utf-8") as mf, wds.ShardWriter(shard_pattern, maxcount=5000) as sink:
+            for line in mf:
+                row = json.loads(line)
+                image_path = Path(row["image_path"])
+                if not image_path.exists():
+                    continue
+                key = image_path.stem
+                ext = image_path.suffix.lstrip(".") or "jpg"
+                sample = {
+                    "__key__": key,
+                    ext: image_path.read_bytes(),
+                    "json": json.dumps(row, ensure_ascii=False).encode("utf-8"),
+                }
+                sink.write(sample)
+
+    async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+        except Exception as e:
+            raise RuntimeError(f"pyarrow is required for parquet output: {e}")
+
+        rows: List[Dict[str, Any]] = []
+        with metadata_file.open("r", encoding="utf-8") as mf:
+            for line in mf:
+                rows.append(json.loads(line))
+
+        table = pa.Table.from_pylist(rows)
+        pq.write_table(table, str(dataset_dir / "metadata.parquet"))