@vespermcp/mcp-server 1.2.21 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1587 -845
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
|
@@ -46,6 +46,9 @@ export class DataIngestor {
|
|
|
46
46
|
getKaggleCredentialError() {
|
|
47
47
|
return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
|
|
48
48
|
}
|
|
49
|
+
toSafeDatasetPath(datasetId) {
|
|
50
|
+
return datasetId.replace(/[:\/]/g, "_");
|
|
51
|
+
}
|
|
49
52
|
/**
|
|
50
53
|
* Ensures a dataset is available locally
|
|
51
54
|
*/
|
|
@@ -115,7 +118,7 @@ export class DataIngestor {
|
|
|
115
118
|
this.failDownload(datasetId, errorMsg);
|
|
116
119
|
throw new Error(errorMsg);
|
|
117
120
|
}
|
|
118
|
-
const targetDir = path.join(this.rawDataDir,
|
|
121
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
119
122
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
120
123
|
try {
|
|
121
124
|
onProgress?.("Downloading from Kaggle...");
|
|
@@ -131,7 +134,7 @@ export class DataIngestor {
|
|
|
131
134
|
}
|
|
132
135
|
}
|
|
133
136
|
else if (source === "openml") {
|
|
134
|
-
const targetDir = path.join(this.rawDataDir,
|
|
137
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
135
138
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
136
139
|
try {
|
|
137
140
|
onProgress?.("Downloading from OpenML...");
|
|
@@ -147,7 +150,7 @@ export class DataIngestor {
|
|
|
147
150
|
}
|
|
148
151
|
}
|
|
149
152
|
else if (source === "dataworld") {
|
|
150
|
-
const targetDir = path.join(this.rawDataDir,
|
|
153
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
151
154
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
152
155
|
try {
|
|
153
156
|
onProgress?.("Downloading from data.world...");
|
|
@@ -181,7 +184,7 @@ export class DataIngestor {
|
|
|
181
184
|
* Generates a safe local filename for a dataset ID
|
|
182
185
|
*/
|
|
183
186
|
getTargetPath(datasetId, extension = "parquet") {
|
|
184
|
-
const safeId =
|
|
187
|
+
const safeId = this.toSafeDatasetPath(datasetId);
|
|
185
188
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
186
189
|
}
|
|
187
190
|
/**
|
|
@@ -18,12 +18,15 @@ export class InstallService {
|
|
|
18
18
|
throw new Error(`Source file not found for installation: ${sourcePath}`);
|
|
19
19
|
}
|
|
20
20
|
const dataset = this.metadataStore.getDataset(datasetId);
|
|
21
|
-
if (!dataset) {
|
|
22
|
-
throw new Error(`Dataset metadata not found for ${datasetId}`);
|
|
23
|
-
}
|
|
24
21
|
// Create target directory
|
|
25
|
-
const
|
|
26
|
-
const
|
|
22
|
+
const installLabel = dataset?.name || datasetId;
|
|
23
|
+
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
+
// If caller specified a target dir, use it directly
|
|
25
|
+
// Otherwise use the current working directory
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
+
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
27
30
|
if (!fs.existsSync(installDir)) {
|
|
28
31
|
fs.mkdirSync(installDir, { recursive: true });
|
|
29
32
|
}
|
|
@@ -34,7 +37,9 @@ export class InstallService {
|
|
|
34
37
|
fs.copyFileSync(sourcePath, targetPath);
|
|
35
38
|
// Update metadata
|
|
36
39
|
const absolutePath = path.resolve(targetPath);
|
|
37
|
-
|
|
40
|
+
if (dataset) {
|
|
41
|
+
this.metadataStore.updateInstallPath(datasetId, absolutePath);
|
|
42
|
+
}
|
|
38
43
|
console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
|
|
39
44
|
return absolutePath;
|
|
40
45
|
}
|
|
@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
|
|
|
3
3
|
import { calculateQualityScore } from "./quality.js";
|
|
4
4
|
import { classifyDomain } from "./domain.js";
|
|
5
5
|
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
|
|
6
7
|
export class HuggingFaceScraper {
|
|
7
8
|
/**
|
|
8
9
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
9
10
|
* Hits the 25k target in minutes.
|
|
10
11
|
*/
|
|
11
|
-
async scrapeBulk(limit = 1000,
|
|
12
|
+
async scrapeBulk(limit = 1000, queryOrIntent) {
|
|
13
|
+
const intent = typeof queryOrIntent === "string"
|
|
14
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
15
|
+
: queryOrIntent;
|
|
16
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
17
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
12
18
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
13
19
|
console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
|
|
14
20
|
const results = [];
|
|
15
21
|
let processed = 0;
|
|
16
22
|
try {
|
|
17
23
|
const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
|
|
24
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
18
25
|
for await (const ds of listDatasets({
|
|
19
26
|
limit: limit,
|
|
20
27
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
21
|
-
search: { query:
|
|
28
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
22
29
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
23
30
|
})) {
|
|
24
31
|
if (results.length >= limit)
|
|
@@ -78,6 +85,9 @@ export class HuggingFaceScraper {
|
|
|
78
85
|
has_readme: false,
|
|
79
86
|
is_incomplete: true // Flag for Phase 2
|
|
80
87
|
};
|
|
88
|
+
// Hard language exclusion
|
|
89
|
+
if (intent && shouldExcludeByLanguage(metadata, intent))
|
|
90
|
+
continue;
|
|
81
91
|
results.push(metadata);
|
|
82
92
|
}
|
|
83
93
|
}
|
|
@@ -86,8 +96,12 @@ export class HuggingFaceScraper {
|
|
|
86
96
|
}
|
|
87
97
|
return results;
|
|
88
98
|
}
|
|
89
|
-
async scrape(limit = 100, applyMVPFilters = true,
|
|
90
|
-
|
|
99
|
+
async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
|
|
100
|
+
const intent = typeof queryOrIntent === "string"
|
|
101
|
+
? await analyzeDatasetQuery(queryOrIntent)
|
|
102
|
+
: queryOrIntent;
|
|
103
|
+
const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
|
|
104
|
+
const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
|
|
91
105
|
const filterMsg = query ? `, query: ${query}` : "";
|
|
92
106
|
console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
|
|
93
107
|
const results = [];
|
|
@@ -110,10 +124,11 @@ export class HuggingFaceScraper {
|
|
|
110
124
|
}
|
|
111
125
|
// Add delay between batches to avoid rate limits
|
|
112
126
|
const BATCH_DELAY = hfToken ? 500 : 2000;
|
|
127
|
+
const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
|
|
113
128
|
for await (const ds of listDatasets({
|
|
114
129
|
limit: fetchLimit,
|
|
115
130
|
additionalFields: ["description", "tags"],
|
|
116
|
-
search: { query:
|
|
131
|
+
search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
|
|
117
132
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
118
133
|
})) {
|
|
119
134
|
if (results.length >= limit)
|
|
@@ -150,18 +165,61 @@ export class HuggingFaceScraper {
|
|
|
150
165
|
initialDelay: 2000, // Start with 2 seconds for HF API
|
|
151
166
|
maxDelay: 30000 // Max 30 seconds
|
|
152
167
|
});
|
|
153
|
-
const
|
|
168
|
+
const cardData = fullInfo.cardData || {};
|
|
169
|
+
// Extract splits from cardData.dataset_info (where HF actually stores them)
|
|
170
|
+
// cardData.dataset_info can be an object (single config) or array (multi-config)
|
|
171
|
+
let rawSplits = [];
|
|
172
|
+
const datasetInfoField = cardData.dataset_info;
|
|
173
|
+
if (datasetInfoField) {
|
|
174
|
+
const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
|
|
175
|
+
for (const config of configs) {
|
|
176
|
+
if (config?.splits && Array.isArray(config.splits)) {
|
|
177
|
+
rawSplits = rawSplits.concat(config.splits);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
// Fallback: try top-level splits from the SDK (rarely populated)
|
|
182
|
+
if (rawSplits.length === 0 && fullInfo.splits) {
|
|
183
|
+
rawSplits = fullInfo.splits;
|
|
184
|
+
}
|
|
185
|
+
const splits = rawSplits.map((s) => ({
|
|
154
186
|
name: s.name,
|
|
155
|
-
num_examples: s.numExamples || 0,
|
|
156
|
-
size_bytes: s.sizeBytes
|
|
157
|
-
}))
|
|
158
|
-
|
|
187
|
+
num_examples: s.num_examples || s.numExamples || 0,
|
|
188
|
+
size_bytes: s.num_bytes || s.sizeBytes || 0
|
|
189
|
+
}));
|
|
190
|
+
let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
|
|
159
191
|
const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
|
|
192
|
+
// Fallback: estimate from size_categories when splits give 0
|
|
193
|
+
if (totalExamples === 0) {
|
|
194
|
+
const sizeCategories = cardData.size_categories;
|
|
195
|
+
if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
|
|
196
|
+
const cat = sizeCategories[0];
|
|
197
|
+
const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
|
|
198
|
+
if (rangeMatch) {
|
|
199
|
+
const parseHumanNum = (s) => {
|
|
200
|
+
const m = s.match(/^([\d.]+)([KMB])?$/i);
|
|
201
|
+
if (!m)
|
|
202
|
+
return 0;
|
|
203
|
+
const base = parseFloat(m[1]);
|
|
204
|
+
const suffix = (m[2] || '').toUpperCase();
|
|
205
|
+
if (suffix === 'K')
|
|
206
|
+
return base * 1000;
|
|
207
|
+
if (suffix === 'M')
|
|
208
|
+
return base * 1_000_000;
|
|
209
|
+
if (suffix === 'B')
|
|
210
|
+
return base * 1_000_000_000;
|
|
211
|
+
return base;
|
|
212
|
+
};
|
|
213
|
+
const lo = parseHumanNum(rangeMatch[1]);
|
|
214
|
+
const hi = parseHumanNum(rangeMatch[2]);
|
|
215
|
+
totalExamples = Math.round((lo + hi) / 2);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
160
219
|
const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
|
|
161
220
|
const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
|
|
162
221
|
const licenseTag = tags.find(t => t.startsWith("license:"));
|
|
163
222
|
const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
|
|
164
|
-
const cardData = fullInfo.cardData || {};
|
|
165
223
|
const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
|
|
166
224
|
const license = categorizeLicense(licenseId, licenseUrl);
|
|
167
225
|
if (license.category === "restricted") {
|
|
@@ -247,7 +305,16 @@ export class HuggingFaceScraper {
|
|
|
247
305
|
description_length: description.length,
|
|
248
306
|
has_readme: !!(cardData.readme || cardData.readme_content)
|
|
249
307
|
};
|
|
250
|
-
|
|
308
|
+
// Hard language exclusion — drop bilingual/multilingual for single-language queries
|
|
309
|
+
if (intent && shouldExcludeByLanguage(metadata, intent)) {
|
|
310
|
+
// skip — do not push
|
|
311
|
+
}
|
|
312
|
+
else {
|
|
313
|
+
if (intent) {
|
|
314
|
+
metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
|
|
315
|
+
}
|
|
316
|
+
results.push(metadata);
|
|
317
|
+
}
|
|
251
318
|
}
|
|
252
319
|
catch (e) {
|
|
253
320
|
// Track all errors for user feedback
|
|
@@ -297,8 +364,12 @@ export class HuggingFaceScraper {
|
|
|
297
364
|
if (otherErrors > 0) {
|
|
298
365
|
console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
|
|
299
366
|
}
|
|
300
|
-
|
|
301
|
-
|
|
367
|
+
return results.sort((a, b) => {
|
|
368
|
+
const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
|
|
369
|
+
if (intentDelta !== 0)
|
|
370
|
+
return intentDelta;
|
|
371
|
+
return b.downloads - a.downloads;
|
|
372
|
+
});
|
|
302
373
|
}
|
|
303
374
|
extractTask(tags) {
|
|
304
375
|
const taskTags = [
|
|
@@ -26,6 +26,7 @@ def _print(payload: Dict[str, Any]) -> None:
|
|
|
26
26
|
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
27
27
|
payload = json.loads(args.payload)
|
|
28
28
|
output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
|
|
29
|
+
output_dir = payload.get("output_dir")
|
|
29
30
|
workers = int(payload.get("workers") or 8)
|
|
30
31
|
recipes_dir = payload.get("recipes_dir")
|
|
31
32
|
|
|
@@ -43,6 +44,7 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
|
43
44
|
kaggle_ref=payload.get("kaggle_ref"),
|
|
44
45
|
urls=payload.get("urls"),
|
|
45
46
|
output_format=payload.get("output_format", "webdataset"),
|
|
47
|
+
output_dir=str(output_dir) if output_dir else None,
|
|
46
48
|
max_items=payload.get("max_items"),
|
|
47
49
|
image_column=payload.get("image_column"),
|
|
48
50
|
)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
|
|
18
|
+
ext = os.path.splitext(src)[1].lower()
|
|
19
|
+
if ext == ".csv":
|
|
20
|
+
return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
21
|
+
if ext in (".tsv", ".tab"):
|
|
22
|
+
return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
|
|
23
|
+
if ext in (".parquet", ".pq"):
|
|
24
|
+
return pl.read_parquet(src)
|
|
25
|
+
if ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
26
|
+
return pl.read_ipc(src)
|
|
27
|
+
if ext in (".jsonl", ".ndjson"):
|
|
28
|
+
return pl.read_ndjson(src)
|
|
29
|
+
if ext == ".json":
|
|
30
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
31
|
+
if raw.startswith("["):
|
|
32
|
+
return pl.read_json(src)
|
|
33
|
+
if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
|
|
34
|
+
return pl.read_ndjson(src)
|
|
35
|
+
obj = json.loads(raw)
|
|
36
|
+
if isinstance(obj, dict):
|
|
37
|
+
for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
|
|
38
|
+
if key in obj and isinstance(obj[key], list):
|
|
39
|
+
return pl.DataFrame(obj[key])
|
|
40
|
+
for v in obj.values():
|
|
41
|
+
if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
|
|
42
|
+
return pl.DataFrame(v)
|
|
43
|
+
return pl.read_json(src)
|
|
44
|
+
# Fallback: try csv
|
|
45
|
+
return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
|
|
49
|
+
ext = os.path.splitext(dst)[1].lower()
|
|
50
|
+
os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
|
|
51
|
+
if ext in (".parquet", ".pq"):
|
|
52
|
+
df.write_parquet(dst)
|
|
53
|
+
elif ext == ".csv":
|
|
54
|
+
df.write_csv(dst)
|
|
55
|
+
elif ext == ".json":
|
|
56
|
+
df.write_json(dst, row_oriented=True)
|
|
57
|
+
elif ext in (".jsonl", ".ndjson"):
|
|
58
|
+
df.write_ndjson(dst)
|
|
59
|
+
else:
|
|
60
|
+
raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
|
|
64
|
+
if len(sys.argv) < 3:
|
|
65
|
+
print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
|
|
66
|
+
sys.exit(1)
|
|
67
|
+
|
|
68
|
+
input_path = sys.argv[1]
|
|
69
|
+
output_path = sys.argv[2]
|
|
70
|
+
|
|
71
|
+
if not os.path.exists(input_path):
|
|
72
|
+
print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
|
|
73
|
+
sys.exit(1)
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
df = _load(input_path)
|
|
77
|
+
_write(df, output_path)
|
|
78
|
+
size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
|
|
79
|
+
print(json.dumps({
|
|
80
|
+
"ok": True,
|
|
81
|
+
"output_path": output_path,
|
|
82
|
+
"rows": df.height,
|
|
83
|
+
"columns": df.width,
|
|
84
|
+
"size_mb": size_mb,
|
|
85
|
+
}))
|
|
86
|
+
except Exception as e:
|
|
87
|
+
print(json.dumps({"ok": False, "error": str(e)}))
|
|
88
|
+
sys.exit(1)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|
|
@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
50
50
|
df = pl.read_ipc(file_path)
|
|
51
51
|
elif ext == ".jsonl":
|
|
52
52
|
df = pl.read_ndjson(file_path)
|
|
53
|
+
elif ext == ".json":
|
|
54
|
+
# Auto-detect: array-of-objects vs NDJSON vs nested structures
|
|
55
|
+
try:
|
|
56
|
+
import json as _json
|
|
57
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
58
|
+
raw_text = fh.read(512) # peek
|
|
59
|
+
stripped = raw_text.lstrip()
|
|
60
|
+
if stripped.startswith("["):
|
|
61
|
+
# Array of objects — standard JSON
|
|
62
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
63
|
+
data = _json.load(fh)
|
|
64
|
+
if isinstance(data, list) and len(data) > 0:
|
|
65
|
+
df = pl.DataFrame(data)
|
|
66
|
+
else:
|
|
67
|
+
raise ValueError("JSON file is empty or not an array of objects")
|
|
68
|
+
elif stripped.startswith("{"):
|
|
69
|
+
# Could be NDJSON or a single object wrapping rows
|
|
70
|
+
try:
|
|
71
|
+
df = pl.read_ndjson(file_path)
|
|
72
|
+
except Exception:
|
|
73
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
74
|
+
data = _json.load(fh)
|
|
75
|
+
# Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
|
|
76
|
+
rows = None
|
|
77
|
+
if isinstance(data, dict):
|
|
78
|
+
for key in ("data", "rows", "records", "items", "results", "entries"):
|
|
79
|
+
if key in data and isinstance(data[key], list):
|
|
80
|
+
rows = data[key]
|
|
81
|
+
break
|
|
82
|
+
if rows is None:
|
|
83
|
+
# Last resort: try to use the dict values
|
|
84
|
+
rows = [data]
|
|
85
|
+
if rows and len(rows) > 0:
|
|
86
|
+
df = pl.DataFrame(rows)
|
|
87
|
+
else:
|
|
88
|
+
raise ValueError("Could not parse JSON structure into tabular data")
|
|
89
|
+
else:
|
|
90
|
+
raise ValueError("JSON file does not start with [ or {")
|
|
91
|
+
except pl.exceptions.ComputeError as ce:
|
|
92
|
+
raise ValueError(f"Failed to parse JSON: {ce}")
|
|
93
|
+
elif ext == ".xlsx":
|
|
94
|
+
try:
|
|
95
|
+
df = pl.read_excel(file_path)
|
|
96
|
+
except Exception as e:
|
|
97
|
+
raise ValueError(f"Failed to read Excel file: {e}")
|
|
53
98
|
else:
|
|
54
99
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
55
100
|
|
|
@@ -12,6 +12,19 @@ except Exception:
|
|
|
12
12
|
HAS_KAGGLE = False
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
IMAGE_EXTENSIONS = {
|
|
16
|
+
".jpg",
|
|
17
|
+
".jpeg",
|
|
18
|
+
".png",
|
|
19
|
+
".webp",
|
|
20
|
+
".bmp",
|
|
21
|
+
".gif",
|
|
22
|
+
".tiff",
|
|
23
|
+
".tif",
|
|
24
|
+
".svg",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
15
28
|
def _ensure_auth() -> Dict[str, Any]:
|
|
16
29
|
if not HAS_KAGGLE:
|
|
17
30
|
return {
|
|
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
135
148
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
136
149
|
|
|
137
150
|
|
|
138
|
-
def
|
|
151
|
+
def _find_image_files(root: str) -> List[str]:
|
|
152
|
+
image_files: List[str] = []
|
|
153
|
+
for base, _, files in os.walk(root):
|
|
154
|
+
for name in files:
|
|
155
|
+
full = os.path.join(base, name)
|
|
156
|
+
if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
|
|
157
|
+
image_files.append(full)
|
|
158
|
+
image_files.sort()
|
|
159
|
+
return image_files
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
|
|
163
|
+
relative_path = os.path.relpath(full_path, root).replace("\\", "/")
|
|
164
|
+
parent_dir = os.path.dirname(relative_path)
|
|
165
|
+
parts = [part for part in parent_dir.split("/") if part and part != "."]
|
|
166
|
+
|
|
167
|
+
split = None
|
|
168
|
+
label = None
|
|
169
|
+
if parts:
|
|
170
|
+
first = parts[0].lower()
|
|
171
|
+
if first in {"train", "test", "val", "valid", "validation"}:
|
|
172
|
+
split = parts[0]
|
|
173
|
+
if len(parts) > 1:
|
|
174
|
+
label = parts[-1]
|
|
175
|
+
else:
|
|
176
|
+
label = parts[-1]
|
|
177
|
+
|
|
178
|
+
record: Dict[str, Any] = {
|
|
179
|
+
"id": index,
|
|
180
|
+
"image_path": os.path.abspath(full_path),
|
|
181
|
+
"relative_path": relative_path,
|
|
182
|
+
"file_name": os.path.basename(full_path),
|
|
183
|
+
"extension": os.path.splitext(full_path)[1].lower().lstrip("."),
|
|
184
|
+
}
|
|
185
|
+
if split:
|
|
186
|
+
record["split"] = split
|
|
187
|
+
if label:
|
|
188
|
+
record["label"] = label
|
|
189
|
+
return record
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSONL manifest (one record per image) under ``root``; returns its path."""
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    with open(manifest_path, "w", encoding="utf-8") as fh:
        for idx, image_path in enumerate(image_files):
            record = _infer_image_record(root, image_path, idx)
            fh.write(json.dumps(record, ensure_ascii=False))
            fh.write("\n")
    return manifest_path
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _pick_best_file(root: str) -> Dict[str, Any]:
|
|
139
201
|
candidates: List[str] = []
|
|
140
202
|
for base, _, files in os.walk(root):
|
|
141
203
|
for name in files:
|
|
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
|
|
|
145
207
|
candidates.append(full)
|
|
146
208
|
|
|
147
209
|
if not candidates:
|
|
210
|
+
image_files = _find_image_files(root)
|
|
211
|
+
if image_files:
|
|
212
|
+
manifest_path = _write_image_manifest(root, image_files)
|
|
213
|
+
return {
|
|
214
|
+
"local_path": manifest_path,
|
|
215
|
+
"dataset_kind": "image-manifest",
|
|
216
|
+
"image_count": len(image_files),
|
|
217
|
+
}
|
|
148
218
|
raise RuntimeError("No suitable data file found after download")
|
|
149
219
|
|
|
150
220
|
# prioritize common tabular formats
|
|
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
|
|
|
152
222
|
for ext in priorities:
|
|
153
223
|
for c in candidates:
|
|
154
224
|
if c.lower().endswith(ext):
|
|
155
|
-
return c
|
|
156
|
-
return candidates[0]
|
|
225
|
+
return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
|
|
226
|
+
return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
|
|
157
227
|
|
|
158
228
|
|
|
159
229
|
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
|
174
244
|
|
|
175
245
|
# unzip in place, remove zip for convenience
|
|
176
246
|
api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
|
|
177
|
-
|
|
247
|
+
artifact = _pick_best_file(target_dir)
|
|
178
248
|
return {
|
|
179
249
|
"ok": True,
|
|
180
250
|
"dataset_id": dataset_ref,
|
|
181
251
|
"target_dir": target_dir,
|
|
182
|
-
"local_path":
|
|
252
|
+
"local_path": artifact["local_path"],
|
|
253
|
+
"dataset_kind": artifact["dataset_kind"],
|
|
254
|
+
"image_count": artifact.get("image_count", 0),
|
|
183
255
|
}
|
|
184
256
|
except Exception as e:
|
|
185
257
|
msg = str(e)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize any supported dataset file to parquet format.
|
|
3
|
+
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
|
|
18
|
+
ext = os.path.splitext(src)[1].lower()
|
|
19
|
+
|
|
20
|
+
if ext == ".csv":
|
|
21
|
+
return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
22
|
+
if ext in (".tsv", ".tab"):
|
|
23
|
+
return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
|
|
24
|
+
if ext in (".parquet", ".pq"):
|
|
25
|
+
return pl.read_parquet(src)
|
|
26
|
+
if ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
27
|
+
return pl.read_ipc(src)
|
|
28
|
+
if ext in (".jsonl", ".ndjson"):
|
|
29
|
+
return pl.read_ndjson(src)
|
|
30
|
+
if ext == ".json":
|
|
31
|
+
raw = open(src, "r", encoding="utf-8").read().strip()
|
|
32
|
+
if raw.startswith("["):
|
|
33
|
+
return pl.read_json(src)
|
|
34
|
+
# Try NDJSON
|
|
35
|
+
if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
|
|
36
|
+
return pl.read_ndjson(src)
|
|
37
|
+
# Try wrapper object
|
|
38
|
+
obj = json.loads(raw)
|
|
39
|
+
if isinstance(obj, dict):
|
|
40
|
+
for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
|
|
41
|
+
if key in obj and isinstance(obj[key], list):
|
|
42
|
+
return pl.DataFrame(obj[key])
|
|
43
|
+
# Last resort - take first list value
|
|
44
|
+
for v in obj.values():
|
|
45
|
+
if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
|
|
46
|
+
return pl.DataFrame(v)
|
|
47
|
+
return pl.read_json(src)
|
|
48
|
+
if ext == ".txt":
|
|
49
|
+
return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
50
|
+
|
|
51
|
+
# Fallback: try csv
|
|
52
|
+
return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize(input_path: str, output_path: str):
|
|
56
|
+
df = _load(input_path)
|
|
57
|
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
58
|
+
df.write_parquet(output_path)
|
|
59
|
+
return df.height
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
|
|
63
|
+
if len(sys.argv) < 3:
|
|
64
|
+
print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
|
|
65
|
+
sys.exit(1)
|
|
66
|
+
|
|
67
|
+
input_path = sys.argv[1]
|
|
68
|
+
output_path = sys.argv[2]
|
|
69
|
+
|
|
70
|
+
if not os.path.exists(input_path):
|
|
71
|
+
print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
|
|
72
|
+
sys.exit(1)
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
rows = normalize(input_path, output_path)
|
|
76
|
+
print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(json.dumps({"ok": False, "error": str(e)}))
|
|
79
|
+
sys.exit(1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
main()
|
|
@@ -191,6 +191,7 @@ class AssetDownloader:
|
|
|
191
191
|
kaggle_ref: Optional[str] = None,
|
|
192
192
|
urls: Optional[List[str]] = None,
|
|
193
193
|
output_format: str = "webdataset",
|
|
194
|
+
output_dir: Optional[str] = None,
|
|
194
195
|
max_items: Optional[int] = None,
|
|
195
196
|
image_column: Optional[str] = None,
|
|
196
197
|
) -> Dict[str, Any]:
|
|
@@ -231,7 +232,10 @@ class AssetDownloader:
|
|
|
231
232
|
raise ValueError("urls are required for source=url")
|
|
232
233
|
|
|
233
234
|
# --- Now safe to create directories ---
|
|
234
|
-
|
|
235
|
+
if output_dir:
|
|
236
|
+
dataset_dir = Path(output_dir).expanduser().resolve()
|
|
237
|
+
else:
|
|
238
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
235
239
|
images_dir = dataset_dir / "images"
|
|
236
240
|
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
237
241
|
images_dir.mkdir(parents=True, exist_ok=True)
|