@vespermcp/mcp-server 1.2.21 → 1.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/README.md +49 -0
  2. package/build/cloud/adapters/supabase.js +49 -0
  3. package/build/cloud/storage-manager.js +6 -0
  4. package/build/export/exporter.js +22 -9
  5. package/build/gateway/unified-dataset-gateway.js +410 -0
  6. package/build/index.js +1587 -845
  7. package/build/ingestion/ingestor.js +7 -4
  8. package/build/install/install-service.js +11 -6
  9. package/build/lib/supabase.js +3 -0
  10. package/build/metadata/scraper.js +85 -14
  11. package/build/python/asset_downloader_engine.py +2 -0
  12. package/build/python/convert_engine.py +92 -0
  13. package/build/python/export_engine.py +45 -0
  14. package/build/python/kaggle_engine.py +77 -5
  15. package/build/python/normalize_engine.py +83 -0
  16. package/build/python/vesper/core/asset_downloader.py +5 -1
  17. package/build/search/engine.js +43 -5
  18. package/build/search/jit-orchestrator.js +18 -14
  19. package/build/search/query-intent.js +509 -0
  20. package/build/tools/formatter.js +6 -3
  21. package/build/utils/python-runtime.js +130 -0
  22. package/package.json +7 -5
  23. package/scripts/postinstall.cjs +87 -31
  24. package/scripts/wizard.cjs +601 -0
  25. package/scripts/wizard.js +306 -12
  26. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  27. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  28. package/src/python/asset_downloader_engine.py +2 -0
  29. package/src/python/convert_engine.py +92 -0
  30. package/src/python/export_engine.py +45 -0
  31. package/src/python/kaggle_engine.py +77 -5
  32. package/src/python/normalize_engine.py +83 -0
  33. package/src/python/requirements.txt +12 -0
  34. package/src/python/vesper/core/asset_downloader.py +5 -1
  35. package/wizard.cjs +3 -0
@@ -46,6 +46,9 @@ export class DataIngestor {
46
46
  getKaggleCredentialError() {
47
47
  return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
48
48
  }
49
+ toSafeDatasetPath(datasetId) {
50
+ return datasetId.replace(/[:\/]/g, "_");
51
+ }
49
52
  /**
50
53
  * Ensures a dataset is available locally
51
54
  */
@@ -115,7 +118,7 @@ export class DataIngestor {
115
118
  this.failDownload(datasetId, errorMsg);
116
119
  throw new Error(errorMsg);
117
120
  }
118
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/\//g, "_"));
121
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
119
122
  this.store.registerDownload(datasetId, targetDir, "downloading");
120
123
  try {
121
124
  onProgress?.("Downloading from Kaggle...");
@@ -131,7 +134,7 @@ export class DataIngestor {
131
134
  }
132
135
  }
133
136
  else if (source === "openml") {
134
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/:/g, "_"));
137
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
135
138
  this.store.registerDownload(datasetId, targetDir, "downloading");
136
139
  try {
137
140
  onProgress?.("Downloading from OpenML...");
@@ -147,7 +150,7 @@ export class DataIngestor {
147
150
  }
148
151
  }
149
152
  else if (source === "dataworld") {
150
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/[:\/]/g, "_"));
153
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
151
154
  this.store.registerDownload(datasetId, targetDir, "downloading");
152
155
  try {
153
156
  onProgress?.("Downloading from data.world...");
@@ -181,7 +184,7 @@ export class DataIngestor {
181
184
  * Generates a safe local filename for a dataset ID
182
185
  */
183
186
  getTargetPath(datasetId, extension = "parquet") {
184
- const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
187
+ const safeId = this.toSafeDatasetPath(datasetId);
185
188
  return path.join(this.rawDataDir, `${safeId}.${extension}`);
186
189
  }
187
190
  /**
@@ -18,12 +18,15 @@ export class InstallService {
18
18
  throw new Error(`Source file not found for installation: ${sourcePath}`);
19
19
  }
20
20
  const dataset = this.metadataStore.getDataset(datasetId);
21
- if (!dataset) {
22
- throw new Error(`Dataset metadata not found for ${datasetId}`);
23
- }
24
21
  // Create target directory
25
- const sanitizedName = dataset.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
26
- const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
22
+ const installLabel = dataset?.name || datasetId;
23
+ const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
+ // If caller specified a target dir, use it directly
25
+ // Otherwise use the current working directory
26
+ const installDir = targetDir
27
+ ? path.resolve(targetDir)
28
+ : path.resolve(process.cwd(), sanitizedName);
29
+ console.error(`[InstallService] Resolved install directory: ${installDir}`);
27
30
  if (!fs.existsSync(installDir)) {
28
31
  fs.mkdirSync(installDir, { recursive: true });
29
32
  }
@@ -34,7 +37,9 @@ export class InstallService {
34
37
  fs.copyFileSync(sourcePath, targetPath);
35
38
  // Update metadata
36
39
  const absolutePath = path.resolve(targetPath);
37
- this.metadataStore.updateInstallPath(datasetId, absolutePath);
40
+ if (dataset) {
41
+ this.metadataStore.updateInstallPath(datasetId, absolutePath);
42
+ }
38
43
  console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
39
44
  return absolutePath;
40
45
  }
@@ -0,0 +1,3 @@
1
import { createClient } from '@supabase/supabase-js';

// For MCP, use the service_role key, not the anon key.
const supabaseUrl = process.env.SUPABASE_URL;
const serviceRoleKey = process.env.SUPABASE_SERVICE_ROLE_KEY;

// Fail fast with an actionable message instead of the opaque error that
// createClient throws when handed undefined values.
if (!supabaseUrl || !serviceRoleKey) {
  throw new Error('Supabase client requires SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY environment variables');
}

export const supabase = createClient(supabaseUrl, serviceRoleKey);
@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
6
7
  export class HuggingFaceScraper {
7
8
  /**
8
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
9
10
  * Hits the 25k target in minutes.
10
11
  */
11
- async scrapeBulk(limit = 1000, query) {
12
+ async scrapeBulk(limit = 1000, queryOrIntent) {
13
+ const intent = typeof queryOrIntent === "string"
14
+ ? await analyzeDatasetQuery(queryOrIntent)
15
+ : queryOrIntent;
16
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
17
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
12
18
  const filterMsg = query ? `, query: ${query}` : "";
13
19
  console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
14
20
  const results = [];
15
21
  let processed = 0;
16
22
  try {
17
23
  const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
24
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
18
25
  for await (const ds of listDatasets({
19
26
  limit: limit,
20
27
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
21
- search: { query: query },
28
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
22
29
  ...(hfToken ? { accessToken: hfToken } : {})
23
30
  })) {
24
31
  if (results.length >= limit)
@@ -78,6 +85,9 @@ export class HuggingFaceScraper {
78
85
  has_readme: false,
79
86
  is_incomplete: true // Flag for Phase 2
80
87
  };
88
+ // Hard language exclusion
89
+ if (intent && shouldExcludeByLanguage(metadata, intent))
90
+ continue;
81
91
  results.push(metadata);
82
92
  }
83
93
  }
@@ -86,8 +96,12 @@ export class HuggingFaceScraper {
86
96
  }
87
97
  return results;
88
98
  }
89
- async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
90
- ) {
99
+ async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
100
+ const intent = typeof queryOrIntent === "string"
101
+ ? await analyzeDatasetQuery(queryOrIntent)
102
+ : queryOrIntent;
103
+ const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
104
+ const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
91
105
  const filterMsg = query ? `, query: ${query}` : "";
92
106
  console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
93
107
  const results = [];
@@ -110,10 +124,11 @@ export class HuggingFaceScraper {
110
124
  }
111
125
  // Add delay between batches to avoid rate limits
112
126
  const BATCH_DELAY = hfToken ? 500 : 2000;
127
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
113
128
  for await (const ds of listDatasets({
114
129
  limit: fetchLimit,
115
130
  additionalFields: ["description", "tags"],
116
- search: { query: query },
131
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
117
132
  ...(hfToken ? { accessToken: hfToken } : {})
118
133
  })) {
119
134
  if (results.length >= limit)
@@ -150,18 +165,61 @@ export class HuggingFaceScraper {
150
165
  initialDelay: 2000, // Start with 2 seconds for HF API
151
166
  maxDelay: 30000 // Max 30 seconds
152
167
  });
153
- const splits = fullInfo.splits?.map((s) => ({
168
+ const cardData = fullInfo.cardData || {};
169
+ // Extract splits from cardData.dataset_info (where HF actually stores them)
170
+ // cardData.dataset_info can be an object (single config) or array (multi-config)
171
+ let rawSplits = [];
172
+ const datasetInfoField = cardData.dataset_info;
173
+ if (datasetInfoField) {
174
+ const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
175
+ for (const config of configs) {
176
+ if (config?.splits && Array.isArray(config.splits)) {
177
+ rawSplits = rawSplits.concat(config.splits);
178
+ }
179
+ }
180
+ }
181
+ // Fallback: try top-level splits from the SDK (rarely populated)
182
+ if (rawSplits.length === 0 && fullInfo.splits) {
183
+ rawSplits = fullInfo.splits;
184
+ }
185
+ const splits = rawSplits.map((s) => ({
154
186
  name: s.name,
155
- num_examples: s.numExamples || 0,
156
- size_bytes: s.sizeBytes
157
- })) || [];
158
- const totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
187
+ num_examples: s.num_examples || s.numExamples || 0,
188
+ size_bytes: s.num_bytes || s.sizeBytes || 0
189
+ }));
190
+ let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
159
191
  const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
192
+ // Fallback: estimate from size_categories when splits give 0
193
+ if (totalExamples === 0) {
194
+ const sizeCategories = cardData.size_categories;
195
+ if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
196
+ const cat = sizeCategories[0];
197
+ const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
198
+ if (rangeMatch) {
199
+ const parseHumanNum = (s) => {
200
+ const m = s.match(/^([\d.]+)([KMB])?$/i);
201
+ if (!m)
202
+ return 0;
203
+ const base = parseFloat(m[1]);
204
+ const suffix = (m[2] || '').toUpperCase();
205
+ if (suffix === 'K')
206
+ return base * 1000;
207
+ if (suffix === 'M')
208
+ return base * 1_000_000;
209
+ if (suffix === 'B')
210
+ return base * 1_000_000_000;
211
+ return base;
212
+ };
213
+ const lo = parseHumanNum(rangeMatch[1]);
214
+ const hi = parseHumanNum(rangeMatch[2]);
215
+ totalExamples = Math.round((lo + hi) / 2);
216
+ }
217
+ }
218
+ }
160
219
  const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
161
220
  const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
162
221
  const licenseTag = tags.find(t => t.startsWith("license:"));
163
222
  const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
164
- const cardData = fullInfo.cardData || {};
165
223
  const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
166
224
  const license = categorizeLicense(licenseId, licenseUrl);
167
225
  if (license.category === "restricted") {
@@ -247,7 +305,16 @@ export class HuggingFaceScraper {
247
305
  description_length: description.length,
248
306
  has_readme: !!(cardData.readme || cardData.readme_content)
249
307
  };
250
- results.push(metadata);
308
+ // Hard language exclusion — drop bilingual/multilingual for single-language queries
309
+ if (intent && shouldExcludeByLanguage(metadata, intent)) {
310
+ // skip — do not push
311
+ }
312
+ else {
313
+ if (intent) {
314
+ metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
315
+ }
316
+ results.push(metadata);
317
+ }
251
318
  }
252
319
  catch (e) {
253
320
  // Track all errors for user feedback
@@ -297,8 +364,12 @@ export class HuggingFaceScraper {
297
364
  if (otherErrors > 0) {
298
365
  console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
299
366
  }
300
- // Sort by downloads descending
301
- return results.sort((a, b) => b.downloads - a.downloads);
367
+ return results.sort((a, b) => {
368
+ const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
369
+ if (intentDelta !== 0)
370
+ return intentDelta;
371
+ return b.downloads - a.downloads;
372
+ });
302
373
  }
303
374
  extractTask(tags) {
304
375
  const taskTags = [
@@ -26,6 +26,7 @@ def _print(payload: Dict[str, Any]) -> None:
26
26
  async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
27
27
  payload = json.loads(args.payload)
28
28
  output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
29
+ output_dir = payload.get("output_dir")
29
30
  workers = int(payload.get("workers") or 8)
30
31
  recipes_dir = payload.get("recipes_dir")
31
32
 
@@ -43,6 +44,7 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
43
44
  kaggle_ref=payload.get("kaggle_ref"),
44
45
  urls=payload.get("urls"),
45
46
  output_format=payload.get("output_format", "webdataset"),
47
+ output_dir=str(output_dir) if output_dir else None,
46
48
  max_items=payload.get("max_items"),
47
49
  image_column=payload.get("image_column"),
48
50
  )
@@ -0,0 +1,92 @@
1
"""
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
Usage: convert_engine.py <input_path> <output_path>
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
"""
import sys
import json
import os

try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
    sys.exit(1)


def _load(src: str) -> pl.DataFrame:
    """Read `src` into a DataFrame, dispatching on file extension.

    JSON files get extra sniffing: array-of-objects, NDJSON, or a wrapper
    object holding the rows under a common key. Unknown extensions fall
    back to a lenient CSV parse.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager closes the handle deterministically (original
        # left the file object to be garbage-collected).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        obj = json.loads(raw)
        if isinstance(obj, dict):
            # Common wrapper patterns: {"data": [...]}, {"rows": [...]}, ...
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort: first list-of-dicts value found in the object.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)


def _write(df: pl.DataFrame, dst: str) -> None:
    """Write `df` to `dst`, dispatching on extension.

    Raises ValueError for unsupported output formats.
    """
    ext = os.path.splitext(dst)[1].lower()
    # dirname is "" for bare filenames; makedirs("") raises, so default to ".".
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        # BUG FIX: polars >= 1.0 removed the row_oriented parameter
        # (write_json is row-oriented by default there); polars < 1.0
        # still requires it for row-oriented output. Support both.
        try:
            df.write_json(dst, row_oriented=True)
        except TypeError:
            df.write_json(dst)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")


def main():
    """CLI entry point: convert argv[1] to argv[2], print a JSON result line."""
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        df = _load(input_path)
        _write(df, output_path)
        size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": size_mb,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
50
50
  df = pl.read_ipc(file_path)
51
51
  elif ext == ".jsonl":
52
52
  df = pl.read_ndjson(file_path)
53
+ elif ext == ".json":
54
+ # Auto-detect: array-of-objects vs NDJSON vs nested structures
55
+ try:
56
+ import json as _json
57
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
58
+ raw_text = fh.read(512) # peek
59
+ stripped = raw_text.lstrip()
60
+ if stripped.startswith("["):
61
+ # Array of objects — standard JSON
62
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
63
+ data = _json.load(fh)
64
+ if isinstance(data, list) and len(data) > 0:
65
+ df = pl.DataFrame(data)
66
+ else:
67
+ raise ValueError("JSON file is empty or not an array of objects")
68
+ elif stripped.startswith("{"):
69
+ # Could be NDJSON or a single object wrapping rows
70
+ try:
71
+ df = pl.read_ndjson(file_path)
72
+ except Exception:
73
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
74
+ data = _json.load(fh)
75
+ # Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
76
+ rows = None
77
+ if isinstance(data, dict):
78
+ for key in ("data", "rows", "records", "items", "results", "entries"):
79
+ if key in data and isinstance(data[key], list):
80
+ rows = data[key]
81
+ break
82
+ if rows is None:
83
+ # Last resort: try to use the dict values
84
+ rows = [data]
85
+ if rows and len(rows) > 0:
86
+ df = pl.DataFrame(rows)
87
+ else:
88
+ raise ValueError("Could not parse JSON structure into tabular data")
89
+ else:
90
+ raise ValueError("JSON file does not start with [ or {")
91
+ except pl.exceptions.ComputeError as ce:
92
+ raise ValueError(f"Failed to parse JSON: {ce}")
93
+ elif ext == ".xlsx":
94
+ try:
95
+ df = pl.read_excel(file_path)
96
+ except Exception as e:
97
+ raise ValueError(f"Failed to read Excel file: {e}")
53
98
  else:
54
99
  raise ValueError(f"Unsupported input format: {ext}")
55
100
 
@@ -12,6 +12,19 @@ except Exception:
12
12
  HAS_KAGGLE = False
13
13
 
14
14
 
15
+ IMAGE_EXTENSIONS = {
16
+ ".jpg",
17
+ ".jpeg",
18
+ ".png",
19
+ ".webp",
20
+ ".bmp",
21
+ ".gif",
22
+ ".tiff",
23
+ ".tif",
24
+ ".svg",
25
+ }
26
+
27
+
15
28
  def _ensure_auth() -> Dict[str, Any]:
16
29
  if not HAS_KAGGLE:
17
30
  return {
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
135
148
  return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
136
149
 
137
150
 
138
- def _pick_best_file(root: str) -> str:
151
def _find_image_files(root: str) -> List[str]:
    """Recursively collect image files under `root`, sorted for determinism.

    A file counts as an image when its extension (lowercased) appears in
    IMAGE_EXTENSIONS. The extension check happens before the path join, so
    non-image files incur no path construction.
    """
    image_files: List[str] = []
    for base, _, files in os.walk(root):
        for name in files:
            if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
                image_files.append(os.path.join(base, name))
    image_files.sort()
    return image_files
160
+
161
+
162
+ def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
163
+ relative_path = os.path.relpath(full_path, root).replace("\\", "/")
164
+ parent_dir = os.path.dirname(relative_path)
165
+ parts = [part for part in parent_dir.split("/") if part and part != "."]
166
+
167
+ split = None
168
+ label = None
169
+ if parts:
170
+ first = parts[0].lower()
171
+ if first in {"train", "test", "val", "valid", "validation"}:
172
+ split = parts[0]
173
+ if len(parts) > 1:
174
+ label = parts[-1]
175
+ else:
176
+ label = parts[-1]
177
+
178
+ record: Dict[str, Any] = {
179
+ "id": index,
180
+ "image_path": os.path.abspath(full_path),
181
+ "relative_path": relative_path,
182
+ "file_name": os.path.basename(full_path),
183
+ "extension": os.path.splitext(full_path)[1].lower().lstrip("."),
184
+ }
185
+ if split:
186
+ record["split"] = split
187
+ if label:
188
+ record["label"] = label
189
+ return record
190
+
191
+
192
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSONL manifest (one record per image) under `root`.

    Returns the path of the manifest file: <root>/_vesper_image_manifest.jsonl
    """
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    lines = (
        json.dumps(_infer_image_record(root, path, idx), ensure_ascii=False) + "\n"
        for idx, path in enumerate(image_files)
    )
    with open(manifest_path, "w", encoding="utf-8") as handle:
        handle.writelines(lines)
    return manifest_path
198
+
199
+
200
+ def _pick_best_file(root: str) -> Dict[str, Any]:
139
201
  candidates: List[str] = []
140
202
  for base, _, files in os.walk(root):
141
203
  for name in files:
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
145
207
  candidates.append(full)
146
208
 
147
209
  if not candidates:
210
+ image_files = _find_image_files(root)
211
+ if image_files:
212
+ manifest_path = _write_image_manifest(root, image_files)
213
+ return {
214
+ "local_path": manifest_path,
215
+ "dataset_kind": "image-manifest",
216
+ "image_count": len(image_files),
217
+ }
148
218
  raise RuntimeError("No suitable data file found after download")
149
219
 
150
220
  # prioritize common tabular formats
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
152
222
  for ext in priorities:
153
223
  for c in candidates:
154
224
  if c.lower().endswith(ext):
155
- return c
156
- return candidates[0]
225
+ return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
226
+ return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
157
227
 
158
228
 
159
229
  def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
174
244
 
175
245
  # unzip in place, remove zip for convenience
176
246
  api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
177
- best_file = _pick_best_file(target_dir)
247
+ artifact = _pick_best_file(target_dir)
178
248
  return {
179
249
  "ok": True,
180
250
  "dataset_id": dataset_ref,
181
251
  "target_dir": target_dir,
182
- "local_path": best_file,
252
+ "local_path": artifact["local_path"],
253
+ "dataset_kind": artifact["dataset_kind"],
254
+ "image_count": artifact.get("image_count", 0),
183
255
  }
184
256
  except Exception as e:
185
257
  msg = str(e)
@@ -0,0 +1,83 @@
1
"""
Normalize any supported dataset file to parquet format.
Usage: normalize_engine.py <input_path> <output_path>
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
"""
import sys
import json
import os

try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required"}))
    sys.exit(1)


def _load(src: str) -> pl.DataFrame:
    """Read `src` into a DataFrame based on its extension.

    JSON gets structure sniffing (array, NDJSON, wrapper object); unknown
    extensions fall back to a lenient CSV parse.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager closes the handle deterministically (original
        # left the file object to be garbage-collected).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Try NDJSON
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        # Try wrapper object
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take first list value
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)


def normalize(input_path: str, output_path: str):
    """Load `input_path` and write it as parquet at `output_path`.

    Returns the row count of the normalized frame.
    """
    df = _load(input_path)
    # BUG FIX: os.path.dirname() returns "" for a bare filename and
    # os.makedirs("") raises FileNotFoundError — default to "." (this
    # matches the guard convert_engine's _write already uses).
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    df.write_parquet(output_path)
    return df.height


def main():
    """CLI entry point: normalize argv[1] into argv[2], print a JSON result line."""
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        rows = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -191,6 +191,7 @@ class AssetDownloader:
191
191
  kaggle_ref: Optional[str] = None,
192
192
  urls: Optional[List[str]] = None,
193
193
  output_format: str = "webdataset",
194
+ output_dir: Optional[str] = None,
194
195
  max_items: Optional[int] = None,
195
196
  image_column: Optional[str] = None,
196
197
  ) -> Dict[str, Any]:
@@ -231,7 +232,10 @@ class AssetDownloader:
231
232
  raise ValueError("urls are required for source=url")
232
233
 
233
234
  # --- Now safe to create directories ---
234
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
+ if output_dir:
236
+ dataset_dir = Path(output_dir).expanduser().resolve()
237
+ else:
238
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
239
  images_dir = dataset_dir / "images"
236
240
  dataset_dir.mkdir(parents=True, exist_ok=True)
237
241
  images_dir.mkdir(parents=True, exist_ok=True)