npm - @vespermcp/mcp-server - Versions diffs - 1.2.20 → 1.2.22 - Mend

@vespermcp/mcp-server 1.2.20 → 1.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/README.md +49 -0
package/build/cloud/adapters/supabase.js +49 -0
package/build/cloud/storage-manager.js +6 -0
package/build/export/exporter.js +22 -9
package/build/gateway/unified-dataset-gateway.js +410 -0
package/build/index.js +1592 -837
package/build/ingestion/hf-downloader.js +12 -2
package/build/ingestion/ingestor.js +19 -9
package/build/install/install-service.js +11 -6
package/build/lib/supabase.js +3 -0
package/build/metadata/scraper.js +85 -14
package/build/python/asset_downloader_engine.py +22 -1
package/build/python/convert_engine.py +92 -0
package/build/python/export_engine.py +45 -0
package/build/python/hf_fallback.py +196 -45
package/build/python/kaggle_engine.py +77 -5
package/build/python/normalize_engine.py +83 -0
package/build/python/vesper/core/asset_downloader.py +238 -48
package/build/search/engine.js +43 -5
package/build/search/jit-orchestrator.js +18 -14
package/build/search/query-intent.js +509 -0
package/build/tools/formatter.js +6 -3
package/build/utils/python-runtime.js +130 -0
package/package.json +7 -5
package/scripts/postinstall.cjs +87 -31
package/scripts/wizard.cjs +601 -0
package/scripts/wizard.js +306 -12
package/src/python/__pycache__/config.cpython-312.pyc +0 -0
package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
package/src/python/asset_downloader_engine.py +22 -1
package/src/python/convert_engine.py +92 -0
package/src/python/export_engine.py +45 -0
package/src/python/hf_fallback.py +196 -45
package/src/python/kaggle_engine.py +77 -5
package/src/python/normalize_engine.py +83 -0
package/src/python/requirements.txt +12 -0
package/src/python/vesper/core/asset_downloader.py +238 -48
package/wizard.cjs +3 -0

package/build/ingestion/hf-downloader.js CHANGED

@@ -94,8 +94,18 @@ export class HFDownloader {
         }
         catch (error) {
             const msg = String(error?.message || error);
-            if (msg.includes("401") || msg.includes("403") || msg.toLowerCase().includes("unauthorized")) {
-                throw new Error("Hugging Face gated/private dataset requires token. Run 'vespermcp config keys' to set HF token.");
+            if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
+                throw new Error(`Authentication required for dataset '${repoId}'. ` +
+                    `This dataset may be gated or private. ` +
+                    `Use the configure_keys tool to set your HF_TOKEN, then retry.`);
+            }
+            if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
+                throw new Error(`Access denied for dataset '${repoId}'. ` +
+                    `You may need to accept the dataset's usage agreement on huggingface.co, ` +
+                    `then set HF_TOKEN via configure_keys tool.`);
+            }
+            if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
+                throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
             }
             console.error(`[HF] Failed to list files for ${repoId}:`, msg);
             return null;

package/build/ingestion/ingestor.js CHANGED

@@ -46,6 +46,9 @@ export class DataIngestor {
     getKaggleCredentialError() {
         return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
     }
+    toSafeDatasetPath(datasetId) {
+        return datasetId.replace(/[:\/]/g, "_");
+    }
     /**
      * Ensures a dataset is available locally
      */
@@ -81,13 +84,20 @@ export class DataIngestor {
                     return resolvedPath;
                 }
                 catch (e) {
-                    this.failDownload(datasetId, e.message);
-                    throw e;
+                    const msg = String(e?.message || e);
+                    // If auth error, propagate immediately with helpful message
+                    if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
+                        this.failDownload(datasetId, msg);
+                        throw e;
+                    }
+                    // For other download errors, try the fallback
+                    onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
                 }
             }
-            else {
-                // Fallback: Use Python datasets library to download and convert
-                onProgress?.("No raw files found. Using HuggingFace datasets library to download...");
+            // Fallback: Use Python datasets library to download and convert
+            // This runs when findBestFile returns null OR when direct download fails (non-auth)
+            if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
+                onProgress?.("Using HuggingFace datasets library to download...");
                 const targetPath = this.getTargetPath(datasetId, "parquet");
                 this.store.registerDownload(datasetId, targetPath, "downloading");
                 try {
@@ -108,7 +118,7 @@ export class DataIngestor {
                 this.failDownload(datasetId, errorMsg);
                 throw new Error(errorMsg);
             }
-            const targetDir = path.join(this.rawDataDir, datasetId.replace(/\//g, "_"));
+            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
             this.store.registerDownload(datasetId, targetDir, "downloading");
             try {
                 onProgress?.("Downloading from Kaggle...");
@@ -124,7 +134,7 @@ export class DataIngestor {
             }
         }
         else if (source === "openml") {
-            const targetDir = path.join(this.rawDataDir, datasetId.replace(/:/g, "_"));
+            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
             this.store.registerDownload(datasetId, targetDir, "downloading");
             try {
                 onProgress?.("Downloading from OpenML...");
@@ -140,7 +150,7 @@ export class DataIngestor {
             }
         }
         else if (source === "dataworld") {
-            const targetDir = path.join(this.rawDataDir, datasetId.replace(/[:\/]/g, "_"));
+            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
             this.store.registerDownload(datasetId, targetDir, "downloading");
             try {
                 onProgress?.("Downloading from data.world...");
@@ -174,7 +184,7 @@ export class DataIngestor {
      * Generates a safe local filename for a dataset ID
      */
     getTargetPath(datasetId, extension = "parquet") {
-        const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
+        const safeId = this.toSafeDatasetPath(datasetId);
         return path.join(this.rawDataDir, `${safeId}.${extension}`);
     }
     /**

package/build/install/install-service.js CHANGED

@@ -18,12 +18,15 @@ export class InstallService {
             throw new Error(`Source file not found for installation: ${sourcePath}`);
         }
         const dataset = this.metadataStore.getDataset(datasetId);
-        if (!dataset) {
-            throw new Error(`Dataset metadata not found for ${datasetId}`);
-        }
         // Create target directory
-        const sanitizedName = dataset.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
-        const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
+        const installLabel = dataset?.name || datasetId;
+        const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
+        // If caller specified a target dir, use it directly
+        // Otherwise use the current working directory
+        const installDir = targetDir
+            ? path.resolve(targetDir)
+            : path.resolve(process.cwd(), sanitizedName);
+        console.error(`[InstallService] Resolved install directory: ${installDir}`);
         if (!fs.existsSync(installDir)) {
             fs.mkdirSync(installDir, { recursive: true });
         }
@@ -34,7 +37,9 @@ export class InstallService {
         fs.copyFileSync(sourcePath, targetPath);
         // Update metadata
         const absolutePath = path.resolve(targetPath);
-        this.metadataStore.updateInstallPath(datasetId, absolutePath);
+        if (dataset) {
+            this.metadataStore.updateInstallPath(datasetId, absolutePath);
+        }
         console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
         return absolutePath;
     }

package/build/lib/supabase.js ADDED

@@ -0,0 +1,3 @@
+import { createClient } from '@supabase/supabase-js';
+export const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY // for MCP, use service_role, not anon
+);

package/build/metadata/scraper.js CHANGED

@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
 import { calculateQualityScore } from "./quality.js";
 import { classifyDomain } from "./domain.js";
 import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
+import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
 export class HuggingFaceScraper {
     /**
      * Bulk discovery: Fetch many datasets quickly without deep details.
      * Hits the 25k target in minutes.
      */
-    async scrapeBulk(limit = 1000, query) {
+    async scrapeBulk(limit = 1000, queryOrIntent) {
+        const intent = typeof queryOrIntent === "string"
+            ? await analyzeDatasetQuery(queryOrIntent)
+            : queryOrIntent;
+        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
+        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
         const filterMsg = query ? `, query: ${query}` : "";
         console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
         const results = [];
         let processed = 0;
         try {
             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: limit,
                 additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
-                search: { query: query },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)
@@ -78,6 +85,9 @@ export class HuggingFaceScraper {
                     has_readme: false,
                     is_incomplete: true // Flag for Phase 2
                 };
+                // Hard language exclusion
+                if (intent && shouldExcludeByLanguage(metadata, intent))
+                    continue;
                 results.push(metadata);
             }
         }
@@ -86,8 +96,12 @@ export class HuggingFaceScraper {
         }
         return results;
     }
-    async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
-    ) {
+    async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
+        const intent = typeof queryOrIntent === "string"
+            ? await analyzeDatasetQuery(queryOrIntent)
+            : queryOrIntent;
+        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
+        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
         const filterMsg = query ? `, query: ${query}` : "";
         console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
         const results = [];
@@ -110,10 +124,11 @@ export class HuggingFaceScraper {
             }
             // Add delay between batches to avoid rate limits
             const BATCH_DELAY = hfToken ? 500 : 2000;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: fetchLimit,
                 additionalFields: ["description", "tags"],
-                search: { query: query },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)
@@ -150,18 +165,61 @@ export class HuggingFaceScraper {
                             initialDelay: 2000, // Start with 2 seconds for HF API
                             maxDelay: 30000 // Max 30 seconds
                         });
-                        const splits = fullInfo.splits?.map((s) => ({
+                        const cardData = fullInfo.cardData || {};
+                        // Extract splits from cardData.dataset_info (where HF actually stores them)
+                        // cardData.dataset_info can be an object (single config) or array (multi-config)
+                        let rawSplits = [];
+                        const datasetInfoField = cardData.dataset_info;
+                        if (datasetInfoField) {
+                            const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
+                            for (const config of configs) {
+                                if (config?.splits && Array.isArray(config.splits)) {
+                                    rawSplits = rawSplits.concat(config.splits);
+                                }
+                            }
+                        }
+                        // Fallback: try top-level splits from the SDK (rarely populated)
+                        if (rawSplits.length === 0 && fullInfo.splits) {
+                            rawSplits = fullInfo.splits;
+                        }
+                        const splits = rawSplits.map((s) => ({
                             name: s.name,
-                            num_examples: s.numExamples || 0,
-                            size_bytes: s.sizeBytes
-                        })) || [];
-                        const totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
+                            num_examples: s.num_examples || s.numExamples || 0,
+                            size_bytes: s.num_bytes || s.sizeBytes || 0
+                        }));
+                        let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
                         const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
+                        // Fallback: estimate from size_categories when splits give 0
+                        if (totalExamples === 0) {
+                            const sizeCategories = cardData.size_categories;
+                            if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
+                                const cat = sizeCategories[0];
+                                const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
+                                if (rangeMatch) {
+                                    const parseHumanNum = (s) => {
+                                        const m = s.match(/^([\d.]+)([KMB])?$/i);
+                                        if (!m)
+                                            return 0;
+                                        const base = parseFloat(m[1]);
+                                        const suffix = (m[2] || '').toUpperCase();
+                                        if (suffix === 'K')
+                                            return base * 1000;
+                                        if (suffix === 'M')
+                                            return base * 1_000_000;
+                                        if (suffix === 'B')
+                                            return base * 1_000_000_000;
+                                        return base;
+                                    };
+                                    const lo = parseHumanNum(rangeMatch[1]);
+                                    const hi = parseHumanNum(rangeMatch[2]);
+                                    totalExamples = Math.round((lo + hi) / 2);
+                                }
+                            }
+                        }
                         const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
                         const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
                         const licenseTag = tags.find(t => t.startsWith("license:"));
                         const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
-                        const cardData = fullInfo.cardData || {};
                         const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
                         const license = categorizeLicense(licenseId, licenseUrl);
                         if (license.category === "restricted") {
@@ -247,7 +305,16 @@ export class HuggingFaceScraper {
                             description_length: description.length,
                             has_readme: !!(cardData.readme || cardData.readme_content)
                         };
-                        results.push(metadata);
+                        // Hard language exclusion — drop bilingual/multilingual for single-language queries
+                        if (intent && shouldExcludeByLanguage(metadata, intent)) {
+                            // skip — do not push
+                        }
+                        else {
+                            if (intent) {
+                                metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
+                            }
+                            results.push(metadata);
+                        }
                     }
                     catch (e) {
                         // Track all errors for user feedback
@@ -297,8 +364,12 @@ export class HuggingFaceScraper {
         if (otherErrors > 0) {
             console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
         }
-        // Sort by downloads descending
-        return results.sort((a, b) => b.downloads - a.downloads);
+        return results.sort((a, b) => {
+            const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
+            if (intentDelta !== 0)
+                return intentDelta;
+            return b.downloads - a.downloads;
+        });
     }
     extractTask(tags) {
         const taskTags = [

package/build/python/asset_downloader_engine.py CHANGED

@@ -3,9 +3,14 @@ import asyncio
 import json
 import os
 import sys
+import warnings
 from pathlib import Path
 from typing import Any, Dict
+# Suppress noisy HF warnings
+warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
+warnings.filterwarnings("ignore", message=".*legacy.*")
 CURRENT_DIR = Path(__file__).resolve().parent
 if str(CURRENT_DIR) not in sys.path:
     sys.path.insert(0, str(CURRENT_DIR))
@@ -21,9 +26,15 @@ def _print(payload: Dict[str, Any]) -> None:
 async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
     payload = json.loads(args.payload)
     output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
+    output_dir = payload.get("output_dir")
     workers = int(payload.get("workers") or 8)
     recipes_dir = payload.get("recipes_dir")
+    # Auto-set HF token from payload if provided
+    token = payload.get("token") or payload.get("hf_token")
+    if token:
+        os.environ["HF_TOKEN"] = str(token)
     downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
     result = await downloader.download_assets(
@@ -33,6 +44,7 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
         kaggle_ref=payload.get("kaggle_ref"),
         urls=payload.get("urls"),
         output_format=payload.get("output_format", "webdataset"),
+        output_dir=str(output_dir) if output_dir else None,
         max_items=payload.get("max_items"),
         image_column=payload.get("image_column"),
     )
@@ -66,7 +78,16 @@ def main() -> None:
         _print({"ok": False, "error": f"Unknown action: {args.action}"})
     except Exception as e:
-        _print({"ok": False, "error": str(e)})
+        error_msg = str(e)
+        # Provide actionable error messages
+        if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
+            error_msg = (
+                "Authentication required. This dataset may be gated/private. "
+                "Use configure_keys tool to set HF_TOKEN, then retry."
+            )
+        elif "No image column" in error_msg:
+            error_msg += " Hint: specify image_column parameter with the name of the column containing images."
+        _print({"ok": False, "error": error_msg})
 if __name__ == "__main__":

package/build/python/convert_engine.py ADDED

@@ -0,0 +1,92 @@
+"""
+Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
+Usage: convert_engine.py <input_path> <output_path>
+Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+try:
+    import polars as pl
+except Exception:
+    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
+    sys.exit(1)
+def _load(src: str) -> pl.DataFrame:
+    ext = os.path.splitext(src)[1].lower()
+    if ext == ".csv":
+        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+    if ext in (".tsv", ".tab"):
+        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
+    if ext in (".parquet", ".pq"):
+        return pl.read_parquet(src)
+    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
+        return pl.read_ipc(src)
+    if ext in (".jsonl", ".ndjson"):
+        return pl.read_ndjson(src)
+    if ext == ".json":
+        raw = open(src, "r", encoding="utf-8").read().strip()
+        if raw.startswith("["):
+            return pl.read_json(src)
+        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
+            return pl.read_ndjson(src)
+        obj = json.loads(raw)
+        if isinstance(obj, dict):
+            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
+                if key in obj and isinstance(obj[key], list):
+                    return pl.DataFrame(obj[key])
+            for v in obj.values():
+                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
+                    return pl.DataFrame(v)
+        return pl.read_json(src)
+    # Fallback: try csv
+    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+def _write(df: pl.DataFrame, dst: str) -> None:
+    ext = os.path.splitext(dst)[1].lower()
+    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
+    if ext in (".parquet", ".pq"):
+        df.write_parquet(dst)
+    elif ext == ".csv":
+        df.write_csv(dst)
+    elif ext == ".json":
+        df.write_json(dst, row_oriented=True)
+    elif ext in (".jsonl", ".ndjson"):
+        df.write_ndjson(dst)
+    else:
+        raise ValueError(f"Unsupported output format: {ext}")
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
+        sys.exit(1)
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    if not os.path.exists(input_path):
+        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
+        sys.exit(1)
+    try:
+        df = _load(input_path)
+        _write(df, output_path)
+        size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
+        print(json.dumps({
+            "ok": True,
+            "output_path": output_path,
+            "rows": df.height,
+            "columns": df.width,
+            "size_mb": size_mb,
+        }))
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": str(e)}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

package/build/python/export_engine.py CHANGED

@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
         df = pl.read_ipc(file_path)
     elif ext == ".jsonl":
         df = pl.read_ndjson(file_path)
+    elif ext == ".json":
+        # Auto-detect: array-of-objects vs NDJSON vs nested structures
+        try:
+            import json as _json
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
+                raw_text = fh.read(512)  # peek
+            stripped = raw_text.lstrip()
+            if stripped.startswith("["):
+                # Array of objects — standard JSON
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
+                    data = _json.load(fh)
+                if isinstance(data, list) and len(data) > 0:
+                    df = pl.DataFrame(data)
+                else:
+                    raise ValueError("JSON file is empty or not an array of objects")
+            elif stripped.startswith("{"):
+                # Could be NDJSON or a single object wrapping rows
+                try:
+                    df = pl.read_ndjson(file_path)
+                except Exception:
+                    with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
+                        data = _json.load(fh)
+                    # Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
+                    rows = None
+                    if isinstance(data, dict):
+                        for key in ("data", "rows", "records", "items", "results", "entries"):
+                            if key in data and isinstance(data[key], list):
+                                rows = data[key]
+                                break
+                        if rows is None:
+                            # Last resort: try to use the dict values
+                            rows = [data]
+                    if rows and len(rows) > 0:
+                        df = pl.DataFrame(rows)
+                    else:
+                        raise ValueError("Could not parse JSON structure into tabular data")
+            else:
+                raise ValueError("JSON file does not start with [ or {")
+        except pl.exceptions.ComputeError as ce:
+            raise ValueError(f"Failed to parse JSON: {ce}")
+    elif ext == ".xlsx":
+        try:
+            df = pl.read_excel(file_path)
+        except Exception as e:
+            raise ValueError(f"Failed to read Excel file: {e}")
     else:
         raise ValueError(f"Unsupported input format: {ext}")