vesper-wizard 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +99 -21
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,69 @@
1
+ import * as crypto from "crypto";
2
/**
 * Removes exact and fuzzy duplicate records during dataset fusion.
 *
 * Exact duplicates are detected via an MD5 hash of the JSON-serialized
 * record (non-cryptographic use, speed matters). Fuzzy duplicates are
 * detected by Jaccard similarity of word-token sets over the configured
 * text columns.
 */
export class Deduplicator {
    config;
    // MD5 hashes of every record seen so far (exact dedupe).
    seenHashes = new Set();
    // column -> array of JSON-serialized token sets (fuzzy dedupe).
    seenTexts = new Map();
    constructor(config) {
        this.config = config;
    }
    /**
     * Checks if a record is an exact duplicate based on all columns.
     * Side effect: registers the record's hash when it is new.
     */
    isExactDuplicate(record) {
        if (!this.config.dedupe_config.exact)
            return false;
        const hash = crypto
            .createHash("md5")
            .update(JSON.stringify(record))
            .digest("hex");
        if (this.seenHashes.has(hash)) {
            return true;
        }
        this.seenHashes.add(hash);
        return false;
    }
    /**
     * Checks if a record is a fuzzy duplicate based on configured columns.
     *
     * Fix: token sets are only committed to the store once the whole record
     * is known to be unique, so a record rejected on a later column no
     * longer pollutes the store with tokens from its earlier columns.
     */
    isFuzzyDuplicate(record) {
        if (!this.config.dedupe_config.fuzzy)
            return false;
        if (!this.config.dedupe_config.fuzzy_columns.length)
            return false;
        const pending = [];
        for (const col of this.config.dedupe_config.fuzzy_columns) {
            const text = String(record[col] ?? "");
            if (!text || text.length < 10)
                continue; // Skip short/empty values
            const tokens = this.tokenize(text);
            if (this.isSimilar(col, tokens, false)) {
                return true;
            }
            pending.push([col, tokens]);
        }
        // Record is unique: remember its tokens for future comparisons.
        // NOTE: linear scan per stored record (O(N^2) worst case); an LSH
        // index would be needed for very large datasets.
        for (const [col, tokens] of pending) {
            this.commitTokens(col, tokens);
        }
        return false;
    }
    /** Lower-cases, splits on whitespace, and drops tokens of length <= 2. */
    tokenize(text) {
        return new Set(text.toLowerCase().split(/\s+/).filter(t => t.length > 2));
    }
    /**
     * Returns true when `tokens` is Jaccard-similar (>= fuzzy_threshold)
     * to any token set previously stored for `column`.
     * @param commit when true (default, backward compatible) the tokens
     *               are stored on a miss, matching the original behavior.
     */
    isSimilar(column, tokens, commit = true) {
        const stored = this.seenTexts.get(column) ?? [];
        for (const existingSerialized of stored) {
            const existing = new Set(JSON.parse(existingSerialized));
            const intersection = new Set([...tokens].filter(t => existing.has(t)));
            const union = new Set([...tokens, ...existing]);
            // Guard against 0/0 -> NaN when both token sets are empty.
            if (union.size === 0)
                continue;
            const jaccard = intersection.size / union.size;
            if (jaccard >= this.config.dedupe_config.fuzzy_threshold) {
                return true;
            }
        }
        if (commit) {
            this.commitTokens(column, tokens);
        }
        return false;
    }
    /** Stores a token set for future fuzzy comparisons on `column`. */
    commitTokens(column, tokens) {
        if (!this.seenTexts.has(column)) {
            this.seenTexts.set(column, []);
        }
        this.seenTexts.get(column).push(JSON.stringify([...tokens]));
    }
}
@@ -0,0 +1,69 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
/**
 * Bridges to the Python fusion engine (fusion_engine.py) via a child process.
 *
 * The script is located by probing, in priority order:
 *   1. ~/.vesper/python/fusion_engine.py   (installed data root)
 *   2. <buildDir>/python/fusion_engine.py
 *   3. <buildDir>/../src/python/fusion_engine.py
 *   4. <buildDir>/../python/fusion_engine.py
 * falling back to (1) when none exists on disk.
 */
export class DataFusionEngine {
    pythonPath = "python";
    scriptPath;
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "fusion_engine.py"),
            path.resolve(buildDir, "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "python", "fusion_engine.py"),
        ];
        this.scriptPath = candidates.find(p => fs.existsSync(p)) ?? candidates[0];
        if (process.platform === "win32") {
            // Windows installs commonly expose the interpreter via the `py` launcher.
            this.pythonPath = "py";
        }
    }
    /**
     * Fuses several datasets by delegating to the Python engine.
     * @param {string[]} sourcePaths - at least two dataset paths
     * @param {string} outputPath - where the fused dataset is written
     * @param {object} [options] - passed through to the Python script as JSON
     * @returns {Promise<object>} parsed JSON result from the script
     * @throws rejects when fewer than 2 sources are given, the interpreter
     *         cannot be spawned, the script exits non-zero, or its output
     *         is not valid JSON / reports an error.
     */
    async fuse(sourcePaths, outputPath, options = {}) {
        return new Promise((resolve, reject) => {
            if (!Array.isArray(sourcePaths) || sourcePaths.length < 2) {
                reject(new Error("At least 2 source paths are required for fusion"));
                return;
            }
            const args = [
                this.scriptPath,
                JSON.stringify(sourcePaths),
                outputPath,
                JSON.stringify(options),
            ];
            const processRef = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            processRef.stdout.on("data", (data) => (stdout += data.toString()));
            processRef.stderr.on("data", (data) => (stderr += data.toString()));
            // Fix: without this handler a missing/unlaunchable interpreter
            // raised an unhandled 'error' event and the promise never settled.
            processRef.on("error", (err) => {
                reject(new Error(`Failed to start fusion engine: ${err.message}`, { cause: err }));
            });
            processRef.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Fusion failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                        return;
                    }
                    resolve(result);
                }
                catch (e) {
                    reject(new Error(`Failed to parse fusion output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
@@ -0,0 +1,39 @@
1
/**
 * Maps raw label values from heterogeneous datasets onto a unified scheme.
 */
export class LabelHarmonizer {
    config;
    constructor(config) {
        this.config = config;
    }
    /**
     * Maps a raw label value to a unified canonical value.
     *
     * 1. Applies the explicit `label_map` — own keys only. Fix: the
     *    original used the `in` operator, which also matched inherited
     *    keys (e.g. a label of "toString" would map to
     *    Object.prototype.toString).
     * 2. Optionally collapses multi-class labels to binary via
     *    `multi_to_binary` (case-insensitive membership in positive_classes).
     */
    harmonize(label) {
        let harmonized = label;
        if (this.config.label_map && Object.hasOwn(this.config.label_map, label)) {
            harmonized = this.config.label_map[label];
        }
        if (this.config.multi_to_binary) {
            const { positive_classes, positive_label, negative_label } = this.config.multi_to_binary;
            const isPositive = positive_classes.some((pc) => String(pc).toLowerCase() === String(harmonized).toLowerCase());
            return isPositive ? positive_label : negative_label;
        }
        return harmonized;
    }
    /**
     * Flags extreme minority classes (< 5% of total) in a label distribution.
     * @param {Record<string, number>} distribution - label -> count
     * @returns {string[]} human-readable warnings (empty when total is 0)
     */
    checkBalance(distribution) {
        const warnings = [];
        const total = Object.values(distribution).reduce((a, b) => a + b, 0);
        if (total === 0)
            return warnings;
        for (const [label, count] of Object.entries(distribution)) {
            const pct = count / total;
            if (pct < 0.05) {
                warnings.push(`Extreme minority class detected: "${label}" (${(pct * 100).toFixed(1)}%)`);
            }
        }
        return warnings;
    }
}
@@ -0,0 +1,86 @@
1
+ import { SchemaAligner } from "./aligner.js";
2
+ import { LabelHarmonizer } from "./harmonizer.js";
3
+ import { Deduplicator } from "./deduplicator.js";
4
/**
 * Coordinates the full fusion pipeline: schema alignment, label
 * harmonization, and (exact + fuzzy) deduplication.
 */
export class FusionOrchestrator {
    config;
    aligner;
    harmonizer;
    deduplicator;
    constructor(config) {
        this.config = config;
        this.aligner = new SchemaAligner(config);
        this.harmonizer = new LabelHarmonizer(config);
        this.deduplicator = new Deduplicator(config);
    }
    /**
     * Fuses multiple datasets into one.
     * @param datasets Metadata of datasets to fuse
     * @param dataRecords Actual records from all datasets (flattened), as
     *        { datasetId, record } pairs
     * @returns { success, output_path, stats, warnings }
     */
    async fuse(datasets, dataRecords) {
        const stats = {
            total_input_rows: dataRecords.length,
            total_output_rows: 0,
            duplicates_removed: 0,
            fuzzy_duplicates_removed: 0,
            schema_overlaps: [],
            label_distribution: {}
        };
        const warnings = [];
        const alignmentMap = this.aligner.align(datasets);
        // Collect every canonical column produced by the alignment.
        const allCanonicalCols = new Set();
        for (const dsId in alignmentMap) {
            for (const canonical of Object.values(alignmentMap[dsId])) {
                allCanonicalCols.add(canonical);
            }
        }
        stats.schema_overlaps = Array.from(allCanonicalCols);
        const fusedData = [];
        for (const { datasetId, record } of dataRecords) {
            const colMap = alignmentMap[datasetId];
            if (!colMap) {
                warnings.push(`No alignment found for dataset ${datasetId}`);
                continue;
            }
            // 1. Rename source columns to their canonical names; columns
            //    without a mapping are dropped.
            const alignedRecord = {};
            for (const [sourceCol, val] of Object.entries(record)) {
                const canonical = colMap[sourceCol];
                if (canonical) {
                    alignedRecord[canonical] = val;
                }
            }
            // 2. Harmonize the target label, when present.
            if (this.config.target_column && alignedRecord[this.config.target_column] !== undefined) {
                alignedRecord[this.config.target_column] = this.harmonizer.harmonize(alignedRecord[this.config.target_column]);
            }
            // 3. Drop exact, then fuzzy, duplicates.
            if (this.deduplicator.isExactDuplicate(alignedRecord)) {
                stats.duplicates_removed++;
                continue;
            }
            if (this.deduplicator.isFuzzyDuplicate(alignedRecord)) {
                stats.fuzzy_duplicates_removed++;
                continue;
            }
            // Update distribution ONLY for kept records.
            if (this.config.target_column && alignedRecord[this.config.target_column] !== undefined) {
                const labelStr = String(alignedRecord[this.config.target_column]);
                stats.label_distribution[labelStr] = (stats.label_distribution[labelStr] || 0) + 1;
            }
            fusedData.push(alignedRecord);
        }
        // Fix: removed a leftover no-op (`fusedData.push()` whose return value
        // was assigned and immediately overwritten on the next line).
        stats.total_output_rows = fusedData.length;
        // Add class-balance warnings derived from the kept-record distribution.
        warnings.push(...this.harmonizer.checkBalance(stats.label_distribution));
        return {
            success: true,
            output_path: "fused_dataset.json", // Placeholder
            stats,
            warnings
        };
    }
}
@@ -0,0 +1 @@
1
// Compiled marker for a type-declaration-only module: the empty export
// makes this file an ES module while exporting nothing at runtime.
export {};
@@ -0,0 +1,409 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import http from "http";
4
+ import https from "https";
5
+ import { HuggingFaceScraper } from "../metadata/scraper.js";
6
+ export class UnifiedDatasetGateway {
7
+ deps;
8
+ constructor(deps) {
9
+ this.deps = deps;
10
+ }
11
/**
 * Reports availability and auth posture for every provider the unified
 * gateway knows about.
 *
 * Availability is derived purely from environment/dependency state:
 *  - Hugging Face / OpenML: always available (public access).
 *  - Kaggle: available only with server-managed credentials (dataIngestor).
 *  - data.world: available only with a server-managed token.
 *  - S3: always available, keyless public-object download/info only.
 *  - BigQuery: scaffolded; available only when GOOGLE_* env vars are set.
 *
 * @param includeUnavailable when false, unavailable providers are filtered out.
 * @returns array of provider status descriptors.
 */
getProviderStatuses(includeUnavailable = true) {
    const hasHfToken = !!(process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN);
    const hasKaggle = this.deps.dataIngestor.hasKaggleCredentials();
    const hasDataWorld = this.deps.hasDataWorldToken();
    const hasBigQuery = !!(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GOOGLE_CLOUD_PROJECT);
    const providers = [
        {
            source: "huggingface",
            display_name: "Hugging Face",
            available: true,
            auth_mode: hasHfToken ? "public-or-server-managed" : "public",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasHfToken
                ? ["Public datasets are open by default. Gated/private repos can be accessed via the server-managed HF token when configured."]
                : ["Public datasets work without any user key. Gated/private repos need an operator or user token."],
        },
        {
            source: "openml",
            display_name: "OpenML",
            available: true,
            auth_mode: "public",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: ["OpenML is exposed as a keyless public provider through the gateway."],
        },
        {
            source: "kaggle",
            display_name: "Kaggle",
            available: hasKaggle,
            auth_mode: hasKaggle ? "server-managed" : "not-configured",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasKaggle
                ? ["Kaggle is available through server-managed credentials. End users do not need to pass their own key."]
                : ["Kaggle support exists, but no server-managed credentials are configured yet."],
        },
        {
            source: "dataworld",
            display_name: "data.world",
            available: hasDataWorld,
            auth_mode: hasDataWorld ? "server-managed" : "not-configured",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasDataWorld
                ? ["data.world is available through server-managed credentials."]
                : ["data.world support exists, but no server-managed token is configured yet."],
        },
        {
            source: "s3",
            display_name: "Amazon S3",
            available: true,
            auth_mode: "public-or-server-managed",
            supported_operations: ["download", "info"],
            requires_end_user_key: false,
            notes: ["Supports keyless download of public S3 objects via s3://bucket/key or HTTPS S3 URLs.", "Bucket listing and search are intentionally not exposed."],
        },
        {
            source: "bigquery",
            display_name: "BigQuery",
            available: hasBigQuery,
            auth_mode: hasBigQuery ? "server-managed" : "not-configured",
            supported_operations: ["info"],
            requires_end_user_key: false,
            notes: hasBigQuery
                ? ["BigQuery is reserved for operator-managed connectors. Query execution is not implemented in this patch."]
                : ["BigQuery is scaffolded in the gateway contract, but no server-managed GCP configuration is present."],
        },
    ];
    return includeUnavailable ? providers : providers.filter(provider => provider.available);
}
82
/**
 * Searches for datasets across one or all configured providers.
 *
 * Each result is persisted to the metadata store on a best-effort basis,
 * then results are de-duplicated by `source:id`, ranked via rankDataset,
 * and truncated to `limit`. Individual provider failures are recorded in
 * `notes` instead of being thrown.
 *
 * @param options { query, source?="auto", limit?=10, publicOnly?=true }
 * @returns { query, requested_source, providers_tried, notes, results }
 * @throws when `query` is empty
 */
async discover(options) {
    const query = String(options.query || "").trim();
    const requestedSource = options.source || "auto";
    const limit = Math.max(1, Number(options.limit || 10));
    const publicOnly = options.publicOnly !== false;
    if (!query) {
        throw new Error("query is required");
    }
    const notes = [];
    const providers = this.resolveDiscoverSources(requestedSource, publicOnly, notes);
    // Over-fetch per provider (2x the even share, min 5) so the merged,
    // deduped ranking still has enough candidates to fill `limit`.
    const perSourceLimit = Math.max(5, Math.ceil(limit / Math.max(providers.length, 1)) * 2);
    const allResults = [];
    for (const provider of providers) {
        try {
            const partial = await this.discoverFromSource(provider, query, perSourceLimit);
            for (const dataset of partial) {
                try {
                    this.deps.metadataStore.saveDataset(dataset);
                }
                catch {
                    // best-effort metadata persistence
                }
                allResults.push(dataset);
            }
        }
        catch (error) {
            // A failing provider degrades to a note; other providers still run.
            notes.push(`${provider}: ${(error?.message || error || "Unknown provider error").toString()}`);
        }
    }
    // De-dupe by full `source:id` key — the same id can exist on two providers.
    const deduped = new Map();
    for (const dataset of allResults) {
        deduped.set(`${dataset.source}:${dataset.id}`, dataset);
    }
    const results = Array.from(deduped.values())
        .sort((a, b) => this.rankDataset(b) - this.rankDataset(a))
        .slice(0, limit);
    return {
        query,
        requested_source: requestedSource,
        providers_tried: providers,
        notes,
        results,
    };
}
126
/**
 * Downloads a dataset through the gateway, resolving the provider in order:
 *  1. explicit/prefixed source embedded in the reference,
 *  2. a match in the local metadata store,
 *  3. a fresh auto-discovery (first result wins).
 * S3 objects use the keyless download path; BigQuery download is rejected.
 *
 * @param options { datasetId, source?="auto", targetDir? }
 * @returns { dataset_id, resolved_source, local_path, copied_to?, notes }
 * @throws when the reference is empty, the provider cannot be resolved, or
 *         required server-managed credentials are missing.
 */
async download(options) {
    const requested = String(options.datasetId || "").trim();
    if (!requested) {
        throw new Error("dataset_id is required");
    }
    const notes = [];
    const resolved = this.resolveDatasetReference(requested, options.source || "auto");
    if (resolved.source === "bigquery") {
        throw new Error("BigQuery gateway support is scaffolded for operator-managed connectors, but query/download execution is not implemented yet.");
    }
    // S3 bypasses the ingestor entirely: public objects are fetched directly.
    if (resolved.source === "s3") {
        const localPath = await this.downloadPublicS3Object(resolved.datasetId, options.targetDir);
        return {
            dataset_id: requested,
            resolved_source: "s3",
            local_path: localPath,
            notes: ["Downloaded via the keyless S3 gateway path."],
        };
    }
    let source = this.toIngestSource(resolved.source);
    let datasetId = resolved.datasetId;
    // Fallback 1: a previously scraped dataset in the metadata store.
    if (!source) {
        const metadataMatch = this.lookupKnownDataset(requested);
        const metadataSource = this.toIngestSource(metadataMatch?.source);
        if (metadataMatch && metadataSource) {
            source = metadataSource;
            datasetId = metadataMatch.id;
        }
    }
    // Fallback 2: auto-discover across providers and take the top hit.
    if (!source) {
        const discovery = await this.discover({ query: requested, source: "auto", limit: 1, publicOnly: false });
        if (discovery.results.length === 0) {
            throw new Error(`Unable to resolve provider for '${requested}'. Run unified_dataset_api with operation='discover' first or pass an explicit source.`);
        }
        const discoveredSource = this.toIngestSource(discovery.results[0].source);
        if (!discoveredSource) {
            throw new Error(`Resolved provider '${discovery.results[0].source}' cannot be downloaded through the dataset ingestor.`);
        }
        source = discoveredSource;
        datasetId = discovery.results[0].id;
        notes.push(`Auto-resolved provider to ${source}.`);
    }
    // Credentialed providers require server-managed secrets to be configured.
    if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
        throw new Error("Kaggle is configured as a gateway source, but no server-managed credentials are available.");
    }
    if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
        throw new Error("data.world is configured as a gateway source, but no server-managed token is available.");
    }
    const localPath = await this.deps.dataIngestor.ensureData(datasetId, source, () => undefined);
    let copiedTo;
    if (options.targetDir) {
        copiedTo = this.copyDownloadOutput(localPath, options.targetDir);
        notes.push(`Copied dataset output to ${copiedTo}.`);
    }
    return {
        dataset_id: datasetId,
        resolved_source: source,
        local_path: localPath,
        copied_to: copiedTo,
        notes,
    };
}
188
/**
 * Returns metadata for a dataset reference without downloading it.
 *
 * Resolution order: local metadata store (raw and prefix-stripped id),
 * then synthetic info for S3/BigQuery references, then a small discovery
 * query — an exact id match is preferred, otherwise the first result.
 *
 * @param options { datasetId, source?="auto", publicOnly? }
 * @returns { dataset_id, resolved_source?, notes, dataset? }
 * @throws when the reference is empty
 */
async info(options) {
    const requested = String(options.datasetId || "").trim();
    if (!requested) {
        throw new Error("dataset_id is required");
    }
    const resolved = this.resolveDatasetReference(requested, options.source || "auto");
    // Try the local store first, with both the raw and the resolved id.
    const metadataMatch = this.lookupKnownDataset(requested) || (resolved.datasetId !== requested ? this.lookupKnownDataset(resolved.datasetId) : undefined);
    if (metadataMatch) {
        return {
            dataset_id: requested,
            resolved_source: metadataMatch.source,
            notes: [],
            dataset: metadataMatch,
        };
    }
    // S3 info is synthesized from the URI — there is no catalog to query.
    if (resolved.source === "s3") {
        return {
            dataset_id: requested,
            resolved_source: "s3",
            notes: ["S3 info is derived from the object URI. Discovery/listing is intentionally not supported."],
            dataset: {
                id: requested,
                source: "s3",
                uri: this.toS3HttpsUrl(resolved.datasetId),
            },
        };
    }
    if (resolved.source === "bigquery") {
        return {
            dataset_id: requested,
            resolved_source: "bigquery",
            notes: ["BigQuery is reserved for operator-managed connectors. Detailed inspection is not implemented in this patch."],
        };
    }
    // Last resort: a small discovery query against the resolved provider.
    const discovery = await this.discover({
        query: resolved.datasetId,
        source: resolved.source || "auto",
        limit: 5,
        publicOnly: options.publicOnly !== false,
    });
    const exact = discovery.results.find(dataset => this.matchesDatasetReference(dataset, requested));
    return {
        dataset_id: requested,
        resolved_source: exact?.source,
        notes: discovery.notes,
        dataset: exact || discovery.results[0],
    };
}
236
/**
 * Runs a discovery query against a single provider.
 * S3 and BigQuery deliberately reject discovery through the gateway.
 *
 * @param source provider key (huggingface | openml | kaggle | dataworld | ...)
 * @param query free-text search string
 * @param limit maximum results requested from the provider
 * @returns array of dataset metadata records
 * @throws for non-searchable or unknown providers
 */
async discoverFromSource(source, query, limit) {
    switch (source) {
        case "huggingface":
            return await new HuggingFaceScraper().scrape(limit, true, query);
        case "openml":
            return await this.deps.openmlSource.discover(query, limit);
        case "kaggle":
            return await this.deps.kaggleSource.discover(query, limit);
        case "dataworld":
            return await this.deps.dataworldSource.discover(query, limit);
        case "s3":
            throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
        case "bigquery":
            throw new Error("BigQuery discovery is not implemented in the unified gateway.");
        default:
            throw new Error(`Unsupported provider: ${source}`);
    }
}
254
/**
 * Decides which providers a discover call should query.
 *
 * An explicit source is validated (credentials present, discover supported)
 * and used alone. In auto mode, the keyless providers (Hugging Face,
 * OpenML) are always queried; Kaggle and data.world are added only when
 * publicOnly is false AND server-managed credentials exist, with a note
 * appended to `notes` when they are skipped for missing credentials.
 *
 * @param source explicit provider key or "auto"
 * @param publicOnly when true, restrict auto mode to keyless providers
 * @param notes mutated in place with skip explanations
 * @returns ordered list of provider keys to query
 * @throws for an explicit source that is unavailable or non-searchable
 */
resolveDiscoverSources(source, publicOnly, notes) {
    if (source !== "auto") {
        if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
            throw new Error("Kaggle requires server-managed credentials and none are configured.");
        }
        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
            throw new Error("data.world requires a server-managed token and none is configured.");
        }
        if (source === "s3" || source === "bigquery") {
            throw new Error(`${source} does not currently support discover operation through the gateway.`);
        }
        return [source];
    }
    const providers = ["huggingface", "openml"];
    if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
        providers.push("kaggle");
    }
    else if (!publicOnly) {
        notes.push("Kaggle skipped because no server-managed credentials are configured.");
    }
    if (!publicOnly && this.deps.hasDataWorldToken()) {
        providers.push("dataworld");
    }
    else if (!publicOnly) {
        notes.push("data.world skipped because no server-managed token is configured.");
    }
    return providers;
}
282
+ resolveDatasetReference(datasetId, source) {
283
+ const trimmed = datasetId.trim();
284
+ if (source !== "auto") {
285
+ if (source === "s3") {
286
+ return { source, datasetId: trimmed };
287
+ }
288
+ return { source, datasetId: this.stripSourcePrefix(trimmed, source) };
289
+ }
290
+ if (/^s3:\/\//i.test(trimmed) || /^https?:\/\/[^\s]+\.s3[.-][^\s]+/i.test(trimmed) || /^https?:\/\/s3\.[^\s]+amazonaws\.com\//i.test(trimmed)) {
291
+ return { source: "s3", datasetId: trimmed };
292
+ }
293
+ if (/^kaggle:/i.test(trimmed))
294
+ return { source: "kaggle", datasetId: trimmed.replace(/^kaggle:/i, "") };
295
+ if (/^(huggingface|hf):/i.test(trimmed))
296
+ return { source: "huggingface", datasetId: trimmed.replace(/^(huggingface|hf):/i, "") };
297
+ if (/^openml:/i.test(trimmed))
298
+ return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
299
+ if (/^dataworld:/i.test(trimmed))
300
+ return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
301
+ if (/^bigquery:/i.test(trimmed))
302
+ return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
303
+ if (/^\d+$/.test(trimmed))
304
+ return { source: "openml", datasetId: trimmed };
305
+ if (trimmed.includes("/") && !trimmed.includes(":"))
306
+ return { source: "huggingface", datasetId: trimmed };
307
+ return { datasetId: trimmed };
308
+ }
309
+ stripSourcePrefix(datasetId, source) {
310
+ if (source === "huggingface") {
311
+ return datasetId.replace(/^(huggingface|hf):/i, "");
312
+ }
313
+ return datasetId.replace(new RegExp(`^${source}:`, "i"), "");
314
+ }
315
+ lookupKnownDataset(datasetId) {
316
+ const candidates = new Set([
317
+ datasetId,
318
+ datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
319
+ ]);
320
+ for (const candidate of candidates) {
321
+ const dataset = this.deps.metadataStore.getDataset(candidate);
322
+ if (dataset)
323
+ return dataset;
324
+ }
325
+ return undefined;
326
+ }
327
+ matchesDatasetReference(dataset, requested) {
328
+ const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
329
+ const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
330
+ return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
331
+ }
332
+ toIngestSource(source) {
333
+ if (source === "huggingface" || source === "openml" || source === "kaggle" || source === "dataworld") {
334
+ return source;
335
+ }
336
+ return undefined;
337
+ }
338
+ rankDataset(dataset) {
339
+ const relevance = Number(dataset.relevance_score || 0) * 1000;
340
+ const quality = Number(dataset.quality_score || 0) * 100;
341
+ const downloads = Number(dataset.downloads || 0);
342
+ return relevance + quality + downloads;
343
+ }
344
+ copyDownloadOutput(localPath, targetDir) {
345
+ const resolvedTargetDir = path.resolve(targetDir);
346
+ fs.mkdirSync(resolvedTargetDir, { recursive: true });
347
+ const destination = path.join(resolvedTargetDir, path.basename(localPath));
348
+ fs.cpSync(localPath, destination, { recursive: true, force: true });
349
+ return destination;
350
+ }
351
+ async downloadPublicS3Object(datasetId, targetDir) {
352
+ const httpsUrl = this.toS3HttpsUrl(datasetId);
353
+ const parsed = new URL(httpsUrl);
354
+ const fileName = path.basename(parsed.pathname) || "s3-object.bin";
355
+ const outputDir = path.resolve(targetDir || path.join(this.deps.dataRoot, "data", "raw"));
356
+ fs.mkdirSync(outputDir, { recursive: true });
357
+ const outputPath = path.join(outputDir, fileName);
358
+ await this.downloadToFile(httpsUrl, outputPath);
359
+ this.deps.metadataStore.registerDownload(datasetId, outputPath, "completed", fs.statSync(outputPath).size);
360
+ return outputPath;
361
+ }
362
+ toS3HttpsUrl(datasetId) {
363
+ if (/^https?:\/\//i.test(datasetId)) {
364
+ return datasetId;
365
+ }
366
+ const match = datasetId.match(/^s3:\/\/([^/]+)\/(.+)$/i);
367
+ if (!match) {
368
+ throw new Error("S3 source expects an s3://bucket/key object reference or a direct HTTPS S3 URL.");
369
+ }
370
+ const bucket = match[1];
371
+ const objectKey = match[2].split("/").map(encodeURIComponent).join("/");
372
+ return `https://${bucket}.s3.amazonaws.com/${objectKey}`;
373
+ }
374
+ async downloadToFile(url, destination) {
375
+ await new Promise((resolve, reject) => {
376
+ const transport = url.startsWith("https:") ? https : http;
377
+ const request = transport.get(url, response => {
378
+ const statusCode = response.statusCode || 0;
379
+ const location = response.headers.location;
380
+ if (statusCode >= 300 && statusCode < 400 && location) {
381
+ response.resume();
382
+ this.downloadToFile(location, destination).then(resolve).catch(reject);
383
+ return;
384
+ }
385
+ if (statusCode < 200 || statusCode >= 300) {
386
+ response.resume();
387
+ reject(new Error(`Download failed with status ${statusCode}`));
388
+ return;
389
+ }
390
+ const file = fs.createWriteStream(destination);
391
+ response.pipe(file);
392
+ file.on("finish", () => {
393
+ file.close();
394
+ resolve();
395
+ });
396
+ file.on("error", error => {
397
+ try {
398
+ file.close();
399
+ }
400
+ catch {
401
+ // no-op
402
+ }
403
+ reject(error);
404
+ });
405
+ });
406
+ request.on("error", reject);
407
+ });
408
+ }
409
+ }