npm - vesper-wizard - Versions diffs - 2.3.1 → 2.3.3 - Mend

vesper-wizard 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (214) hide show

package/README.md +37 -322
package/package.json +34 -100
package/vesper-mcp-config.json +6 -0
package/{scripts/wizard.js → wizard.js} +1 -1
package/LICENSE +0 -21
package/build/cache/cdn.js +0 -34
package/build/cache/service.js +0 -63
package/build/cleaning/cleaner.js +0 -81
package/build/cleaning/evaluator.js +0 -89
package/build/cleaning/executor.js +0 -62
package/build/cleaning/exporter.js +0 -87
package/build/cleaning/planner.js +0 -127
package/build/cleaning/rules.js +0 -57
package/build/cleaning/types.js +0 -1
package/build/cloud/adapters/local.js +0 -37
package/build/cloud/adapters/s3.js +0 -24
package/build/cloud/adapters/supabase.js +0 -49
package/build/cloud/storage-manager.js +0 -26
package/build/cloud/types.js +0 -1
package/build/compliance/service.js +0 -73
package/build/compliance/store.js +0 -80
package/build/compliance/types.js +0 -1
package/build/config/config-manager.js +0 -221
package/build/config/secure-keys.js +0 -51
package/build/config/user-config.js +0 -48
package/build/data/processing-worker.js +0 -23
package/build/data/streaming.js +0 -38
package/build/data/worker-pool.js +0 -39
package/build/export/exporter.js +0 -82
package/build/export/packager.js +0 -100
package/build/export/types.js +0 -1
package/build/fusion/aligner.js +0 -56
package/build/fusion/deduplicator.js +0 -69
package/build/fusion/engine.js +0 -69
package/build/fusion/harmonizer.js +0 -39
package/build/fusion/orchestrator.js +0 -86
package/build/fusion/types.js +0 -1
package/build/gateway/unified-dataset-gateway.js +0 -410
package/build/index.js +0 -3068
package/build/ingestion/hf-downloader.js +0 -171
package/build/ingestion/ingestor.js +0 -271
package/build/ingestion/kaggle-downloader.js +0 -102
package/build/install/install-service.js +0 -46
package/build/jobs/manager.js +0 -136
package/build/jobs/queue.js +0 -59
package/build/jobs/types.js +0 -1
package/build/lib/supabase.js +0 -3
package/build/metadata/dataworld-source.js +0 -89
package/build/metadata/domain.js +0 -147
package/build/metadata/github-scraper.js +0 -47
package/build/metadata/institutional-scrapers.js +0 -49
package/build/metadata/kaggle-scraper.js +0 -182
package/build/metadata/kaggle-source.js +0 -70
package/build/metadata/license.js +0 -68
package/build/metadata/monitoring-service.js +0 -107
package/build/metadata/monitoring-store.js +0 -78
package/build/metadata/monitoring-types.js +0 -1
package/build/metadata/openml-source.js +0 -87
package/build/metadata/quality.js +0 -48
package/build/metadata/rate-limiter.js +0 -128
package/build/metadata/scraper.js +0 -448
package/build/metadata/store.js +0 -340
package/build/metadata/types.js +0 -1
package/build/metadata/uci-scraper.js +0 -49
package/build/monitoring/observability.js +0 -76
package/build/preparation/target-detector.js +0 -75
package/build/python/__pycache__/config.cpython-312.pyc +0 -0
package/build/python/asset_downloader_engine.py +0 -94
package/build/python/cleaner.py +0 -226
package/build/python/config.py +0 -263
package/build/python/convert_engine.py +0 -92
package/build/python/dataworld_engine.py +0 -208
package/build/python/export_engine.py +0 -288
package/build/python/framework_adapters.py +0 -100
package/build/python/fusion_engine.py +0 -368
package/build/python/github_adapter.py +0 -106
package/build/python/hf_fallback.py +0 -298
package/build/python/image_engine.py +0 -86
package/build/python/kaggle_engine.py +0 -295
package/build/python/media_engine.py +0 -133
package/build/python/nasa_adapter.py +0 -82
package/build/python/normalize_engine.py +0 -83
package/build/python/openml_engine.py +0 -146
package/build/python/quality_engine.py +0 -267
package/build/python/row_count.py +0 -54
package/build/python/splitter_engine.py +0 -283
package/build/python/target_engine.py +0 -154
package/build/python/test_framework_adapters.py +0 -61
package/build/python/test_fusion_engine.py +0 -89
package/build/python/uci_adapter.py +0 -94
package/build/python/vesper/__init__.py +0 -1
package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
package/build/python/vesper/core/__init__.py +0 -1
package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
package/build/python/vesper/core/asset_downloader.py +0 -679
package/build/python/vesper/core/download_recipe.py +0 -104
package/build/python/worldbank_adapter.py +0 -99
package/build/quality/analyzer.js +0 -93
package/build/quality/image-analyzer.js +0 -114
package/build/quality/media-analyzer.js +0 -115
package/build/quality/quality-orchestrator.js +0 -162
package/build/quality/types.js +0 -1
package/build/scripts/build-index.js +0 -54
package/build/scripts/check-db.js +0 -73
package/build/scripts/check-jobs.js +0 -24
package/build/scripts/check-naruto.js +0 -17
package/build/scripts/cleanup-kaggle.js +0 -41
package/build/scripts/demo-full-pipeline.js +0 -62
package/build/scripts/demo-ui.js +0 -58
package/build/scripts/e2e-demo.js +0 -72
package/build/scripts/massive-scrape.js +0 -103
package/build/scripts/ops-dashboard.js +0 -33
package/build/scripts/repro-bug.js +0 -37
package/build/scripts/repro-export-bug.js +0 -56
package/build/scripts/scrape-metadata.js +0 -100
package/build/scripts/search-cli.js +0 -26
package/build/scripts/test-bias.js +0 -45
package/build/scripts/test-caching.js +0 -51
package/build/scripts/test-cleaning.js +0 -76
package/build/scripts/test-cloud-storage.js +0 -48
package/build/scripts/test-compliance.js +0 -58
package/build/scripts/test-conversion.js +0 -64
package/build/scripts/test-custom-rules.js +0 -58
package/build/scripts/test-db-opt.js +0 -63
package/build/scripts/test-export-custom.js +0 -33
package/build/scripts/test-exporter.js +0 -53
package/build/scripts/test-fusion.js +0 -61
package/build/scripts/test-github.js +0 -27
package/build/scripts/test-group-split.js +0 -52
package/build/scripts/test-hf-download.js +0 -29
package/build/scripts/test-holdout-manager.js +0 -61
package/build/scripts/test-hybrid-search.js +0 -41
package/build/scripts/test-image-analysis.js +0 -50
package/build/scripts/test-ingestion-infra.js +0 -39
package/build/scripts/test-install.js +0 -40
package/build/scripts/test-institutional.js +0 -26
package/build/scripts/test-integrity.js +0 -41
package/build/scripts/test-jit.js +0 -42
package/build/scripts/test-job-queue.js +0 -62
package/build/scripts/test-kaggle-download.js +0 -34
package/build/scripts/test-large-data.js +0 -50
package/build/scripts/test-mcp-v5.js +0 -74
package/build/scripts/test-media-analysis.js +0 -61
package/build/scripts/test-monitoring.js +0 -91
package/build/scripts/test-observability.js +0 -106
package/build/scripts/test-packager.js +0 -55
package/build/scripts/test-pipeline.js +0 -50
package/build/scripts/test-planning.js +0 -64
package/build/scripts/test-privacy.js +0 -38
package/build/scripts/test-production-sync.js +0 -36
package/build/scripts/test-quality.js +0 -43
package/build/scripts/test-robust-ingestion.js +0 -41
package/build/scripts/test-schema.js +0 -45
package/build/scripts/test-split-validation.js +0 -40
package/build/scripts/test-splitter.js +0 -93
package/build/scripts/test-target-detector.js +0 -29
package/build/scripts/test-uci.js +0 -27
package/build/scripts/test-unified-quality.js +0 -86
package/build/scripts/test-write.js +0 -14
package/build/scripts/verify-integration.js +0 -57
package/build/scripts/verify-priority.js +0 -33
package/build/search/embedder.js +0 -34
package/build/search/engine.js +0 -190
package/build/search/jit-orchestrator.js +0 -262
package/build/search/query-intent.js +0 -509
package/build/search/vector-store.js +0 -123
package/build/splitting/splitter.js +0 -82
package/build/splitting/types.js +0 -1
package/build/tools/formatter.js +0 -251
package/build/utils/downloader.js +0 -52
package/build/utils/python-runtime.js +0 -130
package/build/utils/selector.js +0 -69
package/mcp-config-template.json +0 -18
package/scripts/postinstall.cjs +0 -170
package/scripts/preindex_registry.cjs +0 -157
package/scripts/refresh-index.cjs +0 -87
package/scripts/wizard.cjs +0 -601
package/src/python/__pycache__/config.cpython-312.pyc +0 -0
package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
package/src/python/asset_downloader_engine.py +0 -94
package/src/python/cleaner.py +0 -226
package/src/python/config.py +0 -263
package/src/python/convert_engine.py +0 -92
package/src/python/dataworld_engine.py +0 -208
package/src/python/export_engine.py +0 -288
package/src/python/framework_adapters.py +0 -100
package/src/python/fusion_engine.py +0 -368
package/src/python/github_adapter.py +0 -106
package/src/python/hf_fallback.py +0 -298
package/src/python/image_engine.py +0 -86
package/src/python/kaggle_engine.py +0 -295
package/src/python/media_engine.py +0 -133
package/src/python/nasa_adapter.py +0 -82
package/src/python/normalize_engine.py +0 -83
package/src/python/openml_engine.py +0 -146
package/src/python/quality_engine.py +0 -267
package/src/python/requirements.txt +0 -12
package/src/python/row_count.py +0 -54
package/src/python/splitter_engine.py +0 -283
package/src/python/target_engine.py +0 -154
package/src/python/test_framework_adapters.py +0 -61
package/src/python/test_fusion_engine.py +0 -89
package/src/python/uci_adapter.py +0 -94
package/src/python/vesper/__init__.py +0 -1
package/src/python/vesper/core/__init__.py +0 -1
package/src/python/vesper/core/asset_downloader.py +0 -679
package/src/python/vesper/core/download_recipe.py +0 -104
package/src/python/worldbank_adapter.py +0 -99
package/wizard.cjs +0 -3

package/build/gateway/unified-dataset-gateway.js DELETED Viewed

@@ -1,410 +0,0 @@
-import fs from "fs";
-import path from "path";
-import http from "http";
-import https from "https";
-import { HuggingFaceScraper } from "../metadata/scraper.js";
-import { analyzeDatasetQuery } from "../search/query-intent.js";
-export class UnifiedDatasetGateway {
-    deps;
-    /**
-     * Stores the injected collaborators on the instance.
-     *
-     * @param {object} deps - Dependency bag. Methods of this class read
-     *   `dataIngestor`, `metadataStore`, `openmlSource`, `kaggleSource`,
-     *   `dataworldSource`, `hasDataWorldToken` and `dataRoot` from it.
-     */
-    constructor(deps) {
-        this.deps = deps;
-    }
-    /**
-     * Reports the providers known to the gateway and how each one is authenticated.
-     *
-     * Kaggle and data.world availability comes from the injected dependencies;
-     * the Hugging Face and BigQuery flags are derived from environment variables.
-     *
-     * @param {boolean} [includeUnavailable=true] - When false, entries whose
-     *   `available` flag is falsy are left out.
-     * @returns {object[]} One descriptor per provider, in a fixed order.
-     */
-    getProviderStatuses(includeUnavailable = true) {
-        const env = process.env;
-        const hfTokenPresent = Boolean(env.HF_TOKEN || env.HUGGINGFACE_TOKEN);
-        const kaggleReady = this.deps.dataIngestor.hasKaggleCredentials();
-        const dataWorldReady = this.deps.hasDataWorldToken();
-        const bigQueryReady = Boolean(env.GOOGLE_APPLICATION_CREDENTIALS || env.GOOGLE_CLOUD_PROJECT);
-        // Builds one descriptor; keys are always emitted in this order.
-        const describe = (source, displayName, available, authMode, operations, notes) => ({
-            source,
-            display_name: displayName,
-            available,
-            auth_mode: authMode,
-            supported_operations: operations,
-            requires_end_user_key: false,
-            notes,
-        });
-        const managedOrMissing = ready => (ready ? "server-managed" : "not-configured");
-        const catalogue = [
-            describe("huggingface", "Hugging Face", true, hfTokenPresent ? "public-or-server-managed" : "public", ["discover", "download", "info"], [
-                hfTokenPresent
-                    ? "Public datasets are open by default. Gated/private repos can be accessed via the server-managed HF token when configured."
-                    : "Public datasets work without any user key. Gated/private repos need an operator or user token.",
-            ]),
-            describe("openml", "OpenML", true, "public", ["discover", "download", "info"], [
-                "OpenML is exposed as a keyless public provider through the gateway.",
-            ]),
-            describe("kaggle", "Kaggle", kaggleReady, managedOrMissing(kaggleReady), ["discover", "download", "info"], [
-                kaggleReady
-                    ? "Kaggle is available through server-managed credentials. End users do not need to pass their own key."
-                    : "Kaggle support exists, but no server-managed credentials are configured yet.",
-            ]),
-            describe("dataworld", "data.world", dataWorldReady, managedOrMissing(dataWorldReady), ["discover", "download", "info"], [
-                dataWorldReady
-                    ? "data.world is available through server-managed credentials."
-                    : "data.world support exists, but no server-managed token is configured yet.",
-            ]),
-            describe("s3", "Amazon S3", true, "public-or-server-managed", ["download", "info"], [
-                "Supports keyless download of public S3 objects via s3://bucket/key or HTTPS S3 URLs.",
-                "Bucket listing and search are intentionally not exposed.",
-            ]),
-            describe("bigquery", "BigQuery", bigQueryReady, managedOrMissing(bigQueryReady), ["info"], [
-                bigQueryReady
-                    ? "BigQuery is reserved for operator-managed connectors. Query execution is not implemented in this patch."
-                    : "BigQuery is scaffolded in the gateway contract, but no server-managed GCP configuration is present.",
-            ]),
-        ];
-        if (includeUnavailable) {
-            return catalogue;
-        }
-        return catalogue.filter(entry => entry.available);
-    }
-    /**
-     * Searches one or more providers for datasets matching a free-text query.
-     *
-     * Providers are queried one after another. Every dataset returned is saved
-     * to the metadata store on a best-effort basis, and a provider that throws
-     * is recorded in `notes` instead of failing the whole call. Results are
-     * de-duplicated on `source:id` (a later duplicate replaces an earlier one),
-     * sorted by descending `rankDataset` score and truncated to `limit`.
-     *
-     * @param {object} [options]
-     * @param {string} options.query - Search text; required after trimming.
-     * @param {string} [options.source="auto"] - Provider name, or "auto".
-     * @param {number|string} [options.limit=10] - Maximum results; values below 1
-     *   are raised to 1 and non-numeric values fall back to 10.
-     * @param {boolean} [options.publicOnly=true] - Only an explicit `false` disables it.
-     * @returns {Promise<object>} `{ query, requested_source, providers_tried, notes, results }`.
-     * @throws {Error} When the query is empty or the requested source cannot be used.
-     */
-    async discover(options = {}) {
-        const query = String(options.query || "").trim();
-        const requestedSource = options.source || "auto";
-        // A non-numeric limit would otherwise become NaN, making slice(0, NaN)
-        // return nothing and handing NaN to every provider; use the default.
-        const parsedLimit = Number(options.limit || 10);
-        const limit = Number.isFinite(parsedLimit) ? Math.max(1, parsedLimit) : 10;
-        const publicOnly = options.publicOnly !== false;
-        if (!query) {
-            throw new Error("query is required");
-        }
-        const notes = [];
-        const providers = this.resolveDiscoverSources(requestedSource, publicOnly, notes);
-        // Over-fetch per provider so that de-duplication and ranking still leave
-        // enough candidates to fill `limit`.
-        const perSourceLimit = Math.max(5, Math.ceil(limit / Math.max(providers.length, 1)) * 2);
-        const allResults = [];
-        for (const provider of providers) {
-            try {
-                const partial = await this.discoverFromSource(provider, query, perSourceLimit);
-                for (const dataset of partial) {
-                    try {
-                        this.deps.metadataStore.saveDataset(dataset);
-                    }
-                    catch {
-                        // best-effort metadata persistence
-                    }
-                    allResults.push(dataset);
-                }
-            }
-            catch (error) {
-                notes.push(`${provider}: ${(error?.message || error || "Unknown provider error").toString()}`);
-            }
-        }
-        const deduped = new Map();
-        for (const dataset of allResults) {
-            deduped.set(`${dataset.source}:${dataset.id}`, dataset);
-        }
-        const results = Array.from(deduped.values())
-            .sort((a, b) => this.rankDataset(b) - this.rankDataset(a))
-            .slice(0, limit);
-        return {
-            query,
-            requested_source: requestedSource,
-            providers_tried: providers,
-            notes,
-            results,
-        };
-    }
-    /**
-     * Resolves a dataset reference to a provider and fetches it to local disk.
-     *
-     * S3 references are downloaded directly. For everything else the provider is
-     * taken from the reference itself, then from the metadata store, and finally
-     * from a one-result discovery run, before handing off to the data ingestor.
-     *
-     * @param {object} options
-     * @param {string} options.datasetId - Dataset reference; required after trimming.
-     * @param {string} [options.source="auto"] - Provider name, or "auto".
-     * @param {string} [options.targetDir] - When set, the output is also copied here.
-     * @returns {Promise<object>} `{ dataset_id, resolved_source, local_path, notes }`,
-     *   plus `copied_to` on the ingestor path.
-     * @throws {Error} When the id is empty, the provider cannot be resolved or
-     *   downloaded, or required server-managed credentials are missing.
-     */
-    async download(options) {
-        const requested = String(options.datasetId || "").trim();
-        if (requested === "") {
-            throw new Error("dataset_id is required");
-        }
-        const notes = [];
-        const { source: hintedSource, datasetId: hintedId } = this.resolveDatasetReference(requested, options.source || "auto");
-        if (hintedSource === "bigquery") {
-            throw new Error("BigQuery gateway support is scaffolded for operator-managed connectors, but query/download execution is not implemented yet.");
-        }
-        if (hintedSource === "s3") {
-            const s3Path = await this.downloadPublicS3Object(hintedId, options.targetDir);
-            return {
-                dataset_id: requested,
-                resolved_source: "s3",
-                local_path: s3Path,
-                notes: ["Downloaded via the keyless S3 gateway path."],
-            };
-        }
-        let source = this.toIngestSource(hintedSource);
-        let datasetId = hintedId;
-        if (!source) {
-            // Second attempt: a dataset already recorded in the metadata store.
-            const known = this.lookupKnownDataset(requested);
-            const knownSource = this.toIngestSource(known?.source);
-            if (known && knownSource) {
-                source = knownSource;
-                datasetId = known.id;
-            }
-        }
-        if (!source) {
-            // Last resort: search every provider and take the top hit.
-            const { results } = await this.discover({ query: requested, source: "auto", limit: 1, publicOnly: false });
-            if (results.length === 0) {
-                throw new Error(`Unable to resolve provider for '${requested}'. Run unified_dataset_api with operation='discover' first or pass an explicit source.`);
-            }
-            const topHit = results[0];
-            const topSource = this.toIngestSource(topHit.source);
-            if (!topSource) {
-                throw new Error(`Resolved provider '${topHit.source}' cannot be downloaded through the dataset ingestor.`);
-            }
-            source = topSource;
-            datasetId = topHit.id;
-            notes.push(`Auto-resolved provider to ${source}.`);
-        }
-        const ingestor = this.deps.dataIngestor;
-        if (source === "kaggle" && !ingestor.hasKaggleCredentials()) {
-            throw new Error("Kaggle is configured as a gateway source, but no server-managed credentials are available.");
-        }
-        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
-            throw new Error("data.world is configured as a gateway source, but no server-managed token is available.");
-        }
-        const ignoreProgress = () => undefined;
-        const localPath = await ingestor.ensureData(datasetId, source, ignoreProgress);
-        let copiedTo;
-        if (options.targetDir) {
-            copiedTo = this.copyDownloadOutput(localPath, options.targetDir);
-            notes.push(`Copied dataset output to ${copiedTo}.`);
-        }
-        return {
-            dataset_id: datasetId,
-            resolved_source: source,
-            local_path: localPath,
-            copied_to: copiedTo,
-            notes,
-        };
-    }
-    /**
-     * Returns what is known about a dataset reference.
-     *
-     * The metadata store is consulted first (with and without the provider
-     * prefix). S3 and BigQuery references get a synthetic answer; anything else
-     * triggers a five-result discovery run, preferring an exact id match.
-     *
-     * @param {object} options
-     * @param {string} options.datasetId - Dataset reference; required after trimming.
-     * @param {string} [options.source="auto"] - Provider name, or "auto".
-     * @param {boolean} [options.publicOnly=true] - Forwarded to discovery.
-     * @returns {Promise<object>} `{ dataset_id, resolved_source, notes, dataset }`;
-     *   `dataset` is absent for BigQuery and undefined when nothing was found.
-     * @throws {Error} When the id is empty or discovery rejects the request.
-     */
-    async info(options) {
-        const requested = String(options.datasetId || "").trim();
-        if (!requested) {
-            throw new Error("dataset_id is required");
-        }
-        const resolved = this.resolveDatasetReference(requested, options.source || "auto");
-        const metadataMatch = this.lookupKnownDataset(requested) || (resolved.datasetId !== requested ? this.lookupKnownDataset(resolved.datasetId) : undefined);
-        if (metadataMatch) {
-            return {
-                dataset_id: requested,
-                resolved_source: metadataMatch.source,
-                notes: [],
-                dataset: metadataMatch,
-            };
-        }
-        if (resolved.source === "s3") {
-            return {
-                dataset_id: requested,
-                resolved_source: "s3",
-                notes: ["S3 info is derived from the object URI. Discovery/listing is intentionally not supported."],
-                dataset: {
-                    id: requested,
-                    source: "s3",
-                    uri: this.toS3HttpsUrl(resolved.datasetId),
-                },
-            };
-        }
-        if (resolved.source === "bigquery") {
-            return {
-                dataset_id: requested,
-                resolved_source: "bigquery",
-                notes: ["BigQuery is reserved for operator-managed connectors. Detailed inspection is not implemented in this patch."],
-            };
-        }
-        const discovery = await this.discover({
-            query: resolved.datasetId,
-            source: resolved.source || "auto",
-            limit: 5,
-            publicOnly: options.publicOnly !== false,
-        });
-        const exact = discovery.results.find(dataset => this.matchesDatasetReference(dataset, requested));
-        const dataset = exact || discovery.results[0];
-        // Copy so the fallback note does not mutate the discovery result.
-        const notes = [...discovery.notes];
-        if (!exact && dataset) {
-            notes.push(`No exact match for '${requested}'; returning the closest result ${dataset.source}:${dataset.id}.`);
-        }
-        return {
-            dataset_id: requested,
-            // Report the source of the dataset actually returned, so the two
-            // fields cannot disagree when the top hit is used as a fallback.
-            resolved_source: dataset?.source,
-            notes,
-            dataset,
-        };
-    }
-    /**
-     * Runs a discovery query against exactly one provider.
-     *
-     * @param {string} source - Provider name.
-     * @param {string} query - Search text.
-     * @param {number} limit - Maximum number of results requested from the provider.
-     * @returns {Promise<*>} Whatever the provider's scrape/discover call resolves to.
-     * @throws {Error} For "s3", "bigquery" and any unrecognised provider name.
-     */
-    async discoverFromSource(source, query, limit) {
-        if (source === "huggingface") {
-            // The scraper is created before the query analysis is awaited.
-            const scraper = new HuggingFaceScraper();
-            const intent = await analyzeDatasetQuery(query);
-            return await scraper.scrape(limit, true, intent);
-        }
-        if (source === "openml") {
-            return await this.deps.openmlSource.discover(query, limit);
-        }
-        if (source === "kaggle") {
-            return await this.deps.kaggleSource.discover(query, limit);
-        }
-        if (source === "dataworld") {
-            return await this.deps.dataworldSource.discover(query, limit);
-        }
-        if (source === "s3") {
-            throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
-        }
-        if (source === "bigquery") {
-            throw new Error("BigQuery discovery is not implemented in the unified gateway.");
-        }
-        throw new Error(`Unsupported provider: ${source}`);
-    }
-    /**
-     * Decides which providers a discovery request should query.
-     *
-     * An explicit source is validated and returned on its own. For "auto",
-     * Hugging Face and OpenML are always used; Kaggle and data.world are added
-     * only when `publicOnly` is falsy and their credentials are configured.
-     *
-     * @param {string} source - Provider name, or "auto".
-     * @param {boolean} publicOnly - When truthy, credentialed providers are not considered.
-     * @param {string[]} notes - Receives a message for each credentialed provider skipped.
-     * @returns {string[]} Provider names in query order.
-     * @throws {Error} When an explicit source lacks credentials or cannot discover.
-     */
-    resolveDiscoverSources(source, publicOnly, notes) {
-        if (source === "auto") {
-            const selected = ["huggingface", "openml"];
-            if (publicOnly) {
-                return selected;
-            }
-            if (this.deps.dataIngestor.hasKaggleCredentials()) {
-                selected.push("kaggle");
-            }
-            else {
-                notes.push("Kaggle skipped because no server-managed credentials are configured.");
-            }
-            if (this.deps.hasDataWorldToken()) {
-                selected.push("dataworld");
-            }
-            else {
-                notes.push("data.world skipped because no server-managed token is configured.");
-            }
-            return selected;
-        }
-        if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
-            throw new Error("Kaggle requires server-managed credentials and none are configured.");
-        }
-        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
-            throw new Error("data.world requires a server-managed token and none is configured.");
-        }
-        if (source === "s3" || source === "bigquery") {
-            throw new Error(`${source} does not currently support discover operation through the gateway.`);
-        }
-        return [source];
-    }
-    /**
-     * Splits a dataset reference into a provider and a provider-local id.
-     *
-     * With an explicit source the provider prefix (if any) is stripped, except
-     * for S3 whose reference is kept verbatim. With "auto" the provider is
-     * inferred from an S3 URL shape, a `provider:` prefix, an all-digit id
-     * (OpenML) or an `owner/name` shape without a colon (Hugging Face).
-     *
-     * @param {string} datasetId - Raw reference.
-     * @param {string} source - Provider name, or "auto".
-     * @returns {{source?: string, datasetId: string}} `source` is omitted when
-     *   nothing could be inferred.
-     */
-    resolveDatasetReference(datasetId, source) {
-        const reference = datasetId.trim();
-        if (source === "s3") {
-            return { source, datasetId: reference };
-        }
-        if (source !== "auto") {
-            return { source, datasetId: this.stripSourcePrefix(reference, source) };
-        }
-        const s3Shapes = [
-            /^s3:\/\//i,
-            /^https?:\/\/[^\s]+\.s3[.-][^\s]+/i,
-            /^https?:\/\/s3\.[^\s]+amazonaws\.com\//i,
-        ];
-        if (s3Shapes.some(shape => shape.test(reference))) {
-            return { source: "s3", datasetId: reference };
-        }
-        // Checked in order; the first matching prefix wins and is removed.
-        const prefixed = [
-            ["kaggle", /^kaggle:/i],
-            ["huggingface", /^(huggingface|hf):/i],
-            ["openml", /^openml:/i],
-            ["dataworld", /^dataworld:/i],
-            ["bigquery", /^bigquery:/i],
-        ];
-        for (const [provider, prefix] of prefixed) {
-            if (prefix.test(reference)) {
-                return { source: provider, datasetId: reference.replace(prefix, "") };
-            }
-        }
-        if (/^\d+$/.test(reference)) {
-            return { source: "openml", datasetId: reference };
-        }
-        const looksLikeRepoPath = reference.includes("/") && !reference.includes(":");
-        if (looksLikeRepoPath) {
-            return { source: "huggingface", datasetId: reference };
-        }
-        return { datasetId: reference };
-    }
-    /**
-     * Removes a leading `<source>:` prefix (case-insensitive) from a dataset id.
-     * For Hugging Face both `huggingface:` and `hf:` are recognised.
-     *
-     * @param {string} datasetId - Reference that may carry a provider prefix.
-     * @param {string} source - Provider name whose prefix should be removed.
-     * @returns {string} The id without the prefix, or unchanged when absent.
-     */
-    stripSourcePrefix(datasetId, source) {
-        if (source === "huggingface") {
-            return datasetId.replace(/^(huggingface|hf):/i, "");
-        }
-        // `source` is caller-supplied: escape regex metacharacters so a value such
-        // as "a(" cannot throw a SyntaxError or match an unintended prefix.
-        const escapedSource = String(source).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-        return datasetId.replace(new RegExp(`^${escapedSource}:`, "i"), "");
-    }
-    /**
-     * Looks a dataset up in the metadata store, first by the id as given and
-     * then by the id with a known provider prefix removed.
-     *
-     * @param {string} datasetId - Reference, with or without a provider prefix.
-     * @returns {*} The stored dataset, or undefined when neither form is found.
-     */
-    lookupKnownDataset(datasetId) {
-        const store = this.deps.metadataStore;
-        const asGiven = store.getDataset(datasetId);
-        if (asGiven) {
-            return asGiven;
-        }
-        const withoutPrefix = datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "");
-        // Skip the second lookup when there was no prefix to strip.
-        if (withoutPrefix !== datasetId) {
-            const stripped = store.getDataset(withoutPrefix);
-            if (stripped) {
-                return stripped;
-            }
-        }
-        return undefined;
-    }
-    /**
-     * Tells whether a dataset is the one a reference points at, ignoring case.
-     * Matches either the bare id (known provider prefix removed from the
-     * reference) or the full `source:id` form.
-     *
-     * @param {{id: string, source: string}} dataset - Candidate dataset.
-     * @param {string} requested - Reference supplied by the caller.
-     * @returns {boolean} True on a match.
-     */
-    matchesDatasetReference(dataset, requested) {
-        const bareRequested = requested
-            .replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "")
-            .toLowerCase();
-        const qualifiedId = `${dataset.source}:${dataset.id}`.toLowerCase();
-        if (dataset.id.toLowerCase() === bareRequested) {
-            return true;
-        }
-        return qualifiedId === requested.toLowerCase();
-    }
-    /**
-     * Narrows a provider name to the ones this gateway hands to the data ingestor.
-     *
-     * @param {*} source - Candidate provider name.
-     * @returns {string|undefined} The same name when it is "huggingface",
-     *   "openml", "kaggle" or "dataworld"; otherwise undefined.
-     */
-    toIngestSource(source) {
-        const ingestible = ["huggingface", "openml", "kaggle", "dataworld"];
-        return ingestible.includes(source) ? source : undefined;
-    }
-    /**
-     * Computes the ordering score used when sorting discovery results:
-     * `relevance_score * 1000 + quality_score * 100 + downloads`.
-     *
-     * @param {object} dataset - Dataset carrying the optional score fields.
-     * @returns {number} Always a finite number; higher ranks first.
-     */
-    rankDataset(dataset) {
-        // A non-numeric or infinite field would produce NaN here (or NaN once two
-        // ranks are subtracted), which makes the sort comparator inconsistent.
-        const finiteOrZero = value => {
-            const parsed = Number(value || 0);
-            return Number.isFinite(parsed) ? parsed : 0;
-        };
-        const relevance = finiteOrZero(dataset.relevance_score) * 1000;
-        const quality = finiteOrZero(dataset.quality_score) * 100;
-        const downloads = finiteOrZero(dataset.downloads);
-        return relevance + quality + downloads;
-    }
-    /**
-     * Copies a downloaded file or directory into a target directory, creating
-     * the directory when needed and overwriting an existing entry of the same name.
-     *
-     * @param {string} localPath - Path of the downloaded output.
-     * @param {string} targetDir - Directory to copy into.
-     * @returns {string} Absolute path of the copy.
-     */
-    copyDownloadOutput(localPath, targetDir) {
-        const absoluteTarget = path.resolve(targetDir);
-        const copyPath = path.join(absoluteTarget, path.basename(localPath));
-        fs.mkdirSync(absoluteTarget, { recursive: true });
-        fs.cpSync(localPath, copyPath, { recursive: true, force: true });
-        return copyPath;
-    }
-    /**
-     * Downloads an S3 object over HTTP(S) without credentials and records the
-     * download in the metadata store.
-     *
-     * @param {string} datasetId - `s3://bucket/key` reference or an HTTP(S) URL.
-     * @param {string} [targetDir] - Output directory; defaults to
-     *   `<dataRoot>/data/raw`.
-     * @returns {Promise<string>} Path of the downloaded file.
-     */
-    async downloadPublicS3Object(datasetId, targetDir) {
-        const objectUrl = this.toS3HttpsUrl(datasetId);
-        const { pathname } = new URL(objectUrl);
-        // Fall back to a fixed name when the URL path has no final segment.
-        const baseName = path.basename(pathname) || "s3-object.bin";
-        const destinationDir = path.resolve(targetDir || path.join(this.deps.dataRoot, "data", "raw"));
-        fs.mkdirSync(destinationDir, { recursive: true });
-        const destinationFile = path.join(destinationDir, baseName);
-        await this.downloadToFile(objectUrl, destinationFile);
-        const { size } = fs.statSync(destinationFile);
-        this.deps.metadataStore.registerDownload(datasetId, destinationFile, "completed", size);
-        return destinationFile;
-    }
-    /**
-     * Converts an S3 reference into a URL that can be fetched directly.
-     * HTTP(S) URLs are returned untouched; `s3://bucket/key` becomes a
-     * virtual-hosted-style URL with each key segment percent-encoded.
-     *
-     * @param {string} datasetId - `s3://bucket/key` reference or an HTTP(S) URL.
-     * @returns {string} The URL to download.
-     * @throws {Error} When the reference is neither form.
-     */
-    toS3HttpsUrl(datasetId) {
-        const alreadyUrl = /^https?:\/\//i.test(datasetId);
-        if (alreadyUrl) {
-            return datasetId;
-        }
-        const parts = datasetId.match(/^s3:\/\/([^/]+)\/(.+)$/i);
-        if (parts === null) {
-            throw new Error("S3 source expects an s3://bucket/key object reference or a direct HTTPS S3 URL.");
-        }
-        const [, bucketName, rawKey] = parts;
-        // Encode segment by segment so the "/" separators in the key survive.
-        const encodedKey = rawKey
-            .split("/")
-            .map(segment => encodeURIComponent(segment))
-            .join("/");
-        return `https://${bucketName}.s3.amazonaws.com/${encodedKey}`;
-    }
-    /**
-     * Streams the body of an HTTP(S) GET request into a file, following redirects.
-     *
-     * @param {string} url - Absolute http: or https: URL.
-     * @param {string} destination - File path to write.
-     * @param {number} [maxRedirects=5] - Redirect hops still allowed.
-     * @returns {Promise<void>} Resolves once the file has been written and closed.
-     * @throws {Error} On a non-2xx status, too many redirects, an invalid
-     *   redirect target, or any request, response or file-system error.
-     */
-    async downloadToFile(url, destination, maxRedirects = 5) {
-        await new Promise((resolve, reject) => {
-            // Parsing normalises the scheme, so "HTTPS://..." picks the right transport.
-            const transport = new URL(url).protocol === "https:" ? https : http;
-            const request = transport.get(url, response => {
-                const statusCode = response.statusCode || 0;
-                const location = response.headers.location;
-                if (statusCode >= 300 && statusCode < 400 && location) {
-                    response.resume();
-                    if (maxRedirects <= 0) {
-                        reject(new Error("Download failed: too many redirects"));
-                        return;
-                    }
-                    let nextUrl;
-                    try {
-                        // Location may be relative; resolve it against the current URL.
-                        nextUrl = new URL(location, url).toString();
-                    }
-                    catch (error) {
-                        reject(error);
-                        return;
-                    }
-                    this.downloadToFile(nextUrl, destination, maxRedirects - 1).then(resolve, reject);
-                    return;
-                }
-                if (statusCode < 200 || statusCode >= 300) {
-                    response.resume();
-                    reject(new Error(`Download failed with status ${statusCode}`));
-                    return;
-                }
-                const file = fs.createWriteStream(destination);
-                // Tear both streams down and remove the partial file before
-                // reporting the failure, so no truncated download is left behind.
-                const fail = error => {
-                    response.destroy();
-                    file.destroy();
-                    fs.rm(destination, { force: true }, () => reject(error));
-                };
-                // Without this handler a connection dropped mid-body would never
-                // settle the promise.
-                response.on("error", fail);
-                file.on("error", fail);
-                file.on("finish", () => {
-                    file.close(() => resolve());
-                });
-                response.pipe(file);
-            });
-            request.on("error", reject);
-        });
-    }
-}