vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
import { categorizeLicense } from "./license.js";
|
|
2
|
-
import { calculateQualityScore } from "./quality.js";
|
|
3
|
-
import { classifyDomain } from "./domain.js";
|
|
4
|
-
import { rateLimitedFetch, delayBetweenRequests } from "./rate-limiter.js";
|
|
5
|
-
export class KaggleMetadataScraper {
    // Kaggle credentials, sent as HTTP Basic auth to the public API.
    username;
    key;

    /**
     * @param {string} username Kaggle account username.
     * @param {string} key Kaggle API key.
     */
    constructor(username, key) {
        this.username = username;
        this.key = key;
    }

    /**
     * Searches Kaggle's dataset list API and returns normalized metadata records.
     * Pages through results (up to `limit`) with rate-limited fetches and
     * inter-request delays; on error, returns whatever was collected so far.
     *
     * @param {string} query Search text.
     * @param {number} limit Maximum number of datasets to return.
     * @param {boolean} usePagination Whether to walk additional pages up to `limit`.
     * @returns {Promise<object[]>} Transformed dataset metadata (possibly partial on error).
     */
    async scrape(query, limit = 20, usePagination = true) {
        console.error(`[Kaggle] Searching for "${query}" (limit: ${limit}, pagination: ${usePagination})...`);
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const results = [];
        const MAX_PAGE_SIZE = 100; // Kaggle API max page size
        const pageSize = Math.min(limit, MAX_PAGE_SIZE);
        let page = 1;
        let totalFetched = 0;
        let hasMore = true;
        try {
            while (hasMore && totalFetched < limit) {
                const url = `https://www.kaggle.com/api/v1/datasets/list?search=${encodeURIComponent(query)}&page_size=${pageSize}&page=${page}`;
                console.error(`[Kaggle] Fetching page ${page} (${totalFetched}/${limit} datasets so far)...`);
                // Use rate-limited fetch with retry logic
                const response = await rateLimitedFetch(url, {
                    headers: {
                        'Authorization': `Basic ${auth}`,
                        'Content-Type': 'application/json'
                    }
                }, {
                    maxRetries: 3,
                    initialDelay: 2000, // Start with 2 seconds
                    maxDelay: 30000 // Max 30 seconds
                });
                const datasets = await response.json();
                if (!datasets || datasets.length === 0) {
                    hasMore = false;
                    break;
                }
                // Add delay between processing datasets to avoid rate limits.
                // BUG FIX: stop exactly at `limit` — previously every page was
                // consumed in full, so results could exceed the requested limit
                // by up to one page.
                for (let i = 0; i < datasets.length && totalFetched < limit; i++) {
                    const ds = datasets[i];
                    try {
                        const metadata = this.transform(ds);
                        results.push(metadata);
                        totalFetched++;
                        console.error(`[Kaggle] Added: ${ds.ref} (${ds.downloadCount} downloads)`);
                        // Add small delay every 5 datasets
                        if ((i + 1) % 5 === 0 && i < datasets.length - 1) {
                            await delayBetweenRequests(500);
                        }
                    }
                    catch (e) {
                        console.error(`[Kaggle] ERROR: Failed to transform ${ds.ref}:`, e);
                    }
                }
                // Check if we should continue pagination
                if (usePagination && datasets.length === pageSize && totalFetched < limit) {
                    page++;
                    // Add delay between pages to avoid rate limits
                    await delayBetweenRequests(1000);
                }
                else {
                    hasMore = false;
                }
            }
            console.error(`[Kaggle] Completed: ${results.length} datasets found for "${query}"`);
            return results;
        }
        catch (e) {
            // Handle rate limit errors specifically
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[Kaggle] Rate limit error:", e.message);
                console.error("Consider adding delays between requests or reducing batch size");
            }
            else {
                console.error("[Kaggle] Scrape error:", e.message || e);
            }
            // Return partial results if we got some before the error
            if (results.length > 0) {
                console.error(`[Kaggle] Returning ${results.length} partial results before error`);
            }
            return results;
        }
    }

    /**
     * Converts a raw Kaggle list-API record into the internal metadata shape,
     * attaching a quality score, license category, domain, and warnings.
     * @param {object} ds Raw dataset record from the Kaggle list API.
     * @returns {object} Normalized dataset metadata.
     */
    transform(ds) {
        const repoId = ds.ref;
        const tags = ds.tags?.map(t => t.name) || [];
        const description = ds.description || "";
        const license = categorizeLicense(ds.licenseName);
        const warnings = [];
        // Kaggle doesn't give us splits in the list API easily
        const sizeBytes = this.parseSize(ds.size);
        const splits = [
            {
                name: "data",
                num_examples: 0,
                size_bytes: sizeBytes
            }
        ];
        const totalSizeMB = sizeBytes ? Math.round(sizeBytes / (1024 * 1024) * 100) / 100 : 0;
        // Populate warnings
        if (description.length < 100)
            warnings.push("Short description; results may be less relevant");
        const lastUpdatedDate = new Date(ds.lastUpdated);
        const fourYearsAgo = new Date();
        fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
        if (lastUpdatedDate < fourYearsAgo) {
            warnings.push(`Stale data: Last updated ${lastUpdatedDate.getFullYear()}`);
        }
        warnings.push("No specific data splits identified (Kaggle API limitation)");
        // Classify domain
        const task = this.extractTask(tags);
        const domain = classifyDomain(description, tags, repoId, task);
        return {
            id: repoId,
            source: "kaggle",
            name: ds.title,
            description: description,
            quality_warnings: warnings,
            downloads: ds.downloadCount,
            likes: ds.voteCount,
            stars: 0,
            tags: tags,
            last_updated: ds.lastUpdated,
            task: task,
            domain: domain,
            languages: [],
            splits,
            license,
            quality_score: calculateQualityScore({
                downloads: ds.downloadCount,
                likes: ds.voteCount,
                hasDescription: description.length > 50,
                descriptionLength: description.length,
                hasTrainSplit: false,
                hasTestSplit: false,
                lastUpdated: ds.lastUpdated,
                licenseCategory: license.category
            }),
            download_url: `https://www.kaggle.com/datasets/${ds.ref}`,
            format: undefined,
            total_examples: 0,
            total_size_bytes: sizeBytes,
            total_size_mb: totalSizeMB,
            columns: [],
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: true
        };
    }

    /**
     * Parses a human-readable size string ("12 MB", "1.5GB") into bytes.
     * Unknown or missing values yield 0; a bare number is returned as-is.
     * @param {string|undefined} sizeStr Size string from the Kaggle API.
     * @returns {number} Size in bytes (0 if unparseable).
     */
    parseSize(sizeStr) {
        if (!sizeStr)
            return 0;
        const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*([KMGT]B)$/i);
        if (!match)
            return 0;
        const value = parseFloat(match[1]);
        const unit = match[2].toUpperCase();
        switch (unit) {
            case 'KB': return value * 1024;
            case 'MB': return value * 1024 * 1024;
            case 'GB': return value * 1024 * 1024 * 1024;
            case 'TB': return value * 1024 * 1024 * 1024 * 1024;
            default: return value;
        }
    }

    /**
     * Maps Kaggle tags to a task label.
     * @param {string[]} tags Tag names from the dataset record.
     * @returns {string} Currently always "unknown" — Kaggle tags are not yet mapped.
     */
    extractTask(tags) {
        // Similar to HF but Kaggle tags might be different
        return "unknown";
    }
}
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import os from "os";
|
|
5
|
-
export class KaggleSource {
    pythonPath = "python";
    scriptPath;

    /**
     * Locates the Python Kaggle engine script, preferring the per-user
     * ~/.vesper install, then the build tree, then the sibling source tree.
     * @param {string} buildDir Directory to resolve build-relative paths from.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "kaggle_engine.py"),
        ];
        const found = candidates.find((candidate) => fs.existsSync(candidate));
        // Fall back to the first candidate even when none exists, matching
        // the legacy behavior (the spawn will surface the failure later).
        this.scriptPath = found ?? candidates[0];
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }

    /**
     * Searches Kaggle for datasets matching `query`.
     * @param {string} query Search text.
     * @param {number} limit Maximum results to request.
     * @returns {Promise<object[]>} Discovered dataset records.
     */
    async discover(query, limit = 20) {
        const payload = await this.run(["discover", query, String(limit)]);
        if (!payload.ok) {
            throw new Error(payload.error || "Kaggle discover failed");
        }
        return (payload.results || []);
    }

    /**
     * Downloads a dataset by ref, optionally into `targetDir`.
     * @param {string} datasetRef Kaggle dataset reference (owner/name).
     * @param {string} [targetDir] Optional destination directory.
     * @returns {Promise<{local_path: string, target_dir: string}>}
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir) {
            args.push(targetDir);
        }
        const payload = await this.run(args);
        if (!payload.ok) {
            throw new Error(payload.error || "Kaggle download failed");
        }
        return {
            local_path: payload.local_path,
            target_dir: payload.target_dir,
        };
    }

    /**
     * Spawns the Python engine with `args` and resolves with its parsed
     * JSON stdout; rejects on a non-zero exit or unparseable output.
     * @param {string[]} args CLI arguments for kaggle_engine.py.
     */
    async run(args) {
        return new Promise((resolve, reject) => {
            const child = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let outBuf = "";
            let errBuf = "";
            child.stdout.on("data", (chunk) => {
                outBuf += chunk.toString();
            });
            child.stderr.on("data", (chunk) => {
                errBuf += chunk.toString();
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(errBuf || outBuf || `kaggle_engine exited with code ${code}`));
                    return;
                }
                try {
                    resolve(JSON.parse(outBuf));
                }
                catch {
                    reject(new Error(`Failed to parse kaggle_engine output: ${outBuf}`));
                }
            });
        });
    }
}
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
const SAFE_KEYWORDS = ["mit", "apache", "bsd", "cc0", "cc-by-4.0", "cc-by-sa-4.0", "odc-by", "pddl", "openrail", "creative commons attribution 4.0", "public domain"];
|
|
2
|
-
const RESTRICTED_KEYWORDS = ["nc", "non-commercial", "research-only", "academic", "gpl", "agpl", "proprietary", "custom"];
|
|
3
|
-
// Permissive licenses for MVP filter
|
|
4
|
-
const PERMISSIVE_LICENSES = ["mit", "apache", "apache-2.0", "bsd", "cc0", "cc-by-4.0", "odc-by", "pddl", "openrail"];
|
|
5
|
-
export function categorizeLicense(licenseId, licenseUrl) {
|
|
6
|
-
const id = (licenseId || "unknown").toLowerCase();
|
|
7
|
-
const usageRestrictions = [];
|
|
8
|
-
let requiresConsent = false;
|
|
9
|
-
// Check for usage restrictions
|
|
10
|
-
if (id.includes("nc") || id.includes("non-commercial")) {
|
|
11
|
-
usageRestrictions.push("non-commercial");
|
|
12
|
-
}
|
|
13
|
-
if (id.includes("research-only") || id.includes("academic")) {
|
|
14
|
-
usageRestrictions.push("academic-only");
|
|
15
|
-
}
|
|
16
|
-
if (id.includes("nd") || id.includes("no-derivatives")) {
|
|
17
|
-
usageRestrictions.push("no-derivatives");
|
|
18
|
-
}
|
|
19
|
-
if (id.includes("gpl") || id.includes("agpl")) {
|
|
20
|
-
usageRestrictions.push("no-derivatives"); // GPL requires derivative works to be GPL
|
|
21
|
-
}
|
|
22
|
-
// Check if consent is required (GDPR, Kaggle, etc.)
|
|
23
|
-
if (id.includes("gdpr") || id.includes("consent") || id.includes("kaggle")) {
|
|
24
|
-
requiresConsent = true;
|
|
25
|
-
}
|
|
26
|
-
// If ID contains restricted keywords
|
|
27
|
-
if (RESTRICTED_KEYWORDS.some(k => id.includes(k))) {
|
|
28
|
-
return {
|
|
29
|
-
id,
|
|
30
|
-
category: "restricted",
|
|
31
|
-
commercial_use: false,
|
|
32
|
-
usage_restrictions: usageRestrictions.length > 0 ? usageRestrictions : ["non-commercial"],
|
|
33
|
-
url: licenseUrl,
|
|
34
|
-
warnings: [
|
|
35
|
-
"Restricted usage terms apply",
|
|
36
|
-
"Verify license terms before commercial application",
|
|
37
|
-
],
|
|
38
|
-
requires_consent: requiresConsent,
|
|
39
|
-
};
|
|
40
|
-
}
|
|
41
|
-
// If ID is a common safe license
|
|
42
|
-
if (SAFE_KEYWORDS.some(k => id.includes(k))) {
|
|
43
|
-
return {
|
|
44
|
-
id,
|
|
45
|
-
category: "safe",
|
|
46
|
-
commercial_use: true,
|
|
47
|
-
usage_restrictions: [],
|
|
48
|
-
url: licenseUrl,
|
|
49
|
-
warnings: [],
|
|
50
|
-
requires_consent: requiresConsent,
|
|
51
|
-
};
|
|
52
|
-
}
|
|
53
|
-
return {
|
|
54
|
-
id: id || "unknown",
|
|
55
|
-
category: "unknown",
|
|
56
|
-
usage_restrictions: usageRestrictions,
|
|
57
|
-
url: licenseUrl,
|
|
58
|
-
warnings: [
|
|
59
|
-
"License information unclear or unknown",
|
|
60
|
-
"Use at your own risk",
|
|
61
|
-
],
|
|
62
|
-
requires_consent: requiresConsent,
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
export function isPermissiveLicense(licenseId) {
|
|
66
|
-
const id = (licenseId || "unknown").toLowerCase();
|
|
67
|
-
return PERMISSIVE_LICENSES.some(perm => id.includes(perm));
|
|
68
|
-
}
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
export class MonitoringService {
    monitorStore;
    metadataStore;

    /**
     * @param monitorStore Store of dataset monitors and webhook configs.
     * @param metadataStore Store of dataset metadata records.
     */
    constructor(monitorStore, metadataStore) {
        this.monitorStore = monitorStore;
        this.metadataStore = metadataStore;
    }

    /**
     * Checks all active monitors for updates.
     * @param fetchLatest A function that fetches the latest metadata from the source (HF/Kaggle)
     * @returns Diffs for every monitored dataset that changed.
     */
    async checkUpdates(fetchLatest) {
        const diffs = [];
        for (const monitor of this.monitorStore.getActiveMonitors()) {
            const current = this.metadataStore.getDataset(monitor.dataset_id);
            if (!current) {
                continue;
            }
            const latest = await fetchLatest(monitor.dataset_id, current.source);
            if (!latest) {
                continue;
            }
            if (latest.last_updated === monitor.last_checked_version) {
                continue;
            }
            const diff = this.compareVersions(current, latest);
            if (diff.changes.length === 0) {
                continue;
            }
            diffs.push(diff);
            await this.notify(monitor, diff);
            // Record the version just observed so the next sweep skips it.
            monitor.last_checked_version = latest.last_updated;
            monitor.updated_at = new Date().toISOString();
            this.monitorStore.saveMonitor(monitor);
            // Persist the refreshed metadata.
            this.metadataStore.saveDataset(latest);
            if (monitor.auto_reprocess) {
                await this.triggerReprocess(monitor.dataset_id);
            }
        }
        return diffs;
    }

    /**
     * Diffs two metadata versions of the same dataset, reporting changed
     * scalar fields plus structural changes to splits, with an impact score.
     */
    compareVersions(oldVer, newVer) {
        const changes = [];
        // Scalar fields whose drift is worth reporting.
        const tracked = ["downloads", "likes", "total_examples", "total_size_mb", "quality_score"];
        for (const field of tracked) {
            if (oldVer[field] === newVer[field]) {
                continue;
            }
            changes.push({
                field: String(field),
                old_value: oldVer[field],
                new_value: newVer[field]
            });
        }
        // Splits are compared structurally via their JSON encoding.
        if (JSON.stringify(oldVer.splits) !== JSON.stringify(newVer.splits)) {
            changes.push({
                field: "splits",
                old_value: oldVer.splits,
                new_value: newVer.splits
            });
        }
        return {
            dataset_id: oldVer.id,
            old_version: oldVer.last_updated,
            new_version: newVer.last_updated,
            changes,
            impact_score: this.calculateImpact(changes)
        };
    }

    /**
     * Scores a change set from 0-100 using per-field weights; fields outside
     * the weight table contribute nothing.
     */
    calculateImpact(changes) {
        const WEIGHTS = {
            total_examples: 40,
            splits: 30,
            quality_score: 20,
            total_size_mb: 10
        };
        let total = 0;
        for (const { field } of changes) {
            total += Object.hasOwn(WEIGHTS, field) ? WEIGHTS[field] : 0;
        }
        return Math.min(total, 100);
    }

    /** Delivers a diff to every enabled webhook attached to the monitor. */
    async notify(monitor, diff) {
        for (const webhookId of monitor.webhook_ids) {
            const webhook = this.monitorStore.getWebhook(webhookId);
            if (webhook && webhook.enabled) {
                await this.sendToWebhook(webhook, diff);
            }
        }
    }

    /** Formats and (currently only logs) a webhook notification payload. */
    async sendToWebhook(webhook, diff) {
        console.error(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
        // In a real implementation, this would be an HTTP POST
        // For now, we simulate the payload
        const payload = {
            text: `Dataset ${diff.dataset_id} updated!`,
            changes: diff.changes,
            impact: diff.impact_score
        };
        // await axios.post(webhook.url, payload);
    }

    /** Kicks off reprocessing for a dataset (stub — logs only). */
    async triggerReprocess(datasetId) {
        console.error(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
        // This would call IngestionService or similar
    }
}
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
export class MonitoringStore {
    db;

    /**
     * @param db SQLite handle (better-sqlite3 style: exec/prepare/get/all/run).
     */
    constructor(db) {
        this.db = db;
        this.init();
    }

    /** Creates the backing tables if absent (idempotent). */
    init() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS dataset_monitors (
        dataset_id TEXT PRIMARY KEY,
        enabled BOOLEAN DEFAULT 1,
        auto_reprocess BOOLEAN DEFAULT 0,
        last_checked_version TEXT,
        webhook_ids TEXT, -- JSON array
        created_at TEXT,
        updated_at TEXT
      );

      CREATE TABLE IF NOT EXISTS webhook_configs (
        id TEXT PRIMARY KEY,
        name TEXT,
        channel TEXT,
        url TEXT,
        enabled BOOLEAN DEFAULT 1
      );
    `);
    }

    /** Inserts or updates a monitor row (upsert keyed on dataset_id). */
    saveMonitor(monitor) {
        const stmt = this.db.prepare(`
      INSERT INTO dataset_monitors (dataset_id, enabled, auto_reprocess, last_checked_version, webhook_ids, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(dataset_id) DO UPDATE SET
        enabled=excluded.enabled,
        auto_reprocess=excluded.auto_reprocess,
        last_checked_version=excluded.last_checked_version,
        webhook_ids=excluded.webhook_ids,
        updated_at=excluded.updated_at
    `);
        // Booleans are stored as 0/1; webhook ids as a JSON array string.
        stmt.run(
            monitor.dataset_id,
            monitor.enabled ? 1 : 0,
            monitor.auto_reprocess ? 1 : 0,
            monitor.last_checked_version || null,
            JSON.stringify(monitor.webhook_ids),
            monitor.created_at,
            monitor.updated_at
        );
    }

    /** Fetches one monitor by dataset id, or null when missing. */
    getMonitor(datasetId) {
        const record = this.db.prepare("SELECT * FROM dataset_monitors WHERE dataset_id = ?").get(datasetId);
        if (!record) {
            return null;
        }
        // Rehydrate SQLite's 0/1 flags and JSON-encoded webhook list.
        return Object.assign({}, record, {
            enabled: Boolean(record.enabled),
            auto_reprocess: Boolean(record.auto_reprocess),
            webhook_ids: JSON.parse(record.webhook_ids)
        });
    }

    /** Returns every enabled monitor, rehydrated from its row form. */
    getActiveMonitors() {
        const records = this.db.prepare("SELECT * FROM dataset_monitors WHERE enabled = 1").all();
        return records.map((record) => Object.assign({}, record, {
            enabled: Boolean(record.enabled),
            auto_reprocess: Boolean(record.auto_reprocess),
            webhook_ids: JSON.parse(record.webhook_ids)
        }));
    }

    /** Inserts or updates a webhook config (upsert keyed on id). */
    saveWebhook(config) {
        const stmt = this.db.prepare(`
      INSERT INTO webhook_configs (id, name, channel, url, enabled)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        name=excluded.name,
        url=excluded.url,
        enabled=excluded.enabled
    `);
        stmt.run(config.id, config.name, config.channel, config.url, config.enabled ? 1 : 0);
    }

    /** Fetches one webhook config by id, or null when missing. */
    getWebhook(id) {
        const record = this.db.prepare("SELECT * FROM webhook_configs WHERE id = ?").get(id);
        if (!record) {
            return null;
        }
        return Object.assign({}, record, { enabled: Boolean(record.enabled) });
    }
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
// Empty export: keeps this compiled file a valid ES module with no runtime
// exports (presumably the TypeScript source held only type declarations,
// which are erased at build time — TODO confirm against the source tree).
export {};
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import os from "os";
|
|
5
|
-
export class OpenMLSource {
    // Interpreter used to launch the engine; refined to a venv binary (or the
    // Windows "py" launcher) by the constructor when available.
    pythonPath = "python";
    // Resolved location of openml_engine.py.
    scriptPath;
    /**
     * Locates the Python helper script and a suitable Python interpreter.
     * Script search order: ~/.vesper/python, <buildDir>/python, then
     * <buildDir>/../src/python; falls back to the first candidate even when
     * none exists so a later spawn fails with a clear error.
     * @param {string} [buildDir] - Base directory for script and venv lookups.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "openml_engine.py"),
            path.resolve(buildDir, "python", "openml_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "openml_engine.py"),
        ];
        this.scriptPath = candidates.find((p) => fs.existsSync(p)) ?? candidates[0];
        if (process.platform === "win32") {
            const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
            // "py" is the Windows Python launcher; preferred over bare "python".
            this.pythonPath = fs.existsSync(venvPy) ? venvPy : "py";
        }
        else {
            const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
            if (fs.existsSync(venvPy)) {
                this.pythonPath = venvPy;
            }
        }
    }
    /**
     * Searches OpenML for datasets matching a query.
     * @param {string} query - Free-text search string passed to the engine.
     * @param {number} [limit=20] - Maximum number of results to request.
     * @returns {Promise<any[]>} Engine result list (empty array when absent).
     * @throws {Error} When the engine reports `ok: false`.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "OpenML discover failed");
        }
        return (result.results || []);
    }
    /**
     * Downloads a dataset via the Python engine.
     * @param {string} datasetRef - OpenML dataset identifier.
     * @param {string} [targetDir] - Optional destination directory.
     * @returns {Promise<{local_path: any, target_dir: any}>}
     * @throws {Error} When the engine reports `ok: false`.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir)
            args.push(targetDir);
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "OpenML download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawns the Python engine with the given CLI args and parses its entire
     * stdout as one JSON document.
     * @param {string[]} args - Arguments appended after the script path.
     * @returns {Promise<any>} Parsed JSON emitted by the engine.
     * @throws {Error} On spawn failure, non-zero exit, or unparsable output.
     */
    run(args) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            proc.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUGFIX: without this handler a failed spawn (e.g. interpreter
            // not found) raises an unhandled 'error' event that crashes the
            // process and leaves this promise forever pending.
            proc.on("error", (err) => {
                reject(new Error(`Failed to start OpenML engine (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            proc.on("close", (code) => {
                if (code !== 0) {
                    return reject(new Error(`OpenML engine exited with code ${code}: ${stderr}`));
                }
                try {
                    resolve(JSON.parse(stdout.trim()));
                }
                catch (e) {
                    reject(new Error(`Failed to parse OpenML engine output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
/**
 * Calculates a quality score from 0-100 based on metadata.
 *
 * Component weights: popularity 30, train/test splits 20, description 20,
 * recency 15, license clarity 10, community likes 5. Missing or invalid
 * fields simply contribute 0 (an unparsable `lastUpdated` yields NaN, which
 * fails every comparison).
 * @param {object} data - Dataset metadata (downloads, splits, description,
 *   lastUpdated, licenseCategory, likes).
 * @returns {number} Score clamped to at most 100.
 */
export function calculateQualityScore(data) {
    // 1. Popularity (max 30)
    const popularity =
        data.downloads > 10000 ? 30 :
        data.downloads > 1000 ? 20 :
        data.downloads > 100 ? 10 : 0;
    // 2. Structuredness (max 20)
    const structure = (data.hasTrainSplit ? 10 : 0) + (data.hasTestSplit ? 10 : 0);
    // 3. Documentation (max 20)
    let documentation = 0;
    if (data.hasDescription) {
        documentation =
            data.descriptionLength > 1000 ? 20 :
            data.descriptionLength > 200 ? 10 : 5;
    }
    // 4. Recency (max 15): full credit under 6 months, tapering to 2 years.
    const elapsedDays = Math.floor(
        (new Date().getTime() - new Date(data.lastUpdated).getTime()) / (1000 * 3600 * 24)
    );
    const recency =
        elapsedDays < 180 ? 15 :
        elapsedDays < 365 ? 10 :
        elapsedDays < 730 ? 5 : 0;
    // 5. License Clarity (max 10)
    const license =
        data.licenseCategory === "safe" ? 10 :
        data.licenseCategory === "restricted" ? 5 : 0;
    // 6. Community (max 5)
    const community = data.likes > 50 ? 5 : data.likes > 10 ? 2 : 0;
    return Math.min(100, popularity + structure + documentation + recency + license + community);
}
|