npm - vesper-wizard - Versions diffs - 2.3.1 → 2.3.2 - Mend

vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (214)

package/README.md +37 -322
package/package.json +34 -100
package/vesper-mcp-config.json +6 -0
package/{scripts/wizard.js → wizard.js} +34 -10
package/LICENSE +0 -21
package/build/cache/cdn.js +0 -34
package/build/cache/service.js +0 -63
package/build/cleaning/cleaner.js +0 -81
package/build/cleaning/evaluator.js +0 -89
package/build/cleaning/executor.js +0 -62
package/build/cleaning/exporter.js +0 -87
package/build/cleaning/planner.js +0 -127
package/build/cleaning/rules.js +0 -57
package/build/cleaning/types.js +0 -1
package/build/cloud/adapters/local.js +0 -37
package/build/cloud/adapters/s3.js +0 -24
package/build/cloud/adapters/supabase.js +0 -49
package/build/cloud/storage-manager.js +0 -26
package/build/cloud/types.js +0 -1
package/build/compliance/service.js +0 -73
package/build/compliance/store.js +0 -80
package/build/compliance/types.js +0 -1
package/build/config/config-manager.js +0 -221
package/build/config/secure-keys.js +0 -51
package/build/config/user-config.js +0 -48
package/build/data/processing-worker.js +0 -23
package/build/data/streaming.js +0 -38
package/build/data/worker-pool.js +0 -39
package/build/export/exporter.js +0 -82
package/build/export/packager.js +0 -100
package/build/export/types.js +0 -1
package/build/fusion/aligner.js +0 -56
package/build/fusion/deduplicator.js +0 -69
package/build/fusion/engine.js +0 -69
package/build/fusion/harmonizer.js +0 -39
package/build/fusion/orchestrator.js +0 -86
package/build/fusion/types.js +0 -1
package/build/gateway/unified-dataset-gateway.js +0 -410
package/build/index.js +0 -3068
package/build/ingestion/hf-downloader.js +0 -171
package/build/ingestion/ingestor.js +0 -271
package/build/ingestion/kaggle-downloader.js +0 -102
package/build/install/install-service.js +0 -46
package/build/jobs/manager.js +0 -136
package/build/jobs/queue.js +0 -59
package/build/jobs/types.js +0 -1
package/build/lib/supabase.js +0 -3
package/build/metadata/dataworld-source.js +0 -89
package/build/metadata/domain.js +0 -147
package/build/metadata/github-scraper.js +0 -47
package/build/metadata/institutional-scrapers.js +0 -49
package/build/metadata/kaggle-scraper.js +0 -182
package/build/metadata/kaggle-source.js +0 -70
package/build/metadata/license.js +0 -68
package/build/metadata/monitoring-service.js +0 -107
package/build/metadata/monitoring-store.js +0 -78
package/build/metadata/monitoring-types.js +0 -1
package/build/metadata/openml-source.js +0 -87
package/build/metadata/quality.js +0 -48
package/build/metadata/rate-limiter.js +0 -128
package/build/metadata/scraper.js +0 -448
package/build/metadata/store.js +0 -340
package/build/metadata/types.js +0 -1
package/build/metadata/uci-scraper.js +0 -49
package/build/monitoring/observability.js +0 -76
package/build/preparation/target-detector.js +0 -75
package/build/python/__pycache__/config.cpython-312.pyc +0 -0
package/build/python/asset_downloader_engine.py +0 -94
package/build/python/cleaner.py +0 -226
package/build/python/config.py +0 -263
package/build/python/convert_engine.py +0 -92
package/build/python/dataworld_engine.py +0 -208
package/build/python/export_engine.py +0 -288
package/build/python/framework_adapters.py +0 -100
package/build/python/fusion_engine.py +0 -368
package/build/python/github_adapter.py +0 -106
package/build/python/hf_fallback.py +0 -298
package/build/python/image_engine.py +0 -86
package/build/python/kaggle_engine.py +0 -295
package/build/python/media_engine.py +0 -133
package/build/python/nasa_adapter.py +0 -82
package/build/python/normalize_engine.py +0 -83
package/build/python/openml_engine.py +0 -146
package/build/python/quality_engine.py +0 -267
package/build/python/row_count.py +0 -54
package/build/python/splitter_engine.py +0 -283
package/build/python/target_engine.py +0 -154
package/build/python/test_framework_adapters.py +0 -61
package/build/python/test_fusion_engine.py +0 -89
package/build/python/uci_adapter.py +0 -94
package/build/python/vesper/__init__.py +0 -1
package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
package/build/python/vesper/core/__init__.py +0 -1
package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
package/build/python/vesper/core/asset_downloader.py +0 -679
package/build/python/vesper/core/download_recipe.py +0 -104
package/build/python/worldbank_adapter.py +0 -99
package/build/quality/analyzer.js +0 -93
package/build/quality/image-analyzer.js +0 -114
package/build/quality/media-analyzer.js +0 -115
package/build/quality/quality-orchestrator.js +0 -162
package/build/quality/types.js +0 -1
package/build/scripts/build-index.js +0 -54
package/build/scripts/check-db.js +0 -73
package/build/scripts/check-jobs.js +0 -24
package/build/scripts/check-naruto.js +0 -17
package/build/scripts/cleanup-kaggle.js +0 -41
package/build/scripts/demo-full-pipeline.js +0 -62
package/build/scripts/demo-ui.js +0 -58
package/build/scripts/e2e-demo.js +0 -72
package/build/scripts/massive-scrape.js +0 -103
package/build/scripts/ops-dashboard.js +0 -33
package/build/scripts/repro-bug.js +0 -37
package/build/scripts/repro-export-bug.js +0 -56
package/build/scripts/scrape-metadata.js +0 -100
package/build/scripts/search-cli.js +0 -26
package/build/scripts/test-bias.js +0 -45
package/build/scripts/test-caching.js +0 -51
package/build/scripts/test-cleaning.js +0 -76
package/build/scripts/test-cloud-storage.js +0 -48
package/build/scripts/test-compliance.js +0 -58
package/build/scripts/test-conversion.js +0 -64
package/build/scripts/test-custom-rules.js +0 -58
package/build/scripts/test-db-opt.js +0 -63
package/build/scripts/test-export-custom.js +0 -33
package/build/scripts/test-exporter.js +0 -53
package/build/scripts/test-fusion.js +0 -61
package/build/scripts/test-github.js +0 -27
package/build/scripts/test-group-split.js +0 -52
package/build/scripts/test-hf-download.js +0 -29
package/build/scripts/test-holdout-manager.js +0 -61
package/build/scripts/test-hybrid-search.js +0 -41
package/build/scripts/test-image-analysis.js +0 -50
package/build/scripts/test-ingestion-infra.js +0 -39
package/build/scripts/test-install.js +0 -40
package/build/scripts/test-institutional.js +0 -26
package/build/scripts/test-integrity.js +0 -41
package/build/scripts/test-jit.js +0 -42
package/build/scripts/test-job-queue.js +0 -62
package/build/scripts/test-kaggle-download.js +0 -34
package/build/scripts/test-large-data.js +0 -50
package/build/scripts/test-mcp-v5.js +0 -74
package/build/scripts/test-media-analysis.js +0 -61
package/build/scripts/test-monitoring.js +0 -91
package/build/scripts/test-observability.js +0 -106
package/build/scripts/test-packager.js +0 -55
package/build/scripts/test-pipeline.js +0 -50
package/build/scripts/test-planning.js +0 -64
package/build/scripts/test-privacy.js +0 -38
package/build/scripts/test-production-sync.js +0 -36
package/build/scripts/test-quality.js +0 -43
package/build/scripts/test-robust-ingestion.js +0 -41
package/build/scripts/test-schema.js +0 -45
package/build/scripts/test-split-validation.js +0 -40
package/build/scripts/test-splitter.js +0 -93
package/build/scripts/test-target-detector.js +0 -29
package/build/scripts/test-uci.js +0 -27
package/build/scripts/test-unified-quality.js +0 -86
package/build/scripts/test-write.js +0 -14
package/build/scripts/verify-integration.js +0 -57
package/build/scripts/verify-priority.js +0 -33
package/build/search/embedder.js +0 -34
package/build/search/engine.js +0 -190
package/build/search/jit-orchestrator.js +0 -262
package/build/search/query-intent.js +0 -509
package/build/search/vector-store.js +0 -123
package/build/splitting/splitter.js +0 -82
package/build/splitting/types.js +0 -1
package/build/tools/formatter.js +0 -251
package/build/utils/downloader.js +0 -52
package/build/utils/python-runtime.js +0 -130
package/build/utils/selector.js +0 -69
package/mcp-config-template.json +0 -18
package/scripts/postinstall.cjs +0 -170
package/scripts/preindex_registry.cjs +0 -157
package/scripts/refresh-index.cjs +0 -87
package/scripts/wizard.cjs +0 -601
package/src/python/__pycache__/config.cpython-312.pyc +0 -0
package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
package/src/python/asset_downloader_engine.py +0 -94
package/src/python/cleaner.py +0 -226
package/src/python/config.py +0 -263
package/src/python/convert_engine.py +0 -92
package/src/python/dataworld_engine.py +0 -208
package/src/python/export_engine.py +0 -288
package/src/python/framework_adapters.py +0 -100
package/src/python/fusion_engine.py +0 -368
package/src/python/github_adapter.py +0 -106
package/src/python/hf_fallback.py +0 -298
package/src/python/image_engine.py +0 -86
package/src/python/kaggle_engine.py +0 -295
package/src/python/media_engine.py +0 -133
package/src/python/nasa_adapter.py +0 -82
package/src/python/normalize_engine.py +0 -83
package/src/python/openml_engine.py +0 -146
package/src/python/quality_engine.py +0 -267
package/src/python/requirements.txt +0 -12
package/src/python/row_count.py +0 -54
package/src/python/splitter_engine.py +0 -283
package/src/python/target_engine.py +0 -154
package/src/python/test_framework_adapters.py +0 -61
package/src/python/test_fusion_engine.py +0 -89
package/src/python/uci_adapter.py +0 -94
package/src/python/vesper/__init__.py +0 -1
package/src/python/vesper/core/__init__.py +0 -1
package/src/python/vesper/core/asset_downloader.py +0 -679
package/src/python/vesper/core/download_recipe.py +0 -104
package/src/python/worldbank_adapter.py +0 -99
package/wizard.cjs +0 -3

package/src/python/target_engine.py DELETED Viewed

@@ -1,154 +0,0 @@
-import sys
-import json
-import pandas as pd
-import numpy as np
-# Common names for target variables in datasets
-TARGET_CANDIDATES = [
-    'target', 'label', 'class', 'outcome', 'y',
-    'price', 'saleprice', 'sales', 'cost', 'value', 'total',
-    'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
-]
-def load_data(file_path):
-    if file_path.endswith('.csv'):
-        return pd.read_csv(file_path)
-    elif file_path.endswith('.parquet'):
-        return pd.read_parquet(file_path)
-    else:
-        raise ValueError("Unsupported file format")
-def detect_target(file_path):
-    try:
-        df = load_data(file_path)
-        columns = [c.lower() for c in df.columns]
-        candidates = []
-        # 1. Exact Name Match
-        for col_original in df.columns:
-            col_lower = col_original.lower()
-            confidence = 0.0
-            reasons = []
-            if col_lower in TARGET_CANDIDATES:
-                confidence += 0.6
-                reasons.append(f"Matches common target name '{col_lower}'")
-                # Boost if exact match 'target' or 'label'
-                if col_lower in ['target', 'label', 'class']:
-                    confidence += 0.2
-            # 2. Position Heuristic (Last column is often target)
-            if col_original == df.columns[-1]:
-                confidence += 0.3
-                reasons.append("Is the last column")
-            # 3. Completeness
-            missing_rate = df[col_original].isnull().mean()
-            if missing_rate > 0.5:
-                confidence -= 0.5
-                reasons.append(f"High missing rate ({missing_rate:.1%})")
-            elif missing_rate > 0:
-                confidence -= 0.1
-                reasons.append(f"Has missing values ({missing_rate:.1%})")
-            # 4. Cardinality / Unique Values
-            # If regression-like (many unique numeric values) or class-like (few unique values)
-            # This is hard to score generally, but extremes are bad for targets (e.g. all unique = ID usually)
-            n_unique = df[col_original].nunique()
-            if n_unique == len(df):
-                confidence -= 0.8
-                reasons.append("All values are unique (likely ID)")
-            if confidence > 0.3:
-                candidates.append({
-                    "column": col_original,
-                    "confidence": min(confidence, 1.0),
-                    "reason": reasons
-                })
-        # Sort by confidence
-        candidates.sort(key=lambda x: x['confidence'], reverse=True)
-        best_target = None
-        best_conf = 0.0
-        if candidates:
-            best_target = candidates[0]['column']
-            best_conf = candidates[0]['confidence']
-        return {
-            "target_column": best_target,
-            "confidence": best_conf,
-            "candidates": candidates,
-            "is_unified": False # Wrapper will handle unification logic
-        }
-    except Exception as e:
-        return {"error": str(e)}
-def validate_target(file_path, target_column):
-    try:
-        df = load_data(file_path)
-        if target_column not in df.columns:
-            return {"error": f"Column '{target_column}' not found in dataset."}
-        series = df[target_column]
-        total_rows = len(df)
-        missing_count = series.isnull().sum()
-        # Determine type
-        is_numeric = pd.api.types.is_numeric_dtype(series)
-        n_unique = series.nunique()
-        problem_type = "unknown"
-        if is_numeric and n_unique > 20:
-             problem_type = "regression"
-        elif n_unique < 50: # String or few numeric values
-             problem_type = "classification"
-        else:
-             # Heuristic fallback
-             problem_type = "regression" if is_numeric else "classification"
-        warnings = []
-        if missing_count > 0:
-            warnings.append(f"Target has {missing_count} missing values.")
-        # Imbalance check for classification
-        if problem_type == "classification":
-            counts = series.value_counts(normalize=True)
-            if counts.iloc[0] > 0.9: # Dominant class > 90%
-                warnings.append(f"Highly imbalanced target: Class '{counts.index[0]}' is {counts.iloc[0]:.1%}")
-        return {
-            "valid": True,
-            "problem_type": problem_type,
-            "missing_count": int(missing_count),
-            "total_rows": total_rows,
-            "warnings": warnings
-        }
-    except Exception as e:
-        return {"error": str(e)}
-if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}));
-        sys.exit(1)
-    action = sys.argv[1]
-    file_path = sys.argv[2]
-    result = {}
-    if action == "detect":
-        result = detect_target(file_path)
-    elif action == "validate":
-        target_col = sys.argv[3] if len(sys.argv) > 3 else None
-        if target_col:
-            result = validate_target(file_path, target_col)
-        else:
-            result = {"error": "Target column required for validation"}
-    else:
-        result = {"error": f"Unknown action: {action}"}
-    print(json.dumps(result))

package/src/python/test_framework_adapters.py DELETED Viewed

@@ -1,61 +0,0 @@
-import sys
-import os
-import polars as pl
-import numpy as np
-# Mock data creation
-def create_mock_data():
-    df = pl.DataFrame({
-        "feature1": np.random.rand(100),
-        "feature2": np.random.rand(100),
-        "label": np.random.randint(0, 2, 100)
-    })
-    os.makedirs("test_adapters", exist_ok=True)
-    df.write_parquet("test_adapters/data.parquet")
-    df.write_csv("test_adapters/data.csv")
-    print("Created mock data in test_adapters/")
-def test_pytorch():
-    print("\n--- Testing PyTorch Adapter ---")
-    try:
-        from framework_adapters import VesperPyTorchDataset
-        import torch
-        from torch.utils.data import DataLoader
-        dataset = VesperPyTorchDataset("test_adapters/data.parquet", target_col="label")
-        loader = DataLoader(dataset, batch_size=10, shuffle=True)
-        batch = next(iter(loader))
-        print(f"Loaded batch: {batch}")
-        print("PASS: PyTorch DataLoader works")
-    except ImportError:
-        print("SKIP: PyTorch not installed")
-    except Exception as e:
-        print(f"FAIL: PyTorch test failed: {e}")
-def test_huggingface():
-    print("\n--- Testing HuggingFace Adapter ---")
-    try:
-        from framework_adapters import load_vesper_dataset
-        ds = load_vesper_dataset("test_adapters/data.csv")
-        print(f"Loaded dataset: {ds}")
-        print("PASS: HuggingFace Dataset works")
-    except ImportError:
-        print("SKIP: HuggingFace datasets not installed")
-    except Exception as e:
-        print(f"FAIL: HuggingFace test failed: {e}")
-if __name__ == "__main__":
-    create_mock_data()
-    # Add src/python to path to import adapters
-    sys.path.append(os.path.join(os.getcwd(), "src", "python"))
-    test_pytorch()
-    test_huggingface()
-    # Cleanup
-    import shutil
-    shutil.rmtree("test_adapters")

package/src/python/test_fusion_engine.py DELETED Viewed

@@ -1,89 +0,0 @@
-import os
-import tempfile
-import polars as pl
-from fusion_engine import fuse_datasets
-def run_basic_tests():
-    tmp = tempfile.gettempdir()
-    # ----- Test 1: concat -----
-    p1 = os.path.join(tmp, "fuse_test_a.csv")
-    p2 = os.path.join(tmp, "fuse_test_b.csv")
-    out_concat = os.path.join(tmp, "fuse_test_concat.feather")
-    df1 = pl.DataFrame({
-        "id": [1, 2, 3],
-        "text": ["a", "b", "c"],
-        "price": [10.0, 20.0, 30.0],
-    })
-    df2 = pl.DataFrame({
-        "id": [4, 5, 3],
-        "text": ["d", "e", "c"],
-        "price": [40.0, 50.0, 30.0],
-        "image_path": ["img1.jpg", "img2.jpg", "img3.jpg"],
-    })
-    df1.write_csv(p1)
-    df2.write_csv(p2)
-    concat_res = fuse_datasets(
-        sources=[p1, p2],
-        strategy="concat",
-        dedup=True,
-        run_quality_after=False,
-        leakage_check=True,
-        output_path=out_concat,
-        output_format="feather",
-        compression="lz4",
-        preview=True,
-        id_column="id",
-    )
-    assert concat_res.get("success") is True, f"Concat failed: {concat_res}"
-    assert os.path.exists(out_concat), "Concat output file missing"
-    # ----- Test 2: join with conflicting column names -----
-    p3 = os.path.join(tmp, "fuse_test_c.csv")
-    p4 = os.path.join(tmp, "fuse_test_d.csv")
-    out_join = os.path.join(tmp, "fuse_test_join.parquet")
-    left = pl.DataFrame({
-        "id": [1, 2, 3],
-        "price": [100, 200, 300],
-        "text": ["x", "y", "z"],
-    })
-    right = pl.DataFrame({
-        "id": [2, 3, 4],
-        "price": [999, 888, 777],
-        "caption": ["two", "three", "four"],
-    })
-    left.write_csv(p3)
-    right.write_csv(p4)
-    join_res = fuse_datasets(
-        sources=[p3, p4],
-        strategy="join",
-        join_on="id",
-        how="inner",
-        dedup=True,
-        run_quality_after=False,
-        leakage_check=False,
-        output_path=out_join,
-        output_format="parquet",
-        compression="snappy",
-        preview=True,
-    )
-    assert join_res.get("success") is True, f"Join failed: {join_res}"
-    assert os.path.exists(out_join), "Join output file missing"
-    assert len(join_res.get("stats", {}).get("conflict_renames", [])) >= 1, "Expected conflict rename for price column"
-    print("✅ Fusion tests passed")
-    print("Concat:", concat_res["stats"])
-    print("Join:", join_res["stats"])
-if __name__ == "__main__":
-    run_basic_tests()

package/src/python/uci_adapter.py DELETED Viewed

@@ -1,94 +0,0 @@
-import sys
-import json
-import argparse
-import urllib.request
-import urllib.parse
-from datetime import datetime
-# API Endpoint found in network inspection of UCI website
-UCI_API_URL = "https://archive.ics.uci.edu/api/datasets/list"
-def search_uci(query: str, limit: int = 10):
-    """
-    Search UCI datasets using their internal API.
-    """
-    try:
-        # Fetch data dictionary from API
-        # Only fetching first 100 to filter locally
-        params = {
-            "skip": 0,
-            "take": 100,
-            "sort": "desc",
-            "orderBy": "NumHits",
-            "search": query
-        }
-        query_string = urllib.parse.urlencode(params)
-        url = f"{UCI_API_URL}?{query_string}"
-        req = urllib.request.Request(url)
-        with urllib.request.urlopen(req) as response:
-            data = json.load(response)
-        datasets = data.get('data', [])
-        if not datasets:
-            datasets = []
-        results = []
-        count = 0
-        # We trust the API search mostly, but can do extra filtering if needed
-        # The API "search" param is supported
-        for ds in datasets:
-            # Normalize to Vesper schema
-            # API fields: id, name, abstract, numHits, area, task, dateDonated
-            metadata = {
-                "id": f"uci:{ds.get('id')}",
-                "source": "uci",
-                "name": ds.get('name'),
-                "description": ds.get('abstract') or "No description available.",
-                "downloads": ds.get('numHits') or 0,
-                "likes": 0,
-                "last_updated": ds.get('dateDonated') or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
-                "quality_score": 80,
-                "license": {
-                    "id": "other",
-                    "category": "open",
-                    "usage_restrictions": [],
-                    "warnings": []
-                },
-                "tags": [t for t in [ds.get('area'), ds.get('task')] if t],
-                "total_examples": ds.get('numInstances'),
-                "is_safe_source": True,
-                "is_structured": True,
-                "metadata_url": f"https://archive.ics.uci.edu/dataset/{ds.get('id')}/{ds.get('name').replace(' ', '+')}"
-            }
-            results.append(metadata)
-            count += 1
-            if count >= limit:
-                break
-        return results
-    except Exception as e:
-        # Fallback empty or specific error
-        return {"error": str(e)}
-def main():
-    parser = argparse.ArgumentParser(description="UCI Adapter")
-    parser.add_argument("--action", required=True, choices=["search"])
-    parser.add_argument("--query", required=True)
-    parser.add_argument("--limit", type=int, default=10)
-    args = parser.parse_args()
-    if args.action == "search":
-        results = search_uci(args.query, args.limit)
-        # JSON dump print for stdout capture
-        print(json.dumps(results))
-if __name__ == "__main__":
-    main()

package/src/python/vesper/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Vesper Python runtime package."""

package/src/python/vesper/core/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Core data engines for Vesper."""