vesper-wizard 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
import argparse
|
|
2
|
-
import asyncio
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
import sys
|
|
6
|
-
import warnings
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import Any, Dict
|
|
9
|
-
|
|
10
|
-
# Suppress noisy HF warnings
|
|
11
|
-
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
12
|
-
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
13
|
-
|
|
14
|
-
CURRENT_DIR = Path(__file__).resolve().parent
|
|
15
|
-
if str(CURRENT_DIR) not in sys.path:
|
|
16
|
-
sys.path.insert(0, str(CURRENT_DIR))
|
|
17
|
-
|
|
18
|
-
from vesper.core.asset_downloader import AssetDownloader
|
|
19
|
-
from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _print(payload: Dict[str, Any]) -> None:
|
|
23
|
-
print(json.dumps(payload, ensure_ascii=False))
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
27
|
-
payload = json.loads(args.payload)
|
|
28
|
-
output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
|
|
29
|
-
output_dir = payload.get("output_dir")
|
|
30
|
-
workers = int(payload.get("workers") or 8)
|
|
31
|
-
recipes_dir = payload.get("recipes_dir")
|
|
32
|
-
|
|
33
|
-
# Auto-set HF token from payload if provided
|
|
34
|
-
token = payload.get("token") or payload.get("hf_token")
|
|
35
|
-
if token:
|
|
36
|
-
os.environ["HF_TOKEN"] = str(token)
|
|
37
|
-
|
|
38
|
-
downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)
|
|
39
|
-
|
|
40
|
-
result = await downloader.download_assets(
|
|
41
|
-
dataset_id=str(payload.get("dataset_id")),
|
|
42
|
-
source=payload.get("source"),
|
|
43
|
-
repo_id=payload.get("repo_id"),
|
|
44
|
-
kaggle_ref=payload.get("kaggle_ref"),
|
|
45
|
-
urls=payload.get("urls"),
|
|
46
|
-
output_format=payload.get("output_format", "webdataset"),
|
|
47
|
-
output_dir=str(output_dir) if output_dir else None,
|
|
48
|
-
max_items=payload.get("max_items"),
|
|
49
|
-
image_column=payload.get("image_column"),
|
|
50
|
-
)
|
|
51
|
-
return {"ok": True, "result": result}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def main() -> None:
|
|
55
|
-
parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
|
|
56
|
-
parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
|
|
57
|
-
parser.add_argument("payload", help="JSON payload")
|
|
58
|
-
args = parser.parse_args()
|
|
59
|
-
|
|
60
|
-
try:
|
|
61
|
-
if args.action == "download":
|
|
62
|
-
response = asyncio.run(_run_download(args))
|
|
63
|
-
_print(response)
|
|
64
|
-
return
|
|
65
|
-
|
|
66
|
-
payload = json.loads(args.payload)
|
|
67
|
-
if args.action == "build_recipe":
|
|
68
|
-
recipe = build_download_recipe(payload)
|
|
69
|
-
saved = save_recipe(recipe, payload.get("recipes_dir"))
|
|
70
|
-
_print({"ok": True, "recipe": recipe, "saved_to": saved})
|
|
71
|
-
return
|
|
72
|
-
|
|
73
|
-
if args.action == "get_recipe":
|
|
74
|
-
dataset_id = str(payload.get("dataset_id"))
|
|
75
|
-
recipe = get_download_recipe(dataset_id, payload.get("recipes_dir"))
|
|
76
|
-
_print({"ok": True, "recipe": recipe})
|
|
77
|
-
return
|
|
78
|
-
|
|
79
|
-
_print({"ok": False, "error": f"Unknown action: {args.action}"})
|
|
80
|
-
except Exception as e:
|
|
81
|
-
error_msg = str(e)
|
|
82
|
-
# Provide actionable error messages
|
|
83
|
-
if "401" in error_msg or "403" in error_msg or "Unauthorized" in error_msg:
|
|
84
|
-
error_msg = (
|
|
85
|
-
"Authentication required. This dataset may be gated/private. "
|
|
86
|
-
"Use configure_keys tool to set HF_TOKEN, then retry."
|
|
87
|
-
)
|
|
88
|
-
elif "No image column" in error_msg:
|
|
89
|
-
error_msg += " Hint: specify image_column parameter with the name of the column containing images."
|
|
90
|
-
_print({"ok": False, "error": error_msg})
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
if __name__ == "__main__":
|
|
94
|
-
main()
|
package/src/python/cleaner.py
DELETED
|
@@ -1,226 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import polars as pl
|
|
4
|
-
import numpy as np
|
|
5
|
-
|
|
6
|
-
# --- Operations Library ---
|
|
7
|
-
|
|
8
|
-
def op_remove_duplicates(df, params):
|
|
9
|
-
subset = params.get("subset", None) # List of cols or None
|
|
10
|
-
before = len(df)
|
|
11
|
-
if subset:
|
|
12
|
-
df = df.unique(subset=subset)
|
|
13
|
-
else:
|
|
14
|
-
df = df.unique()
|
|
15
|
-
return df, {"rows_removed": before - len(df)}
|
|
16
|
-
|
|
17
|
-
def op_drop_columns(df, params):
|
|
18
|
-
cols = params.get("columns", [])
|
|
19
|
-
before = len(df.columns)
|
|
20
|
-
# Filter only existing cols to avoid errors
|
|
21
|
-
cols_to_drop = [c for c in cols if c in df.columns]
|
|
22
|
-
df = df.drop(cols_to_drop)
|
|
23
|
-
return df, {"columns_dropped": len(cols_to_drop)}
|
|
24
|
-
|
|
25
|
-
def op_fill_missing(df, params):
|
|
26
|
-
col = params["column"]
|
|
27
|
-
method = params.get("method", "mean") # mean, median, mode, constant
|
|
28
|
-
value = params.get("value", None)
|
|
29
|
-
|
|
30
|
-
if col not in df.columns:
|
|
31
|
-
return df, {"error": f"Column {col} not found"}
|
|
32
|
-
|
|
33
|
-
affected = df[col].null_count()
|
|
34
|
-
|
|
35
|
-
if method == "constant":
|
|
36
|
-
df = df.with_columns(pl.col(col).fill_null(value))
|
|
37
|
-
elif method == "mean":
|
|
38
|
-
mean_val = df[col].mean()
|
|
39
|
-
df = df.with_columns(pl.col(col).fill_null(mean_val))
|
|
40
|
-
elif method == "median":
|
|
41
|
-
median_val = df[col].median()
|
|
42
|
-
df = df.with_columns(pl.col(col).fill_null(median_val))
|
|
43
|
-
|
|
44
|
-
return df, {"rows_imputed": affected}
|
|
45
|
-
|
|
46
|
-
def op_fix_types(df, params):
|
|
47
|
-
col = params["column"]
|
|
48
|
-
target_type = params["type"] # "int", "float", "string", "date"
|
|
49
|
-
|
|
50
|
-
if col not in df.columns:
|
|
51
|
-
return df, {"error": f"Column {col} not found"}
|
|
52
|
-
|
|
53
|
-
try:
|
|
54
|
-
if target_type == "int":
|
|
55
|
-
df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
|
|
56
|
-
elif target_type == "float":
|
|
57
|
-
df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
|
|
58
|
-
elif target_type == "string":
|
|
59
|
-
df = df.with_columns(pl.col(col).cast(pl.Utf8))
|
|
60
|
-
elif target_type == "date":
|
|
61
|
-
df = df.with_columns(pl.col(col).str.to_date(strict=False))
|
|
62
|
-
|
|
63
|
-
return df, {"status": "Converted"}
|
|
64
|
-
except Exception as e:
|
|
65
|
-
return df, {"error": str(e)}
|
|
66
|
-
|
|
67
|
-
def op_remove_outliers(df, params):
|
|
68
|
-
col = params["column"]
|
|
69
|
-
method = params.get("method", "iqr")
|
|
70
|
-
threshold = params.get("threshold", 1.5)
|
|
71
|
-
|
|
72
|
-
if col not in df.columns:
|
|
73
|
-
return df, {"error": f"Column {col} not found"}
|
|
74
|
-
|
|
75
|
-
before = len(df)
|
|
76
|
-
|
|
77
|
-
if method == "iqr":
|
|
78
|
-
q1 = df[col].quantile(0.25)
|
|
79
|
-
q3 = df[col].quantile(0.75)
|
|
80
|
-
iqr = q3 - q1
|
|
81
|
-
lower = q1 - (threshold * iqr)
|
|
82
|
-
upper = q3 + (threshold * iqr)
|
|
83
|
-
|
|
84
|
-
df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
|
|
85
|
-
|
|
86
|
-
return df, {"rows_removed": before - len(df)}
|
|
87
|
-
|
|
88
|
-
def op_encode_categories(df, params):
|
|
89
|
-
col = params["column"]
|
|
90
|
-
method = params.get("method", "label") # label, onehot
|
|
91
|
-
|
|
92
|
-
if col not in df.columns:
|
|
93
|
-
return df, {"error": f"Column {col} not found"}
|
|
94
|
-
|
|
95
|
-
if method == "label":
|
|
96
|
-
# Polars dense_rank acts similar to label encoding
|
|
97
|
-
df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
|
|
98
|
-
elif method == "onehot":
|
|
99
|
-
dummies = df[col].to_dummies()
|
|
100
|
-
df = pl.concat([df, dummies], how="horizontal")
|
|
101
|
-
|
|
102
|
-
return df, {"status": f"Encoded using {method}"}
|
|
103
|
-
|
|
104
|
-
# --- Registry ---
|
|
105
|
-
|
|
106
|
-
OPERATIONS = {
|
|
107
|
-
"RemoveDuplicates": op_remove_duplicates,
|
|
108
|
-
"DropColumns": op_drop_columns,
|
|
109
|
-
"FillMissing": op_fill_missing,
|
|
110
|
-
"FixTypes": op_fix_types,
|
|
111
|
-
"RemoveOutliers": op_remove_outliers,
|
|
112
|
-
"EncodeCategories": op_encode_categories
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
def main():
|
|
116
|
-
if len(sys.argv) < 3:
|
|
117
|
-
print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
|
|
118
|
-
sys.exit(1)
|
|
119
|
-
|
|
120
|
-
file_path = sys.argv[1]
|
|
121
|
-
ops_json = sys.argv[2]
|
|
122
|
-
|
|
123
|
-
try:
|
|
124
|
-
operations = json.loads(ops_json)
|
|
125
|
-
|
|
126
|
-
# Load Data
|
|
127
|
-
file_path_lower = file_path.lower()
|
|
128
|
-
if file_path_lower.endswith(".csv"):
|
|
129
|
-
df = pl.read_csv(file_path, ignore_errors=True)
|
|
130
|
-
elif file_path_lower.endswith(".parquet"):
|
|
131
|
-
df = pl.read_parquet(file_path)
|
|
132
|
-
elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
|
|
133
|
-
# Explicit NDJSON
|
|
134
|
-
df = pl.read_ndjson(file_path)
|
|
135
|
-
elif file_path_lower.endswith(".json"):
|
|
136
|
-
# Ambiguous .json
|
|
137
|
-
try:
|
|
138
|
-
df = pl.read_json(file_path)
|
|
139
|
-
except Exception:
|
|
140
|
-
try:
|
|
141
|
-
df = pl.read_ndjson(file_path)
|
|
142
|
-
except Exception as e:
|
|
143
|
-
raise ValueError(f"Failed to read JSON: {str(e)}")
|
|
144
|
-
else:
|
|
145
|
-
raise ValueError(f"Unsupported format: {file_path}")
|
|
146
|
-
|
|
147
|
-
logs = []
|
|
148
|
-
total_rows_affected = 0
|
|
149
|
-
|
|
150
|
-
# Execute Pipeline
|
|
151
|
-
for op in operations:
|
|
152
|
-
op_type = op["type"]
|
|
153
|
-
params = op.get("params", {})
|
|
154
|
-
|
|
155
|
-
if op_type == "RenameTarget":
|
|
156
|
-
old_name = params.get("old_name")
|
|
157
|
-
new_name = params.get("new_name", "target")
|
|
158
|
-
if old_name and old_name in df.columns:
|
|
159
|
-
df = df.rename({old_name: new_name})
|
|
160
|
-
logs.append(f"Renamed column '{old_name}' to '{new_name}'")
|
|
161
|
-
else:
|
|
162
|
-
logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
|
|
163
|
-
elif op_type in OPERATIONS:
|
|
164
|
-
try:
|
|
165
|
-
df, stats = OPERATIONS[op_type](df, params)
|
|
166
|
-
logs.append(f"Executed {op_type}: {stats}")
|
|
167
|
-
total_rows_affected += stats.get("rows_removed", 0)
|
|
168
|
-
except Exception as e:
|
|
169
|
-
logs.append(f"Failed {op_type}: {str(e)}")
|
|
170
|
-
else:
|
|
171
|
-
logs.append(f"Unknown operation: {op_type}")
|
|
172
|
-
|
|
173
|
-
# Save Result (overwrite or new file)
|
|
174
|
-
# Save Result (overwrite or new file)
|
|
175
|
-
output_format = sys.argv[3] if len(sys.argv) > 3 else None
|
|
176
|
-
|
|
177
|
-
if not output_format:
|
|
178
|
-
# Legacy logic: preserve CSV or default to parquet
|
|
179
|
-
if file_path_lower.endswith(".csv"):
|
|
180
|
-
output_format = "csv"
|
|
181
|
-
else:
|
|
182
|
-
output_format = "parquet"
|
|
183
|
-
|
|
184
|
-
base_name = file_path.rsplit(".", 1)[0]
|
|
185
|
-
if output_format == "csv":
|
|
186
|
-
output_path = f"{base_name}_cleaned.csv"
|
|
187
|
-
# Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
|
|
188
|
-
for col in df.columns:
|
|
189
|
-
dtype = df.schema[col]
|
|
190
|
-
# Only keep simple types; stringify everything else for CSV
|
|
191
|
-
is_simple = (
|
|
192
|
-
dtype.is_numeric() or
|
|
193
|
-
dtype.is_temporal() or
|
|
194
|
-
str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
|
|
195
|
-
)
|
|
196
|
-
if not is_simple:
|
|
197
|
-
# Use a robust helper for clean JSON serialization
|
|
198
|
-
def safe_serialize(val):
|
|
199
|
-
try:
|
|
200
|
-
# Handle Polars nested types (convert to Python list/dict first)
|
|
201
|
-
if hasattr(val, "to_list"):
|
|
202
|
-
return json.dumps(val.to_list())
|
|
203
|
-
if hasattr(val, "to_dict"):
|
|
204
|
-
return json.dumps(val.to_dict())
|
|
205
|
-
return json.dumps(val)
|
|
206
|
-
except:
|
|
207
|
-
return str(val)
|
|
208
|
-
df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
|
|
209
|
-
df.write_csv(output_path)
|
|
210
|
-
else:
|
|
211
|
-
output_path = f"{base_name}_cleaned.parquet"
|
|
212
|
-
df.write_parquet(output_path)
|
|
213
|
-
|
|
214
|
-
print(json.dumps({
|
|
215
|
-
"success": True,
|
|
216
|
-
"output_path": output_path,
|
|
217
|
-
"rows_affected": total_rows_affected,
|
|
218
|
-
"logs": logs
|
|
219
|
-
}, default=str))
|
|
220
|
-
|
|
221
|
-
except Exception as e:
|
|
222
|
-
print(json.dumps({"success": False, "error": str(e)}, default=str))
|
|
223
|
-
sys.exit(1)
|
|
224
|
-
|
|
225
|
-
if __name__ == "__main__":
|
|
226
|
-
main()
|
package/src/python/config.py
DELETED
|
@@ -1,263 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import sys
|
|
3
|
-
import json
|
|
4
|
-
import base64
|
|
5
|
-
import hashlib
|
|
6
|
-
import secrets
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import Dict, Optional
|
|
9
|
-
|
|
10
|
-
SERVICE_NAME = "vesper"
|
|
11
|
-
|
|
12
|
-
KEY_ALIASES = {
|
|
13
|
-
"hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
|
|
14
|
-
"kaggle_username": ["KAGGLE_USERNAME"],
|
|
15
|
-
"kaggle_key": ["KAGGLE_KEY"],
|
|
16
|
-
"dataworld_token": ["DW_AUTH_TOKEN"],
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
try:
|
|
20
|
-
import keyring # type: ignore
|
|
21
|
-
HAS_KEYRING = True
|
|
22
|
-
except Exception:
|
|
23
|
-
HAS_KEYRING = False
|
|
24
|
-
|
|
25
|
-
try:
|
|
26
|
-
from cryptography.fernet import Fernet, InvalidToken # type: ignore
|
|
27
|
-
HAS_FERNET = True
|
|
28
|
-
except Exception:
|
|
29
|
-
HAS_FERNET = False
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def _config_path() -> Path:
|
|
33
|
-
return Path.home() / ".vesper" / "config.toml"
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def _secret_path() -> Path:
|
|
37
|
-
return Path.home() / ".vesper" / ".config_key"
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def _ensure_parent(path: Path) -> None:
|
|
41
|
-
path.parent.mkdir(parents=True, exist_ok=True)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _read_fallback_toml() -> Dict[str, str]:
|
|
45
|
-
path = _config_path()
|
|
46
|
-
if not path.exists():
|
|
47
|
-
return {}
|
|
48
|
-
|
|
49
|
-
values: Dict[str, str] = {}
|
|
50
|
-
in_keys = False
|
|
51
|
-
method = ""
|
|
52
|
-
|
|
53
|
-
for raw in path.read_text(encoding="utf-8").splitlines():
|
|
54
|
-
line = raw.strip()
|
|
55
|
-
if not line or line.startswith("#"):
|
|
56
|
-
continue
|
|
57
|
-
if line.startswith("[") and line.endswith("]"):
|
|
58
|
-
in_keys = (line == "[keys]")
|
|
59
|
-
continue
|
|
60
|
-
if line.startswith("method") and "=" in line:
|
|
61
|
-
method = line.split("=", 1)[1].strip().strip('"').strip("'")
|
|
62
|
-
continue
|
|
63
|
-
if not in_keys or "=" not in line:
|
|
64
|
-
continue
|
|
65
|
-
|
|
66
|
-
key, val = line.split("=", 1)
|
|
67
|
-
key = key.strip()
|
|
68
|
-
val = val.strip().strip('"').strip("'")
|
|
69
|
-
values[key] = val
|
|
70
|
-
|
|
71
|
-
if method:
|
|
72
|
-
values["__method__"] = method
|
|
73
|
-
|
|
74
|
-
return values
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def _get_or_create_local_secret() -> str:
|
|
78
|
-
secret_file = _secret_path()
|
|
79
|
-
_ensure_parent(secret_file)
|
|
80
|
-
|
|
81
|
-
if secret_file.exists():
|
|
82
|
-
return secret_file.read_text(encoding="utf-8").strip()
|
|
83
|
-
|
|
84
|
-
secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
|
|
85
|
-
secret_file.write_text(secret, encoding="utf-8")
|
|
86
|
-
try:
|
|
87
|
-
os.chmod(secret_file, 0o600)
|
|
88
|
-
except Exception:
|
|
89
|
-
pass
|
|
90
|
-
return secret
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _xor_encrypt(plain: str, secret: str) -> str:
|
|
94
|
-
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
95
|
-
data = plain.encode("utf-8")
|
|
96
|
-
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
97
|
-
return base64.urlsafe_b64encode(out).decode("utf-8")
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def _xor_decrypt(cipher_text: str, secret: str) -> str:
|
|
101
|
-
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
102
|
-
data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
|
|
103
|
-
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
104
|
-
return out.decode("utf-8")
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
|
|
108
|
-
if HAS_FERNET:
|
|
109
|
-
token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8")).decode("utf-8")
|
|
110
|
-
return {"method": "fernet", "value": token}
|
|
111
|
-
# fallback encryption (weaker than fernet, but still not plaintext)
|
|
112
|
-
return {"method": "xor", "value": _xor_encrypt(value, secret)}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
|
|
116
|
-
try:
|
|
117
|
-
if method == "fernet" and HAS_FERNET:
|
|
118
|
-
return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
|
|
119
|
-
if method == "xor":
|
|
120
|
-
return _xor_decrypt(value, secret)
|
|
121
|
-
return None
|
|
122
|
-
except InvalidToken:
|
|
123
|
-
return None
|
|
124
|
-
except Exception:
|
|
125
|
-
return None
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _write_fallback_toml(values: Dict[str, str]) -> None:
|
|
129
|
-
path = _config_path()
|
|
130
|
-
_ensure_parent(path)
|
|
131
|
-
|
|
132
|
-
method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
|
|
133
|
-
lines = [
|
|
134
|
-
"# Vesper optional API keys fallback storage",
|
|
135
|
-
"# Encrypted fallback (keyring is preferred)",
|
|
136
|
-
"[meta]",
|
|
137
|
-
f'method = "{method}"',
|
|
138
|
-
"[keys]",
|
|
139
|
-
]
|
|
140
|
-
for key in sorted(values.keys()):
|
|
141
|
-
if key.startswith("__"):
|
|
142
|
-
continue
|
|
143
|
-
val = str(values[key]).replace('"', '\\"')
|
|
144
|
-
lines.append(f'{key} = "{val}"')
|
|
145
|
-
|
|
146
|
-
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def _get_from_env(name: str) -> Optional[str]:
|
|
150
|
-
for env_key in KEY_ALIASES.get(name, []):
|
|
151
|
-
val = os.getenv(env_key)
|
|
152
|
-
if val:
|
|
153
|
-
return val
|
|
154
|
-
return None
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
def get_key(name: str) -> Optional[str]:
|
|
158
|
-
# 1) keyring (secure)
|
|
159
|
-
if HAS_KEYRING:
|
|
160
|
-
try:
|
|
161
|
-
val = keyring.get_password(SERVICE_NAME, name)
|
|
162
|
-
if val:
|
|
163
|
-
return val
|
|
164
|
-
except Exception:
|
|
165
|
-
pass
|
|
166
|
-
|
|
167
|
-
# 2) encrypted fallback config.toml
|
|
168
|
-
fallback = _read_fallback_toml()
|
|
169
|
-
enc = fallback.get(name)
|
|
170
|
-
if enc:
|
|
171
|
-
secret = _get_or_create_local_secret()
|
|
172
|
-
method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
|
|
173
|
-
dec = _decrypt_value(enc, method, secret)
|
|
174
|
-
if dec:
|
|
175
|
-
return dec
|
|
176
|
-
|
|
177
|
-
# 3) env vars (fallback only)
|
|
178
|
-
env_val = _get_from_env(name)
|
|
179
|
-
if env_val:
|
|
180
|
-
return env_val
|
|
181
|
-
return None
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def set_key(name: str, value: str) -> Dict[str, str]:
|
|
185
|
-
if not value:
|
|
186
|
-
return {"ok": "false", "method": "none", "error": "Empty value"}
|
|
187
|
-
|
|
188
|
-
if HAS_KEYRING:
|
|
189
|
-
try:
|
|
190
|
-
keyring.set_password(SERVICE_NAME, name, value)
|
|
191
|
-
return {"ok": "true", "method": "keyring"}
|
|
192
|
-
except Exception:
|
|
193
|
-
pass
|
|
194
|
-
|
|
195
|
-
fallback = _read_fallback_toml()
|
|
196
|
-
secret = _get_or_create_local_secret()
|
|
197
|
-
enc = _encrypt_value(value, secret)
|
|
198
|
-
fallback["__method__"] = enc["method"]
|
|
199
|
-
fallback[name] = enc["value"]
|
|
200
|
-
_write_fallback_toml(fallback)
|
|
201
|
-
return {"ok": "true", "method": f'toml:{enc["method"]}'}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def has_key(name: str) -> bool:
|
|
205
|
-
return bool(get_key(name))
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def get_all() -> Dict[str, Optional[str]]:
|
|
209
|
-
return {
|
|
210
|
-
"hf_token": get_key("hf_token"),
|
|
211
|
-
"kaggle_username": get_key("kaggle_username"),
|
|
212
|
-
"kaggle_key": get_key("kaggle_key"),
|
|
213
|
-
"dataworld_token": get_key("dataworld_token"),
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
def _print_json(data):
|
|
218
|
-
print(json.dumps(data))
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def main() -> None:
|
|
222
|
-
if len(sys.argv) < 2:
|
|
223
|
-
_print_json({
|
|
224
|
-
"ok": False,
|
|
225
|
-
"error": "Usage: config.py <get|set|has|all> [name] [value]",
|
|
226
|
-
})
|
|
227
|
-
sys.exit(1)
|
|
228
|
-
|
|
229
|
-
cmd = sys.argv[1].lower()
|
|
230
|
-
|
|
231
|
-
if cmd == "all":
|
|
232
|
-
_print_json({"ok": True, "data": get_all()})
|
|
233
|
-
return
|
|
234
|
-
|
|
235
|
-
if len(sys.argv) < 3:
|
|
236
|
-
_print_json({"ok": False, "error": "Missing key name"})
|
|
237
|
-
sys.exit(1)
|
|
238
|
-
|
|
239
|
-
name = sys.argv[2]
|
|
240
|
-
|
|
241
|
-
if cmd == "get":
|
|
242
|
-
_print_json({"ok": True, "name": name, "value": get_key(name)})
|
|
243
|
-
return
|
|
244
|
-
|
|
245
|
-
if cmd == "has":
|
|
246
|
-
_print_json({"ok": True, "name": name, "value": has_key(name)})
|
|
247
|
-
return
|
|
248
|
-
|
|
249
|
-
if cmd == "set":
|
|
250
|
-
if len(sys.argv) < 4:
|
|
251
|
-
_print_json({"ok": False, "error": "Missing value for set"})
|
|
252
|
-
sys.exit(1)
|
|
253
|
-
value = sys.argv[3]
|
|
254
|
-
result = set_key(name, value)
|
|
255
|
-
_print_json({"ok": result.get("ok") == "true", "name": name, "method": result.get("method"), "error": result.get("error")})
|
|
256
|
-
return
|
|
257
|
-
|
|
258
|
-
_print_json({"ok": False, "error": f"Unknown command: {cmd}"})
|
|
259
|
-
sys.exit(1)
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
if __name__ == "__main__":
|
|
263
|
-
main()
|
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
-
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
-
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
|
|
5
|
-
"""
|
|
6
|
-
import sys
import json
import os

# polars is the only hard dependency; fail fast with a machine-readable
# JSON error on stdout so the calling process can surface the install hint.
try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
    sys.exit(1)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _load(src: str) -> pl.DataFrame:
    """Read *src* into a polars DataFrame, choosing the reader by extension.

    Supports CSV/TSV, Parquet, Arrow/Feather IPC, JSON and NDJSON; an
    unrecognized extension falls back to a permissive CSV parse.

    Raises whatever the underlying polars reader raises on malformed input.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Sniff the payload: a top-level array is plain JSON, a first line
        # starting with "{" suggests NDJSON, otherwise look for a list of
        # records nested inside a wrapper object.
        # Fix: close the file handle (original leaked it via open(...).read()).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        if "\n" in raw and raw.split("\n", 1)[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        obj = json.loads(raw)
        if isinstance(obj, dict):
            # Try the common wrapper keys first, then any list-of-dicts value.
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _write(df: pl.DataFrame, dst: str) -> None:
    """Serialize *df* to *dst*; the output format is chosen by extension.

    Raises ValueError when the extension is not parquet/csv/json/ndjson.
    """
    ext = os.path.splitext(dst)[1].lower()
    # Ensure the destination directory exists before writing.
    target_dir = os.path.dirname(dst) or "."
    os.makedirs(target_dir, exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
        return
    if ext == ".csv":
        df.write_csv(dst)
        return
    if ext == ".json":
        df.write_json(dst, row_oriented=True)
        return
    if ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
        return
    raise ValueError(f"Unsupported output format: {ext}")
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def main():
    """CLI entry: convert sys.argv[1] into sys.argv[2], report JSON on stdout."""
    args = sys.argv
    if len(args) < 3:
        print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = args[1], args[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        frame = _load(input_path)
        _write(frame, output_path)
        size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": frame.height,
            "columns": frame.width,
            "size_mb": size_mb,
        }))
    except Exception as e:
        # Any failure is reported as a JSON error line plus a non-zero exit.
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|