vesper-wizard 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/{wizard.js → scripts/wizard.js} +148 -32
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass, asdict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Root directory used for recipe files when the caller passes no explicit
# recipes_dir: "<user home>/.vesper/recipes".
DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DownloadRecipe:
    """Plain record describing how the assets of one dataset are fetched.

    The declaration order of the fields is part of the interface: it fixes
    both positional construction and the key order produced by ``asdict``.
    """

    # Identifier of the dataset this recipe belongs to.
    dataset_id: str
    # Name of the hosting platform the dataset comes from.
    source: str
    # Repository / slug identifier on that platform.
    repo_id: str
    # Column holding the images (or image URLs); None when not known.
    image_column: Optional[str]
    # Name of the download approach to use for this dataset.
    download_method: str
    # Whether credentials are needed to download.
    requires_auth: bool
    # Estimated total size of the assets, in gigabytes.
    estimated_asset_size_gb: float
    # Number of images expected in the dataset.
    total_images: int
    # Names of fallback approaches, as an ordered list.
    fallback_strategy: list[str]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _safe_name(value: str) -> str:
    """Turn an arbitrary identifier into a single safe directory name.

    Every run of characters outside ``[a-zA-Z0-9._-]`` collapses into one
    underscore, then leading and trailing underscores are trimmed.

    Args:
        value: Identifier to sanitise (for example ``"owner/dataset"``).

    Returns:
        The sanitised name, or ``"dataset"`` when nothing usable is left.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_")
    # Dots are permitted characters, so "." and ".." would survive the
    # substitution; joined onto a directory they resolve to that directory
    # or its parent. Treat dots-only results like an empty name.
    if not cleaned.strip("."):
        return "dataset"
    return cleaned
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Derive a download recipe from loosely structured dataset metadata.

    Args:
        dataset_info: Metadata mapping. The keys read are ``dataset_id``,
            ``id``, ``source``, ``repo_id``, ``image_column``, ``features``,
            ``requires_auth``, ``total_images``, ``total_examples``,
            ``estimated_asset_size_gb`` and ``fallback_strategy``; all are
            optional.

    Returns:
        The recipe as a plain dict whose keys are the ``DownloadRecipe``
        fields.
    """
    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    # Without an explicit image column, pick the first feature whose name
    # looks like an image or URL column.
    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            image_like = {"image", "images", "img", "img_path", "image_url", "url"}
            image_column = next(
                (key for key in features if str(key).lower() in image_like),
                image_column,
            )

    # Any source not listed here falls back to the plain URL-list method.
    method_by_source = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }
    download_method = method_by_source.get(source, "url_list")

    # An explicit None (e.g. a JSON null) means "not specified", so it must
    # not override the per-source default.
    requires_auth = dataset_info.get("requires_auth")
    if requires_auth is None:
        requires_auth = source in {"kaggle", "dataworld"}
    requires_auth = bool(requires_auth)

    total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
    if total_images <= 0:
        # Unknown size: assume a nominal 1000 images for the estimate below.
        total_images = 1000

    # Fallback estimate uses 0.0004 GB per image.
    estimated_asset_size_gb = round(
        float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)),
        3,
    )

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]
    if isinstance(fallback_strategy, str):
        # list("abc") would split a single strategy name into characters.
        fallback_strategy = [fallback_strategy]

    recipe = DownloadRecipe(
        dataset_id=dataset_id or repo_id,
        source=source,
        repo_id=repo_id,
        image_column=image_column,
        download_method=download_method,
        requires_auth=requires_auth,
        estimated_asset_size_gb=estimated_asset_size_gb,
        total_images=total_images,
        fallback_strategy=list(fallback_strategy),
    )

    return asdict(recipe)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Write *recipe* as ``download_recipe.json`` inside its dataset folder.

    Args:
        recipe: Recipe mapping; its ``dataset_id`` (or, failing that,
            ``repo_id``) determines the folder name.
        recipes_dir: Root directory for recipes. ``DEFAULT_RECIPES_DIR`` is
            used when this is empty or ``None``.

    Returns:
        The path of the written JSON file, as a string.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    # Folder name: sanitised dataset id, falling back to the repo id.
    name_source = recipe.get("dataset_id") or recipe.get("repo_id") or "dataset"
    target_dir = base_dir / _safe_name(str(name_source))
    target_dir.mkdir(parents=True, exist_ok=True)

    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    target_file = target_dir / "download_recipe.json"
    target_file.write_text(payload, encoding="utf-8")
    return str(target_file)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the stored recipe for *dataset_id*, if there is a usable one.

    Args:
        dataset_id: Dataset identifier; it is sanitised with ``_safe_name``
            to locate the recipe folder.
        recipes_dir: Root directory for recipes. ``DEFAULT_RECIPES_DIR`` is
            used when this is empty or ``None``.

    Returns:
        The recipe dict, or ``None`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object.
    """
    root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
    path = root / _safe_name(dataset_id) / "download_recipe.json"

    # Read directly instead of checking exists() first: a missing file
    # surfaces as OSError, and there is no window between check and read.
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # ValueError covers both JSON and UTF-8 decoding failures;
        # RecursionError covers pathologically nested documents.
        return None

    # A file holding e.g. a list or a number is not a recipe.
    return loaded if isinstance(loaded, dict) else None
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import urllib.request
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
|
|
8
|
+
# WB API for indicators (Series)
|
|
9
|
+
# Source 2 is World Development Indicators
|
|
10
|
+
# Base URL of the World Bank v2 indicator endpoint; search_worldbank appends
# the URL-encoded query parameters to it.
WB_API_URL = "https://api.worldbank.org/v2/indicator"
|
|
11
|
+
|
|
12
|
+
def search_worldbank(query: str, limit: int = 10, timeout: float = 30.0):
    """Search World Bank indicators by keyword.

    One page of indicators is fetched and filtered locally. An indicator
    matches when every whitespace-separated term of *query* occurs,
    case-insensitively, in its name or source note.

    Args:
        query: Search terms separated by whitespace.
        limit: Maximum number of results. Values of zero or less yield an
            empty list without contacting the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dataset-metadata dicts, or ``{"error": message}`` when
        the request or the processing of its response fails.
    """
    try:
        if limit <= 0:
            return []

        # Imported locally so the block does not depend on a change to the
        # file-level imports.
        from datetime import timezone

        # Only this single page is searched; indicators beyond the first
        # 299 of source 2 are never seen by the filter below.
        params = {
            "format": "json",
            "per_page": 299,
            "source": 2,
        }
        url = f"{WB_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        # Without a timeout a stalled connection would block forever.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = json.load(response)

        # WB response is [metadata, data_list]
        if len(data) < 2:
            return []

        indicators = data[1] or []
        query_terms = query.lower().split()
        # One timezone-aware timestamp for the whole result set.
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        results = []
        for ind in indicators:
            # "or ''" also covers keys that are present but null.
            name = ind.get("name") or ""
            source_note = ind.get("sourceNote") or ""
            text = f"{name} {source_note}".lower()

            # Simple keyword matching
            if not all(term in text for term in query_terms):
                continue

            results.append({
                "id": f"wb:{ind.get('id')}",
                "source": "worldbank",
                "name": name,
                "description": source_note or "No description available.",
                "downloads": 1000,  # Placeholder (high relevance for WB)
                "likes": 100,
                "last_updated": timestamp,
                "quality_score": 95,  # Institutional data is high quality
                "license": {
                    "id": "cc-by-4.0",
                    "name": "Creative Commons Attribution 4.0",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": [],
                },
                "tags": [ind.get('source', {}).get('value')] if ind.get('source') else [],
                "total_examples": 0,  # Time series length varies
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.worldbank.org/indicator/{ind.get('id')}",
                "domain": "economics",
            })
            if len(results) >= limit:
                break

        return results

    except Exception as e:
        # Boundary of a CLI adapter: report any failure as data rather than
        # raising, so the caller always receives JSON-serialisable output.
        return {"error": str(e)}
|
|
85
|
+
|
|
86
|
+
def main():
    """Command-line entry point: parse arguments, run the action, print JSON."""
    cli = argparse.ArgumentParser(description="World Bank Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    options = cli.parse_args()

    # "search" is the only action argparse accepts.
    if options.action != "search":
        return

    found = search_worldbank(options.query, options.limit)
    print(json.dumps(found))


if __name__ == "__main__":
    main()
|