vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import re
|
|
5
|
-
from dataclasses import dataclass, asdict
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any, Dict, Optional
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
@dataclass
class DownloadRecipe:
    """Describe how the image assets of a single dataset are to be fetched.

    The fields are declared in constructor order; ``build_download_recipe``
    fills them by keyword and serializes the instance with ``asdict``.
    """

    # Identifier the recipe is filed under.
    dataset_id: str
    # Name of the hosting source the dataset comes from.
    source: str
    # Repository identifier at the source.
    repo_id: str
    # Column holding images or image URLs; None when no such column is known.
    image_column: Optional[str]
    # Name of the strategy used to pull the assets.
    download_method: str
    # Whether credentials are needed to download.
    requires_auth: bool
    # Size estimate of the assets, in gigabytes.
    estimated_asset_size_gb: float
    # Number of images the dataset is expected to contain.
    total_images: int
    # Ordered names of strategies to try when the primary method fails.
    fallback_strategy: list[str]
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def _safe_name(value: str) -> str:
|
|
27
|
-
return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Derive a download recipe from a dataset metadata mapping.

    Args:
        dataset_info: Metadata mapping. Keys read here: ``dataset_id``, ``id``,
            ``source``, ``repo_id``, ``image_column``, ``features``,
            ``requires_auth``, ``total_images``, ``total_examples``,
            ``estimated_asset_size_gb`` and ``fallback_strategy``. All are
            optional; missing or falsy values fall back to defaults.

    Returns:
        The ``DownloadRecipe`` fields as a plain dict (via ``asdict``).
    """
    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            image_like = {"image", "images", "img", "img_path", "image_url", "url"}
            # First feature whose lower-cased name looks like an image column;
            # keep the current value when none matches.
            image_column = next(
                (key for key in features if str(key).lower() in image_like),
                image_column,
            )

    method_by_source = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }
    download_method = method_by_source.get(source, "url_list")

    # dict.get() only applies its default when the key is absent, so an
    # explicit None would otherwise turn off auth for sources that need it.
    requires_auth = dataset_info.get("requires_auth")
    if requires_auth is None:
        requires_auth = source in {"kaggle", "dataworld"}
    requires_auth = bool(requires_auth)

    total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
    if total_images <= 0:
        # Unknown size: use a placeholder count so the estimate below is non-zero.
        total_images = 1000

    estimated_asset_size_gb = round(
        float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)),
        3,
    )

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]
    if isinstance(fallback_strategy, str):
        # A bare string would be exploded into single characters by list().
        fallback_strategy = [fallback_strategy]

    recipe = DownloadRecipe(
        dataset_id=dataset_id or repo_id,
        source=source,
        repo_id=repo_id,
        image_column=image_column,
        download_method=download_method,
        requires_auth=requires_auth,
        estimated_asset_size_gb=estimated_asset_size_gb,
        total_images=total_images,
        fallback_strategy=list(fallback_strategy),
    )

    return asdict(recipe)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Write *recipe* to ``download_recipe.json`` and return the file path.

    Args:
        recipe: Recipe mapping; its ``dataset_id`` (or ``repo_id``) names the
            sub-directory the file is written into.
        recipes_dir: Base directory; ``DEFAULT_RECIPES_DIR`` when falsy.

    Returns:
        The path of the written JSON file, as a string.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    name_source = recipe.get("dataset_id") or recipe.get("repo_id") or "dataset"
    target_dir = base_dir / _safe_name(str(name_source))
    target_dir.mkdir(parents=True, exist_ok=True)

    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    destination = target_dir / "download_recipe.json"
    destination.write_text(payload, encoding="utf-8")
    return str(destination)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the stored download recipe for *dataset_id*, if there is one.

    Args:
        dataset_id: Identifier the recipe was saved under; it is sanitized
            with ``_safe_name`` to find the sub-directory.
        recipes_dir: Base directory; ``DEFAULT_RECIPES_DIR`` when falsy.

    Returns:
        The recipe dict, or ``None`` when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object.
    """
    root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
    path = root / _safe_name(dataset_id) / "download_recipe.json"

    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return None

    # The declared return type is a mapping; treat any other JSON value as absent.
    return loaded if isinstance(loaded, dict) else None
|
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import argparse
|
|
4
|
-
import urllib.request
|
|
5
|
-
import urllib.parse
|
|
6
|
-
from datetime import datetime
|
|
7
|
-
|
|
8
|
-
# WB API for indicators (Series)
|
|
9
|
-
# Source 2 is World Development Indicators
|
|
10
|
-
WB_API_URL = "https://api.worldbank.org/v2/indicator"
|
|
11
|
-
|
|
12
|
-
def search_worldbank(query: str, limit: int = 10, timeout: float = 30.0):
    """
    Search World Bank indicators.

    Fetches one page of indicators from source 2 (World Development
    Indicators) and keeps those whose name or source note contains every
    whitespace-separated term of the query (case-insensitive).

    Args:
        query: Search text; all terms must match.
        limit: Maximum number of results; zero or less yields an empty list.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list of metadata dicts, or ``{"error": message}`` when the request
        or the processing of its response fails.
    """
    # Nothing can be returned, so skip the network round trip entirely.
    if limit <= 0:
        return []

    # Local import: the module only imports `datetime` from this package.
    from datetime import timezone

    try:
        # The World Bank Indicators API doesn't have a direct "search" parameter for indicators
        # that works exactly like a search engine. We fetch a page and filter by query terms.
        params = {
            "format": "json",
            "per_page": 299,  # Page size; only this first page is searched
            "source": 2,
        }
        url = f"{WB_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = json.load(response)

        # WB response is [metadata, data_list]
        if len(data) < 2:
            return []

        # The second element may be null instead of a list.
        indicators = data[1] or []
        query_terms = query.lower().split()
        # Same timestamp for every result of this call.
        fetched_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        results = []
        for ind in indicators:
            # `or ''` also covers fields that are present but null.
            name = ind.get('name') or ''
            source_note = ind.get('sourceNote') or ''
            text = f"{name} {source_note}".lower()

            # Simple keyword matching
            if not all(term in text for term in query_terms):
                continue

            results.append({
                "id": f"wb:{ind.get('id')}",
                "source": "worldbank",
                "name": name,
                "description": source_note or "No description available.",
                "downloads": 1000,  # Placeholder (high relevance for WB)
                "likes": 100,
                "last_updated": fetched_at,
                "quality_score": 95,  # Institutional data is high quality
                "license": {
                    "id": "cc-by-4.0",
                    "name": "Creative Commons Attribution 4.0",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": [],
                },
                "tags": [ind.get('source', {}).get('value')] if ind.get('source') else [],
                "total_examples": 0,  # Time series length varies
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.worldbank.org/indicator/{ind.get('id')}",
                "domain": "economics",
            })
            if len(results) >= limit:
                break

        return results

    except Exception as e:
        # CLI boundary: report any failure as a JSON-serializable error payload.
        return {"error": str(e)}
|
|
85
|
-
|
|
86
|
-
def main():
    """Parse the command line and print the search results as JSON."""
    cli = argparse.ArgumentParser(description="World Bank Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    options = cli.parse_args()

    # argparse only accepts "search", but keep the dispatch explicit.
    if options.action != "search":
        return

    print(json.dumps(search_worldbank(options.query, options.limit)))
|
|
97
|
-
|
|
98
|
-
if __name__ == "__main__":
|
|
99
|
-
main()
|
package/wizard.cjs
DELETED