vesper-wizard 2.0.4 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +148 -32
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass, asdict
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional
8
+
9
+
10
+ DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
11
+
12
+
13
@dataclass
class DownloadRecipe:
    """Plan describing how the assets of one dataset should be downloaded.

    Instances are built by ``build_download_recipe`` and converted to plain
    dicts with ``dataclasses.asdict``; field order is therefore part of the
    serialized layout and of the positional constructor.
    """

    # Identifier of the dataset the recipe belongs to.
    dataset_id: str
    # Name of the provider the dataset comes from.
    source: str
    # Repository / provider-side identifier of the dataset.
    repo_id: str
    # Column holding images or image URLs; None when no such column is known.
    image_column: Optional[str]
    # Name of the strategy used to fetch the assets.
    download_method: str
    # Whether credentials are needed to download.
    requires_auth: bool
    # Estimated total asset size, in gigabytes.
    estimated_asset_size_gb: float
    # Number of images the dataset is expected to contain.
    total_images: int
    # Ordered list of fallback step names.
    fallback_strategy: list[str]
24
+
25
+
26
+ def _safe_name(value: str) -> str:
27
+ return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
28
+
29
+
30
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Derive a download recipe from loosely structured dataset metadata.

    Args:
        dataset_info: Metadata mapping. Keys read here: ``dataset_id``, ``id``,
            ``source``, ``repo_id``, ``image_column``, ``features``,
            ``requires_auth``, ``total_images``, ``total_examples``,
            ``estimated_asset_size_gb`` and ``fallback_strategy``. All are
            optional.

    Returns:
        The recipe as a plain dict (``dataclasses.asdict`` of a
        ``DownloadRecipe``).

    Raises:
        ValueError, TypeError: If ``total_images`` / ``total_examples`` /
            ``estimated_asset_size_gb`` hold values that cannot be converted
            to numbers.
    """
    image_column_names = {"image", "images", "img", "img_path", "image_url", "url"}
    method_by_source = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }
    auth_sources = {"kaggle", "dataworld"}

    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            # First feature whose lower-cased name looks like an image column;
            # the original (falsy) value is kept when nothing matches.
            image_column = next(
                (key for key in features if str(key).lower() in image_column_names),
                image_column,
            )

    # Any source without a dedicated method falls back to a plain URL list.
    download_method = method_by_source.get(source, "url_list")

    # An explicit None (e.g. a JSON null) means "not specified", so the
    # per-source default applies instead of being coerced to False.
    requires_auth = dataset_info.get("requires_auth")
    if requires_auth is None:
        requires_auth = source in auth_sources
    requires_auth = bool(requires_auth)

    total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
    if total_images <= 0:
        # Unknown or invalid count: assume a nominal 1000 images.
        total_images = 1000

    # Without an explicit size, estimate 0.0004 GB per image.
    estimated_asset_size_gb = round(
        float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)),
        3,
    )

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]
    if isinstance(fallback_strategy, str):
        # list("abc") would split a single strategy name into characters.
        fallback_strategy = [fallback_strategy]

    recipe = DownloadRecipe(
        dataset_id=dataset_id or repo_id,
        source=source,
        repo_id=repo_id,
        image_column=image_column,
        download_method=download_method,
        requires_auth=requires_auth,
        estimated_asset_size_gb=estimated_asset_size_gb,
        total_images=total_images,
        fallback_strategy=list(fallback_strategy),
    )

    return asdict(recipe)
80
+
81
+
82
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Write a recipe to ``<recipes root>/<safe dataset name>/download_recipe.json``.

    Args:
        recipe: Recipe mapping; its ``dataset_id`` (or, failing that,
            ``repo_id``) names the sub-directory.
        recipes_dir: Root directory for recipes. ``DEFAULT_RECIPES_DIR`` is
            used when this is empty or None.

    Returns:
        The path of the written JSON file, as a string.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    name_source = recipe.get("dataset_id") or recipe.get("repo_id") or "dataset"
    target_dir = base_dir / _safe_name(str(name_source))
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / "download_recipe.json"
    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    destination.write_text(payload, encoding="utf-8")
    return str(destination)
93
+
94
+
95
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load a previously saved recipe for ``dataset_id``.

    Args:
        dataset_id: Dataset identifier; sanitized with ``_safe_name`` to find
            the recipe sub-directory.
        recipes_dir: Root directory for recipes. ``DEFAULT_RECIPES_DIR`` is
            used when this is empty or None.

    Returns:
        The parsed JSON content, or None when the file does not exist or
        cannot be read or parsed.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    recipe_file = base_dir / _safe_name(dataset_id) / "download_recipe.json"

    if recipe_file.exists():
        try:
            raw_text = recipe_file.read_text(encoding="utf-8")
            return json.loads(raw_text)
        except Exception:
            # Best-effort lookup: an unreadable or corrupt recipe is treated
            # the same as a missing one.
            pass
    return None
@@ -0,0 +1,99 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import urllib.request
5
+ import urllib.parse
6
+ from datetime import datetime
7
+
8
+ # WB API for indicators (Series)
9
+ # Source 2 is World Development Indicators
10
+ WB_API_URL = "https://api.worldbank.org/v2/indicator"
11
+
12
def search_worldbank(query: str, limit: int = 10, timeout: float = 30.0):
    """
    Search World Bank indicators by keyword.

    A single page of indicators is requested from ``WB_API_URL`` and filtered
    locally: an indicator matches when every whitespace-separated term of
    ``query`` occurs (case-insensitively) in its name or source note.

    Args:
        query: Keywords to look for. An empty query matches every indicator.
        limit: Maximum number of results. Values <= 0 yield an empty list.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of metadata dicts on success, or ``{"error": message}`` when
        the request or the processing of its response fails.
    """
    # Imported locally: the module-level import only brings in `datetime`.
    from datetime import timezone

    try:
        if limit <= 0:
            # Nothing can be returned, so skip the network round trip.
            return []

        params = {
            "format": "json",
            "per_page": 299,  # Only this first page of indicators is searched
            "source": 2,
        }
        url = f"{WB_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        # Without a timeout a stalled connection would block forever.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = json.load(response)

        # WB response is [metadata, data_list]
        if len(data) < 2:
            return []

        # The data list can be null; treat that as "no indicators".
        indicators = data[1] or []
        query_terms = query.lower().split()
        # Same timestamp for every result of this search (timezone-aware UTC).
        last_updated = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        results = []
        for ind in indicators:
            # JSON nulls arrive as None; normalise so concatenation is safe.
            name = ind.get("name") or ""
            source_note = ind.get("sourceNote") or ""
            text = f"{name} {source_note}".lower()

            # Simple keyword matching: every term must be present.
            if not all(term in text for term in query_terms):
                continue

            indicator_id = ind.get("id")
            source_info = ind.get("source")
            results.append({
                "id": f"wb:{indicator_id}",
                "source": "worldbank",
                "name": name,
                "description": source_note or "No description available.",
                "downloads": 1000,  # Placeholder (high relevance for WB)
                "likes": 100,
                "last_updated": last_updated,
                "quality_score": 95,  # Institutional data is high quality
                "license": {
                    "id": "cc-by-4.0",
                    "name": "Creative Commons Attribution 4.0",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": [],
                },
                "tags": [source_info.get("value")] if source_info else [],
                "total_examples": 0,  # Time series length varies
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.worldbank.org/indicator/{indicator_id}",
                "domain": "economics",
            })
            if len(results) >= limit:
                break

        return results

    except Exception as e:
        # CLI boundary: failures are reported as JSON-serializable data
        # rather than as a traceback.
        return {"error": str(e)}
85
+
86
def main():
    """Command-line entry point: parse arguments, run the action, print JSON."""
    cli = argparse.ArgumentParser(description="World Bank Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    options = cli.parse_args()

    # "search" is the only action argparse accepts; anything else is a no-op.
    if options.action != "search":
        return

    found = search_worldbank(options.query, options.limit)
    print(json.dumps(found))


if __name__ == "__main__":
    main()
@@ -1,6 +0,0 @@
1
- {
2
- "project": "vesper",
3
- "dataDir": "./datasets",
4
- "exportFormat": "parquet",
5
- "tokens": {}
6
- }