vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,104 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- import re
5
- from dataclasses import dataclass, asdict
6
- from pathlib import Path
7
- from typing import Any, Dict, Optional
8
-
9
-
10
- DEFAULT_RECIPES_DIR = Path.home() / ".vesper" / "recipes"
11
-
12
-
13
@dataclass
class DownloadRecipe:
    """Record describing how the assets of one dataset should be downloaded.

    The field order below is also the positional order of the generated
    constructor, so it must not be rearranged.
    """

    # Identifier of the dataset this recipe was built for.
    dataset_id: str
    # Lower-cased name of the hosting source (e.g. "huggingface", "kaggle").
    source: str
    # Repository identifier at the source.
    repo_id: str
    # Column/feature expected to hold the images; None when none is known.
    image_column: Optional[str]
    # Name of the download approach chosen for the source.
    download_method: str
    # Whether fetching is expected to need credentials.
    requires_auth: bool
    # Estimated total size of the assets, in gigabytes.
    estimated_asset_size_gb: float
    # Number of images the dataset is expected to contain.
    total_images: int
    # Ordered names of the strategies to fall back on.
    fallback_strategy: list[str]
24
-
25
-
26
- def _safe_name(value: str) -> str:
27
- return re.sub(r"[^a-zA-Z0-9._-]+", "_", value).strip("_") or "dataset"
28
-
29
-
30
def build_download_recipe(dataset_info: Dict[str, Any]) -> Dict[str, Any]:
    """Build a download recipe dictionary from loosely structured metadata.

    Keys read from *dataset_info* (all optional):

    - ``dataset_id`` / ``id``: dataset identifier; ``repo_id`` is used when
      neither is set.
    - ``source``: lower-cased; defaults to ``"unknown"``.
    - ``repo_id``: defaults to ``id``, then to the dataset identifier.
    - ``image_column``: when falsy, the first key of ``features`` (if that is
      a dict) whose lower-cased name is a known image column name is used.
    - ``requires_auth``: defaults to True for kaggle and dataworld sources.
    - ``total_images`` / ``total_examples``: a missing or non-positive count
      is replaced by 1000.
    - ``estimated_asset_size_gb``: defaults to ``total_images * 0.0004``;
      rounded to three decimals.
    - ``fallback_strategy``: a sequence of strategy names, or a single name
      given as a string; defaults to a built-in three-step list.

    Returns:
        The recipe as a plain dict holding the ``DownloadRecipe`` fields.

    Raises:
        ValueError, TypeError: if a count or size value cannot be converted
            to a number.
    """
    dataset_id = str(dataset_info.get("dataset_id") or dataset_info.get("id") or "")
    source = str(dataset_info.get("source") or "unknown").lower()
    repo_id = str(dataset_info.get("repo_id") or dataset_info.get("id") or dataset_id)

    image_column = dataset_info.get("image_column")
    if not image_column:
        features = dataset_info.get("features") or {}
        if isinstance(features, dict):
            known_names = {"image", "images", "img", "img_path", "image_url", "url"}
            # Keep the caller's original (falsy) value when nothing matches.
            image_column = next(
                (key for key in features if str(key).lower() in known_names),
                image_column,
            )

    # Sources without a dedicated method are treated as a plain list of URLs.
    download_method = {
        "huggingface": "hf_dataset_image_feature",
        "kaggle": "kaggle_archive",
        "dataworld": "direct_file_scan",
        "openml": "direct_file_scan",
    }.get(source, "url_list")

    requires_auth = bool(dataset_info.get("requires_auth", source in {"kaggle", "dataworld"}))

    total_images = int(dataset_info.get("total_images") or dataset_info.get("total_examples") or 0)
    if total_images <= 0:
        total_images = 1000

    estimated_asset_size_gb = round(
        float(dataset_info.get("estimated_asset_size_gb") or (total_images * 0.0004)),
        3,
    )

    fallback_strategy = dataset_info.get("fallback_strategy") or [
        "scan_archive_for_images",
        "extract_url_column_and_download",
        "export_metadata_only_with_actionable_error",
    ]
    # list("name") would split a lone strategy name into single characters,
    # so a bare string is wrapped as one entry instead.
    if isinstance(fallback_strategy, str):
        fallback_strategy = [fallback_strategy]

    recipe = DownloadRecipe(
        dataset_id=dataset_id or repo_id,
        source=source,
        repo_id=repo_id,
        image_column=image_column,
        download_method=download_method,
        requires_auth=requires_auth,
        estimated_asset_size_gb=estimated_asset_size_gb,
        total_images=total_images,
        fallback_strategy=list(fallback_strategy),
    )

    return asdict(recipe)
80
-
81
-
82
def save_recipe(recipe: Dict[str, Any], recipes_dir: Optional[str] = None) -> str:
    """Write *recipe* to ``<recipes dir>/<safe dataset name>/download_recipe.json``.

    Args:
        recipe: Recipe dictionary. Its ``dataset_id`` (or, failing that, its
            ``repo_id``) names the sub-directory; ``"dataset"`` is used when
            both are missing or empty.
        recipes_dir: Base directory; ``DEFAULT_RECIPES_DIR`` when falsy.

    Returns:
        The path of the written JSON file, as a string.
    """
    base_dir = DEFAULT_RECIPES_DIR if not recipes_dir else Path(recipes_dir)
    base_dir.mkdir(parents=True, exist_ok=True)

    name_source = recipe.get("dataset_id") or recipe.get("repo_id") or "dataset"
    target_dir = base_dir / _safe_name(str(name_source))
    target_dir.mkdir(parents=True, exist_ok=True)

    target_file = target_dir / "download_recipe.json"
    payload = json.dumps(recipe, indent=2, ensure_ascii=False)
    target_file.write_text(payload, encoding="utf-8")
    return str(target_file)
93
-
94
-
95
def get_download_recipe(dataset_id: str, recipes_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load the stored download recipe for *dataset_id*, if one is usable.

    Args:
        dataset_id: Dataset identifier; it is passed through ``_safe_name``
            to find the recipe's sub-directory.
        recipes_dir: Base directory; ``DEFAULT_RECIPES_DIR`` when falsy.

    Returns:
        The recipe dictionary, or None when the file is missing, cannot be
        read or decoded, is not valid JSON, or does not contain a JSON object.
    """
    root = Path(recipes_dir) if recipes_dir else DEFAULT_RECIPES_DIR
    path = root / _safe_name(dataset_id) / "download_recipe.json"

    # Read directly instead of checking exists() first: the file could vanish
    # between the check and the read, and a missing file is an OSError anyway.
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return None

    # Valid JSON that is not an object is not a recipe.
    return loaded if isinstance(loaded, dict) else None
@@ -1,99 +0,0 @@
1
- import sys
2
- import json
3
- import argparse
4
- import urllib.request
5
- import urllib.parse
6
- from datetime import datetime
7
-
8
- # WB API for indicators (Series)
9
- # Source 2 is World Development Indicators
10
- WB_API_URL = "https://api.worldbank.org/v2/indicator"
11
-
12
def search_worldbank(query: str, limit: int = 10, timeout: float = 30.0):
    """
    Search World Bank indicators.

    One page of indicators is fetched from the indicator endpoint and
    filtered locally: an indicator matches when every whitespace-separated
    term of *query* occurs, case-insensitively, in its name or source note.

    Args:
        query: Search terms. An empty query matches every fetched indicator.
        limit: Maximum number of results. A non-positive limit returns an
            empty list without making a request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of metadata dicts (at most *limit*), or ``{"error": message}``
        if anything fails.
    """
    # The file only imports the `datetime` class, so the UTC tzinfo is pulled
    # in here to avoid the deprecated, naive datetime.utcnow().
    from datetime import timezone

    try:
        # The original checked the limit only after appending, so limit=0
        # still produced one result.
        if limit <= 0:
            return []

        # The World Bank Indicators API doesn't have a direct "search" parameter for indicators
        # that works exactly like a search engine. We fetch a page and filter by query terms.
        # Alternatively, we could use the 'qterm' on the documents API, but indicators are more tabular.
        params = {
            "format": "json",
            "per_page": 299,  # Max per page to search through more indicators
            "source": 2,
        }
        url = f"{WB_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        # Without a timeout an unresponsive server would block forever.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            data = json.load(response)

        # WB response is [metadata, data_list]
        if len(data) < 2:
            return []

        # A null data list means "no indicators", not a failure.
        indicators = data[1] or []

        query_terms = query.lower().split()
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        results = []
        for ind in indicators:
            # `or ''` also covers keys that are present but null in the JSON.
            name = ind.get('name') or ''
            source_note = ind.get('sourceNote') or ''
            text = f"{name} {source_note}".lower()

            # Simple keyword matching
            if not all(term in text for term in query_terms):
                continue

            indicator_id = ind.get('id')
            source_info = ind.get('source')

            results.append({
                "id": f"wb:{indicator_id}",
                "source": "worldbank",
                "name": name,
                "description": source_note or "No description available.",
                "downloads": 1000,  # Placeholder (high relevance for WB)
                "likes": 100,
                "last_updated": timestamp,
                "quality_score": 95,  # Institutional data is high quality
                "license": {
                    "id": "cc-by-4.0",
                    "name": "Creative Commons Attribution 4.0",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": [],
                },
                "tags": [source_info.get('value')] if source_info else [],
                "total_examples": 0,  # Time series length varies
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.worldbank.org/indicator/{indicator_id}",
                "domain": "economics",
            })
            if len(results) >= limit:
                break

        return results

    except Exception as e:
        # Errors are reported as data so the caller always receives JSON.
        return {"error": str(e)}
85
-
86
def main():
    """Command-line entry point: parse the arguments and print JSON results.

    ``--action`` (required, only ``search``) and ``--query`` (required) select
    the operation; ``--limit`` (int, default 10) caps the result count. The
    value returned by ``search_worldbank`` is printed as one JSON document.
    """
    cli = argparse.ArgumentParser(description="World Bank Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    options = cli.parse_args()

    # argparse already restricts --action to "search"; the check is kept so
    # that nothing is printed for any other value.
    if options.action != "search":
        return

    print(json.dumps(search_worldbank(options.query, options.limit)))


if __name__ == "__main__":
    main()
package/wizard.cjs DELETED
@@ -1,3 +0,0 @@
1
#!/usr/bin/env node

// Launcher shim: this file's only job is to load scripts/wizard.cjs, which
// is resolved relative to this file.
require('./scripts/wizard.cjs');