vesper-wizard 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/scripts/wizard.cjs +625 -0
- package/{wizard.js → scripts/wizard.js} +99 -21
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import json
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
import secrets
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, Optional
|
|
9
|
+
|
|
10
|
+
# Service name under which credentials are stored in the OS keyring.
SERVICE_NAME = "vesper"

# Logical key name -> environment variables consulted, in order, when the
# key is looked up from the environment.
KEY_ALIASES = {
    "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
    "kaggle_username": ["KAGGLE_USERNAME"],
    "kaggle_key": ["KAGGLE_KEY"],
    "dataworld_token": ["DW_AUTH_TOKEN"],
}

# Optional dependency: OS keyring backend.
# NOTE(review): the broad ``except Exception`` is kept from the original;
# presumably a broken backend can fail with something other than ImportError.
try:
    import keyring  # type: ignore
    HAS_KEYRING = True
except Exception:
    HAS_KEYRING = False

# Optional dependency: Fernet symmetric encryption from ``cryptography``.
try:
    from cryptography.fernet import Fernet, InvalidToken  # type: ignore
    HAS_FERNET = True
except Exception:
    HAS_FERNET = False

    class InvalidToken(Exception):  # type: ignore
        """Placeholder bound when ``cryptography`` is unavailable.

        Keeps ``except InvalidToken`` clauses elsewhere in the module
        resolvable; without it, evaluating such a clause raises NameError.
        """
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _config_path() -> Path:
    """Return the path of the fallback key store, ``~/.vesper/config.toml``."""
    return Path.home().joinpath(".vesper", "config.toml")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _secret_path() -> Path:
    """Return the path of the local encryption secret, ``~/.vesper/.config_key``."""
    return Path.home().joinpath(".vesper", ".config_key")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ensure_parent(path: Path) -> None:
    """Create the directory that contains *path*, including missing ancestors.

    An already existing directory is not an error.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _read_fallback_toml() -> Dict[str, str]:
    """Parse the fallback key store into a flat dict.

    Only the small TOML subset produced by ``_write_fallback_toml`` is
    understood: ``#`` comments, ``[section]`` headers and ``key = "value"``
    lines.

    Returns:
        A mapping of every entry in the ``[keys]`` section to its (still
        encrypted) value.  When a ``method`` entry is found outside
        ``[keys]`` it is returned under the reserved ``"__method__"`` key.
        An empty dict is returned when the file does not exist.
    """
    path = _config_path()
    if not path.exists():
        return {}

    def _unquote(raw_value: str) -> str:
        # Remove one pair of surrounding quotes; for double-quoted values
        # also undo the \" escaping applied when the file is written.
        text = raw_value.strip()
        if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
            inner = text[1:-1]
            return inner.replace('\\"', '"') if text[0] == '"' else inner
        return text

    values: Dict[str, str] = {}
    section = ""
    method = ""

    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("[") and line.endswith("]"):
            section = line
            continue
        if "=" not in line:
            continue

        key, _, val = line.partition("=")
        key = key.strip()
        val = _unquote(val)

        if section == "[keys]":
            # Everything inside [keys] is a stored key, even one whose name
            # happens to start with "method".
            values[key] = val
        elif key == "method":
            method = val

    if method:
        values["__method__"] = method

    return values
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _get_or_create_local_secret() -> str:
    """Return the local encryption secret, generating it on first use.

    The secret is 32 random bytes, URL-safe base64 encoded, stored in the
    file named by ``_secret_path()``.

    Returns:
        The stripped contents of the secret file, or the newly generated
        secret when the file had to be created.
    """
    secret_file = _secret_path()
    _ensure_parent(secret_file)

    if secret_file.exists():
        return secret_file.read_text(encoding="utf-8").strip()

    secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
    try:
        # Request mode 0600 at creation time so the secret is not left with
        # default permissions between the write and a later chmod.  O_EXCL
        # avoids overwriting a file that appeared after the exists() check.
        fd = os.open(secret_file, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
    except FileExistsError:
        return secret_file.read_text(encoding="utf-8").strip()

    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write(secret)

    try:
        # Best effort, kept from the original implementation.
        os.chmod(secret_file, 0o600)
    except OSError:
        pass
    return secret
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _xor_encrypt(plain: str, secret: str) -> str:
    """Obfuscate *plain* by XOR-ing it with a key derived from *secret*.

    The key is the SHA-256 digest of the UTF-8 encoded secret, repeated as
    often as needed to cover the UTF-8 bytes of *plain*.

    Returns:
        The XOR-ed bytes as URL-safe base64 text.
    """
    keystream = hashlib.sha256(secret.encode("utf-8")).digest()
    width = len(keystream)
    payload = plain.encode("utf-8")
    mixed = bytes(
        byte ^ keystream[position % width]
        for position, byte in enumerate(payload)
    )
    return base64.urlsafe_b64encode(mixed).decode("utf-8")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _xor_decrypt(cipher_text: str, secret: str) -> str:
    """Reverse ``_xor_encrypt``: base64-decode, XOR with the key, decode UTF-8.

    The key is the SHA-256 digest of the UTF-8 encoded *secret*, repeated to
    cover the decoded payload.

    Returns:
        The recovered text.
    """
    keystream = hashlib.sha256(secret.encode("utf-8")).digest()
    width = len(keystream)
    payload = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
    restored = bytes(
        byte ^ keystream[position % width]
        for position, byte in enumerate(payload)
    )
    return restored.decode("utf-8")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
    """Encrypt *value* with the strongest scheme available.

    Fernet is used when ``cryptography`` was importable; otherwise the XOR
    fallback is applied.

    Returns:
        ``{"method": "fernet" | "xor", "value": <encoded ciphertext>}``.
    """
    if not HAS_FERNET:
        # fallback encryption (weaker than fernet, but still not plaintext)
        return {"method": "xor", "value": _xor_encrypt(value, secret)}

    cipher = Fernet(secret.encode("utf-8"))
    token = cipher.encrypt(value.encode("utf-8"))
    return {"method": "fernet", "value": token.decode("utf-8")}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
    """Decrypt *value* that was stored with the given *method*.

    Args:
        value: Encoded ciphertext as read from the fallback store.
        method: ``"fernet"`` or ``"xor"``; anything else is unsupported.
        secret: The local secret the value was encrypted with.

    Returns:
        The plaintext, or ``None`` when the method is unsupported, Fernet is
        requested but unavailable, or decryption fails for any reason.
    """
    try:
        if method == "fernet" and HAS_FERNET:
            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
        if method == "xor":
            return _xor_decrypt(value, secret)
    except Exception:
        # A single broad handler on purpose: a separate ``except
        # InvalidToken`` clause would itself raise NameError whenever
        # ``cryptography`` is not installed and an error reaches it.
        return None
    return None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _write_fallback_toml(values: Dict[str, str]) -> None:
    """Persist *values* to the fallback key store.

    Entries whose name starts with ``__`` are treated as metadata and are not
    written to ``[keys]``; ``"__method__"`` (defaulting to the best available
    scheme) is written to ``[meta]``.  Keys are written in sorted order.

    The content is written to a sibling temporary file created with mode
    0600 and then moved over the target with ``os.replace``, so an
    interrupted write does not leave a truncated store behind.
    """
    path = _config_path()
    _ensure_parent(path)

    method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
    lines = [
        "# Vesper optional API keys fallback storage",
        "# Encrypted fallback (keyring is preferred)",
        "[meta]",
        f'method = "{method}"',
        "[keys]",
    ]
    for key in sorted(values):
        if key.startswith("__"):
            continue
        val = str(values[key]).replace('"', '\\"')
        lines.append(f'{key} = "{val}"')

    tmp_path = path.with_name(path.name + ".tmp")
    fd = os.open(tmp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write("\n".join(lines) + "\n")
    try:
        # Covers a stale temp file that already existed with wider permissions.
        os.chmod(tmp_path, 0o600)
    except OSError:
        pass
    os.replace(tmp_path, path)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _get_from_env(name: str) -> Optional[str]:
    """Return the first non-empty environment variable aliased to *name*.

    The aliases come from ``KEY_ALIASES`` and are tried in listed order.
    ``None`` is returned when *name* has no aliases or none of them is set
    to a non-empty value.
    """
    candidates = (os.getenv(env_key) for env_key in KEY_ALIASES.get(name, []))
    return next((found for found in candidates if found), None)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_key(name: str) -> Optional[str]:
    """Look up the credential *name* from the available stores.

    Sources are consulted in this order, and the first non-empty hit wins:
    the OS keyring, the encrypted fallback ``config.toml``, then environment
    variables.

    Returns:
        The credential, or ``None`` when no source provides one.
    """
    # 1) keyring (secure)
    if HAS_KEYRING:
        try:
            stored = keyring.get_password(SERVICE_NAME, name)
        except Exception:
            # Keyring failures are ignored so the other sources still get tried.
            stored = None
        if stored:
            return stored

    # 2) encrypted fallback config.toml
    store = _read_fallback_toml()
    encrypted = store.get(name)
    if encrypted:
        local_secret = _get_or_create_local_secret()
        scheme = store.get("__method__", "fernet" if HAS_FERNET else "xor")
        decrypted = _decrypt_value(encrypted, scheme, local_secret)
        if decrypted:
            return decrypted

    # 3) env vars (fallback only)
    from_env = _get_from_env(name)
    return from_env if from_env else None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def set_key(name: str, value: str) -> Dict[str, str]:
    """Store the credential *value* under *name*.

    The OS keyring is used when available and working; otherwise the value
    is encrypted and written to the fallback ``config.toml``.

    Returns:
        A dict of strings: ``"ok"`` is ``"true"`` or ``"false"``,
        ``"method"`` is ``"keyring"``, ``"toml:<scheme>"`` or ``"none"``,
        and ``"error"`` is present only for a rejected empty value.
    """
    if not value:
        return {"ok": "false", "method": "none", "error": "Empty value"}

    if HAS_KEYRING:
        try:
            keyring.set_password(SERVICE_NAME, name, value)
        except Exception:
            # Fall through to the encrypted file store.
            pass
        else:
            return {"ok": "true", "method": "keyring"}

    store = _read_fallback_toml()
    local_secret = _get_or_create_local_secret()
    encrypted = _encrypt_value(value, local_secret)
    store["__method__"] = encrypted["method"]
    store[name] = encrypted["value"]
    _write_fallback_toml(store)

    scheme = encrypted["method"]
    return {"ok": "true", "method": f"toml:{scheme}"}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def has_key(name: str) -> bool:
    """Return ``True`` when ``get_key(name)`` yields a non-empty credential."""
    found = get_key(name)
    return True if found else False
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def get_all() -> Dict[str, Optional[str]]:
    """Return every known credential, resolved through ``get_key``.

    The result always contains the four keys ``hf_token``,
    ``kaggle_username``, ``kaggle_key`` and ``dataworld_token``; a value is
    ``None`` when that credential is not configured.
    """
    names = ("hf_token", "kaggle_username", "kaggle_key", "dataworld_token")
    return {key_name: get_key(key_name) for key_name in names}
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _print_json(data) -> None:
    """Serialize *data* with ``json.dumps`` and print it on one stdout line."""
    encoded = json.dumps(data)
    print(encoded)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def main() -> None:
    """Command-line entry point: ``config.py <get|set|has|all> [name] [value]``.

    Every outcome is reported as one JSON object on stdout.  Usage errors
    (missing arguments, unknown command) also terminate with exit status 1.
    """

    def fail(message: str) -> None:
        # Report a usage error as JSON and stop the process.
        _print_json({"ok": False, "error": message})
        sys.exit(1)

    argv = sys.argv
    if len(argv) < 2:
        fail("Usage: config.py <get|set|has|all> [name] [value]")

    cmd = argv[1].lower()

    if cmd == "all":
        _print_json({"ok": True, "data": get_all()})
        return

    # Every remaining command operates on a named key.
    if len(argv) < 3:
        fail("Missing key name")

    name = argv[2]

    if cmd == "get":
        _print_json({"ok": True, "name": name, "value": get_key(name)})
    elif cmd == "has":
        _print_json({"ok": True, "name": name, "value": has_key(name)})
    elif cmd == "set":
        if len(argv) < 4:
            fail("Missing value for set")
        outcome = set_key(name, argv[3])
        # set_key reports success as the string "true"; convert to a JSON bool.
        _print_json({
            "ok": outcome.get("ok") == "true",
            "name": name,
            "method": outcome.get("method"),
            "error": outcome.get("error"),
        })
    else:
        fail(f"Unknown command: {cmd}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
import urllib.request
|
|
7
|
+
import urllib.error
|
|
8
|
+
import urllib.parse
|
|
9
|
+
from typing import Dict, Any, List
|
|
10
|
+
|
|
11
|
+
def _get_token() -> str:
    """Return the data.world API token from ``DW_AUTH_TOKEN``.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    token = os.environ.get("DW_AUTH_TOKEN")
    if token:
        return token
    raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
|
|
16
|
+
|
|
17
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one data.world search record into Vesper's dataset dict.

    Owner and dataset id are taken from the record's ``owner`` and ``id``
    fields; when either is missing they are recovered from the last two path
    segments of ``resourceLink``, or by splitting an ``owner/id`` style id.
    If neither can be determined both become ``"unknown"``.

    Fields holding ``null`` (``description``, ``tags``, ``title``, ``id``)
    are treated like missing fields instead of raising ``TypeError``.

    Args:
        ds: A single record from the search response.

    Returns:
        A dict with the dataset id (``dataworld:<owner>/<id>``), display
        name, description, tags (always ending with ``"dataworld"``),
        timestamps, download URL and fixed placeholder values for the fields
        this function does not derive from the record.
    """
    owner_field = ds.get("owner", "")
    if isinstance(owner_field, dict):
        owner = owner_field.get("id") or owner_field.get("name") or ""
    else:
        owner = owner_field or ""

    id_str = ds.get("id") or ""
    title = ds.get("title") or ""

    if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
        # Expected format includes /<owner>/<dataset-id>
        parts = ds["resourceLink"].strip("/").split("/")
        if len(parts) >= 2:
            owner = owner or parts[-2]
            id_str = id_str or parts[-1]

    if isinstance(id_str, str) and "/" in id_str and not owner:
        owner, id_str = id_str.split("/", 1)

    if not owner and not id_str:
        owner = "unknown"
        id_str = "unknown"

    if not title:
        title = f"{owner}/{id_str}"

    # A null description falls back to the placeholder, like a missing one.
    raw_description = ds.get("description")
    if raw_description is None:
        description = f"data.world dataset {title}"
    else:
        description = raw_description
    description_length = len(raw_description) if isinstance(raw_description, str) else 0

    # A null or non-sequence tags field becomes an empty tag list.
    raw_tags = ds.get("tags")
    tags = list(raw_tags) if isinstance(raw_tags, (list, tuple)) else []

    return {
        "id": f"dataworld:{owner}/{id_str}",
        "name": title,
        "source": "dataworld",
        "description": description,
        "author": owner,
        "license": {
            "id": "Unknown",
            "category": "unknown",
            "commercial_use": None,
            "warnings": [],
        },
        "tags": tags + ["dataworld"],
        "downloads": 0,
        "likes": 0,
        "created_at": ds.get("created", ""),
        "updated_at": ds.get("updated", ""),
        "size_bytes": 0,
        "quality_score": 0.8,
        "domain": "general",
        "is_gated": False,
        "is_nsfw": False,
        "description_length": description_length,
        "has_readme": False,
        "download_url": f"https://data.world/{owner}/{id_str}",
    }
|
|
72
|
+
|
|
73
|
+
def _post_json(url: str, body: Dict[str, Any], headers: Dict[str, str], timeout: float) -> Dict[str, Any]:
    """POST *body* as JSON to *url* and return the decoded JSON response."""
    req = urllib.request.Request(
        url,
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))


def discover(query: str, limit: int = 20, timeout: float = 30.0) -> Dict[str, Any]:
    """Search data.world for datasets matching *query*.

    The resource search endpoint (with community results enabled) is tried
    first; when it returns no records the general search endpoint is queried
    as a fallback.

    Args:
        query: Free-text search query.
        limit: Maximum number of results requested (the ``size`` parameter).
        timeout: Socket timeout in seconds for each HTTP request, so that an
            unresponsive server cannot block the call indefinitely.

    Returns:
        ``{"ok": True, "results": [...], "count": n}`` on success, or
        ``{"ok": False, "error": "..."}`` on any failure (missing token,
        invalid limit, network or decoding error).
    """
    try:
        token = _get_token()

        # int() rejects non-numeric limits before they reach the URL.
        size_param = urllib.parse.urlencode({"size": int(limit)})

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

        # Search datasets and include community results to improve recall
        data = _post_json(
            f"https://api.data.world/v0/search/resources?{size_param}",
            {
                "query": query,
                "category": ["dataset"],
                "includeCommunityResults": True,
            },
            headers,
            timeout,
        )
        records = data.get("records") or []

        # Fallback to advanced endpoint if simple search returns nothing
        if not records:
            adv_data = _post_json(
                f"https://api.data.world/v0/search?{size_param}",
                {
                    "query": query,
                    "category": ["dataset"],
                },
                headers,
                timeout,
            )
            records = adv_data.get("records") or []

        items = [_dataset_to_dict(r) for r in records]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
|
|
122
|
+
|
|
123
|
+
def download(dataset_ref: str, target_dir: str, timeout: float = 60.0) -> Dict[str, Any]:
    """Download one file of a data.world dataset into *target_dir*.

    The dataset metadata is fetched first and a single file is chosen from
    it: the first file ending in ``.parquet``, ``.csv``, ``.jsonl`` or
    ``.json`` (checked in that order), or the first listed file when none
    matches.

    Args:
        dataset_ref: ``"dataworld:owner/id"`` or plain ``"owner/id"``.
        target_dir: Destination directory; created when missing.  A falsy
            value selects a fresh temporary directory.
        timeout: Socket timeout in seconds for each HTTP request.

    Returns:
        ``{"ok": True, "local_path": ..., "target_dir": ...}`` on success,
        or ``{"ok": False, "error": "..."}`` on any failure.
    """
    try:
        token = _get_token()

        # dataset_ref is expected to be "dataworld:owner/id"
        if dataset_ref.startswith("dataworld:"):
            ref = dataset_ref.split(":", 1)[1]
        else:
            ref = dataset_ref

        parts = ref.split("/")
        # Reject empty components such as "/id" or "owner/".
        if len(parts) != 2 or not all(parts):
            return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}

        owner, dataset_id = parts

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")

        os.makedirs(target_dir, exist_ok=True)

        # Quote each path segment so unusual characters cannot alter the URL.
        quoted_ref = f"{urllib.parse.quote(owner, safe='')}/{urllib.parse.quote(dataset_id, safe='')}"

        # First, get the dataset metadata to find the files
        url = f"https://api.data.world/v0/datasets/{quoted_ref}"
        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json",
        }

        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            dataset_meta = json.loads(response.read().decode("utf-8"))

        files = dataset_meta.get("files") or []
        if not files:
            return {"ok": False, "error": "No files found in this dataset"}

        # Find the best file to download (prefer parquet, then csv, jsonl, json)
        best_file = None
        for ext in (".parquet", ".csv", ".jsonl", ".json"):
            best_file = next(
                (f for f in files if str(f.get("name") or "").lower().endswith(ext)),
                None,
            )
            if best_file:
                break

        if not best_file:
            best_file = files[0]  # Just take the first one if no preferred format

        filename = best_file.get("name")
        if not filename:
            return {"ok": False, "error": "Selected file has no name in dataset metadata"}

        # The file name comes from the remote service: make sure it cannot
        # escape target_dir through "..", an absolute path or a symlink.
        file_path = os.path.join(target_dir, filename)
        base_dir = os.path.realpath(target_dir)
        resolved_path = os.path.realpath(file_path)
        if os.path.commonpath([base_dir, resolved_path]) != base_dir:
            return {"ok": False, "error": f"Refusing to write outside target directory: {filename}"}
        # NOTE(review): presumably names may contain sub-folders; create them if so.
        os.makedirs(os.path.dirname(resolved_path), exist_ok=True)

        # Download the file
        download_url = f"https://api.data.world/v0/file_download/{quoted_ref}/{urllib.parse.quote(filename)}"

        download_req = urllib.request.Request(download_url, headers=headers)
        with urllib.request.urlopen(download_req, timeout=timeout) as response, open(file_path, "wb") as out_file:
            # Stream in chunks so large files are not held in memory at once.
            for chunk in iter(lambda: response.read(1 << 16), b""):
                out_file.write(chunk)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir,
        }
    except Exception as e:
        return {"ok": False, "error": f"data.world download failed: {str(e)}"}
|
|
190
|
+
|
|
191
|
+
def main() -> None:
    """Command-line entry point for the data.world engine.

    Usage::

        dataworld_engine.py discover <query> [limit]
        dataworld_engine.py download <dataset-id> [target-dir]

    The result of the chosen action is printed as one JSON object on stdout.
    A non-integer ``limit`` is reported as a JSON error with exit status 1
    instead of an uncaught ``ValueError`` traceback.
    """
    parser = argparse.ArgumentParser(description="Vesper data.world Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        try:
            limit = int(args.arg2) if args.arg2 else 20
        except ValueError:
            print(json.dumps({
                "ok": False,
                "error": f"Invalid limit: {args.arg2!r} (expected an integer)",
            }))
            sys.exit(1)
        result = discover(args.arg1, limit)
        print(json.dumps(result))
    elif args.action == "download":
        result = download(args.arg1, args.arg2)
        print(json.dumps(result))


if __name__ == "__main__":
    main()
|