vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,368 +0,0 @@
1
- import sys
2
- import json
3
- import os
4
- import hashlib
5
- import subprocess
6
- from typing import List, Optional, Union, Dict, Any
7
-
8
- try:
9
- import polars as pl
10
- HAS_POLARS = True
11
- except ImportError:
12
- HAS_POLARS = False
13
-
14
-
15
- def _safe_suffix(source_path: str, idx: int) -> str:
16
- base = os.path.basename(source_path)
17
- base = os.path.splitext(base)[0].replace(" ", "_")
18
- if not base:
19
- base = f"source{idx+1}"
20
- return base
21
-
22
-
23
def _load_with_polars(path: str):
    """Read *path* into a polars DataFrame, dispatching on file extension.

    ".json" files fall back to newline-delimited parsing when regular JSON
    parsing fails. Raises ValueError for unrecognized extensions.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".json":
        # Some "json" datasets are actually NDJSON; try both.
        try:
            return pl.read_json(path)
        except Exception:
            return pl.read_ndjson(path)

    readers = {
        ".csv": lambda p: pl.read_csv(p, ignore_errors=True),
        ".parquet": pl.read_parquet,
        ".pq": pl.read_parquet,
        ".feather": pl.read_ipc,
        ".ftr": pl.read_ipc,
        ".arrow": pl.read_ipc,
        ".ipc": pl.read_ipc,
        ".jsonl": pl.read_ndjson,
        ".ndjson": pl.read_ndjson,
    }
    reader = readers.get(ext)
    if reader is None:
        raise ValueError(f"Unsupported source format: {ext} ({path})")
    return reader(path)
39
-
40
-
41
def _write_with_polars(df, output_path: str, fmt: str, compression: Optional[str]):
    """Write *df* to *output_path* in the requested format.

    Parameters:
        df: polars DataFrame to persist.
        output_path: destination file path.
        fmt: one of "csv", "parquet", "feather", "arrow", "ipc", "jsonl".
        compression: codec name, or "none"/"None"/"uncompressed"/None for
            no compression / the format's default.

    Raises:
        ValueError: for an unsupported output format.

    Bug fix: the normalized codec (`comp`) was previously computed but the
    parquet/feather branches passed the raw `compression` string through,
    so spellings like "none" reached polars unmapped and crashed the writer.
    """
    fmt = fmt.lower()
    # Did the caller *explicitly* ask for no compression (vs. leaving it unset)?
    explicit_uncompressed = compression in ("none", "None", "uncompressed")
    comp = None if explicit_uncompressed else compression

    if fmt == "csv":
        df.write_csv(output_path)
    elif fmt == "parquet":
        # Parquet's "no compression" spelling is "uncompressed"; default snappy.
        df.write_parquet(
            output_path,
            compression="uncompressed" if explicit_uncompressed else (comp or "snappy"),
        )
    elif fmt == "feather":
        # Feather defaults to lz4 when no codec was requested at all.
        ipc_comp = None if explicit_uncompressed else (comp or "lz4")
        df.write_ipc(output_path, compression=ipc_comp)
    elif fmt in ("arrow", "ipc"):
        df.write_ipc(output_path, compression=comp)
    elif fmt == "jsonl":
        df.write_ndjson(output_path)
    else:
        raise ValueError(f"Unsupported output format: {fmt}")
62
-
63
-
64
- def _ensure_parent(path: str):
65
- parent = os.path.dirname(path)
66
- if parent and not os.path.exists(parent):
67
- os.makedirs(parent, exist_ok=True)
68
-
69
-
70
- def _compute_null_ratio(df) -> float:
71
- if len(df) == 0 or len(df.columns) == 0:
72
- return 0.0
73
- nulls = 0
74
- for col in df.columns:
75
- nulls += int(df[col].null_count())
76
- total_cells = len(df) * len(df.columns)
77
- return (nulls / total_cells) * 100 if total_cells else 0.0
78
-
79
-
80
def _leakage_report(df, source_col: str = "_vesper_source", id_col: Optional[str] = None) -> Dict[str, Any]:
    """Check whether the same record appears under more than one fused source.

    If *id_col* is given and present in *df*, an ID occurring under multiple
    distinct values of *source_col* counts as leakage. Otherwise falls back
    to a full-row signature (all columns except the source marker, cast to
    strings and joined with "||") and flags signatures seen in >1 source.

    Returns a dict with keys: leakage_detected, leakage_count, id_column,
    warnings. Missing marker column skips the check with a warning rather
    than raising.
    """
    report = {
        "leakage_detected": False,
        "leakage_count": 0,
        "id_column": id_col,
        "warnings": []
    }

    # Without the per-source marker there is nothing to compare against.
    if source_col not in df.columns:
        report["warnings"].append("Source marker column missing; leakage check skipped.")
        return report

    if id_col and id_col in df.columns:
        # IDs whose rows came from more than one distinct source.
        overlap = (
            df.group_by(id_col)
            .agg(pl.col(source_col).n_unique().alias("source_count"))
            .filter(pl.col("source_count") > 1)
        )
        overlap_count = len(overlap)
        if overlap_count > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = overlap_count
            report["warnings"].append(f"Found {overlap_count} IDs appearing across multiple sources")
    else:
        # Fallback: hash rows (excluding source marker) and check if same row appears in multiple sources
        compare_cols = [c for c in df.columns if c != source_col]
        if not compare_cols:
            # Only the marker column exists; nothing meaningful to compare.
            return report

        # Build a string signature per row; strict=False tolerates values
        # that cannot be cast to Utf8 (they become null instead of raising).
        row_sig = df.select(compare_cols).with_columns(
            pl.concat_str([pl.col(c).cast(pl.Utf8, strict=False) for c in compare_cols], separator="||").alias("_row_sig")
        )
        tmp = row_sig.with_columns(df[source_col]).select(["_row_sig", source_col])

        # Signatures that occur under more than one source.
        overlap = (
            tmp.group_by("_row_sig")
            .agg(pl.col(source_col).n_unique().alias("source_count"))
            .filter(pl.col("source_count") > 1)
        )
        overlap_count = len(overlap)
        if overlap_count > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = overlap_count
            report["warnings"].append(f"Found {overlap_count} duplicate rows across multiple sources")

    return report
126
-
127
-
128
- def _run_quality_engine(output_path: str) -> Dict[str, Any]:
129
- # Reuse existing quality engine script (same folder)
130
- try:
131
- script_dir = os.path.dirname(os.path.abspath(__file__))
132
- quality_script = os.path.join(script_dir, "quality_engine.py")
133
- cmd = [sys.executable, quality_script, output_path]
134
- proc = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
135
- if proc.returncode != 0:
136
- return {"error": f"quality_engine failed: {proc.stderr.strip()}"}
137
- return json.loads(proc.stdout)
138
- except Exception as e:
139
- return {"error": str(e)}
140
-
141
-
142
def _concat_polars(dfs, source_names: List[str]):
    """Vertically concatenate frames after aligning them to a shared schema.

    The column set is the union of every input's columns in first-seen
    order, with "_vesper_source" appended if no input carries it. Missing
    columns are filled with nulls, and each frame without a marker column
    is tagged with its source name before concatenation.
    """
    # Ordered union of all columns (dict preserves insertion order).
    union = {}
    for frame in dfs:
        for name in frame.columns:
            union.setdefault(name, None)
    union.setdefault("_vesper_source", None)
    ordered_cols = list(union)

    aligned = []
    for source_name, frame in zip(source_names, dfs):
        out = frame
        if "_vesper_source" not in out.columns:
            out = out.with_columns(pl.lit(source_name).alias("_vesper_source"))
        for absent in (c for c in ordered_cols if c not in out.columns):
            out = out.with_columns(pl.lit(None).alias(absent))
        aligned.append(out.select(ordered_cols))

    return pl.concat(aligned, how="vertical_relaxed")
166
-
167
-
168
def _join_polars(dfs, source_names: List[str], join_on: List[str], how: str):
    """Left-fold a join across all frames, renaming colliding columns.

    Each frame gets a "_vesper_source" marker column if it lacks one. Any
    non-key column of the right frame that collides with a column already
    present in the accumulated result is suffixed with a token derived from
    its source name.

    Returns (joined_frame, conflict_renames) where conflict_renames lists
    every applied rename as {"source", "from", "to"} dicts.

    Raises ValueError when fewer than 2 frames are supplied.
    """
    if len(dfs) < 2:
        raise ValueError("Join strategy requires at least 2 sources")

    def _tagged(frame, name):
        # Ensure the per-source marker column is present.
        if "_vesper_source" in frame.columns:
            return frame
        return frame.with_columns(pl.lit(name).alias("_vesper_source"))

    renames_applied = []
    acc = _tagged(dfs[0], source_names[0])

    for idx in range(1, len(dfs)):
        right = _tagged(dfs[idx], source_names[idx])
        suffix = _safe_suffix(source_names[idx], idx)
        # Map every colliding non-key column to a suffixed name.
        mapping = {
            col: f"{col}_{suffix}"
            for col in right.columns
            if col in acc.columns and col not in join_on
        }
        renames_applied.extend(
            {"source": source_names[idx], "from": old, "to": new}
            for old, new in mapping.items()
        )
        if mapping:
            right = right.rename(mapping)
        acc = acc.join(right, on=join_on, how=how, coalesce=True)

    return acc, renames_applied
199
-
200
-
201
def fuse_datasets(
    sources: List[Union[str, Any]],
    strategy: str = "concat",
    join_on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    dedup: bool = True,
    run_quality_after: bool = True,
    leakage_check: bool = True,
    output_path: Optional[str] = None,
    output_format: str = "feather",
    compression: Optional[str] = "lz4",
    preview: bool = True,
    preview_rows: int = 500,
    id_column: Optional[str] = None,
    **kwargs,
):
    """Fuse two or more tabular sources into a single output dataset.

    Sources may be path strings or dicts with "path"/"local_path" (and an
    optional "name"). Strategy "concat" stacks rows over the union schema;
    "join" folds a keyed join (requires *join_on*). Optionally deduplicates
    rows, runs a cross-source leakage check, writes the fused frame plus a
    CSV preview, and invokes the external quality engine on the result.

    Returns a dict: on success {"success": True, "output_path", "preview_path",
    "stats", "quality_report", "leakage_report"}; on any failure a dict with
    a single "error" key (this function never raises).

    Extra **kwargs are accepted and ignored so callers can pass a superset
    config without breaking.
    """
    if not HAS_POLARS:
        return {"error": "Polars is required for dataset fusion. Install with: pip install polars"}

    if not sources or len(sources) < 2:
        return {"error": "Need at least 2 sources to fuse"}

    source_paths: List[str] = []
    source_names: List[str] = []

    # Normalize each source entry to a (path, display-name) pair.
    for i, src in enumerate(sources):
        if isinstance(src, str):
            source_paths.append(src)
            source_names.append(src)
        elif isinstance(src, dict):
            p = src.get("path") or src.get("local_path")
            if not p:
                return {"error": f"Source {i} missing path"}
            source_paths.append(p)
            source_names.append(src.get("name") or p)
        else:
            return {"error": f"Unsupported source type at index {i}"}

    for p in source_paths:
        if not os.path.exists(p):
            return {"error": f"Source not found: {p}"}

    # Default output path: ./fused_dataset.<ext matching output_format>.
    if output_path is None:
        ext_map = {
            "feather": ".feather",
            "parquet": ".parquet",
            "csv": ".csv",
            "jsonl": ".jsonl",
            "arrow": ".arrow",
            "ipc": ".arrow"
        }
        ext = ext_map.get(output_format, ".feather")
        output_path = os.path.abspath(f"fused_dataset{ext}")

    _ensure_parent(output_path)

    try:
        dfs = [_load_with_polars(p) for p in source_paths]
    except Exception as e:
        return {"error": f"Failed to load sources: {str(e)}"}

    # Pre-fusion stats: total rows and mean per-source null percentage.
    rows_before = sum(len(df) for df in dfs)
    null_before = sum(_compute_null_ratio(df) for df in dfs) / len(dfs)

    strategy = (strategy or "concat").lower()
    how = (how or "inner").lower()
    conflict_renames = []

    try:
        if strategy == "concat":
            fused = _concat_polars(dfs, source_names)
        elif strategy == "join":
            if not join_on:
                return {"error": "join_on is required when strategy='join'"}
            join_keys = [join_on] if isinstance(join_on, str) else list(join_on)
            # Validate every join key exists in every source before joining.
            for key in join_keys:
                for idx, df in enumerate(dfs):
                    if key not in df.columns:
                        return {"error": f"Join key '{key}' missing in source {source_paths[idx]}"}
            fused, conflict_renames = _join_polars(dfs, source_names, join_keys, how)
        else:
            return {"error": f"Unknown strategy: {strategy}. Use concat or join."}

        duplicates_removed = 0
        if dedup:
            before = len(fused)
            fused = fused.unique(maintain_order=True)
            duplicates_removed = before - len(fused)

        # Leakage check runs *after* dedup, so exact duplicate rows removed
        # above will not be reported again here.
        leakage = None
        if leakage_check:
            leakage = _leakage_report(fused, source_col="_vesper_source", id_col=id_column)

        _write_with_polars(fused, output_path, output_format, compression)

        preview_path = None
        if preview:
            preview_path = os.path.splitext(output_path)[0] + "_preview.csv"
            fused.head(min(preview_rows, len(fused))).write_csv(preview_path)

        quality_report = None
        if run_quality_after:
            # Delegates to the external quality_engine.py subprocess; its
            # failure is reported inside quality_report, not as a fusion error.
            quality_report = _run_quality_engine(output_path)

        rows_after = len(fused)
        null_after = _compute_null_ratio(fused)

        return {
            "success": True,
            "output_path": output_path,
            "preview_path": preview_path,
            "stats": {
                "sources_count": len(source_paths),
                "rows_before": rows_before,
                "rows_after": rows_after,
                "columns_after": len(fused.columns),
                "duplicates_removed": duplicates_removed,
                "null_ratio_before": round(null_before, 3),
                "null_ratio_after": round(null_after, 3),
                "null_delta": round(null_after - null_before, 3),
                "conflict_renames": conflict_renames,
            },
            "quality_report": quality_report,
            "leakage_report": leakage,
        }

    except Exception as e:
        return {"error": f"Fusion failed: {str(e)}"}
329
-
330
-
331
def main():
    """CLI entry point: fusion_engine.py <sources_json> <output_path> [config_json].

    Prints a single JSON document to stdout (result or {"error": ...})
    and exits non-zero on usage errors or unexpected failures.
    """
    argv = sys.argv
    if len(argv) < 3:
        print(json.dumps({
            "error": "Usage: fusion_engine.py <sources_json> <output_path> [config_json]"
        }))
        sys.exit(1)

    try:
        sources = json.loads(argv[1])
        config = json.loads(argv[3]) if len(argv) > 3 else {}

        result = fuse_datasets(
            sources=sources,
            output_path=argv[2],
            strategy=config.get("strategy", "concat"),
            join_on=config.get("join_on"),
            how=config.get("how", "inner"),
            dedup=config.get("dedup", True),
            run_quality_after=config.get("run_quality_after", True),
            leakage_check=config.get("leakage_check", True),
            output_format=config.get("output_format", "feather"),
            compression=config.get("compression", "lz4"),
            preview=config.get("preview", True),
            preview_rows=config.get("preview_rows", 500),
            id_column=config.get("id_column"),
        )

        print(json.dumps(result))
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -1,106 +0,0 @@
1
import sys
import json
import argparse
import urllib.request
import urllib.parse
import urllib.error
import os
from datetime import datetime
8
-
9
- GITHUB_API_URL = "https://api.github.com/search/repositories"
10
-
11
def search_github(query: str, limit: int = 10):
    """Search GitHub for dataset repositories.

    Queries the GitHub repository-search API for repos tagged with the
    "dataset" topic and more than 5 stars, sorted by stars descending,
    and maps each hit onto the Vesper metadata schema.

    Parameters:
        query: free-text search terms.
        limit: maximum number of results (passed as per_page).

    Returns:
        A list of metadata dicts on success, or a single dict with an
        "error" key on failure (rate limiting, HTTP errors, etc.).

    Bug fix: `item.get("forks_count") * 10` raised TypeError when the
    field was absent (None); counts now default to 0.
    """
    try:
        # Refine the user query: restrict to repos tagged topic:dataset
        # with > 5 stars to ensure some relevance.
        refined_query = f"{query} topic:dataset stars:>5"

        params = {
            "q": refined_query,
            "sort": "stars",
            "order": "desc",
            "per_page": limit
        }
        url = f"{GITHUB_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        # GitHub rejects requests without a User-Agent header.
        req.add_header("User-Agent", "Vesper-Dataset-Search")
        # Authenticated requests get a much higher rate limit.
        token = os.environ.get("GITHUB_TOKEN")
        if token:
            req.add_header("Authorization", f"token {token}")

        with urllib.request.urlopen(req) as response:
            data = json.load(response)

        results = []
        for item in data.get('items', []):
            # Map GitHub fields to Vesper schema; repo_id is "owner/name".
            repo_id = item.get("full_name")
            stars = item.get("stargazers_count") or 0
            forks = item.get("forks_count") or 0

            # Simple licensing map.
            license_info = item.get("license") or {}
            license_key = license_info.get("key", "unknown")
            license_category = "safe" if license_key in ["mit", "apache-2.0", "cc0-1.0", "bsd-3-clause"] else "unknown"

            results.append({
                "id": f"github:{repo_id}",
                "source": "github",
                "name": item.get("name"),
                "description": item.get("description") or "No description provided.",
                "downloads": forks * 10,  # Proxy estimation
                "likes": stars,
                "stars": stars,
                "last_updated": item.get("updated_at"),
                "quality_score": min(100, 50 + stars),  # Rough heuristic
                "license": {
                    "id": license_key,
                    "name": license_info.get("name", "Unknown"),
                    "category": license_category,
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": item.get("topics", []),
                "total_examples": 0,  # Unknown without drilling deeper
                "is_safe_source": True,  # GitHub is generally safe code, content varies
                "is_structured": False,  # Often contains code + data
                "metadata_url": item.get("html_url"),
                "domain": "general"
            })

        return results

    except urllib.error.HTTPError as e:
        if e.code == 403:
            return {"error": "Rate limit exceeded. Set GITHUB_TOKEN environment variable."}
        return {"error": f"HTTP Error {e.code}: {e.reason}"}
    except Exception as e:
        return {"error": str(e)}
92
-
93
def main():
    """Parse CLI arguments and dispatch to the requested adapter action."""
    cli = argparse.ArgumentParser(description="GitHub Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    opts = cli.parse_args()

    if opts.action == "search":
        print(json.dumps(search_github(opts.query, opts.limit)))


if __name__ == "__main__":
    main()