vesper-wizard 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +99 -21
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,69 @@
1
+ import * as crypto from "crypto";
2
/**
 * Removes exact and fuzzy duplicate records during dataset fusion.
 *
 * Exact duplicates are detected via an MD5 hash of the JSON-serialized
 * record (non-cryptographic use, speed matters). Fuzzy duplicates are
 * detected by Jaccard similarity of word-token sets over the configured
 * text columns.
 */
export class Deduplicator {
    config;
    // MD5 hashes of every record seen so far (exact dedupe).
    seenHashes = new Set();
    // column -> array of JSON-serialized token sets (fuzzy dedupe).
    seenTexts = new Map();
    constructor(config) {
        this.config = config;
    }
    /**
     * Checks if a record is an exact duplicate based on all columns.
     * Side effect: registers the record's hash when it is new.
     */
    isExactDuplicate(record) {
        if (!this.config.dedupe_config.exact)
            return false;
        const hash = crypto
            .createHash("md5")
            .update(JSON.stringify(record))
            .digest("hex");
        if (this.seenHashes.has(hash)) {
            return true;
        }
        this.seenHashes.add(hash);
        return false;
    }
    /**
     * Checks if a record is a fuzzy duplicate based on configured columns.
     *
     * Fix: token sets are only committed to the store once the whole record
     * is known to be unique, so a record rejected on a later column no
     * longer pollutes the store with tokens from its earlier columns.
     */
    isFuzzyDuplicate(record) {
        if (!this.config.dedupe_config.fuzzy)
            return false;
        if (!this.config.dedupe_config.fuzzy_columns.length)
            return false;
        const pending = [];
        for (const col of this.config.dedupe_config.fuzzy_columns) {
            const text = String(record[col] ?? "");
            if (!text || text.length < 10)
                continue; // Skip short/empty values
            const tokens = this.tokenize(text);
            if (this.isSimilar(col, tokens, false)) {
                return true;
            }
            pending.push([col, tokens]);
        }
        // Record is unique: remember its tokens for future comparisons.
        // NOTE: linear scan per stored record (O(N^2) worst case); an LSH
        // index would be needed for very large datasets.
        for (const [col, tokens] of pending) {
            this.commitTokens(col, tokens);
        }
        return false;
    }
    /** Lower-cases, splits on whitespace, and drops tokens of length <= 2. */
    tokenize(text) {
        return new Set(text.toLowerCase().split(/\s+/).filter(t => t.length > 2));
    }
    /**
     * Returns true when `tokens` is Jaccard-similar (>= fuzzy_threshold)
     * to any token set previously stored for `column`.
     * @param commit when true (default, backward compatible) the tokens
     *               are stored on a miss, matching the original behavior.
     */
    isSimilar(column, tokens, commit = true) {
        const stored = this.seenTexts.get(column) ?? [];
        for (const existingSerialized of stored) {
            const existing = new Set(JSON.parse(existingSerialized));
            const intersection = new Set([...tokens].filter(t => existing.has(t)));
            const union = new Set([...tokens, ...existing]);
            // Guard against 0/0 -> NaN when both token sets are empty.
            if (union.size === 0)
                continue;
            const jaccard = intersection.size / union.size;
            if (jaccard >= this.config.dedupe_config.fuzzy_threshold) {
                return true;
            }
        }
        if (commit) {
            this.commitTokens(column, tokens);
        }
        return false;
    }
    /** Stores a token set for future fuzzy comparisons on `column`. */
    commitTokens(column, tokens) {
        if (!this.seenTexts.has(column)) {
            this.seenTexts.set(column, []);
        }
        this.seenTexts.get(column).push(JSON.stringify([...tokens]));
    }
}
@@ -0,0 +1,69 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
/**
 * Bridges to the Python fusion engine (fusion_engine.py) via a child process.
 *
 * The script is located by probing, in priority order:
 *   1. ~/.vesper/python/fusion_engine.py   (installed data root)
 *   2. <buildDir>/python/fusion_engine.py
 *   3. <buildDir>/../src/python/fusion_engine.py
 *   4. <buildDir>/../python/fusion_engine.py
 * falling back to (1) when none exists on disk.
 */
export class DataFusionEngine {
    pythonPath = "python";
    scriptPath;
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "fusion_engine.py"),
            path.resolve(buildDir, "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "python", "fusion_engine.py"),
        ];
        this.scriptPath = candidates.find(p => fs.existsSync(p)) ?? candidates[0];
        if (process.platform === "win32") {
            // Windows installs commonly expose the interpreter via the `py` launcher.
            this.pythonPath = "py";
        }
    }
    /**
     * Fuses several datasets by delegating to the Python engine.
     * @param {string[]} sourcePaths - at least two dataset paths
     * @param {string} outputPath - where the fused dataset is written
     * @param {object} [options] - passed through to the Python script as JSON
     * @returns {Promise<object>} parsed JSON result from the script
     * @throws rejects when fewer than 2 sources are given, the interpreter
     *         cannot be spawned, the script exits non-zero, or its output
     *         is not valid JSON / reports an error.
     */
    async fuse(sourcePaths, outputPath, options = {}) {
        return new Promise((resolve, reject) => {
            if (!Array.isArray(sourcePaths) || sourcePaths.length < 2) {
                reject(new Error("At least 2 source paths are required for fusion"));
                return;
            }
            const args = [
                this.scriptPath,
                JSON.stringify(sourcePaths),
                outputPath,
                JSON.stringify(options),
            ];
            const processRef = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            processRef.stdout.on("data", (data) => (stdout += data.toString()));
            processRef.stderr.on("data", (data) => (stderr += data.toString()));
            // Fix: without this handler a missing/unlaunchable interpreter
            // raised an unhandled 'error' event and the promise never settled.
            processRef.on("error", (err) => {
                reject(new Error(`Failed to start fusion engine: ${err.message}`, { cause: err }));
            });
            processRef.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Fusion failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                        return;
                    }
                    resolve(result);
                }
                catch (e) {
                    reject(new Error(`Failed to parse fusion output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
@@ -0,0 +1,39 @@
1
/**
 * Maps raw label values from heterogeneous datasets onto a unified scheme.
 */
export class LabelHarmonizer {
    config;
    constructor(config) {
        this.config = config;
    }
    /**
     * Maps a raw label value to a unified canonical value.
     *
     * 1. Applies the explicit `label_map` — own keys only. Fix: the
     *    original used the `in` operator, which also matched inherited
     *    keys (e.g. a label of "toString" would map to
     *    Object.prototype.toString).
     * 2. Optionally collapses multi-class labels to binary via
     *    `multi_to_binary` (case-insensitive membership in positive_classes).
     */
    harmonize(label) {
        let harmonized = label;
        if (this.config.label_map && Object.hasOwn(this.config.label_map, label)) {
            harmonized = this.config.label_map[label];
        }
        if (this.config.multi_to_binary) {
            const { positive_classes, positive_label, negative_label } = this.config.multi_to_binary;
            const isPositive = positive_classes.some((pc) => String(pc).toLowerCase() === String(harmonized).toLowerCase());
            return isPositive ? positive_label : negative_label;
        }
        return harmonized;
    }
    /**
     * Flags extreme minority classes (< 5% of total) in a label distribution.
     * @param {Record<string, number>} distribution - label -> count
     * @returns {string[]} human-readable warnings (empty when total is 0)
     */
    checkBalance(distribution) {
        const warnings = [];
        const total = Object.values(distribution).reduce((a, b) => a + b, 0);
        if (total === 0)
            return warnings;
        for (const [label, count] of Object.entries(distribution)) {
            const pct = count / total;
            if (pct < 0.05) {
                warnings.push(`Extreme minority class detected: "${label}" (${(pct * 100).toFixed(1)}%)`);
            }
        }
        return warnings;
    }
}
@@ -0,0 +1,86 @@
1
+ import { SchemaAligner } from "./aligner.js";
2
+ import { LabelHarmonizer } from "./harmonizer.js";
3
+ import { Deduplicator } from "./deduplicator.js";
4
/**
 * Coordinates the full fusion pipeline: schema alignment, label
 * harmonization, and (exact + fuzzy) deduplication.
 */
export class FusionOrchestrator {
    config;
    aligner;
    harmonizer;
    deduplicator;
    constructor(config) {
        this.config = config;
        this.aligner = new SchemaAligner(config);
        this.harmonizer = new LabelHarmonizer(config);
        this.deduplicator = new Deduplicator(config);
    }
    /**
     * Fuses multiple datasets into one.
     * @param datasets Metadata of datasets to fuse
     * @param dataRecords Actual records from all datasets (flattened), as
     *        { datasetId, record } pairs
     * @returns { success, output_path, stats, warnings }
     */
    async fuse(datasets, dataRecords) {
        const stats = {
            total_input_rows: dataRecords.length,
            total_output_rows: 0,
            duplicates_removed: 0,
            fuzzy_duplicates_removed: 0,
            schema_overlaps: [],
            label_distribution: {}
        };
        const warnings = [];
        const alignmentMap = this.aligner.align(datasets);
        // Collect every canonical column produced by the alignment.
        const allCanonicalCols = new Set();
        for (const dsId in alignmentMap) {
            for (const canonical of Object.values(alignmentMap[dsId])) {
                allCanonicalCols.add(canonical);
            }
        }
        stats.schema_overlaps = Array.from(allCanonicalCols);
        const fusedData = [];
        for (const { datasetId, record } of dataRecords) {
            const colMap = alignmentMap[datasetId];
            if (!colMap) {
                warnings.push(`No alignment found for dataset ${datasetId}`);
                continue;
            }
            // 1. Rename source columns to their canonical names; columns
            //    without a mapping are dropped.
            const alignedRecord = {};
            for (const [sourceCol, val] of Object.entries(record)) {
                const canonical = colMap[sourceCol];
                if (canonical) {
                    alignedRecord[canonical] = val;
                }
            }
            // 2. Harmonize the target label, when present.
            if (this.config.target_column && alignedRecord[this.config.target_column] !== undefined) {
                alignedRecord[this.config.target_column] = this.harmonizer.harmonize(alignedRecord[this.config.target_column]);
            }
            // 3. Drop exact, then fuzzy, duplicates.
            if (this.deduplicator.isExactDuplicate(alignedRecord)) {
                stats.duplicates_removed++;
                continue;
            }
            if (this.deduplicator.isFuzzyDuplicate(alignedRecord)) {
                stats.fuzzy_duplicates_removed++;
                continue;
            }
            // Update distribution ONLY for kept records.
            if (this.config.target_column && alignedRecord[this.config.target_column] !== undefined) {
                const labelStr = String(alignedRecord[this.config.target_column]);
                stats.label_distribution[labelStr] = (stats.label_distribution[labelStr] || 0) + 1;
            }
            fusedData.push(alignedRecord);
        }
        // Fix: removed a leftover no-op (`fusedData.push()` whose return value
        // was assigned and immediately overwritten on the next line).
        stats.total_output_rows = fusedData.length;
        // Add class-balance warnings derived from the kept-record distribution.
        warnings.push(...this.harmonizer.checkBalance(stats.label_distribution));
        return {
            success: true,
            output_path: "fused_dataset.json", // Placeholder
            stats,
            warnings
        };
    }
}
@@ -0,0 +1 @@
1
// Compiled marker for a type-declaration-only module: the empty export
// makes this file an ES module while exporting nothing at runtime.
export {};
@@ -0,0 +1,409 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import http from "http";
4
+ import https from "https";
5
+ import { HuggingFaceScraper } from "../metadata/scraper.js";
6
+ export class UnifiedDatasetGateway {
7
+ deps;
8
+ constructor(deps) {
9
+ this.deps = deps;
10
+ }
11
/**
 * Reports availability and auth posture for every provider the unified
 * gateway knows about.
 *
 * Availability is derived purely from environment/dependency state:
 *  - Hugging Face / OpenML: always available (public access).
 *  - Kaggle: available only with server-managed credentials (dataIngestor).
 *  - data.world: available only with a server-managed token.
 *  - S3: always available, keyless public-object download/info only.
 *  - BigQuery: scaffolded; available only when GOOGLE_* env vars are set.
 *
 * @param includeUnavailable when false, unavailable providers are filtered out.
 * @returns array of provider status descriptors.
 */
getProviderStatuses(includeUnavailable = true) {
    const hasHfToken = !!(process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN);
    const hasKaggle = this.deps.dataIngestor.hasKaggleCredentials();
    const hasDataWorld = this.deps.hasDataWorldToken();
    const hasBigQuery = !!(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GOOGLE_CLOUD_PROJECT);
    const providers = [
        {
            source: "huggingface",
            display_name: "Hugging Face",
            available: true,
            auth_mode: hasHfToken ? "public-or-server-managed" : "public",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasHfToken
                ? ["Public datasets are open by default. Gated/private repos can be accessed via the server-managed HF token when configured."]
                : ["Public datasets work without any user key. Gated/private repos need an operator or user token."],
        },
        {
            source: "openml",
            display_name: "OpenML",
            available: true,
            auth_mode: "public",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: ["OpenML is exposed as a keyless public provider through the gateway."],
        },
        {
            source: "kaggle",
            display_name: "Kaggle",
            available: hasKaggle,
            auth_mode: hasKaggle ? "server-managed" : "not-configured",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasKaggle
                ? ["Kaggle is available through server-managed credentials. End users do not need to pass their own key."]
                : ["Kaggle support exists, but no server-managed credentials are configured yet."],
        },
        {
            source: "dataworld",
            display_name: "data.world",
            available: hasDataWorld,
            auth_mode: hasDataWorld ? "server-managed" : "not-configured",
            supported_operations: ["discover", "download", "info"],
            requires_end_user_key: false,
            notes: hasDataWorld
                ? ["data.world is available through server-managed credentials."]
                : ["data.world support exists, but no server-managed token is configured yet."],
        },
        {
            source: "s3",
            display_name: "Amazon S3",
            available: true,
            auth_mode: "public-or-server-managed",
            supported_operations: ["download", "info"],
            requires_end_user_key: false,
            notes: ["Supports keyless download of public S3 objects via s3://bucket/key or HTTPS S3 URLs.", "Bucket listing and search are intentionally not exposed."],
        },
        {
            source: "bigquery",
            display_name: "BigQuery",
            available: hasBigQuery,
            auth_mode: hasBigQuery ? "server-managed" : "not-configured",
            supported_operations: ["info"],
            requires_end_user_key: false,
            notes: hasBigQuery
                ? ["BigQuery is reserved for operator-managed connectors. Query execution is not implemented in this patch."]
                : ["BigQuery is scaffolded in the gateway contract, but no server-managed GCP configuration is present."],
        },
    ];
    return includeUnavailable ? providers : providers.filter(provider => provider.available);
}
82
/**
 * Searches for datasets across one or all configured providers.
 *
 * Each result is persisted to the metadata store on a best-effort basis,
 * then results are de-duplicated by `source:id`, ranked via rankDataset,
 * and truncated to `limit`. Individual provider failures are recorded in
 * `notes` instead of being thrown.
 *
 * @param options { query, source?="auto", limit?=10, publicOnly?=true }
 * @returns { query, requested_source, providers_tried, notes, results }
 * @throws when `query` is empty
 */
async discover(options) {
    const query = String(options.query || "").trim();
    const requestedSource = options.source || "auto";
    const limit = Math.max(1, Number(options.limit || 10));
    const publicOnly = options.publicOnly !== false;
    if (!query) {
        throw new Error("query is required");
    }
    const notes = [];
    const providers = this.resolveDiscoverSources(requestedSource, publicOnly, notes);
    // Over-fetch per provider (2x the even share, min 5) so the merged,
    // deduped ranking still has enough candidates to fill `limit`.
    const perSourceLimit = Math.max(5, Math.ceil(limit / Math.max(providers.length, 1)) * 2);
    const allResults = [];
    for (const provider of providers) {
        try {
            const partial = await this.discoverFromSource(provider, query, perSourceLimit);
            for (const dataset of partial) {
                try {
                    this.deps.metadataStore.saveDataset(dataset);
                }
                catch {
                    // best-effort metadata persistence
                }
                allResults.push(dataset);
            }
        }
        catch (error) {
            // A failing provider degrades to a note; other providers still run.
            notes.push(`${provider}: ${(error?.message || error || "Unknown provider error").toString()}`);
        }
    }
    // De-dupe by full `source:id` key — the same id can exist on two providers.
    const deduped = new Map();
    for (const dataset of allResults) {
        deduped.set(`${dataset.source}:${dataset.id}`, dataset);
    }
    const results = Array.from(deduped.values())
        .sort((a, b) => this.rankDataset(b) - this.rankDataset(a))
        .slice(0, limit);
    return {
        query,
        requested_source: requestedSource,
        providers_tried: providers,
        notes,
        results,
    };
}
126
/**
 * Downloads a dataset through the gateway, resolving the provider in order:
 *  1. explicit/prefixed source embedded in the reference,
 *  2. a match in the local metadata store,
 *  3. a fresh auto-discovery (first result wins).
 * S3 objects use the keyless download path; BigQuery download is rejected.
 *
 * @param options { datasetId, source?="auto", targetDir? }
 * @returns { dataset_id, resolved_source, local_path, copied_to?, notes }
 * @throws when the reference is empty, the provider cannot be resolved, or
 *         required server-managed credentials are missing.
 */
async download(options) {
    const requested = String(options.datasetId || "").trim();
    if (!requested) {
        throw new Error("dataset_id is required");
    }
    const notes = [];
    const resolved = this.resolveDatasetReference(requested, options.source || "auto");
    if (resolved.source === "bigquery") {
        throw new Error("BigQuery gateway support is scaffolded for operator-managed connectors, but query/download execution is not implemented yet.");
    }
    // S3 bypasses the ingestor entirely: public objects are fetched directly.
    if (resolved.source === "s3") {
        const localPath = await this.downloadPublicS3Object(resolved.datasetId, options.targetDir);
        return {
            dataset_id: requested,
            resolved_source: "s3",
            local_path: localPath,
            notes: ["Downloaded via the keyless S3 gateway path."],
        };
    }
    let source = this.toIngestSource(resolved.source);
    let datasetId = resolved.datasetId;
    // Fallback 1: a previously scraped dataset in the metadata store.
    if (!source) {
        const metadataMatch = this.lookupKnownDataset(requested);
        const metadataSource = this.toIngestSource(metadataMatch?.source);
        if (metadataMatch && metadataSource) {
            source = metadataSource;
            datasetId = metadataMatch.id;
        }
    }
    // Fallback 2: auto-discover across providers and take the top hit.
    if (!source) {
        const discovery = await this.discover({ query: requested, source: "auto", limit: 1, publicOnly: false });
        if (discovery.results.length === 0) {
            throw new Error(`Unable to resolve provider for '${requested}'. Run unified_dataset_api with operation='discover' first or pass an explicit source.`);
        }
        const discoveredSource = this.toIngestSource(discovery.results[0].source);
        if (!discoveredSource) {
            throw new Error(`Resolved provider '${discovery.results[0].source}' cannot be downloaded through the dataset ingestor.`);
        }
        source = discoveredSource;
        datasetId = discovery.results[0].id;
        notes.push(`Auto-resolved provider to ${source}.`);
    }
    // Credentialed providers require server-managed secrets to be configured.
    if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
        throw new Error("Kaggle is configured as a gateway source, but no server-managed credentials are available.");
    }
    if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
        throw new Error("data.world is configured as a gateway source, but no server-managed token is available.");
    }
    const localPath = await this.deps.dataIngestor.ensureData(datasetId, source, () => undefined);
    let copiedTo;
    if (options.targetDir) {
        copiedTo = this.copyDownloadOutput(localPath, options.targetDir);
        notes.push(`Copied dataset output to ${copiedTo}.`);
    }
    return {
        dataset_id: datasetId,
        resolved_source: source,
        local_path: localPath,
        copied_to: copiedTo,
        notes,
    };
}
188
/**
 * Returns metadata for a dataset reference without downloading it.
 *
 * Resolution order: local metadata store (raw and prefix-stripped id),
 * then synthetic info for S3/BigQuery references, then a small discovery
 * query — an exact id match is preferred, otherwise the first result.
 *
 * @param options { datasetId, source?="auto", publicOnly? }
 * @returns { dataset_id, resolved_source?, notes, dataset? }
 * @throws when the reference is empty
 */
async info(options) {
    const requested = String(options.datasetId || "").trim();
    if (!requested) {
        throw new Error("dataset_id is required");
    }
    const resolved = this.resolveDatasetReference(requested, options.source || "auto");
    // Try the local store first, with both the raw and the resolved id.
    const metadataMatch = this.lookupKnownDataset(requested) || (resolved.datasetId !== requested ? this.lookupKnownDataset(resolved.datasetId) : undefined);
    if (metadataMatch) {
        return {
            dataset_id: requested,
            resolved_source: metadataMatch.source,
            notes: [],
            dataset: metadataMatch,
        };
    }
    // S3 info is synthesized from the URI — there is no catalog to query.
    if (resolved.source === "s3") {
        return {
            dataset_id: requested,
            resolved_source: "s3",
            notes: ["S3 info is derived from the object URI. Discovery/listing is intentionally not supported."],
            dataset: {
                id: requested,
                source: "s3",
                uri: this.toS3HttpsUrl(resolved.datasetId),
            },
        };
    }
    if (resolved.source === "bigquery") {
        return {
            dataset_id: requested,
            resolved_source: "bigquery",
            notes: ["BigQuery is reserved for operator-managed connectors. Detailed inspection is not implemented in this patch."],
        };
    }
    // Last resort: a small discovery query against the resolved provider.
    const discovery = await this.discover({
        query: resolved.datasetId,
        source: resolved.source || "auto",
        limit: 5,
        publicOnly: options.publicOnly !== false,
    });
    const exact = discovery.results.find(dataset => this.matchesDatasetReference(dataset, requested));
    return {
        dataset_id: requested,
        resolved_source: exact?.source,
        notes: discovery.notes,
        dataset: exact || discovery.results[0],
    };
}
236
/**
 * Runs a discovery query against a single provider.
 * S3 and BigQuery deliberately reject discovery through the gateway.
 *
 * @param source provider key (huggingface | openml | kaggle | dataworld | ...)
 * @param query free-text search string
 * @param limit maximum results requested from the provider
 * @returns array of dataset metadata records
 * @throws for non-searchable or unknown providers
 */
async discoverFromSource(source, query, limit) {
    switch (source) {
        case "huggingface":
            return await new HuggingFaceScraper().scrape(limit, true, query);
        case "openml":
            return await this.deps.openmlSource.discover(query, limit);
        case "kaggle":
            return await this.deps.kaggleSource.discover(query, limit);
        case "dataworld":
            return await this.deps.dataworldSource.discover(query, limit);
        case "s3":
            throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
        case "bigquery":
            throw new Error("BigQuery discovery is not implemented in the unified gateway.");
        default:
            throw new Error(`Unsupported provider: ${source}`);
    }
}
254
/**
 * Decides which providers a discover call should query.
 *
 * An explicit source is validated (credentials present, discover supported)
 * and used alone. In auto mode, the keyless providers (Hugging Face,
 * OpenML) are always queried; Kaggle and data.world are added only when
 * publicOnly is false AND server-managed credentials exist, with a note
 * appended to `notes` when they are skipped for missing credentials.
 *
 * @param source explicit provider key or "auto"
 * @param publicOnly when true, restrict auto mode to keyless providers
 * @param notes mutated in place with skip explanations
 * @returns ordered list of provider keys to query
 * @throws for an explicit source that is unavailable or non-searchable
 */
resolveDiscoverSources(source, publicOnly, notes) {
    if (source !== "auto") {
        if (source === "kaggle" && !this.deps.dataIngestor.hasKaggleCredentials()) {
            throw new Error("Kaggle requires server-managed credentials and none are configured.");
        }
        if (source === "dataworld" && !this.deps.hasDataWorldToken()) {
            throw new Error("data.world requires a server-managed token and none is configured.");
        }
        if (source === "s3" || source === "bigquery") {
            throw new Error(`${source} does not currently support discover operation through the gateway.`);
        }
        return [source];
    }
    const providers = ["huggingface", "openml"];
    if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
        providers.push("kaggle");
    }
    else if (!publicOnly) {
        notes.push("Kaggle skipped because no server-managed credentials are configured.");
    }
    if (!publicOnly && this.deps.hasDataWorldToken()) {
        providers.push("dataworld");
    }
    else if (!publicOnly) {
        notes.push("data.world skipped because no server-managed token is configured.");
    }
    return providers;
}
282
+ resolveDatasetReference(datasetId, source) {
283
+ const trimmed = datasetId.trim();
284
+ if (source !== "auto") {
285
+ if (source === "s3") {
286
+ return { source, datasetId: trimmed };
287
+ }
288
+ return { source, datasetId: this.stripSourcePrefix(trimmed, source) };
289
+ }
290
+ if (/^s3:\/\//i.test(trimmed) || /^https?:\/\/[^\s]+\.s3[.-][^\s]+/i.test(trimmed) || /^https?:\/\/s3\.[^\s]+amazonaws\.com\//i.test(trimmed)) {
291
+ return { source: "s3", datasetId: trimmed };
292
+ }
293
+ if (/^kaggle:/i.test(trimmed))
294
+ return { source: "kaggle", datasetId: trimmed.replace(/^kaggle:/i, "") };
295
+ if (/^(huggingface|hf):/i.test(trimmed))
296
+ return { source: "huggingface", datasetId: trimmed.replace(/^(huggingface|hf):/i, "") };
297
+ if (/^openml:/i.test(trimmed))
298
+ return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
299
+ if (/^dataworld:/i.test(trimmed))
300
+ return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
301
+ if (/^bigquery:/i.test(trimmed))
302
+ return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
303
+ if (/^\d+$/.test(trimmed))
304
+ return { source: "openml", datasetId: trimmed };
305
+ if (trimmed.includes("/") && !trimmed.includes(":"))
306
+ return { source: "huggingface", datasetId: trimmed };
307
+ return { datasetId: trimmed };
308
+ }
309
+ stripSourcePrefix(datasetId, source) {
310
+ if (source === "huggingface") {
311
+ return datasetId.replace(/^(huggingface|hf):/i, "");
312
+ }
313
+ return datasetId.replace(new RegExp(`^${source}:`, "i"), "");
314
+ }
315
+ lookupKnownDataset(datasetId) {
316
+ const candidates = new Set([
317
+ datasetId,
318
+ datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
319
+ ]);
320
+ for (const candidate of candidates) {
321
+ const dataset = this.deps.metadataStore.getDataset(candidate);
322
+ if (dataset)
323
+ return dataset;
324
+ }
325
+ return undefined;
326
+ }
327
+ matchesDatasetReference(dataset, requested) {
328
+ const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
329
+ const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
330
+ return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
331
+ }
332
+ toIngestSource(source) {
333
+ if (source === "huggingface" || source === "openml" || source === "kaggle" || source === "dataworld") {
334
+ return source;
335
+ }
336
+ return undefined;
337
+ }
338
+ rankDataset(dataset) {
339
+ const relevance = Number(dataset.relevance_score || 0) * 1000;
340
+ const quality = Number(dataset.quality_score || 0) * 100;
341
+ const downloads = Number(dataset.downloads || 0);
342
+ return relevance + quality + downloads;
343
+ }
344
+ copyDownloadOutput(localPath, targetDir) {
345
+ const resolvedTargetDir = path.resolve(targetDir);
346
+ fs.mkdirSync(resolvedTargetDir, { recursive: true });
347
+ const destination = path.join(resolvedTargetDir, path.basename(localPath));
348
+ fs.cpSync(localPath, destination, { recursive: true, force: true });
349
+ return destination;
350
+ }
351
+ async downloadPublicS3Object(datasetId, targetDir) {
352
+ const httpsUrl = this.toS3HttpsUrl(datasetId);
353
+ const parsed = new URL(httpsUrl);
354
+ const fileName = path.basename(parsed.pathname) || "s3-object.bin";
355
+ const outputDir = path.resolve(targetDir || path.join(this.deps.dataRoot, "data", "raw"));
356
+ fs.mkdirSync(outputDir, { recursive: true });
357
+ const outputPath = path.join(outputDir, fileName);
358
+ await this.downloadToFile(httpsUrl, outputPath);
359
+ this.deps.metadataStore.registerDownload(datasetId, outputPath, "completed", fs.statSync(outputPath).size);
360
+ return outputPath;
361
+ }
362
+ toS3HttpsUrl(datasetId) {
363
+ if (/^https?:\/\//i.test(datasetId)) {
364
+ return datasetId;
365
+ }
366
+ const match = datasetId.match(/^s3:\/\/([^/]+)\/(.+)$/i);
367
+ if (!match) {
368
+ throw new Error("S3 source expects an s3://bucket/key object reference or a direct HTTPS S3 URL.");
369
+ }
370
+ const bucket = match[1];
371
+ const objectKey = match[2].split("/").map(encodeURIComponent).join("/");
372
+ return `https://${bucket}.s3.amazonaws.com/${objectKey}`;
373
+ }
374
+ async downloadToFile(url, destination) {
375
+ await new Promise((resolve, reject) => {
376
+ const transport = url.startsWith("https:") ? https : http;
377
+ const request = transport.get(url, response => {
378
+ const statusCode = response.statusCode || 0;
379
+ const location = response.headers.location;
380
+ if (statusCode >= 300 && statusCode < 400 && location) {
381
+ response.resume();
382
+ this.downloadToFile(location, destination).then(resolve).catch(reject);
383
+ return;
384
+ }
385
+ if (statusCode < 200 || statusCode >= 300) {
386
+ response.resume();
387
+ reject(new Error(`Download failed with status ${statusCode}`));
388
+ return;
389
+ }
390
+ const file = fs.createWriteStream(destination);
391
+ response.pipe(file);
392
+ file.on("finish", () => {
393
+ file.close();
394
+ resolve();
395
+ });
396
+ file.on("error", error => {
397
+ try {
398
+ file.close();
399
+ }
400
+ catch {
401
+ // no-op
402
+ }
403
+ reject(error);
404
+ });
405
+ });
406
+ request.on("error", reject);
407
+ });
408
+ }
409
+ }