vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,82 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
- import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
5
/**
 * Converts dataset files to other formats by running the Python
 * `export_engine.py` script as a child process.
 */
export class DataExporter {
    buildDir;
    scriptPath;
    /**
     * @param {string} [buildDir=process.cwd()] Base directory used to locate
     *   the Python script; also forwarded to the Python runtime helpers.
     */
    constructor(buildDir = process.cwd()) {
        this.buildDir = buildDir;
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Candidate script locations in priority order. When none exists the
        // first one is kept so later failures mention the preferred location.
        const candidates = [
            path.resolve(dataRoot, "python", "export_engine.py"),
            path.resolve(buildDir, "python", "export_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "export_engine.py"),
            path.resolve(buildDir, "..", "python", "export_engine.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
    }
    /**
     * Exports a dataset file to a specified format.
     *
     * @param {string} inputFile Path of the file to convert; must exist.
     * @param {string} outputFile Destination path handed to the script.
     * @param {string} format Target format name handed to the script.
     * @param {object} [options={}] Extra options, passed as a JSON string.
     * @returns {Promise<object>} The JSON object the script prints on stdout.
     * @throws {Error} (as a rejection) when the input file is missing, the
     *   process cannot be started, it exits non-zero, its output is not valid
     *   JSON, or the parsed output carries an `error` field.
     */
    async export(inputFile, outputFile, format, options = {}) {
        // Fail fast: do not prepare a Python environment for a missing input.
        if (!fs.existsSync(inputFile)) {
            throw new Error(`Input file not found: ${inputFile}`);
        }
        const pythonRequirements = [
            { module: "polars", packageName: "polars" },
        ];
        if (format === "feather") {
            pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
        }
        if (format === "tfrecord") {
            pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
        }
        // If ensuring the packages fails, fall back to whatever interpreter
        // resolvePythonCommand yields and let the script report what is missing.
        const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
        return new Promise((resolve, reject) => {
            const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
            const childProcess = spawn(pythonPath, args, {
                env: {
                    ...process.env,
                    PYTHONIOENCODING: "utf-8",
                },
            });
            let stdout = "";
            let stderr = "";
            childProcess.stdout.on("data", (data) => stdout += data.toString());
            childProcess.stderr.on("data", (data) => stderr += data.toString());
            // Without this listener a spawn failure (e.g. interpreter not
            // found) is an unhandled "error" event instead of a rejection.
            childProcess.on("error", (err) => {
                reject(new Error(`Failed to start export process: ${err.message}`, { cause: err }));
            });
            childProcess.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Export failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse export output: ${stdout}`));
                }
            });
        });
    }
}
@@ -1,100 +0,0 @@
1
- import fs from "fs";
2
- import path from "path";
3
- import crypto from "crypto";
4
/**
 * MetadataPackager bundles data files with metadata and quality reports.
 * Follows a simplified Frictionless Data Package standard.
 */
export class MetadataPackager {
    /**
     * Creates a data package in the target directory.
     *
     * Each data file is copied into `targetDir` under its base name, then
     * optional artifacts and a `datapackage.json` manifest are written.
     *
     * @param {string} targetDir Directory to create/populate.
     * @param {Array<{path: string, name: string, format: string}>} dataFiles
     *   Files to include in the package.
     * @param {object} metadata Source of the manifest fields `name`,
     *   `description`, `version`, `license`, `author` and `tags`.
     * @param {{qualityReport?: any, cleaningLog?: any}} [extraArtifacts]
     *   Optional JSON-serialisable artifacts written next to the data.
     * @returns {Promise<object>} `{ success, packagePath, manifest }`; on any
     *   failure `success` is false, `manifest` is null and `error` holds the
     *   message. Errors are reported this way rather than thrown.
     */
    async createPackage(targetDir, dataFiles, metadata, extraArtifacts) {
        try {
            if (!fs.existsSync(targetDir)) {
                fs.mkdirSync(targetDir, { recursive: true });
            }
            const resources = [];
            // Files are flattened to their base name, so two inputs sharing a
            // base name would overwrite each other. Refuse instead of losing data.
            const usedFileNames = new Set();
            // 1. Process Data Files
            for (const file of dataFiles) {
                if (!fs.existsSync(file.path)) {
                    throw new Error(`Data file not found: ${file.path}`);
                }
                const fileName = path.basename(file.path);
                if (usedFileNames.has(fileName)) {
                    throw new Error(`Duplicate data file name in package: ${fileName}`);
                }
                usedFileNames.add(fileName);
                const destPath = path.join(targetDir, fileName);
                // Copy file to package dir
                fs.copyFileSync(file.path, destPath);
                // Hash and size are taken from the copy that ships in the package.
                const fileBuffer = fs.readFileSync(destPath);
                const hash = crypto.createHash("sha256").update(fileBuffer).digest("hex");
                resources.push({
                    name: file.name,
                    path: fileName,
                    format: file.format,
                    mediatype: this.getMediaType(file.format),
                    bytes: fileBuffer.length,
                    hash: `sha256:${hash}`
                });
            }
            // 2. Add Extra Artifacts
            if (extraArtifacts?.qualityReport) {
                const qrPath = path.join(targetDir, "quality_report.json");
                fs.writeFileSync(qrPath, JSON.stringify(extraArtifacts.qualityReport, null, 2));
                resources.push({
                    name: "quality-report",
                    path: "quality_report.json",
                    format: "json"
                });
            }
            if (extraArtifacts?.cleaningLog) {
                const clPath = path.join(targetDir, "cleaning_log.json");
                fs.writeFileSync(clPath, JSON.stringify(extraArtifacts.cleaningLog, null, 2));
                resources.push({
                    name: "cleaning-log",
                    path: "cleaning_log.json",
                    format: "json"
                });
            }
            // 3. Generate Manifest (datapackage.json)
            const manifest = {
                profile: "tabular-data-package",
                // Slug form of the name: lower-cased, whitespace runs -> "-".
                name: metadata.name.toLowerCase().replace(/\s+/g, "-"),
                title: metadata.name,
                description: metadata.description,
                version: metadata.version,
                homepage: "https://github.com/vesper-data",
                license: metadata.license,
                author: metadata.author,
                keywords: metadata.tags,
                created: new Date().toISOString(),
                resources: resources
            };
            const manifestPath = path.join(targetDir, "datapackage.json");
            fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
            return {
                success: true,
                packagePath: targetDir,
                manifest: manifest
            };
        }
        catch (e) {
            return {
                success: false,
                packagePath: targetDir,
                manifest: null,
                error: e.message
            };
        }
    }
    /**
     * Maps a format name (case-insensitive) to a media type.
     *
     * @param {string} format Format name such as "csv" or "parquet".
     * @returns {string} The media type; "application/octet-stream" for
     *   unknown, missing or non-string formats.
     */
    getMediaType(format) {
        switch (String(format ?? "").toLowerCase()) {
            case "csv": return "text/csv";
            case "parquet": return "application/x-parquet";
            case "jsonl": return "application/x-jsonlines";
            case "arrow": return "application/x-apache-arrow-file";
            case "json": return "application/json";
            default: return "application/octet-stream";
        }
    }
}
@@ -1 +0,0 @@
1
- export {};
@@ -1,56 +0,0 @@
1
/**
 * Maps dataset column names onto the canonical names declared in
 * `config.column_aliases` and resolves a common type per canonical column.
 */
export class SchemaAligner {
    config;
    /**
     * @param {object} config Reads `column_aliases` (canonical name -> list of
     *   alias strings) and `type_overrides` (canonical name -> type); both
     *   are treated as empty when absent.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Finds common columns across datasets and creates a mapping to canonical names.
     *
     * @param {Array<{id: string, columns?: Array<{name: string}>}>} datasets
     * @returns {object} `{ [datasetId]: { [sourceColumn]: canonicalName } }`.
     *   Every dataset gets an entry (possibly empty); columns without a
     *   canonical name are left out.
     */
    align(datasets) {
        const alignment = {};
        for (const ds of datasets) {
            const columnMap = {};
            alignment[ds.id] = columnMap;
            if (!ds.columns)
                continue;
            for (const col of ds.columns) {
                const canonicalName = this.getCanonicalName(col.name);
                if (canonicalName) {
                    columnMap[col.name] = canonicalName;
                }
            }
        }
        return alignment;
    }
    /**
     * Resolves the most specific common type for a canonical column.
     *
     * @param {string} canonicalName Canonical column name.
     * @param {string[]} types Type names observed for that column.
     * @returns {string} The configured override if any; otherwise the single
     *   (lower-cased) type when all agree, "number" when "float" or "number"
     *   is among mixed types, and "string" for anything else.
     */
    resolveType(canonicalName, types) {
        const overrides = this.config.type_overrides ?? {};
        // Own-property check: a plain lookup would resolve names such as
        // "constructor" or "toString" through Object.prototype.
        if (Object.prototype.hasOwnProperty.call(overrides, canonicalName) && overrides[canonicalName]) {
            return overrides[canonicalName];
        }
        const uniqueTypes = new Set(types.map(t => t.toLowerCase()));
        if (uniqueTypes.size === 1) {
            return Array.from(uniqueTypes)[0];
        }
        if (uniqueTypes.has("float") || uniqueTypes.has("number")) {
            return "number";
        }
        return "string"; // Default to string for mixed types
    }
    /**
     * Looks up the canonical name for a column, case-insensitively, matching
     * either a canonical name itself or one of its aliases.
     *
     * @param {string} colName Source column name.
     * @returns {string|null} The canonical name, or null when nothing matches.
     */
    getCanonicalName(colName) {
        if (typeof colName !== "string") {
            return null;
        }
        const needle = colName.toLowerCase();
        for (const [canonical, aliases] of Object.entries(this.config.column_aliases ?? {})) {
            // Direct match with the canonical name
            if (canonical.toLowerCase() === needle) {
                return canonical;
            }
            // Match with aliases
            if (Array.isArray(aliases) && aliases.some((a) => String(a).toLowerCase() === needle)) {
                return canonical;
            }
        }
        return null;
    }
}
@@ -1,69 +0,0 @@
1
- import * as crypto from "crypto";
2
/**
 * Stateful duplicate detector: remembers every record it has let through and
 * flags later records that are exact or fuzzy (token-overlap) duplicates.
 * Behaviour is driven by `config.dedupe_config`.
 */
export class Deduplicator {
    config;
    seenHashes = new Set();
    seenTexts = new Map(); // column -> array of token Sets kept so far
    /**
     * @param {object} config Reads `dedupe_config.exact`,
     *   `dedupe_config.fuzzy`, `dedupe_config.fuzzy_columns` and
     *   `dedupe_config.fuzzy_threshold`.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Serialises a value to JSON with object keys sorted at every level, so
     * records that differ only in key order produce the same string.
     *
     * @param {*} value Any JSON-serialisable value.
     * @returns {string} Key-order-independent JSON text.
     */
    static stableStringify(value) {
        return JSON.stringify(value, (_key, val) => {
            if (val === null || typeof val !== "object" || Array.isArray(val)) {
                return val;
            }
            const sorted = Object.entries(val).sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
            return Object.fromEntries(sorted);
        });
    }
    /**
     * Checks if a record is an exact duplicate based on all columns.
     * The first occurrence is remembered and reported as not a duplicate.
     *
     * @param {object} record Record to test.
     * @returns {boolean} True when an equal record was seen before; always
     *   false when exact deduplication is disabled.
     */
    isExactDuplicate(record) {
        if (!this.config.dedupe_config.exact)
            return false;
        // md5 is only a compact fingerprint here, not a security measure.
        const hash = crypto
            .createHash("md5")
            .update(Deduplicator.stableStringify(record))
            .digest("hex");
        if (this.seenHashes.has(hash)) {
            return true;
        }
        this.seenHashes.add(hash);
        return false;
    }
    /**
     * Checks if a record is a fuzzy duplicate based on configured columns.
     *
     * A record is a duplicate when, for any configured column, its token set
     * reaches `fuzzy_threshold` Jaccard similarity with a previously kept
     * value of that column. Values shorter than 10 characters, or with no
     * tokens, are ignored. Tokens are remembered only for records that are
     * NOT duplicates, so dropped records never seed later comparisons.
     * Comparison is linear in the number of kept values per column.
     *
     * @param {object} record Record to test.
     * @returns {boolean} True when the record is a fuzzy duplicate.
     */
    isFuzzyDuplicate(record) {
        const { fuzzy, fuzzy_columns: columns } = this.config.dedupe_config;
        if (!fuzzy)
            return false;
        if (!columns?.length)
            return false;
        const pending = [];
        for (const col of columns) {
            const text = String(record[col] || "");
            if (text.length < 10)
                continue; // Skip short/empty tags
            const tokens = this.tokenize(text);
            if (tokens.size === 0)
                continue; // Nothing to compare; 0/0 similarity is undefined
            if (this.hasSimilar(col, tokens)) {
                return true;
            }
            pending.push([col, tokens]);
        }
        for (const [col, tokens] of pending) {
            this.remember(col, tokens);
        }
        return false;
    }
    /**
     * Splits text into a set of lower-cased, whitespace-separated tokens
     * longer than two characters.
     *
     * @param {string} text
     * @returns {Set<string>}
     */
    tokenize(text) {
        return new Set(text.toLowerCase().split(/\s+/).filter(t => t.length > 2));
    }
    /**
     * Tests a token set against the kept values of a column and, when nothing
     * similar exists, remembers it.
     *
     * @param {string} column Column name.
     * @param {Set<string>} tokens Token set of the candidate value.
     * @returns {boolean} True when a similar value was already kept.
     */
    isSimilar(column, tokens) {
        if (this.hasSimilar(column, tokens)) {
            return true;
        }
        this.remember(column, tokens);
        return false;
    }
    /**
     * Pure lookup: does any kept token set of `column` reach the configured
     * Jaccard threshold against `tokens`?
     *
     * @param {string} column
     * @param {Set<string>} tokens
     * @returns {boolean}
     */
    hasSimilar(column, tokens) {
        const stored = this.seenTexts.get(column);
        if (!stored) {
            return false;
        }
        const threshold = this.config.dedupe_config.fuzzy_threshold;
        for (const existing of stored) {
            let shared = 0;
            for (const token of tokens) {
                if (existing.has(token)) {
                    shared++;
                }
            }
            const unionSize = tokens.size + existing.size - shared;
            if (unionSize > 0 && shared / unionSize >= threshold) {
                return true;
            }
        }
        return false;
    }
    /**
     * Stores a copy of `tokens` as a kept value of `column`.
     *
     * @param {string} column
     * @param {Set<string>} tokens
     */
    remember(column, tokens) {
        if (!this.seenTexts.has(column)) {
            this.seenTexts.set(column, []);
        }
        this.seenTexts.get(column).push(new Set(tokens));
    }
}
@@ -1,69 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
/**
 * Fuses several dataset files into one by running the Python
 * `fusion_engine.py` script as a child process.
 */
export class DataFusionEngine {
    pythonPath = "python";
    scriptPath;
    /**
     * @param {string} [buildDir=process.cwd()] Base directory used to locate
     *   the Python script.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Candidate script locations in priority order. When none exists the
        // first one is kept so later failures mention the preferred location.
        const candidates = [
            path.resolve(dataRoot, "python", "fusion_engine.py"),
            path.resolve(buildDir, "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "fusion_engine.py"),
            path.resolve(buildDir, "..", "python", "fusion_engine.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        // On Windows the "py" launcher is used instead of "python".
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }
    /**
     * Runs the fusion script over the given source files.
     *
     * @param {string[]} sourcePaths At least two input paths; passed to the
     *   script as a JSON array.
     * @param {string} outputPath Destination path handed to the script.
     * @param {object} [options={}] Extra options, passed as a JSON string.
     * @returns {Promise<object>} The JSON object the script prints on stdout.
     * @throws {Error} (as a rejection) when fewer than two sources are given,
     *   the process cannot be started, it exits non-zero, its output is not
     *   valid JSON, or the parsed output carries an `error` field.
     */
    async fuse(sourcePaths, outputPath, options = {}) {
        return new Promise((resolve, reject) => {
            if (!Array.isArray(sourcePaths) || sourcePaths.length < 2) {
                reject(new Error("At least 2 source paths are required for fusion"));
                return;
            }
            const args = [
                this.scriptPath,
                JSON.stringify(sourcePaths),
                outputPath,
                JSON.stringify(options),
            ];
            const failedToStart = (err) => reject(new Error(`Failed to start fusion process: ${err.message}`, { cause: err }));
            let processRef;
            try {
                processRef = spawn(this.pythonPath, args);
            }
            catch (err) {
                failedToStart(err);
                return;
            }
            let stdout = "";
            let stderr = "";
            processRef.stdout.on("data", (data) => (stdout += data.toString()));
            processRef.stderr.on("data", (data) => (stderr += data.toString()));
            // Without this listener a spawn failure (e.g. interpreter not
            // found) is an unhandled "error" event instead of a rejection.
            processRef.on("error", failedToStart);
            processRef.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Fusion failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                        return;
                    }
                    resolve(result);
                }
                catch (e) {
                    reject(new Error(`Failed to parse fusion output: ${stdout}`));
                }
            });
        });
    }
}
@@ -1,39 +0,0 @@
1
/**
 * Normalises label values using `config.label_map` and, optionally, collapses
 * them to two classes using `config.multi_to_binary`.
 */
export class LabelHarmonizer {
    config;
    /**
     * @param {object} config Reads the optional `label_map` (raw label ->
     *   canonical label) and optional `multi_to_binary`
     *   (`{ positive_classes, positive_label, negative_label }`).
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Maps a raw label value to a unified canonical value.
     *
     * @param {*} label Raw label.
     * @returns {*} The mapped label (or the input when unmapped). When
     *   `multi_to_binary` is configured, `positive_label` if the mapped value
     *   matches one of `positive_classes` (string, case-insensitive),
     *   otherwise `negative_label`.
     */
    harmonize(label) {
        const labelMap = this.config.label_map;
        let harmonized = label;
        // 1. apply explicit mapping. Own-property check: the `in` operator
        // would also match inherited names such as "toString".
        if (labelMap && Object.prototype.hasOwnProperty.call(labelMap, label)) {
            harmonized = labelMap[label];
        }
        // 2. apply multi-class to binary conversion
        if (this.config.multi_to_binary) {
            const { positive_classes, positive_label, negative_label } = this.config.multi_to_binary;
            const needle = String(harmonized).toLowerCase();
            const isPositive = positive_classes.some((pc) => String(pc).toLowerCase() === needle);
            return isPositive ? positive_label : negative_label;
        }
        return harmonized;
    }
    /**
     * Checks if a label distribution is balanced enough (placeholder for quality scoring).
     *
     * @param {Object<string, number>} distribution Label -> count.
     * @returns {string[]} One warning per label whose share is below 5%;
     *   empty when the total count is zero.
     */
    checkBalance(distribution) {
        const warnings = [];
        const total = Object.values(distribution).reduce((a, b) => a + b, 0);
        if (total === 0)
            return warnings;
        for (const [label, count] of Object.entries(distribution)) {
            const pct = count / total;
            if (pct < 0.05) {
                warnings.push(`Extreme minority class detected: "${label}" (${(pct * 100).toFixed(1)}%)`);
            }
        }
        return warnings;
    }
}
@@ -1,86 +0,0 @@
1
- import { SchemaAligner } from "./aligner.js";
2
- import { LabelHarmonizer } from "./harmonizer.js";
3
- import { Deduplicator } from "./deduplicator.js";
4
/**
 * Coordinates in-memory fusion of records from several datasets: aligns
 * columns to canonical names, harmonizes the target label, drops exact and
 * fuzzy duplicates, and collects statistics and warnings.
 */
export class FusionOrchestrator {
    config;
    aligner;
    harmonizer;
    deduplicator;
    /**
     * @param {object} config Shared configuration handed to the aligner,
     *   harmonizer and deduplicator; `target_column` is read here.
     */
    constructor(config) {
        this.config = config;
        this.aligner = new SchemaAligner(config);
        this.harmonizer = new LabelHarmonizer(config);
        this.deduplicator = new Deduplicator(config);
    }
    /**
     * Fuses multiple datasets into one.
     *
     * NOTE(review): the fused records are built but not returned or written;
     * `output_path` is a placeholder. Confirm whether callers expect them.
     *
     * @param datasets Metadata of datasets to fuse
     * @param dataRecords Actual records from all datasets (flattened), each
     *   shaped `{ datasetId, record }`
     * @returns {Promise<object>} `{ success, output_path, stats, warnings }`
     */
    async fuse(datasets, dataRecords) {
        const stats = {
            total_input_rows: dataRecords.length,
            total_output_rows: 0,
            duplicates_removed: 0,
            fuzzy_duplicates_removed: 0,
            schema_overlaps: [],
            label_distribution: {}
        };
        const warnings = [];
        const targetColumn = this.config.target_column;
        const alignmentMap = this.aligner.align(datasets);
        // Every canonical column that at least one dataset maps to.
        const allCanonicalCols = new Set();
        for (const columnMap of Object.values(alignmentMap)) {
            for (const canonical of Object.values(columnMap)) {
                allCanonicalCols.add(canonical);
            }
        }
        stats.schema_overlaps = Array.from(allCanonicalCols);
        const fusedData = [];
        // Datasets already reported as unaligned, so the warning is emitted
        // once per dataset rather than once per record.
        const unalignedDatasets = new Set();
        for (const { datasetId, record } of dataRecords) {
            const colMap = alignmentMap[datasetId];
            if (!colMap) {
                if (!unalignedDatasets.has(datasetId)) {
                    unalignedDatasets.add(datasetId);
                    warnings.push(`No alignment found for dataset ${datasetId}`);
                }
                continue;
            }
            // 1. Align columns (source columns without a canonical name are dropped)
            const alignedRecord = {};
            for (const [sourceCol, val] of Object.entries(record)) {
                const canonical = colMap[sourceCol];
                if (canonical) {
                    alignedRecord[canonical] = val;
                }
            }
            // 2. Harmonize label
            const hasTarget = Boolean(targetColumn) && alignedRecord[targetColumn] !== undefined;
            if (hasTarget) {
                alignedRecord[targetColumn] = this.harmonizer.harmonize(alignedRecord[targetColumn]);
            }
            // 3. Deduplicate
            if (this.deduplicator.isExactDuplicate(alignedRecord)) {
                stats.duplicates_removed++;
                continue;
            }
            if (this.deduplicator.isFuzzyDuplicate(alignedRecord)) {
                stats.fuzzy_duplicates_removed++;
                continue;
            }
            // Update distribution ONLY for kept records
            if (targetColumn && alignedRecord[targetColumn] !== undefined) {
                const labelStr = String(alignedRecord[targetColumn]);
                stats.label_distribution[labelStr] = (stats.label_distribution[labelStr] || 0) + 1;
            }
            fusedData.push(alignedRecord);
        }
        stats.total_output_rows = fusedData.length;
        // Add balance warnings
        warnings.push(...this.harmonizer.checkBalance(stats.label_distribution));
        return {
            success: true,
            output_path: "fused_dataset.json", // Placeholder
            stats,
            warnings
        };
    }
}
@@ -1 +0,0 @@
1
- export {};