vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,63 +0,0 @@
1
- import crypto from "crypto";
2
- /**
3
- * A simple in-memory cache provider simulating Redis for demonstration.
4
- */
5
- export class MockRedisProvider {
6
- store = new Map();
7
- async get(key) {
8
- const item = this.store.get(key);
9
- if (!item)
10
- return null;
11
- if (item.expiry && Date.now() > item.expiry) {
12
- this.store.delete(key);
13
- return null;
14
- }
15
- return item.value;
16
- }
17
- async set(key, value, ttlSeconds) {
18
- const expiry = ttlSeconds ? Date.now() + ttlSeconds * 1000 : null;
19
- this.store.set(key, { value, expiry });
20
- }
21
- async delete(key) {
22
- this.store.delete(key);
23
- }
24
- }
25
- export class CacheService {
26
- provider;
27
- constructor(provider) {
28
- this.provider = provider;
29
- }
30
- /**
31
- * Caches quality reports (TTL: 24h)
32
- */
33
- async getReport(datasetId) {
34
- const key = `report:${datasetId}`;
35
- const data = await this.provider.get(key);
36
- return data ? JSON.parse(data) : null;
37
- }
38
- async saveReport(datasetId, report) {
39
- const key = `report:${datasetId}`;
40
- await this.provider.set(key, JSON.stringify(report), 86400); // 24 hours
41
- }
42
- /**
43
- * Caches cleaning plans by dataset ID and configuration hash
44
- */
45
- async getPlan(datasetId, config) {
46
- const hash = this.generateHash(config);
47
- const key = `plan:${datasetId}:${hash}`;
48
- const data = await this.provider.get(key);
49
- return data ? JSON.parse(data) : null;
50
- }
51
- async savePlan(datasetId, config, plan) {
52
- const hash = this.generateHash(config);
53
- const key = `plan:${datasetId}:${hash}`;
54
- await this.provider.set(key, JSON.stringify(plan), 3600); // 1 hour
55
- }
56
- generateHash(obj) {
57
- return crypto
58
- .createHash("sha256")
59
- .update(JSON.stringify(obj))
60
- .digest("hex")
61
- .substring(0, 16);
62
- }
63
- }
@@ -1,81 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
- export class DataCleaner {
5
- pythonPath = "python";
6
- scriptPath;
7
- constructor(buildDir = process.cwd()) {
8
- const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
9
- const dataRoot = path.join(homeDir, ".vesper");
10
- const scriptPath0 = path.resolve(dataRoot, "python", "cleaner.py");
11
- const scriptPath1 = path.resolve(buildDir, "python", "cleaner.py");
12
- const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "cleaner.py");
13
- const scriptPath3 = path.resolve(buildDir, "..", "python", "cleaner.py");
14
- if (fs.existsSync(scriptPath0)) {
15
- this.scriptPath = scriptPath0;
16
- }
17
- else if (fs.existsSync(scriptPath1)) {
18
- this.scriptPath = scriptPath1;
19
- }
20
- else if (fs.existsSync(scriptPath2)) {
21
- this.scriptPath = scriptPath2;
22
- }
23
- else if (fs.existsSync(scriptPath3)) {
24
- this.scriptPath = scriptPath3;
25
- }
26
- else {
27
- this.scriptPath = scriptPath0; // Final fallback
28
- }
29
- // Detect Python command (Windows may use 'py' instead of 'python')
30
- if (process.platform === "win32") {
31
- this.pythonPath = "py";
32
- }
33
- }
34
- /**
35
- * Execute a list of cleaning operations on a file
36
- */
37
- async clean(filePath, operations, format) {
38
- return new Promise((resolve, reject) => {
39
- const args = [
40
- this.scriptPath,
41
- filePath,
42
- JSON.stringify(operations)
43
- ];
44
- if (format)
45
- args.push(format);
46
- const process = spawn(this.pythonPath, args);
47
- let stdout = "";
48
- let stderr = "";
49
- process.stdout.on("data", (data) => {
50
- stdout += data.toString();
51
- });
52
- process.stderr.on("data", (data) => {
53
- stderr += data.toString();
54
- });
55
- process.on("close", (code) => {
56
- if (code !== 0) {
57
- reject(new Error(`Data Cleaner failed (code ${code}): ${stderr}`));
58
- return;
59
- }
60
- try {
61
- const result = JSON.parse(stdout);
62
- if (!result.success) {
63
- reject(new Error(result.error));
64
- }
65
- else {
66
- resolve({
67
- success: true,
68
- rows_affected: Number(result.rows_affected ?? 0),
69
- columns_affected: Number(result.columns_affected ?? 0),
70
- output_path: result.output_path,
71
- logs: Array.isArray(result.logs) ? result.logs : [],
72
- });
73
- }
74
- }
75
- catch (e) {
76
- reject(new Error(`Failed to parse cleaner output: ${stdout}`));
77
- }
78
- });
79
- });
80
- }
81
- }
@@ -1,89 +0,0 @@
1
- import * as crypto from "crypto";
2
- export class RuleEvaluator {
3
- /**
4
- * Checks if a record matches a rule's condition.
5
- */
6
- matches(record, condition) {
7
- const columnsToTest = condition.column === "*"
8
- ? Object.keys(record)
9
- : [condition.column];
10
- for (const col of columnsToTest) {
11
- const val = record[col];
12
- if (val === undefined)
13
- continue;
14
- if (this.testValue(val, condition)) {
15
- return true;
16
- }
17
- }
18
- return false;
19
- }
20
- /**
21
- * Applies a rule action to a record.
22
- */
23
- apply(record, rule) {
24
- const newRecord = { ...record };
25
- const action = rule.action;
26
- const condition = rule.condition;
27
- const columnsToApply = condition.column === "*"
28
- ? Object.keys(record)
29
- : [condition.column];
30
- for (const col of columnsToApply) {
31
- const val = record[col];
32
- if (val === undefined)
33
- continue;
34
- // Optional: Re-test condition per column if using wildcard
35
- if (condition.column === "*" && !this.testValue(val, condition)) {
36
- continue;
37
- }
38
- switch (action.type) {
39
- case "Replace":
40
- if (typeof val === "string") {
41
- const regex = new RegExp(action.params.pattern, "g");
42
- newRecord[col] = val.replace(regex, action.params.replacement);
43
- }
44
- break;
45
- case "CustomMask":
46
- if (val !== null) {
47
- newRecord[col] = this.maskValue(String(val), action.params);
48
- }
49
- break;
50
- case "NormalizeText":
51
- if (typeof val === "string") {
52
- newRecord[col] = action.params.case === "lower"
53
- ? val.toLowerCase()
54
- : val.toUpperCase();
55
- }
56
- break;
57
- // Add more handlers as needed...
58
- }
59
- }
60
- return newRecord;
61
- }
62
- testValue(val, condition) {
63
- const { operator, value } = condition;
64
- const strVal = String(val);
65
- switch (operator) {
66
- case "contains":
67
- return strVal.includes(String(value));
68
- case "equals":
69
- return val === value;
70
- case "starts_with":
71
- return strVal.startsWith(String(value));
72
- case "ends_with":
73
- return strVal.endsWith(String(value));
74
- case "is_null":
75
- return (val === null || val === undefined) === value;
76
- case "matches_regex":
77
- return new RegExp(String(value)).test(strVal);
78
- default:
79
- return false;
80
- }
81
- }
82
- maskValue(val, params) {
83
- if (params.method === "hash") {
84
- const salt = params.salt || "";
85
- return crypto.createHash("sha256").update(val + salt).digest("hex").substring(0, 12) + "...";
86
- }
87
- return "MASKED";
88
- }
89
- }
@@ -1,62 +0,0 @@
1
- import { QualityAnalyzer } from "../quality/analyzer.js";
2
- import { CleaningPlanner } from "./planner.js";
3
- import { DataCleaner } from "./cleaner.js";
4
- export class PipelineExecutor {
5
- analyzer;
6
- planner;
7
- cleaner;
8
- constructor(dataRoot = process.cwd(), buildDir) {
9
- // Use buildDir for analyzer and cleaner (where Python scripts are), dataRoot for data operations
10
- this.analyzer = new QualityAnalyzer(undefined, buildDir || dataRoot);
11
- this.planner = new CleaningPlanner(undefined, buildDir || dataRoot);
12
- this.cleaner = new DataCleaner(buildDir || dataRoot);
13
- }
14
- /**
15
- * Run the full Auto-Cleaning Pipeline on a dataset file
16
- */
17
- async runPipeline(datasetId, filePath, outputFormat = "parquet", onProgress) {
18
- // ... (logging setup)
19
- const log = (m) => {
20
- console.error(`[Pipeline] ${m}`);
21
- if (onProgress)
22
- onProgress(m);
23
- };
24
- log(`Analyzing quality for ${datasetId}...`);
25
- const qualityReport = await this.analyzer.analyze(filePath);
26
- // 2. Generate Plan
27
- log(`Generating cleaning plan...`);
28
- const plan = await this.planner.generatePlan(datasetId, qualityReport);
29
- const rules = plan.operations;
30
- // If no cleaning needed, we still might need format conversion
31
- const needsConversion = !filePath.toLowerCase().endsWith(`.${outputFormat}`);
32
- if (rules.length === 0 && !needsConversion) {
33
- log(`No cleaning or conversion needed.`);
34
- return {
35
- initial_quality: qualityReport,
36
- plan,
37
- cleaning_result: {
38
- success: true,
39
- rows_affected: 0,
40
- columns_affected: 0,
41
- logs: ["No operations generated."]
42
- }
43
- };
44
- }
45
- // 3. Execute Plan (includes conversion if requested)
46
- log(`Executing ${rules.length} operations (Format: ${outputFormat})...`);
47
- rules.forEach(op => console.error(` - ${op.type}: ${op.reason}`));
48
- const cleaningResult = await this.cleaner.clean(filePath, rules, outputFormat);
49
- if (cleaningResult.success) {
50
- log(`Cleaning complete. Output: ${cleaningResult.output_path}`);
51
- }
52
- else {
53
- log(`Cleaning failed: ${cleaningResult.error}`);
54
- }
55
- return {
56
- initial_quality: qualityReport,
57
- plan,
58
- cleaning_result: cleaningResult,
59
- final_output_path: cleaningResult.output_path
60
- };
61
- }
62
- }
@@ -1,87 +0,0 @@
1
- export class ScriptGenerator {
2
- /**
3
- * Generate a standalone Python script for the cleaning plan
4
- */
5
- generatePythonScript(plan, inputPath) {
6
- const timestamp = new Date().toISOString().split('T')[0];
7
- let script = `"""
8
- Vesper Auto-Cleaning Script
9
- Generated: ${timestamp}
10
- Dataset ID: ${plan.dataset_id}
11
- """
12
-
13
- import polars as pl
14
- import numpy as np
15
-
16
- def clean_dataset(file_path):
17
- print(f"Loading {file_path}...")
18
-
19
- # Load Data
20
- if file_path.endswith(".csv"):
21
- df = pl.read_csv(file_path, ignore_errors=True)
22
- elif file_path.endswith(".parquet"):
23
- df = pl.read_parquet(file_path)
24
- else:
25
- raise ValueError("Unsupported format")
26
-
27
- print(f"Initial shape: {df.shape}")
28
-
29
- `;
30
- // Generate code for each operation
31
- plan.operations.forEach((op, index) => {
32
- script += ` # Step ${index + 1}: ${op.type}\n`;
33
- script += ` # Reason: ${op.reason}\n`;
34
- script += this.generateOpCode(op);
35
- script += ` print(f"After Step ${index + 1} (${op.type}): {df.shape}")\n\n`;
36
- });
37
- script += ` # Save Output
38
- output_path = file_path.replace(".csv", "_cleaned_repro.csv").replace(".parquet", "_cleaned_repro.parquet")
39
- if file_path.endswith(".csv"):
40
- df.write_csv(output_path)
41
- else:
42
- df.write_parquet(output_path)
43
-
44
- print(f"Done! Saved to {output_path}")
45
-
46
- if __name__ == "__main__":
47
- # Default input path from generation time, can be overridden
48
- INPUT_PATH = r"${inputPath}"
49
- clean_dataset(INPUT_PATH)
50
- `;
51
- return script;
52
- }
53
- generateOpCode(op) {
54
- const p = op.params;
55
- switch (op.type) {
56
- case "RemoveDuplicates":
57
- return ` df = df.unique()\n`;
58
- case "DropColumns":
59
- return ` cols_to_drop = ${JSON.stringify(p.columns)}\n existing_cols = [c for c in cols_to_drop if c in df.columns]\n if existing_cols:\n df = df.drop(existing_cols)\n`;
60
- case "FillMissing":
61
- if (p.method === "constant") {
62
- const val = typeof p.value === 'string' ? `"${p.value}"` : p.value;
63
- return ` df = df.with_columns(pl.col("${p.column}").fill_null(${val}))\n`;
64
- }
65
- else if (p.method === "mean") {
66
- return ` mean_val = df["${p.column}"].mean()\n df = df.with_columns(pl.col("${p.column}").fill_null(mean_val))\n`;
67
- }
68
- else if (p.method === "median") {
69
- return ` median_val = df["${p.column}"].median()\n df = df.with_columns(pl.col("${p.column}").fill_null(median_val))\n`;
70
- }
71
- return ` # Unknown fill method for ${p.column}\n`;
72
- case "FixTypes":
73
- if (p.type === "float")
74
- return ` df = df.with_columns(pl.col("${p.column}").cast(pl.Float64, strict=False))\n`;
75
- if (p.type === "int")
76
- return ` df = df.with_columns(pl.col("${p.column}").cast(pl.Int64, strict=False))\n`;
77
- if (p.type === "string")
78
- return ` df = df.with_columns(pl.col("${p.column}").cast(pl.Utf8))\n`;
79
- return ` # Unknown type conversion for ${p.column}\n`;
80
- case "RemoveOutliers":
81
- // IQR implementation inline
82
- return ` q1 = df["${p.column}"].quantile(0.25)\n q3 = df["${p.column}"].quantile(0.75)\n iqr = q3 - q1\n lower = q1 - (1.5 * iqr)\n upper = q3 + (1.5 * iqr)\n df = df.filter((pl.col("${p.column}") >= lower) & (pl.col("${p.column}") <= upper))\n`;
83
- default:
84
- return ` # Operation ${op.type} not fully supported in export yet\n`;
85
- }
86
- }
87
- }
@@ -1,127 +0,0 @@
1
- import { TargetDetector } from "../preparation/target-detector.js";
2
- export class CleaningPlanner {
3
- cache;
4
- targetDetector;
5
- constructor(cache, buildDir = process.cwd()) {
6
- this.cache = cache;
7
- this.targetDetector = new TargetDetector(buildDir);
8
- }
9
- /**
10
- * Generate a cleaning plan based on the quality report and optional custom rules
11
- */
12
- async generatePlan(datasetId, report, ruleSet, targetInfo) {
13
- if (this.cache) {
14
- const cached = await this.cache.getPlan(datasetId, { report, ruleSet });
15
- if (cached) {
16
- console.error(`[CleaningPlanner] Cache hit for ${datasetId}`);
17
- return cached;
18
- }
19
- }
20
- const ops = [];
21
- let estimatedRowsSaved = 0;
22
- let estimatedColsSaved = 0;
23
- // 1. Remove Duplicates (Global)
24
- if (report.duplicate_rows > 0) {
25
- ops.push({
26
- type: "RemoveDuplicates",
27
- params: {},
28
- reason: `Found ${report.duplicate_rows} exact duplicate rows`
29
- });
30
- estimatedRowsSaved += report.duplicate_rows;
31
- }
32
- // 2. Column-level operations
33
- for (const col of report.columns) {
34
- // A. Drop Empty / Useless Columns
35
- if (col.missing_percentage > 90 || col.is_constant) {
36
- ops.push({
37
- type: "DropColumns",
38
- params: { columns: [col.name] },
39
- reason: col.is_constant ? "Column is constant (zero variance)" : `High missing values (${col.missing_percentage.toFixed(1)}%)`
40
- });
41
- estimatedColsSaved++;
42
- continue;
43
- }
44
- // B. Fix Types
45
- if (this.shouldFixType(col)) {
46
- const targetType = col.inferred_type.toLowerCase().includes("numeric") ? "float" : "string";
47
- ops.push({
48
- type: "FixTypes",
49
- params: { column: col.name, type: targetType },
50
- reason: `Inferred type is ${col.inferred_type} but stored as ${col.type}`
51
- });
52
- }
53
- // C. Impute Missing Values
54
- if (col.missing_count > 0) {
55
- let method = "constant";
56
- let value = "unknown";
57
- if (col.inferred_type.includes("Numeric") || col.type.includes("Int") || col.type.includes("Float")) {
58
- method = "median";
59
- value = 0;
60
- }
61
- else {
62
- method = "constant";
63
- value = "missing";
64
- }
65
- ops.push({
66
- type: "FillMissing",
67
- params: { column: col.name, method, value },
68
- reason: `${col.missing_count} missing values`
69
- });
70
- }
71
- }
72
- // 3. Target Unification
73
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
74
- ops.push({
75
- type: "RenameTarget",
76
- params: { old_name: targetInfo.target, new_name: "target" },
77
- reason: `Detected target '${targetInfo.target}' with high confidence (${targetInfo.confidence.toFixed(2)})`
78
- });
79
- estimatedColsSaved++; // Logic fix: effectively "saving" a column by standardizing it
80
- }
81
- // 4. Apply Custom Rules
82
- if (ruleSet) {
83
- for (const rule of ruleSet.rules) {
84
- const targets = rule.condition.column === "*"
85
- ? report.columns.map(c => c.name)
86
- : [rule.condition.column];
87
- for (const targetCol of targets) {
88
- const colStats = report.columns.find(c => c.name === targetCol);
89
- if (!colStats)
90
- continue;
91
- ops.push({
92
- type: rule.action.type,
93
- params: { ...rule.action.params, column: targetCol },
94
- reason: `Custom Rule: ${rule.name} - ${rule.description}`
95
- });
96
- }
97
- }
98
- }
99
- const plan = {
100
- dataset_id: datasetId,
101
- operations: ops,
102
- estimated_impact: {
103
- rows_saved: estimatedRowsSaved,
104
- columns_saved: estimatedColsSaved,
105
- quality_score_improvement: 10 + (ops.length * 5)
106
- }
107
- };
108
- if (this.cache) {
109
- await this.cache.savePlan(datasetId, { report, ruleSet }, plan);
110
- }
111
- return plan;
112
- }
113
- async generateRules(datasetId, report, ruleSet, targetInfo) {
114
- const plan = await this.generatePlan(datasetId, report, ruleSet, targetInfo);
115
- return plan.operations;
116
- }
117
- shouldFixType(col) {
118
- if (col.inferred_type && col.inferred_type.includes("Numeric") && (col.type.includes("String") || col.type.includes("Utf8"))) {
119
- return true;
120
- }
121
- return false;
122
- }
123
- isNumeric(col) {
124
- const t = col.type.toLowerCase();
125
- return t.includes("int") || t.includes("float") || t.includes("numeric");
126
- }
127
- }
@@ -1,57 +0,0 @@
1
- // --- Domain Presets ---
2
- export const NLP_PRESET = {
3
- id: "preset-nlp",
4
- name: "NLP Data Prep",
5
- domain: "nlp",
6
- rules: [
7
- {
8
- id: "nlp-1",
9
- name: "Normalize Case",
10
- description: "Convert all text to lowercase",
11
- condition: { column: "*", operator: "is_null", value: false }, // Apply to all non-null if type is string (logic in evaluator)
12
- action: { type: "NormalizeText", params: { case: "lower" } }
13
- },
14
- {
15
- id: "nlp-2",
16
- name: "Remove URLs",
17
- description: "Strip http/https links",
18
- condition: { column: "*", operator: "contains", value: "http" },
19
- action: { type: "Replace", params: { pattern: "https?://\\S+", replacement: "" } }
20
- }
21
- ]
22
- };
23
- export const HEALTHCARE_PRESET = {
24
- id: "preset-healthcare",
25
- name: "Healthcare (HIPAA) Prep",
26
- domain: "healthcare",
27
- rules: [
28
- {
29
- id: "hc-1",
30
- name: "Mask Emails",
31
- description: "Identify and mask email addresses",
32
- condition: { column: "*", operator: "contains", value: "@" },
33
- action: { type: "CustomMask", params: { method: "hash", salt: "vesper-pii" } }
34
- },
35
- {
36
- id: "hc-2",
37
- name: "Normalize Dates",
38
- description: "Ensure ISO-8601 for DOB/Admit dates",
39
- condition: { column: "date", operator: "is_null", value: false },
40
- action: { type: "FixTypes", params: { type: "date" } }
41
- }
42
- ]
43
- };
44
- export const FINANCE_PRESET = {
45
- id: "preset-finance",
46
- name: "Financial Data Prep",
47
- domain: "finance",
48
- rules: [
49
- {
50
- id: "fin-1",
51
- name: "Currency Cleanup",
52
- description: "Remove currency symbols and parse as float",
53
- condition: { column: "amount", operator: "matches_regex", value: "[\\$\\€\\£]" },
54
- action: { type: "FixTypes", params: { type: "float", strip: "[^0-9\\.]" } }
55
- }
56
- ]
57
- };
@@ -1 +0,0 @@
1
- export {};
@@ -1,37 +0,0 @@
1
- import fs from "fs";
2
- import path from "path";
3
- /**
4
- * LocalAdapter mimics cloud storage by copying files to a local vault directory.
5
- */
6
- export class LocalAdapter {
7
- vaultPath;
8
- constructor(vaultPath) {
9
- this.vaultPath = path.resolve(vaultPath);
10
- if (!fs.existsSync(this.vaultPath)) {
11
- fs.mkdirSync(this.vaultPath, { recursive: true });
12
- }
13
- }
14
- async upload(localPath, remotePath) {
15
- if (!fs.existsSync(localPath)) {
16
- throw new Error(`Local file not found: ${localPath}`);
17
- }
18
- const destPath = path.join(this.vaultPath, remotePath);
19
- const destDir = path.dirname(destPath);
20
- if (!fs.existsSync(destDir)) {
21
- fs.mkdirSync(destDir, { recursive: true });
22
- }
23
- fs.copyFileSync(localPath, destPath);
24
- // Returns a file URI as the "url"
25
- return `file://${destPath}`;
26
- }
27
- async delete(remotePath) {
28
- const destPath = path.join(this.vaultPath, remotePath);
29
- if (fs.existsSync(destPath)) {
30
- fs.unlinkSync(destPath);
31
- }
32
- }
33
- async getSignedUrl(remotePath, expiresValue) {
34
- // For local, just return the file URI
35
- return `file://${path.join(this.vaultPath, remotePath)}`;
36
- }
37
- }
@@ -1,24 +0,0 @@
1
- /**
2
- * S3Adapter Stub.
3
- * Note: Requires @aws-sdk/client-s3 to be installed for full functionality.
4
- */
5
- export class S3Adapter {
6
- bucket;
7
- region;
8
- credentials;
9
- constructor(bucket, region, credentials) {
10
- this.bucket = bucket;
11
- this.region = region;
12
- this.credentials = credentials;
13
- }
14
- async upload(localPath, remotePath) {
15
- console.warn("S3Adapter: Full implementation requires @aws-sdk/client-s3. This is a stub.");
16
- return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}`;
17
- }
18
- async delete(remotePath) {
19
- console.warn("S3Adapter: Delete stub called.");
20
- }
21
- async getSignedUrl(remotePath) {
22
- return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}?stub=true`;
23
- }
24
- }
@@ -1,49 +0,0 @@
1
- import fs from "fs";
2
- import { readFile } from "fs/promises";
3
- import { createClient } from "@supabase/supabase-js";
4
- export class SupabaseAdapter {
5
- bucket;
6
- client;
7
- constructor(bucket, supabaseUrl, supabaseServiceRoleKey) {
8
- this.bucket = bucket;
9
- const resolvedUrl = supabaseUrl || process.env.SUPABASE_URL;
10
- const resolvedServiceRoleKey = supabaseServiceRoleKey || process.env.SUPABASE_SERVICE_ROLE_KEY;
11
- if (!resolvedUrl || !resolvedServiceRoleKey) {
12
- throw new Error("Supabase requires SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY");
13
- }
14
- this.client = createClient(resolvedUrl, resolvedServiceRoleKey, {
15
- auth: {
16
- persistSession: false,
17
- },
18
- });
19
- }
20
- async upload(localPath, remotePath) {
21
- if (!fs.existsSync(localPath)) {
22
- throw new Error(`Local file not found: ${localPath}`);
23
- }
24
- const fileBuffer = await readFile(localPath);
25
- const { error } = await this.client.storage
26
- .from(this.bucket)
27
- .upload(remotePath, fileBuffer, { upsert: true });
28
- if (error) {
29
- throw new Error(`Supabase upload failed: ${error.message}`);
30
- }
31
- const { data } = this.client.storage.from(this.bucket).getPublicUrl(remotePath);
32
- return data.publicUrl;
33
- }
34
- async delete(remotePath) {
35
- const { error } = await this.client.storage.from(this.bucket).remove([remotePath]);
36
- if (error) {
37
- throw new Error(`Supabase delete failed: ${error.message}`);
38
- }
39
- }
40
- async getSignedUrl(remotePath, expiresValue = 3600) {
41
- const { data, error } = await this.client.storage
42
- .from(this.bucket)
43
- .createSignedUrl(remotePath, expiresValue);
44
- if (error || !data?.signedUrl) {
45
- throw new Error(`Supabase signed URL failed: ${error?.message || "No signed URL returned"}`);
46
- }
47
- return data.signedUrl;
48
- }
49
- }