vesper-wizard 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/scripts/wizard.cjs +625 -0
  174. package/{wizard.js → scripts/wizard.js} +99 -21
  175. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  179. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  180. package/src/python/asset_downloader_engine.py +92 -0
  181. package/src/python/cleaner.py +226 -0
  182. package/src/python/config.py +263 -0
  183. package/src/python/dataworld_engine.py +208 -0
  184. package/src/python/export_engine.py +243 -0
  185. package/src/python/framework_adapters.py +100 -0
  186. package/src/python/fusion_engine.py +368 -0
  187. package/src/python/github_adapter.py +106 -0
  188. package/src/python/hf_fallback.py +298 -0
  189. package/src/python/image_engine.py +86 -0
  190. package/src/python/kaggle_engine.py +295 -0
  191. package/src/python/media_engine.py +133 -0
  192. package/src/python/nasa_adapter.py +82 -0
  193. package/src/python/openml_engine.py +146 -0
  194. package/src/python/quality_engine.py +267 -0
  195. package/src/python/row_count.py +54 -0
  196. package/src/python/splitter_engine.py +283 -0
  197. package/src/python/target_engine.py +154 -0
  198. package/src/python/test_framework_adapters.py +61 -0
  199. package/src/python/test_fusion_engine.py +89 -0
  200. package/src/python/uci_adapter.py +94 -0
  201. package/src/python/vesper/__init__.py +1 -0
  202. package/src/python/vesper/core/__init__.py +1 -0
  203. package/src/python/vesper/core/asset_downloader.py +675 -0
  204. package/src/python/vesper/core/download_recipe.py +104 -0
  205. package/src/python/worldbank_adapter.py +99 -0
  206. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,62 @@
1
+ import { QualityAnalyzer } from "../quality/analyzer.js";
2
+ import { CleaningPlanner } from "./planner.js";
3
+ import { DataCleaner } from "./cleaner.js";
4
export class PipelineExecutor {
    /** Quality analysis backend (delegates to Python scripts). */
    analyzer;
    /** Produces a cleaning plan from a quality report. */
    planner;
    /** Executes cleaning operations / format conversion on data files. */
    cleaner;
    /**
     * @param {string} [dataRoot] - Root directory for data operations.
     * @param {string} [buildDir] - Directory containing the Python engine
     *   scripts; falls back to dataRoot when omitted.
     */
    constructor(dataRoot = process.cwd(), buildDir) {
        // Use buildDir for analyzer/planner/cleaner (where Python scripts are),
        // dataRoot for data operations.
        this.analyzer = new QualityAnalyzer(undefined, buildDir || dataRoot);
        this.planner = new CleaningPlanner(undefined, buildDir || dataRoot);
        this.cleaner = new DataCleaner(buildDir || dataRoot);
    }
    /**
     * Run the full Auto-Cleaning Pipeline on a dataset file.
     *
     * @param {string} datasetId - Identifier used for plan generation.
     * @param {string} filePath - Path to the input dataset file.
     * @param {string} [outputFormat="parquet"] - Desired output extension.
     * @param {(msg: string) => void} [onProgress] - Optional progress callback.
     * @returns {Promise<object>} { initial_quality, plan, cleaning_result,
     *   final_output_path } — final_output_path is present on every success path.
     */
    async runPipeline(datasetId, filePath, outputFormat = "parquet", onProgress) {
        // Log to stderr (stdout may be reserved for protocol traffic) and
        // mirror every message to the optional progress callback.
        const log = (m) => {
            console.error(`[Pipeline] ${m}`);
            if (onProgress)
                onProgress(m);
        };
        // 1. Analyze quality
        log(`Analyzing quality for ${datasetId}...`);
        const qualityReport = await this.analyzer.analyze(filePath);
        // 2. Generate Plan
        log(`Generating cleaning plan...`);
        const plan = await this.planner.generatePlan(datasetId, qualityReport);
        const rules = plan.operations;
        // If no cleaning is needed, we still might need format conversion.
        const needsConversion = !filePath.toLowerCase().endsWith(`.${outputFormat}`);
        if (rules.length === 0 && !needsConversion) {
            log(`No cleaning or conversion needed.`);
            return {
                initial_quality: qualityReport,
                plan,
                cleaning_result: {
                    success: true,
                    rows_affected: 0,
                    columns_affected: 0,
                    logs: ["No operations generated."]
                },
                // Fix: previously this branch returned no final_output_path,
                // leaving callers without a usable path even though the
                // (unchanged) input file is the valid output.
                final_output_path: filePath
            };
        }
        // 3. Execute Plan (includes conversion if requested)
        log(`Executing ${rules.length} operations (Format: ${outputFormat})...`);
        rules.forEach(op => console.error(`  - ${op.type}: ${op.reason}`));
        const cleaningResult = await this.cleaner.clean(filePath, rules, outputFormat);
        if (cleaningResult.success) {
            log(`Cleaning complete. Output: ${cleaningResult.output_path}`);
        }
        else {
            log(`Cleaning failed: ${cleaningResult.error}`);
        }
        return {
            initial_quality: qualityReport,
            plan,
            cleaning_result: cleaningResult,
            final_output_path: cleaningResult.output_path
        };
    }
}
@@ -0,0 +1,87 @@
1
export class ScriptGenerator {
    /**
     * Build a standalone Python (polars) script that reproduces the given
     * cleaning plan on a dataset file.
     *
     * @param {object} plan - Cleaning plan ({ dataset_id, operations }).
     * @param {string} inputPath - Dataset path baked into the script as INPUT_PATH.
     * @returns {string} Complete Python source code.
     */
    generatePythonScript(plan, inputPath) {
        const generatedOn = new Date().toISOString().split('T')[0];
        const parts = [];
        // Script preamble: docstring, imports, and the data-loading prologue.
        parts.push(`"""
Vesper Auto-Cleaning Script
Generated: ${generatedOn}
Dataset ID: ${plan.dataset_id}
"""

import polars as pl
import numpy as np

def clean_dataset(file_path):
    print(f"Loading {file_path}...")

    # Load Data
    if file_path.endswith(".csv"):
        df = pl.read_csv(file_path, ignore_errors=True)
    elif file_path.endswith(".parquet"):
        df = pl.read_parquet(file_path)
    else:
        raise ValueError("Unsupported format")

    print(f"Initial shape: {df.shape}")

`);
        // One numbered, commented section per planned operation.
        plan.operations.forEach((op, index) => {
            parts.push(`    # Step ${index + 1}: ${op.type}\n`);
            parts.push(`    # Reason: ${op.reason}\n`);
            parts.push(this.generateOpCode(op));
            parts.push(`    print(f"After Step ${index + 1} (${op.type}): {df.shape}")\n\n`);
        });
        // Epilogue: save next to the input and provide a runnable entry point.
        parts.push(`    # Save Output
    output_path = file_path.replace(".csv", "_cleaned_repro.csv").replace(".parquet", "_cleaned_repro.parquet")
    if file_path.endswith(".csv"):
        df.write_csv(output_path)
    else:
        df.write_parquet(output_path)

    print(f"Done! Saved to {output_path}")

if __name__ == "__main__":
    # Default input path from generation time, can be overridden
    INPUT_PATH = r"${inputPath}"
    clean_dataset(INPUT_PATH)
`);
        return parts.join("");
    }
    /**
     * Translate a single cleaning operation into the Python statements that
     * apply it to the `df` polars DataFrame.
     */
    generateOpCode(op) {
        const params = op.params;
        if (op.type === "RemoveDuplicates") {
            return `    df = df.unique()\n`;
        }
        if (op.type === "DropColumns") {
            // Only drop columns that still exist at this point in the script.
            return `    cols_to_drop = ${JSON.stringify(params.columns)}\n    existing_cols = [c for c in cols_to_drop if c in df.columns]\n    if existing_cols:\n        df = df.drop(existing_cols)\n`;
        }
        if (op.type === "FillMissing") {
            if (params.method === "constant") {
                const literal = typeof params.value === 'string' ? `"${params.value}"` : params.value;
                return `    df = df.with_columns(pl.col("${params.column}").fill_null(${literal}))\n`;
            }
            if (params.method === "mean") {
                return `    mean_val = df["${params.column}"].mean()\n    df = df.with_columns(pl.col("${params.column}").fill_null(mean_val))\n`;
            }
            if (params.method === "median") {
                return `    median_val = df["${params.column}"].median()\n    df = df.with_columns(pl.col("${params.column}").fill_null(median_val))\n`;
            }
            return `    # Unknown fill method for ${params.column}\n`;
        }
        if (op.type === "FixTypes") {
            if (params.type === "float")
                return `    df = df.with_columns(pl.col("${params.column}").cast(pl.Float64, strict=False))\n`;
            if (params.type === "int")
                return `    df = df.with_columns(pl.col("${params.column}").cast(pl.Int64, strict=False))\n`;
            if (params.type === "string")
                return `    df = df.with_columns(pl.col("${params.column}").cast(pl.Utf8))\n`;
            return `    # Unknown type conversion for ${params.column}\n`;
        }
        if (op.type === "RemoveOutliers") {
            // Inline IQR fence: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
            return `    q1 = df["${params.column}"].quantile(0.25)\n    q3 = df["${params.column}"].quantile(0.75)\n    iqr = q3 - q1\n    lower = q1 - (1.5 * iqr)\n    upper = q3 + (1.5 * iqr)\n    df = df.filter((pl.col("${params.column}") >= lower) & (pl.col("${params.column}") <= upper))\n`;
        }
        return `    # Operation ${op.type} not fully supported in export yet\n`;
    }
}
@@ -0,0 +1,127 @@
1
+ import { TargetDetector } from "../preparation/target-detector.js";
2
export class CleaningPlanner {
    /** Optional plan cache exposing getPlan/savePlan; may be undefined. */
    cache;
    /** Target-column detector (uses scripts under buildDir). */
    targetDetector;
    constructor(cache, buildDir = process.cwd()) {
        this.cache = cache;
        this.targetDetector = new TargetDetector(buildDir);
    }
    /**
     * Generate a cleaning plan based on the quality report and optional custom rules.
     *
     * @param {string} datasetId - Dataset identifier (also the cache namespace).
     * @param {object} report - Quality report with duplicate_rows and per-column stats.
     * @param {object} [ruleSet] - Optional custom rule set ({ rules: [...] }).
     * @param {object} [targetInfo] - Optional detected target ({ target, confidence }).
     * @returns {Promise<object>} Plan with dataset_id, operations, estimated_impact.
     */
    async generatePlan(datasetId, report, ruleSet, targetInfo) {
        // Fix: the cache key must include targetInfo. Previously it was only
        // { report, ruleSet }, so two calls that differed solely in the
        // detected target shared one cache entry and the RenameTarget
        // operation could be wrongly served or silently dropped.
        const cacheKey = { report, ruleSet, targetInfo };
        if (this.cache) {
            const cached = await this.cache.getPlan(datasetId, cacheKey);
            if (cached) {
                console.error(`[CleaningPlanner] Cache hit for ${datasetId}`);
                return cached;
            }
        }
        const ops = [];
        let estimatedRowsSaved = 0;
        let estimatedColsSaved = 0;
        // 1. Remove Duplicates (Global)
        if (report.duplicate_rows > 0) {
            ops.push({
                type: "RemoveDuplicates",
                params: {},
                reason: `Found ${report.duplicate_rows} exact duplicate rows`
            });
            estimatedRowsSaved += report.duplicate_rows;
        }
        // 2. Column-level operations
        for (const col of report.columns) {
            // A. Drop Empty / Useless Columns (mostly-missing or zero-variance)
            if (col.missing_percentage > 90 || col.is_constant) {
                ops.push({
                    type: "DropColumns",
                    params: { columns: [col.name] },
                    reason: col.is_constant ? "Column is constant (zero variance)" : `High missing values (${col.missing_percentage.toFixed(1)}%)`
                });
                estimatedColsSaved++;
                continue;
            }
            // B. Fix Types (numeric data stored as strings)
            if (this.shouldFixType(col)) {
                const targetType = col.inferred_type.toLowerCase().includes("numeric") ? "float" : "string";
                ops.push({
                    type: "FixTypes",
                    params: { column: col.name, type: targetType },
                    reason: `Inferred type is ${col.inferred_type} but stored as ${col.type}`
                });
            }
            // C. Impute Missing Values (median for numerics, sentinel otherwise)
            if (col.missing_count > 0) {
                let method = "constant";
                let value = "unknown";
                if (col.inferred_type.includes("Numeric") || col.type.includes("Int") || col.type.includes("Float")) {
                    method = "median";
                    value = 0;
                }
                else {
                    method = "constant";
                    value = "missing";
                }
                ops.push({
                    type: "FillMissing",
                    params: { column: col.name, method, value },
                    reason: `${col.missing_count} missing values`
                });
            }
        }
        // 3. Target Unification — rename a confidently-detected target to "target"
        if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
            ops.push({
                type: "RenameTarget",
                params: { old_name: targetInfo.target, new_name: "target" },
                reason: `Detected target '${targetInfo.target}' with high confidence (${targetInfo.confidence.toFixed(2)})`
            });
            estimatedColsSaved++; // Standardizing the target column counts as a "saved" column
        }
        // 4. Apply Custom Rules ("*" fans the rule out to every reported column)
        if (ruleSet) {
            for (const rule of ruleSet.rules) {
                const targets = rule.condition.column === "*"
                    ? report.columns.map(c => c.name)
                    : [rule.condition.column];
                for (const targetCol of targets) {
                    const colStats = report.columns.find(c => c.name === targetCol);
                    if (!colStats)
                        continue;
                    ops.push({
                        type: rule.action.type,
                        params: { ...rule.action.params, column: targetCol },
                        reason: `Custom Rule: ${rule.name} - ${rule.description}`
                    });
                }
            }
        }
        const plan = {
            dataset_id: datasetId,
            operations: ops,
            estimated_impact: {
                rows_saved: estimatedRowsSaved,
                columns_saved: estimatedColsSaved,
                quality_score_improvement: 10 + (ops.length * 5)
            }
        };
        if (this.cache) {
            await this.cache.savePlan(datasetId, cacheKey, plan);
        }
        return plan;
    }
    /** Convenience wrapper: returns only the plan's operation list. */
    async generateRules(datasetId, report, ruleSet, targetInfo) {
        const plan = await this.generatePlan(datasetId, report, ruleSet, targetInfo);
        return plan.operations;
    }
    /** True when a column's data is numeric but stored under a string type. */
    shouldFixType(col) {
        if (col.inferred_type && col.inferred_type.includes("Numeric") && (col.type.includes("String") || col.type.includes("Utf8"))) {
            return true;
        }
        return false;
    }
    // NOTE(review): not referenced within this class — presumably used by
    // callers elsewhere; verify before removing.
    isNumeric(col) {
        const t = col.type.toLowerCase();
        return t.includes("int") || t.includes("float") || t.includes("numeric");
    }
}
@@ -0,0 +1,57 @@
1
// --- Domain Presets ---
// Built-in rule sets keyed by data domain. Each rule pairs a condition
// (evaluated per column by the rule evaluator) with a cleaning action.

/** Text-preparation rules for NLP workloads. */
export const NLP_PRESET = {
    id: "preset-nlp",
    name: "NLP Data Prep",
    domain: "nlp",
    rules: [
        {
            id: "nlp-1",
            name: "Normalize Case",
            description: "Convert all text to lowercase",
            // Applied to all non-null values; string-type filtering happens in the evaluator.
            condition: { column: "*", operator: "is_null", value: false },
            action: { type: "NormalizeText", params: { case: "lower" } }
        },
        {
            id: "nlp-2",
            name: "Remove URLs",
            description: "Strip http/https links",
            condition: { column: "*", operator: "contains", value: "http" },
            action: { type: "Replace", params: { pattern: "https?://\\S+", replacement: "" } }
        }
    ]
};

/** PII-minimization rules for medical data (HIPAA-oriented). */
export const HEALTHCARE_PRESET = {
    id: "preset-healthcare",
    name: "Healthcare (HIPAA) Prep",
    domain: "healthcare",
    rules: [
        {
            id: "hc-1",
            name: "Mask Emails",
            description: "Identify and mask email addresses",
            condition: { column: "*", operator: "contains", value: "@" },
            action: { type: "CustomMask", params: { method: "hash", salt: "vesper-pii" } }
        },
        {
            id: "hc-2",
            name: "Normalize Dates",
            description: "Ensure ISO-8601 for DOB/Admit dates",
            condition: { column: "date", operator: "is_null", value: false },
            action: { type: "FixTypes", params: { type: "date" } }
        }
    ]
};

/** Monetary-value normalization rules for financial data. */
export const FINANCE_PRESET = {
    id: "preset-finance",
    name: "Financial Data Prep",
    domain: "finance",
    rules: [
        {
            id: "fin-1",
            name: "Currency Cleanup",
            description: "Remove currency symbols and parse as float",
            condition: { column: "amount", operator: "matches_regex", value: "[\\$\\€\\£]" },
            action: { type: "FixTypes", params: { type: "float", strip: "[^0-9\\.]" } }
        }
    ]
};
@@ -0,0 +1 @@
// Type-declaration module: all content is erased at compile time, so this
// file intentionally emits an empty ES module at runtime.
export {};
@@ -0,0 +1,37 @@
1
import fs from "fs";
import path from "path";
/**
 * LocalAdapter mimics cloud storage by copying files to a local vault directory.
 */
export class LocalAdapter {
    /** Absolute path of the vault directory (created eagerly). */
    vaultPath;
    /**
     * @param {string} vaultPath - Directory used as the storage vault;
     *   created (recursively) if it does not exist.
     */
    constructor(vaultPath) {
        this.vaultPath = path.resolve(vaultPath);
        // Constructor must stay synchronous, so the vault is created with sync fs.
        if (!fs.existsSync(this.vaultPath)) {
            fs.mkdirSync(this.vaultPath, { recursive: true });
        }
    }
    /**
     * Copy a local file into the vault.
     *
     * @param {string} localPath - Source file; must exist.
     * @param {string} remotePath - Destination path relative to the vault.
     * @returns {Promise<string>} A file:// URI pointing at the stored copy.
     * @throws {Error} If the source file does not exist.
     */
    async upload(localPath, remotePath) {
        if (!fs.existsSync(localPath)) {
            throw new Error(`Local file not found: ${localPath}`);
        }
        const destPath = path.join(this.vaultPath, remotePath);
        // Fix: use the async fs API here — the previous sync copy blocked the
        // event loop for the duration of (potentially large) file copies.
        await fs.promises.mkdir(path.dirname(destPath), { recursive: true });
        await fs.promises.copyFile(localPath, destPath);
        // Returns a file URI as the "url"
        return `file://${destPath}`;
    }
    /**
     * Remove a file from the vault. Idempotent: deleting a missing file is a no-op.
     */
    async delete(remotePath) {
        const destPath = path.join(this.vaultPath, remotePath);
        try {
            await fs.promises.unlink(destPath);
        } catch (err) {
            // Preserve original semantics (missing file was silently skipped);
            // surface any other failure.
            if (err?.code !== "ENOENT") throw err;
        }
    }
    /**
     * Local storage has no signing; the expiry argument is ignored and a
     * plain file:// URI is returned.
     */
    async getSignedUrl(remotePath, expiresValue) {
        return `file://${path.join(this.vaultPath, remotePath)}`;
    }
}
@@ -0,0 +1,24 @@
1
/**
 * S3Adapter Stub.
 * Note: Requires @aws-sdk/client-s3 to be installed for full functionality.
 */
export class S3Adapter {
    bucket;
    region;
    credentials;
    /**
     * @param {string} bucket - Target S3 bucket name.
     * @param {string} region - AWS region of the bucket.
     * @param {object} credentials - AWS credentials (unused by the stub).
     */
    constructor(bucket, region, credentials) {
        this.bucket = bucket;
        this.region = region;
        this.credentials = credentials;
    }
    // Virtual-hosted-style object URL for this bucket/region.
    #objectUrl(remotePath) {
        return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}`;
    }
    /** Stubbed upload: warns and returns the would-be object URL. */
    async upload(localPath, remotePath) {
        console.warn("S3Adapter: Full implementation requires @aws-sdk/client-s3. This is a stub.");
        return this.#objectUrl(remotePath);
    }
    /** Stubbed delete: warns only. */
    async delete(remotePath) {
        console.warn("S3Adapter: Delete stub called.");
    }
    /** Stubbed signed URL: the object URL tagged with ?stub=true. */
    async getSignedUrl(remotePath) {
        return `${this.#objectUrl(remotePath)}?stub=true`;
    }
}
@@ -0,0 +1,49 @@
1
import fs from "fs";
import { readFile } from "fs/promises";
import { createClient } from "@supabase/supabase-js";
/**
 * Storage adapter backed by a Supabase Storage bucket.
 * Credentials fall back to SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY env vars.
 */
export class SupabaseAdapter {
    bucket;
    client;
    /**
     * @param {string} bucket - Supabase Storage bucket name.
     * @param {string} [supabaseUrl] - Project URL; defaults to SUPABASE_URL.
     * @param {string} [supabaseServiceRoleKey] - Service-role key; defaults to
     *   SUPABASE_SERVICE_ROLE_KEY.
     * @throws {Error} If no URL or service-role key can be resolved.
     */
    constructor(bucket, supabaseUrl, supabaseServiceRoleKey) {
        this.bucket = bucket;
        const url = supabaseUrl || process.env.SUPABASE_URL;
        const serviceKey = supabaseServiceRoleKey || process.env.SUPABASE_SERVICE_ROLE_KEY;
        if (!url || !serviceKey) {
            throw new Error("Supabase requires SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY");
        }
        // Server-side usage: session persistence is unnecessary.
        this.client = createClient(url, serviceKey, { auth: { persistSession: false } });
    }
    /**
     * Upload a local file (upserting) and return its public URL.
     * @throws {Error} If the local file is missing or the upload fails.
     */
    async upload(localPath, remotePath) {
        if (!fs.existsSync(localPath)) {
            throw new Error(`Local file not found: ${localPath}`);
        }
        const contents = await readFile(localPath);
        const uploadResult = await this.client.storage
            .from(this.bucket)
            .upload(remotePath, contents, { upsert: true });
        if (uploadResult.error) {
            throw new Error(`Supabase upload failed: ${uploadResult.error.message}`);
        }
        const { data } = this.client.storage.from(this.bucket).getPublicUrl(remotePath);
        return data.publicUrl;
    }
    /** Remove an object from the bucket. @throws {Error} on failure. */
    async delete(remotePath) {
        const removal = await this.client.storage.from(this.bucket).remove([remotePath]);
        if (removal.error) {
            throw new Error(`Supabase delete failed: ${removal.error.message}`);
        }
    }
    /**
     * Create a time-limited signed URL for an object.
     * @param {number} [expiresValue=3600] - Validity window in seconds.
     * @throws {Error} If signing fails or no URL is returned.
     */
    async getSignedUrl(remotePath, expiresValue = 3600) {
        const { data, error } = await this.client.storage
            .from(this.bucket)
            .createSignedUrl(remotePath, expiresValue);
        if (error || !data?.signedUrl) {
            throw new Error(`Supabase signed URL failed: ${error?.message || "No signed URL returned"}`);
        }
        return data.signedUrl;
    }
}
@@ -0,0 +1,26 @@
1
+ import { LocalAdapter } from "./adapters/local.js";
2
+ import { S3Adapter } from "./adapters/s3.js";
3
+ import { SupabaseAdapter } from "./adapters/supabase.js";
4
export class StorageManager {
    /**
     * Creates an adapter based on configuration.
     *
     * @param {{ type: string, options: object }} config - Storage configuration.
     * @returns {LocalAdapter|S3Adapter|SupabaseAdapter}
     * @throws {Error} On missing required options or an unknown type.
     */
    static createAdapter(config) {
        const { type, options } = config;
        if (type === "local") {
            // Default vault directory when none is configured.
            return new LocalAdapter(options.basePath || "./storage_vault");
        }
        if (type === "s3") {
            if (!options.bucket || !options.region) {
                throw new Error("S3 requires bucket and region");
            }
            return new S3Adapter(options.bucket, options.region, options.credentials);
        }
        if (type === "supabase") {
            if (!options.bucket) {
                throw new Error("Supabase requires bucket");
            }
            return new SupabaseAdapter(options.bucket, options.supabaseUrl, options.supabaseServiceRoleKey);
        }
        throw new Error(`Unsupported storage type: ${type}`);
    }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,73 @@
1
+ import { v4 as uuidv4 } from "uuid";
2
export class ComplianceService {
    /** Persistence layer for compliance checks, consent records, audit logs. */
    store;
    constructor(store) {
        this.store = store;
    }
    // Build a check result for `standard`, persist it, and return it.
    #recordResult(datasetId, standard, issues) {
        const result = {
            dataset_id: datasetId,
            standard,
            passed: issues.length === 0,
            issues,
            timestamp: new Date().toISOString()
        };
        this.store.saveCheck(result);
        return result;
    }
    /**
     * Run a GDPR compliance check: PII flag, PII-related quality warnings,
     * and presence of a valid consent record. Result is persisted.
     */
    async verifyGDPR(dataset) {
        const issues = [];
        // 1. Explicit PII metadata flag
        if (dataset.has_personal_data) {
            issues.push("Dataset explicitly flagged as containing personal data.");
        }
        // 2. Quality warnings mentioning PII / personal data
        for (const warning of dataset.quality_warnings ?? []) {
            const lowered = warning.toLowerCase();
            if (lowered.includes("pii") || lowered.includes("personal")) {
                issues.push(warning);
            }
        }
        // 3. Consent record lookup
        const consent = this.store.getConsent(dataset.id);
        if (!consent || !consent.consent_obtained) {
            issues.push("No valid consent record found for this dataset.");
        }
        return this.#recordResult(dataset.id, "GDPR", issues);
    }
    /**
     * Run a (simulated) HIPAA check: a medical-domain dataset must not carry
     * personal data (i.e. must be de-identified). Result is persisted.
     */
    async verifyHIPAA(dataset) {
        const issues = [];
        const isMedical = dataset.domain === "healthcare" || dataset.domain === "medical";
        if (isMedical && dataset.has_personal_data) {
            issues.push("Medical dataset contains personal data (not de-identified).");
        }
        return this.#recordResult(dataset.id, "HIPAA", issues);
    }
    /** Persist an audit-trail entry for an operation on a dataset. */
    logOperation(userId, datasetId, operation, metadata = {}) {
        this.store.saveAudit({
            id: uuidv4(),
            user_id: userId,
            dataset_id: datasetId,
            operation,
            timestamp: new Date().toISOString(),
            details: JSON.stringify(metadata)
        });
    }
    /**
     * Export audit logs (optionally filtered by dataset) as CSV text.
     * Only the free-form details field is quoted/escaped.
     */
    exportAuditLog(datasetId) {
        const entries = this.store.getAuditLogs(datasetId);
        if (entries.length === 0) {
            return "No audit logs found.";
        }
        const lines = ["ID,Timestamp,User,Operation,Dataset,Details"];
        for (const entry of entries) {
            const quotedDetails = `"${entry.details.replace(/"/g, '""')}"`;
            lines.push(`${entry.id},${entry.timestamp},${entry.user_id},${entry.operation},${entry.dataset_id},${quotedDetails}`);
        }
        return lines.join("\n");
    }
}
@@ -0,0 +1,80 @@
1
export class ComplianceStore {
    /** better-sqlite3-style database handle (exec/prepare/run/all/get). */
    db;
    constructor(db) {
        this.db = db;
        this.init();
    }
    /** Create the compliance tables if they do not exist yet. */
    init() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS audit_logs (
        id TEXT PRIMARY KEY,
        user_id TEXT,
        dataset_id TEXT,
        operation TEXT,
        timestamp TEXT,
        details TEXT
      );

      CREATE TABLE IF NOT EXISTS compliance_checks (
        dataset_id TEXT,
        standard TEXT,
        passed BOOLEAN,
        issues TEXT, -- JSON array
        timestamp TEXT,
        PRIMARY KEY (dataset_id, standard)
      );

      CREATE TABLE IF NOT EXISTS consent_records (
        dataset_id TEXT PRIMARY KEY,
        consent_obtained BOOLEAN,
        source TEXT,
        last_verified TEXT
      );
    `);
    }
    /** Append one audit event (insert-only; ids are expected to be unique). */
    saveAudit(event) {
        const insert = this.db.prepare(`
      INSERT INTO audit_logs (id, user_id, dataset_id, operation, timestamp, details)
      VALUES (?, ?, ?, ?, ?, ?)
    `);
        insert.run(event.id, event.user_id, event.dataset_id, event.operation, event.timestamp, event.details);
    }
    /**
     * Fetch audit events, newest first, optionally filtered to one dataset.
     */
    getAuditLogs(datasetId) {
        const whereClause = datasetId ? " WHERE dataset_id = ?" : "";
        const args = datasetId ? [datasetId] : [];
        const sql = `SELECT * FROM audit_logs${whereClause} ORDER BY timestamp DESC`;
        return this.db.prepare(sql).all(...args);
    }
    /**
     * Upsert a compliance check result; booleans are stored as 1/0 and the
     * issues list is serialized to JSON.
     */
    saveCheck(result) {
        const upsert = this.db.prepare(`
      INSERT INTO compliance_checks (dataset_id, standard, passed, issues, timestamp)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(dataset_id, standard) DO UPDATE SET
        passed=excluded.passed,
        issues=excluded.issues,
        timestamp=excluded.timestamp
    `);
        upsert.run(result.dataset_id, result.standard, result.passed ? 1 : 0, JSON.stringify(result.issues), result.timestamp);
    }
    /** Upsert a consent record (one per dataset). */
    saveConsent(record) {
        const upsert = this.db.prepare(`
      INSERT INTO consent_records (dataset_id, consent_obtained, source, last_verified)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(dataset_id) DO UPDATE SET
        consent_obtained=excluded.consent_obtained,
        source=excluded.source,
        last_verified=excluded.last_verified
    `);
        upsert.run(record.dataset_id, record.consent_obtained ? 1 : 0, record.source, record.last_verified);
    }
    /**
     * Read a consent record, converting the stored 1/0 flag back to a boolean.
     * Returns null when no record exists.
     */
    getConsent(datasetId) {
        const row = this.db.prepare("SELECT * FROM consent_records WHERE dataset_id = ?").get(datasetId);
        if (!row)
            return null;
        return { ...row, consent_obtained: Boolean(row.consent_obtained) };
    }
}
@@ -0,0 +1 @@
// Type-declaration module: all content is erased at compile time, so this
// file intentionally emits an empty ES module at runtime.
export {};