@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
package/build/cleaning/planner.js
@@ -0,0 +1,111 @@
export class CleaningPlanner {
    cache;
    constructor(cache) {
        this.cache = cache;
    }
    /**
     * Generate a cleaning plan based on the quality report and optional custom rules
     */
    async generatePlan(datasetId, report, ruleSet) {
        if (this.cache) {
            const cached = await this.cache.getPlan(datasetId, { report, ruleSet });
            if (cached) {
                console.log(`[CleaningPlanner] Cache hit for ${datasetId}`);
                return cached;
            }
        }
        const ops = [];
        let estimatedRowsSaved = 0;
        let estimatedColsSaved = 0;
        // 1. Remove Duplicates (Global)
        if (report.duplicate_rows > 0) {
            ops.push({
                type: "RemoveDuplicates",
                params: {},
                reason: `Found ${report.duplicate_rows} exact duplicate rows`
            });
            estimatedRowsSaved += report.duplicate_rows;
        }
        // 2. Column-level operations
        for (const col of report.columns) {
            // A. Drop Empty / Useless Columns
            if (col.missing_percentage > 90 || col.is_constant) {
                ops.push({
                    type: "DropColumns",
                    params: { columns: [col.name] },
                    reason: col.is_constant ? "Column is constant (zero variance)" : `High missing values (${col.missing_percentage.toFixed(1)}%)`
                });
                estimatedColsSaved++;
                continue;
            }
            // B. Fix Types
            if (this.shouldFixType(col)) {
                const targetType = col.inferred_type.toLowerCase().includes("numeric") ? "float" : "string";
                ops.push({
                    type: "FixTypes",
                    params: { column: col.name, type: targetType },
                    reason: `Inferred type is ${col.inferred_type} but stored as ${col.type}`
                });
            }
            // C. Impute Missing Values
            if (col.missing_count > 0) {
                let method = "constant";
                let value = "unknown";
                if (col.inferred_type.includes("Numeric") || col.type.includes("Int") || col.type.includes("Float")) {
                    method = "median";
                    value = 0;
                }
                else {
                    method = "constant";
                    value = "missing";
                }
                ops.push({
                    type: "FillMissing",
                    params: { column: col.name, method, value },
                    reason: `${col.missing_count} missing values`
                });
            }
        }
        // 3. Apply Custom Rules
        if (ruleSet) {
            for (const rule of ruleSet.rules) {
                const targets = rule.condition.column === "*"
                    ? report.columns.map(c => c.name)
                    : [rule.condition.column];
                for (const targetCol of targets) {
                    const colStats = report.columns.find(c => c.name === targetCol);
                    if (!colStats)
                        continue;
                    ops.push({
                        type: rule.action.type,
                        params: { ...rule.action.params, column: targetCol },
                        reason: `Custom Rule: ${rule.name} - ${rule.description}`
                    });
                }
            }
        }
        const plan = {
            dataset_id: datasetId,
            operations: ops,
            estimated_impact: {
                rows_saved: estimatedRowsSaved,
                columns_saved: estimatedColsSaved,
                quality_score_improvement: 10 + (ops.length * 5)
            }
        };
        if (this.cache) {
            await this.cache.savePlan(datasetId, { report, ruleSet }, plan);
        }
        return plan;
    }
    shouldFixType(col) {
        if (col.inferred_type && col.inferred_type.includes("Numeric") && (col.type.includes("String") || col.type.includes("Utf8"))) {
            return true;
        }
        return false;
    }
    isNumeric(col) {
        const t = col.type.toLowerCase();
        return t.includes("int") || t.includes("float") || t.includes("numeric");
    }
}
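
A minimal usage sketch for the planner above, assuming a quality report shaped like the fields it reads (`duplicate_rows`, plus per-column `name`, `type`, `inferred_type`, `missing_count`, `missing_percentage`, `is_constant`); the dataset id and report values are hypothetical:

import { CleaningPlanner } from "./planner.js";

// Hypothetical quality report; field names inferred from the planner's checks.
const report = {
    duplicate_rows: 12,
    columns: [
        { name: "age", type: "Utf8", inferred_type: "Numeric", missing_count: 3, missing_percentage: 1.5, is_constant: false },
        { name: "notes", type: "Utf8", inferred_type: "String", missing_count: 950, missing_percentage: 95.0, is_constant: false }
    ]
};

const planner = new CleaningPlanner(); // cache is optional
const plan = await planner.generatePlan("demo-dataset", report);
console.log(plan.operations.map(op => op.type));
// ["RemoveDuplicates", "FixTypes", "FillMissing", "DropColumns"]
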
package/build/cleaning/rules.js
@@ -0,0 +1,57 @@
// --- Domain Presets ---
export const NLP_PRESET = {
    id: "preset-nlp",
    name: "NLP Data Prep",
    domain: "nlp",
    rules: [
        {
            id: "nlp-1",
            name: "Normalize Case",
            description: "Convert all text to lowercase",
            condition: { column: "*", operator: "is_null", value: false }, // Apply to all non-null if type is string (logic in evaluator)
            action: { type: "NormalizeText", params: { case: "lower" } }
        },
        {
            id: "nlp-2",
            name: "Remove URLs",
            description: "Strip http/https links",
            condition: { column: "*", operator: "contains", value: "http" },
            action: { type: "Replace", params: { pattern: "https?://\\S+", replacement: "" } }
        }
    ]
};
export const HEALTHCARE_PRESET = {
    id: "preset-healthcare",
    name: "Healthcare (HIPAA) Prep",
    domain: "healthcare",
    rules: [
        {
            id: "hc-1",
            name: "Mask Emails",
            description: "Identify and mask email addresses",
            condition: { column: "*", operator: "contains", value: "@" },
            action: { type: "CustomMask", params: { method: "hash", salt: "vesper-pii" } }
        },
        {
            id: "hc-2",
            name: "Normalize Dates",
            description: "Ensure ISO-8601 for DOB/Admit dates",
            condition: { column: "date", operator: "is_null", value: false },
            action: { type: "FixTypes", params: { type: "date" } }
        }
    ]
};
export const FINANCE_PRESET = {
    id: "preset-finance",
    name: "Financial Data Prep",
    domain: "finance",
    rules: [
        {
            id: "fin-1",
            name: "Currency Cleanup",
            description: "Remove currency symbols and parse as float",
            condition: { column: "amount", operator: "matches_regex", value: "[\\$\\€\\£]" },
            action: { type: "FixTypes", params: { type: "float", strip: "[^0-9\\.]" } }
        }
    ]
};
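
These presets are plain rule sets, so they can be passed straight to `CleaningPlanner.generatePlan` as the `ruleSet` argument; a wildcard `column: "*"` fans out to one operation per column in the report. Continuing the hypothetical example above:

import { NLP_PRESET } from "./rules.js";

const nlpPlan = await planner.generatePlan("demo-dataset", report, NLP_PRESET);
// With two "*" rules and two report columns, four custom-rule operations are appended.
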
package/build/cleaning/types.js
@@ -0,0 +1 @@
export {};
package/build/cloud/adapters/local.js
@@ -0,0 +1,37 @@
import fs from "fs";
import path from "path";
/**
 * LocalAdapter mimics cloud storage by copying files to a local vault directory.
 */
export class LocalAdapter {
    vaultPath;
    constructor(vaultPath) {
        this.vaultPath = path.resolve(vaultPath);
        if (!fs.existsSync(this.vaultPath)) {
            fs.mkdirSync(this.vaultPath, { recursive: true });
        }
    }
    async upload(localPath, remotePath) {
        if (!fs.existsSync(localPath)) {
            throw new Error(`Local file not found: ${localPath}`);
        }
        const destPath = path.join(this.vaultPath, remotePath);
        const destDir = path.dirname(destPath);
        if (!fs.existsSync(destDir)) {
            fs.mkdirSync(destDir, { recursive: true });
        }
        fs.copyFileSync(localPath, destPath);
        // Returns a file URI as the "url"
        return `file://${destPath}`;
    }
    async delete(remotePath) {
        const destPath = path.join(this.vaultPath, remotePath);
        if (fs.existsSync(destPath)) {
            fs.unlinkSync(destPath);
        }
    }
    async getSignedUrl(remotePath, expiresValue) {
        // For local, just return the file URI
        return `file://${path.join(this.vaultPath, remotePath)}`;
    }
}
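
A short sketch of the adapter in isolation (paths hypothetical); note that `upload` returns a `file://` URI rather than an HTTP URL:

import { LocalAdapter } from "./adapters/local.js";

const vault = new LocalAdapter("./storage_vault");
const url = await vault.upload("./exports/train.csv", "datasets/demo/train.csv");
console.log(url); // file://<resolved vault path>/datasets/demo/train.csv
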
package/build/cloud/adapters/s3.js
@@ -0,0 +1,24 @@
/**
 * S3Adapter Stub.
 * Note: Requires @aws-sdk/client-s3 to be installed for full functionality.
 */
export class S3Adapter {
    bucket;
    region;
    credentials;
    constructor(bucket, region, credentials) {
        this.bucket = bucket;
        this.region = region;
        this.credentials = credentials;
    }
    async upload(localPath, remotePath) {
        console.warn("S3Adapter: Full implementation requires @aws-sdk/client-s3. This is a stub.");
        return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}`;
    }
    async delete(remotePath) {
        console.warn("S3Adapter: Delete stub called.");
    }
    async getSignedUrl(remotePath) {
        return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}?stub=true`;
    }
}
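
Since `upload` here only fabricates a URL without transferring anything, a working version would need the AWS SDK. A sketch of what that might look like with `@aws-sdk/client-s3` (not bundled with this package; the helper below is illustrative, not the package's code):

import fs from "fs";
import { S3Client, PutObjectCommand } from "@aws-sdk/client-s3";

// Illustrative stand-in for S3Adapter.upload.
async function uploadToS3(bucket, region, localPath, remotePath) {
    const client = new S3Client({ region });
    await client.send(new PutObjectCommand({
        Bucket: bucket,
        Key: remotePath,
        Body: fs.createReadStream(localPath)
    }));
    return `https://${bucket}.s3.${region}.amazonaws.com/${remotePath}`;
}
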
package/build/cloud/storage-manager.js
@@ -0,0 +1,20 @@
import { LocalAdapter } from "./adapters/local.js";
import { S3Adapter } from "./adapters/s3.js";
export class StorageManager {
    /**
     * Creates an adapter based on configuration
     */
    static createAdapter(config) {
        switch (config.type) {
            case "local":
                return new LocalAdapter(config.options.basePath || "./storage_vault");
            case "s3":
                if (!config.options.bucket || !config.options.region) {
                    throw new Error("S3 requires bucket and region");
                }
                return new S3Adapter(config.options.bucket, config.options.region, config.options.credentials);
            default:
                throw new Error(`Unsupported storage type: ${config.type}`);
        }
    }
}
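
The config shape implied by `createAdapter` is a `type` plus an `options` bag; for example:

import { StorageManager } from "./storage-manager.js";

const local = StorageManager.createAdapter({ type: "local", options: { basePath: "./storage_vault" } });
const s3 = StorageManager.createAdapter({ type: "s3", options: { bucket: "my-bucket", region: "us-east-1" } });
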
package/build/cloud/types.js
@@ -0,0 +1 @@
export {};
package/build/compliance/service.js
@@ -0,0 +1,73 @@
import { v4 as uuidv4 } from "uuid";
export class ComplianceService {
    store;
    constructor(store) {
        this.store = store;
    }
    async verifyGDPR(dataset) {
        const issues = [];
        // 1. Check for PII metadata flag
        if (dataset.has_personal_data) {
            issues.push("Dataset explicitly flagged as containing personal data.");
        }
        // 2. Check quality warnings for PII
        if (dataset.quality_warnings) {
            const piiWarnings = dataset.quality_warnings.filter(w => w.toLowerCase().includes("pii") || w.toLowerCase().includes("personal"));
            if (piiWarnings.length > 0) {
                issues.push(...piiWarnings);
            }
        }
        // 3. Check for consent record
        const consent = this.store.getConsent(dataset.id);
        if (!consent || !consent.consent_obtained) {
            issues.push("No valid consent record found for this dataset.");
        }
        const result = {
            dataset_id: dataset.id,
            standard: "GDPR",
            passed: issues.length === 0,
            issues,
            timestamp: new Date().toISOString()
        };
        this.store.saveCheck(result);
        return result;
    }
    async verifyHIPAA(dataset) {
        const issues = [];
        // 1. De-identification check (simulated)
        // If domain is medical but no de-identification flag exists
        if (dataset.domain === "healthcare" || dataset.domain === "medical") {
            if (dataset.has_personal_data) {
                issues.push("Medical dataset contains personal data (not de-identified).");
            }
        }
        const result = {
            dataset_id: dataset.id,
            standard: "HIPAA",
            passed: issues.length === 0,
            issues,
            timestamp: new Date().toISOString()
        };
        this.store.saveCheck(result);
        return result;
    }
    logOperation(userId, datasetId, operation, metadata = {}) {
        const event = {
            id: uuidv4(),
            user_id: userId,
            dataset_id: datasetId,
            operation,
            timestamp: new Date().toISOString(),
            details: JSON.stringify(metadata)
        };
        this.store.saveAudit(event);
    }
    exportAuditLog(datasetId) {
        const logs = this.store.getAuditLogs(datasetId);
        if (logs.length === 0)
            return "No audit logs found.";
        const header = "ID,Timestamp,User,Operation,Dataset,Details\n";
        const rows = logs.map(l => `${l.id},${l.timestamp},${l.user_id},${l.operation},${l.dataset_id},"${l.details.replace(/"/g, '""')}"`).join("\n");
        return header + rows;
    }
}
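
A usage sketch, assuming a dataset record carrying the fields the checks read (`id`, `domain`, `has_personal_data`, `quality_warnings`) and a `ComplianceStore` as defined in the next file; the values are hypothetical:

const compliance = new ComplianceService(store); // store: a ComplianceStore instance

const dataset = { id: "ds-001", domain: "healthcare", has_personal_data: false, quality_warnings: [] };
const gdpr = await compliance.verifyGDPR(dataset);
if (!gdpr.passed) console.log(gdpr.issues); // e.g. ["No valid consent record found for this dataset."]

compliance.logOperation("user-1", "ds-001", "gdpr_check");
console.log(compliance.exportAuditLog("ds-001")); // CSV string of the audit trail
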
package/build/compliance/store.js
@@ -0,0 +1,80 @@
export class ComplianceStore {
    db;
    constructor(db) {
        this.db = db;
        this.init();
    }
    init() {
        this.db.exec(`
            CREATE TABLE IF NOT EXISTS audit_logs (
                id TEXT PRIMARY KEY,
                user_id TEXT,
                dataset_id TEXT,
                operation TEXT,
                timestamp TEXT,
                details TEXT
            );

            CREATE TABLE IF NOT EXISTS compliance_checks (
                dataset_id TEXT,
                standard TEXT,
                passed BOOLEAN,
                issues TEXT, -- JSON array
                timestamp TEXT,
                PRIMARY KEY (dataset_id, standard)
            );

            CREATE TABLE IF NOT EXISTS consent_records (
                dataset_id TEXT PRIMARY KEY,
                consent_obtained BOOLEAN,
                source TEXT,
                last_verified TEXT
            );
        `);
    }
    saveAudit(event) {
        const stmt = this.db.prepare(`
            INSERT INTO audit_logs (id, user_id, dataset_id, operation, timestamp, details)
            VALUES (?, ?, ?, ?, ?, ?)
        `);
        stmt.run(event.id, event.user_id, event.dataset_id, event.operation, event.timestamp, event.details);
    }
    getAuditLogs(datasetId) {
        let query = "SELECT * FROM audit_logs";
        const params = [];
        if (datasetId) {
            query += " WHERE dataset_id = ?";
            params.push(datasetId);
        }
        query += " ORDER BY timestamp DESC";
        return this.db.prepare(query).all(...params);
    }
    saveCheck(result) {
        const stmt = this.db.prepare(`
            INSERT INTO compliance_checks (dataset_id, standard, passed, issues, timestamp)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(dataset_id, standard) DO UPDATE SET
                passed=excluded.passed,
                issues=excluded.issues,
                timestamp=excluded.timestamp
        `);
        stmt.run(result.dataset_id, result.standard, result.passed ? 1 : 0, JSON.stringify(result.issues), result.timestamp);
    }
    saveConsent(record) {
        const stmt = this.db.prepare(`
            INSERT INTO consent_records (dataset_id, consent_obtained, source, last_verified)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(dataset_id) DO UPDATE SET
                consent_obtained=excluded.consent_obtained,
                source=excluded.source,
                last_verified=excluded.last_verified
        `);
        stmt.run(record.dataset_id, record.consent_obtained ? 1 : 0, record.source, record.last_verified);
    }
    getConsent(datasetId) {
        const row = this.db.prepare("SELECT * FROM consent_records WHERE dataset_id = ?").get(datasetId);
        if (!row)
            return null;
        return { ...row, consent_obtained: Boolean(row.consent_obtained) };
    }
}
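
The store relies on a synchronous `exec`/`prepare`/`run`/`get`/`all` driver surface, which matches better-sqlite3 (an assumption; this diff alone does not show which driver the package wires in):

import Database from "better-sqlite3"; // assumed driver
import { ComplianceStore } from "./store.js";

const store = new ComplianceStore(new Database("compliance.db"));
store.saveConsent({
    dataset_id: "ds-001",
    consent_obtained: true,
    source: "data-provider-agreement",
    last_verified: new Date().toISOString()
});
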
package/build/compliance/types.js
@@ -0,0 +1 @@
export {};
package/build/data/processing-worker.js
@@ -0,0 +1,23 @@
import { parentPort } from "worker_threads";
if (parentPort) {
    parentPort.on("message", (data) => {
        const { records, task, params } = data;
        let results = [];
        switch (task) {
            case "transform":
                results = records.map(r => {
                    // Simulated transformation (e.g. alignment/harmonization logic)
                    const updated = { ...r, processed_at: new Date().toISOString() };
                    // Complex CPU bound work simulation
                    let sum = 0;
                    for (let i = 0; i < 1000; i++)
                        sum += i;
                    return updated;
                });
                break;
            default:
                results = records;
        }
        parentPort?.postMessage(results);
    });
}
package/build/data/streaming.js
@@ -0,0 +1,38 @@
export class StreamProcessor {
    /**
     * Processes a readable stream in chunks.
     */
    static async processInChunks(stream, options, processor) {
        let chunk = [];
        const results = [];
        let totalProcessed = 0;
        for await (const record of stream) {
            // Sampling logic
            if (options.samplePercentage !== undefined) {
                if (Math.random() * 100 > options.samplePercentage) {
                    continue;
                }
            }
            chunk.push(record);
            if (chunk.length >= options.chunkSize) {
                const processedChunk = await processor(chunk);
                results.push(...processedChunk);
                totalProcessed += chunk.length;
                chunk = [];
            }
        }
        // Process remaining
        if (chunk.length > 0) {
            const processedChunk = await processor(chunk);
            results.push(...processedChunk);
            totalProcessed += chunk.length;
        }
        return results;
    }
    /**
     * Creates a sampler stream that only emits a percentage of records.
     */
    static createSampler(percentage) {
        return () => Math.random() * 100 <= percentage;
    }
}
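
`processInChunks` accepts any async iterable as the stream, so a generator works; a sketch with hypothetical records:

import { StreamProcessor } from "./streaming.js";

async function* generateRecords(n) {
    for (let i = 0; i < n; i++) yield { id: i };
}

const processed = await StreamProcessor.processInChunks(
    generateRecords(10000),
    { chunkSize: 500, samplePercentage: 10 }, // keep ~10% of records, flush every 500
    async (chunk) => chunk.map(r => ({ ...r, ok: true }))
);
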
package/build/data/worker-pool.js
@@ -0,0 +1,39 @@
import { Worker } from "worker_threads";
import path from "path";
import { fileURLToPath } from "url";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export class WorkerPool {
    poolSize;
    workers = [];
    queue = [];
    activeWorkers = 0;
    constructor(poolSize = 4) {
        this.poolSize = poolSize;
        const workerPath = path.resolve(__dirname, "processing-worker.js");
        // Note: In development with tsx, we might need to handle .ts vs .js
    }
    async process(records, task = "transform") {
        return new Promise((resolve, reject) => {
            const worker = new Worker(new URL("./processing-worker.ts", import.meta.url), {
                execArgv: ["--import", "tsx"]
            });
            worker.postMessage({ records, task });
            worker.on("message", (results) => {
                worker.terminate();
                resolve(results);
            });
            worker.on("error", reject);
        });
    }
    /**
     * Splits records into chunks and processes them in parallel across workers.
     */
    async processParallel(records, chunkSize) {
        const chunks = [];
        for (let i = 0; i < records.length; i += chunkSize) {
            chunks.push(records.slice(i, i + chunkSize));
        }
        const results = await Promise.all(chunks.map(chunk => this.process(chunk)));
        return results.flat();
    }
}
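
Note that as compiled, `process` spawns `./processing-worker.ts` through tsx's `--import` hook rather than the adjacent compiled `processing-worker.js`, so running this build output appears to require tsx and the TypeScript sources on disk. Intended usage, with that caveat:

const pool = new WorkerPool(4);
const records = Array.from({ length: 2000 }, (_, i) => ({ id: i }));
const transformed = await pool.processParallel(records, 500); // four chunks, one worker each
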
package/build/export/exporter.js
@@ -0,0 +1,45 @@
import { spawn } from "child_process";
import path from "path";
import fs from "fs";
export class DataExporter {
    pythonPath = "python";
    scriptPath;
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "export_engine.py");
    }
    /**
     * Exports a dataset file to a specified format
     */
    async export(inputFile, outputFile, format, options = {}) {
        return new Promise((resolve, reject) => {
            if (!fs.existsSync(inputFile)) {
                reject(new Error(`Input file not found: ${inputFile}`));
                return;
            }
            const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
            const process = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            process.stdout.on("data", (data) => stdout += data.toString());
            process.stderr.on("data", (data) => stderr += data.toString());
            process.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Export failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse export output: ${stdout}`));
                }
            });
        });
    }
}
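
`export` shells out to `src/python/export_engine.py` under `projectRoot` and expects JSON on stdout, so a `python` binary and that script must be reachable; a hypothetical call (the resolved result shape depends on what export_engine.py prints):

import { DataExporter } from "./exporter.js";

const exporter = new DataExporter();
const result = await exporter.export("./data/clean.csv", "./data/clean.parquet", "parquet");
console.log(result); // whatever JSON export_engine.py emits on success
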
package/build/export/packager.js
@@ -0,0 +1,100 @@
import fs from "fs";
import path from "path";
import crypto from "crypto";
/**
 * MetadataPackager bundles data files with metadata and quality reports.
 * Follows a simplified Frictionless Data Package standard.
 */
export class MetadataPackager {
    /**
     * Creates a data package in the target directory
     */
    async createPackage(targetDir, dataFiles, metadata, extraArtifacts) {
        try {
            if (!fs.existsSync(targetDir)) {
                fs.mkdirSync(targetDir, { recursive: true });
            }
            const resources = [];
            // 1. Process Data Files
            for (const file of dataFiles) {
                if (!fs.existsSync(file.path)) {
                    throw new Error(`Data file not found: ${file.path}`);
                }
                const fileName = path.basename(file.path);
                const destPath = path.join(targetDir, fileName);
                // Copy file to package dir
                fs.copyFileSync(file.path, destPath);
                // Compute hash and size
                const fileBuffer = fs.readFileSync(destPath);
                const hash = crypto.createHash("sha256").update(fileBuffer).digest("hex");
                const stats = fs.statSync(destPath);
                resources.push({
                    name: file.name,
                    path: fileName,
                    format: file.format,
                    mediatype: this.getMediaType(file.format),
                    bytes: stats.size,
                    hash: `sha256:${hash}`
                });
            }
            // 2. Add Extra Artifacts
            if (extraArtifacts?.qualityReport) {
                const qrPath = path.join(targetDir, "quality_report.json");
                fs.writeFileSync(qrPath, JSON.stringify(extraArtifacts.qualityReport, null, 2));
                resources.push({
                    name: "quality-report",
                    path: "quality_report.json",
                    format: "json"
                });
            }
            if (extraArtifacts?.cleaningLog) {
                const clPath = path.join(targetDir, "cleaning_log.json");
                fs.writeFileSync(clPath, JSON.stringify(extraArtifacts.cleaningLog, null, 2));
                resources.push({
                    name: "cleaning-log",
                    path: "cleaning_log.json",
                    format: "json"
                });
            }
            // 3. Generate Manifest (datapackage.json)
            const manifest = {
                profile: "tabular-data-package",
                name: metadata.name.toLowerCase().replace(/\s+/g, "-"),
                title: metadata.name,
                description: metadata.description,
                version: metadata.version,
                homepage: "https://github.com/vesper-data",
                license: metadata.license,
                author: metadata.author,
                keywords: metadata.tags,
                created: new Date().toISOString(),
                resources: resources
            };
            const manifestPath = path.join(targetDir, "datapackage.json");
            fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
            return {
                success: true,
                packagePath: targetDir,
                manifest: manifest
            };
        }
        catch (e) {
            return {
                success: false,
                packagePath: targetDir,
                manifest: null,
                error: e.message
            };
        }
    }
    getMediaType(format) {
        switch (format.toLowerCase()) {
            case "csv": return "text/csv";
            case "parquet": return "application/x-parquet";
            case "jsonl": return "application/x-jsonlines";
            case "arrow": return "application/x-apache-arrow-file";
            case "json": return "application/json";
            default: return "application/octet-stream";
        }
    }
}
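
A sketch of building a package (metadata values hypothetical); `createPackage` copies each data file into `targetDir`, hashes it, and writes a `datapackage.json` manifest alongside:

import { MetadataPackager } from "./packager.js";

const packager = new MetadataPackager();
const result = await packager.createPackage(
    "./packages/demo",
    [{ name: "train", path: "./data/train.csv", format: "csv" }],
    { name: "Demo Dataset", description: "Example package", version: "1.0.0",
      license: "MIT", author: "example", tags: ["demo"] }
);
if (result.success) console.log(result.manifest.resources[0].hash); // "sha256:..."
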
package/build/export/types.js
@@ -0,0 +1 @@
export {};