@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,111 @@
1
/**
 * Plans cleaning operations for a dataset from its quality report.
 * Optionally consults a plan cache and appends user-supplied custom rules.
 */
export class CleaningPlanner {
    cache;
    /** @param {object} [cache] - optional cache exposing getPlan/savePlan; omit to disable caching */
    constructor(cache) {
        this.cache = cache;
    }
    /**
     * Generate a cleaning plan based on the quality report and optional custom rules
     * @param {string} datasetId - dataset identifier the plan is keyed by
     * @param {object} report - quality report with `duplicate_rows` and per-column stats
     * @param {object} [ruleSet] - optional custom rule set ({ rules: [...] })
     * @returns {Promise<object>} plan with `operations` and `estimated_impact`
     */
    async generatePlan(datasetId, report, ruleSet) {
        if (this.cache) {
            const cached = await this.cache.getPlan(datasetId, { report, ruleSet });
            if (cached) {
                console.log(`[CleaningPlanner] Cache hit for ${datasetId}`);
                return cached;
            }
        }
        const ops = [];
        let estimatedRowsSaved = 0;
        let estimatedColsSaved = 0;
        // 1. Remove Duplicates (Global)
        if (report.duplicate_rows > 0) {
            ops.push({
                type: "RemoveDuplicates",
                params: {},
                reason: `Found ${report.duplicate_rows} exact duplicate rows`
            });
            estimatedRowsSaved += report.duplicate_rows;
        }
        // 2. Column-level operations
        for (const col of report.columns) {
            // A. Drop Empty / Useless Columns
            if (col.missing_percentage > 90 || col.is_constant) {
                ops.push({
                    type: "DropColumns",
                    params: { columns: [col.name] },
                    reason: col.is_constant ? "Column is constant (zero variance)" : `High missing values (${col.missing_percentage.toFixed(1)}%)`
                });
                estimatedColsSaved++;
                // A dropped column needs no type fix or imputation.
                continue;
            }
            // B. Fix Types
            if (this.shouldFixType(col)) {
                const targetType = col.inferred_type.toLowerCase().includes("numeric") ? "float" : "string";
                ops.push({
                    type: "FixTypes",
                    params: { column: col.name, type: targetType },
                    reason: `Inferred type is ${col.inferred_type} but stored as ${col.type}`
                });
            }
            // C. Impute Missing Values
            if (col.missing_count > 0) {
                let method = "constant";
                let value = "unknown";
                // BUGFIX: inferred_type may be absent on a column; the previous
                // unguarded `col.inferred_type.includes(...)` threw a TypeError.
                if (col.inferred_type?.includes("Numeric") || col.type.includes("Int") || col.type.includes("Float")) {
                    method = "median";
                    value = 0;
                }
                else {
                    method = "constant";
                    value = "missing";
                }
                ops.push({
                    type: "FillMissing",
                    params: { column: col.name, method, value },
                    reason: `${col.missing_count} missing values`
                });
            }
        }
        // 3. Apply Custom Rules
        if (ruleSet) {
            for (const rule of ruleSet.rules) {
                // "*" fans the rule out to every column in the report.
                const targets = rule.condition.column === "*"
                    ? report.columns.map(c => c.name)
                    : [rule.condition.column];
                for (const targetCol of targets) {
                    const colStats = report.columns.find(c => c.name === targetCol);
                    if (!colStats)
                        continue;
                    ops.push({
                        type: rule.action.type,
                        params: { ...rule.action.params, column: targetCol },
                        reason: `Custom Rule: ${rule.name} - ${rule.description}`
                    });
                }
            }
        }
        const plan = {
            dataset_id: datasetId,
            operations: ops,
            estimated_impact: {
                rows_saved: estimatedRowsSaved,
                columns_saved: estimatedColsSaved,
                // Heuristic score: flat 10 plus 5 per planned operation.
                quality_score_improvement: 10 + (ops.length * 5)
            }
        };
        if (this.cache) {
            await this.cache.savePlan(datasetId, { report, ruleSet }, plan);
        }
        return plan;
    }
    /** True when the inferred type is numeric but the stored type is a string variant. */
    shouldFixType(col) {
        if (col.inferred_type && col.inferred_type.includes("Numeric") && (col.type.includes("String") || col.type.includes("Utf8"))) {
            return true;
        }
        return false;
    }
    /** True when the stored type name denotes a numeric column (case-insensitive). */
    isNumeric(col) {
        const t = col.type.toLowerCase();
        return t.includes("int") || t.includes("float") || t.includes("numeric");
    }
}
@@ -0,0 +1,57 @@
1
// --- Domain Presets ---
// Ready-made cleaning rule sets keyed by domain. Each rule pairs a column
// condition with an action; "*" targets every column (expansion happens in
// the planner/evaluator, not here).
export const NLP_PRESET = {
    id: "preset-nlp",
    name: "NLP Data Prep",
    domain: "nlp",
    rules: [
        {
            id: "nlp-1",
            name: "Normalize Case",
            description: "Convert all text to lowercase",
            condition: { column: "*", operator: "is_null", value: false }, // Apply to all non-null if type is string (logic in evaluator)
            action: { type: "NormalizeText", params: { case: "lower" } }
        },
        {
            id: "nlp-2",
            name: "Remove URLs",
            description: "Strip http/https links",
            condition: { column: "*", operator: "contains", value: "http" },
            action: { type: "Replace", params: { pattern: "https?://\\S+", replacement: "" } }
        }
    ]
};
// Healthcare preset: PII masking plus date normalization for HIPAA-style prep.
export const HEALTHCARE_PRESET = {
    id: "preset-healthcare",
    name: "Healthcare (HIPAA) Prep",
    domain: "healthcare",
    rules: [
        {
            id: "hc-1",
            name: "Mask Emails",
            description: "Identify and mask email addresses",
            // Heuristic: any value containing "@" is treated as a candidate email.
            condition: { column: "*", operator: "contains", value: "@" },
            action: { type: "CustomMask", params: { method: "hash", salt: "vesper-pii" } }
        },
        {
            id: "hc-2",
            name: "Normalize Dates",
            description: "Ensure ISO-8601 for DOB/Admit dates",
            condition: { column: "date", operator: "is_null", value: false },
            action: { type: "FixTypes", params: { type: "date" } }
        }
    ]
};
// Finance preset: strip currency symbols so amounts parse as floats.
export const FINANCE_PRESET = {
    id: "preset-finance",
    name: "Financial Data Prep",
    domain: "finance",
    rules: [
        {
            id: "fin-1",
            name: "Currency Cleanup",
            description: "Remove currency symbols and parse as float",
            condition: { column: "amount", operator: "matches_regex", value: "[\\$\\€\\£]" },
            action: { type: "FixTypes", params: { type: "float", strip: "[^0-9\\.]" } }
        }
    ]
};
@@ -0,0 +1 @@
1
// Intentionally empty: likely compiled from a type-only TypeScript module
// (types are erased at build time), so nothing is exported at runtime.
export {};
@@ -0,0 +1,37 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
/**
 * LocalAdapter mimics cloud storage by copying files to a local vault directory.
 * Implements the same adapter surface as the cloud adapters
 * (upload / delete / getSignedUrl).
 */
export class LocalAdapter {
    vaultPath;
    /** @param {string} vaultPath - directory used as the fake remote store (created if missing) */
    constructor(vaultPath) {
        this.vaultPath = path.resolve(vaultPath);
        if (!fs.existsSync(this.vaultPath)) {
            fs.mkdirSync(this.vaultPath, { recursive: true });
        }
    }
    /**
     * Resolves a remote key to an absolute path inside the vault.
     * SECURITY FIX: rejects keys (e.g. "../../etc/passwd" or absolute paths)
     * that would escape the vault — the previous path.join allowed traversal.
     * @throws {Error} when the resolved path lies outside the vault
     */
    resolveInVault(remotePath) {
        const destPath = path.resolve(this.vaultPath, remotePath);
        if (destPath !== this.vaultPath && !destPath.startsWith(this.vaultPath + path.sep)) {
            throw new Error(`Remote path escapes vault: ${remotePath}`);
        }
        return destPath;
    }
    /**
     * Copies a local file into the vault under `remotePath`.
     * @returns {Promise<string>} a file:// URI pointing at the stored copy
     * @throws {Error} when the source file is missing or the key escapes the vault
     */
    async upload(localPath, remotePath) {
        if (!fs.existsSync(localPath)) {
            throw new Error(`Local file not found: ${localPath}`);
        }
        const destPath = this.resolveInVault(remotePath);
        const destDir = path.dirname(destPath);
        if (!fs.existsSync(destDir)) {
            fs.mkdirSync(destDir, { recursive: true });
        }
        fs.copyFileSync(localPath, destPath);
        // Returns a file URI as the "url"
        return `file://${destPath}`;
    }
    /** Removes the stored copy if present (silent no-op when absent). */
    async delete(remotePath) {
        const destPath = this.resolveInVault(remotePath);
        if (fs.existsSync(destPath)) {
            fs.unlinkSync(destPath);
        }
    }
    /**
     * For local storage there is nothing to sign; returns a plain file URI.
     * @param {number} [expiresValue] - accepted for interface parity; ignored here
     */
    async getSignedUrl(remotePath, expiresValue) {
        // For local, just return the file URI
        return `file://${this.resolveInVault(remotePath)}`;
    }
}
@@ -0,0 +1,24 @@
1
/**
 * S3Adapter Stub.
 * A real implementation requires @aws-sdk/client-s3; these methods only emit
 * warnings and fabricate the URLs a genuine S3 bucket would produce.
 */
export class S3Adapter {
    bucket;
    region;
    credentials;
    constructor(bucket, region, credentials) {
        this.bucket = bucket;
        this.region = region;
        this.credentials = credentials;
    }
    /** Builds the virtual-hosted-style URL for an object key. */
    objectUrl(remotePath) {
        return `https://${this.bucket}.s3.${this.region}.amazonaws.com/${remotePath}`;
    }
    /** Stub upload: warns and returns the would-be object URL. */
    async upload(localPath, remotePath) {
        console.warn("S3Adapter: Full implementation requires @aws-sdk/client-s3. This is a stub.");
        return this.objectUrl(remotePath);
    }
    /** Stub delete: warns only, removes nothing. */
    async delete(remotePath) {
        console.warn("S3Adapter: Delete stub called.");
    }
    /** Stub signed URL: the object URL tagged with a stub marker. */
    async getSignedUrl(remotePath) {
        return `${this.objectUrl(remotePath)}?stub=true`;
    }
}
@@ -0,0 +1,20 @@
1
+ import { LocalAdapter } from "./adapters/local.js";
2
+ import { S3Adapter } from "./adapters/s3.js";
3
export class StorageManager {
    /**
     * Creates an adapter based on configuration.
     * Supported types: "local" (filesystem vault) and "s3" (stub adapter).
     * @throws {Error} when required S3 options are missing or the type is unknown
     */
    static createAdapter(config) {
        if (config.type === "local") {
            // Fall back to a default vault directory when none is configured.
            const basePath = config.options.basePath || "./storage_vault";
            return new LocalAdapter(basePath);
        }
        if (config.type === "s3") {
            const { bucket, region, credentials } = config.options;
            if (!bucket || !region) {
                throw new Error("S3 requires bucket and region");
            }
            return new S3Adapter(bucket, region, credentials);
        }
        throw new Error(`Unsupported storage type: ${config.type}`);
    }
}
@@ -0,0 +1 @@
1
// Intentionally empty: likely compiled from a type-only TypeScript module
// (types are erased at build time), so nothing is exported at runtime.
export {};
@@ -0,0 +1,73 @@
1
+ import { v4 as uuidv4 } from "uuid";
2
/**
 * ComplianceService runs simplified GDPR/HIPAA checks over dataset metadata,
 * records audit events, and exports audit logs as CSV. All persistence goes
 * through the injected store.
 */
export class ComplianceService {
    store;
    /** @param {object} store - ComplianceStore-like backend (saveCheck/getConsent/saveAudit/getAuditLogs) */
    constructor(store) {
        this.store = store;
    }
    /**
     * GDPR check: PII flags, PII-related quality warnings, and presence of a
     * valid consent record. The result is persisted via the store.
     * @returns {Promise<object>} check result ({ passed, issues, ... })
     */
    async verifyGDPR(dataset) {
        const issues = [];
        // 1. Check for PII metadata flag
        if (dataset.has_personal_data) {
            issues.push("Dataset explicitly flagged as containing personal data.");
        }
        // 2. Check quality warnings for PII
        if (dataset.quality_warnings) {
            const piiWarnings = dataset.quality_warnings.filter(w => w.toLowerCase().includes("pii") || w.toLowerCase().includes("personal"));
            if (piiWarnings.length > 0) {
                issues.push(...piiWarnings);
            }
        }
        // 3. Check for consent record
        const consent = this.store.getConsent(dataset.id);
        if (!consent || !consent.consent_obtained) {
            issues.push("No valid consent record found for this dataset.");
        }
        const result = {
            dataset_id: dataset.id,
            standard: "GDPR",
            passed: issues.length === 0,
            issues,
            timestamp: new Date().toISOString()
        };
        this.store.saveCheck(result);
        return result;
    }
    /**
     * HIPAA check (simulated): medical datasets must not carry personal data.
     * The result is persisted via the store.
     */
    async verifyHIPAA(dataset) {
        const issues = [];
        // 1. De-identification check (simulated)
        // If domain is medical but no de-identification flag exists
        if (dataset.domain === "healthcare" || dataset.domain === "medical") {
            if (dataset.has_personal_data) {
                issues.push("Medical dataset contains personal data (not de-identified).");
            }
        }
        const result = {
            dataset_id: dataset.id,
            standard: "HIPAA",
            passed: issues.length === 0,
            issues,
            timestamp: new Date().toISOString()
        };
        this.store.saveCheck(result);
        return result;
    }
    /** Records an audit event for an operation a user performed on a dataset. */
    logOperation(userId, datasetId, operation, metadata = {}) {
        const event = {
            id: uuidv4(),
            user_id: userId,
            dataset_id: datasetId,
            operation,
            timestamp: new Date().toISOString(),
            details: JSON.stringify(metadata)
        };
        this.store.saveAudit(event);
    }
    /**
     * Exports audit logs as CSV, optionally filtered by dataset.
     * BUGFIX: every field is now quoted and double-quote-escaped (RFC 4180);
     * previously only `details` was escaped, so a comma or quote in any other
     * field corrupted the row structure.
     */
    exportAuditLog(datasetId) {
        const logs = this.store.getAuditLogs(datasetId);
        if (logs.length === 0)
            return "No audit logs found.";
        const escape = (v) => `"${String(v ?? "").replace(/"/g, '""')}"`;
        const header = "ID,Timestamp,User,Operation,Dataset,Details\n";
        const rows = logs
            .map(l => [l.id, l.timestamp, l.user_id, l.operation, l.dataset_id, l.details].map(escape).join(","))
            .join("\n");
        return header + rows;
    }
}
@@ -0,0 +1,80 @@
1
/**
 * ComplianceStore persists audit logs, compliance-check results, and consent
 * records in SQLite. The db handle is used synchronously via exec/prepare
 * (better-sqlite3-style API — TODO confirm against the caller's db).
 */
export class ComplianceStore {
    db;
    /** @param {object} db - SQLite handle; tables are created on construction */
    constructor(db) {
        this.db = db;
        this.init();
    }
    // Creates the three tables if absent (idempotent via IF NOT EXISTS).
    init() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS audit_logs (
        id TEXT PRIMARY KEY,
        user_id TEXT,
        dataset_id TEXT,
        operation TEXT,
        timestamp TEXT,
        details TEXT
      );

      CREATE TABLE IF NOT EXISTS compliance_checks (
        dataset_id TEXT,
        standard TEXT,
        passed BOOLEAN,
        issues TEXT, -- JSON array
        timestamp TEXT,
        PRIMARY KEY (dataset_id, standard)
      );

      CREATE TABLE IF NOT EXISTS consent_records (
        dataset_id TEXT PRIMARY KEY,
        consent_obtained BOOLEAN,
        source TEXT,
        last_verified TEXT
      );
    `);
    }
    // Appends one audit event row (no dedup; id is the primary key).
    saveAudit(event) {
        const stmt = this.db.prepare(`
      INSERT INTO audit_logs (id, user_id, dataset_id, operation, timestamp, details)
      VALUES (?, ?, ?, ?, ?, ?)
    `);
        stmt.run(event.id, event.user_id, event.dataset_id, event.operation, event.timestamp, event.details);
    }
    // Returns audit rows, newest first; datasetId filter is optional.
    getAuditLogs(datasetId) {
        let query = "SELECT * FROM audit_logs";
        const params = [];
        if (datasetId) {
            query += " WHERE dataset_id = ?";
            params.push(datasetId);
        }
        query += " ORDER BY timestamp DESC";
        return this.db.prepare(query).all(...params);
    }
    // Upserts a compliance-check result keyed by (dataset_id, standard).
    // Booleans are stored as 0/1; issues are serialized to a JSON string.
    saveCheck(result) {
        const stmt = this.db.prepare(`
      INSERT INTO compliance_checks (dataset_id, standard, passed, issues, timestamp)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(dataset_id, standard) DO UPDATE SET
        passed=excluded.passed,
        issues=excluded.issues,
        timestamp=excluded.timestamp
    `);
        stmt.run(result.dataset_id, result.standard, result.passed ? 1 : 0, JSON.stringify(result.issues), result.timestamp);
    }
    // Upserts the consent record for a dataset (one row per dataset).
    saveConsent(record) {
        const stmt = this.db.prepare(`
      INSERT INTO consent_records (dataset_id, consent_obtained, source, last_verified)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(dataset_id) DO UPDATE SET
        consent_obtained=excluded.consent_obtained,
        source=excluded.source,
        last_verified=excluded.last_verified
    `);
        stmt.run(record.dataset_id, record.consent_obtained ? 1 : 0, record.source, record.last_verified);
    }
    // Fetches the consent record, converting the stored 0/1 back to a boolean.
    // Returns null when no record exists.
    getConsent(datasetId) {
        const row = this.db.prepare("SELECT * FROM consent_records WHERE dataset_id = ?").get(datasetId);
        if (!row)
            return null;
        return { ...row, consent_obtained: Boolean(row.consent_obtained) };
    }
}
@@ -0,0 +1 @@
1
// Intentionally empty: likely compiled from a type-only TypeScript module
// (types are erased at build time), so nothing is exported at runtime.
export {};
@@ -0,0 +1,23 @@
1
import { parentPort } from "worker_threads";
// Worker-thread entry point: receives { records, task, params } batches from
// the pool, applies the requested task, and posts the results back.
// parentPort is null when this file is loaded on the main thread, in which
// case the module does nothing.
if (parentPort) {
    parentPort.on("message", (data) => {
        // `params` is currently unused by the implemented tasks.
        const { records, task, params } = data;
        let results = [];
        switch (task) {
            case "transform":
                results = records.map(r => {
                    // Simulated transformation (e.g. alignment/harmonization logic)
                    const updated = { ...r, processed_at: new Date().toISOString() };
                    // Complex CPU bound work simulation
                    let sum = 0;
                    for (let i = 0; i < 1000; i++)
                        sum += i;
                    return updated;
                });
                break;
            default:
                // Unknown tasks pass the records through unchanged.
                results = records;
        }
        parentPort?.postMessage(results);
    });
}
@@ -0,0 +1,38 @@
1
/**
 * StreamProcessor: chunked processing utilities for async-iterable record
 * streams, with optional random sampling.
 */
export class StreamProcessor {
    /**
     * Processes a readable stream in chunks.
     * @param {AsyncIterable} stream - source of records
     * @param {{chunkSize: number, samplePercentage?: number}} options
     * @param {(chunk: any[]) => Promise<any[]>} processor - maps a chunk to processed records
     * @returns {Promise<any[]>} all processed records, in stream order
     */
    static async processInChunks(stream, options, processor) {
        let chunk = [];
        const results = [];
        // NOTE: removed the dead `totalProcessed` counter — it was tallied in
        // both branches but never read or returned.
        for await (const record of stream) {
            // Sampling logic: keep roughly samplePercentage% of records.
            if (options.samplePercentage !== undefined) {
                if (Math.random() * 100 > options.samplePercentage) {
                    continue;
                }
            }
            chunk.push(record);
            if (chunk.length >= options.chunkSize) {
                const processedChunk = await processor(chunk);
                results.push(...processedChunk);
                chunk = [];
            }
        }
        // Process remaining (final partial chunk)
        if (chunk.length > 0) {
            const processedChunk = await processor(chunk);
            results.push(...processedChunk);
        }
        return results;
    }
    /**
     * Creates a sampler predicate that accepts roughly `percentage`% of calls.
     * 100 always accepts; 0 (effectively) always rejects.
     */
    static createSampler(percentage) {
        return () => Math.random() * 100 <= percentage;
    }
}
@@ -0,0 +1,39 @@
1
import { Worker } from "worker_threads";
import path from "path";
import { fileURLToPath } from "url";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
/**
 * WorkerPool fans CPU-bound record processing out to worker threads running
 * processing-worker.js (compiled alongside this file).
 */
export class WorkerPool {
    poolSize;
    workers = [];
    queue = [];
    activeWorkers = 0;
    workerPath;
    /** @param {number} [poolSize=4] - target chunk parallelism (workers are per-call, not pooled yet) */
    constructor(poolSize = 4) {
        this.poolSize = poolSize;
        // BUGFIX: the published build must spawn the compiled .js worker.
        // Previously this path was computed but unused, and process() spawned
        // "./processing-worker.ts" via the dev-only tsx loader, which does not
        // exist in the build output.
        this.workerPath = path.resolve(__dirname, "processing-worker.js");
    }
    /**
     * Runs one task over `records` in a fresh worker thread.
     * @returns {Promise<any[]>} the worker's processed records
     */
    async process(records, task = "transform") {
        return new Promise((resolve, reject) => {
            const worker = new Worker(this.workerPath);
            worker.postMessage({ records, task });
            worker.on("message", (results) => {
                worker.terminate();
                resolve(results);
            });
            worker.on("error", (err) => {
                // Ensure the thread is released even on failure.
                worker.terminate();
                reject(err);
            });
        });
    }
    /**
     * Splits records into chunks and processes them in parallel across workers.
     */
    async processParallel(records, chunkSize) {
        const chunks = [];
        for (let i = 0; i < records.length; i += chunkSize) {
            chunks.push(records.slice(i, i + chunkSize));
        }
        const results = await Promise.all(chunks.map(chunk => this.process(chunk)));
        return results.flat();
    }
}
@@ -0,0 +1,45 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
/**
 * DataExporter converts dataset files between formats by shelling out to the
 * Python export engine (src/python/export_engine.py).
 */
export class DataExporter {
    pythonPath = "python";
    scriptPath;
    /** @param {string} [projectRoot] - root containing src/python/export_engine.py */
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "export_engine.py");
    }
    /**
     * Exports a dataset file to a specified format
     * @param {string} inputFile - existing dataset file to convert
     * @param {string} outputFile - destination path for the converted file
     * @param {string} format - target format identifier (e.g. csv, parquet)
     * @param {object} [options] - engine options, serialized to JSON for the script
     * @returns {Promise<object>} parsed JSON result emitted by the engine
     */
    async export(inputFile, outputFile, format, options = {}) {
        return new Promise((resolve, reject) => {
            if (!fs.existsSync(inputFile)) {
                reject(new Error(`Input file not found: ${inputFile}`));
                return;
            }
            const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
            // Renamed from `process` — the old name shadowed the Node global.
            const child = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => stdout += data.toString());
            child.stderr.on("data", (data) => stderr += data.toString());
            // BUGFIX: without an "error" handler, a missing/unlaunchable python
            // binary raised an unhandled 'error' event and crashed the process
            // instead of rejecting this promise.
            child.on("error", (err) => {
                reject(new Error(`Failed to launch ${this.pythonPath}: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Export failed: ${stderr || stdout}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse export output: ${stdout}`));
                }
            });
        });
    }
}
@@ -0,0 +1,100 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import crypto from "crypto";
4
/**
 * MetadataPackager bundles data files with metadata and quality reports.
 * Follows a simplified Frictionless Data Package standard.
 */
export class MetadataPackager {
    /**
     * Assembles a data package in `targetDir`: copies each data file in,
     * writes optional artifacts, and emits a datapackage.json manifest.
     * Returns { success, packagePath, manifest } — failures are captured
     * into the result rather than thrown.
     */
    async createPackage(targetDir, dataFiles, metadata, extraArtifacts) {
        try {
            if (!fs.existsSync(targetDir)) {
                fs.mkdirSync(targetDir, { recursive: true });
            }
            // 1. Copy data files into the package and describe each one.
            const resources = dataFiles.map((file) => this.buildDataResource(targetDir, file));
            // 2. Write optional artifacts (quality report, cleaning log) as JSON.
            const artifactSpecs = [
                ["qualityReport", "quality-report", "quality_report.json"],
                ["cleaningLog", "cleaning-log", "cleaning_log.json"]
            ];
            for (const [key, resourceName, fileName] of artifactSpecs) {
                const payload = extraArtifacts?.[key];
                if (payload) {
                    fs.writeFileSync(path.join(targetDir, fileName), JSON.stringify(payload, null, 2));
                    resources.push({ name: resourceName, path: fileName, format: "json" });
                }
            }
            // 3. Emit the datapackage.json manifest.
            const manifest = {
                profile: "tabular-data-package",
                name: metadata.name.toLowerCase().replace(/\s+/g, "-"),
                title: metadata.name,
                description: metadata.description,
                version: metadata.version,
                homepage: "https://github.com/vesper-data",
                license: metadata.license,
                author: metadata.author,
                keywords: metadata.tags,
                created: new Date().toISOString(),
                resources: resources
            };
            fs.writeFileSync(path.join(targetDir, "datapackage.json"), JSON.stringify(manifest, null, 2));
            return { success: true, packagePath: targetDir, manifest: manifest };
        }
        catch (e) {
            return { success: false, packagePath: targetDir, manifest: null, error: e.message };
        }
    }
    /**
     * Copies one data file into the package directory and returns its resource
     * descriptor (name, relative path, format, media type, size, sha256).
     * Throws when the source file does not exist.
     */
    buildDataResource(targetDir, file) {
        if (!fs.existsSync(file.path)) {
            throw new Error(`Data file not found: ${file.path}`);
        }
        const fileName = path.basename(file.path);
        const destPath = path.join(targetDir, fileName);
        fs.copyFileSync(file.path, destPath);
        const contents = fs.readFileSync(destPath);
        const digest = crypto.createHash("sha256").update(contents).digest("hex");
        const { size } = fs.statSync(destPath);
        return {
            name: file.name,
            path: fileName,
            format: file.format,
            mediatype: this.getMediaType(file.format),
            bytes: size,
            hash: `sha256:${digest}`
        };
    }
    /** Maps an export format to its MIME type; unknown formats get octet-stream. */
    getMediaType(format) {
        const mediaTypes = new Map([
            ["csv", "text/csv"],
            ["parquet", "application/x-parquet"],
            ["jsonl", "application/x-jsonlines"],
            ["arrow", "application/x-apache-arrow-file"],
            ["json", "application/json"]
        ]);
        return mediaTypes.get(format.toLowerCase()) ?? "application/octet-stream";
    }
}
@@ -0,0 +1 @@
1
// Intentionally empty: likely compiled from a type-only TypeScript module
// (types are erased at build time), so nothing is exported at runtime.
export {};