@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
 * Maps dataset-specific column names onto the canonical names declared in the
 * fusion configuration and resolves a common type for a canonical column.
 */
export class SchemaAligner {
    config;
    /** Lower-cased type names that count as numeric when widening mixed types. */
    static #NUMERIC_TYPE = /^(?:u?int(?:eger)?\d*|long|short|float\d*|double|number|numeric|decimal)$/;
    /**
     * @param {object} config Fusion configuration. This class reads
     *   `column_aliases` (canonical name -> array of aliases) and
     *   `type_overrides` (canonical name -> forced type).
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Finds common columns across datasets and creates a mapping to canonical names.
     *
     * @param {Array<{id: string, columns?: Array<{name: string}>}>} datasets
     * @returns {Object<string, Object<string, string>>} For each dataset id, a
     *   map of source column name -> canonical name. Columns without a
     *   canonical name are omitted; datasets without `columns` get `{}`.
     */
    align(datasets) {
        const alignment = {};
        for (const ds of datasets) {
            const columnMap = {};
            alignment[ds.id] = columnMap;
            if (!ds.columns)
                continue;
            for (const col of ds.columns) {
                const canonicalName = this.getCanonicalName(col.name);
                if (canonicalName) {
                    columnMap[col.name] = canonicalName;
                }
            }
        }
        return alignment;
    }
    /**
     * Resolves the most specific common type for a canonical column.
     *
     * @param {string} canonicalName Canonical column name.
     * @param {string[]} types Type names observed for that column.
     * @returns {string} The configured override if there is one; otherwise the
     *   single (lower-cased) type when all inputs agree, "number" when the
     *   inputs are all numeric and include "float" or "number", and "string"
     *   for every other mix.
     */
    resolveType(canonicalName, types) {
        // `type_overrides` may be absent from the config; treat that as "no override".
        const override = this.config.type_overrides?.[canonicalName];
        if (override) {
            return override;
        }
        const uniqueTypes = [...new Set(types.map((t) => t.toLowerCase()))];
        if (uniqueTypes.length === 1) {
            return uniqueTypes[0];
        }
        const hasFloating = uniqueTypes.includes("float") || uniqueTypes.includes("number");
        // Only widen to "number" when nothing non-numeric is mixed in; a
        // float/string mix cannot be represented as a number.
        const allNumeric = uniqueTypes.every((t) => SchemaAligner.#NUMERIC_TYPE.test(t));
        if (hasFloating && allNumeric) {
            return "number";
        }
        return "string"; // Default to string for mixed types
    }
    /**
     * Looks up the canonical name for a source column (case-insensitive).
     *
     * An exact match on a canonical name always wins over an alias match, so
     * the result does not depend on the order of entries in `column_aliases`.
     *
     * @param {string} colName Source column name.
     * @returns {string|null} The canonical name, or null when nothing matches
     *   (or `colName` is not a string).
     */
    getCanonicalName(colName) {
        if (typeof colName !== "string") {
            return null;
        }
        const aliasesByCanonical = this.config.column_aliases ?? {};
        const wanted = colName.toLowerCase();
        const canonicalNames = Object.keys(aliasesByCanonical);
        // Pass 1: direct match with any canonical name.
        const direct = canonicalNames.find((canonical) => canonical.toLowerCase() === wanted);
        if (direct !== undefined) {
            return direct;
        }
        // Pass 2: match against the aliases of each canonical name.
        for (const canonical of canonicalNames) {
            const aliases = aliasesByCanonical[canonical];
            if (Array.isArray(aliases) && aliases.some((a) => String(a).toLowerCase() === wanted)) {
                return canonical;
            }
        }
        return null;
    }
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import * as crypto from "crypto";
|
|
2
|
+
/**
 * Stateful duplicate detector. Every record that is checked and not reported
 * as a duplicate is remembered, so later calls are compared against it.
 */
export class Deduplicator {
    config;
    seenHashes = new Set();
    seenTexts = new Map(); // column -> array of token Sets from kept records
    /**
     * @param {object} config Fusion configuration; this class reads
     *   `dedupe_config.exact`, `dedupe_config.fuzzy`,
     *   `dedupe_config.fuzzy_columns` and `dedupe_config.fuzzy_threshold`.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Checks if a record is an exact duplicate based on all columns.
     *
     * The record is hashed from a key-sorted JSON form, so two records with
     * the same content but different property order count as duplicates.
     * Array element order is still significant.
     *
     * @param {object} record
     * @returns {boolean} true when an identical record was seen before;
     *   always false when exact deduplication is disabled.
     */
    isExactDuplicate(record) {
        if (!this.config.dedupe_config.exact)
            return false;
        const canonicalJson = JSON.stringify(Deduplicator.#canonicalize(record));
        // MD5 is used purely as a compact fingerprint here, not for security.
        const hash = crypto.createHash("md5").update(canonicalJson).digest("hex");
        if (this.seenHashes.has(hash)) {
            return true;
        }
        this.seenHashes.add(hash);
        return false;
    }
    /**
     * Checks if a record is a fuzzy duplicate based on configured columns.
     *
     * A record is a fuzzy duplicate when, for any configured column, its token
     * set reaches `fuzzy_threshold` Jaccard similarity with a previously kept
     * record. Tokens are remembered only when the record is NOT a duplicate,
     * so discarded records never influence later comparisons.
     *
     * Comparison is linear in the number of kept records per column
     * (quadratic overall); a structure such as LSH would be needed at scale.
     *
     * @param {object} record
     * @returns {boolean}
     */
    isFuzzyDuplicate(record) {
        const { fuzzy, fuzzy_columns: columns } = this.config.dedupe_config;
        if (!fuzzy || !columns?.length)
            return false;
        const pending = [];
        for (const col of columns) {
            const text = String(record[col] || "");
            if (text.length < 10)
                continue; // Skip short/empty tags
            const tokens = this.tokenize(text);
            if (tokens.size === 0)
                continue; // Nothing comparable survived tokenization
            if (this.#matchesSeen(col, tokens)) {
                return true;
            }
            pending.push([col, tokens]);
        }
        for (const [col, tokens] of pending) {
            this.#remember(col, tokens);
        }
        return false;
    }
    /**
     * Splits text into a set of lower-cased, whitespace-separated tokens,
     * dropping tokens of two characters or fewer.
     *
     * @param {string} text
     * @returns {Set<string>}
     */
    tokenize(text) {
        return new Set(text.toLowerCase().split(/\s+/).filter((t) => t.length > 2));
    }
    /**
     * Reports whether `tokens` is similar to any token set already stored for
     * `column`; when it is not, the tokens are stored for future comparisons.
     *
     * @param {string} column
     * @param {Iterable<string>} tokens
     * @returns {boolean}
     */
    isSimilar(column, tokens) {
        const tokenSet = tokens instanceof Set ? tokens : new Set(tokens);
        if (this.#matchesSeen(column, tokenSet)) {
            return true;
        }
        this.#remember(column, tokenSet);
        return false;
    }
    /** Read-only similarity check against the token sets stored for a column. */
    #matchesSeen(column, tokens) {
        const threshold = this.config.dedupe_config.fuzzy_threshold;
        const stored = this.seenTexts.get(column) ?? [];
        return stored.some((existing) => Deduplicator.#jaccard(tokens, existing) >= threshold);
    }
    /** Stores a snapshot of the token set for a column. */
    #remember(column, tokens) {
        if (!this.seenTexts.has(column)) {
            this.seenTexts.set(column, []);
        }
        // Copy so that later mutation by the caller cannot alter stored state.
        this.seenTexts.get(column).push(new Set(tokens));
    }
    /** Jaccard similarity |A ∩ B| / |A ∪ B| of two Sets (0 when both are empty). */
    static #jaccard(a, b) {
        let shared = 0;
        for (const token of a) {
            if (b.has(token))
                shared++;
        }
        const unionSize = a.size + b.size - shared;
        return unionSize === 0 ? 0 : shared / unionSize;
    }
    /** Recursively rebuilds plain objects with sorted keys so JSON output is order-independent. */
    static #canonicalize(value) {
        if (Array.isArray(value)) {
            return value.map((item) => Deduplicator.#canonicalize(item));
        }
        // Primitives, null and objects with their own JSON form (e.g. Date) pass through.
        if (value === null || typeof value !== "object" || typeof value.toJSON === "function") {
            return value;
        }
        return Object.fromEntries(Object.keys(value)
            .sort()
            .map((key) => [key, Deduplicator.#canonicalize(value[key])]));
    }
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
 * Normalizes label values across datasets and reports label imbalance.
 */
export class LabelHarmonizer {
    config;
    /**
     * @param {object} config Fusion configuration; this class reads the
     *   optional `label_map` (raw label -> canonical label) and the optional
     *   `multi_to_binary` ({ positive_classes, positive_label, negative_label }).
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Maps a raw label value to a unified canonical value.
     *
     * @param {*} label Raw label value.
     * @returns {*} The mapped label; when `multi_to_binary` is configured, the
     *   positive or negative label instead.
     */
    harmonize(label) {
        const { label_map: labelMap, multi_to_binary: binary } = this.config;
        // 1. apply explicit mapping. Only own keys count: an `in` check would
        //    also match inherited names such as "toString" or "constructor".
        const mapped = labelMap && Object.hasOwn(labelMap, label) ? labelMap[label] : label;
        // 2. apply multi-class to binary conversion
        if (!binary) {
            return mapped;
        }
        const { positive_classes, positive_label, negative_label } = binary;
        // Comparison is done on lower-cased string forms, so 1 and "1" match.
        const needle = String(mapped).toLowerCase();
        const isPositive = (positive_classes ?? []).some((pc) => String(pc).toLowerCase() === needle);
        return isPositive ? positive_label : negative_label;
    }
    /**
     * Checks if a label distribution is balanced enough (placeholder for quality scoring).
     *
     * @param {Object<string, number>} distribution Label -> row count.
     * @param {number} [minShare=0.05] Share of the total below which a class is
     *   reported as an extreme minority.
     * @returns {string[]} One warning per class whose share is below
     *   `minShare`; empty when the total count is 0.
     */
    checkBalance(distribution, minShare = 0.05) {
        const entries = Object.entries(distribution);
        const total = entries.reduce((sum, [, count]) => sum + count, 0);
        if (total === 0)
            return [];
        return entries
            .map(([label, count]) => [label, count / total])
            .filter(([, pct]) => pct < minShare)
            .map(([label, pct]) => `Extreme minority class detected: "${label}" (${(pct * 100).toFixed(1)}%)`);
    }
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { SchemaAligner } from "./aligner.js";
|
|
2
|
+
import { LabelHarmonizer } from "./harmonizer.js";
|
|
3
|
+
import { Deduplicator } from "./deduplicator.js";
|
|
4
|
+
/**
 * Runs the fusion pipeline over a flattened list of records: column alignment,
 * label harmonization and deduplication, while collecting statistics.
 */
export class FusionOrchestrator {
    config;
    aligner;
    harmonizer;
    deduplicator;
    /**
     * @param {object} config Fusion configuration, shared with the aligner,
     *   harmonizer and deduplicator. This class itself reads `target_column`.
     */
    constructor(config) {
        this.config = config;
        this.aligner = new SchemaAligner(config);
        this.harmonizer = new LabelHarmonizer(config);
        this.deduplicator = new Deduplicator(config);
    }
    /**
     * Fuses multiple datasets into one.
     *
     * The fused records themselves are not returned or written by this
     * method; `output_path` in the result is a placeholder. The deduplicator
     * lives on the instance, so its state carries over between calls.
     *
     * @param datasets Metadata of datasets to fuse
     * @param dataRecords Actual records from all datasets (flattened), each
     *   shaped as `{ datasetId, record }`
     * @returns {Promise<{success: boolean, output_path: string, stats: object, warnings: string[]}>}
     */
    async fuse(datasets, dataRecords) {
        const targetColumn = this.config.target_column;
        const stats = {
            total_input_rows: dataRecords.length,
            total_output_rows: 0,
            duplicates_removed: 0,
            fuzzy_duplicates_removed: 0,
            schema_overlaps: [],
            label_distribution: {}
        };
        const warnings = [];
        const alignmentMap = this.aligner.align(datasets);
        // Every canonical column that appears in at least one dataset.
        const canonicalColumns = new Set();
        for (const columnMap of Object.values(alignmentMap)) {
            for (const canonical of Object.values(columnMap)) {
                canonicalColumns.add(canonical);
            }
        }
        stats.schema_overlaps = [...canonicalColumns];
        // Datasets already reported as unaligned, so each is warned about once
        // instead of once per record.
        const unalignedDatasets = new Set();
        for (const { datasetId, record } of dataRecords) {
            // Own-property lookups only: ids or column names such as
            // "constructor" must not resolve to Object.prototype members.
            const colMap = Object.hasOwn(alignmentMap, datasetId) ? alignmentMap[datasetId] : undefined;
            if (!colMap) {
                if (!unalignedDatasets.has(datasetId)) {
                    unalignedDatasets.add(datasetId);
                    warnings.push(`No alignment found for dataset ${datasetId}`);
                }
                continue;
            }
            // 1. Align columns (columns without a canonical name are dropped)
            const alignedRecord = {};
            for (const [sourceCol, val] of Object.entries(record)) {
                const canonical = Object.hasOwn(colMap, sourceCol) ? colMap[sourceCol] : undefined;
                if (canonical) {
                    alignedRecord[canonical] = val;
                }
            }
            // 2. Harmonize label
            const hasLabel = Boolean(targetColumn) && alignedRecord[targetColumn] !== undefined;
            if (hasLabel) {
                alignedRecord[targetColumn] = this.harmonizer.harmonize(alignedRecord[targetColumn]);
            }
            // 3. Deduplicate
            if (this.deduplicator.isExactDuplicate(alignedRecord)) {
                stats.duplicates_removed++;
                continue;
            }
            if (this.deduplicator.isFuzzyDuplicate(alignedRecord)) {
                stats.fuzzy_duplicates_removed++;
                continue;
            }
            // Update distribution ONLY for kept records. The label is
            // re-checked because harmonization may have produced undefined.
            if (targetColumn && alignedRecord[targetColumn] !== undefined) {
                const labelStr = String(alignedRecord[targetColumn]);
                stats.label_distribution[labelStr] = (stats.label_distribution[labelStr] || 0) + 1;
            }
            stats.total_output_rows++;
        }
        // Add balance warnings
        warnings.push(...this.harmonizer.checkBalance(stats.label_distribution));
        return {
            success: true,
            output_path: "fused_dataset.json", // Placeholder
            stats,
            warnings
        };
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|