@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "uci_adapter.py");
|
|
4
|
+
export class UCIScraper {
    /**
     * Search the UCI ML repository via the Python adapter script.
     *
     * Spawns `python src/python/uci_adapter.py --action search ...` and parses
     * its stdout as JSON. Every failure mode (spawn failure, non-zero exit,
     * adapter-reported error, malformed JSON) degrades to an empty result list
     * so callers never have to catch.
     *
     * @param {string} query Free-text search query passed to the adapter.
     * @param {number} [limit=10] Maximum number of results to request.
     * @returns {Promise<Array>} Parsed search results, or [] on any failure.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // BUGFIX: without an "error" listener, a failed spawn (e.g. python
            // not on PATH) emits an unhandled 'error' event and crashes the
            // whole process instead of failing gracefully like other paths.
            pythonProcess.on("error", (err) => {
                console.error(`[UCIScraper] Failed to spawn python: ${err.message}`);
                resolve([]);
            });
            pythonProcess.on("close", (code) => {
                if (code !== 0) {
                    // It's possible for python to emit stderr warnings but still succeed
                    // But exit code != 0 is definitely an error
                    console.error(`[UCIScraper] Process exited with code ${code}: ${errorOutput}`);
                    resolve([]); // Fail gracefully by returning empty
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[UCIScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[UCIScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
 * In-memory stand-in for an error-tracking backend (e.g. Sentry).
 * Captured exceptions and messages are echoed to the console and retained
 * on the instance for later inspection in tests and demos.
 */
export class MockErrorTracker {
    exceptions = [];
    messages = [];

    /** Record an exception together with optional context metadata. */
    captureException(error, context) {
        console.log(`[ErrorTracker] Exception captured: ${error.message}`);
        const entry = { error, context };
        this.exceptions.push(entry);
    }

    /** Record a free-form message at the given severity level. */
    captureMessage(message, level = "info") {
        console.log(`[ErrorTracker] Message captured (${level}): ${message}`);
        const entry = { message, level };
        this.messages.push(entry);
    }
}
|
|
13
|
+
/**
 * Lightweight per-job-type observability: success/failure counters plus a
 * duration aggregate (count/sum/min/max), exportable as Prometheus text.
 */
export class ObservabilityService {
    errorTracker;
    jobSuccessCounter = new Map();
    jobFailureCounter = new Map();
    jobDurationHistogram = new Map();

    constructor(errorTracker = new MockErrorTracker()) {
        this.errorTracker = errorTracker;
    }

    /** Count a successful job of `type` and fold its duration (ms) into the aggregate. */
    recordJobSuccess(type, durationMs) {
        const previous = this.jobSuccessCounter.get(type) || 0;
        this.jobSuccessCounter.set(type, previous + 1);
        this.updateMetricRecord(type, durationMs);
    }

    /** Count a failed job of `type` and forward the error to the tracker. */
    recordJobFailure(type, error) {
        const previous = this.jobFailureCounter.get(type) || 0;
        this.jobFailureCounter.set(type, previous + 1);
        this.errorTracker.captureException(error, { jobType: type });
    }

    /** Render all counters and duration aggregates in Prometheus exposition format. */
    getPrometheusMetrics() {
        const lines = [
            "# HELP jobs_processed_total Total number of jobs successfully processed",
            "# TYPE jobs_processed_total counter",
        ];
        for (const [type, count] of this.jobSuccessCounter) {
            lines.push(`jobs_processed_total{type="${type}"} ${count}`);
        }
        lines.push("");
        lines.push("# HELP jobs_failed_total Total number of failed jobs");
        lines.push("# TYPE jobs_failed_total counter");
        for (const [type, count] of this.jobFailureCounter) {
            lines.push(`jobs_failed_total{type="${type}"} ${count}`);
        }
        lines.push("");
        lines.push("# HELP job_duration_seconds_sum Latency of job processing in seconds");
        lines.push("# TYPE job_duration_seconds_sum counter");
        for (const [type, record] of this.jobDurationHistogram) {
            // Durations are stored in ms; Prometheus convention is seconds.
            lines.push(`job_duration_seconds_sum{type="${type}"} ${record.sum / 1000}`);
            lines.push(`job_duration_seconds_count{type="${type}"} ${record.count}`);
            lines.push(`job_duration_seconds_max{type="${type}"} ${record.max / 1000}`);
        }
        return lines.join("\n") + "\n";
    }

    /** Human-readable per-type summary (only types with at least one success appear). */
    getStats() {
        const stats = {};
        for (const [type, record] of this.jobDurationHistogram) {
            const average = record.sum / record.count;
            stats[type] = {
                successCount: this.jobSuccessCounter.get(type) || 0,
                failureCount: this.jobFailureCounter.get(type) || 0,
                avgDuration: `${average.toFixed(2)}ms`,
                maxDuration: `${record.max}ms`
            };
        }
        return stats;
    }

    /** Fold one duration sample into the per-type aggregate record. */
    updateMetricRecord(type, value) {
        const existing = this.jobDurationHistogram.get(type);
        const record = existing ?? { count: 0, sum: 0, min: value, max: value };
        if (!existing) {
            this.jobDurationHistogram.set(type, record);
        }
        record.count += 1;
        record.sum += value;
        record.min = Math.min(record.min, value);
        record.max = Math.max(record.max, value);
    }
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
export class QualityAnalyzer {
    cache;
    pythonPath = "python"; // Assumes python is in PATH
    scriptPath;

    /**
     * @param cache Optional report cache exposing getReport(id)/saveReport(id, report).
     * @param projectRoot Root used to locate src/python/quality_engine.py.
     */
    constructor(cache, projectRoot = process.cwd()) {
        this.cache = cache;
        this.scriptPath = path.join(projectRoot, "src", "python", "quality_engine.py");
    }

    /**
     * Run quality analysis on a local file (CSV/Parquet/JSON).
     *
     * Checks the cache first; otherwise spawns the Python quality engine and
     * parses its stdout as JSON, then stores the report back into the cache.
     *
     * @param filePath Path to the data file to analyze.
     * @param datasetId Used for caching (cache skipped when absent).
     * @returns The parsed quality report.
     * @throws Error when the engine fails to start, exits non-zero, reports an
     *         internal error, or emits unparsable output.
     */
    async analyze(filePath, datasetId) {
        if (this.cache && datasetId) {
            const cached = await this.cache.getReport(datasetId);
            if (cached) {
                console.log(`[QualityAnalyzer] Cache hit for ${datasetId}`);
                return cached;
            }
        }
        const report = await new Promise((resolve, reject) => {
            // Renamed from `process` to `child` to avoid shadowing the Node
            // global `process` inside this callback.
            const child = spawn(this.pythonPath, [this.scriptPath, filePath]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUGFIX: without an "error" listener a failed spawn (python not
            // installed) raises an unhandled 'error' event and this promise
            // never settles.
            child.on("error", (err) => {
                reject(new Error(`Quality Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Quality Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const report = JSON.parse(stdout);
                    if (report.error) {
                        reject(new Error(report.error));
                    }
                    else {
                        resolve(report);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse analyzer output: ${stdout}`));
                }
            });
        });
        if (this.cache && datasetId) {
            await this.cache.saveReport(datasetId, report);
        }
        return report;
    }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
export class ImageAnalyzer {
    pythonPath = "python";
    scriptPath;

    /** @param projectRoot Root used to locate src/python/image_engine.py. */
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "image_engine.py");
    }

    /**
     * Analyze image quality for a single file or a directory.
     *
     * Spawns the Python image engine and parses its stdout as JSON.
     *
     * @param inputPath File or directory to analyze.
     * @returns Parsed analysis result object.
     * @throws Error on spawn failure, non-zero exit, engine-reported error,
     *         or unparsable output.
     */
    async analyze(inputPath) {
        return new Promise((resolve, reject) => {
            // Renamed from `process` to `child` to avoid shadowing the Node
            // global `process` inside this callback.
            const child = spawn(this.pythonPath, [
                this.scriptPath,
                inputPath
            ]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUGFIX: without an "error" listener a failed spawn raises an
            // unhandled 'error' event and this promise never settles.
            child.on("error", (err) => {
                reject(new Error(`Image Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Image Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse image analyzer output: ${stdout}`));
                }
            });
        });
    }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
export class MediaAnalyzer {
    pythonPath = "python";
    scriptPath;

    /** @param projectRoot Root used to locate src/python/media_engine.py. */
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "media_engine.py");
    }

    /**
     * Analyze audio/video quality for a single file or a directory.
     *
     * Spawns the Python media engine and parses its stdout as JSON.
     *
     * @param inputPath File or directory to analyze.
     * @returns Parsed analysis result object.
     * @throws Error on spawn failure, non-zero exit, engine-reported error,
     *         or unparsable output.
     */
    async analyze(inputPath) {
        return new Promise((resolve, reject) => {
            // Renamed from `process` to `child` to avoid shadowing the Node
            // global `process` inside this callback.
            const child = spawn(this.pythonPath, [
                this.scriptPath,
                inputPath
            ]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUGFIX: without an "error" listener a failed spawn raises an
            // unhandled 'error' event and this promise never settles.
            child.on("error", (err) => {
                reject(new Error(`Media Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Media Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse media analyzer output: ${stdout}`));
                }
            });
        });
    }
}
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import { ImageAnalyzer } from "./image-analyzer.js";
|
|
3
|
+
import { MediaAnalyzer } from "./media-analyzer.js";
|
|
4
|
+
/**
 * Orchestrates multi-modal dataset quality analysis: detects which
 * modalities (text/image/audio/video) are present in a dataset directory,
 * runs the matching analyzers, and merges everything into a single report
 * with an aggregate 0-100 score and actionable recommendations.
 */
export class QualityOrchestrator {
    // Python-backed analyzers; each spawns a subprocess per analyze() call.
    imageAnalyzer;
    mediaAnalyzer;
    constructor(projectRoot = process.cwd()) {
        this.imageAnalyzer = new ImageAnalyzer(projectRoot);
        this.mediaAnalyzer = new MediaAnalyzer(projectRoot);
    }
    /**
     * Detect modalities present in a dataset directory
     *
     * Classification is by file extension only, and only the directory's
     * top level is scanned (readdirSync is not recursive — files in
     * subdirectories are not seen). Returns [] for a missing path.
     *
     * @param {string} datasetPath Directory to inspect.
     * @returns {string[]} Subset of ["text", "image", "audio", "video"].
     */
    detectModalities(datasetPath) {
        const modalities = [];
        if (!fs.existsSync(datasetPath)) {
            return modalities;
        }
        const files = fs.readdirSync(datasetPath);
        // Check for text/tabular data
        const hasText = files.some(f => /\.(csv|json|parquet|txt)$/i.test(f));
        if (hasText)
            modalities.push("text");
        // Check for images
        const hasImages = files.some(f => /\.(jpg|jpeg|png|bmp|webp)$/i.test(f));
        if (hasImages)
            modalities.push("image");
        // Check for audio
        const hasAudio = files.some(f => /\.(wav|mp3|flac|ogg|m4a)$/i.test(f));
        if (hasAudio)
            modalities.push("audio");
        // Check for video
        const hasVideo = files.some(f => /\.(mp4|avi|mkv|mov|wmv)$/i.test(f));
        if (hasVideo)
            modalities.push("video");
        return modalities;
    }
    /**
     * Generate a unified quality report for a dataset
     *
     * Each modality contributes one 0-100 score; the overall score is the
     * unweighted mean of the contributing scores. Analyzer failures are
     * logged and skipped (that modality simply doesn't contribute), so this
     * method itself only rejects on programming errors.
     *
     * @param {string} datasetId Identifier copied into the report.
     * @param {string} datasetPath Directory to analyze.
     * @param {object} [textQuality] Pre-computed tabular metrics
     *        (row_count, column_count, missing_percentage, duplicate_percentage).
     * @returns {Promise<object>} The assembled report.
     */
    async generateReport(datasetId, datasetPath, textQuality) {
        const modalities = this.detectModalities(datasetPath);
        const report = {
            dataset_id: datasetId,
            modalities,
            overall_quality_score: 0,
            recommendations: [],
            generated_at: new Date().toISOString()
        };
        let totalScore = 0;
        let scoreCount = 0;
        // Text quality (if provided from existing analysis)
        if (textQuality) {
            report.text_quality = {
                row_count: textQuality.row_count || 0,
                column_count: textQuality.column_count || 0,
                missing_percentage: textQuality.missing_percentage || 0,
                duplicate_percentage: textQuality.duplicate_percentage || 0
            };
            // Calculate text quality score (0-100): missing data is penalized
            // twice as heavily as duplicates, floored at 0.
            const textScore = Math.max(0, 100 - (report.text_quality.missing_percentage * 2) - (report.text_quality.duplicate_percentage));
            totalScore += textScore;
            scoreCount++;
            if (report.text_quality.missing_percentage > 20) {
                report.recommendations.push("High missing data detected. Consider imputation or removal.");
            }
            if (report.text_quality.duplicate_percentage > 10) {
                report.recommendations.push("Significant duplicates found. Run deduplication.");
            }
        }
        // Image quality
        if (modalities.includes("image")) {
            try {
                const imageReport = await this.imageAnalyzer.analyze(datasetPath);
                // NOTE(review): divisions below assume total_images > 0; the
                // analyzer presumably guarantees this when images were
                // detected — confirm, otherwise these become NaN.
                report.image_quality = {
                    total_images: imageReport.total_images,
                    corrupted_count: imageReport.corrupted_count,
                    avg_resolution: `${Math.round(imageReport.average_width)}x${Math.round(imageReport.average_height)}`,
                    blurry_percentage: (imageReport.blurry_count / imageReport.total_images) * 100
                };
                // Calculate image quality score: corruption can cost up to 50
                // points, blur up to 30 (0.3 per blurry percentage point).
                const corruptionPenalty = (imageReport.corrupted_count / imageReport.total_images) * 50;
                const blurPenalty = report.image_quality.blurry_percentage * 0.3;
                const imageScore = Math.max(0, 100 - corruptionPenalty - blurPenalty);
                totalScore += imageScore;
                scoreCount++;
                if (report.image_quality.corrupted_count > 0) {
                    report.recommendations.push(`Remove ${imageReport.corrupted_count} corrupted images.`);
                }
                if (report.image_quality.blurry_percentage > 15) {
                    report.recommendations.push("High blur detected. Consider filtering blurry images.");
                }
            }
            catch (e) {
                // Best-effort: a failed modality is logged and skipped.
                console.error("Image analysis failed:", e);
            }
        }
        // Audio quality
        if (modalities.includes("audio")) {
            try {
                const audioReport = await this.mediaAnalyzer.analyze(datasetPath);
                // The media engine returns a combined report; the
                // avg_audio_duration key distinguishes the audio shape.
                if ('avg_audio_duration' in audioReport) {
                    const silentFiles = audioReport.details.filter(d => d.status === "ok" && 'is_silent' in d && d.is_silent).length;
                    // Mean sample rate over successfully-analyzed files.
                    const avgSampleRate = audioReport.details
                        .filter(d => d.status === "ok" && 'sample_rate' in d)
                        .reduce((sum, d) => sum + (('sample_rate' in d) ? (d.sample_rate || 0) : 0), 0) / audioReport.ok_files;
                    report.audio_quality = {
                        total_files: audioReport.total_files,
                        avg_duration: audioReport.avg_audio_duration || 0,
                        avg_sample_rate: avgSampleRate,
                        silent_percentage: (silentFiles / audioReport.total_files) * 100
                    };
                    // Calculate audio quality score: failures cost up to 50
                    // points, silence 0.5 per percentage point.
                    const failurePenalty = (audioReport.failed_files / audioReport.total_files) * 50;
                    const silentPenalty = report.audio_quality.silent_percentage * 0.5;
                    const audioScore = Math.max(0, 100 - failurePenalty - silentPenalty);
                    totalScore += audioScore;
                    scoreCount++;
                    if (report.audio_quality.silent_percentage > 10) {
                        report.recommendations.push("High percentage of silent audio files detected.");
                    }
                }
            }
            catch (e) {
                console.error("Audio analysis failed:", e);
            }
        }
        // Video quality
        if (modalities.includes("video")) {
            try {
                // NOTE(review): when a dataset has both audio and video this
                // re-spawns the same media analyzer on the same path — a
                // second subprocess run; caching could avoid it.
                const videoReport = await this.mediaAnalyzer.analyze(datasetPath);
                if ('avg_video_duration' in videoReport) {
                    const highRiskFiles = videoReport.details.filter(d => d.status === "ok" && d.corruption_risk === "high").length;
                    report.video_quality = {
                        total_files: videoReport.total_files,
                        avg_duration: videoReport.avg_video_duration || 0,
                        avg_fps: videoReport.avg_fps || 0,
                        corruption_risk_high: highRiskFiles
                    };
                    // Calculate video quality score: failures cost up to 50
                    // points, high corruption risk up to 30.
                    const failurePenalty = (videoReport.failed_files / videoReport.total_files) * 50;
                    const corruptionPenalty = (highRiskFiles / videoReport.total_files) * 30;
                    const videoScore = Math.max(0, 100 - failurePenalty - corruptionPenalty);
                    totalScore += videoScore;
                    scoreCount++;
                    if (highRiskFiles > 0) {
                        report.recommendations.push(`${highRiskFiles} video files have high corruption risk.`);
                    }
                }
            }
            catch (e) {
                console.error("Video analysis failed:", e);
            }
        }
        // Calculate overall quality score: unweighted mean of modality scores.
        report.overall_quality_score = scoreCount > 0 ? Math.round(totalScore / scoreCount) : 0;
        if (report.recommendations.length === 0) {
            report.recommendations.push("Dataset quality is good. No major issues detected.");
        }
        return report;
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { Embedder } from "../search/embedder.js";
|
|
2
|
+
import { VectorStore } from "../search/vector-store.js";
|
|
3
|
+
import { MetadataStore } from "../metadata/store.js";
|
|
4
|
+
import path from "path";
|
|
5
|
+
/**
 * Incrementally build the vector search index: embed every dataset from the
 * metadata store that is not already present in the vector store, in batches,
 * saving periodically so progress survives a crash.
 * All progress logging goes to stderr (console.error); presumably stdout is
 * reserved for protocol/pipe output — TODO confirm.
 */
async function main() {
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
    const metadataStore = new MetadataStore(dbPath);
    const vectorStore = new VectorStore(vectorPath);
    const embedder = Embedder.getInstance();
    const datasets = metadataStore.getAllDatasets();
    const indexedIds = new Set(vectorStore.getAllIds());
    // Filter to only new datasets
    const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
    console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
    const BATCH_SIZE = 50;
    let processed = 0;
    for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
        const batch = toIndex.slice(i, i + BATCH_SIZE);
        try {
            // Prepare texts for batch embedding: name + description + task,
            // languages and tags, truncated to 1500 chars per dataset.
            const texts = batch.map(ds => [
                ds.name,
                ds.description,
                `Task: ${ds.task}`,
                `Languages: ${ds.languages?.join(", ") || ""}`,
                `Tags: ${ds.tags?.join(" ") || ""}`
            ].join(" ").slice(0, 1500));
            // Embed batch (Xenova supports array input)
            // Note: Parallelizing at the embed level is better for CPU utilization
            // A single failed embed only skips that dataset (inner catch);
            // the Promise.all itself never rejects.
            await Promise.all(batch.map(async (ds, idx) => {
                try {
                    const vector = await embedder.embed(texts[idx]);
                    vectorStore.add(ds.id, vector);
                }
                catch (err) {
                    console.error(`Failed to index ${ds.id}:`, err);
                }
            }));
            processed += batch.length;
            // Checkpoint every 100 datasets and on the final batch.
            if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
                console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
                vectorStore.save();
            }
        }
        catch (err) {
            // Keep going: one bad batch must not abort the whole indexing run.
            console.error(`Batch around ${i} failed:`, err);
        }
    }
    // Final save is unconditional in case the last checkpoint was skipped.
    vectorStore.save();
    console.error("Vector indexing complete.");
    metadataStore.close();
}
main().catch(console.error);
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
/**
 * CLI diagnostic: print summary statistics for data/metadata.db — dataset
 * count, aggregate stats, and the top-5 datasets by downloads. Detects
 * whether the DB uses the extended schema (is_safe_source column) and shows
 * extra stats when available. Exits 1 when the DB is missing or unreadable.
 */
async function main() {
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    if (!fs.existsSync(dbPath)) {
        console.error("Database not found. Run 'npm run scrape' first.");
        process.exit(1);
    }
    const db = new Database(dbPath);
    try {
        // Get total count
        const count = db.prepare("SELECT COUNT(*) as count FROM datasets").get();
        console.log(`\nTotal datasets in database: ${count.count}\n`);
        // Check which columns exist — is_safe_source marks the new schema.
        const tableInfo = db.prepare("PRAGMA table_info(datasets)").all();
        const columns = tableInfo.map(col => col.name);
        const hasNewColumns = columns.includes("is_safe_source");
        // Get basic statistics (works with both old and new schema)
        const stats = db.prepare(`
            SELECT
                COUNT(*) as total,
                SUM(downloads) as total_downloads,
                AVG(quality_score) as avg_quality,
                SUM(CASE WHEN license_category = 'safe' THEN 1 ELSE 0 END) as safe_licenses,
                SUM(CASE WHEN has_train_split = 1 THEN 1 ELSE 0 END) as with_train_split
            FROM datasets
        `).get();
        console.log("Statistics:");
        console.log(`  Total downloads: ${stats.total_downloads?.toLocaleString() || 0}`);
        console.log(`  Average quality score: ${Math.round(stats.avg_quality || 0)}`);
        console.log(`  Safe licenses: ${stats.safe_licenses || 0}`);
        console.log(`  With train split: ${stats.with_train_split || 0}`);
        // Show extended stats if new schema is available
        if (hasNewColumns) {
            const extendedStats = db.prepare(`
                SELECT
                    SUM(CASE WHEN is_safe_source = 1 THEN 1 ELSE 0 END) as safe_sources,
                    SUM(CASE WHEN is_structured = 1 THEN 1 ELSE 0 END) as structured,
                    SUM(total_examples) as total_examples
                FROM datasets
            `).get();
            console.log(`  Safe sources: ${extendedStats.safe_sources || 0}`);
            console.log(`  Structured datasets: ${extendedStats.structured || 0}`);
            console.log(`  Total examples: ${extendedStats.total_examples?.toLocaleString() || 0}`);
        }
        else {
            console.log(`  WARNING: Database uses old schema. Re-scrape to get extended statistics.`);
        }
        console.log();
        // Top 5 by downloads
        const top5 = db.prepare(`
            SELECT id, name, downloads, quality_score, license_category
            FROM datasets
            ORDER BY downloads DESC
            LIMIT 5
        `).all();
        console.log("Top 5 datasets by downloads:");
        top5.forEach((ds, i) => {
            console.log(`  ${i + 1}. ${ds.id}`);
            // NOTE(review): ds.downloads.toLocaleString() throws if downloads
            // is NULL for a row — presumably the scraper always sets it; confirm.
            console.log(`     Downloads: ${ds.downloads.toLocaleString()}, Quality: ${ds.quality_score}, License: ${ds.license_category}`);
        });
        console.log();
    }
    catch (error) {
        console.error("Error reading database:", error);
        process.exit(1);
    }
    finally {
        // Always release the SQLite handle, even on error paths.
        db.close();
    }
}
main();
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import path from "path";
|
|
3
|
+
// Checking all plausible databases for jobs
// Diagnostic script: scan candidate SQLite files under data/ for a `jobs`
// table and print the 20 most recent jobs that mention "naruto" anywhere
// in their serialized form. Missing DBs/tables are skipped silently.
const dbs = ["metadata.db", "vesper.db", "datasets.db"];
for (const dbName of dbs) {
    const dbPath = path.resolve("data", dbName);
    try {
        // BUGFIX: better-sqlite3 creates the database file by default when it
        // does not exist; fileMustExist keeps this read-only probe from
        // littering empty .db files under data/.
        const db = new Database(dbPath, { fileMustExist: true });
        try {
            const tables = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'").all();
            if (tables.length > 0) {
                console.log(`\n--- Checking jobs in ${dbName} ---`);
                const jobs = db.prepare("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 20").all();
                for (const job of jobs) {
                    // Case-insensitive substring match over the whole row.
                    if (JSON.stringify(job).toLowerCase().includes("naruto")) {
                        console.log(JSON.stringify(job, null, 2));
                    }
                }
            }
        }
        finally {
            // BUGFIX: close in finally so a failed query no longer leaks the handle.
            db.close();
        }
    }
    catch (e) {
        // Silently skip if DB doesn't exist or table missing
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import path from "path";
|
|
3
|
+
// Trying metadata.db which is larger
// Diagnostic script: LIKE-search data/metadata.db for "naruto" in dataset
// names/descriptions and dump the matching rows as JSON.
const dbPath = path.resolve("data", "metadata.db");
let db;
try {
    // BUGFIX: better-sqlite3 creates the database file by default when it
    // does not exist; fileMustExist makes a missing DB an explicit error
    // instead of silently creating an empty data/metadata.db.
    db = new Database(dbPath, { fileMustExist: true });
    const query = "naruto";
    const results = db.prepare("SELECT * FROM datasets WHERE name LIKE ? OR description LIKE ?").all(`%${query}%`, `%${query}%`);
    console.log(`Found ${results.length} results for "${query}" in metadata.db:`);
    console.log(JSON.stringify(results, null, 2));
}
catch (e) {
    console.error("Error checking database:", e.message);
}
finally {
    // Guarded: db stays undefined when the open itself throws.
    db?.close();
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { PipelineExecutor } from "../cleaning/executor.js";
|
|
2
|
+
import { ScriptGenerator } from "../cleaning/exporter.js";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
import path from "path";
|
|
5
|
+
/**
 * End-to-end demo of the cleaning engine: writes a deliberately dirty CSV,
 * runs the auto-cleaning pipeline on it, prints the inspection report, the
 * generated plan, and the execution result, then emits a standalone Python
 * script that reproduces the cleaning steps.
 * Side effects: creates vesper_demo_data.csv and demo_cleaning_script.py in
 * the current working directory (intentionally left behind for inspection).
 */
async function main() {
    console.log(" Vesper Dataset Ops Engine: Full Demo\n");
    const executor = new PipelineExecutor();
    const exporter = new ScriptGenerator();
    const demoFile = path.join(process.cwd(), "vesper_demo_data.csv");
    // 1. Create a Realistic Dirty Dataset
    // - duplicate: Duplicate Customer
    // - age: Mixed types ("25", "twenty"), Outliers (200)
    // - email: PII
    // - empty_col: 100% missing
    // - score: Good data
    const csvContent = `customer_id,age,email,score,empty_col
C001,25,john.doe@example.com,88.5,
C002,"30",jane.smith@work.org,92.0,
C003,200,bob.jones@gmail.com,15.0,
C001,25,john.doe@example.com,88.5,
C004,"forty",alice@co.uk,80.0,
C005,35,,75.0,`;
    fs.writeFileSync(demoFile, csvContent);
    console.log(`📦 Created dirty dataset: ${demoFile}`);
    console.log(`Contains: Duplicates, PII (Emails), Mixed Types (Age), Outliers, Empty Columns.\n`);
    try {
        // 2. Run the Auto-Cleaning Pipeline
        console.log(" Running Auto-Cleaning Pipeline...");
        const result = await executor.runPipeline("demo-dataset", demoFile);
        console.log("\n --- Quality Inspection Report ---");
        console.log(`  Duplicates: ${result.initial_quality.duplicate_rows} rows`);
        console.log(`  PII Warnings: ${result.initial_quality.pii_warnings?.length || 0}`);
        if (result.initial_quality.schema_warnings.length > 0) {
            console.log("  Schema Issues:");
            result.initial_quality.schema_warnings.forEach(w => console.log(`    ⚠️ ${w}`));
        }
        console.log("\n --- Generated Cleaning Plan ---");
        result.plan.operations.forEach((op, i) => {
            console.log(`  ${i + 1}. [${op.type}] Reason: ${op.reason}`);
        });
        console.log("\n --- Execution Result ---");
        if (result.cleaning_result.success) {
            console.log(`  Success! Cleaned file saved to:`);
            console.log(`  ${result.final_output_path}`);
        }
        else {
            console.error(`  Failed: ${result.cleaning_result.error}`);
        }
        // 3. Generate Reproducibility Script
        console.log("\n --- Reproducibility ---");
        const pythonScript = exporter.generatePythonScript(result.plan, demoFile);
        const scriptPath = path.join(process.cwd(), "demo_cleaning_script.py");
        fs.writeFileSync(scriptPath, pythonScript);
        console.log(`  Generated Python script: ${scriptPath}`);
        console.log(`  (You can run this script independently to reproduce these steps!)`);
        console.log("\n Demo Complete.");
    }
    catch (error) {
        // Demo is best-effort: report the failure, don't rethrow.
        console.error("\n Demo failed:", error);
    }
}
main().catch(console.error);
|