@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,49 @@
1
import { spawn } from "child_process";
import path from "path";

// Resolved relative to the current working directory; assumes the process is
// launched from the package root -- TODO confirm against the install layout.
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "uci_adapter.py");

export class UCIScraper {
    /**
     * Search the UCI repository using the Python adapter.
     *
     * Spawns `python uci_adapter.py --action search` and parses its JSON
     * stdout. All failure modes (missing python binary, non-zero exit,
     * adapter-reported error, malformed JSON) degrade gracefully to an empty
     * result list so callers never have to guard against rejection.
     *
     * @param {string} query Search terms forwarded to the adapter.
     * @param {number} [limit=10] Maximum number of results to request.
     * @returns {Promise<object[]>} Parsed dataset records, or [] on any failure.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // BUG FIX: without an "error" listener a missing python executable
            // raised an uncaught 'error' event and this promise never settled.
            pythonProcess.on("error", (err) => {
                console.error(`[UCIScraper] Failed to spawn python: ${err.message}`);
                resolve([]);
            });
            pythonProcess.on("close", (code) => {
                if (code !== 0) {
                    // It's possible for python to emit stderr warnings but still succeed
                    // But exit code != 0 is definitely an error
                    console.error(`[UCIScraper] Process exited with code ${code}: ${errorOutput}`);
                    resolve([]); // Fail gracefully by returning empty
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[UCIScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[UCIScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
@@ -0,0 +1,76 @@
1
/**
 * In-memory stand-in for an external error tracker (e.g. a Sentry client).
 * Records everything it is handed so callers and tests can inspect it later.
 */
export class MockErrorTracker {
    // Captured { error, context } pairs, in call order.
    exceptions = [];
    // Captured { message, level } pairs, in call order.
    messages = [];

    /** Record an exception together with arbitrary context metadata. */
    captureException(error, context) {
        const entry = { error, context };
        console.log(`[ErrorTracker] Exception captured: ${error.message}`);
        this.exceptions.push(entry);
    }

    /** Record a plain message at the given severity level. */
    captureMessage(message, level = "info") {
        const entry = { message, level };
        console.log(`[ErrorTracker] Message captured (${level}): ${message}`);
        this.messages.push(entry);
    }
}
13
/**
 * Lightweight in-memory metrics and error-tracking facade for background jobs.
 * Maintains per-job-type success/failure counters and duration aggregates and
 * can render them in Prometheus text exposition format.
 */
export class ObservabilityService {
    errorTracker;
    jobSuccessCounter = new Map();
    jobFailureCounter = new Map();
    jobDurationHistogram = new Map();

    constructor(errorTracker = new MockErrorTracker()) {
        this.errorTracker = errorTracker;
    }

    /** Count one successful job of `type` and fold its duration into the aggregates. */
    recordJobSuccess(type, durationMs) {
        const previous = this.jobSuccessCounter.get(type) ?? 0;
        this.jobSuccessCounter.set(type, previous + 1);
        this.updateMetricRecord(type, durationMs);
    }

    /** Count one failed job of `type` and forward the error to the tracker. */
    recordJobFailure(type, error) {
        const previous = this.jobFailureCounter.get(type) ?? 0;
        this.jobFailureCounter.set(type, previous + 1);
        this.errorTracker.captureException(error, { jobType: type });
    }

    /** Render all counters and duration aggregates in Prometheus text format. */
    getPrometheusMetrics() {
        const parts = [];
        parts.push("# HELP jobs_processed_total Total number of jobs successfully processed\n");
        parts.push("# TYPE jobs_processed_total counter\n");
        for (const [type, count] of this.jobSuccessCounter) {
            parts.push(`jobs_processed_total{type="${type}"} ${count}\n`);
        }
        parts.push("\n# HELP jobs_failed_total Total number of failed jobs\n");
        parts.push("# TYPE jobs_failed_total counter\n");
        for (const [type, count] of this.jobFailureCounter) {
            parts.push(`jobs_failed_total{type="${type}"} ${count}\n`);
        }
        parts.push("\n# HELP job_duration_seconds_sum Latency of job processing in seconds\n");
        parts.push("# TYPE job_duration_seconds_sum counter\n");
        for (const [type, record] of this.jobDurationHistogram) {
            // Durations are stored in ms; convert to seconds for exposition.
            parts.push(`job_duration_seconds_sum{type="${type}"} ${record.sum / 1000}\n`);
            parts.push(`job_duration_seconds_count{type="${type}"} ${record.count}\n`);
            parts.push(`job_duration_seconds_max{type="${type}"} ${record.max / 1000}\n`);
        }
        return parts.join("");
    }

    /**
     * Human-readable per-type summary. Note: only job types that have at least
     * one recorded success (and therefore a duration record) appear here.
     */
    getStats() {
        const stats = {};
        for (const [type, record] of this.jobDurationHistogram) {
            const avg = record.sum / record.count;
            stats[type] = {
                successCount: this.jobSuccessCounter.get(type) ?? 0,
                failureCount: this.jobFailureCounter.get(type) ?? 0,
                avgDuration: `${avg.toFixed(2)}ms`,
                maxDuration: `${record.max}ms`
            };
        }
        return stats;
    }

    /** Create or update the running duration aggregate for `type`. */
    updateMetricRecord(type, value) {
        const existing = this.jobDurationHistogram.get(type);
        if (existing) {
            existing.count += 1;
            existing.sum += value;
            existing.min = Math.min(existing.min, value);
            existing.max = Math.max(existing.max, value);
            return;
        }
        this.jobDurationHistogram.set(type, { count: 1, sum: value, min: value, max: value });
    }
}
@@ -0,0 +1,57 @@
1
import { spawn } from "child_process";
import path from "path";

/**
 * Bridges to the Python quality engine (src/python/quality_engine.py) to
 * compute a quality report for local data files, with optional report caching
 * keyed by dataset id.
 */
export class QualityAnalyzer {
    cache;
    pythonPath = "python"; // Assumes python is in PATH
    scriptPath;

    /**
     * @param cache Optional store exposing getReport/saveReport; may be undefined.
     * @param {string} [projectRoot=process.cwd()] Root containing src/python/.
     */
    constructor(cache, projectRoot = process.cwd()) {
        this.cache = cache;
        this.scriptPath = path.join(projectRoot, "src", "python", "quality_engine.py");
    }

    /**
     * Run quality analysis on a local file (CSV/Parquet/JSON)
     * @param {string} filePath File to analyze.
     * @param {string} [datasetId] Used for caching; cache is bypassed when omitted.
     * @returns {Promise<object>} Parsed JSON report from the Python engine.
     * @throws {Error} If python cannot be spawned, exits non-zero, reports an
     *   internal error, or prints unparseable output.
     */
    async analyze(filePath, datasetId) {
        if (this.cache && datasetId) {
            const cached = await this.cache.getReport(datasetId);
            if (cached) {
                console.log(`[QualityAnalyzer] Cache hit for ${datasetId}`);
                return cached;
            }
        }
        const report = await new Promise((resolve, reject) => {
            // Named `child` rather than `process` to avoid shadowing the global.
            const child = spawn(this.pythonPath, [this.scriptPath, filePath]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUG FIX: without this handler a missing python executable raised
            // an uncaught "error" event and the promise never settled.
            child.on("error", (err) => {
                reject(new Error(`Quality Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Quality Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const parsed = JSON.parse(stdout);
                    if (parsed.error) {
                        reject(new Error(parsed.error));
                    }
                    else {
                        resolve(parsed);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse analyzer output: ${stdout}`));
                }
            });
        });
        if (this.cache && datasetId) {
            await this.cache.saveReport(datasetId, report);
        }
        return report;
    }
}
@@ -0,0 +1,46 @@
1
import { spawn } from "child_process";
import path from "path";

/**
 * Bridges to the Python image engine (src/python/image_engine.py) to compute
 * image-quality metrics for a file or directory.
 */
export class ImageAnalyzer {
    pythonPath = "python"; // Assumes python is in PATH
    scriptPath;

    /** @param {string} [projectRoot=process.cwd()] Root containing src/python/. */
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "image_engine.py");
    }

    /**
     * Analyze image quality for a single file or a directory
     * @param {string} inputPath File or directory to analyze.
     * @returns {Promise<object>} Parsed JSON report from the Python engine.
     * @throws {Error} If python cannot be spawned, exits non-zero, reports an
     *   internal error, or prints unparseable output.
     */
    async analyze(inputPath) {
        return new Promise((resolve, reject) => {
            // Named `child` rather than `process` to avoid shadowing the global.
            const child = spawn(this.pythonPath, [
                this.scriptPath,
                inputPath
            ]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUG FIX: without this handler a missing python executable raised
            // an uncaught "error" event and the promise never settled.
            child.on("error", (err) => {
                reject(new Error(`Image Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Image Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse image analyzer output: ${stdout}`));
                }
            });
        });
    }
}
@@ -0,0 +1,46 @@
1
import { spawn } from "child_process";
import path from "path";

/**
 * Bridges to the Python media engine (src/python/media_engine.py) to compute
 * audio/video quality metrics for a file or directory.
 */
export class MediaAnalyzer {
    pythonPath = "python"; // Assumes python is in PATH
    scriptPath;

    /** @param {string} [projectRoot=process.cwd()] Root containing src/python/. */
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "media_engine.py");
    }

    /**
     * Analyze audio/video quality for a single file or a directory
     * @param {string} inputPath File or directory to analyze.
     * @returns {Promise<object>} Parsed JSON report from the Python engine.
     * @throws {Error} If python cannot be spawned, exits non-zero, reports an
     *   internal error, or prints unparseable output.
     */
    async analyze(inputPath) {
        return new Promise((resolve, reject) => {
            // Named `child` rather than `process` to avoid shadowing the global.
            const child = spawn(this.pythonPath, [
                this.scriptPath,
                inputPath
            ]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUG FIX: without this handler a missing python executable raised
            // an uncaught "error" event and the promise never settled.
            child.on("error", (err) => {
                reject(new Error(`Media Analyzer failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Media Analyzer failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse media analyzer output: ${stdout}`));
                }
            });
        });
    }
}
@@ -0,0 +1,162 @@
1
import fs from "fs";
import { ImageAnalyzer } from "./image-analyzer.js";
import { MediaAnalyzer } from "./media-analyzer.js";

/**
 * Combines text/tabular quality input with the Python image and media engines
 * to produce one unified per-dataset quality report with an overall 0-100
 * score and actionable recommendations.
 */
export class QualityOrchestrator {
    imageAnalyzer;
    mediaAnalyzer;
    constructor(projectRoot = process.cwd()) {
        this.imageAnalyzer = new ImageAnalyzer(projectRoot);
        this.mediaAnalyzer = new MediaAnalyzer(projectRoot);
    }
    /**
     * Detect modalities present in a dataset directory.
     *
     * Only inspects the top level of the directory (readdirSync is not
     * recursive), so media stored in subdirectories is not detected --
     * NOTE(review): confirm whether datasets ever nest their files.
     * Returns a subset of ["text", "image", "audio", "video"].
     */
    detectModalities(datasetPath) {
        const modalities = [];
        if (!fs.existsSync(datasetPath)) {
            return modalities;
        }
        const files = fs.readdirSync(datasetPath);
        // Check for text/tabular data
        const hasText = files.some(f => /\.(csv|json|parquet|txt)$/i.test(f));
        if (hasText)
            modalities.push("text");
        // Check for images
        const hasImages = files.some(f => /\.(jpg|jpeg|png|bmp|webp)$/i.test(f));
        if (hasImages)
            modalities.push("image");
        // Check for audio
        const hasAudio = files.some(f => /\.(wav|mp3|flac|ogg|m4a)$/i.test(f));
        if (hasAudio)
            modalities.push("audio");
        // Check for video
        const hasVideo = files.some(f => /\.(mp4|avi|mkv|mov|wmv)$/i.test(f));
        if (hasVideo)
            modalities.push("video");
        return modalities;
    }
    /**
     * Generate a unified quality report for a dataset.
     *
     * Each detected modality contributes a 0-100 sub-score; the overall score
     * is the plain average of the sub-scores that were computed. Analyzer
     * failures are logged and that modality is simply skipped.
     *
     * @param datasetId Identifier copied verbatim into the report.
     * @param datasetPath Directory whose contents are analyzed.
     * @param [textQuality] Pre-computed tabular metrics (row/column counts,
     *   missing/duplicate percentages) from an earlier analysis step.
     * @returns The assembled report object.
     */
    async generateReport(datasetId, datasetPath, textQuality) {
        const modalities = this.detectModalities(datasetPath);
        const report = {
            dataset_id: datasetId,
            modalities,
            overall_quality_score: 0,
            recommendations: [],
            generated_at: new Date().toISOString()
        };
        let totalScore = 0;
        let scoreCount = 0;
        // Text quality (if provided from existing analysis)
        if (textQuality) {
            report.text_quality = {
                row_count: textQuality.row_count || 0,
                column_count: textQuality.column_count || 0,
                missing_percentage: textQuality.missing_percentage || 0,
                duplicate_percentage: textQuality.duplicate_percentage || 0
            };
            // Text score: missing data is penalized twice as hard as duplicates.
            const textScore = Math.max(0, 100 - (report.text_quality.missing_percentage * 2) - (report.text_quality.duplicate_percentage));
            totalScore += textScore;
            scoreCount++;
            if (report.text_quality.missing_percentage > 20) {
                report.recommendations.push("High missing data detected. Consider imputation or removal.");
            }
            if (report.text_quality.duplicate_percentage > 10) {
                report.recommendations.push("Significant duplicates found. Run deduplication.");
            }
        }
        // Image quality
        if (modalities.includes("image")) {
            try {
                const imageReport = await this.imageAnalyzer.analyze(datasetPath);
                // NOTE(review): divides by total_images; assumes the engine
                // reports at least one image when image files were detected.
                report.image_quality = {
                    total_images: imageReport.total_images,
                    corrupted_count: imageReport.corrupted_count,
                    avg_resolution: `${Math.round(imageReport.average_width)}x${Math.round(imageReport.average_height)}`,
                    blurry_percentage: (imageReport.blurry_count / imageReport.total_images) * 100
                };
                // Image score: corruption can cost up to 50 points, blur up to 30.
                const corruptionPenalty = (imageReport.corrupted_count / imageReport.total_images) * 50;
                const blurPenalty = report.image_quality.blurry_percentage * 0.3;
                const imageScore = Math.max(0, 100 - corruptionPenalty - blurPenalty);
                totalScore += imageScore;
                scoreCount++;
                if (report.image_quality.corrupted_count > 0) {
                    report.recommendations.push(`Remove ${imageReport.corrupted_count} corrupted images.`);
                }
                if (report.image_quality.blurry_percentage > 15) {
                    report.recommendations.push("High blur detected. Consider filtering blurry images.");
                }
            }
            catch (e) {
                console.error("Image analysis failed:", e);
            }
        }
        // Audio quality
        if (modalities.includes("audio")) {
            try {
                const audioReport = await this.mediaAnalyzer.analyze(datasetPath);
                // The media engine reports audio and video in the same shape;
                // the presence of avg_audio_duration marks an audio report.
                if ('avg_audio_duration' in audioReport) {
                    const silentFiles = audioReport.details.filter(d => d.status === "ok" && 'is_silent' in d && d.is_silent).length;
                    // NOTE(review): divides by ok_files; NaN/Infinity if none decoded.
                    const avgSampleRate = audioReport.details
                        .filter(d => d.status === "ok" && 'sample_rate' in d)
                        .reduce((sum, d) => sum + (('sample_rate' in d) ? (d.sample_rate || 0) : 0), 0) / audioReport.ok_files;
                    report.audio_quality = {
                        total_files: audioReport.total_files,
                        avg_duration: audioReport.avg_audio_duration || 0,
                        avg_sample_rate: avgSampleRate,
                        silent_percentage: (silentFiles / audioReport.total_files) * 100
                    };
                    // Audio score: decode failures cost up to 50 points, silence up to 50.
                    const failurePenalty = (audioReport.failed_files / audioReport.total_files) * 50;
                    const silentPenalty = report.audio_quality.silent_percentage * 0.5;
                    const audioScore = Math.max(0, 100 - failurePenalty - silentPenalty);
                    totalScore += audioScore;
                    scoreCount++;
                    if (report.audio_quality.silent_percentage > 10) {
                        report.recommendations.push("High percentage of silent audio files detected.");
                    }
                }
            }
            catch (e) {
                console.error("Audio analysis failed:", e);
            }
        }
        // Video quality
        if (modalities.includes("video")) {
            try {
                const videoReport = await this.mediaAnalyzer.analyze(datasetPath);
                // avg_video_duration marks a video report (see audio note above).
                if ('avg_video_duration' in videoReport) {
                    const highRiskFiles = videoReport.details.filter(d => d.status === "ok" && d.corruption_risk === "high").length;
                    report.video_quality = {
                        total_files: videoReport.total_files,
                        avg_duration: videoReport.avg_video_duration || 0,
                        avg_fps: videoReport.avg_fps || 0,
                        corruption_risk_high: highRiskFiles
                    };
                    // Video score: decode failures cost up to 50, corruption risk up to 30.
                    const failurePenalty = (videoReport.failed_files / videoReport.total_files) * 50;
                    const corruptionPenalty = (highRiskFiles / videoReport.total_files) * 30;
                    const videoScore = Math.max(0, 100 - failurePenalty - corruptionPenalty);
                    totalScore += videoScore;
                    scoreCount++;
                    if (highRiskFiles > 0) {
                        report.recommendations.push(`${highRiskFiles} video files have high corruption risk.`);
                    }
                }
            }
            catch (e) {
                console.error("Video analysis failed:", e);
            }
        }
        // Calculate overall quality score (average of computed sub-scores).
        report.overall_quality_score = scoreCount > 0 ? Math.round(totalScore / scoreCount) : 0;
        if (report.recommendations.length === 0) {
            report.recommendations.push("Dataset quality is good. No major issues detected.");
        }
        return report;
    }
}
@@ -0,0 +1 @@
1
// Intentionally empty at runtime: this module only carried type definitions
// (erased at build time, presumably from TypeScript -- confirm against src/);
// `export {}` just marks the file as an ES module.
export {};
@@ -0,0 +1,54 @@
1
import { Embedder } from "../search/embedder.js";
import { VectorStore } from "../search/vector-store.js";
import { MetadataStore } from "../metadata/store.js";
import path from "path";

/**
 * Incrementally build the semantic search index: embed every dataset present
 * in the metadata store but absent from the vector store, checkpointing the
 * vector store periodically. Progress goes to stderr (presumably to keep
 * stdout free for machine-readable output -- confirm).
 */
async function main() {
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
    const metadataStore = new MetadataStore(dbPath);
    const vectorStore = new VectorStore(vectorPath);
    const embedder = Embedder.getInstance();
    const datasets = metadataStore.getAllDatasets();
    const indexedIds = new Set(vectorStore.getAllIds());
    // Filter to only new datasets (incremental re-run safety).
    const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
    console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
    const BATCH_SIZE = 50;
    let processed = 0;
    for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
        const batch = toIndex.slice(i, i + BATCH_SIZE);
        try {
            // Build one searchable text per dataset, capped at 1500 chars to
            // bound embedding cost. NOTE(review): a missing description is
            // stringified as "undefined" here -- confirm upstream guarantees.
            const texts = batch.map(ds => [
                ds.name,
                ds.description,
                `Task: ${ds.task}`,
                `Languages: ${ds.languages?.join(", ") || ""}`,
                `Tags: ${ds.tags?.join(" ") || ""}`
            ].join(" ").slice(0, 1500));
            // Embed each text individually but concurrently (embed() is called
            // once per item, not with an array); per-item failures are logged
            // and skipped so one bad record cannot abort the batch.
            await Promise.all(batch.map(async (ds, idx) => {
                try {
                    const vector = await embedder.embed(texts[idx]);
                    vectorStore.add(ds.id, vector);
                }
                catch (err) {
                    console.error(`Failed to index ${ds.id}:`, err);
                }
            }));
            processed += batch.length;
            // Checkpoint every 100 processed items (two batches) and on the
            // final batch, so a crash loses at most ~100 embeddings of work.
            if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
                console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
                vectorStore.save();
            }
        }
        catch (err) {
            console.error(`Batch around ${i} failed:`, err);
        }
    }
    // Final save covers any additions since the last checkpoint.
    vectorStore.save();
    console.error("Vector indexing complete.");
    metadataStore.close();
}
main().catch(console.error);
@@ -0,0 +1,73 @@
1
import Database from "better-sqlite3";
import path from "path";
import fs from "fs";

/**
 * CLI diagnostic: prints the dataset count, aggregate statistics and the top
 * five datasets by downloads from data/metadata.db. Exits with code 1 when
 * the database is missing or unreadable.
 */
async function main() {
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    if (!fs.existsSync(dbPath)) {
        console.error("Database not found. Run 'npm run scrape' first.");
        process.exit(1);
    }
    const db = new Database(dbPath);
    try {
        // Get total count
        const count = db.prepare("SELECT COUNT(*) as count FROM datasets").get();
        console.log(`\nTotal datasets in database: ${count.count}\n`);
        // Detect the schema version by probing for a column that only the new
        // schema has.
        const tableInfo = db.prepare("PRAGMA table_info(datasets)").all();
        const columns = tableInfo.map(col => col.name);
        const hasNewColumns = columns.includes("is_safe_source");
        // Get basic statistics (works with both old and new schema)
        const stats = db.prepare(`
            SELECT
                COUNT(*) as total,
                SUM(downloads) as total_downloads,
                AVG(quality_score) as avg_quality,
                SUM(CASE WHEN license_category = 'safe' THEN 1 ELSE 0 END) as safe_licenses,
                SUM(CASE WHEN has_train_split = 1 THEN 1 ELSE 0 END) as with_train_split
            FROM datasets
        `).get();
        console.log("Statistics:");
        console.log(` Total downloads: ${stats.total_downloads?.toLocaleString() || 0}`);
        console.log(` Average quality score: ${Math.round(stats.avg_quality || 0)}`);
        console.log(` Safe licenses: ${stats.safe_licenses || 0}`);
        console.log(` With train split: ${stats.with_train_split || 0}`);
        // Show extended stats if new schema is available
        if (hasNewColumns) {
            const extendedStats = db.prepare(`
                SELECT
                    SUM(CASE WHEN is_safe_source = 1 THEN 1 ELSE 0 END) as safe_sources,
                    SUM(CASE WHEN is_structured = 1 THEN 1 ELSE 0 END) as structured,
                    SUM(total_examples) as total_examples
                FROM datasets
            `).get();
            console.log(` Safe sources: ${extendedStats.safe_sources || 0}`);
            console.log(` Structured datasets: ${extendedStats.structured || 0}`);
            console.log(` Total examples: ${extendedStats.total_examples?.toLocaleString() || 0}`);
        }
        else {
            console.log(` WARNING: Database uses old schema. Re-scrape to get extended statistics.`);
        }
        console.log();
        // Top 5 by downloads
        const top5 = db.prepare(`
            SELECT id, name, downloads, quality_score, license_category
            FROM datasets
            ORDER BY downloads DESC
            LIMIT 5
        `).all();
        console.log("Top 5 datasets by downloads:");
        top5.forEach((ds, i) => {
            console.log(` ${i + 1}. ${ds.id}`);
            // BUG FIX: `downloads` may be NULL (the aggregates above already
            // guard for that); calling toLocaleString() on null crashed here.
            console.log(` Downloads: ${(ds.downloads ?? 0).toLocaleString()}, Quality: ${ds.quality_score}, License: ${ds.license_category}`);
        });
        console.log();
    }
    catch (error) {
        console.error("Error reading database:", error);
        process.exit(1);
    }
    finally {
        db.close();
    }
}
// BUG FIX: the Database constructor runs outside the try block above; without
// a rejection handler a corrupt database file became an unhandled rejection.
main().catch((error) => {
    console.error("Error reading database:", error);
    process.exit(1);
});
@@ -0,0 +1,24 @@
1
import Database from "better-sqlite3";
import path from "path";

// Debug script: scan every plausible database file for a `jobs` table and
// print any recent job whose serialized form mentions "naruto".
const candidateDbs = ["metadata.db", "vesper.db", "datasets.db"];
for (const dbName of candidateDbs) {
    const dbPath = path.resolve("data", dbName);
    try {
        const db = new Database(dbPath);
        const tables = db
            .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'")
            .all();
        if (tables.length > 0) {
            console.log(`\n--- Checking jobs in ${dbName} ---`);
            const recentJobs = db.prepare("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 20").all();
            const matches = recentJobs.filter((job) => JSON.stringify(job).toLowerCase().includes("naruto"));
            for (const job of matches) {
                console.log(JSON.stringify(job, null, 2));
            }
        }
        db.close();
    }
    catch (e) {
        // Best-effort scan: silently skip databases that are missing or
        // cannot be opened.
    }
}
@@ -0,0 +1,17 @@
1
import Database from "better-sqlite3";
import path from "path";

// Debug script: search metadata.db (the larger database) for "naruto" in
// dataset names and descriptions, and dump the matches as JSON.
const dbPath = path.resolve("data", "metadata.db");
const db = new Database(dbPath);
try {
    const query = "naruto";
    const pattern = `%${query}%`;
    const statement = db.prepare("SELECT * FROM datasets WHERE name LIKE ? OR description LIKE ?");
    const results = statement.all(pattern, pattern);
    console.log(`Found ${results.length} results for "${query}" in metadata.db:`);
    console.log(JSON.stringify(results, null, 2));
}
catch (e) {
    console.error("Error checking database:", e.message);
}
finally {
    db.close();
}
@@ -0,0 +1,62 @@
1
import { PipelineExecutor } from "../cleaning/executor.js";
import { ScriptGenerator } from "../cleaning/exporter.js";
import fs from "fs";
import path from "path";

/**
 * End-to-end demo: writes a deliberately dirty CSV to the current working
 * directory, runs the auto-cleaning pipeline on it, prints the inspection
 * report / generated plan / execution result, and exports a standalone Python
 * script that reproduces the same cleaning steps.
 *
 * NOTE(review): the demo leaves vesper_demo_data.csv and
 * demo_cleaning_script.py behind in the working directory.
 */
async function main() {
    console.log(" Vesper Dataset Ops Engine: Full Demo\n");
    const executor = new PipelineExecutor();
    const exporter = new ScriptGenerator();
    const demoFile = path.join(process.cwd(), "vesper_demo_data.csv");
    // 1. Create a realistic dirty dataset:
    //    - customer_id C001: exact duplicate row
    //    - age: mixed types ("30" quoted, "forty" textual) and an outlier (200)
    //    - email: PII
    //    - empty_col: 100% missing
    //    - score: clean data
    const csvContent = `customer_id,age,email,score,empty_col
C001,25,john.doe@example.com,88.5,
C002,"30",jane.smith@work.org,92.0,
C003,200,bob.jones@gmail.com,15.0,
C001,25,john.doe@example.com,88.5,
C004,"forty",alice@co.uk,80.0,
C005,35,,75.0,`;
    fs.writeFileSync(demoFile, csvContent);
    console.log(`📦 Created dirty dataset: ${demoFile}`);
    console.log(`Contains: Duplicates, PII (Emails), Mixed Types (Age), Outliers, Empty Columns.\n`);
    try {
        // 2. Run the auto-cleaning pipeline (inspect -> plan -> execute).
        console.log(" Running Auto-Cleaning Pipeline...");
        const result = await executor.runPipeline("demo-dataset", demoFile);
        console.log("\n --- Quality Inspection Report ---");
        console.log(` Duplicates: ${result.initial_quality.duplicate_rows} rows`);
        console.log(` PII Warnings: ${result.initial_quality.pii_warnings?.length || 0}`);
        if (result.initial_quality.schema_warnings.length > 0) {
            console.log(" Schema Issues:");
            result.initial_quality.schema_warnings.forEach(w => console.log(` ⚠️ ${w}`));
        }
        console.log("\n --- Generated Cleaning Plan ---");
        result.plan.operations.forEach((op, i) => {
            console.log(` ${i + 1}. [${op.type}] Reason: ${op.reason}`);
        });
        console.log("\n --- Execution Result ---");
        if (result.cleaning_result.success) {
            console.log(` Success! Cleaned file saved to:`);
            console.log(` ${result.final_output_path}`);
        }
        else {
            console.error(` Failed: ${result.cleaning_result.error}`);
        }
        // 3. Generate a reproducibility script so the same cleaning steps can
        //    be re-run without this tool.
        console.log("\n --- Reproducibility ---");
        const pythonScript = exporter.generatePythonScript(result.plan, demoFile);
        const scriptPath = path.join(process.cwd(), "demo_cleaning_script.py");
        fs.writeFileSync(scriptPath, pythonScript);
        console.log(` Generated Python script: ${scriptPath}`);
        console.log(` (You can run this script independently to reproduce these steps!)`);
        console.log("\n Demo Complete.");
    }
    catch (error) {
        console.error("\n Demo failed:", error);
    }
}
main().catch(console.error);