@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,39 @@
import { MetadataStore } from "../metadata/store.js";
import { DataIngestor } from "../ingestion/ingestor.js";
import path from "path";
import fs from "fs";
import { fileURLToPath } from "url";

// Resolve the project root relative to this script's own location.
const scriptPath = fileURLToPath(import.meta.url);
const scriptDir = path.dirname(scriptPath);
const projectRoot = path.join(scriptDir, "..", "..");
const dbPath = path.join(projectRoot, "data", "test-metadata.db");

// Start from a clean slate: remove any leftover test database.
if (fs.existsSync(dbPath)) {
    fs.unlinkSync(dbPath);
}

const store = new MetadataStore(dbPath);
const ingestor = new DataIngestor(projectRoot, store);

/**
 * Smoke-tests the ingestion infrastructure: target-path resolution,
 * download registration, completion bookkeeping, and raw-dir creation.
 */
async function testInfra() {
    console.log("--- Testing Ingestion Infrastructure (6.1) ---");
    const testId = "test/dataset";

    // 1. Get target path
    const target = ingestor.getTargetPath(testId);
    console.log(`Target path: ${target}`);

    // 2. Register downloading
    console.log("Registering download start...");
    store.registerDownload(testId, target, "downloading");
    let status = store.getDownloadStatus(testId);
    console.log("Status after start:", status?.status);

    // 3. Register completion
    console.log("Registering completion...");
    ingestor.completeDownload(testId, target, 1024);
    status = store.getDownloadStatus(testId);
    console.log("Status after completion:", status?.status);
    console.log("Saved path:", status?.local_path);
    console.log("Size:", status?.size_bytes, "bytes");

    // 4. Verify the raw-data directory was created as a side effect
    const rawDir = path.join(projectRoot, "data", "raw");
    console.log(`Raw data dir created: ${fs.existsSync(rawDir)}`);

    console.log("\nInfrastructure test PASSED!");
    store.close();
}

testInfra().catch(console.error);
@@ -0,0 +1,40 @@
import { InstallService } from "../install/install-service.js";
import { MetadataStore } from "../metadata/store.js";
import path from "path";
import fs from "fs";

/**
 * End-to-end check of the automatic installation flow: looks up a dataset,
 * installs a mock prepared file, and verifies both the filesystem result
 * and the `install_path` update in the metadata store.
 */
async function runTest() {
    console.log("--- Testing Automatic Installation ---");
    const projectRoot = path.resolve(".");
    const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
    try {
        const installService = new InstallService(projectRoot, metadataStore);
        const query = "naruto";
        const dataset = metadataStore.db.prepare("SELECT * FROM datasets WHERE name LIKE ? LIMIT 1").get(`%${query}%`);
        if (!dataset) {
            console.error("Naruto dataset not found in metadata.db. Please run a search first.");
            return;
        }
        console.log(`Found dataset: ${dataset.name}`);
        // Mock a prepared file
        const mockFile = path.join(projectRoot, "data", "raw", "naruto_mock.csv");
        if (!fs.existsSync(path.dirname(mockFile))) {
            fs.mkdirSync(path.dirname(mockFile), { recursive: true });
        }
        fs.writeFileSync(mockFile, "quote,character\nBelieve it!,Naruto\nI will be Hokage,Naruto");
        console.log(`Installing ${mockFile}...`);
        const installPath = await installService.install(dataset.id, mockFile);
        console.log(`✅ Success! Installed to: ${installPath}`);
        // Verify it exists
        if (fs.existsSync(installPath)) {
            console.log("File exists at install location.");
        }
        else {
            console.error("File MISSING from install location!");
        }
        // Verify metadata updated
        const updated = metadataStore.getDataset(dataset.id);
        const success = updated?.install_path === installPath;
        console.log(`Updated install_path in metadata: ${updated?.install_path}`);
        console.log(`VERIFICATION_STATUS: ${success ? "✅ PASS" : "❌ FAIL"}`);
        console.log("\n--- Test Complete ---");
    }
    finally {
        // FIX: the original never released the store. Sibling test scripts all
        // call store.close(); an open handle can keep the process alive and
        // hold a lock on metadata.db. Close on every exit path.
        metadataStore.close();
    }
}
runTest().catch(console.error);
@@ -0,0 +1,26 @@
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";

/** Prints id/name/metadata URL for one scraped result under the given label. */
function logSample(label, result) {
    console.log(`✅ ${label} Sample:`);
    console.log(` - ID: ${result.id}`);
    console.log(` - Name: ${result.name}`);
    console.log(` - URL: ${result.metadata_url}`);
}

/**
 * Exercises both institutional scrapers with a small live query each and
 * prints a sample record when results come back.
 */
async function runTest() {
    console.log("--- Testing Institutional Data Sources ---");
    const wb = new WorldBankScraper();
    const nasa = new NASAScraper();

    console.log('Searching World Bank for: "climate"...');
    const wbResults = await wb.scrape("climate", 3);
    console.log(`Found ${wbResults.length} World Bank results.`);
    if (wbResults.length > 0) {
        logSample("World Bank", wbResults[0]);
    }

    console.log('\nSearching NASA for: "astronomy"...');
    const nasaResults = await nasa.scrape("astronomy", 3);
    console.log(`Found ${nasaResults.length} NASA results.`);
    if (nasaResults.length > 0) {
        logSample("NASA", nasaResults[0]);
    }

    console.log("\n--- Test Complete ---");
}

runTest().catch(console.error);
@@ -0,0 +1,41 @@
import { QualityAnalyzer } from "../quality/analyzer.js";
import fs from "fs";
import path from "path";

/**
 * Verifies that the quality analyzer detects duplicated text content even
 * when rows carry distinct IDs (so exact-row dedup finds nothing).
 */
async function main() {
    console.log("=== Vesper Integrity Check Test ===\n");
    const analyzer = new QualityAnalyzer();
    const testFile = path.join(process.cwd(), "test_integrity.csv");
    // Create a CSV with unique IDs but duplicate CONTENT.
    // This simulates a common data issue: exact same review scraped twice with different IDs.
    const csvContent = `id,review,label
1,"This product involves huge risks.",negative
2,"Great investment opportunity!",positive
3,"This product involves huge risks.",negative
4,"Wait and see.",neutral
5,"Great investment opportunity!",positive`;
    fs.writeFileSync(testFile, csvContent);
    console.log(`Created test file: ${testFile}`);
    console.log("Expectations:");
    console.log(" - Duplicate Rows: 0 (because IDs differ)");
    console.log(" - Text Duplicates: > 0 (because 'review' column has dupes)");
    try {
        console.log("\nRunning analyzer...");
        const report = await analyzer.analyze(testFile);
        console.log("\n=== Integrity Report ===");
        console.log(`Duplicate Rows (Exact): ${report.duplicate_rows}`);
        console.log(`Text Duplicates (Content): ${report.text_duplicates || 0}`);
        console.log("\nWarnings:", report.warnings);
        if (report.text_duplicates && report.text_duplicates > 0) {
            console.log("Test passed! Detected text duplication despite unique IDs.");
        }
        else {
            console.error("Test failed! Did not detect text duplicates.");
        }
    }
    catch (error) {
        console.error("\nTest failed:", error);
    }
    finally {
        // FIX: the original only unlinked the CSV on the success path, leaking
        // test_integrity.csv into the CWD when analyze() threw. Clean up always.
        if (fs.existsSync(testFile)) {
            fs.unlinkSync(testFile);
        }
    }
}
main().catch(console.error);
@@ -0,0 +1,42 @@
import { SearchEngine } from "../search/engine.js";
import { MetadataStore } from "../metadata/store.js";
import { VectorStore } from "../search/vector-store.js";
import { Embedder } from "../search/embedder.js";
import path from "path";

/**
 * Test JIT fallback with various queries: one common query expected to be
 * served from the existing index, plus two niche queries expected to
 * trigger the just-in-time scraping fallback.
 */
async function main() {
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
    const store = new MetadataStore(dbPath);
    const vectorStore = new VectorStore(vectorPath);
    const embedder = Embedder.getInstance();
    await embedder.init();
    const engine = new SearchEngine(store, vectorStore, embedder);

    console.log("\n=== JIT Fallback Test Suite ===\n");

    // Table-driven form of the original three sequential test cases;
    // output is identical to running them one by one.
    const cases = [
        { title: "Test 1: Common query (should NOT trigger JIT)", query: "image classification cats dogs" },
        { title: "Test 2: Niche query (SHOULD trigger JIT)", query: "underwater acoustic signal processing dolphins" },
        { title: "Test 3: Another niche query (SHOULD trigger JIT)", query: "mongolian language morphological analysis" },
    ];
    for (const { title, query } of cases) {
        console.log(title);
        console.log(`Query: '${query}'\n`);
        const results = await engine.search(query, { limit: 5 });
        console.log(`Results: ${results.length}, Top score: ${results[0]?.relevance_score || 0}`);
        console.log("---\n");
    }

    // Compare the dataset count after the run to see whether JIT ingested
    // anything new during the niche queries.
    const finalCount = store.getAllDatasets().length;
    console.log(`\nFinal dataset count: ${finalCount}`);
    console.log("(Compare with initial 1238 to see if JIT added new datasets)\n");
    store.close();
}

main().catch(console.error);
@@ -0,0 +1,62 @@
import { MetadataStore } from "../metadata/store.js";
import { JobManager } from "../jobs/manager.js";
// Verifies the job queue end-to-end: priority ordering, bounded concurrency,
// and retry-after-failure. Jobs are "processed" by a simulated worker that
// sleeps, and completion is detected by polling queue stats.
async function runTest() {
    console.log("--- Initializing Job Queue Test ---");
    const store = new MetadataStore("data/vesper_test_jobs.db");
    const manager = JobManager.getInstance(store);
    manager.setConcurrency(2); // 2 parallel workers
    // Completion order of job IDs — used below to verify priority scheduling.
    const jobsFinished = [];
    // Listener to simulate job processing
    manager.on("processJob", async (job, run) => {
        console.log(`[Worker] Starting Job: ${job.id} (Type: ${job.type}, Priority: ${job.priority})`);
        await run(async () => {
            // Simulate variable workload: high-priority (10) jobs finish fast
            // so priority ordering is observable despite 2-way parallelism.
            const duration = job.priority === 10 ? 500 : 2000;
            await new Promise(r => setTimeout(r, duration));
            // Simulate failure for a specific job to test retries — throws only
            // on the first attempt (attempts === 0).
            if (job.metadata === "FAIL_ONCE" && job.attempts === 0) {
                console.log(`[Worker] Simulating failure for job ${job.id}`);
                throw new Error("Transitory error");
            }
            console.log(`[Worker] Finished Job: ${job.id}`);
            jobsFinished.push(job.id);
        });
    });
    console.log("\n--- Enqueuing Jobs ---");
    // 1. A slow low-priority job
    // NOTE(review): j1 is enqueued but never asserted on — it only occupies a
    // worker slot so the priority comparison below is meaningful.
    const j1 = manager.createJob("prepare", 0, "slow-1");
    // 2. A fast high-priority job (Pro user)
    const j2 = manager.createJob("clean", 10, "pro-1");
    // 3. Another low-priority job
    const j3 = manager.createJob("split", 0, "slow-2");
    // 4. A job that fails once
    const j4 = manager.createJob("fusion", 5, "FAIL_ONCE");
    console.log(`Enqueued 4 jobs. Concurrency is 2.`);
    // Wait for all to finish: poll once per second until the queue drains AND
    // the retried job (j4) has actually completed its second attempt.
    return new Promise((resolve) => {
        const check = setInterval(() => {
            const stats = manager.queue.getStats();
            if (stats.total === 0 && jobsFinished.includes(j4.id)) {
                clearInterval(check);
                console.log("\n--- Test Results ---");
                console.log("Execution Order:", jobsFinished);
                // Priority check: the pro job (priority 10) should complete
                // before the later low-priority job, despite being enqueued after j1.
                const proIndex = jobsFinished.indexOf(j2.id);
                const slowIndex = jobsFinished.indexOf(j3.id);
                if (proIndex < slowIndex) {
                    console.log("✅ Priority verified: Pro job finished before later low-priority jobs.");
                }
                else {
                    console.warn("⚠️ Priority check failed or inconclusive due to parallel timing.");
                }
                // Retry check: after one simulated failure, the persisted job
                // record should show exactly one prior attempt.
                const j4_final = store.getJob(j4.id);
                if (j4_final?.attempts === 1) {
                    console.log("✅ Retry logic verified: Job retried after failure.");
                }
                console.log("\n✅ Success: Job queue system verified.");
                store.close();
                resolve(null);
            }
        }, 1000);
    });
}
runTest().catch(console.error);
@@ -0,0 +1,34 @@
import { KaggleDownloader } from "../ingestion/kaggle-downloader.js";
import path from "path";
import fs from "fs";
import { fileURLToPath } from "url";

// Locate the project root relative to this script.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const projectRoot = path.join(scriptDir, "..", "..");

/**
 * Integration check for the Kaggle downloader. Requires KAGGLE_USERNAME
 * and KAGGLE_KEY in the environment; otherwise the test is skipped.
 */
async function testKaggleDownload() {
    // Try to get credentials from env
    const { KAGGLE_USERNAME: user, KAGGLE_KEY: key } = process.env;
    if (!user || !key) {
        console.error("KAGGLE_USERNAME and KAGGLE_KEY must be set to run this test.");
        return;
    }

    const downloader = new KaggleDownloader(user, key);
    const repoId = "shivam2503/diamonds"; // Small classic dataset
    console.log(`Testing Kaggle Download for ${repoId}...`);

    // Start from an empty target directory.
    const targetDir = path.join(projectRoot, "data", "test", "kaggle_diamonds");
    if (fs.existsSync(targetDir)) {
        fs.rmSync(targetDir, { recursive: true });
    }

    try {
        const onProgress = (p) => {
            process.stdout.write(`\rProgress: ${p}%`);
        };
        const bestFile = await downloader.download(repoId, targetDir, onProgress);
        console.log(`\nDownload complete!`);
        console.log(`Best file found: ${bestFile}`);
        console.log(`Size: ${fs.statSync(bestFile).size} bytes`);
    }
    catch (e) {
        console.error("\nKaggle Download failed:", e.message);
    }
}

testKaggleDownload().catch(console.error);
@@ -0,0 +1,50 @@
import { StreamProcessor } from "../data/streaming.js";
import { WorkerPool } from "../data/worker-pool.js";
import { Readable } from "stream";

/**
 * Exercises large-dataset handling: percentage sampling, chunked stream
 * processing, and (optionally) the parallel worker pool.
 */
async function runTest() {
    console.log("--- Initializing Large Dataset Handling Test ---");
    // 1. Generate Mock Data (10,000 records)
    const records = Array.from({ length: 10000 }, (_, i) => ({
        id: i,
        name: `Record ${i}`,
        value: Math.random() * 100,
        category: i % 2 === 0 ? "A" : "B"
    }));
    console.log(`Generated ${records.length} mock records.`);
    // 2. Test Sampling Strategy (10% keep-rate predicate)
    console.log("\n--- Testing Sampling Strategy (10%) ---");
    const isSampled = StreamProcessor.createSampler(10);
    const sample = records.filter(isSampled);
    console.log(`Sample size: ${sample.length} (Expected ~1000)`);
    // 3. Test Streaming Utility: push all records through in 2000-row chunks
    console.log("\n--- Testing Streaming Utility (Chunk size: 2000) ---");
    const stream = Readable.from(records);
    let chunkCount = 0;
    const processChunk = async (chunk) => {
        chunkCount++;
        console.log(`Processing chunk ${chunkCount} of size ${chunk.length}`);
        return chunk.map(r => ({ ...r, status: "streamed" }));
    };
    const streamedResults = await StreamProcessor.processInChunks(stream, { chunkSize: 2000 }, processChunk);
    console.log(`Total streamed records: ${streamedResults.length}`);
    // 4. Test Parallel Worker Pool (best-effort; worker_threads may be unavailable)
    console.log("\n--- Testing Parallel Worker Pool (Optional) ---");
    try {
        const pool = new WorkerPool(4);
        console.log("Running parallel transformation on all records...");
        const start = Date.now();
        const parallelResults = await pool.processParallel(records, 2500);
        const end = Date.now();
        console.log(`Parallel processing took ${end - start}ms`);
        // FIX: the original never used `parallelResults`, so a pool returning an
        // empty result would still look successful. Surface the record count.
        console.log(`Parallel results: ${parallelResults.length} records`);
    }
    catch (err) {
        // Deliberate best-effort: streaming/sampling above are the hard requirements.
        console.warn(" Parallel Worker Pool skipped or failed (Common with tsx + worker_threads). Streaming and Sampling are still functional.");
    }
    if (streamedResults.length === 10000) {
        console.log("\n✅ Success: Chunked streaming and sampling verified.");
    }
    else {
        console.error("\n❌ Failure: Processing mismatch.");
    }
}
runTest().catch(console.error);
@@ -0,0 +1,73 @@
import { MetadataStore } from "../metadata/store.js";
import { JobManager } from "../jobs/manager.js";
import { SearchEngine } from "../search/engine.js";
import { VectorStore } from "../search/vector-store.js";
import { Embedder } from "../search/embedder.js";
import { PipelineExecutor } from "../cleaning/executor.js";
import path from "path";
import fs from "fs";
// Smoke-test for the Phase 5.2 MCP tools: dataset comparison, quality
// analysis/preview (simulated), asynchronous "prepare" jobs, and
// job-status polling via the metadata store.
async function testPhase5Tools() {
    console.log(" Testing Phase 5.2: New MCP Tools Integration\n");
    const dbPath = path.join(process.cwd(), "data", "metadata.db");
    const vectorPath = path.join(process.cwd(), "data", "vectors.json");
    const store = new MetadataStore(dbPath);
    const vectorStore = new VectorStore(vectorPath);
    const embedder = Embedder.getInstance();
    // NOTE(review): `searchEngine` and `executor` are constructed but never
    // used below — presumably a construction smoke-check; confirm or remove.
    const searchEngine = new SearchEngine(store, vectorStore, embedder);
    const jobManager = JobManager.getInstance(store);
    const executor = new PipelineExecutor();
    // 1. Test compare_datasets: build a two-column markdown comparison table
    // from the first two datasets in the store (mirrors index.ts logic).
    console.log("Step 1: Testing compare_datasets...");
    const allDs = store.getAllDatasets();
    if (allDs.length < 2) {
        console.warn(" ! Need at least 2 datasets in DB to test comparison. Skipping.");
    }
    else {
        const ids = allDs.slice(0, 2).map(d => d.id);
        console.log(` - Comparing: ${ids.join(", ")}`);
        // Simulating comparison table logic from index.ts
        const datasets = ids.map(id => store.getDataset(id)).filter(d => !!d);
        let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
        comparison += "| Quality Score | " + datasets.map(d => d.quality_score).join(" | ") + " |\n";
        console.log(comparison);
    }
    // 2. Test analyze_quality / preview_cleaning (Local logic) — only checks
    // that the e2e demo output file exists; no analysis is actually run here.
    console.log("\nStep 2: Testing Analysis/Preview (Simulated)...");
    const testFile = path.join(process.cwd(), "e2e_demo_output", "raw_data.csv");
    if (fs.existsSync(testFile)) {
        console.log(" - Testing on e2e_demo_output/raw_data.csv");
        // Logic check
    }
    else {
        console.warn(" ! No test file found at e2e_demo_output/raw_data.csv. Run 'tsx src/scripts/e2e-demo.ts' first.");
    }
    // 3. Test prepare_dataset (Async Job): create a job and drive it through
    // staged progress updates on a background promise.
    console.log("\nStep 3: Testing prepare_dataset (Autonomous Orchestrator)...");
    const query = "financial data for stock prediction";
    // Create job (Logic from index.ts)
    const job = jobManager.createJob("prepare", 0, { query });
    console.log(` - Job Created: ${job.id}`);
    // Start background task
    console.log(" - Starting autonomous preparation...");
    const jobPromise = jobManager.runJob(job.id, async (update) => {
        update({ progress: 20, status_text: "Searching..." });
        await new Promise(r => setTimeout(r, 1000));
        update({ progress: 50, status_text: "Analyzing quality..." });
        await new Promise(r => setTimeout(r, 1000));
        update({ progress: 80, status_text: "Formatting export..." });
        await new Promise(r => setTimeout(r, 1000));
        return "data/exports/prepared_dataset.parquet";
    });
    // 4. Test check_job_status (Polling): read the persisted job record every
    // ~0.8s while the background task above advances it.
    console.log("\nStep 4: Polling Job Status (Simulating UI Check)...");
    for (let i = 0; i < 5; i++) {
        const currentJob = store.getJob(job.id);
        console.log(` - [TS +${i}s] Status: ${currentJob?.status}, Progress: ${currentJob?.progress}%, Activity: ${currentJob?.status_text}`);
        if (currentJob?.status === "completed")
            break;
        await new Promise(r => setTimeout(r, 800));
    }
    await jobPromise;
    console.log("\n Phase 5 tools logic verified.");
}
testPhase5Tools().catch(console.error);
@@ -0,0 +1,61 @@
import { MediaAnalyzer } from "../quality/media-analyzer.js";
import path from "path";
import fs from "fs";
import { execSync } from "child_process";

/**
 * Verifies audio quality analysis end-to-end: generates a 2-second 440 Hz
 * sine wave via a small Python helper, then runs MediaAnalyzer on it and
 * checks the reported file counts.
 */
async function runTest() {
    console.log("--- Testing Media Quality Analysis ---");
    const projectRoot = path.resolve(".");
    const analyzer = new MediaAnalyzer(projectRoot);

    // 1. Create a sample audio file using Python (sine wave)
    const testMediaDir = path.join(projectRoot, "data", "test-media");
    if (!fs.existsSync(testMediaDir)) {
        fs.mkdirSync(testMediaDir, { recursive: true });
    }
    const audioPath = path.join(testMediaDir, "test_audio.wav");
    console.log("Generating test audio file (sine wave)...");
    const pythonScript = `
import numpy as np
import soundfile as sf
sample_rate = 44100
duration = 2.0
frequency = 440.0
t = np.linspace(0, duration, int(sample_rate * duration))
audio = 0.5 * np.sin(2 * np.pi * frequency * t)
sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
`;
    const generatorPath = path.join(testMediaDir, "gen_audio.py");
    fs.writeFileSync(generatorPath, pythonScript);
    try {
        execSync(`python "${generatorPath}"`);
    }
    catch (e) {
        // Soft-skip: the helper depends on the optional `soundfile` package.
        console.log("⚠️ Could not generate audio (soundfile may not be installed). Skipping audio test.");
        console.log("VERIFICATION_STATUS: ⚠️ PARTIAL (Audio generation failed)");
        return;
    }

    // 2. Run Analysis
    console.log(`Analyzing ${audioPath}...`);
    try {
        const report = await analyzer.analyze(audioPath);
        console.log("Analysis Result:");
        console.log(`- Total Files: ${report.total_files}`);
        console.log(`- OK Files: ${report.ok_files}`);
        if ('avg_audio_duration' in report) {
            console.log(`- Average Audio Duration: ${report.avg_audio_duration}s`);
            const firstAudio = report.details[0];
            if (firstAudio && 'sample_rate' in firstAudio) {
                console.log(`- Sample Rate: ${firstAudio.sample_rate}Hz`);
                console.log(`- Duration: ${firstAudio.duration}s`);
            }
        }
        // Exactly one file was provided, so exactly one OK file is expected.
        const statsOk = report.ok_files === 1 && report.total_files === 1;
        if (statsOk) {
            console.log("\nVERIFICATION_STATUS: ✅ PASS");
        }
        else {
            console.log("\nVERIFICATION_STATUS: ❌ FAIL - Incorrect stats");
        }
    }
    catch (e) {
        console.error(`Analysis failed: ${e.message}`);
        console.log("\nVERIFICATION_STATUS: ❌ FAIL");
    }
}

runTest().catch(console.error);
@@ -0,0 +1,91 @@
import { MetadataStore } from "../metadata/store.js";
import { MonitoringStore } from "../metadata/monitoring-store.js";
import { MonitoringService } from "../metadata/monitoring-service.js";
import path from "path";
// End-to-end check of dataset monitoring: seed a dataset, register a
// monitor + webhook, simulate an upstream metadata change, and verify the
// service detects the new version and produces a scored diff.
async function runTest() {
    const dbPath = path.resolve("data", "test-monitoring.db");
    const metadataStore = new MetadataStore(dbPath);
    // MonitoringStore shares the metadata store's underlying DB handle.
    const monitoringStore = new MonitoringStore(metadataStore.db);
    const service = new MonitoringService(monitoringStore, metadataStore);
    // 1. Setup mock data: a fully-populated dataset record at version 2023-01-01.
    const dsId = "test/dataset-1";
    const initialMetadata = {
        id: dsId,
        source: "huggingface",
        name: "Test Dataset",
        description: "Initial description",
        downloads: 100,
        likes: 10,
        last_updated: "2023-01-01T00:00:00Z",
        download_url: "https://huggingface.co/datasets/test/dataset-1",
        quality_score: 85,
        splits: [{ name: "train", num_examples: 1000 }],
        license: { id: "mit", category: "safe", usage_restrictions: [], warnings: [] },
        tags: [],
        task: "text-classification",
        languages: ["en"],
        is_structured: true,
        has_target_column: true,
        total_examples: 1000,
        is_safe_source: true,
        has_personal_data: false,
        is_paywalled: false,
        is_scraped_web_data: false,
        uses_https: true,
        has_train_split: true,
        has_test_split: false,
        has_validation_split: false,
        description_length: 100,
        has_readme: true,
        quality_warnings: []
    };
    metadataStore.saveDataset(initialMetadata);
    // 2. Setup Monitor and Webhook: the monitor tracks dsId and references the
    // webhook by id; last_checked_version is pinned to the initial timestamp.
    monitoringStore.saveWebhook({
        id: "slack-1",
        name: "General Slack",
        channel: "slack",
        url: "https://hooks.slack.com/services/...",
        enabled: true
    });
    monitoringStore.saveMonitor({
        dataset_id: dsId,
        enabled: true,
        auto_reprocess: true,
        last_checked_version: initialMetadata.last_updated,
        webhook_ids: ["slack-1"],
        created_at: new Date().toISOString(),
        updated_at: new Date().toISOString()
    });
    console.log("--- Monitoring Setup Complete ---");
    // 3. Simulate an update: bump the version timestamp plus a few fields the
    // diff logic should pick up (downloads, quality, example count).
    const updatedMetadata = {
        ...initialMetadata,
        last_updated: "2023-02-01T00:00:00Z", // New version
        downloads: 150,
        quality_score: 90,
        total_examples: 1200 // Significant change
    };
    console.log("Checking for updates...");
    // Mock fetch function: stands in for the live scraper and returns the
    // "remote" metadata for the monitored id, null for anything else.
    const mockFetch = async (id, source) => {
        if (id === dsId)
            return updatedMetadata;
        return null;
    };
    const diffs = await service.checkUpdates(mockFetch);
    console.log(`Found ${diffs.length} updates.`);
    if (diffs.length > 0) {
        const diff = diffs[0];
        console.log(`Version Change: ${diff.old_version} -> ${diff.new_version}`);
        console.log(`Impact Score: ${diff.impact_score}`);
        console.log("Changes:", JSON.stringify(diff.changes, null, 2));
    }
    // Pass criterion: at least one diff with a positive impact score.
    if (diffs.length > 0 && diffs[0].impact_score > 0) {
        console.log("\n✅ Success: Dataset update detected and diffed correctly.");
    }
    else {
        console.error("\n❌ Failure: Update not detected.");
    }
    // NOTE(review): metadataStore is never closed here, unlike sibling test
    // scripts that call store.close() — confirm whether the handle should be
    // released before the process exits.
}
runTest().catch(console.error);