vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,53 +0,0 @@
1
- import { DataExporter } from "../export/exporter.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Data Exporter Test ===\n");
6
- const exporter = new DataExporter();
7
- const testDir = path.join(process.cwd(), "test_export");
8
- if (!fs.existsSync(testDir))
9
- fs.mkdirSync(testDir);
10
- const inputFile = path.join(testDir, "input.csv");
11
- // Create Dummy Data
12
- let csvContent = "id,name,value\n";
13
- for (let i = 0; i < 10; i++)
14
- csvContent += `${i},item_${i},${Math.random()}\n`;
15
- fs.writeFileSync(inputFile, csvContent);
16
- console.log(`Created input file: ${inputFile}`);
17
- const formats = ["parquet", "jsonl", "arrow"];
18
- for (const fmt of formats) {
19
- const outputFile = path.join(testDir, `output.${fmt}`);
20
- console.log(`\n--- Exporting to ${fmt.toUpperCase()} ---`);
21
- try {
22
- const result = await exporter.export(inputFile, outputFile, fmt);
23
- console.log("Result:", result);
24
- if (fs.existsSync(outputFile)) {
25
- const stats = fs.statSync(outputFile);
26
- console.log(`PASS: File created (${stats.size} bytes)`);
27
- }
28
- else {
29
- console.error("FAIL: Output file not found");
30
- }
31
- }
32
- catch (e) {
33
- console.error(`FAIL: Export to ${fmt} failed:`, e);
34
- }
35
- }
36
- // specific Arrow test
37
- // specific TFRecord test (might fail if no tensorflow)
38
- console.log(`\n--- Exporting to TFRECORD (Optional) ---`);
39
- const tfFile = path.join(testDir, "output.tfrecord");
40
- try {
41
- const result = await exporter.export(inputFile, tfFile, "tfrecord");
42
- console.log("Result:", result);
43
- if (fs.existsSync(tfFile)) {
44
- console.log("PASS: TFRecord file created");
45
- }
46
- }
47
- catch (e) {
48
- console.log("SKIP: TFRecord export failed (likely no tensorflow installed):", e.message || e);
49
- }
50
- // Cleanup
51
- // fs.rmSync(testDir, { recursive: true, force: true });
52
- }
53
- main().catch(console.error);
@@ -1,61 +0,0 @@
1
- import { FusionOrchestrator } from "../fusion/orchestrator.js";
2
- import * as fs from "fs";
3
- async function runTest() {
4
- const ds1 = {
5
- id: "ds1",
6
- name: "Dataset 1",
7
- source: "huggingface",
8
- columns: [
9
- { name: "text", type: "string" },
10
- { name: "label", type: "int", is_target: true }
11
- ],
12
- // ... other required fields (using type assertion for brevity in test)
13
- };
14
- const ds2 = {
15
- id: "ds2",
16
- name: "Dataset 2",
17
- source: "kaggle",
18
- columns: [
19
- { name: "sentence", type: "string" },
20
- { name: "sentiment", type: "string", is_target: true }
21
- ],
22
- };
23
- const config = {
24
- target_column: "target",
25
- column_aliases: {
26
- "text": ["sentence", "content"],
27
- "target": ["label", "sentiment", "target"]
28
- },
29
- type_overrides: {},
30
- dedupe_config: {
31
- exact: true,
32
- fuzzy: true,
33
- fuzzy_threshold: 0.6,
34
- fuzzy_columns: ["text"]
35
- },
36
- label_map: {
37
- "pos": 1,
38
- "neg": 0,
39
- "0": 0,
40
- "1": 1
41
- }
42
- };
43
- const records = [
44
- { datasetId: "ds1", record: { text: "I love this!", label: 1 } },
45
- { datasetId: "ds1", record: { text: "This is bad.", label: 0 } },
46
- { datasetId: "ds2", record: { sentence: "I love this!", sentiment: "pos" } }, // Exact duplicate (after alignment)
47
- { datasetId: "ds2", record: { sentence: "I really love this!", sentiment: "pos" } }, // Fuzzy duplicate
48
- { datasetId: "ds2", record: { sentence: "It was okay.", sentiment: "neg" } },
49
- ];
50
- const orchestrator = new FusionOrchestrator(config);
51
- const result = await orchestrator.fuse([ds1, ds2], records);
52
- fs.writeFileSync("test-fusion-results.json", JSON.stringify(result, null, 2));
53
- console.log("Results written to test-fusion-results.json");
54
- if (result.stats.total_output_rows === 3) {
55
- console.log("✅ Success: Rows correctly fused and deduplicated.");
56
- }
57
- else {
58
- console.error(`❌ Failure: Expected 3 rows, got ${result.stats.total_output_rows}`);
59
- }
60
- }
61
- runTest().catch(console.error);
@@ -1,27 +0,0 @@
1
- import { GitHubScraper } from "../metadata/github-scraper.js";
2
- async function runTest() {
3
- console.log("--- Testing GitHub integration ---");
4
- const scraper = new GitHubScraper();
5
- const query = "covid-19";
6
- console.log(`Searching GitHub for: "${query}"...`);
7
- const results = await scraper.scrape(query, 5);
8
- console.log(`Found ${results.length} datasets.`);
9
- if (results.length > 0) {
10
- console.log("✅ GitHub Scraper returned results.");
11
- console.log("Sample Result:");
12
- console.log(JSON.stringify(results[0], null, 2));
13
- // Validation
14
- const sample = results[0];
15
- if (sample.id.startsWith("github:") && sample.source === "github") {
16
- console.log("✅ Metadata schema validation passed.");
17
- }
18
- else {
19
- console.error("❌ Metadata schema validation failed.");
20
- }
21
- }
22
- else {
23
- console.error("❌ No results found or rate limit hit.");
24
- }
25
- console.log("--- Test Complete ---");
26
- }
27
- runTest().catch(console.error);
@@ -1,52 +0,0 @@
1
- import { DataSplitter } from "../splitting/splitter.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Group Split Test ===\n");
6
- const splitter = new DataSplitter();
7
- const testFile = path.join(process.cwd(), "test_group_split.csv");
8
- // Create Dummy Data (100 rows, 10 groups of 10 rows each)
9
- let csvContent = "id,group_id,value\n";
10
- for (let g = 0; g < 10; g++) {
11
- for (let i = 0; i < 10; i++) {
12
- csvContent += `${g * 10 + i},group_${g},${Math.random()}\n`;
13
- }
14
- }
15
- fs.writeFileSync(testFile, csvContent);
16
- console.log(`Created test file with 10 distinct groups.`);
17
- // Test: Group-based Split (60/20/20)
18
- const config = {
19
- type: "group",
20
- ratios: { train: 0.6, val: 0.2, test: 0.2, holdout: 0 },
21
- group_column: "group_id",
22
- shuffle: true,
23
- random_seed: 42
24
- };
25
- try {
26
- const result = await splitter.split(testFile, config);
27
- console.log("Stats:", result.stats);
28
- // Validation: No group should exist in more than one split
29
- console.log("\n--- Group Leakage Validation ---");
30
- const report = await splitter.validate(result.paths, { id_column: "group_id" }); // Checking uniqueness of group_id across splits
31
- console.log("Validation Report:", report);
32
- if (!report.leakage_detected) {
33
- console.log("PASS: No group leakage detected.");
34
- }
35
- else {
36
- console.error("FAIL: Groups leaked across splits!");
37
- }
38
- // Cleanup
39
- Object.values(result.paths).forEach(p => {
40
- if (fs.existsSync(p))
41
- fs.unlinkSync(p);
42
- });
43
- }
44
- catch (e) {
45
- console.error("Test execution failed:", e);
46
- }
47
- finally {
48
- if (fs.existsSync(testFile))
49
- fs.unlinkSync(testFile);
50
- }
51
- }
52
- main().catch(console.error);
@@ -1,29 +0,0 @@
1
- import { HFDownloader } from "../ingestion/hf-downloader.js";
2
- import path from "path";
3
- import fs from "fs";
4
- import { fileURLToPath } from "url";
5
- const __filename = fileURLToPath(import.meta.url);
6
- const __dirname = path.dirname(__filename);
7
- const projectRoot = path.join(__dirname, "..", "..");
8
- async function testHFDownload() {
9
- const downloader = new HFDownloader();
10
- const repoId = "fka/awesome-chatgpt-prompts";
11
- console.log(`Testing HF Download for ${repoId}...`);
12
- const bestFile = await downloader.findBestFile(repoId);
13
- console.log(`Best file found: ${bestFile}`);
14
- if (bestFile) {
15
- const testDir = path.join(projectRoot, "data", "test");
16
- if (!fs.existsSync(testDir))
17
- fs.mkdirSync(testDir, { recursive: true });
18
- const targetPath = path.join(testDir, "awesome-prompts.csv");
19
- await downloader.download(repoId, bestFile, targetPath, (p) => {
20
- process.stdout.write(`\rProgress: ${p}%`);
21
- });
22
- console.log(`\nDownload complete! Size: ${fs.statSync(targetPath).size} bytes`);
23
- console.log(`Location: ${targetPath}`);
24
- }
25
- else {
26
- console.error("No file found!");
27
- }
28
- }
29
- testHFDownload().catch(console.error);
@@ -1,61 +0,0 @@
1
- import { DataSplitter } from "../splitting/splitter.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Holdout Set Manager Test ===\n");
6
- const splitter = new DataSplitter();
7
- const testFile = path.join(process.cwd(), "test_holdout.csv");
8
- // Create Dummy Data (200 rows)
9
- let csvContent = "id,label,date\n";
10
- for (let i = 0; i < 100; i++)
11
- csvContent += `${i},A,2023-01-${(i % 30) + 1}\n`;
12
- for (let i = 100; i < 200; i++)
13
- csvContent += `${i},B,2023-02-${(i % 28) + 1}\n`;
14
- fs.writeFileSync(testFile, csvContent);
15
- console.log(`Created test file: ${testFile}`);
16
- // Test: 4-way Random Split (70/10/10/10)
17
- console.log("\n--- Test: 4-way Random Split (70/10/10/10) ---");
18
- const config = {
19
- type: "random",
20
- ratios: { train: 0.7, val: 0.1, test: 0.1, holdout: 0.1 },
21
- shuffle: true,
22
- random_seed: 42
23
- };
24
- try {
25
- const result = await splitter.split(testFile, config);
26
- console.log("Stats:", result.stats);
27
- const expected = { train: 140, val: 20, test: 20, holdout: 20 };
28
- if (result.stats.train_rows === expected.train &&
29
- result.stats.val_rows === expected.val &&
30
- result.stats.test_rows === expected.test &&
31
- result.stats.holdout_rows === expected.holdout) {
32
- console.log("PASS: 4-way ratios preserved");
33
- }
34
- else {
35
- console.error(`FAIL: Ratios mismatch. Expected ${JSON.stringify(expected)}, got ${JSON.stringify(result.stats)}`);
36
- }
37
- // Validate Split for Leakage
38
- console.log("\n--- Validation Check ---");
39
- const report = await splitter.validate(result.paths, { id_column: "id" });
40
- console.log("Report:", report);
41
- if (!report.leakage_detected) {
42
- console.log("PASS: No leakage between 4 splits");
43
- }
44
- else {
45
- console.error("FAIL: Leakage detected!");
46
- }
47
- // Cleanup
48
- Object.values(result.paths).forEach(p => {
49
- if (fs.existsSync(p))
50
- fs.unlinkSync(p);
51
- });
52
- }
53
- catch (e) {
54
- console.error("Test execution failed:", e);
55
- }
56
- finally {
57
- if (fs.existsSync(testFile))
58
- fs.unlinkSync(testFile);
59
- }
60
- }
61
- main().catch(console.error);
@@ -1,41 +0,0 @@
1
- import { SearchEngine } from "../search/engine.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import { VectorStore } from "../search/vector-store.js";
4
- import { Embedder } from "../search/embedder.js";
5
- import path from "path";
6
- async function main() {
7
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
8
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
9
- const store = new MetadataStore(dbPath);
10
- const vectorStore = new VectorStore(vectorPath);
11
- const embedder = Embedder.getInstance();
12
- await embedder.init();
13
- const engine = new SearchEngine(store, vectorStore, embedder);
14
- console.log("\n=== Hybrid Search Precision Test ===\n");
15
- // Test 1: Financial Forecasting (Drift Check)
16
- // Should NOT have crypto in top results if hybrid logic works
17
- console.log("Test 1: 'financial forecasting' (Should penalize Crypto)");
18
- const results1 = await engine.search("financial forecasting", { limit: 5 });
19
- results1.forEach((r, i) => {
20
- const score = r.relevance_score;
21
- const vector = r.vector_score;
22
- const lexical = r.lexical_score;
23
- console.log(` ${i + 1}. [${score}] ${r.name} (Vec: ${vector}, Lex: ${lexical})`);
24
- if (r.name.toLowerCase().includes("crypto") || r.description.toLowerCase().includes("bitcoin")) {
25
- console.error(" CRITICAL: Crypto found in top results!");
26
- }
27
- });
28
- console.log("");
29
- // Test 2: Negative Keywords
30
- console.log("Test 2: 'financial forecasting -stock' (Should exclude 'stock')");
31
- const results2 = await engine.search("financial forecasting -stock", { limit: 5 });
32
- results2.forEach((r, i) => {
33
- console.log(` ${i + 1}. ${r.name}`);
34
- if (r.name.toLowerCase().includes("stock") || r.description.toLowerCase().includes("stock")) {
35
- console.error(" ERROR: Negative keyword failed!");
36
- }
37
- });
38
- console.log("\nDone.");
39
- store.close();
40
- }
41
- main().catch(console.error);
@@ -1,50 +0,0 @@
1
- import { ImageAnalyzer } from "../quality/image-analyzer.js";
2
- import path from "path";
3
- import fs from "fs";
4
- import { execSync } from "child_process";
5
- async function runTest() {
6
- console.log("--- Testing Image Quality Analysis ---");
7
- const projectRoot = path.resolve(".");
8
- const analyzer = new ImageAnalyzer(projectRoot);
9
- // 1. Create a sample image using Python (to avoid external dependencies)
10
- const testImageDir = path.join(projectRoot, "data", "test-images");
11
- if (!fs.existsSync(testImageDir))
12
- fs.mkdirSync(testImageDir, { recursive: true });
13
- const imagePath = path.join(testImageDir, "test_v1.png");
14
- console.log("Generating test image...");
15
- const pythonScript = `
16
- from PIL import Image, ImageDraw
17
- import numpy as np
18
- img = Image.new('RGB', (800, 600), color = (73, 109, 137))
19
- d = ImageDraw.Draw(img)
20
- d.text((10,10), "Vesper Test Image", fill=(255,255,0))
21
- # Add some noise for blur check
22
- noise = np.random.randint(0, 255, (600, 800, 3), dtype='uint8')
23
- img_np = np.array(img)
24
- img_np = (img_np * 0.5 + noise * 0.5).astype('uint8')
25
- Image.fromarray(img_np).save('${imagePath.replace(/\\/g, "\\\\")}')
26
- `;
27
- fs.writeFileSync(path.join(testImageDir, "gen_image.py"), pythonScript);
28
- execSync(`python "${path.join(testImageDir, "gen_image.py")}"`);
29
- // 2. Run Analysis
30
- console.log(`Analyzing ${imagePath}...`);
31
- try {
32
- const report = await analyzer.analyze(imagePath);
33
- console.log("Analysis Result:");
34
- console.log(`- Format: ${report.individual_results[0].format}`);
35
- console.log(`- Resolution: ${report.individual_results[0].width}x${report.individual_results[0].height}`);
36
- console.log(`- Blur Score: ${report.individual_results[0].blur_score}`);
37
- console.log(`- Is Blurry: ${report.individual_results[0].is_blurry}`);
38
- if (report.total_images === 1 && report.average_width === 800) {
39
- console.log("\nVERIFICATION_STATUS: PASS");
40
- }
41
- else {
42
- console.log("\nVERIFICATION_STATUS: FAIL - Incorrect stats");
43
- }
44
- }
45
- catch (e) {
46
- console.error(`Analysis failed: ${e.message}`);
47
- console.log("\nVERIFICATION_STATUS: FAIL");
48
- }
49
- }
50
- runTest().catch(console.error);
@@ -1,39 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { DataIngestor } from "../ingestion/ingestor.js";
3
- import path from "path";
4
- import fs from "fs";
5
- import { fileURLToPath } from "url";
6
- const __filename = fileURLToPath(import.meta.url);
7
- const __dirname = path.dirname(__filename);
8
- const projectRoot = path.join(__dirname, "..", "..");
9
- const dbPath = path.join(projectRoot, "data", "test-metadata.db");
10
- // Clean test DB if exists
11
- if (fs.existsSync(dbPath))
12
- fs.unlinkSync(dbPath);
13
- const store = new MetadataStore(dbPath);
14
- const ingestor = new DataIngestor(projectRoot, store);
15
- async function testInfra() {
16
- console.log("--- Testing Ingestion Infrastructure (6.1) ---");
17
- const testId = "test/dataset";
18
- // 1. Get target path
19
- const target = ingestor.getTargetPath(testId);
20
- console.log(`Target path: ${target}`);
21
- // 2. Register downloading
22
- console.log("Registering download start...");
23
- store.registerDownload(testId, target, 'downloading');
24
- let status = store.getDownloadStatus(testId);
25
- console.log("Status after start:", status?.status);
26
- // 3. Register completion
27
- console.log("Registering completion...");
28
- ingestor.completeDownload(testId, target, 1024);
29
- status = store.getDownloadStatus(testId);
30
- console.log("Status after completion:", status?.status);
31
- console.log("Saved path:", status?.local_path);
32
- console.log("Size:", status?.size_bytes, "bytes");
33
- // 4. Verify directory exists
34
- const rawDir = path.join(projectRoot, "data", "raw");
35
- console.log(`Raw data dir created: ${fs.existsSync(rawDir)}`);
36
- console.log("\nInfrastructure test PASSED!");
37
- store.close();
38
- }
39
- testInfra().catch(console.error);
@@ -1,40 +0,0 @@
1
- import { InstallService } from "../install/install-service.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import path from "path";
4
- import fs from "fs";
5
- async function runTest() {
6
- console.log("--- Testing Automatic Installation ---");
7
- const projectRoot = path.resolve(".");
8
- const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
9
- const installService = new InstallService(projectRoot, metadataStore);
10
- const query = "naruto";
11
- const dataset = metadataStore.db.prepare("SELECT * FROM datasets WHERE name LIKE ? LIMIT 1").get(`%${query}%`);
12
- if (!dataset) {
13
- console.error("Naruto dataset not found in metadata.db. Please run a search first.");
14
- return;
15
- }
16
- console.log(`Found dataset: ${dataset.name}`);
17
- // Mock a prepared file
18
- const mockFile = path.join(projectRoot, "data", "raw", "naruto_mock.csv");
19
- if (!fs.existsSync(path.dirname(mockFile))) {
20
- fs.mkdirSync(path.dirname(mockFile), { recursive: true });
21
- }
22
- fs.writeFileSync(mockFile, "quote,character\nBelieve it!,Naruto\nI will be Hokage,Naruto");
23
- console.log(`Installing ${mockFile}...`);
24
- const installPath = await installService.install(dataset.id, mockFile);
25
- console.log(`✅ Success! Installed to: ${installPath}`);
26
- // Verify it exists
27
- if (fs.existsSync(installPath)) {
28
- console.log("File exists at install location.");
29
- }
30
- else {
31
- console.error("File MISSING from install location!");
32
- }
33
- // Verify metadata updated
34
- const updated = metadataStore.getDataset(dataset.id);
35
- const success = updated?.install_path === installPath;
36
- console.log(`Updated install_path in metadata: ${updated?.install_path}`);
37
- console.log(`VERIFICATION_STATUS: ${success ? "✅ PASS" : "❌ FAIL"}`);
38
- console.log("\n--- Test Complete ---");
39
- }
40
- runTest().catch(console.error);
@@ -1,26 +0,0 @@
1
- import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
2
- async function runTest() {
3
- console.log("--- Testing Institutional Data Sources ---");
4
- const wb = new WorldBankScraper();
5
- const nasa = new NASAScraper();
6
- console.log('Searching World Bank for: "climate"...');
7
- const wbResults = await wb.scrape("climate", 3);
8
- console.log(`Found ${wbResults.length} World Bank results.`);
9
- if (wbResults.length > 0) {
10
- console.log("✅ World Bank Sample:");
11
- console.log(` - ID: ${wbResults[0].id}`);
12
- console.log(` - Name: ${wbResults[0].name}`);
13
- console.log(` - URL: ${wbResults[0].metadata_url}`);
14
- }
15
- console.log('\nSearching NASA for: "astronomy"...');
16
- const nasaResults = await nasa.scrape("astronomy", 3);
17
- console.log(`Found ${nasaResults.length} NASA results.`);
18
- if (nasaResults.length > 0) {
19
- console.log("✅ NASA Sample:");
20
- console.log(` - ID: ${nasaResults[0].id}`);
21
- console.log(` - Name: ${nasaResults[0].name}`);
22
- console.log(` - URL: ${nasaResults[0].metadata_url}`);
23
- }
24
- console.log("\n--- Test Complete ---");
25
- }
26
- runTest().catch(console.error);
@@ -1,41 +0,0 @@
1
- import { QualityAnalyzer } from "../quality/analyzer.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Integrity Check Test ===\n");
6
- const analyzer = new QualityAnalyzer();
7
- const testFile = path.join(process.cwd(), "test_integrity.csv");
8
- // Create a CSV with unique IDs but duplicate CONTENT
9
- // This simulates a common data issue: exact same review scraped twice with different IDs
10
- const csvContent = `id,review,label
11
- 1,"This product involves huge risks.",negative
12
- 2,"Great investment opportunity!",positive
13
- 3,"This product involves huge risks.",negative
14
- 4,"Wait and see.",neutral
15
- 5,"Great investment opportunity!",positive`;
16
- fs.writeFileSync(testFile, csvContent);
17
- console.log(`Created test file: ${testFile}`);
18
- console.log("Expectations:");
19
- console.log(" - Duplicate Rows: 0 (because IDs differ)");
20
- console.log(" - Text Duplicates: > 0 (because 'review' column has dupes)");
21
- try {
22
- console.log("\nRunning analyzer...");
23
- const report = await analyzer.analyze(testFile);
24
- console.log("\n=== Integrity Report ===");
25
- console.log(`Duplicate Rows (Exact): ${report.duplicate_rows}`);
26
- console.log(`Text Duplicates (Content): ${report.text_duplicates || 0}`);
27
- console.log("\nWarnings:", report.warnings);
28
- if (report.text_duplicates && report.text_duplicates > 0) {
29
- console.log("Test passed! Detected text duplication despite unique IDs.");
30
- }
31
- else {
32
- console.error("Test failed! Did not detect text duplicates.");
33
- }
34
- // Cleanup
35
- fs.unlinkSync(testFile);
36
- }
37
- catch (error) {
38
- console.error("\nTest failed:", error);
39
- }
40
- }
41
- main().catch(console.error);
@@ -1,42 +0,0 @@
1
- import { SearchEngine } from "../search/engine.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import { VectorStore } from "../search/vector-store.js";
4
- import { Embedder } from "../search/embedder.js";
5
- import path from "path";
6
- /**
7
- * Test JIT fallback with various queries
8
- */
9
- async function main() {
10
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
11
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
12
- const store = new MetadataStore(dbPath);
13
- const vectorStore = new VectorStore(vectorPath);
14
- const embedder = Embedder.getInstance();
15
- await embedder.init();
16
- const engine = new SearchEngine(store, vectorStore, embedder);
17
- console.log("\n=== JIT Fallback Test Suite ===\n");
18
- // Test 1: Query that should have good results (should NOT trigger JIT)
19
- console.log("Test 1: Common query (should NOT trigger JIT)");
20
- console.log("Query: 'image classification cats dogs'\n");
21
- const results1 = await engine.search("image classification cats dogs", { limit: 5 });
22
- console.log(`Results: ${results1.length}, Top score: ${results1[0]?.relevance_score || 0}`);
23
- console.log("---\n");
24
- // Test 2: Very niche query (should trigger JIT)
25
- console.log("Test 2: Niche query (SHOULD trigger JIT)");
26
- console.log("Query: 'underwater acoustic signal processing dolphins'\n");
27
- const results2 = await engine.search("underwater acoustic signal processing dolphins", { limit: 5 });
28
- console.log(`Results: ${results2.length}, Top score: ${results2[0]?.relevance_score || 0}`);
29
- console.log("---\n");
30
- // Test 3: Another niche query
31
- console.log("Test 3: Another niche query (SHOULD trigger JIT)");
32
- console.log("Query: 'mongolian language morphological analysis'\n");
33
- const results3 = await engine.search("mongolian language morphological analysis", { limit: 5 });
34
- console.log(`Results: ${results3.length}, Top score: ${results3[0]?.relevance_score || 0}`);
35
- console.log("---\n");
36
- // Check final dataset count
37
- const finalCount = store.getAllDatasets().length;
38
- console.log(`\nFinal dataset count: ${finalCount}`);
39
- console.log("(Compare with initial 1238 to see if JIT added new datasets)\n");
40
- store.close();
41
- }
42
- main().catch(console.error);
@@ -1,62 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { JobManager } from "../jobs/manager.js";
3
- async function runTest() {
4
- console.log("--- Initializing Job Queue Test ---");
5
- const store = new MetadataStore("data/vesper_test_jobs.db");
6
- const manager = JobManager.getInstance(store);
7
- manager.setConcurrency(2); // 2 parallel workers
8
- const jobsFinished = [];
9
- // Listener to simulate job processing
10
- manager.on("processJob", async (job, run) => {
11
- console.log(`[Worker] Starting Job: ${job.id} (Type: ${job.type}, Priority: ${job.priority})`);
12
- await run(async () => {
13
- // Simulate variable workload
14
- const duration = job.priority === 10 ? 500 : 2000;
15
- await new Promise(r => setTimeout(r, duration));
16
- // Simulate failure for a specific job to test retries
17
- if (job.metadata === "FAIL_ONCE" && job.attempts === 0) {
18
- console.log(`[Worker] Simulating failure for job ${job.id}`);
19
- throw new Error("Transitory error");
20
- }
21
- console.log(`[Worker] Finished Job: ${job.id}`);
22
- jobsFinished.push(job.id);
23
- });
24
- });
25
- console.log("\n--- Enqueuing Jobs ---");
26
- // 1. A slow low-priority job
27
- const j1 = manager.createJob("prepare", 0, "slow-1");
28
- // 2. A fast high-priority job (Pro user)
29
- const j2 = manager.createJob("clean", 10, "pro-1");
30
- // 3. Another low-priority job
31
- const j3 = manager.createJob("split", 0, "slow-2");
32
- // 4. A job that fails once
33
- const j4 = manager.createJob("fusion", 5, "FAIL_ONCE");
34
- console.log(`Enqueued 4 jobs. Concurrency is 2.`);
35
- // Wait for all to finish
36
- return new Promise((resolve) => {
37
- const check = setInterval(() => {
38
- const stats = manager.queue.getStats();
39
- if (stats.total === 0 && jobsFinished.includes(j4.id)) {
40
- clearInterval(check);
41
- console.log("\n--- Test Results ---");
42
- console.log("Execution Order:", jobsFinished);
43
- const proIndex = jobsFinished.indexOf(j2.id);
44
- const slowIndex = jobsFinished.indexOf(j3.id);
45
- if (proIndex < slowIndex) {
46
- console.log("✅ Priority verified: Pro job finished before later low-priority jobs.");
47
- }
48
- else {
49
- console.warn("⚠️ Priority check failed or inconclusive due to parallel timing.");
50
- }
51
- const j4_final = store.getJob(j4.id);
52
- if (j4_final?.attempts === 1) {
53
- console.log("✅ Retry logic verified: Job retried after failure.");
54
- }
55
- console.log("\n✅ Success: Job queue system verified.");
56
- store.close();
57
- resolve(null);
58
- }
59
- }, 1000);
60
- });
61
- }
62
- runTest().catch(console.error);