vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,100 +0,0 @@
1
- import { HuggingFaceScraper } from "../metadata/scraper.js";
2
- import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
3
- import { MetadataStore } from "../metadata/store.js";
4
- import path from "path";
5
- async function main() {
6
- const scraper = new HuggingFaceScraper();
7
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
8
- const store = new MetadataStore(dbPath);
9
- // Get limit from command line args or default to 100
10
- let limit = 100;
11
- if (process.argv[2] && !process.argv[2].startsWith("-")) {
12
- limit = parseInt(process.argv[2], 10);
13
- if (isNaN(limit))
14
- limit = 100;
15
- }
16
- // Check if MVP filters should be disabled (--no-mvp flag)
17
- const applyMVPFilters = !process.argv.includes("--no-mvp");
18
- // Get domain filter if specified (--domain medicine, --domain healthcare, etc.)
19
- const domainArgIndex = process.argv.indexOf("--domain");
20
- const domainFilter = domainArgIndex !== -1 && process.argv[domainArgIndex + 1]
21
- ? process.argv[domainArgIndex + 1]
22
- : undefined;
23
- try {
24
- const datasets = [];
25
- const domainMsg = domainFilter ? `, domain: ${domainFilter}` : "";
26
- // 1. HuggingFace
27
- const hfLimit = limit > 1000 ? limit : 1000; // Aim for at least 1k for "massive"
28
- console.error(`Scraping HF: ${hfLimit} datasets with MVP filters: ${applyMVPFilters}${domainMsg}`);
29
- const hfDatasets = await scraper.scrape(hfLimit, applyMVPFilters, domainFilter);
30
- datasets.push(...hfDatasets);
31
- // 2. Kaggle (Optional - only if credentials provided)
32
- const kaggleUser = process.env.KAGGLE_USERNAME;
33
- const kaggleKey = process.env.KAGGLE_KEY;
34
- if (kaggleUser && kaggleKey) {
35
- const kaggleLimit = Math.max(100, Math.floor(hfLimit / 4)); // Balance HF/Kaggle ratio
36
- console.error(`\nScraping Kaggle: searching for "${domainFilter || 'all'}" (limit: ${kaggleLimit})`);
37
- const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
38
- // For massive scraping, we might want to iterate through multiple common search terms if no domainFilter is set
39
- const searchTerms = domainFilter ? [domainFilter] : ["machine learning", "data science", "nlp", "computer vision", "healthcare"];
40
- const termLimit = Math.floor(kaggleLimit / searchTerms.length);
41
- for (const term of searchTerms) {
42
- console.error(`[Kaggle] Fetching "${term}"...`);
43
- const kaggleDatasets = await kaggleScraper.scrape(term, termLimit);
44
- kaggleDatasets.forEach(d => {
45
- d.id = `kaggle:${d.id}`;
46
- if (!datasets.some(existing => existing.id === d.id)) {
47
- datasets.push(d);
48
- }
49
- });
50
- }
51
- }
52
- else {
53
- console.error("\n[Kaggle] Skipping (KAGGLE_USERNAME/KAGGLE_KEY not set).");
54
- }
55
- console.error(`\nTotal Scraped: ${datasets.length} datasets.`);
56
- let saved = 0;
57
- store.beginTransaction(); // Performance optimization for large batch
58
- try {
59
- for (const ds of datasets) {
60
- store.saveDataset(ds);
61
- saved++;
62
- if (saved % 500 === 0)
63
- console.error(`[DB] Saved ${saved} records...`);
64
- }
65
- store.commit();
66
- }
67
- catch (e) {
68
- store.rollback();
69
- throw e;
70
- }
71
- console.error(`Successfully saved ${saved} datasets to MetadataStore.`);
72
- console.error(`\nSummary:`);
73
- console.error(` - Total examples: ${datasets.reduce((sum, d) => sum + d.total_examples, 0).toLocaleString()}`);
74
- console.error(` - Safe sources: ${datasets.filter(d => d.is_safe_source).length}`);
75
- console.error(` - Structured datasets: ${datasets.filter(d => d.is_structured).length}`);
76
- console.error(` - Permissive licenses: ${datasets.filter(d => d.license.category === "safe").length}`);
77
- // Show domain distribution
78
- const domainCounts = new Map();
79
- datasets.forEach(d => {
80
- const domain = d.domain || "unknown";
81
- domainCounts.set(domain, (domainCounts.get(domain) || 0) + 1);
82
- });
83
- if (domainCounts.size > 0) {
84
- console.error(` - Domains:`);
85
- Array.from(domainCounts.entries())
86
- .sort((a, b) => b[1] - a[1])
87
- .forEach(([domain, count]) => {
88
- console.error(` ${domain}: ${count}`);
89
- });
90
- }
91
- }
92
- catch (error) {
93
- console.error("Scraping failed:", error);
94
- process.exit(1);
95
- }
96
- finally {
97
- store.close();
98
- }
99
- }
100
- main();
@@ -1,26 +0,0 @@
1
- import path from "path";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import { VectorStore } from "../search/vector-store.js";
4
- import { Embedder } from "../search/embedder.js";
5
- import { SearchEngine } from "../search/engine.js";
6
- const query = process.argv[2];
7
- if (!query) {
8
- console.error("Usage: npx tsx src/scripts/search-cli.ts \"your query\"");
9
- process.exit(1);
10
- }
11
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
12
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
13
- const metadataStore = new MetadataStore(dbPath);
14
- const vectorStore = new VectorStore(vectorPath);
15
- const embedder = Embedder.getInstance();
16
- const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
17
- async function run() {
18
- console.error(`Searching for: "${query}"...`);
19
- const results = await searchEngine.search(query, { limit: 5, safeOnly: true });
20
- if (results.length === 0) {
21
- console.log("No results found.");
22
- return;
23
- }
24
- console.log(JSON.stringify(results, null, 2));
25
- }
26
- run().catch(console.error);
@@ -1,45 +0,0 @@
1
- import { QualityAnalyzer } from "../quality/analyzer.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Bias Analysis Test ===\n");
6
- const analyzer = new QualityAnalyzer();
7
- const testFile = path.join(process.cwd(), "test_bias.csv");
8
- // Create a CSV with severe class imbalance
9
- // 'label': 90% 'A', 10% 'B' -> Should trigger warning
10
- // 'gender': 50/50 -> No warning
11
- let csvContent = "id,gender,label\n";
12
- for (let i = 0; i < 95; i++)
13
- csvContent += `${i},M,A\n`;
14
- for (let i = 95; i < 100; i++)
15
- csvContent += `${i},F,B\n`;
16
- fs.writeFileSync(testFile, csvContent);
17
- console.log(`Created test file: ${testFile}`);
18
- console.log("Expectations:");
19
- console.log(" - Should warn about 'Severe imbalance' in 'label' column (95% A)");
20
- console.log(" - Should NOT warn about 'gender' (skewed but just a test sample)");
21
- try {
22
- console.log("\nRunning analyzer...");
23
- const report = await analyzer.analyze(testFile);
24
- console.log("\n=== Bias Report ===");
25
- if (report.class_imbalance_warnings && report.class_imbalance_warnings.length > 0) {
26
- report.class_imbalance_warnings.forEach(w => console.log(`[!] ${w}`));
27
- }
28
- else {
29
- console.log("No bias detected.");
30
- }
31
- const labelWarning = report.class_imbalance_warnings?.find(w => w.includes("'label'"));
32
- if (labelWarning) {
33
- console.log("\nTest passed! Detected imbalance.");
34
- }
35
- else {
36
- console.error("\nTest failed! Did not detect imbalance.");
37
- }
38
- // Cleanup
39
- fs.unlinkSync(testFile);
40
- }
41
- catch (error) {
42
- console.error("\nTest failed:", error);
43
- }
44
- }
45
- main().catch(console.error);
@@ -1,51 +0,0 @@
1
- import { MockRedisProvider, CacheService } from "../cache/service.js";
2
- import { CDNService } from "../cache/cdn.js";
3
- import { CleaningPlanner } from "../cleaning/planner.js";
4
- async function runTest() {
5
- console.log("--- Initializing Caching Layer Test ---");
6
- const cacheProvider = new MockRedisProvider();
7
- const cache = new CacheService(cacheProvider);
8
- const planner = new CleaningPlanner(cache);
9
- const cdn = new CDNService("data/cdn_mock", "https://cdn.vesper.ai");
10
- const dsId = "test/cache-dataset";
11
- const mockReport = {
12
- row_count: 1000,
13
- column_count: 2,
14
- duplicate_rows: 5,
15
- duplicate_percentage: 0.5,
16
- columns: [
17
- { name: "col1", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false }
18
- ],
19
- warnings: [],
20
- schema_warnings: [],
21
- overall_score: 80
22
- };
23
- // 1. Test Planning Cache
24
- console.log("\n--- Testing Cleaning Plan Cache ---");
25
- console.log("First Run (Cache Miss)...");
26
- const start1 = Date.now();
27
- await planner.generatePlan(dsId, mockReport);
28
- console.log(`First run took ${Date.now() - start1}ms`);
29
- console.log("Second Run (Cache Hit)...");
30
- const start2 = Date.now();
31
- await planner.generatePlan(dsId, mockReport);
32
- console.log(`Second run took ${Date.now() - start2}ms (Expected to be near 0ms)`);
33
- // 2. Test TTL / Hash mismatch
34
- console.log("\n--- Testing Cache Sensitivity (Config Change) ---");
35
- console.log("Running with a ruleSet (New Cache Key)...");
36
- const start3 = Date.now();
37
- await planner.generatePlan(dsId, mockReport, { id: "rules-1", name: "Custom", rules: [] });
38
- console.log(`Config change run took ${Date.now() - start3}ms (Cache Miss)`);
39
- // 3. Test CDN Upload
40
- console.log("\n--- Testing CDN Mock ---");
41
- const reportContent = JSON.stringify(mockReport, null, 2);
42
- const url = await cdn.upload("report_latest.json", reportContent);
43
- console.log(`Report uploaded to CDN: ${url}`);
44
- if (url.startsWith("https://cdn.vesper.ai")) {
45
- console.log("\n✅ Success: Caching and CDN layers verified.");
46
- }
47
- else {
48
- console.error("\n❌ Failure: CDN URL generation mismatch.");
49
- }
50
- }
51
- runTest().catch(console.error);
@@ -1,76 +0,0 @@
1
- import { DataCleaner } from "../cleaning/cleaner.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Auto-Cleaning Pipeline Test ===\n");
6
- const cleaner = new DataCleaner();
7
- const testFile = path.join(process.cwd(), "test_cleaning.csv");
8
- // Create Dirty Data
9
- // - id: Duplicate rows (1)
10
- // - age: Missing values, Outlier (200), Wrong Type ("thirty")
11
- // - score: Good float
12
- // - unnecessary: Column to drop
13
- const csvContent = `id,age,score,unnecessary
14
- 1,25,88.5,trash
15
- 2,,92.0,trash
16
- 3,200,15.0,trash
17
- 1,25,88.5,trash
18
- 4,"30",80.0,trash`;
19
- fs.writeFileSync(testFile, csvContent);
20
- console.log(`Created dirty file: ${testFile}`);
21
- // Define Cleaning Plan
22
- const ops = [
23
- {
24
- type: "DropColumns",
25
- params: { columns: ["unnecessary"] },
26
- reason: "Not useful"
27
- },
28
- {
29
- type: "RemoveDuplicates",
30
- params: {},
31
- reason: "Duplicate rows"
32
- },
33
- {
34
- type: "FillMissing",
35
- params: { column: "age", method: "constant", value: 0 },
36
- reason: "Impute missing age"
37
- },
38
- // Note: Polars can't easily fix "thirty" to 30 automatically without specific logic,
39
- // so we just cast what we can. "30" string -> 30 int works.
40
- {
41
- type: "FixTypes",
42
- params: { column: "age", type: "float" },
43
- reason: "Convert age to number"
44
- },
45
- {
46
- type: "RemoveOutliers",
47
- params: { column: "age", method: "iqr", threshold: 1.5 },
48
- reason: "Remove age > 100"
49
- }
50
- ];
51
- try {
52
- console.log("Executing cleaning plan...");
53
- const result = await cleaner.clean(testFile, ops);
54
- console.log("\n=== Cleaning Result ===");
55
- console.log(`Success: ${result.success}`);
56
- console.log(`Output: ${result.output_path || "None"}`);
57
- console.log(`Rows Affected: ${result.rows_affected}`);
58
- console.log("\nLogs:");
59
- result.logs.forEach(l => console.log(` - ${l}`));
60
- // Validate Output File Exists
61
- if (fs.existsSync(result.output_path)) {
62
- console.log("\nConverted file created successfully.");
63
- // Cleanup
64
- fs.unlinkSync(testFile);
65
- fs.unlinkSync(result.output_path);
66
- console.log("Test passed!");
67
- }
68
- else {
69
- console.error("\nTest failed! Output file missing.");
70
- }
71
- }
72
- catch (error) {
73
- console.error("\nTest failed:", error);
74
- }
75
- }
76
- main().catch(console.error);
@@ -1,48 +0,0 @@
1
- import { StorageManager } from "../cloud/storage-manager.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Cloud Storage Test ===\n");
6
- const vaultDir = path.join(process.cwd(), "test_vault");
7
- const adapter = StorageManager.createAdapter({
8
- type: "local",
9
- options: { basePath: vaultDir }
10
- });
11
- const testFile = "test_upload.txt";
12
- fs.writeFileSync(testFile, "Hello Cloud Storage!");
13
- try {
14
- console.log("Testing upload...");
15
- const remotePath = "datasets/v1/test.txt";
16
- const url = await adapter.upload(testFile, remotePath);
17
- console.log("Upload URL:", url);
18
- const expectedFile = path.join(vaultDir, remotePath);
19
- if (fs.existsSync(expectedFile)) {
20
- console.log("PASS: File successfully uploaded to local vault.");
21
- const content = fs.readFileSync(expectedFile, "utf8");
22
- if (content === "Hello Cloud Storage!") {
23
- console.log("PASS: Content integrity verified.");
24
- }
25
- }
26
- else {
27
- console.error("FAIL: Uploaded file not found in vault.");
28
- }
29
- console.log("\nTesting signed URL...");
30
- const signedUrl = await adapter.getSignedUrl(remotePath);
31
- console.log("Signed URL:", signedUrl);
32
- console.log("\nTesting deletion...");
33
- await adapter.delete(remotePath);
34
- if (!fs.existsSync(expectedFile)) {
35
- console.log("PASS: File successfully deleted from vault.");
36
- }
37
- }
38
- catch (e) {
39
- console.error("FAIL: Cloud storage test failed:", e);
40
- }
41
- finally {
42
- if (fs.existsSync(testFile))
43
- fs.unlinkSync(testFile);
44
- if (fs.existsSync(vaultDir))
45
- fs.rmSync(vaultDir, { recursive: true, force: true });
46
- }
47
- }
48
- main().catch(console.error);
@@ -1,58 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { ComplianceStore } from "../compliance/store.js";
3
- import { ComplianceService } from "../compliance/service.js";
4
- import path from "path";
5
- import fs from "fs";
6
- async function runTest() {
7
- const dbPath = path.resolve("data", "test-compliance.db");
8
- const metadataStore = new MetadataStore(dbPath);
9
- const complianceStore = new ComplianceStore(metadataStore.db);
10
- const service = new ComplianceService(complianceStore);
11
- const dsId = "org/health-data";
12
- const medDataset = {
13
- id: dsId,
14
- source: "huggingface",
15
- name: "Patient Records (Mock)",
16
- domain: "healthcare",
17
- has_personal_data: true,
18
- quality_warnings: ["Potential PII detected in column 'patient_name'"],
19
- last_updated: new Date().toISOString(),
20
- license: { id: "proprietary", category: "restricted", usage_restrictions: [], warnings: [] },
21
- // ... rest
22
- };
23
- console.log("--- Initial Compliance Check (Expecting Failure) ---");
24
- const gdpr1 = await service.verifyGDPR(medDataset);
25
- console.log(`GDPR Passed: ${gdpr1.passed}`);
26
- console.log("Issues:", JSON.stringify(gdpr1.issues, null, 2));
27
- const hipaa1 = await service.verifyHIPAA(medDataset);
28
- console.log(`HIPAA Passed: ${hipaa1.passed}`);
29
- console.log("Issues:", JSON.stringify(hipaa1.issues, null, 2));
30
- console.log("\n--- Logging Operations (Audit Trail) ---");
31
- service.logOperation("admin-123", dsId, "ComplianceCheck", { result: "Failed" });
32
- service.logOperation("admin-123", dsId, "Clean", { rules: "PII-Masking" });
33
- console.log("\n--- Resolving Compliance Issues ---");
34
- complianceStore.saveConsent({
35
- dataset_id: dsId,
36
- consent_obtained: true,
37
- source: "patient-portal",
38
- last_verified: new Date().toISOString()
39
- });
40
- // Simulate de-identification
41
- const cleanDataset = { ...medDataset, has_personal_data: false, quality_warnings: [] };
42
- const gdpr2 = await service.verifyGDPR(cleanDataset);
43
- console.log(`GDPR Passed (After Fix): ${gdpr2.passed}`);
44
- const hipaa2 = await service.verifyHIPAA(cleanDataset);
45
- console.log(`HIPAA Passed (After Fix): ${hipaa2.passed}`);
46
- console.log("\n--- Exporting Audit Log ---");
47
- const csv = service.exportAuditLog(dsId);
48
- fs.writeFileSync("compliance-audit.csv", csv);
49
- console.log("Audit log saved to compliance-audit.csv");
50
- console.log("Log Snippet:\n", csv.split("\n").slice(0, 3).join("\n"));
51
- if (gdpr2.passed && hipaa2.passed && csv.includes("Clean")) {
52
- console.log("\n✅ Success: Compliance checks and audit trail verified.");
53
- }
54
- else {
55
- console.error("\n❌ Failure: Compliance workflow incomplete.");
56
- }
57
- }
58
- runTest().catch(console.error);
@@ -1,64 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { PipelineExecutor } from "../cleaning/executor.js";
3
- import { InstallService } from "../install/install-service.js";
4
- import path from "path";
5
- import fs from "fs";
6
- async function runTest() {
7
- console.log("--- Testing Format Conversion Export ---");
8
- const projectRoot = path.resolve(".");
9
- const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
10
- const pipelineExecutor = new PipelineExecutor(projectRoot);
11
- const installService = new InstallService(projectRoot, metadataStore);
12
- // 1. Create a mock JSON dataset
13
- const datasetId = "test/json-dataset";
14
- const rawDir = path.join(projectRoot, "data", "raw");
15
- if (!fs.existsSync(rawDir))
16
- fs.mkdirSync(rawDir, { recursive: true });
17
- const jsonPath = path.join(rawDir, "test_data.json");
18
- const testData = [
19
- { id: 1, name: "Alice", value: 10.5 },
20
- { id: 2, name: "Bob", value: 20.1 }
21
- ];
22
- fs.writeFileSync(jsonPath, JSON.stringify(testData));
23
- // Register in local_files
24
- metadataStore.registerDownload(datasetId, jsonPath, "completed", fs.statSync(jsonPath).size);
25
- // Register in datasets to satisfy inner check
26
- metadataStore.saveDataset({
27
- id: datasetId,
28
- source: "huggingface",
29
- name: "Test JSON",
30
- description: "A test JSON file",
31
- license: { category: "safe", id: "mit", usage_restrictions: [], warnings: [] },
32
- last_updated: new Date().toISOString(),
33
- quality_score: 80,
34
- download_url: "http://example.com",
35
- quality_warnings: []
36
- });
37
- console.log("Mock dataset registered.");
38
- // 2. Simulate export_dataset with format="csv"
39
- console.log(`Converting ${jsonPath} to CSV...`);
40
- const downloadStatus = metadataStore.getDownloadStatus(datasetId);
41
- if (!downloadStatus)
42
- throw new Error("Dataset not found in DB");
43
- let sourcePath = downloadStatus.local_path;
44
- const requestedFormat = "csv";
45
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
46
- if (currentExt !== requestedFormat) {
47
- console.log(`Mismatch detected. Running pipeline conversion...`);
48
- const result = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
49
- sourcePath = result.final_output_path;
50
- }
51
- // 3. Install
52
- const targetDir = path.join(projectRoot, "test-conversion-export");
53
- const finalPath = await installService.install(datasetId, sourcePath, targetDir);
54
- console.log(`✅ Final Export Path: ${finalPath}`);
55
- if (finalPath.endsWith(".csv") && fs.existsSync(finalPath)) {
56
- const content = fs.readFileSync(finalPath, "utf8");
57
- console.log("CSV Content Preview:\n" + content);
58
- console.log("\nVERIFICATION_STATUS: ✅ PASS");
59
- }
60
- else {
61
- console.error("\nVERIFICATION_STATUS: ❌ FAIL - Result is not a CSV or file missing");
62
- }
63
- }
64
- runTest().catch(console.error);
@@ -1,58 +0,0 @@
1
- import { CleaningPlanner } from "../cleaning/planner.js";
2
- import { NLP_PRESET, HEALTHCARE_PRESET } from "../cleaning/rules.js";
3
- import { RuleEvaluator } from "../cleaning/evaluator.js";
4
- async function runTest() {
5
- const planner = new CleaningPlanner();
6
- const evaluator = new RuleEvaluator();
7
- const mockReport = {
8
- row_count: 1000,
9
- column_count: 3,
10
- duplicate_rows: 0,
11
- duplicate_percentage: 0,
12
- columns: [
13
- { name: "text", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false },
14
- { name: "email", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false },
15
- { name: "date", type: "Utf8", inferred_type: "Date", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false }
16
- ],
17
- warnings: [],
18
- schema_warnings: [],
19
- overall_score: 90
20
- };
21
- console.log("--- Testing NLP Preset ---");
22
- const nlpPlan = await planner.generatePlan("nlp-ds", mockReport, NLP_PRESET);
23
- console.log(`Plan generated with ${nlpPlan.operations.length} operations.`);
24
- nlpPlan.operations.forEach((op) => console.log(`- [${op.type}] ${op.reason}`));
25
- console.log("\n--- Testing Healthcare Preset ---");
26
- const hcPlan = await planner.generatePlan("hc-ds", mockReport, HEALTHCARE_PRESET);
27
- console.log(`Plan generated with ${hcPlan.operations.length} operations.`);
28
- hcPlan.operations.forEach((op) => console.log(`- [${op.type}] ${op.reason}`));
29
- console.log("\n--- Testing RuleEvaluator Logic ---");
30
- const record = {
31
- text: "Check out https://google.com",
32
- email: "user@example.com",
33
- date: "2023-01-01"
34
- };
35
- console.log("Original Record:", JSON.stringify(record));
36
- let processed = { ...record };
37
- // Apply NLP Rules
38
- for (const rule of NLP_PRESET.rules) {
39
- if (evaluator.matches(processed, rule.condition)) {
40
- processed = evaluator.apply(processed, rule);
41
- }
42
- }
43
- // Apply Healthcare Rules
44
- for (const rule of HEALTHCARE_PRESET.rules) {
45
- if (evaluator.matches(processed, rule.condition)) {
46
- processed = evaluator.apply(processed, rule);
47
- }
48
- }
49
- console.log("Processed Record:", JSON.stringify(processed));
50
- if (processed.text === "check out " && processed.email.includes("...")) {
51
- console.log("\n✅ Success: Rules correctly applied and record transformed.");
52
- }
53
- else {
54
- console.error("\n❌ Failure: Record transformation mismatch.");
55
- console.log("Expected text to be lowercased and URL removed, and email to be masked.");
56
- }
57
- }
58
- runTest().catch(console.error);
@@ -1,63 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- async function runTest() {
3
- console.log("--- Initializing Database Optimization Test ---");
4
- const store = new MetadataStore("data/vesper_test_opt.db");
5
- // 1. Check if jobs_archive table exists
6
- console.log("Verifying tables and indexes...");
7
- // 2. Insert mock jobs (some old, some new)
8
- const now = new Date();
9
- const oldDate = new Date();
10
- oldDate.setDate(now.getDate() - 40); // 40 days ago
11
- const jobs = [
12
- {
13
- id: "job-new-1",
14
- type: "clean",
15
- status: "completed",
16
- priority: 0,
17
- progress: 100,
18
- status_text: "Done",
19
- attempts: 1,
20
- max_attempts: 3,
21
- created_at: now.toISOString(),
22
- updated_at: now.toISOString()
23
- },
24
- {
25
- id: "job-old-1",
26
- type: "prepare",
27
- status: "completed",
28
- priority: 0,
29
- progress: 100,
30
- status_text: "Archivable",
31
- attempts: 1,
32
- max_attempts: 3,
33
- created_at: oldDate.toISOString(),
34
- updated_at: oldDate.toISOString()
35
- }
36
- ];
37
- console.log("Inserting mock jobs...");
38
- jobs.forEach(j => store.saveJob(j));
39
- // 3. Test Archiving
40
- console.log("\n--- Testing Archiving Logic (cutoff 30 days) ---");
41
- const archivedCount = store.archiveOldJobs(30);
42
- console.log(`Archived ${archivedCount} jobs (indicator).`);
43
- // Verify
44
- const newJob = store.getJob("job-new-1");
45
- const oldJob = store.getJob("job-old-1");
46
- if (newJob)
47
- console.log("✅ New job remains in active table.");
48
- if (!oldJob)
49
- console.log("✅ Old job removed from active table.");
50
- // Manually check archive via query if possible (not exposed, but we can check if it failed)
51
- // 4. Test Optimization
52
- console.log("\n--- Testing Maintenance Methods ---");
53
- try {
54
- store.optimize();
55
- console.log("✅ Vacuum and Analyze completed.");
56
- }
57
- catch (e) {
58
- console.error("❌ Optimization failed:", e);
59
- }
60
- console.log("\n✅ Success: Database optimizations verified.");
61
- store.close();
62
- }
63
- runTest().catch(console.error);
@@ -1,33 +0,0 @@
1
- import { InstallService } from "../install/install-service.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import path from "path";
4
- import fs from "fs";
5
- async function runTest() {
6
- console.log("--- Testing Custom Export Path ---");
7
- const projectRoot = path.resolve(".");
8
- const metadataStore = new MetadataStore(path.join(projectRoot, "data", "metadata.db"));
9
- const installService = new InstallService(projectRoot, metadataStore);
10
- // Find a naruto dataset
11
- const dataset = metadataStore.db.prepare("SELECT * FROM datasets WHERE name LIKE '%naruto%' LIMIT 1").get();
12
- if (!dataset) {
13
- console.error("Naruto dataset not found. Please run a search first.");
14
- return;
15
- }
16
- const customDir = path.join(projectRoot, "naruto-quotes");
17
- const mockFile = path.join(projectRoot, "data", "raw", "naruto_test_export.csv");
18
- if (!fs.existsSync(path.dirname(mockFile))) {
19
- fs.mkdirSync(path.dirname(mockFile), { recursive: true });
20
- }
21
- fs.writeFileSync(mockFile, "quote,character\nBelieve it!,Naruto");
22
- console.log(`Exporting ${dataset.id} to ${customDir}...`);
23
- const finalPath = await installService.install(dataset.id, mockFile, customDir);
24
- console.log(`✅ Success! Exported to: ${finalPath}`);
25
- if (fs.existsSync(finalPath) && finalPath.includes("naruto-quotes")) {
26
- console.log("Path verification: PASSED");
27
- }
28
- else {
29
- console.error("Path verification: FAILED");
30
- }
31
- console.log("\n--- Test Complete ---");
32
- }
33
- runTest().catch(console.error);