vesper-wizard 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +1 -1
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,182 +0,0 @@
1
- import { categorizeLicense } from "./license.js";
2
- import { calculateQualityScore } from "./quality.js";
3
- import { classifyDomain } from "./domain.js";
4
- import { rateLimitedFetch, delayBetweenRequests } from "./rate-limiter.js";
5
/**
 * Searches the Kaggle dataset list endpoint and converts each hit into the
 * metadata record shape built by {@link KaggleMetadataScraper#transform}.
 */
export class KaggleMetadataScraper {
    username;
    key;
    /**
     * @param {string} username - Kaggle user name, sent as the Basic-auth user.
     * @param {string} key - Kaggle API key, sent as the Basic-auth password.
     */
    constructor(username, key) {
        this.username = username;
        this.key = key;
    }
    /**
     * Fetches up to `limit` datasets matching `query`.
     *
     * Errors are logged and never thrown: whatever was collected before the
     * failure is returned (possibly an empty array).
     *
     * @param {string} query - Free-text search string.
     * @param {number} [limit=20] - Maximum number of records to return.
     * @param {boolean} [usePagination=true] - When false only the first page is requested.
     * @returns {Promise<object[]>} At most `limit` transformed dataset records.
     */
    async scrape(query, limit = 20, usePagination = true) {
        console.error(`[Kaggle] Searching for "${query}" (limit: ${limit}, pagination: ${usePagination})...`);
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const results = [];
        const MAX_PAGE_SIZE = 100; // Kaggle API max page size
        const pageSize = Math.min(limit, MAX_PAGE_SIZE);
        let page = 1;
        let totalFetched = 0;
        let hasMore = true;
        try {
            while (hasMore && totalFetched < limit) {
                const url = `https://www.kaggle.com/api/v1/datasets/list?search=${encodeURIComponent(query)}&page_size=${pageSize}&page=${page}`;
                console.error(`[Kaggle] Fetching page ${page} (${totalFetched}/${limit} datasets so far)...`);
                // Use rate-limited fetch with retry logic
                const response = await rateLimitedFetch(url, {
                    headers: {
                        'Authorization': `Basic ${auth}`,
                        'Content-Type': 'application/json'
                    }
                }, {
                    maxRetries: 3,
                    initialDelay: 2000, // Start with 2 seconds
                    maxDelay: 30000 // Max 30 seconds
                });
                const datasets = await response.json();
                if (!Array.isArray(datasets)) {
                    // Anything other than a list cannot be iterated as datasets;
                    // say so instead of silently reporting zero results.
                    console.error("[Kaggle] Unexpected response payload (expected an array of datasets); stopping");
                    break;
                }
                if (datasets.length === 0) {
                    break;
                }
                // Stop as soon as `limit` records are collected so that a full
                // page never pushes the result past the requested limit.
                for (let i = 0; i < datasets.length && totalFetched < limit; i++) {
                    const ds = datasets[i];
                    try {
                        const metadata = this.transform(ds);
                        results.push(metadata);
                        totalFetched++;
                        console.error(`[Kaggle] Added: ${ds.ref} (${ds.downloadCount} downloads)`);
                        // Add small delay every 5 datasets (skipped when nothing is left to process)
                        if ((i + 1) % 5 === 0 && i < datasets.length - 1 && totalFetched < limit) {
                            await delayBetweenRequests(500);
                        }
                    }
                    catch (e) {
                        console.error(`[Kaggle] ERROR: Failed to transform ${ds.ref}:`, e);
                    }
                }
                // A short page means the server has no further results.
                if (usePagination && datasets.length === pageSize && totalFetched < limit) {
                    page++;
                    // Add delay between pages to avoid rate limits
                    await delayBetweenRequests(1000);
                }
                else {
                    hasMore = false;
                }
            }
            console.error(`[Kaggle] Completed: ${results.length} datasets found for "${query}"`);
            return results;
        }
        catch (e) {
            // Handle rate limit errors specifically
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[Kaggle] Rate limit error:", e.message);
                console.error("Consider adding delays between requests or reducing batch size");
            }
            else {
                console.error("[Kaggle] Scrape error:", e.message || e);
            }
            // Return partial results if we got some before the error
            if (results.length > 0) {
                console.error(`[Kaggle] Returning ${results.length} partial results before error`);
            }
            return results;
        }
    }
    /**
     * Maps one raw list-API entry onto the metadata record used by the rest of
     * the pipeline. Fields the list API does not provide are filled with fixed
     * placeholders (no splits, no columns, zero examples).
     *
     * @param {object} ds - Raw dataset entry; reads `ref`, `title`, `description`,
     *   `tags`, `licenseName`, `size`, `lastUpdated`, `downloadCount`, `voteCount`.
     * @returns {object} Normalised metadata record.
     */
    transform(ds) {
        const repoId = ds.ref;
        const tags = ds.tags?.map(t => t.name) || [];
        const description = ds.description || "";
        const license = categorizeLicense(ds.licenseName);
        const warnings = [];
        // Kaggle doesn't give us splits in the list API easily
        const sizeBytes = this.parseSize(ds.size);
        const splits = [
            {
                name: "data",
                num_examples: 0,
                size_bytes: sizeBytes
            }
        ];
        const totalSizeMB = sizeBytes ? Math.round(sizeBytes / (1024 * 1024) * 100) / 100 : 0;
        // Populate warnings
        if (description.length < 100)
            warnings.push("Short description; results may be less relevant");
        const lastUpdatedDate = new Date(ds.lastUpdated);
        const fourYearsAgo = new Date();
        fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
        // An unparseable date compares false here, so it never yields a stale warning.
        if (lastUpdatedDate < fourYearsAgo) {
            warnings.push(`Stale data: Last updated ${lastUpdatedDate.getFullYear()}`);
        }
        warnings.push("No specific data splits identified (Kaggle API limitation)");
        // Classify domain
        const task = this.extractTask(tags);
        const domain = classifyDomain(description, tags, repoId, task);
        return {
            id: repoId,
            source: "kaggle",
            name: ds.title,
            description: description,
            quality_warnings: warnings,
            downloads: ds.downloadCount,
            likes: ds.voteCount,
            stars: 0,
            tags: tags,
            last_updated: ds.lastUpdated,
            task: task,
            domain: domain,
            languages: [],
            splits,
            license,
            quality_score: calculateQualityScore({
                downloads: ds.downloadCount,
                likes: ds.voteCount,
                hasDescription: description.length > 50,
                descriptionLength: description.length,
                hasTrainSplit: false,
                hasTestSplit: false,
                lastUpdated: ds.lastUpdated,
                licenseCategory: license.category
            }),
            download_url: `https://www.kaggle.com/datasets/${ds.ref}`,
            format: undefined,
            total_examples: 0,
            total_size_bytes: sizeBytes,
            total_size_mb: totalSizeMB,
            columns: [],
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: true
        };
    }
    /**
     * Converts a human-readable size into bytes.
     *
     * Accepts strings such as "1.5 KB", "20MB" or "512 B" (case-insensitive,
     * binary multiples) and plain non-negative numbers, which are taken to be
     * byte counts already. Anything else yields 0 rather than throwing, so a
     * single odd value cannot discard the whole dataset record.
     *
     * @param {string|number|null|undefined} sizeStr - Size as reported upstream.
     * @returns {number} Size in bytes, or 0 when it cannot be determined.
     */
    parseSize(sizeStr) {
        if (typeof sizeStr === "number") {
            return Number.isFinite(sizeStr) && sizeStr > 0 ? sizeStr : 0;
        }
        if (typeof sizeStr !== "string") {
            return 0;
        }
        const match = sizeStr.trim().match(/^(\d+(?:\.\d+)?)\s*([KMGT]?B)$/i);
        if (!match) {
            return 0;
        }
        const exponentByUnit = { B: 0, KB: 1, MB: 2, GB: 3, TB: 4 };
        const value = parseFloat(match[1]);
        return value * 1024 ** exponentByUnit[match[2].toUpperCase()];
    }
    /**
     * Placeholder: no tag-to-task mapping is implemented for Kaggle tags yet.
     *
     * @param {string[]} tags - Tag names (currently unused).
     * @returns {string} Always "unknown".
     */
    extractTask(tags) {
        // Similar to HF but Kaggle tags might be different
        return "unknown";
    }
}
@@ -1,70 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
- import os from "os";
5
/**
 * Thin wrapper that runs the `kaggle_engine.py` helper script in a child
 * process and exchanges data with it as JSON on stdout.
 */
export class KaggleSource {
    pythonPath = "python";
    scriptPath;
    /**
     * Locates `kaggle_engine.py`, preferring `~/.vesper/python`, then
     * `<buildDir>/python`, then `<buildDir>/../src/python`. If none exists the
     * `~/.vesper` location is kept so the later spawn reports that path.
     *
     * @param {string} [buildDir=process.cwd()] - Directory the build output lives in.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "kaggle_engine.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }
    /**
     * Runs the engine's `discover` command.
     *
     * @param {string} query - Search text.
     * @param {number} [limit=20] - Maximum number of results requested.
     * @returns {Promise<object[]>} The engine's `results` array (empty when absent).
     * @throws {Error} When the engine reports `ok: false` or the process fails.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "Kaggle discover failed");
        }
        return (result.results || []);
    }
    /**
     * Runs the engine's `download` command.
     *
     * @param {string} datasetRef - Dataset reference passed through to the engine.
     * @param {string} [targetDir] - Optional destination directory.
     * @returns {Promise<{local_path: *, target_dir: *}>} Paths reported by the engine.
     * @throws {Error} When the engine reports `ok: false` or the process fails.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir) {
            args.push(targetDir);
        }
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "Kaggle download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawns the engine script and resolves with its parsed JSON output.
     *
     * @param {string[]} args - Command-line arguments placed after the script path.
     * @returns {Promise<object>} Parsed stdout of the engine.
     * @throws {Error} When the interpreter cannot be started, the process exits
     *   with a non-zero code, or stdout is not valid JSON.
     */
    async run(args) {
        return new Promise((resolve, reject) => {
            const processRef = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let stdout = "";
            let stderr = "";
            processRef.stdout.on("data", (d) => (stdout += d.toString()));
            processRef.stderr.on("data", (d) => (stderr += d.toString()));
            // Without this listener a missing interpreter surfaces as an
            // unhandled 'error' event instead of a rejected promise.
            processRef.on("error", (err) => {
                reject(new Error(`Failed to start kaggle_engine (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            processRef.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(stderr || stdout || `kaggle_engine exited with code ${code}`));
                    return;
                }
                try {
                    resolve(JSON.parse(stdout));
                }
                catch {
                    reject(new Error(`Failed to parse kaggle_engine output: ${stdout}`));
                }
            });
        });
    }
}
@@ -1,68 +0,0 @@
1
const SAFE_KEYWORDS = ["mit", "apache", "bsd", "cc0", "cc-by-4.0", "cc-by-sa-4.0", "odc-by", "pddl", "openrail", "creative commons attribution 4.0", "public domain"];
const RESTRICTED_KEYWORDS = ["nc", "non-commercial", "noncommercial", "research-only", "academic", "gpl", "agpl", "lgpl", "proprietary", "custom"];
// Permissive licenses for MVP filter
const PERMISSIVE_LICENSES = ["mit", "apache", "apache-2.0", "bsd", "cc0", "cc-by-4.0", "odc-by", "pddl", "openrail"];

/**
 * Lower-cases `text` and collapses every run of non-alphanumeric characters
 * into a single "-", so "CC BY 4.0" and "cc-by-4.0" compare equal.
 */
function normalizeLicenseText(text) {
    return String(text).toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
}

/**
 * Builds a predicate that tests whether a keyword occurs in `licenseId` as a
 * whole word or whole phrase.
 *
 * Plain substring matching is not used because short keywords hide inside
 * unrelated words ("nc" in "licence", "mit" in "limited"). A word also matches
 * when it carries a trailing version suffix ("gplv3", "apache2").
 */
function createLicenseKeywordMatcher(licenseId) {
    const normalized = normalizeLicenseText(licenseId);
    const delimited = `-${normalized}-`;
    const words = new Set();
    for (const word of normalized.split("-")) {
        if (!word) {
            continue;
        }
        words.add(word);
        const withoutVersion = word.replace(/v?\d+$/, "");
        if (withoutVersion) {
            words.add(withoutVersion);
        }
    }
    return (keyword) => {
        const wanted = normalizeLicenseText(keyword);
        if (wanted === "") {
            return false;
        }
        return wanted.includes("-") ? delimited.includes(`-${wanted}-`) : words.has(wanted);
    };
}

/**
 * Classifies a license identifier or display name as "restricted", "safe" or
 * "unknown" using keyword lists. Restricted keywords win over safe ones.
 *
 * @param {string} [licenseId] - License id or name; missing values are treated as "unknown".
 * @param {string} [licenseUrl] - Passed through unchanged as `url`.
 * @returns {object} Record with `id` (lower-cased input), `category`,
 *   `usage_restrictions`, `url`, `warnings`, `requires_consent`, and, for the
 *   "restricted" and "safe" categories, `commercial_use`.
 */
export function categorizeLicense(licenseId, licenseUrl) {
    const id = String(licenseId || "unknown").toLowerCase();
    const has = createLicenseKeywordMatcher(id);
    const usageRestrictions = [];
    const addRestriction = (restriction) => {
        if (!usageRestrictions.includes(restriction)) {
            usageRestrictions.push(restriction);
        }
    };
    // Check for usage restrictions
    if (has("nc") || has("non-commercial") || has("noncommercial")) {
        addRestriction("non-commercial");
    }
    if (has("research-only") || has("academic")) {
        addRestriction("academic-only");
    }
    if (has("nd") || has("no-derivatives") || has("noderivatives") || has("noderivs")) {
        addRestriction("no-derivatives");
    }
    if (has("gpl") || has("agpl") || has("lgpl")) {
        // Kept for compatibility with existing consumers: copyleft terms are
        // reported under the "no-derivatives" label.
        addRestriction("no-derivatives");
    }
    // Check if consent is required (GDPR, Kaggle, etc.)
    const requiresConsent = has("gdpr") || has("consent") || has("kaggle");
    // If ID contains restricted keywords
    if (RESTRICTED_KEYWORDS.some(has)) {
        return {
            id,
            category: "restricted",
            commercial_use: false,
            usage_restrictions: usageRestrictions.length > 0 ? usageRestrictions : ["non-commercial"],
            url: licenseUrl,
            warnings: [
                "Restricted usage terms apply",
                "Verify license terms before commercial application",
            ],
            requires_consent: requiresConsent,
        };
    }
    // If ID is a common safe license
    if (SAFE_KEYWORDS.some(has)) {
        return {
            id,
            category: "safe",
            commercial_use: true,
            usage_restrictions: [],
            url: licenseUrl,
            warnings: [],
            requires_consent: requiresConsent,
        };
    }
    return {
        id: id || "unknown",
        category: "unknown",
        usage_restrictions: usageRestrictions,
        url: licenseUrl,
        warnings: [
            "License information unclear or unknown",
            "Use at your own risk",
        ],
        requires_consent: requiresConsent,
    };
}

/**
 * Reports whether `licenseId` names one of the permissive licenses accepted
 * by the MVP filter.
 *
 * @param {string} [licenseId] - License id or name.
 * @returns {boolean} True when a permissive keyword occurs as a whole word/phrase.
 */
export function isPermissiveLicense(licenseId) {
    const has = createLicenseKeywordMatcher(String(licenseId || "unknown"));
    return PERMISSIVE_LICENSES.some(has);
}
@@ -1,107 +0,0 @@
1
/**
 * Compares stored dataset metadata against freshly fetched metadata for every
 * active monitor, records what changed and notifies the monitor's webhooks.
 */
export class MonitoringService {
    monitorStore;
    metadataStore;
    /**
     * @param {object} monitorStore - Provides `getActiveMonitors`, `saveMonitor` and `getWebhook`.
     * @param {object} metadataStore - Provides `getDataset` and `saveDataset`.
     */
    constructor(monitorStore, metadataStore) {
        this.monitorStore = monitorStore;
        this.metadataStore = metadataStore;
    }
    /**
     * Checks all active monitors for updates.
     *
     * A monitor is examined only when the fetched `last_updated` differs from
     * its `last_checked_version`. The new version and the fetched metadata are
     * always persisted in that case, so an upstream bump that touches none of
     * the tracked fields is not re-diffed on every run. Notifications and
     * auto-reprocessing happen only when tracked fields actually changed.
     *
     * @param {(datasetId: string, source: string) => Promise<object|null|undefined>} fetchLatest
     *   A function that fetches the latest metadata from the source (HF/Kaggle).
     * @returns {Promise<object[]>} One diff per dataset whose tracked fields changed.
     */
    async checkUpdates(fetchLatest) {
        const monitors = this.monitorStore.getActiveMonitors();
        const results = [];
        for (const monitor of monitors) {
            const current = this.metadataStore.getDataset(monitor.dataset_id);
            if (!current) {
                continue;
            }
            const latest = await fetchLatest(monitor.dataset_id, current.source);
            if (!latest) {
                continue;
            }
            if (latest.last_updated === monitor.last_checked_version) {
                continue;
            }
            const diff = this.compareVersions(current, latest);
            const hasChanges = diff.changes.length > 0;
            if (hasChanges) {
                results.push(diff);
                await this.notify(monitor, diff);
            }
            // Update monitor
            monitor.last_checked_version = latest.last_updated;
            monitor.updated_at = new Date().toISOString();
            this.monitorStore.saveMonitor(monitor);
            // Update store
            this.metadataStore.saveDataset(latest);
            if (hasChanges && monitor.auto_reprocess) {
                await this.triggerReprocess(monitor.dataset_id);
            }
        }
        return results;
    }
    /**
     * Builds a diff between two metadata records.
     *
     * Scalar fields are compared with `!==`; `splits` is compared by its JSON
     * serialisation.
     *
     * @param {object} oldVer - Currently stored metadata.
     * @param {object} newVer - Freshly fetched metadata.
     * @returns {{dataset_id: *, old_version: *, new_version: *, changes: object[], impact_score: number}}
     */
    compareVersions(oldVer, newVer) {
        // Check for significant field changes
        const fieldsToTrack = ["downloads", "likes", "total_examples", "total_size_mb", "quality_score"];
        const changes = fieldsToTrack
            .filter((field) => oldVer[field] !== newVer[field])
            .map((field) => ({
                field: String(field),
                old_value: oldVer[field],
                new_value: newVer[field]
            }));
        // Check for split changes
        if (JSON.stringify(oldVer.splits) !== JSON.stringify(newVer.splits)) {
            changes.push({
                field: "splits",
                old_value: oldVer.splits,
                new_value: newVer.splits
            });
        }
        return {
            dataset_id: oldVer.id,
            old_version: oldVer.last_updated,
            new_version: newVer.last_updated,
            changes,
            impact_score: this.calculateImpact(changes)
        };
    }
    /**
     * Sums a fixed weight per changed field, capped at 100. Fields without a
     * weight (e.g. downloads, likes) contribute nothing.
     *
     * @param {{field: string}[]} changes - Entries produced by `compareVersions`.
     * @returns {number} Impact score in the range 0-100.
     */
    calculateImpact(changes) {
        const weightByField = {
            total_examples: 40,
            splits: 30,
            quality_score: 20,
            total_size_mb: 10
        };
        let score = 0;
        for (const change of changes) {
            score += Object.hasOwn(weightByField, change.field) ? weightByField[change.field] : 0;
        }
        return Math.min(score, 100);
    }
    /**
     * Sends `diff` to every enabled webhook attached to `monitor`.
     *
     * @param {object} monitor - Monitor whose `webhook_ids` are resolved through the store.
     * @param {object} diff - Diff produced by `compareVersions`.
     * @returns {Promise<void>}
     */
    async notify(monitor, diff) {
        // A monitor saved without webhooks can come back with a null list.
        for (const webhookId of monitor.webhook_ids ?? []) {
            const webhook = this.monitorStore.getWebhook(webhookId);
            if (webhook && webhook.enabled) {
                await this.sendToWebhook(webhook, diff);
            }
        }
    }
    /**
     * Stub delivery: logs the notification and builds the payload, but performs
     * no network request yet.
     *
     * @param {object} webhook - Webhook config (`name`, `channel`, `url`).
     * @param {object} diff - Diff being reported.
     * @returns {Promise<void>}
     */
    async sendToWebhook(webhook, diff) {
        console.error(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
        // In a real implementation, this would be an HTTP POST
        // For now, we simulate the payload
        const payload = {
            text: `Dataset ${diff.dataset_id} updated!`,
            changes: diff.changes,
            impact: diff.impact_score
        };
        // TODO: POST `payload` to webhook.url once delivery is implemented.
        void payload;
    }
    /**
     * Stub: logs that the dataset should be reprocessed.
     *
     * @param {string} datasetId - Dataset to reprocess.
     * @returns {Promise<void>}
     */
    async triggerReprocess(datasetId) {
        console.error(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
        // This would call IngestionService or similar
    }
}
@@ -1,78 +0,0 @@
1
/**
 * Converts a `dataset_monitors` row into a monitor object: integer flags
 * become booleans and the JSON-encoded `webhook_ids` column becomes an array
 * (empty when the column is NULL or blank).
 */
function hydrateMonitorRow(row) {
    const rawIds = row.webhook_ids;
    const parsedIds = rawIds === null || rawIds === undefined || rawIds === "" ? [] : JSON.parse(rawIds);
    return {
        ...row,
        enabled: Boolean(row.enabled),
        auto_reprocess: Boolean(row.auto_reprocess),
        webhook_ids: parsedIds ?? []
    };
}

/**
 * Persistence for dataset monitors and webhook configurations.
 *
 * `db` is used through `exec(sql)` and `prepare(sql).run/get/all(...)` only.
 */
export class MonitoringStore {
    db;
    /**
     * @param {object} db - Database handle; both tables are created on construction if missing.
     */
    constructor(db) {
        this.db = db;
        this.init();
    }
    /** Creates the `dataset_monitors` and `webhook_configs` tables when absent. */
    init() {
        this.db.exec(`
            CREATE TABLE IF NOT EXISTS dataset_monitors (
                dataset_id TEXT PRIMARY KEY,
                enabled BOOLEAN DEFAULT 1,
                auto_reprocess BOOLEAN DEFAULT 0,
                last_checked_version TEXT,
                webhook_ids TEXT, -- JSON array
                created_at TEXT,
                updated_at TEXT
            );

            CREATE TABLE IF NOT EXISTS webhook_configs (
                id TEXT PRIMARY KEY,
                name TEXT,
                channel TEXT,
                url TEXT,
                enabled BOOLEAN DEFAULT 1
            );
        `);
    }
    /**
     * Inserts or updates a monitor keyed by `dataset_id`. On update every
     * column except `created_at` is overwritten.
     *
     * @param {object} monitor - Monitor to persist; a missing `webhook_ids` is stored as `[]`.
     */
    saveMonitor(monitor) {
        const upsert = this.db.prepare(`
            INSERT INTO dataset_monitors (dataset_id, enabled, auto_reprocess, last_checked_version, webhook_ids, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(dataset_id) DO UPDATE SET
                enabled=excluded.enabled,
                auto_reprocess=excluded.auto_reprocess,
                last_checked_version=excluded.last_checked_version,
                webhook_ids=excluded.webhook_ids,
                updated_at=excluded.updated_at
        `);
        upsert.run(monitor.dataset_id, monitor.enabled ? 1 : 0, monitor.auto_reprocess ? 1 : 0, monitor.last_checked_version || null, JSON.stringify(monitor.webhook_ids ?? []), monitor.created_at, monitor.updated_at);
    }
    /**
     * @param {string} datasetId - Dataset whose monitor is wanted.
     * @returns {object|null} The monitor, or null when none is stored.
     */
    getMonitor(datasetId) {
        const row = this.db.prepare("SELECT * FROM dataset_monitors WHERE dataset_id = ?").get(datasetId);
        return row ? hydrateMonitorRow(row) : null;
    }
    /**
     * @returns {object[]} Every monitor whose `enabled` flag is set.
     */
    getActiveMonitors() {
        const rows = this.db.prepare("SELECT * FROM dataset_monitors WHERE enabled = 1").all();
        return rows.map(hydrateMonitorRow);
    }
    /**
     * Inserts or updates a webhook configuration keyed by `id`. All mutable
     * columns, including `channel`, are overwritten on update.
     *
     * @param {object} config - Webhook config (`id`, `name`, `channel`, `url`, `enabled`).
     */
    saveWebhook(config) {
        const upsert = this.db.prepare(`
            INSERT INTO webhook_configs (id, name, channel, url, enabled)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                name=excluded.name,
                channel=excluded.channel,
                url=excluded.url,
                enabled=excluded.enabled
        `);
        upsert.run(config.id, config.name, config.channel, config.url, config.enabled ? 1 : 0);
    }
    /**
     * @param {string} id - Webhook id.
     * @returns {object|null} The webhook config with a boolean `enabled`, or null.
     */
    getWebhook(id) {
        const row = this.db.prepare("SELECT * FROM webhook_configs WHERE id = ?").get(id);
        if (!row) {
            return null;
        }
        return { ...row, enabled: Boolean(row.enabled) };
    }
}
@@ -1 +0,0 @@
1
- export {};
@@ -1,87 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
- import os from "os";
5
/**
 * Thin wrapper that runs the `openml_engine.py` helper script in a child
 * process and exchanges data with it as JSON on stdout.
 */
export class OpenMLSource {
    pythonPath = "python";
    scriptPath;
    /**
     * Locates `openml_engine.py`, preferring `~/.vesper/python`, then
     * `<buildDir>/python`, then `<buildDir>/../src/python`, and picks the
     * interpreter: a `.venv` inside `buildDir` when present, otherwise `py` on
     * Windows and `python` elsewhere.
     *
     * @param {string} [buildDir=process.cwd()] - Directory the build output lives in.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "openml_engine.py"),
            path.resolve(buildDir, "python", "openml_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "openml_engine.py"),
        ];
        // Fall back to the ~/.vesper location so a later failure names that path.
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        const isWindows = process.platform === "win32";
        const venvPy = isWindows
            ? path.resolve(buildDir, ".venv", "Scripts", "python.exe")
            : path.resolve(buildDir, ".venv", "bin", "python");
        if (fs.existsSync(venvPy)) {
            this.pythonPath = venvPy;
        }
        else if (isWindows) {
            this.pythonPath = "py";
        }
    }
    /**
     * Runs the engine's `discover` command.
     *
     * @param {string} query - Search text.
     * @param {number} [limit=20] - Maximum number of results requested.
     * @returns {Promise<object[]>} The engine's `results` array (empty when absent).
     * @throws {Error} When the engine reports `ok: false` or the process fails.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "OpenML discover failed");
        }
        return (result.results || []);
    }
    /**
     * Runs the engine's `download` command.
     *
     * @param {string} datasetRef - Dataset reference passed through to the engine.
     * @param {string} [targetDir] - Optional destination directory.
     * @returns {Promise<{local_path: *, target_dir: *}>} Paths reported by the engine.
     * @throws {Error} When the engine reports `ok: false` or the process fails.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir) {
            args.push(targetDir);
        }
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "OpenML download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawns the engine script and resolves with its parsed JSON output.
     *
     * @param {string[]} args - Command-line arguments placed after the script path.
     * @returns {Promise<object>} Parsed (trimmed) stdout of the engine.
     * @throws {Error} When the interpreter cannot be started, the process exits
     *   with a non-zero code, or stdout is not valid JSON.
     */
    run(args) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            proc.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without this listener a missing interpreter surfaces as an
            // unhandled 'error' event instead of a rejected promise.
            proc.on("error", (err) => {
                reject(new Error(`Failed to start OpenML engine (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            proc.on("close", (code) => {
                if (code !== 0) {
                    return reject(new Error(`OpenML engine exited with code ${code}: ${stderr}`));
                }
                try {
                    const parsed = JSON.parse(stdout.trim());
                    resolve(parsed);
                }
                catch (e) {
                    reject(new Error(`Failed to parse OpenML engine output: ${stdout}`));
                }
            });
        });
    }
}
@@ -1,48 +0,0 @@
1
/**
 * Calculates a quality score from 0-100 based on metadata.
 *
 * Points are awarded per category: popularity (max 30), train/test splits
 * (max 20), documentation (max 20), recency (max 15), license clarity
 * (max 10) and community likes (max 5).
 *
 * @param {object} data - Metadata summary. Reads `downloads`, `likes`,
 *   `hasDescription`, `descriptionLength`, `hasTrainSplit`, `hasTestSplit`,
 *   `lastUpdated` and `licenseCategory`.
 * @param {Date|string|number} [now=new Date()] - Reference time for the
 *   recency score; pass a fixed value for reproducible results.
 * @returns {number} Score between 0 and 100.
 */
export function calculateQualityScore(data, now = new Date()) {
    const MS_PER_DAY = 1000 * 3600 * 24;
    let score = 0;
    // 1. Popularity (max 30)
    if (data.downloads > 10000) {
        score += 30;
    }
    else if (data.downloads > 1000) {
        score += 20;
    }
    else if (data.downloads > 100) {
        score += 10;
    }
    // 2. Structuredness (max 20)
    if (data.hasTrainSplit) {
        score += 10;
    }
    if (data.hasTestSplit) {
        score += 10;
    }
    // 3. Documentation (max 20)
    if (data.hasDescription) {
        if (data.descriptionLength > 1000) {
            score += 20;
        }
        else if (data.descriptionLength > 200) {
            score += 10;
        }
        else {
            score += 5;
        }
    }
    // 4. Recency (max 15)
    const lastUpdateMs = new Date(data.lastUpdated).getTime();
    const nowMs = new Date(now).getTime();
    // A missing or unparseable date earns no recency points.
    if (!Number.isNaN(lastUpdateMs) && !Number.isNaN(nowMs)) {
        const diffDays = Math.floor((nowMs - lastUpdateMs) / MS_PER_DAY);
        if (diffDays < 180) {
            score += 15; // 6 months
        }
        else if (diffDays < 365) {
            score += 10; // 1 year
        }
        else if (diffDays < 730) {
            score += 5; // 2 years
        }
    }
    // 5. License Clarity (max 10)
    if (data.licenseCategory === "safe") {
        score += 10;
    }
    else if (data.licenseCategory === "restricted") {
        score += 5;
    }
    // 6. Community (max 5)
    if (data.likes > 50) {
        score += 5;
    }
    else if (data.likes > 10) {
        score += 2;
    }
    return Math.min(100, score);
}