vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,190 +0,0 @@
1
- import { JITOrchestrator } from "./jit-orchestrator.js";
2
- import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
3
- import fs from "fs";
4
/**
 * Append one timestamped line to `debug.log` in the current working directory.
 *
 * Logging is best-effort: if the file cannot be written (read-only directory,
 * `debug.log` is a directory, disk full, ...) the message is sent to stderr
 * instead, so a logging failure never aborts the search that triggered it.
 *
 * @param {string} msg - Message to record; it is prefixed with an ISO-8601 timestamp.
 * @returns {void}
 */
function log(msg) {
    const line = `${new Date().toISOString()} ${msg}\n`;
    try {
        fs.appendFileSync("debug.log", line);
    }
    catch (error) {
        // Keep the information, but do not let diagnostics take the caller down.
        console.error(`[debug.log unavailable: ${error?.message ?? error}] ${msg}`);
    }
}
7
/**
 * Hybrid dataset search: vector similarity re-ranked with lexical overlap,
 * heuristic penalties, per-source bonuses and an intent score, with an
 * optional just-in-time (JIT) fetch of new datasets when the local results
 * look weak.
 */
export class SearchEngine {
    /** Query keywords that mark a query as finance-related (crypto drift penalty). */
    static #FINANCE_TERMS = ["financial", "finance", "banking", "economy", "stock", "loan", "forecasting", "bank"];
    /** Query substrings suggesting the user wants text data. */
    static #TEXT_INDICATORS = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
    /** Query substrings suggesting the user wants image data. */
    static #IMAGE_INDICATORS = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
    /** Score bonus per dataset source; a Map so inherited object keys can never match. */
    static #SOURCE_BONUSES = new Map([
        ["huggingface", 0.1],
        ["uci", 0.1],
        ["github", 0.1],
        ["worldbank", 0.1],
        ["nasa", 0.1],
    ]);

    metadataStore;
    vectorStore;
    embedder;
    jitOrchestrator;

    /**
     * @param {object} metadataStore - Must provide `getDataset(id)`.
     * @param {object} vectorStore - Must provide `search(vector, k)` returning `{ id, score }` matches.
     * @param {object} embedder - Must provide async `embed(text)`.
     */
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
        this.jitOrchestrator = new JITOrchestrator(metadataStore, vectorStore, embedder);
    }

    /**
     * Search locally and, if the results look weak, fetch new datasets via
     * the JIT orchestrator and search again.
     *
     * @param {string} query - Free-text query; tokens prefixed with `-` exclude matches.
     * @param {object} [options]
     * @param {number} [options.limit=5] - Maximum number of results.
     * @param {boolean} [options.enableJIT=true] - Set to `false` to disable the JIT fallback.
     * @param {boolean} [options.safeOnly] - Drop datasets whose license category is "restricted".
     * @returns {Promise<object[]>} Scored dataset metadata, best first.
     */
    async search(query, options = {}) {
        const limit = options.limit || 5;
        const enableJIT = options.enableJIT !== false; // Default: true
        const intent = await analyzeDatasetQuery(query);
        log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);

        // 1. Perform local search
        const localResults = await this.localSearch(query, options, intent);

        // 2. Check if JIT should be triggered. The limit is passed so that a
        //    deliberately small result cap is not mistaken for a sparse index.
        const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query, limit);
        if (!shouldTrigger) {
            log(`JIT not triggered. Returning ${localResults.length} local results`);
            return localResults;
        }

        // 3. Trigger JIT fallback
        console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
        await this.jitOrchestrator.fetchAndIngest(query, 10, intent);

        // 4. Re-run local search with updated index
        console.error(`Re-searching with updated library...`);
        const enhancedResults = await this.localSearch(query, options, intent);

        // Count results that were not in the first pass. Comparing list
        // lengths would report 0 whenever both passes hit the limit.
        const previousIds = new Set(localResults.map(result => result.id));
        const newCount = enhancedResults.filter(result => !previousIds.has(result.id)).length;
        if (newCount > 0) {
            console.error(`Found ${newCount} additional results\n`);
        }
        return enhancedResults;
    }

    /**
     * Perform hybrid search (Vector + Lexical + Penalties).
     *
     * Note: the score fields (`relevance_score`, `vector_score`,
     * `lexical_score`, `accessibility_bonus`, `intent_score`) are written
     * onto the objects returned by `metadataStore.getDataset`.
     *
     * @param {string} query - Raw user query.
     * @param {object} options - Same options object as {@link SearchEngine#search}.
     * @param {object} intent - Result of `analyzeDatasetQuery(query)`.
     * @returns {Promise<object[]>} At most `options.limit` (default 5) results, best first.
     */
    async localSearch(query, options, intent) {
        const limit = options.limit || 5;

        // 1. Parse query
        const queryLower = query.toLowerCase();
        const words = queryLower.split(/\s+/);
        const positiveKeywords = words.filter(w => !w.startsWith("-") && w.length > 2);
        // A lone "-" would become the empty string, and every text includes "",
        // which used to discard all candidates; drop empty exclusions.
        const negativeKeywords = words
            .filter(w => w.startsWith("-"))
            .map(w => w.slice(1))
            .filter(w => w.length > 0);

        // Query-level flags are the same for every candidate: compute once.
        // Automatic Penalty Detection: Is this a "Finance" query without "Crypto" mentioned?
        const isFinanceQuery = positiveKeywords.some(w => SearchEngine.#FINANCE_TERMS.includes(w));
        const mentionsCrypto = queryLower.includes("crypto") || queryLower.includes("bitcoin");
        const queryLooksText = SearchEngine.#TEXT_INDICATORS.some(t => queryLower.includes(t));
        const queryLooksImage = SearchEngine.#IMAGE_INDICATORS.some(t => queryLower.includes(t));

        // 2. Get query vector
        const queryVector = await this.embedder.embed(query);
        log(`Vector generated, length=${queryVector.length}`);

        // 3. Search in vector store (fetch more candidates for reranking)
        const matches = this.vectorStore.search(queryVector, 40);
        log(`Vector search found ${matches.length} matches`);

        // 4. Score and filter candidates
        const results = [];
        for (const match of matches) {
            const metadata = this.metadataStore.getDataset(match.id);
            if (!metadata)
                continue;

            // Filter: Safe only
            if (options.safeOnly && metadata.license.category === "restricted")
                continue;

            // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
            // when user explicitly requested a single language
            if (shouldExcludeByLanguage(metadata, intent)) {
                log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
                continue;
            }

            // Missing description/tags must neither crash nor be matched as the text "undefined".
            const text = `${metadata.name} ${metadata.description ?? ""} ${(metadata.tags ?? []).join(" ")}`.toLowerCase();

            // Filter: Explicit Negative Keywords
            if (negativeKeywords.some(neg => text.includes(neg))) {
                log(`Negative penalty: Dropped ${match.id} due to keyword match`);
                continue;
            }

            // A. Vector Score (0.0 to 1.0)
            const vectorScore = match.score;

            // B. Lexical Score (fraction of positive keywords present in the text)
            let lexicalScore = 0;
            if (positiveKeywords.length > 0) {
                const matchesCount = positiveKeywords.filter(kw => text.includes(kw)).length;
                lexicalScore = matchesCount / positiveKeywords.length;
            }

            // C. Penalties
            let penalty = 0;

            // Penalty: Domain Drift (Finance vs Crypto)
            // If user asks for finance but NOT crypto, and the result is crypto, penalize heavily
            if (isFinanceQuery && !mentionsCrypto) {
                const isCryptoResult = text.includes("crypto") || text.includes("bitcoin") || text.includes("ethereum") || text.includes("blockchain");
                if (isCryptoResult) {
                    log(`Penalty applied: Crypto result in Finance query for ${match.id}`);
                    penalty += 0.4; // Massive penalty
                }
            }

            // Penalty: Weak Lexical Match
            // If it's a "semantic vibe" match but has ZERO matching keywords from the query
            if (lexicalScore === 0 && positiveKeywords.length > 1) {
                penalty += 0.2;
            }

            // Penalty: Modality Mismatch
            // e.g., "anime quotes" is text, so image-classification datasets get penalized.
            const resultTask = (metadata.task || "").toLowerCase();
            if (queryLooksText && !queryLooksImage) {
                const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
                    text.includes("image classification") || text.includes("image-classification") ||
                    text.includes("object detection") || text.includes("image segmentation");
                if (isImageResult) {
                    log(`Modality penalty: text query but image dataset ${match.id}`);
                    penalty += 0.35;
                }
            }
            if (queryLooksImage && !queryLooksText) {
                const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
                    resultTask.includes("translation") || resultTask.includes("summarization") ||
                    resultTask.includes("question-answering");
                if (isTextResult) {
                    log(`Modality penalty: image query but text dataset ${match.id}`);
                    penalty += 0.35;
                }
            }

            // D. Accessibility Bonuses (Prioritize low-friction sources)
            const bonus = SearchEngine.#SOURCE_BONUSES.get(metadata.source) ?? 0;

            // Final Combined Score
            // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses and intent score
            const intentScore = scoreDatasetAgainstIntent(metadata, intent);
            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
            metadata.relevance_score = Math.round(finalScore * 100) / 100;
            metadata.vector_score = Math.round(vectorScore * 100) / 100;
            metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
            metadata.accessibility_bonus = bonus;
            metadata.intent_score = intentScore;
            results.push(metadata);
        }

        // Sort by final score and limit
        return results
            .sort((a, b) => b.relevance_score - a.relevance_score)
            .slice(0, limit);
    }

    /**
     * Determine if JIT should be triggered.
     *
     * @param {object[]} results - Local results, best first, each with `relevance_score`.
     * @param {string} query - Raw user query (its word count feeds condition 4).
     * @param {number} [requestedLimit=Infinity] - Result cap the caller asked for.
     *   When it is below 3, the "very few results" threshold is lowered to the
     *   cap so a full result list does not trigger JIT on every search. Omit it
     *   to keep the fixed threshold of 3.
     * @returns {boolean} `true` when a JIT fetch should run.
     */
    shouldTriggerJIT(results, query, requestedLimit = Infinity) {
        const queryWords = query.trim().split(/\s+/).length;
        const minResults = Number.isFinite(requestedLimit) && requestedLimit > 0
            ? Math.min(3, requestedLimit)
            : 3;

        // Condition 1: Zero results (Always trigger JIT)
        if (results.length === 0) {
            log(`JIT trigger: No results found for "${query}"`);
            return true;
        }

        // Condition 2: Very few results
        if (results.length < minResults) {
            log(`JIT trigger: Only ${results.length} results found`);
            return true;
        }

        // Condition 3: Low confidence (top result scores below 0.75)
        const topScore = results[0]?.relevance_score || 0;
        if (topScore < 0.75) {
            log(`JIT trigger: Low confidence (top score: ${topScore}, threshold: 0.75)`);
            return true;
        }

        // Condition 4: Long specific queries with mediocre matches
        if (queryWords >= 4 && topScore < 0.85) {
            log(`JIT trigger: Long query with mediocre top score (${topScore})`);
            return true;
        }
        return false;
    }
}
@@ -1,262 +0,0 @@
1
- import { HuggingFaceScraper } from "../metadata/scraper.js";
2
- import { UCIScraper } from "../metadata/uci-scraper.js";
3
- import { GitHubScraper } from "../metadata/github-scraper.js";
4
- import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
- import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
6
// Words that carry no search signal: English function words plus generic
// dataset-request vocabulary ("dataset", "find", ...). They are removed from
// a query before keywords are extracted from it.
const STOP_WORDS = new Set([
    "a an the and or but in on at to for",
    "of with by from as is was are were been",
    "be have has had do does did will would",
    "could should may might must shall can need",
    "about into through during before after above",
    "below between under again further then once",
    "here there when where why how all each",
    "few more most other some such no nor not",
    "only own same so than too very just also",
    "dataset datasets data find search get looking",
].flatMap(row => row.split(" ")));
19
/**
 * Just-In-Time Orchestrator
 * Automatically fetches and indexes new datasets when local search is insufficient.
 *
 * Sources are queried in a fixed order (HuggingFace, UCI, GitHub, World Bank,
 * NASA); datasets whose id is already in the metadata store, or was already
 * returned by an earlier source, are skipped.
 */
export class JITOrchestrator {
    metadataStore;
    vectorStore;
    embedder;
    /** Maps a raw query string to the `Date.now()` of its last JIT run. */
    lastTriggerTime = new Map();
    RATE_LIMIT_MS = 30000; // 30 seconds between triggers for same query

    /**
     * @param {object} metadataStore - Must provide `getAllDatasets`, `saveDataset`,
     *   `beginTransaction`, `commit` and `rollback`.
     * @param {object} vectorStore - Must provide `add(id, vector)` and `save()`.
     * @param {object} embedder - Must provide async `embed(text)`.
     */
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
    }

    /**
     * Simplify a complex user query into keywords that work better with APIs.
     * HuggingFace and other APIs often fail on long multi-word queries.
     *
     * @param {string} query - Free-text query.
     * @returns {string[]} Up to 3 unique lowercase keywords; when more than 3
     *   candidates exist, the longest are kept.
     */
    simplifyQuery(query) {
        // Strip punctuation but keep letters, marks and digits of every script.
        // `\w` is ASCII-only and would turn "café" into "caf" and erase
        // non-Latin queries entirely.
        const words = query.toLowerCase()
            .replace(/[^\p{L}\p{M}\p{N}_\s-]/gu, "")
            .split(/\s+/)
            .filter(w => w.length > 2 && !STOP_WORDS.has(w));
        const unique = [...new Set(words)];
        // If we have a lot of words, prioritize longer/more specific ones
        if (unique.length > 3) {
            unique.sort((a, b) => b.length - a.length);
        }
        return unique.slice(0, 3);
    }

    /**
     * Check if JIT should be triggered based on rate limiting.
     *
     * @param {string} query - Raw query string (used verbatim as the rate-limit key).
     * @returns {boolean} `true` if the query has never run or last ran more than
     *   `RATE_LIMIT_MS` milliseconds ago.
     */
    canTrigger(query) {
        const lastTrigger = this.lastTriggerTime.get(query);
        if (!lastTrigger)
            return true;
        return Date.now() - lastTrigger > this.RATE_LIMIT_MS;
    }

    /**
     * Append to `target` every dataset of `found` whose id is not in `seenIds`,
     * recording the id so later sources cannot add it again.
     */
    #appendUnseen(found, seenIds, target) {
        for (const ds of found) {
            if (!seenIds.has(ds.id)) {
                target.push(ds);
                seenIds.add(ds.id);
            }
        }
    }

    /**
     * Main JIT workflow: fetch, save, index, return new datasets.
     *
     * @param {string} query - Raw user query.
     * @param {number} [limit=10] - Result budget for HuggingFace; the other
     *   sources are asked for `Math.floor(limit / 2)` each.
     * @param {object} [providedIntent] - Pre-computed intent; when omitted,
     *   `analyzeDatasetQuery(query)` is called.
     * @returns {Promise<object[]>} Datasets that were newly saved and indexed.
     *   Empty when rate-limited, when nothing new was found, or when a failure
     *   occurred while fetching or saving (the error message goes to stderr).
     */
    async fetchAndIngest(query, limit = 10, providedIntent) {
        // Rate limiting check
        if (!this.canTrigger(query)) {
            console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
            return [];
        }
        console.error(`\n[JIT] Searching live sources for: "${query}"`);
        this.lastTriggerTime.set(query, Date.now());
        const intent = providedIntent || await analyzeDatasetQuery(query);
        const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
        if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
            console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
        }
        else if (keywords.length > 0) {
            console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
        }
        const newDatasets = [];
        const sourceResults = {};
        try {
            // Get existing dataset IDs to avoid duplicates
            const existingIds = new Set(this.metadataStore.getAllDatasets().map(ds => ds.id));

            // 1. Scrape HuggingFace with the full intent; if that yields too
            //    little, top up with one search per extracted keyword.
            const hfResults = await this.scrapeHuggingFace(intent, limit);
            if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
                const hfIds = new Set(hfResults.map(ds => ds.id));
                const perKeywordLimit = Math.ceil(limit / Math.max(keywords.length, 1));
                for (const keyword of keywords) {
                    if (hfResults.length >= limit)
                        break;
                    const results = await this.scrapeHuggingFace(keyword, perKeywordLimit);
                    for (const ds of results) {
                        if (!hfIds.has(ds.id)) {
                            hfIds.add(ds.id);
                            hfResults.push(ds);
                        }
                    }
                }
            }
            sourceResults["HuggingFace"] = hfResults.length;
            console.error(` [source] HuggingFace: ${hfResults.length} datasets`);
            this.#appendUnseen(hfResults, existingIds, newDatasets);

            // 2-5. Open-access sources, queried in order with the raw query:
            //      UCI, GitHub, World Bank (economic/demographic data),
            //      NASA (scientific/space data).
            const halfLimit = Math.floor(limit / 2);
            const secondarySources = [
                ["UCI", "UCI", () => this.scrapeUCI(query, halfLimit)],
                ["GitHub", "GitHub", () => this.scrapeGitHub(query, halfLimit)],
                ["WorldBank", "World Bank", () => this.scrapeWorldBank(query, halfLimit)],
                ["NASA", "NASA", () => this.scrapeNASA(query, halfLimit)],
            ];
            for (const [key, label, scrape] of secondarySources) {
                const found = await scrape();
                sourceResults[key] = found.length;
                console.error(` [source] ${label}: ${found.length} datasets`);
                this.#appendUnseen(found, existingIds, newDatasets);
            }

            // Save and index new datasets
            if (newDatasets.length > 0) {
                await this.saveAndIndex(newDatasets);
                console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
            }
            else {
                // Provide helpful feedback when no results found
                const allZero = Object.values(sourceResults).every(v => v === 0);
                if (allZero) {
                    console.error(`[JIT] No datasets found across all sources.`);
                    console.error(`[JIT] Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
                }
                else {
                    console.error(`[JIT] All found datasets already in index`);
                }
            }
            return newDatasets;
        }
        catch (error) {
            console.error(`ERROR [JIT] Error during fetch and ingest:`, error?.message ?? error);
            return [];
        }
    }

    /**
     * Scrape HuggingFace with free-text search.
     *
     * @param {string|object} query - Search input forwarded to the scraper
     *   (callers in this class pass either a keyword or the intent object).
     * @param {number} limit - Maximum number of datasets to request.
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape failed.
     */
    async scrapeHuggingFace(query, limit) {
        const scraper = new HuggingFaceScraper();
        try {
            return await scraper.scrape(limit, true, query);
        }
        catch (error) {
            console.error(` ERROR: HuggingFace scrape failed: ${error.message}`);
            return [];
        }
    }

    /**
     * Scrape UCI.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape failed.
     */
    async scrapeUCI(query, limit) {
        const scraper = new UCIScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: UCI scrape failed: ${error.message}`);
            return [];
        }
    }

    /**
     * Scrape GitHub.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape failed.
     */
    async scrapeGitHub(query, limit) {
        const scraper = new GitHubScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: GitHub scrape failed: ${error.message}`);
            return [];
        }
    }

    /**
     * Scrape World Bank.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape failed.
     */
    async scrapeWorldBank(query, limit) {
        const scraper = new WorldBankScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: World Bank scrape failed: ${error.message}`);
            return [];
        }
    }

    /**
     * Scrape NASA.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape failed.
     */
    async scrapeNASA(query, limit) {
        const scraper = new NASAScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: NASA scrape failed: ${error.message}`);
            return [];
        }
    }

    /**
     * Save datasets to DB and generate embeddings.
     *
     * Embeddings are computed BEFORE anything is persisted. If embedding were
     * to fail after the metadata commit, the datasets would sit in the
     * metadata store without vectors, and later JIT runs would skip them as
     * already known, leaving them unsearchable.
     *
     * @param {object[]} datasets - Datasets with `id`, `name`, `description` and optional `tags`.
     * @returns {Promise<void>}
     * @throws Rethrows any embedding failure (nothing has been persisted) or
     *   any metadata-store failure (after rolling the transaction back).
     */
    async saveAndIndex(datasets) {
        // 1. Generate every embedding up front (may fail; nothing is written yet)
        const vectors = [];
        for (const ds of datasets) {
            const text = `${ds.name} ${ds.description} ${(ds.tags ?? []).join(" ")}`;
            vectors.push(await this.embedder.embed(text));
        }

        // 2. Save to database, atomically
        this.metadataStore.beginTransaction();
        try {
            for (const ds of datasets) {
                this.metadataStore.saveDataset(ds);
            }
            this.metadataStore.commit();
        }
        catch (e) {
            this.metadataStore.rollback();
            throw e;
        }

        // 3. Update the vector store and persist it to disk
        datasets.forEach((ds, index) => this.vectorStore.add(ds.id, vectors[index]));
        this.vectorStore.save();
    }
}