@vespermcp/mcp-server 1.2.21 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/README.md +49 -0
  2. package/build/cache/service.js +7 -0
  3. package/build/cloud/adapters/supabase.js +49 -0
  4. package/build/cloud/storage-manager.js +6 -0
  5. package/build/export/exporter.js +22 -9
  6. package/build/gateway/unified-dataset-gateway.js +441 -0
  7. package/build/index.js +1815 -839
  8. package/build/ingestion/ingestor.js +7 -4
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/arxiv-source.js +229 -0
  12. package/build/metadata/circuit-breaker.js +62 -0
  13. package/build/metadata/github-source.js +203 -0
  14. package/build/metadata/hackernews-source.js +123 -0
  15. package/build/metadata/quality.js +27 -0
  16. package/build/metadata/scraper.js +85 -14
  17. package/build/metadata/semantic-scholar-source.js +138 -0
  18. package/build/python/asset_downloader_engine.py +2 -0
  19. package/build/python/convert_engine.py +92 -0
  20. package/build/python/export_engine.py +45 -0
  21. package/build/python/kaggle_engine.py +77 -5
  22. package/build/python/normalize_engine.py +83 -0
  23. package/build/python/vesper/core/asset_downloader.py +5 -1
  24. package/build/scripts/test-phase1-webcore-quality.js +104 -0
  25. package/build/search/engine.js +45 -6
  26. package/build/search/jit-orchestrator.js +18 -14
  27. package/build/search/query-intent.js +509 -0
  28. package/build/tools/formatter.js +6 -3
  29. package/build/utils/python-runtime.js +130 -0
  30. package/build/web/extract-web.js +297 -0
  31. package/build/web/fusion-engine.js +457 -0
  32. package/build/web/types.js +1 -0
  33. package/build/web/web-core.js +242 -0
  34. package/package.json +12 -5
  35. package/scripts/postinstall.cjs +87 -31
  36. package/scripts/wizard.cjs +652 -0
  37. package/scripts/wizard.js +338 -12
  38. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  39. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  40. package/src/python/asset_downloader_engine.py +2 -0
  41. package/src/python/convert_engine.py +92 -0
  42. package/src/python/export_engine.py +45 -0
  43. package/src/python/kaggle_engine.py +77 -5
  44. package/src/python/normalize_engine.py +83 -0
  45. package/src/python/requirements.txt +12 -0
  46. package/src/python/vesper/core/asset_downloader.py +5 -1
  47. package/wizard.cjs +3 -0
@@ -12,6 +12,19 @@ except Exception:
12
12
  HAS_KAGGLE = False
13
13
 
14
14
 
15
# Lower-case file suffixes (leading dot included) that mark a file as an image.
IMAGE_EXTENSIONS = set(
    (
        ".jpg", ".jpeg", ".png",
        ".webp", ".bmp", ".gif",
        ".tiff", ".tif", ".svg",
    )
)
26
+
27
+
15
28
  def _ensure_auth() -> Dict[str, Any]:
16
29
  if not HAS_KAGGLE:
17
30
  return {
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
135
148
  return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
136
149
 
137
150
 
138
- def _pick_best_file(root: str) -> str:
151
+ def _find_image_files(root: str) -> List[str]:
152
+ image_files: List[str] = []
153
+ for base, _, files in os.walk(root):
154
+ for name in files:
155
+ full = os.path.join(base, name)
156
+ if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
157
+ image_files.append(full)
158
+ image_files.sort()
159
+ return image_files
160
+
161
+
162
+ def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
163
+ relative_path = os.path.relpath(full_path, root).replace("\\", "/")
164
+ parent_dir = os.path.dirname(relative_path)
165
+ parts = [part for part in parent_dir.split("/") if part and part != "."]
166
+
167
+ split = None
168
+ label = None
169
+ if parts:
170
+ first = parts[0].lower()
171
+ if first in {"train", "test", "val", "valid", "validation"}:
172
+ split = parts[0]
173
+ if len(parts) > 1:
174
+ label = parts[-1]
175
+ else:
176
+ label = parts[-1]
177
+
178
+ record: Dict[str, Any] = {
179
+ "id": index,
180
+ "image_path": os.path.abspath(full_path),
181
+ "relative_path": relative_path,
182
+ "file_name": os.path.basename(full_path),
183
+ "extension": os.path.splitext(full_path)[1].lower().lstrip("."),
184
+ }
185
+ if split:
186
+ record["split"] = split
187
+ if label:
188
+ record["label"] = label
189
+ return record
190
+
191
+
192
+ def _write_image_manifest(root: str, image_files: List[str]) -> str:
193
+ manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
194
+ with open(manifest_path, "w", encoding="utf-8") as handle:
195
+ for index, full_path in enumerate(image_files):
196
+ handle.write(json.dumps(_infer_image_record(root, full_path, index), ensure_ascii=False) + "\n")
197
+ return manifest_path
198
+
199
+
200
+ def _pick_best_file(root: str) -> Dict[str, Any]:
139
201
  candidates: List[str] = []
140
202
  for base, _, files in os.walk(root):
141
203
  for name in files:
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
145
207
  candidates.append(full)
146
208
 
147
209
  if not candidates:
210
+ image_files = _find_image_files(root)
211
+ if image_files:
212
+ manifest_path = _write_image_manifest(root, image_files)
213
+ return {
214
+ "local_path": manifest_path,
215
+ "dataset_kind": "image-manifest",
216
+ "image_count": len(image_files),
217
+ }
148
218
  raise RuntimeError("No suitable data file found after download")
149
219
 
150
220
  # prioritize common tabular formats
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
152
222
  for ext in priorities:
153
223
  for c in candidates:
154
224
  if c.lower().endswith(ext):
155
- return c
156
- return candidates[0]
225
+ return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
226
+ return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
157
227
 
158
228
 
159
229
  def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
174
244
 
175
245
  # unzip in place, remove zip for convenience
176
246
  api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
177
- best_file = _pick_best_file(target_dir)
247
+ artifact = _pick_best_file(target_dir)
178
248
  return {
179
249
  "ok": True,
180
250
  "dataset_id": dataset_ref,
181
251
  "target_dir": target_dir,
182
- "local_path": best_file,
252
+ "local_path": artifact["local_path"],
253
+ "dataset_kind": artifact["dataset_kind"],
254
+ "image_count": artifact.get("image_count", 0),
183
255
  }
184
256
  except Exception as e:
185
257
  msg = str(e)
@@ -0,0 +1,83 @@
1
+ """
2
+ Normalize any supported dataset file to parquet format.
3
+ Usage: normalize_engine.py <input_path> <output_path>
4
+ Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
5
+ """
6
+ import sys
7
+ import json
8
+ import os
9
+
10
+ try:
11
+ import polars as pl
12
+ except Exception:
13
+ print(json.dumps({"ok": False, "error": "polars is required"}))
14
+ sys.exit(1)
15
+
16
+
17
def _load(src: str) -> pl.DataFrame:
    """Read ``src`` into a polars DataFrame, picking the reader from the file extension.

    Supported extensions: .tsv/.tab (tab-separated), .parquet/.pq,
    .feather/.ftr/.arrow/.ipc, .jsonl/.ndjson and .json. Everything else,
    including .csv and .txt, is parsed as comma-separated text.

    A ``.json`` file is handled in this order:
      1. Text starting with ``[`` is read as a JSON array of records.
      2. A single JSON object is searched for a list under a well-known
         wrapper key, then for the first non-empty list of dicts.
      3. Text that is not one JSON document but whose first line starts with
         ``{`` is read as newline-delimited JSON.

    Raises:
        json.JSONDecodeError: for a ``.json`` file that is neither valid JSON
            nor NDJSON-shaped.
        Exception: whatever the underlying polars reader raises.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager so the handle is closed even if reading fails.
        with open(src, "r", encoding="utf-8") as handle:
            raw = handle.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        try:
            # Parse as a single document first: a pretty-printed wrapper object
            # also has "{" on its first line and must not be mistaken for NDJSON.
            obj = json.loads(raw)
        except json.JSONDecodeError:
            first_line = raw.split("\n", 1)[0].strip()
            if "\n" in raw and first_line.startswith("{"):
                # Several top-level objects: NDJSON saved with a .json suffix.
                return pl.read_ndjson(src)
            raise
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take the first non-empty list of dicts.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)

    # .csv, .txt and unrecognised extensions: try comma-separated text.
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
53
+
54
+
55
def normalize(input_path: str, output_path: str) -> int:
    """Load ``input_path`` with ``_load`` and write it to ``output_path`` as parquet.

    Missing parent directories of ``output_path`` are created. A bare file
    name (no directory component) is written relative to the current working
    directory.

    Returns:
        The ``height`` of the loaded frame (its row count).
    """
    df = _load(input_path)
    parent = os.path.dirname(output_path)
    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
60
+
61
+
62
def main():
    """Command-line entry point: ``normalize_engine.py <input_path> <output_path>``.

    Prints exactly one JSON object to stdout. On success it holds ``ok: true``,
    the output path and the row count. On any failure (too few arguments,
    missing input file, or an exception from ``normalize``) it holds
    ``ok: false`` and an ``error`` message, and the process exits with status 1.
    """

    def _fail(message: str) -> None:
        # Report the failure as JSON, then stop with a non-zero status.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    args = sys.argv[1:]
    if len(args) < 2:
        _fail("Usage: normalize_engine.py <input> <output>")

    input_path, output_path = args[0], args[1]

    if not os.path.exists(input_path):
        _fail(f"File not found: {input_path}")

    try:
        rows = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
    except Exception as e:
        _fail(str(e))
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -191,6 +191,7 @@ class AssetDownloader:
191
191
  kaggle_ref: Optional[str] = None,
192
192
  urls: Optional[List[str]] = None,
193
193
  output_format: str = "webdataset",
194
+ output_dir: Optional[str] = None,
194
195
  max_items: Optional[int] = None,
195
196
  image_column: Optional[str] = None,
196
197
  ) -> Dict[str, Any]:
@@ -231,7 +232,10 @@ class AssetDownloader:
231
232
  raise ValueError("urls are required for source=url")
232
233
 
233
234
  # --- Now safe to create directories ---
234
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
+ if output_dir:
236
+ dataset_dir = Path(output_dir).expanduser().resolve()
237
+ else:
238
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
239
  images_dir = dataset_dir / "images"
236
240
  dataset_dir.mkdir(parents=True, exist_ok=True)
237
241
  images_dir.mkdir(parents=True, exist_ok=True)
@@ -0,0 +1,104 @@
1
+ import { CacheService, MockRedisProvider } from "../cache/service.js";
2
+ import { ArxivSource } from "../metadata/arxiv-source.js";
3
+ import { GithubSource } from "../metadata/github-source.js";
4
+ import { SemanticScholarSource } from "../metadata/semantic-scholar-source.js";
5
+ import { HackerNewsSource } from "../metadata/hackernews-source.js";
6
+ import { WebCoreEngine } from "../web/web-core.js";
7
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * Works on a sorted copy, so the caller's array is left untouched. Returns 0
 * for an empty list or when the computed rank falls outside the list.
 *
 * @param values Numbers to rank.
 * @param p Fraction of the distribution, e.g. 0.95 for p95.
 * @returns The value at rank ceil(p * n), counted from 1 and clamped to the first element.
 */
function percentile(values, p) {
    if (values.length === 0) {
        return 0;
    }
    const ascending = [...values];
    ascending.sort((lhs, rhs) => lhs - rhs);
    const rank = Math.ceil(p * ascending.length) - 1;
    const picked = ascending[Math.max(0, rank)];
    return picked ?? 0;
}
14
/**
 * Build the fixed list of 100 test queries.
 *
 * Each query is "<topic> <suffix>". The topic cycles fastest, and the suffix
 * advances once every full pass over the topics, so all 10 x 10 pairs appear
 * exactly once.
 *
 * @returns An array of 100 query strings.
 */
function makeQueries() {
    const topics = [
        "agentic RAG evaluation",
        "tool-augmented retrieval",
        "LLM dataset quality scoring",
        "semantic deduplication embeddings",
        "cross-source dataset fusion",
        "retrieval augmented generation metrics",
        "dataset export parquet arrow jsonl",
        "data safety dataset provenance",
        "synthetic data generation alignment",
        "multi-source corpus building",
    ];
    const suffixes = [
        "paper",
        "benchmark",
        "latency",
        "quality",
        "dedup",
        "provenance",
        "evaluation",
        "pipeline",
        "MCP",
        "agents",
    ];
    const total = 100;
    return Array.from({ length: total }, (_unused, i) => {
        const topic = topics[i % topics.length];
        const suffix = suffixes[Math.floor(i / topics.length) % suffixes.length];
        return `${topic} ${suffix}`.trim();
    });
}
47
/**
 * Phase 1 Web Core validation script.
 *
 * 1. Runs the same query twice and prints the second run's per-source
 *    cache_hit / latency_ms telemetry for arxiv and github.
 * 2. Runs the 100 queries from makeQueries(), collecting every result's
 *    quality_score and the arxiv pdf_extract_ms_total of each run.
 * 3. Prints the p95 of both series and a PASS/FAIL line for each threshold
 *    (quality p95 > 0.9, PDF extraction p95 < 5000 ms).
 *
 * A FAIL is only printed; it does not change the exit status.
 */
async function main() {
    // All four sources share one CacheService built on a MockRedisProvider.
    const cache = new CacheService(new MockRedisProvider());
    const engine = new WebCoreEngine({
        arxivSource: new ArxivSource(cache),
        githubSource: new GithubSource(cache),
        semanticScholarSource: new SemanticScholarSource(cache),
        hackerNewsSource: new HackerNewsSource(cache),
    });
    const baseQuery = "agentic RAG evaluation";
    const flags = { arxiv_full_text: true, github_include_readme: true };
    const sources = ["arxiv", "github"];

    // Small local helpers so every request and telemetry lookup is built the same way.
    const search = (query) => engine.find({ query, sources: [...sources], limit: 2, ...flags });
    const telemetryOf = (response, source) => response.telemetry?.per_source.find((entry) => entry.source === source);
    const cacheSummary = (entry) => (entry ? { cache_hit: entry.cache_hit, latency_ms: entry.latency_ms } : null);

    console.log("Phase 1 Web Core validation:");
    console.log("- Checking cached latency regression on a single query...");
    // First pass: its result is not inspected (presumably it populates the cache for run 2).
    await search(baseQuery);
    const run2 = await search(baseQuery);
    console.log("Cached telemetry (run2):", {
        arxiv: cacheSummary(telemetryOf(run2, "arxiv")),
        github: cacheSummary(telemetryOf(run2, "github")),
    });

    // 100 query quality distribution test
    console.log("- Running 100 test queries (quality distribution + extraction latency)...");
    const queries = makeQueries();
    const qualityScores = [];
    const pdfExtractMs = [];
    for (const [position, text] of queries.entries()) {
        const response = await search(text);
        for (const doc of response.results) {
            qualityScores.push(Number(doc.quality_score));
        }
        const arxivTelemetry = telemetryOf(response, "arxiv");
        if (arxivTelemetry) {
            pdfExtractMs.push(Number(arxivTelemetry.pdf_extract_ms_total || 0));
        }
        const done = position + 1;
        if (done % 10 === 0) {
            console.log(`  progress: ${done}/100`);
        }
    }

    const q95 = percentile(qualityScores, 0.95);
    const pdfP95 = percentile(pdfExtractMs, 0.95);
    console.log("\nResults:");
    console.log(`- Quality score p95: ${q95}`);
    console.log(`- PDF extract ms p95 (arxiv): ${pdfP95}`);

    const verdict = (passed) => (passed ? "PASS" : "FAIL");
    const okQuality = q95 > 0.9;
    const okPdfLatency = pdfP95 < 5000;
    console.log(`\nSuccess criteria:`);
    console.log(`- Quality p95 > 0.9: ${verdict(okQuality)}`);
    console.log(`- PDF extract p95 < 5000ms additional: ${verdict(okPdfLatency)}`);
    if (!okQuality) {
        console.log("Tip: adjust estimateQualityScore() weights/thresholds in src/metadata/quality.ts then rerun.");
    }
}

// Any unhandled failure is printed and turns into exit status 1.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});
@@ -1,4 +1,5 @@
1
1
  import { JITOrchestrator } from "./jit-orchestrator.js";
2
+ import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
2
3
  import fs from "fs";
3
4
  function log(msg) {
4
5
  fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -17,9 +18,10 @@ export class SearchEngine {
17
18
  async search(query, options = {}) {
18
19
  const limit = options.limit || 5;
19
20
  const enableJIT = options.enableJIT !== false; // Default: true
21
+ const intent = await analyzeDatasetQuery(query);
20
22
  log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
21
23
  // 1. Perform local search
22
- const localResults = await this.localSearch(query, options);
24
+ const localResults = await this.localSearch(query, options, intent);
23
25
  // 2. Check if JIT should be triggered
24
26
  const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
25
27
  if (!shouldTrigger) {
@@ -28,10 +30,10 @@ export class SearchEngine {
28
30
  }
29
31
  // 3. Trigger JIT fallback
30
32
  console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
31
- await this.jitOrchestrator.fetchAndIngest(query, 10);
33
+ await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
32
34
  // 4. Re-run local search with updated index
33
35
  console.error(`Re-searching with updated library...`);
34
- const enhancedResults = await this.localSearch(query, options);
36
+ const enhancedResults = await this.localSearch(query, options, intent);
35
37
  const newCount = enhancedResults.length - localResults.length;
36
38
  if (newCount > 0) {
37
39
  console.error(`Found ${newCount} additional results\n`);
@@ -41,7 +43,7 @@ export class SearchEngine {
41
43
  /**
42
44
  * Perform hybrid search (Vector + Lexical + Penalties)
43
45
  */
44
- async localSearch(query, options) {
46
+ async localSearch(query, options, intent) {
45
47
  const limit = options.limit || 5;
46
48
  // 1. Parse Query
47
49
  const words = query.toLowerCase().split(/\s+/);
@@ -66,6 +68,12 @@ export class SearchEngine {
66
68
  // Filter: Safe only
67
69
  if (options.safeOnly && metadata.license.category === "restricted")
68
70
  continue;
71
+ // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
72
+ // when user explicitly requested a single language
73
+ if (shouldExcludeByLanguage(metadata, intent)) {
74
+ log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
75
+ continue;
76
+ }
69
77
  const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
70
78
  // Filter: Explicit Negative Keywords
71
79
  if (negativeKeywords.some(neg => text.includes(neg))) {
@@ -96,6 +104,34 @@ export class SearchEngine {
96
104
  if (lexicalScore === 0 && positiveKeywords.length > 1) {
97
105
  penalty += 0.2;
98
106
  }
107
+ // Penalty: Modality Mismatch
108
+ // Infer the expected modality from the query and penalize mismatches.
109
+ // e.g., "anime quotes" is text, so image-classification datasets get penalized.
110
+ const textIndicators = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
111
+ const imageIndicators = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
112
+ const queryLower = query.toLowerCase();
113
+ const queryLooksText = textIndicators.some(t => queryLower.includes(t));
114
+ const queryLooksImage = imageIndicators.some(t => queryLower.includes(t));
115
+ if (queryLooksText && !queryLooksImage) {
116
+ const resultTask = (metadata.task || "").toLowerCase();
117
+ const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
118
+ text.includes("image classification") || text.includes("image-classification") ||
119
+ text.includes("object detection") || text.includes("image segmentation");
120
+ if (isImageResult) {
121
+ log(`Modality penalty: text query but image dataset ${match.id}`);
122
+ penalty += 0.35;
123
+ }
124
+ }
125
+ if (queryLooksImage && !queryLooksText) {
126
+ const resultTask = (metadata.task || "").toLowerCase();
127
+ const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
128
+ resultTask.includes("translation") || resultTask.includes("summarization") ||
129
+ resultTask.includes("question-answering");
130
+ if (isTextResult) {
131
+ log(`Modality penalty: image query but text dataset ${match.id}`);
132
+ penalty += 0.35;
133
+ }
134
+ }
99
135
  // D. Accessibility Bonuses (Prioritize low-friction sources)
100
136
  let bonus = 0;
101
137
  const sourceBonuses = {
@@ -103,16 +139,19 @@ export class SearchEngine {
103
139
  "uci": 0.1,
104
140
  "github": 0.1,
105
141
  "worldbank": 0.1,
106
- "nasa": 0.1
142
+ "nasa": 0.1,
143
+ "arxiv": 0.1
107
144
  };
108
145
  bonus = sourceBonuses[metadata.source] || 0;
109
146
  // Final Combined Score
110
147
  // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
111
- const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus;
148
+ const intentScore = scoreDatasetAgainstIntent(metadata, intent);
149
+ const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
112
150
  metadata.relevance_score = Math.round(finalScore * 100) / 100;
113
151
  metadata.vector_score = Math.round(vectorScore * 100) / 100;
114
152
  metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
115
153
  metadata.accessibility_bonus = bonus;
154
+ metadata.intent_score = intentScore;
116
155
  results.push(metadata);
117
156
  }
118
157
  // Sort by final score and limit
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
5
6
  // Common stop words to filter out for better search
6
7
  const STOP_WORDS = new Set([
7
8
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
@@ -61,7 +62,7 @@ export class JITOrchestrator {
61
62
  /**
62
63
  * Main JIT workflow: fetch, save, index, return new datasets
63
64
  */
64
- async fetchAndIngest(query, limit = 10) {
65
+ async fetchAndIngest(query, limit = 10, providedIntent) {
65
66
  // Rate limiting check
66
67
  if (!this.canTrigger(query)) {
67
68
  console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
@@ -69,9 +70,12 @@ export class JITOrchestrator {
69
70
  }
70
71
  console.error(`\n[JIT] Searching live sources for: "${query}"`);
71
72
  this.lastTriggerTime.set(query, Date.now());
72
- // Simplify query for better API results
73
- const keywords = this.simplifyQuery(query);
74
- if (keywords.length > 0) {
73
+ const intent = providedIntent || await analyzeDatasetQuery(query);
74
+ const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
75
+ if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
76
+ console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
77
+ }
78
+ else if (keywords.length > 0) {
75
79
  console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
76
80
  }
77
81
  const newDatasets = [];
@@ -81,15 +85,16 @@ export class JITOrchestrator {
81
85
  // Get existing dataset IDs to avoid duplicates
82
86
  const existing = this.metadataStore.getAllDatasets();
83
87
  existing.forEach(ds => existingIds.add(ds.id));
84
- // 1. Scrape HuggingFace - try each keyword separately for better results
85
- let hfResults = [];
86
- for (const keyword of keywords) {
87
- if (hfResults.length >= limit)
88
- break;
89
- const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
- for (const ds of results) {
91
- if (!hfResults.some(existing => existing.id === ds.id)) {
92
- hfResults.push(ds);
88
+ let hfResults = await this.scrapeHuggingFace(intent, limit);
89
+ if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
90
+ for (const keyword of keywords) {
91
+ if (hfResults.length >= limit)
92
+ break;
93
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
94
+ for (const ds of results) {
95
+ if (!hfResults.some(existing => existing.id === ds.id)) {
96
+ hfResults.push(ds);
97
+ }
93
98
  }
94
99
  }
95
100
  }
@@ -170,7 +175,6 @@ export class JITOrchestrator {
170
175
  async scrapeHuggingFace(query, limit) {
171
176
  const scraper = new HuggingFaceScraper();
172
177
  try {
173
- // Pass the query as a general search term
174
178
  return await scraper.scrape(limit, true, query);
175
179
  }
176
180
  catch (error) {