pi-research 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ import {
13
13
  buildFallbackQueries,
14
14
  buildFastQueries,
15
15
  buildFollowUpQuery,
16
+ buildActionBasedFollowUpQuery,
16
17
  buildJinaReaderUrl,
17
18
  classifySourceType,
18
19
  compactResearchPayload,
@@ -35,8 +36,8 @@ import {
35
36
  scoreSourceEntry,
36
37
  selectRelevantChunks,
37
38
  } from "./research.js";
38
- import { pageFetchAdapter } from "./page-fetch-adapter.js";
39
- import { pageQualitySignals } from "./research-policy.js";
39
+ import { getScraplingRuntimeStatus, pageFetchAdapter } from "./page-fetch-adapter.js";
40
+ import { isUsableContent, pageQualitySignals } from "./research-policy.js";
40
41
  import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
41
42
  import { planResearch } from "./planner.js";
42
43
  import {
@@ -53,6 +54,7 @@ const USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/
53
54
  const MIN_PAGE_TEXT = 300;
54
55
  const SEARCH_CACHE_TTL_MS = 5 * 60 * 1000;
55
56
  const PAGE_CACHE_TTL_MS = 30 * 60 * 1000;
57
+ const EXPENSIVE_PAGE_CACHE_TTL_MS = 7 * 24 * 60 * 60 * 1000;
56
58
  const searchCache = new Map();
57
59
  const pageCache = new Map();
58
60
 
@@ -71,6 +73,10 @@ function setCacheValue(cache, key, value, ttlMs) {
71
73
  return value;
72
74
  }
73
75
 
76
/**
 * Choose how long a fetched page may stay in the page cache.
 *
 * @param {object|null|undefined} page - Fetched page; only its `expensive` flag is read.
 * @returns {number} EXPENSIVE_PAGE_CACHE_TTL_MS when `page.expensive` is truthy,
 *   PAGE_CACHE_TTL_MS otherwise (including when `page` is null or undefined).
 */
function pageCacheTtl(page) {
  if (page?.expensive) {
    return EXPENSIVE_PAGE_CACHE_TTL_MS;
  }
  return PAGE_CACHE_TTL_MS;
}
79
+
74
80
  function hashText(text) {
75
81
  return createHash("sha1").update(String(text || "")).digest("hex");
76
82
  }
@@ -165,6 +171,7 @@ export async function buildQueries(query, mode = "fast", ctx, signal) {
165
171
  const hintedQueries = Array.isArray(config.queryHints) && config.queryHints.length
166
172
  ? config.queryHints.map((hint) => `${query} ${hint}`)
167
173
  : [];
174
+
168
175
  if (config.mode === "code") {
169
176
  return [...new Set([...planResearch(query, "code").subqueries, ...hintedQueries])].slice(0, config.maxQueries);
170
177
  }
@@ -208,6 +215,7 @@ async function fetchTextWithRetry(url, signal, attempts = 2, headers = {
208
215
  return response;
209
216
  } catch (error) {
210
217
  lastError = error;
218
+ if (signal?.aborted || error?.name === "AbortError" || error?.name === "TimeoutError") throw error;
211
219
  if (attempt + 1 < attempts) await new Promise((resolve) => setTimeout(resolve, 100 * (attempt + 1)));
212
220
  }
213
221
  }
@@ -428,7 +436,7 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
428
436
  if (shouldUseJinaFirst(url)) {
429
437
  const first = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
430
438
  if (first && withinTimeframe(first, config)) {
431
- const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
439
+ const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, pageCacheTtl(first));
432
440
  await logResearchEvent("fetch_end", { url, via: "jina_first", success: Boolean(page), page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
433
441
  return page;
434
442
  }
@@ -444,7 +452,7 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
444
452
  if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
445
453
  const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType });
446
454
  if (fallback && withinTimeframe(fallback, config)) {
447
- const page = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS);
455
+ const page = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, pageCacheTtl(fallback));
448
456
  await logResearchEvent("fetch_end", { url, via: "unsupported_content_type_fallback", success: Boolean(page), contentType, page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
449
457
  return page;
450
458
  }
@@ -479,27 +487,60 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
479
487
  codeBlocks: scraplingSnapshot.codeBlocks,
480
488
  fetchStatus: scrapling.status ?? 200,
481
489
  contentType: scrapling.contentType || "text/html",
490
+ expensive: true,
482
491
  });
492
+ } else if (assessment?.blocked || assessment?.dynamic) {
493
+ await logResearchEvent("fetch_scrapling_unavailable", { url, mode: assessment.mode, runtime: getScraplingRuntimeStatus?.() || null });
483
494
  }
484
495
  }
485
496
 
486
497
  const resolved = page || await fetchJinaPageSource(url, signal, config);
487
498
  const finalPage = finalizeFetchedPage(resolved, config, { url: response.url || url, status: response.status ?? 200, contentType });
488
499
  const stored = finalPage && withinTimeframe(finalPage, config)
489
- ? (config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS))
500
+ ? (config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, pageCacheTtl(finalPage)))
490
501
  : null;
491
502
  await logResearchEvent("fetch_end", { url, success: Boolean(stored), page: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
492
503
  return stored;
493
504
  } catch (error) {
505
+ if (signal?.aborted || error?.name === "AbortError") {
506
+ await logResearchEvent("fetch_abort", { url });
507
+ return null;
508
+ }
494
509
  const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
495
510
  const stored = fallback && withinTimeframe(fallback, config)
496
- ? (config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS))
511
+ ? (config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, pageCacheTtl(fallback)))
497
512
  : null;
498
513
  await logResearchEvent("fetch_error", { url, error, fallback: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
499
514
  return stored;
500
515
  }
501
516
  }
502
517
 
518
/**
 * Fetch every search result in parallel and cancel the stragglers as soon as
 * enough usable pages have arrived.
 *
 * Each result gets its own AbortController so that fetches can be cancelled
 * individually. A caller abort is propagated to all of them.
 *
 * @param {Array<{url: string}>} results - Search results; only `url` is read here.
 * @param {AbortSignal} [signal] - Optional caller signal; aborting it cancels every fetch.
 * @param {object} config - Research config; `minSources` sets the usable-page target (at least 1).
 * @param {string} query - Query merged into the config passed to fetchPageSource and isUsableContent.
 * @returns {Promise<object[]>} Non-null pages whose fetch returned before its controller
 *   was aborted, in the same order as `results`.
 */
async function speculativeFetch(results, signal, config, query) {
  const target = Math.max(1, config.minSources || 1);
  const controllers = results.map(() => new AbortController());

  // Forward the caller's abort reason (if any) so downstream code that inspects
  // the error name still sees what the caller signalled.
  const abortAll = () => {
    for (const controller of controllers) {
      if (!controller.signal.aborted) controller.abort(signal?.reason);
    }
  };

  // An already-aborted signal never fires "abort" again, so handle it up front.
  if (signal?.aborted) abortAll();
  else signal?.addEventListener("abort", abortAll, { once: true });

  let usableCount = 0;
  try {
    const pages = await Promise.all(results.map(async (result, index) => {
      // The per-result controller already mirrors the caller signal via abortAll,
      // so it is the only signal the fetch needs to observe.
      const scopedSignal = controllers[index].signal;
      const page = await fetchPageSource(result.url, scopedSignal, { ...config, query });
      if (scopedSignal.aborted || !page) return null;
      if (isUsableContent(page, { ...config, query })) {
        usableCount += 1;
        if (usableCount >= target) {
          // Target reached: cancel every other fetch, keep this one.
          controllers.forEach((controller, controllerIndex) => {
            if (controllerIndex !== index && !controller.signal.aborted) controller.abort();
          });
        }
      }
      return page;
    }));
    return pages.filter(Boolean);
  } catch (error) {
    // One fetch rejected: stop the remaining in-flight fetches instead of leaving
    // them running with nobody awaiting their result.
    abortAll();
    throw error;
  } finally {
    // Always detach from the caller's signal, even when a fetch rejected.
    signal?.removeEventListener("abort", abortAll);
  }
}
543
+
503
544
  async function readLocalFiles(paths, config) {
504
545
  const pages = [];
505
546
  for (const path of paths) {
@@ -620,8 +661,72 @@ function modeCacheKey(query, config) {
620
661
  }))}`;
621
662
  }
622
663
 
664
+ import {
665
+ applyConflictTinyRouterDecision,
666
+ applySufficiencyTinyRouterDecision,
667
+ chooseTinyRouterDomain,
668
+ classifyConflictWithTinyRouter,
669
+ classifyDomainWithTinyRouter,
670
+ classifyFollowupWithTinyRouter,
671
+ classifySufficiencyWithTinyRouter,
672
+ } from "./tiny-router.js";
673
+
674
/**
 * Translate a structured sufficiency decision into the label of the aspect
 * that is still missing.
 *
 * @param {string} decision - Decision code such as "need_authority".
 * @returns {string|null} The missing-aspect label, or null for any decision
 *   that is not one of the recognised "need_*" codes.
 */
function missingAspectFromStructuredDecision(decision) {
  switch (decision) {
    case "need_authority":
      return "authoritative sources";
    case "need_more_sources":
      return "readable sources";
    case "need_recency":
      return "recent sources";
    case "need_version_context":
      return "version context";
    case "need_conflict_resolution":
      return "conflict resolution";
    default:
      return null;
  }
}
682
+
683
/**
 * Fold a structured "not sufficient" decision into a sufficiency record.
 *
 * A missing decision or the literal "sufficient" returns the input untouched.
 * Any other decision yields a copy marked `sufficient: false`, with the
 * decision's missing-aspect label and action-based follow-up query appended
 * (de-duplicated) to `missingAspects` and `openSubQuestions` when they exist.
 *
 * @param {object} sufficiency - Current sufficiency record.
 * @param {string} [decision] - Structured decision code.
 * @param {string} query - Original research query.
 * @param {string[]} [seenUrls=[]] - URLs passed through to the follow-up query builder.
 * @returns {object} The original record or the updated copy.
 */
function withStructuredSufficiencyDecision(sufficiency, decision, query, seenUrls = []) {
  if (!decision || decision === "sufficient") return sufficiency;

  // Append one value without duplicates; with nothing to append the list is passed through as-is.
  const appendUnique = (list, extra) => (extra ? [...new Set([...(list || []), extra])] : list);

  const aspect = missingAspectFromStructuredDecision(decision);
  const nextQuery = buildActionBasedFollowUpQuery(query, decision, { seenUrls });

  const updated = { ...sufficiency, sufficient: false };
  updated.missingAspects = appendUnique(sufficiency.missingAspects, aspect);
  updated.openSubQuestions = appendUnique(sufficiency.openSubQuestions, nextQuery);
  return updated;
}
701
+
702
/**
 * Decide the question domain for a query, letting the tiny router override the
 * heuristic classifier only when chooseTinyRouterDomain accepts its prediction.
 *
 * Exactly one outcome event is logged per call: "tiny_router_domain" when the
 * prediction is accepted, otherwise a single "tiny_router_fallback" whose
 * `reason` says why the heuristic domain was kept.
 *
 * @param {string} query - Research query to classify.
 * @param {string|object} mode - Mode name, or an options object whose `mode` is used ("fast" if absent).
 * @param {AbortSignal} [signal] - Passed through to the tiny router call.
 * @returns {Promise<string>} The accepted router domain, or the heuristic domain on
 *   rejection, unavailability, or error.
 */
async function resolveQuestionDomain(query, mode, signal) {
  const fallback = classifyQuestionDomain(query);
  const normalizedMode = typeof mode === "object" ? mode?.mode || "fast" : mode;
  try {
    const tinyStartedAt = Date.now();
    const tinyDomain = await classifyDomainWithTinyRouter(query, normalizedMode, signal);
    const tinyLatencyMs = Date.now() - tinyStartedAt;
    await logResearchEvent("tiny_router_latency", { task: "domain", latencyMs: tinyLatencyMs, accepted: Boolean(tinyDomain) });

    const domain = chooseTinyRouterDomain(fallback, tinyDomain);
    if (tinyDomain && domain !== fallback) {
      await logResearchEvent("tiny_router_domain", { query, mode: normalizedMode, heuristicDomain: fallback, predictedDomain: tinyDomain, acceptedDomain: domain });
      return domain;
    }

    // From here on the heuristic domain is kept. A differing prediction that was
    // rejected gets its own reason; return right away so the same decision is not
    // also reported as "heuristic_kept".
    if (tinyDomain && tinyDomain !== fallback) {
      await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, predictedDomain: tinyDomain, reason: "high_risk_not_downgraded" });
      return fallback;
    }

    await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, reason: tinyDomain ? "heuristic_kept" : "tiny_router_unavailable_or_low_confidence" });
    return fallback;
  } catch (error) {
    // Router failures must never block research: report and use the heuristic.
    await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, reason: "error", error });
    return fallback;
  }
}
727
+
623
728
  export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast") {
624
- const domain = classifyQuestionDomain(query);
729
+ const domain = await resolveQuestionDomain(query, mode, signal);
625
730
  const config = getResearchConfig(typeof mode === "object" ? { ...mode, domain } : { mode, domain });
626
731
  const cacheKey = modeCacheKey(query, config);
627
732
 
@@ -699,6 +804,9 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
699
804
  } else {
700
805
  lastEmptySearchSignature = null;
701
806
  }
807
+ const fetchWindow = config.mode === "fast"
808
+ ? Math.max(config.maxPages, Math.min(config.maxPages * 2, (config.minSources || 3) + 2))
809
+ : config.maxPages;
702
810
  const results = rankSearchResults(flatResults, query, config.maxPages * 2, config)
703
811
  .filter((result) => {
704
812
  const key = normalizeUrl(result.url);
@@ -706,10 +814,12 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
706
814
  seenUrls.add(key);
707
815
  return true;
708
816
  })
709
- .slice(0, config.maxPages);
817
+ .slice(0, fetchWindow);
710
818
 
711
819
  emit("fetch", `Reading ${results.length} sources...`);
712
- const pageCandidates = await Promise.all(results.map((result) => fetchPageSource(result.url, signal, { ...config, query })));
820
+ const pageCandidates = config.mode === "fast"
821
+ ? await speculativeFetch(results, signal, { ...config, minSources: config.minSources || 3 }, query)
822
+ : await Promise.all(results.map((result) => fetchPageSource(result.url, signal, { ...config, query })));
713
823
  await logResearchEvent("page_fetch_results", {
714
824
  query,
715
825
  urls: results.map((result) => result.url),
@@ -742,6 +852,22 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
742
852
  conflictSummary = conflict.conflictSummary || "";
743
853
  conflictingSourcePairs = conflict.conflictingSourcePairs || [];
744
854
 
855
+ const structuredConflictStartedAt = Date.now();
856
+ const structuredConflictDecision = await classifyConflictWithTinyRouter(query, mergedPages, signal);
857
+ if (structuredConflictDecision) {
858
+ await logResearchEvent("tiny_router_latency", { task: "conflict", latencyMs: Date.now() - structuredConflictStartedAt, accepted: true });
859
+ const nextConflictDetected = applyConflictTinyRouterDecision(
860
+ conflictDetected,
861
+ structuredConflictDecision,
862
+ { allowClear: process.env.PI_RESEARCH_TINY_ROUTER_CONFLICT_ALLOW_CLEAR === "1" || process.env.PI_RESEARCH_TINY_ROUTER_CONFLICT_ALLOW_CLEAR === "true" },
863
+ );
864
+ if (nextConflictDetected !== conflictDetected) {
865
+ conflictDetected = nextConflictDetected;
866
+ if (conflictDetected && !conflictSummary) conflictSummary = `Structured router flagged ${query} for conflict review.`;
867
+ }
868
+ await logResearchEvent("tiny_router_structured_decision", { task: "conflict", query, decision: structuredConflictDecision, heuristicConflictDetected: conflict.detected, finalConflictDetected: conflictDetected });
869
+ }
870
+
745
871
  const minSources = config.mode === "fast"
746
872
  ? (mergedPages.some((page) => page.authoritative) ? 1 : Math.max(3, config.minSources || 3))
747
873
  : (config.minSources || 3);
@@ -757,10 +883,45 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
757
883
  sufficiency = { ...sufficiency, sufficient: true };
758
884
  }
759
885
 
886
+ const structuredSufficiencyStartedAt = Date.now();
887
+ const structuredSufficiencyDecision = await classifySufficiencyWithTinyRouter(query, mergedPages, signal);
888
+ if (structuredSufficiencyDecision) {
889
+ const heuristicSufficient = sufficiency.sufficient;
890
+ await logResearchEvent("tiny_router_latency", { task: "sufficiency", latencyMs: Date.now() - structuredSufficiencyStartedAt, accepted: true });
891
+ const finalSufficient = applySufficiencyTinyRouterDecision(heuristicSufficient, structuredSufficiencyDecision);
892
+ if (finalSufficient !== heuristicSufficient) {
893
+ sufficiency = withStructuredSufficiencyDecision(sufficiency, structuredSufficiencyDecision, query, mergedPages.map((page) => page.url));
894
+ }
895
+ await logResearchEvent("tiny_router_structured_decision", { task: "sufficiency", query, decision: structuredSufficiencyDecision, heuristicSufficient, finalSufficient });
896
+ sufficiency = { ...sufficiency, sufficient: finalSufficient };
897
+ }
898
+
760
899
  if (sufficiency.sufficient || turn === (config.maxTurns - 1)) break;
761
900
 
762
901
  followupRounds += 1;
763
- followupQuery = buildFollowUpQuery(query, mergedPages);
902
+
903
+ const conflictState = conflictDetected ? (mergedPages.some(p => p.authoritative) ? "minor" : "severe") : "none";
904
+ const sourcesMeta = {
905
+ has_authority: mergedPages.some(p => p.authoritative),
906
+ has_forum: mergedPages.some(p => p.sourceType === "forum" || /forum|reddit|stack/i.test(p.url)),
907
+ has_news: mergedPages.some(p => p.sourceType === "news" || /news|blog|article/i.test(p.url)),
908
+ has_recent: mergedPages.some(p => p.freshness === "recent" || p.freshness === "current_year"),
909
+ source_count: mergedPages.length
910
+ };
911
+
912
+ const action = await classifyFollowupWithTinyRouter(query, config.mode, conflictState, sourcesMeta, signal);
913
+
914
+ if (action === "stop") {
915
+ await logResearchEvent("tiny_router_stop", { query, reason: "router_suggested_stop" });
916
+ break;
917
+ }
918
+
919
+ if (!action) {
920
+ followupQuery = buildFollowUpQuery(query, mergedPages, { seenUrls: mergedPages.map((page) => page.url) });
921
+ } else {
922
+ followupQuery = buildActionBasedFollowUpQuery(query, action, { seenUrls: mergedPages.map((page) => page.url) });
923
+ }
924
+
764
925
  currentQueries = planSubqueries(query, followupQuery, config, sufficiency);
765
926
  subqueries = [...new Set([...subqueries, ...currentQueries])];
766
927
  }
package/package.json CHANGED
@@ -1,14 +1,14 @@
1
1
  {
2
2
  "name": "pi-research",
3
- "version": "1.3.0",
3
+ "version": "1.4.0",
4
4
  "private": false,
5
5
  "type": "module",
6
- "description": "Pi extension for web research.",
6
+ "description": "Zero-setup grounded web research for AI coding agents.",
7
7
  "license": "MIT",
8
8
  "main": "./index.js",
9
9
  "bin": {
10
- "pi-research": "./pi-research.js",
11
- "unblind-mcp": "./unblind-mcp.js"
10
+ "pi-research": "./bin/pi-research.js",
11
+ "unblind-mcp": "./bin/unblind-mcp.js"
12
12
  },
13
13
  "files": [
14
14
  "bin",