pi-research 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -250
- package/lib/page-fetch-adapter.js +311 -64
- package/lib/research-policy.js +36 -15
- package/lib/research-profiles.json +4 -0
- package/lib/research.js +15 -6
- package/lib/router-annotation.js +192 -0
- package/lib/router-structured-features.js +134 -0
- package/lib/tiny-router.js +338 -0
- package/lib/web-research.js +171 -10
- package/package.json +2 -2
package/lib/web-research.js
CHANGED
|
@@ -13,6 +13,7 @@ import {
|
|
|
13
13
|
buildFallbackQueries,
|
|
14
14
|
buildFastQueries,
|
|
15
15
|
buildFollowUpQuery,
|
|
16
|
+
buildActionBasedFollowUpQuery,
|
|
16
17
|
buildJinaReaderUrl,
|
|
17
18
|
classifySourceType,
|
|
18
19
|
compactResearchPayload,
|
|
@@ -35,8 +36,8 @@ import {
|
|
|
35
36
|
scoreSourceEntry,
|
|
36
37
|
selectRelevantChunks,
|
|
37
38
|
} from "./research.js";
|
|
38
|
-
import { pageFetchAdapter } from "./page-fetch-adapter.js";
|
|
39
|
-
import { pageQualitySignals } from "./research-policy.js";
|
|
39
|
+
import { getScraplingRuntimeStatus, pageFetchAdapter } from "./page-fetch-adapter.js";
|
|
40
|
+
import { isUsableContent, pageQualitySignals } from "./research-policy.js";
|
|
40
41
|
import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
|
|
41
42
|
import { planResearch } from "./planner.js";
|
|
42
43
|
import {
|
|
@@ -53,6 +54,7 @@ const USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/
|
|
|
53
54
|
const MIN_PAGE_TEXT = 300;
|
|
54
55
|
const SEARCH_CACHE_TTL_MS = 5 * 60 * 1000;
|
|
55
56
|
const PAGE_CACHE_TTL_MS = 30 * 60 * 1000;
|
|
57
|
+
const EXPENSIVE_PAGE_CACHE_TTL_MS = 7 * 24 * 60 * 60 * 1000;
|
|
56
58
|
const searchCache = new Map();
|
|
57
59
|
const pageCache = new Map();
|
|
58
60
|
|
|
@@ -71,6 +73,10 @@ function setCacheValue(cache, key, value, ttlMs) {
|
|
|
71
73
|
return value;
|
|
72
74
|
}
|
|
73
75
|
|
|
76
|
+
function pageCacheTtl(page) {
|
|
77
|
+
return page?.expensive ? EXPENSIVE_PAGE_CACHE_TTL_MS : PAGE_CACHE_TTL_MS;
|
|
78
|
+
}
|
|
79
|
+
|
|
74
80
|
function hashText(text) {
|
|
75
81
|
return createHash("sha1").update(String(text || "")).digest("hex");
|
|
76
82
|
}
|
|
@@ -165,6 +171,7 @@ export async function buildQueries(query, mode = "fast", ctx, signal) {
|
|
|
165
171
|
const hintedQueries = Array.isArray(config.queryHints) && config.queryHints.length
|
|
166
172
|
? config.queryHints.map((hint) => `${query} ${hint}`)
|
|
167
173
|
: [];
|
|
174
|
+
|
|
168
175
|
if (config.mode === "code") {
|
|
169
176
|
return [...new Set([...planResearch(query, "code").subqueries, ...hintedQueries])].slice(0, config.maxQueries);
|
|
170
177
|
}
|
|
@@ -208,6 +215,7 @@ async function fetchTextWithRetry(url, signal, attempts = 2, headers = {
|
|
|
208
215
|
return response;
|
|
209
216
|
} catch (error) {
|
|
210
217
|
lastError = error;
|
|
218
|
+
if (signal?.aborted || error?.name === "AbortError" || error?.name === "TimeoutError") throw error;
|
|
211
219
|
if (attempt + 1 < attempts) await new Promise((resolve) => setTimeout(resolve, 100 * (attempt + 1)));
|
|
212
220
|
}
|
|
213
221
|
}
|
|
@@ -428,7 +436,7 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
428
436
|
if (shouldUseJinaFirst(url)) {
|
|
429
437
|
const first = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
|
|
430
438
|
if (first && withinTimeframe(first, config)) {
|
|
431
|
-
const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
|
|
439
|
+
const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, pageCacheTtl(first));
|
|
432
440
|
await logResearchEvent("fetch_end", { url, via: "jina_first", success: Boolean(page), page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
|
|
433
441
|
return page;
|
|
434
442
|
}
|
|
@@ -444,7 +452,7 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
444
452
|
if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
|
|
445
453
|
const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType });
|
|
446
454
|
if (fallback && withinTimeframe(fallback, config)) {
|
|
447
|
-
const page = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS);
|
|
455
|
+
const page = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, pageCacheTtl(fallback));
|
|
448
456
|
await logResearchEvent("fetch_end", { url, via: "unsupported_content_type_fallback", success: Boolean(page), contentType, page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
|
|
449
457
|
return page;
|
|
450
458
|
}
|
|
@@ -479,27 +487,60 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
479
487
|
codeBlocks: scraplingSnapshot.codeBlocks,
|
|
480
488
|
fetchStatus: scrapling.status ?? 200,
|
|
481
489
|
contentType: scrapling.contentType || "text/html",
|
|
490
|
+
expensive: true,
|
|
482
491
|
});
|
|
492
|
+
} else if (assessment?.blocked || assessment?.dynamic) {
|
|
493
|
+
await logResearchEvent("fetch_scrapling_unavailable", { url, mode: assessment.mode, runtime: getScraplingRuntimeStatus?.() || null });
|
|
483
494
|
}
|
|
484
495
|
}
|
|
485
496
|
|
|
486
497
|
const resolved = page || await fetchJinaPageSource(url, signal, config);
|
|
487
498
|
const finalPage = finalizeFetchedPage(resolved, config, { url: response.url || url, status: response.status ?? 200, contentType });
|
|
488
499
|
const stored = finalPage && withinTimeframe(finalPage, config)
|
|
489
|
-
? (config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS))
|
|
500
|
+
? (config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, pageCacheTtl(finalPage)))
|
|
490
501
|
: null;
|
|
491
502
|
await logResearchEvent("fetch_end", { url, success: Boolean(stored), page: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
|
|
492
503
|
return stored;
|
|
493
504
|
} catch (error) {
|
|
505
|
+
if (signal?.aborted || error?.name === "AbortError") {
|
|
506
|
+
await logResearchEvent("fetch_abort", { url });
|
|
507
|
+
return null;
|
|
508
|
+
}
|
|
494
509
|
const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
|
|
495
510
|
const stored = fallback && withinTimeframe(fallback, config)
|
|
496
|
-
? (config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS))
|
|
511
|
+
? (config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, pageCacheTtl(fallback)))
|
|
497
512
|
: null;
|
|
498
513
|
await logResearchEvent("fetch_error", { url, error, fallback: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
|
|
499
514
|
return stored;
|
|
500
515
|
}
|
|
501
516
|
}
|
|
502
517
|
|
|
518
|
+
async function speculativeFetch(results, signal, config, query) {
|
|
519
|
+
const target = Math.max(1, config.minSources || 1);
|
|
520
|
+
const controllers = results.map(() => new AbortController());
|
|
521
|
+
const abortAll = () => controllers.forEach((controller) => controller.abort());
|
|
522
|
+
if (signal) signal.addEventListener("abort", abortAll, { once: true });
|
|
523
|
+
|
|
524
|
+
let usableCount = 0;
|
|
525
|
+
const pages = await Promise.all(results.map(async (result, index) => {
|
|
526
|
+
const scopedSignal = signal ? AbortSignal.any([signal, controllers[index].signal]) : controllers[index].signal;
|
|
527
|
+
const page = await fetchPageSource(result.url, scopedSignal, { ...config, query });
|
|
528
|
+
if (scopedSignal.aborted || !page) return null;
|
|
529
|
+
if (isUsableContent(page, { ...config, query })) {
|
|
530
|
+
usableCount += 1;
|
|
531
|
+
if (usableCount >= target) {
|
|
532
|
+
controllers.forEach((controller, controllerIndex) => {
|
|
533
|
+
if (controllerIndex !== index && !controller.signal.aborted) controller.abort();
|
|
534
|
+
});
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
return page;
|
|
538
|
+
}));
|
|
539
|
+
|
|
540
|
+
if (signal) signal.removeEventListener("abort", abortAll);
|
|
541
|
+
return pages.filter(Boolean);
|
|
542
|
+
}
|
|
543
|
+
|
|
503
544
|
async function readLocalFiles(paths, config) {
|
|
504
545
|
const pages = [];
|
|
505
546
|
for (const path of paths) {
|
|
@@ -620,8 +661,72 @@ function modeCacheKey(query, config) {
|
|
|
620
661
|
}))}`;
|
|
621
662
|
}
|
|
622
663
|
|
|
664
|
+
import {
|
|
665
|
+
applyConflictTinyRouterDecision,
|
|
666
|
+
applySufficiencyTinyRouterDecision,
|
|
667
|
+
chooseTinyRouterDomain,
|
|
668
|
+
classifyConflictWithTinyRouter,
|
|
669
|
+
classifyDomainWithTinyRouter,
|
|
670
|
+
classifyFollowupWithTinyRouter,
|
|
671
|
+
classifySufficiencyWithTinyRouter,
|
|
672
|
+
} from "./tiny-router.js";
|
|
673
|
+
|
|
674
|
+
function missingAspectFromStructuredDecision(decision) {
|
|
675
|
+
if (decision === "need_authority") return "authoritative sources";
|
|
676
|
+
if (decision === "need_more_sources") return "readable sources";
|
|
677
|
+
if (decision === "need_recency") return "recent sources";
|
|
678
|
+
if (decision === "need_version_context") return "version context";
|
|
679
|
+
if (decision === "need_conflict_resolution") return "conflict resolution";
|
|
680
|
+
return null;
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
function withStructuredSufficiencyDecision(sufficiency, decision, query, seenUrls = []) {
|
|
684
|
+
if (!decision) return sufficiency;
|
|
685
|
+
if (decision === "sufficient") return sufficiency;
|
|
686
|
+
|
|
687
|
+
const missingAspect = missingAspectFromStructuredDecision(decision);
|
|
688
|
+
const followupQuery = buildActionBasedFollowUpQuery(query, decision, { seenUrls });
|
|
689
|
+
|
|
690
|
+
return {
|
|
691
|
+
...sufficiency,
|
|
692
|
+
sufficient: false,
|
|
693
|
+
missingAspects: missingAspect
|
|
694
|
+
? [...new Set([...(sufficiency.missingAspects || []), missingAspect])]
|
|
695
|
+
: sufficiency.missingAspects,
|
|
696
|
+
openSubQuestions: followupQuery
|
|
697
|
+
? [...new Set([...(sufficiency.openSubQuestions || []), followupQuery])]
|
|
698
|
+
: sufficiency.openSubQuestions,
|
|
699
|
+
};
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
async function resolveQuestionDomain(query, mode, signal) {
|
|
703
|
+
const fallback = classifyQuestionDomain(query);
|
|
704
|
+
const normalizedMode = typeof mode === "object" ? mode?.mode || "fast" : mode;
|
|
705
|
+
try {
|
|
706
|
+
const tinyStartedAt = Date.now();
|
|
707
|
+
const tinyDomain = await classifyDomainWithTinyRouter(query, normalizedMode, signal);
|
|
708
|
+
const tinyLatencyMs = Date.now() - tinyStartedAt;
|
|
709
|
+
await logResearchEvent("tiny_router_latency", { task: "domain", latencyMs: tinyLatencyMs, accepted: Boolean(tinyDomain) });
|
|
710
|
+
|
|
711
|
+
const domain = chooseTinyRouterDomain(fallback, tinyDomain);
|
|
712
|
+
if (tinyDomain && domain !== fallback) {
|
|
713
|
+
await logResearchEvent("tiny_router_domain", { query, mode: normalizedMode, heuristicDomain: fallback, predictedDomain: tinyDomain, acceptedDomain: domain });
|
|
714
|
+
return domain;
|
|
715
|
+
}
|
|
716
|
+
if (tinyDomain && domain === fallback && tinyDomain !== fallback) {
|
|
717
|
+
await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, predictedDomain: tinyDomain, reason: "high_risk_not_downgraded" });
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, reason: tinyDomain ? "heuristic_kept" : "tiny_router_unavailable_or_low_confidence" });
|
|
721
|
+
return fallback;
|
|
722
|
+
} catch (error) {
|
|
723
|
+
await logResearchEvent("tiny_router_fallback", { task: "domain", query, mode: normalizedMode, heuristicDomain: fallback, reason: "error", error });
|
|
724
|
+
return fallback;
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
|
|
623
728
|
export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast") {
|
|
624
|
-
const domain = classifyQuestionDomain(query);
|
|
729
|
+
const domain = await resolveQuestionDomain(query, mode, signal);
|
|
625
730
|
const config = getResearchConfig(typeof mode === "object" ? { ...mode, domain } : { mode, domain });
|
|
626
731
|
const cacheKey = modeCacheKey(query, config);
|
|
627
732
|
|
|
@@ -699,6 +804,9 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
699
804
|
} else {
|
|
700
805
|
lastEmptySearchSignature = null;
|
|
701
806
|
}
|
|
807
|
+
const fetchWindow = config.mode === "fast"
|
|
808
|
+
? Math.max(config.maxPages, Math.min(config.maxPages * 2, (config.minSources || 3) + 2))
|
|
809
|
+
: config.maxPages;
|
|
702
810
|
const results = rankSearchResults(flatResults, query, config.maxPages * 2, config)
|
|
703
811
|
.filter((result) => {
|
|
704
812
|
const key = normalizeUrl(result.url);
|
|
@@ -706,10 +814,12 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
706
814
|
seenUrls.add(key);
|
|
707
815
|
return true;
|
|
708
816
|
})
|
|
709
|
-
.slice(0, config.maxPages);
|
|
817
|
+
.slice(0, fetchWindow);
|
|
710
818
|
|
|
711
819
|
emit("fetch", `Reading ${results.length} sources...`);
|
|
712
|
-
const pageCandidates =
|
|
820
|
+
const pageCandidates = config.mode === "fast"
|
|
821
|
+
? await speculativeFetch(results, signal, { ...config, minSources: config.minSources || 3 }, query)
|
|
822
|
+
: await Promise.all(results.map((result) => fetchPageSource(result.url, signal, { ...config, query })));
|
|
713
823
|
await logResearchEvent("page_fetch_results", {
|
|
714
824
|
query,
|
|
715
825
|
urls: results.map((result) => result.url),
|
|
@@ -742,6 +852,22 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
742
852
|
conflictSummary = conflict.conflictSummary || "";
|
|
743
853
|
conflictingSourcePairs = conflict.conflictingSourcePairs || [];
|
|
744
854
|
|
|
855
|
+
const structuredConflictStartedAt = Date.now();
|
|
856
|
+
const structuredConflictDecision = await classifyConflictWithTinyRouter(query, mergedPages, signal);
|
|
857
|
+
if (structuredConflictDecision) {
|
|
858
|
+
await logResearchEvent("tiny_router_latency", { task: "conflict", latencyMs: Date.now() - structuredConflictStartedAt, accepted: true });
|
|
859
|
+
const nextConflictDetected = applyConflictTinyRouterDecision(
|
|
860
|
+
conflictDetected,
|
|
861
|
+
structuredConflictDecision,
|
|
862
|
+
{ allowClear: process.env.PI_RESEARCH_TINY_ROUTER_CONFLICT_ALLOW_CLEAR === "1" || process.env.PI_RESEARCH_TINY_ROUTER_CONFLICT_ALLOW_CLEAR === "true" },
|
|
863
|
+
);
|
|
864
|
+
if (nextConflictDetected !== conflictDetected) {
|
|
865
|
+
conflictDetected = nextConflictDetected;
|
|
866
|
+
if (conflictDetected && !conflictSummary) conflictSummary = `Structured router flagged ${query} for conflict review.`;
|
|
867
|
+
}
|
|
868
|
+
await logResearchEvent("tiny_router_structured_decision", { task: "conflict", query, decision: structuredConflictDecision, heuristicConflictDetected: conflict.detected, finalConflictDetected: conflictDetected });
|
|
869
|
+
}
|
|
870
|
+
|
|
745
871
|
const minSources = config.mode === "fast"
|
|
746
872
|
? (mergedPages.some((page) => page.authoritative) ? 1 : Math.max(3, config.minSources || 3))
|
|
747
873
|
: (config.minSources || 3);
|
|
@@ -757,10 +883,45 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
757
883
|
sufficiency = { ...sufficiency, sufficient: true };
|
|
758
884
|
}
|
|
759
885
|
|
|
886
|
+
const structuredSufficiencyStartedAt = Date.now();
|
|
887
|
+
const structuredSufficiencyDecision = await classifySufficiencyWithTinyRouter(query, mergedPages, signal);
|
|
888
|
+
if (structuredSufficiencyDecision) {
|
|
889
|
+
const heuristicSufficient = sufficiency.sufficient;
|
|
890
|
+
await logResearchEvent("tiny_router_latency", { task: "sufficiency", latencyMs: Date.now() - structuredSufficiencyStartedAt, accepted: true });
|
|
891
|
+
const finalSufficient = applySufficiencyTinyRouterDecision(heuristicSufficient, structuredSufficiencyDecision);
|
|
892
|
+
if (finalSufficient !== heuristicSufficient) {
|
|
893
|
+
sufficiency = withStructuredSufficiencyDecision(sufficiency, structuredSufficiencyDecision, query, mergedPages.map((page) => page.url));
|
|
894
|
+
}
|
|
895
|
+
await logResearchEvent("tiny_router_structured_decision", { task: "sufficiency", query, decision: structuredSufficiencyDecision, heuristicSufficient, finalSufficient });
|
|
896
|
+
sufficiency = { ...sufficiency, sufficient: finalSufficient };
|
|
897
|
+
}
|
|
898
|
+
|
|
760
899
|
if (sufficiency.sufficient || turn === (config.maxTurns - 1)) break;
|
|
761
900
|
|
|
762
901
|
followupRounds += 1;
|
|
763
|
-
|
|
902
|
+
|
|
903
|
+
const conflictState = conflictDetected ? (mergedPages.some(p => p.authoritative) ? "minor" : "severe") : "none";
|
|
904
|
+
const sourcesMeta = {
|
|
905
|
+
has_authority: mergedPages.some(p => p.authoritative),
|
|
906
|
+
has_forum: mergedPages.some(p => p.sourceType === "forum" || /forum|reddit|stack/i.test(p.url)),
|
|
907
|
+
has_news: mergedPages.some(p => p.sourceType === "news" || /news|blog|article/i.test(p.url)),
|
|
908
|
+
has_recent: mergedPages.some(p => p.freshness === "recent" || p.freshness === "current_year"),
|
|
909
|
+
source_count: mergedPages.length
|
|
910
|
+
};
|
|
911
|
+
|
|
912
|
+
const action = await classifyFollowupWithTinyRouter(query, config.mode, conflictState, sourcesMeta, signal);
|
|
913
|
+
|
|
914
|
+
if (action === "stop") {
|
|
915
|
+
await logResearchEvent("tiny_router_stop", { query, reason: "router_suggested_stop" });
|
|
916
|
+
break;
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
if (!action) {
|
|
920
|
+
followupQuery = buildFollowUpQuery(query, mergedPages, { seenUrls: mergedPages.map((page) => page.url) });
|
|
921
|
+
} else {
|
|
922
|
+
followupQuery = buildActionBasedFollowUpQuery(query, action, { seenUrls: mergedPages.map((page) => page.url) });
|
|
923
|
+
}
|
|
924
|
+
|
|
764
925
|
currentQueries = planSubqueries(query, followupQuery, config, sufficiency);
|
|
765
926
|
subqueries = [...new Set([...subqueries, ...currentQueries])];
|
|
766
927
|
}
|
package/package.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-research",
|
|
3
|
-
"version": "1.3.1",
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"type": "module",
|
|
6
|
-
"description": "
|
|
6
|
+
"description": "Zero-setup grounded web research for AI coding agents.",
|
|
7
7
|
"license": "MIT",
|
|
8
8
|
"main": "./index.js",
|
|
9
9
|
"bin": {
|