pi-research 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -210,6 +210,14 @@ pi install npm:pi-research
210
210
 
211
211
  This registers the Pi extension and keeps the public tool name `pi-research`.
212
212
 
213
+ ### npm install
214
+
215
+ ```bash
216
+ npm i pi-research
217
+ ```
218
+
219
+ This is the package install command that npm shows on the package page.
220
+
213
221
  ### MCP-only — any agent
214
222
 
215
223
  Run the MCP server directly from npm:
@@ -280,7 +288,7 @@ A separate npm package named `unblind-mcp` can be added later as a tiny wrapper
280
288
  ## Release notes
281
289
 
282
290
  - Package name: `pi-research`
283
- - Version: `1.1.2`
291
+ - Version: `1.2.1`
284
292
  - Entry point: `extensions/pi-research.ts`
285
293
  - MCP entry point: `mcp/server.js`
286
294
  - MCP compatibility shim: `mcp-server.js`
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { startMcpServer } from "../mcp/server.js";
4
+
5
+ startMcpServer();
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { startMcpServer } from "../mcp/server.js";
4
+
5
+ startMcpServer();
package/index.js CHANGED
@@ -63,7 +63,7 @@ export default function webResearchExtension(pi) {
63
63
  RESEARCH_STATE.clear();
64
64
  clearResearchMemory();
65
65
  await logResearchEvent("agent_start", {
66
- systemPrompt: event.systemPrompt,
66
+ systemPromptLength: String(event.systemPrompt || "").length,
67
67
  guidance: buildWebResearchGuidance(),
68
68
  });
69
69
  return { systemPrompt: `${event.systemPrompt}\n\n${buildWebResearchGuidance()}` };
@@ -6,6 +6,7 @@ export function loadEvalCases(domain) {
6
6
  try {
7
7
  return readdirSync(dir)
8
8
  .filter((file) => file.endsWith(".json"))
9
+ .sort((a, b) => a.localeCompare(b))
9
10
  .map((file) => JSON.parse(readFileSync(join(dir, file), "utf8")));
10
11
  } catch {
11
12
  return [];
@@ -1,8 +1,186 @@
1
+ import { classifyQuestionDomain } from "../research-intent.js";
2
+ import { buildFollowUpQuery, evaluateSufficiency, scoreSourceEntry } from "../research.js";
3
+ import { pageQualitySignals } from "../research-policy.js";
1
4
  import { loadEvalCases } from "./case-loader.js";
2
5
 
6
+ function normalizeHost(url = "") {
7
+ try {
8
+ return new URL(url).hostname.replace(/^www\./, "").toLowerCase();
9
+ } catch {
10
+ return "";
11
+ }
12
+ }
13
+
14
+ function runChecks(checks = []) {
15
+ const passed = checks.filter((check) => check.ok).length;
16
+ return {
17
+ passed,
18
+ total: checks.length,
19
+ checks,
20
+ ok: passed === checks.length,
21
+ };
22
+ }
23
+
24
+ function evaluateFollowupProbe(probe = {}) {
25
+ if (!probe || !probe.query) return [];
26
+ const followup = buildFollowUpQuery(probe.query, probe.pages || []);
27
+ const checks = [];
28
+
29
+ if (probe.expectNoQuestionMark !== false) {
30
+ checks.push({
31
+ name: "followup-no-question-mark",
32
+ ok: !followup.includes("?"),
33
+ actual: followup,
34
+ });
35
+ }
36
+ if (Array.isArray(probe.expectNotIncludes)) {
37
+ for (const token of probe.expectNotIncludes) {
38
+ checks.push({
39
+ name: `followup-not-includes:${token}`,
40
+ ok: !followup.toLowerCase().includes(String(token).toLowerCase()),
41
+ actual: followup,
42
+ });
43
+ }
44
+ }
45
+ if (Array.isArray(probe.expectAnyIncludes) && probe.expectAnyIncludes.length) {
46
+ checks.push({
47
+ name: "followup-includes-one-expected-token",
48
+ ok: probe.expectAnyIncludes.some((token) => followup.toLowerCase().includes(String(token).toLowerCase())),
49
+ actual: followup,
50
+ });
51
+ }
52
+
53
+ return checks;
54
+ }
55
+
56
+ function evaluatePageProbe(probe = {}) {
57
+ if (!probe || !probe.page) return [];
58
+ const quality = pageQualitySignals(probe.page);
59
+ const checks = [];
60
+
61
+ if (typeof probe.expectBlocked === "boolean") {
62
+ checks.push({ name: "page-blocked", ok: quality.blocked === probe.expectBlocked, actual: quality.blocked });
63
+ }
64
+ if (typeof probe.expectWeak === "boolean") {
65
+ checks.push({ name: "page-weak", ok: quality.weak === probe.expectWeak, actual: quality.weak });
66
+ }
67
+ if (Array.isArray(probe.expectSignals)) {
68
+ for (const signal of probe.expectSignals) {
69
+ checks.push({
70
+ name: `page-signal:${signal}`,
71
+ ok: quality.negativeSignals.includes(signal),
72
+ actual: quality.negativeSignals,
73
+ });
74
+ }
75
+ }
76
+
77
+ return checks;
78
+ }
79
+
80
+ function evaluateSourceProbe(probe = {}) {
81
+ if (!probe || !Array.isArray(probe.sources)) return [];
82
+ const scored = probe.sources.map((source) => ({ source, scored: scoreSourceEntry(source, probe.query || "") }));
83
+ const checks = [];
84
+
85
+ if (Array.isArray(probe.expectAuthoritativeHosts)) {
86
+ for (const host of probe.expectAuthoritativeHosts) {
87
+ const match = scored.find(({ source }) => normalizeHost(source.url) === host);
88
+ checks.push({
89
+ name: `authoritative-host:${host}`,
90
+ ok: Boolean(match?.scored.authoritative),
91
+ actual: match?.scored,
92
+ });
93
+ }
94
+ }
95
+ if (Array.isArray(probe.expectNonAuthoritativeHosts)) {
96
+ for (const host of probe.expectNonAuthoritativeHosts) {
97
+ const match = scored.find(({ source }) => normalizeHost(source.url) === host);
98
+ checks.push({
99
+ name: `non-authoritative-host:${host}`,
100
+ ok: match ? !match.scored.authoritative : false,
101
+ actual: match?.scored,
102
+ });
103
+ }
104
+ }
105
+ if (Array.isArray(probe.expectSourceTypes)) {
106
+ for (const expected of probe.expectSourceTypes) {
107
+ const match = scored.find(({ source }) => normalizeHost(source.url) === expected.host);
108
+ checks.push({
109
+ name: `source-type:${expected.host}`,
110
+ ok: match?.scored.sourceType === expected.sourceType,
111
+ actual: match?.scored,
112
+ });
113
+ }
114
+ }
115
+
116
+ return checks;
117
+ }
118
+
119
+ function evaluateSufficiencyProbe(probe = {}) {
120
+ if (!probe || !probe.query) return [];
121
+ const result = evaluateSufficiency({
122
+ query: probe.query,
123
+ sources: probe.sources || [],
124
+ conflictDetected: Boolean(probe.conflictDetected),
125
+ minSources: probe.minSources,
126
+ });
127
+ const checks = [];
128
+
129
+ if (typeof probe.expectSufficient === "boolean") {
130
+ checks.push({ name: "sufficient", ok: result.sufficient === probe.expectSufficient, actual: result.sufficient });
131
+ }
132
+ if (typeof probe.expectAuthoritativeSourcesFound === "boolean") {
133
+ checks.push({
134
+ name: "authoritative-sources-found",
135
+ ok: result.authoritativeSourcesFound === probe.expectAuthoritativeSourcesFound,
136
+ actual: result.authoritativeSourcesFound,
137
+ });
138
+ }
139
+ if (probe.expectOpenSubQuestionsNoQuestionMark) {
140
+ checks.push({
141
+ name: "open-subquestions-no-question-mark",
142
+ ok: result.openSubQuestions.every((item) => !String(item).includes("?")),
143
+ actual: result.openSubQuestions,
144
+ });
145
+ }
146
+
147
+ return checks;
148
+ }
149
+
150
+ function evaluateCase(domain, item) {
151
+ const checks = [];
152
+ checks.push({
153
+ name: "domain-match",
154
+ ok: classifyQuestionDomain(item.question) === (item.expectedDomain || domain),
155
+ actual: classifyQuestionDomain(item.question),
156
+ });
157
+
158
+ checks.push(...evaluateFollowupProbe(item.followupProbe));
159
+ checks.push(...evaluatePageProbe(item.pageProbe));
160
+ checks.push(...evaluateSourceProbe(item.sourceProbe));
161
+ checks.push(...evaluateSufficiencyProbe(item.sufficiencyProbe));
162
+
163
+ return {
164
+ question: item.question,
165
+ notes: item.notes || "",
166
+ ...runChecks(checks),
167
+ };
168
+ }
169
+
3
170
  export async function runEvalSuite({ domain }) {
4
171
  const cases = loadEvalCases(domain);
5
- const passed = cases.filter((item) => item.expectedDomain === domain).length;
6
- const total = cases.length;
7
- return { total, passed, passRate: total ? passed / total : 0 };
172
+ const details = cases.map((item) => evaluateCase(domain, item));
173
+ const passedCases = details.filter((item) => item.ok).length;
174
+ const passedChecks = details.reduce((sum, item) => sum + item.passed, 0);
175
+ const totalChecks = details.reduce((sum, item) => sum + item.total, 0);
176
+
177
+ return {
178
+ total: cases.length,
179
+ passed: passedCases,
180
+ passRate: cases.length ? passedCases / cases.length : 0,
181
+ checkPassRate: totalChecks ? passedChecks / totalChecks : 0,
182
+ passedChecks,
183
+ totalChecks,
184
+ details,
185
+ };
8
186
  }
@@ -2,6 +2,8 @@ import { spawn } from "node:child_process";
2
2
  import { fileURLToPath } from "node:url";
3
3
  import path from "node:path";
4
4
 
5
+ import { WEAK_PAGE_POLICY } from "./research-policy.js";
6
+
5
7
  const SCRAPLING_ROOT = fileURLToPath(new URL("../Scrapling", import.meta.url));
6
8
  const BLOCKED_PATTERNS = [
7
9
  /cloudflare/i,
@@ -11,6 +13,10 @@ const BLOCKED_PATTERNS = [
11
13
  /bot detection/i,
12
14
  /verify you are human/i,
13
15
  /security check/i,
16
+ /access denied/i,
17
+ /temporarily unavailable/i,
18
+ /attention required/i,
19
+ /challenge-platform/i,
14
20
  ];
15
21
  const DYNAMIC_PATTERNS = [
16
22
  /__next_data__/i,
@@ -38,9 +44,18 @@ export function assessPageAttempt({ status = 200, body = "", contentType = "", u
38
44
  const plain = stripHtml(text);
39
45
  const lower = `${text}\n${url}`.toLowerCase();
40
46
  const antiBotSignal = BLOCKED_PATTERNS.some((pattern) => pattern.test(lower));
41
- const blocked = status === 403 || status === 429 || (antiBotSignal && plain.length < 1000);
42
- const dynamic = !blocked && (DYNAMIC_PATTERNS.some((pattern) => pattern.test(lower)) || (text.includes("<script") && plain.length < 400));
43
- const weak = blocked || plain.length < 300 || (!/text\/(html|plain)/i.test(contentType) && plain.length < 500);
47
+ const negativeSignals = [];
48
+
49
+ if (plain.length < WEAK_PAGE_POLICY.weakTextLimit) negativeSignals.push("weak_text");
50
+ else if (plain.length < WEAK_PAGE_POLICY.thinTextLimit) negativeSignals.push("thin_text");
51
+ if (antiBotSignal) negativeSignals.push("placeholder");
52
+ if (!/text\/(html|plain)/i.test(contentType) && plain.length < 500) negativeSignals.push("unsupported_content_type");
53
+
54
+ const blocked = status === 403
55
+ || status === 429
56
+ || (antiBotSignal && plain.length < WEAK_PAGE_POLICY.blockedTextLimit);
57
+ const dynamic = !blocked && (DYNAMIC_PATTERNS.some((pattern) => pattern.test(lower)) || (text.includes("<script") && plain.length < WEAK_PAGE_POLICY.weakTextLimit));
58
+ const weak = blocked || negativeSignals.includes("weak_text") || negativeSignals.length >= WEAK_PAGE_POLICY.minNegativeSignals;
44
59
 
45
60
  return {
46
61
  blocked,
@@ -48,6 +63,7 @@ export function assessPageAttempt({ status = 200, body = "", contentType = "", u
48
63
  weak,
49
64
  mode: blocked ? "stealthy" : dynamic ? "dynamic" : "async",
50
65
  plainLength: plain.length,
66
+ negativeSignals,
51
67
  };
52
68
  }
53
69
 
@@ -0,0 +1,188 @@
1
+ import { classifyQuestionDomain } from "./research-intent.js";
2
+
3
+ export const WEAK_PAGE_POLICY = {
4
+ blockedTextLimit: 1200,
5
+ weakTextLimit: 400,
6
+ thinTextLimit: 1200,
7
+ minQueryTermMatches: 2,
8
+ minNegativeSignals: 2,
9
+ };
10
+
11
+ export const DOMAIN_AUTHORITY_RULES = {
12
+ security: {
13
+ hosts: ["nvd.nist.gov", "cisa.gov", "mitre.org", "github.com", "ubuntu.com", "redhat.com", "debian.org", "suse.com"],
14
+ type: "official_doc",
15
+ },
16
+ "vendor-status": {
17
+ hosts: ["statuspage.io", "status.github.com"],
18
+ type: "official_doc",
19
+ },
20
+ "package-registry": {
21
+ hosts: ["npmjs.com", "pypi.org", "crates.io", "mvnrepository.com"],
22
+ type: "official_doc",
23
+ },
24
+ github: {
25
+ hosts: ["github.com"],
26
+ type: "github_repo",
27
+ },
28
+ papers: {
29
+ hosts: ["arxiv.org", "semanticscholar.org", "doi.org", "pubmed.ncbi.nlm.nih.gov", "nature.com", "science.org"],
30
+ type: "paper",
31
+ },
32
+ web: {
33
+ hosts: [],
34
+ type: "official_doc",
35
+ },
36
+ };
37
+
38
+ const PLACEHOLDER_PATTERNS = [
39
+ /cloudflare/i,
40
+ /access denied/i,
41
+ /temporarily unavailable/i,
42
+ /attention required/i,
43
+ /verify you are human/i,
44
+ /security check/i,
45
+ /captcha/i,
46
+ /turnstile/i,
47
+ /challenge-platform/i,
48
+ ];
49
+
50
+ const VENDOR_RESEARCH_HOSTS = [
51
+ "research.ibm.com",
52
+ "research.google",
53
+ ];
54
+
55
+ function baseQuery(query = "") {
56
+ return String(query || "")
57
+ .trim()
58
+ .replace(/[?!.]+$/g, "")
59
+ .replace(/\s+/g, " ");
60
+ }
61
+
62
+ function meaningfulTerms(text = "") {
63
+ return [...new Set(String(text || "")
64
+ .toLowerCase()
65
+ .replace(/[^a-z0-9\s]+/g, " ")
66
+ .split(/\s+/)
67
+ .filter((term) => term.length > 2 && !["the", "and", "for", "with", "from", "that", "this", "what", "which", "best", "official", "docs"].includes(term)))];
68
+ }
69
+
70
+ export function normalizeHostname(url = "") {
71
+ try {
72
+ return new URL(url).hostname.replace(/^www\./, "").toLowerCase();
73
+ } catch {
74
+ return "";
75
+ }
76
+ }
77
+
78
+ function hostMatches(hostname, candidate) {
79
+ return hostname === candidate || hostname.endsWith(`.${candidate}`);
80
+ }
81
+
82
+ function countOverlap(query = "", title = "", text = "") {
83
+ const terms = meaningfulTerms(query);
84
+ const haystack = `${title} ${String(text || "").slice(0, 1200)}`.toLowerCase();
85
+ return terms.filter((term) => haystack.includes(term)).length;
86
+ }
87
+
88
+ export function resolvePolicyDomain(query = "", explicitDomain = "") {
89
+ return explicitDomain || classifyQuestionDomain(query || "");
90
+ }
91
+
92
+ export function pageQualitySignals({ title = "", text = "", status = 200, contentType = "", url = "", query = "" } = {}) {
93
+ const plain = String(text || "").replace(/\s+/g, " ").trim();
94
+ const corpus = `${title}\n${plain}\n${url}`;
95
+ const placeholder = PLACEHOLDER_PATTERNS.some((pattern) => pattern.test(corpus));
96
+ const queryTermMatches = countOverlap(query, title, plain);
97
+ const negativeSignals = [];
98
+
99
+ if (plain.length < WEAK_PAGE_POLICY.weakTextLimit) negativeSignals.push("weak_text");
100
+ else if (plain.length < WEAK_PAGE_POLICY.thinTextLimit) negativeSignals.push("thin_text");
101
+ if (placeholder) negativeSignals.push("placeholder");
102
+ if (contentType && !/text\/(html|plain)/i.test(contentType)) negativeSignals.push("unsupported_content_type");
103
+ if (query && queryTermMatches < WEAK_PAGE_POLICY.minQueryTermMatches) negativeSignals.push("query_overlap_low");
104
+
105
+ const blocked = status === 403
106
+ || status === 429
107
+ || (placeholder && plain.length < WEAK_PAGE_POLICY.blockedTextLimit);
108
+ const weak = blocked
109
+ || negativeSignals.includes("weak_text")
110
+ || negativeSignals.length >= WEAK_PAGE_POLICY.minNegativeSignals;
111
+
112
+ return {
113
+ blocked,
114
+ weak,
115
+ placeholder,
116
+ plainLength: plain.length,
117
+ queryTermMatches,
118
+ negativeSignals,
119
+ };
120
+ }
121
+
122
+ export function sourceAuthorityProfile({ url = "", title = "", text = "", query = "", domain = "" } = {}) {
123
+ const hostname = normalizeHostname(url);
124
+ const resolvedDomain = resolvePolicyDomain(query, domain);
125
+ const quality = pageQualitySignals({ title, text, url, query });
126
+
127
+ if (hostMatches(hostname, "researchgate.net")) {
128
+ if (quality.blocked || quality.placeholder) {
129
+ return { sourceType: "other", authoritative: false, domainBoost: -8, reasons: ["researchgate_placeholder"] };
130
+ }
131
+ return { sourceType: "other", authoritative: false, domainBoost: resolvedDomain === "papers" ? 2 : 0, reasons: ["researchgate_secondary"] };
132
+ }
133
+
134
+ if (VENDOR_RESEARCH_HOSTS.some((host) => hostMatches(hostname, host))) {
135
+ return { sourceType: "official_doc", authoritative: true, domainBoost: 8, reasons: ["vendor_research_host"] };
136
+ }
137
+
138
+ const rule = DOMAIN_AUTHORITY_RULES[resolvedDomain] || DOMAIN_AUTHORITY_RULES.web;
139
+ if (rule.hosts.some((host) => hostMatches(hostname, host))) {
140
+ const reason = resolvedDomain === "github" && /\/(issues|pull|pulls|discussions)\//.test(url)
141
+ ? "github_state_page"
142
+ : "domain_authority_host";
143
+ const authoritative = !(resolvedDomain === "github" && /\/(issues|pull|pulls|discussions)\//.test(url)) || /#readme|\/releases|\/blob\//.test(url);
144
+ return { sourceType: rule.type, authoritative, domainBoost: authoritative ? 10 : 4, reasons: [reason] };
145
+ }
146
+
147
+ return { sourceType: null, authoritative: false, domainBoost: 0, reasons: [] };
148
+ }
149
+
150
+ export function buildAuthorityFollowUpQueries(query = "", explicitDomain = "") {
151
+ const resolvedDomain = resolvePolicyDomain(query, explicitDomain);
152
+ const base = baseQuery(query);
153
+
154
+ switch (resolvedDomain) {
155
+ case "security":
156
+ return [`${base} cve advisory vendor`, `${base} nvd cisa mitre`];
157
+ case "vendor-status":
158
+ return [`${base} status page incident`, `${base} official outage status`];
159
+ case "package-registry":
160
+ return [`${base} npm pypi crates readme`, `${base} official package docs`];
161
+ case "github":
162
+ return [`${base} github readme releases`, `${base} site:github.com readme docs`];
163
+ case "papers":
164
+ return [`${base} arxiv doi publisher`, `${base} semanticscholar arxiv doi`];
165
+ default:
166
+ return [`${base} official docs`, `${base} documentation reference`];
167
+ }
168
+ }
169
+
170
+ export function buildConflictFollowUpQueries(query = "", explicitDomain = "") {
171
+ const resolvedDomain = resolvePolicyDomain(query, explicitDomain);
172
+ const base = baseQuery(query);
173
+
174
+ switch (resolvedDomain) {
175
+ case "security":
176
+ return [`${base} vendor advisory official`, `${base} cve mitigation official`];
177
+ case "vendor-status":
178
+ return [`${base} incident status official`, `${base} status page postmortem`];
179
+ case "package-registry":
180
+ return [`${base} release notes changelog`, `${base} maintainer docs`];
181
+ case "github":
182
+ return [`${base} github releases readme`, `${base} canonical repo docs`];
183
+ case "papers":
184
+ return [`${base} arxiv doi compare`, `${base} publisher abstract official`];
185
+ default:
186
+ return [`${base} official docs support status`, `${base} official comparison reference`];
187
+ }
188
+ }
package/lib/research.js CHANGED
@@ -1,3 +1,10 @@
1
+ import {
2
+ buildAuthorityFollowUpQueries,
3
+ buildConflictFollowUpQueries,
4
+ pageQualitySignals,
5
+ sourceAuthorityProfile,
6
+ } from "./research-policy.js";
7
+
1
8
  function decodeHtmlEntities(text) {
2
9
  return String(text || "")
3
10
  .replace(/&amp;/g, "&")
@@ -307,6 +314,7 @@ export function classifySourceType(url, title = "") {
307
314
  if (/github\.com\/[^/]+\/[^/]+#readme|github\.com\/[^/]+\/[^/]+\/blob\//.test(lower)) return "github_readme";
308
315
  if (/github\.com\/[^/]+\/[^/]+/.test(lower)) return "github_repo";
309
316
  if (/arxiv\.org|ieee\.org|springer\.com|pubmed\.ncbi\.nlm\.nih\.gov|doi\.org|semanticscholar\.org|acm\.org|nature\.com|science\.org/.test(lower)) return "paper";
317
+ if (/research\.ibm\.com|research\.google/.test(lower)) return "official_doc";
310
318
  if (/reddit\.com|stackoverflow\.com|forum/.test(lower)) return "forum";
311
319
  if (/blog\.|medium\.com|dev\.to|substack\.com/.test(lower)) return "blog";
312
320
  if (/\/docs?\b|documentation|developer|reference|official/.test(lower) || /official|documentation|reference|guide/i.test(title) || /\.edu\/|\.ac\.uk\//.test(lower)) return "official_doc";
@@ -315,7 +323,7 @@ export function classifySourceType(url, title = "") {
315
323
 
316
324
  export function isAuthoritativeUrl(url) {
317
325
  const lower = String(url || "").toLowerCase();
318
- return /\/docs?\b|documentation|developer|reference|official|github\.com\/[^/]+\/[^/]+(#readme|\/tree\/[^/]+\/docs)?|npmjs\.com\/package\/|arxiv\.org|pubmed\.ncbi\.nlm\.nih\.gov|semanticscholar\.org|doi\.org|\.edu\/|\.ac\.uk\//.test(lower);
326
+ return /\/docs?\b|documentation|developer|reference|official|github\.com\/[^/]+\/[^/]+(#readme|\/tree\/[^/]+\/docs)?|npmjs\.com\/package\/|arxiv\.org|pubmed\.ncbi\.nlm\.nih\.gov|semanticscholar\.org|doi\.org|research\.ibm\.com|research\.google|\.edu\/|\.ac\.uk\//.test(lower);
319
327
  }
320
328
 
321
329
  export function scoreSearchResult(result, query, config = {}) {
@@ -369,6 +377,18 @@ export function scoreFetchedPage(page, query, config = {}) {
369
377
  if (/stackoverflow\.com|reddit\.com|quora\.com/.test(url)) score -= 2;
370
378
  if (countTermMatches(firstChunk, terms) > 0) score += 5;
371
379
 
380
+ const quality = page?.quality || pageQualitySignals({
381
+ title: page?.title || "",
382
+ text,
383
+ url: page?.url || "",
384
+ query,
385
+ });
386
+ if (quality.blocked) score -= 20;
387
+ if (quality.negativeSignals?.includes("placeholder")) score -= 10;
388
+ if (quality.negativeSignals?.includes("weak_text")) score -= 8;
389
+ if (quality.negativeSignals?.includes("thin_text")) score -= 4;
390
+ if (quality.negativeSignals?.includes("query_overlap_low")) score -= 6;
391
+
372
392
  const ageInMonths = monthsSince(page?.publishDate);
373
393
  if (config.preferRecent && ageInMonths !== null) {
374
394
  if (ageInMonths <= 6) score += 8;
@@ -448,7 +468,7 @@ export function detectResearchGaps(query, pages) {
448
468
  return {
449
469
  detected: true,
450
470
  reason: "Retrieved pages lack an authoritative docs or README source.",
451
- followupQuery: `${queryBase(query)} official docs`,
471
+ followupQuery: buildAuthorityFollowUpQueries(query)[0] || `${queryBase(query)} official docs`,
452
472
  missingAspects: ["authoritative sources"],
453
473
  };
454
474
  }
@@ -458,10 +478,10 @@ export function detectResearchGaps(query, pages) {
458
478
 
459
479
  export function buildFollowUpQuery(query, pages) {
460
480
  const conflict = detectConflictSignals(pages);
461
- if (conflict.detected) return `${queryBase(query)} official docs support status`;
481
+ if (conflict.detected) return buildConflictFollowUpQueries(query)[0] || `${queryBase(query)} official docs support status`;
462
482
  const gaps = detectResearchGaps(query, pages);
463
483
  if (gaps.detected) return gaps.followupQuery;
464
- return `${queryBase(query)} clarification official docs`;
484
+ return buildAuthorityFollowUpQueries(`${queryBase(query)} clarification`)[0] || `${queryBase(query)} clarification official docs`;
465
485
  }
466
486
 
467
487
  function queryTermsForFactCheck(text) {
@@ -536,12 +556,18 @@ export function buildConfidenceSummary(pages, meta = {}) {
536
556
  export function scoreSourceEntry(source, query = "") {
537
557
  const url = String(source?.url || "");
538
558
  const title = String(source?.title || "");
539
- const sourceType = classifySourceType(url, title);
559
+ const authorityProfile = sourceAuthorityProfile({
560
+ url,
561
+ title,
562
+ text: source?.text || source?.snippet || "",
563
+ query,
564
+ });
565
+ const sourceType = authorityProfile.sourceType || classifySourceType(url, title);
540
566
  const freshness = summarizeFreshness(source?.publishDate || source?.freshness);
541
567
  let typeScore = 0;
542
568
  let freshnessScore = 0;
543
- let domainScore = 0;
544
- let authoritative = isAuthoritativeUrl(url) || sourceType === "official_doc" || sourceType === "paper" || sourceType === "file";
569
+ let domainScore = authorityProfile.domainBoost || 0;
570
+ let authoritative = authorityProfile.authoritative || isAuthoritativeUrl(url) || sourceType === "official_doc" || sourceType === "paper" || sourceType === "file";
545
571
 
546
572
  if (authoritative) typeScore += 10;
547
573
  if (sourceType === "official_doc") typeScore += 8;
@@ -625,9 +651,10 @@ export function evaluateSufficiency(input, legacyPages, legacyConflictDetected =
625
651
  if (conflictDetected) missingAspects.push("conflict resolution");
626
652
  if (!payload.sources.length) missingAspects.push("readable sources");
627
653
 
628
- const openSubQuestions = [];
629
- if (!authoritativeSourcesFound) openSubQuestions.push(`${queryBase(payload.query)} official docs`);
630
- if (conflictDetected) openSubQuestions.push(`Which authoritative source resolves the conflicting claims about ${queryBase(payload.query)}?`);
654
+ const openSubQuestions = [
655
+ ...(!authoritativeSourcesFound ? buildAuthorityFollowUpQueries(payload.query) : []),
656
+ ...(conflictDetected ? buildConflictFollowUpQueries(payload.query) : []),
657
+ ];
631
658
  if (!openSubQuestions.length) openSubQuestions.push(`${queryBase(payload.query)} follow-up`);
632
659
 
633
660
  const minSources = payload.minSources || 1;
@@ -36,6 +36,7 @@ import {
36
36
  selectRelevantChunks,
37
37
  } from "./research.js";
38
38
  import { pageFetchAdapter } from "./page-fetch-adapter.js";
39
+ import { pageQualitySignals } from "./research-policy.js";
39
40
  import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
40
41
  import { planResearch } from "./planner.js";
41
42
  import {
@@ -111,7 +112,7 @@ export function resolveResearchConfig(input = "fast") {
111
112
  force: Boolean(options.force),
112
113
  format: resolveOutputFormat(options, domainConfig.format || "markdown"),
113
114
  queryHints: Array.isArray(domainConfig.queryHints) ? domainConfig.queryHints : [],
114
- requireAuthoritative: Boolean(options.requireAuthoritative ?? domainConfig.requireAuthoritative),
115
+ requireAuthoritative: Boolean(options.requireAuthoritative ?? (domainConfig.requireAuthoritative || domainConfig.domain === "github")),
115
116
  domain: domainConfig.domain,
116
117
  };
117
118
  }
@@ -371,7 +372,7 @@ async function fetchJinaPageSource(url, signal, config) {
371
372
  const body = await response.text();
372
373
  const firstLine = body.split("\n").find((line) => line.trim().replace(/^#+\s*/, ""));
373
374
  const title = firstLine ? firstLine.trim().replace(/^#+\s*/, "") : url;
374
- return pageFromText(title, url, body, config, { sourceType: classifySourceType(url, title) });
375
+ return pageFromText(title, url, body, config, { sourceType: classifySourceType(url, title), fetchStatus: 200, contentType: "text/plain" });
375
376
  } catch {
376
377
  return null;
377
378
  }
@@ -385,6 +386,20 @@ function withinTimeframe(page, config) {
385
386
  return true;
386
387
  }
387
388
 
389
+ function finalizeFetchedPage(page, config, meta = {}) {
390
+ if (!page) return null;
391
+ const quality = page.quality || pageQualitySignals({
392
+ title: page.title,
393
+ text: page.text,
394
+ url: page.url || meta.url || "",
395
+ query: config.query || "",
396
+ status: page.fetchStatus ?? meta.status,
397
+ contentType: page.contentType || meta.contentType || "text/html",
398
+ });
399
+ if (quality.blocked) return null;
400
+ return { ...page, quality };
401
+ }
402
+
388
403
  export async function fetchPageSource(url, signal, config = getResearchConfig()) {
389
404
  if (shouldSkipUrl(url)) {
390
405
  await logResearchEvent("fetch_skip", { url, reason: "login_or_account_url" });
@@ -399,14 +414,19 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
399
414
  })}`;
400
415
  const cached = config.isolate ? null : getCacheValue(pageCache, cacheKey);
401
416
  if (cached) {
402
- await logResearchEvent("fetch_cache_hit", { url, cacheKey, title: cached.title, textLength: cached.text?.length || 0 });
403
- return cached;
417
+ const validated = finalizeFetchedPage(cached, config, { url: cached.url || url, contentType: "text/html" });
418
+ if (!validated) {
419
+ await logResearchEvent("fetch_skip", { url, cacheKey, reason: "cached_page_blocked_or_placeholder" });
420
+ return null;
421
+ }
422
+ await logResearchEvent("fetch_cache_hit", { url, cacheKey, title: validated.title, textLength: validated.text?.length || 0 });
423
+ return validated;
404
424
  }
405
425
 
406
426
  await logResearchEvent("fetch_start", { url, cacheKey, config: { isolate: config.isolate, useJinaFallback: Boolean(config.useJinaFallback), pageTextLimit: config.pageTextLimit } });
407
427
 
408
428
  if (shouldUseJinaFirst(url)) {
409
- const first = await fetchJinaPageSource(url, signal, config);
429
+ const first = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
410
430
  if (first && withinTimeframe(first, config)) {
411
431
  const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
412
432
  await logResearchEvent("fetch_end", { url, via: "jina_first", success: Boolean(page), page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
@@ -422,6 +442,12 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
422
442
 
423
443
  const contentType = response.headers.get("content-type") || "";
424
444
  if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
445
+ const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType });
446
+ if (fallback && withinTimeframe(fallback, config)) {
447
+ const page = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS);
448
+ await logResearchEvent("fetch_end", { url, via: "unsupported_content_type_fallback", success: Boolean(page), contentType, page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
449
+ return page;
450
+ }
425
451
  await logResearchEvent("fetch_end", { url, success: false, reason: "unsupported_content_type", contentType });
426
452
  return null;
427
453
  }
@@ -432,6 +458,8 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
432
458
  publishDate: extractPublishDate(body),
433
459
  sourceType: classifySourceType(snapshot.url, snapshot.title),
434
460
  codeBlocks: snapshot.codeBlocks,
461
+ fetchStatus: response.status ?? 200,
462
+ contentType,
435
463
  });
436
464
 
437
465
  const assessment = adapter.assessPageAttempt?.({
@@ -449,18 +477,24 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
449
477
  publishDate: extractPublishDate(scrapling.body),
450
478
  sourceType: classifySourceType(scraplingSnapshot.url, scraplingSnapshot.title),
451
479
  codeBlocks: scraplingSnapshot.codeBlocks,
480
+ fetchStatus: scrapling.status ?? 200,
481
+ contentType: scrapling.contentType || "text/html",
452
482
  });
453
483
  }
454
484
  }
455
485
 
456
486
  const resolved = page || await fetchJinaPageSource(url, signal, config);
457
- const finalPage = resolved && withinTimeframe(resolved, config) ? resolved : null;
458
- const stored = config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS);
487
+ const finalPage = finalizeFetchedPage(resolved, config, { url: response.url || url, status: response.status ?? 200, contentType });
488
+ const stored = finalPage && withinTimeframe(finalPage, config)
489
+ ? (config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS))
490
+ : null;
459
491
  await logResearchEvent("fetch_end", { url, success: Boolean(stored), page: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
460
492
  return stored;
461
493
  } catch (error) {
462
- const fallback = await fetchJinaPageSource(url, signal, config);
463
- const stored = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS);
494
+ const fallback = finalizeFetchedPage(await fetchJinaPageSource(url, signal, config), config, { url, contentType: "text/plain" });
495
+ const stored = fallback && withinTimeframe(fallback, config)
496
+ ? (config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS))
497
+ : null;
464
498
  await logResearchEvent("fetch_error", { url, error, fallback: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
465
499
  return stored;
466
500
  }
@@ -625,6 +659,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
625
659
  let conflictSummary = "";
626
660
  let conflictingSourcePairs = [];
627
661
  let sufficiency = { sufficient: false, confidenceScore: 0.1, missingAspects: [], openSubQuestions: [] };
662
+ let lastEmptySearchSignature = null;
628
663
  let currentQueries = await buildQueries(query, config, ctx, signal);
629
664
  subqueries = [...currentQueries];
630
665
 
@@ -651,12 +686,20 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
651
686
  emit("search", `Searching ${queriesThisTurn.length} queries...`);
652
687
 
653
688
  const searchGroups = await Promise.all(queriesThisTurn.map((subquery) => searchDuckDuckGo(subquery, signal, config)));
689
+ const flatResults = searchGroups.flat();
654
690
  await logResearchEvent("search_results", {
655
691
  query,
656
692
  queries: queriesThisTurn,
657
- results: searchGroups.flat().map((result) => ({ title: result.title, url: result.url, snippet: result.snippet, sourceType: result.sourceType, publishDate: result.publishDate })),
693
+ results: flatResults.map((result) => ({ title: result.title, url: result.url, snippet: result.snippet, sourceType: result.sourceType, publishDate: result.publishDate })),
658
694
  });
659
- const results = rankSearchResults(searchGroups.flat(), query, config.maxPages * 2, config)
695
+ const searchSignature = queriesThisTurn.join(" || ");
696
+ if (flatResults.length === 0) {
697
+ if (lastEmptySearchSignature === searchSignature) break;
698
+ lastEmptySearchSignature = searchSignature;
699
+ } else {
700
+ lastEmptySearchSignature = null;
701
+ }
702
+ const results = rankSearchResults(flatResults, query, config.maxPages * 2, config)
660
703
  .filter((result) => {
661
704
  const key = normalizeUrl(result.url);
662
705
  if (seenUrls.has(key)) return false;
@@ -666,7 +709,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
666
709
  .slice(0, config.maxPages);
667
710
 
668
711
  emit("fetch", `Reading ${results.length} sources...`);
669
- const pageCandidates = await Promise.all(results.map((result) => fetchPageSource(result.url, signal, config)));
712
+ const pageCandidates = await Promise.all(results.map((result) => fetchPageSource(result.url, signal, { ...config, query })));
670
713
  await logResearchEvent("page_fetch_results", {
671
714
  query,
672
715
  urls: results.map((result) => result.url),
@@ -682,7 +725,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
682
725
  sourceType: page.sourceType || scored.sourceType,
683
726
  text: selectRelevantChunks(page.text, query, config.maxChunksPerPage).join("\n\n") || page.text,
684
727
  };
685
- }).filter((page) => withinTimeframe(page, config)), query, config.maxPages, config);
728
+ }).filter((page) => withinTimeframe(page, config) && !page.quality?.blocked), query, config.maxPages, config);
686
729
 
687
730
  for (const page of prioritizeSourceEntries(rankedPages, query)) {
688
731
  const key = normalizeUrl(page.url);
package/package.json CHANGED
@@ -1,21 +1,24 @@
1
1
  {
2
2
  "name": "pi-research",
3
- "version": "1.2.0",
3
+ "version": "1.3.0",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Pi extension for web research.",
7
7
  "license": "MIT",
8
8
  "main": "./index.js",
9
9
  "bin": {
10
- "pi-research": "./mcp/server.js",
11
- "unblind-mcp": "./mcp/server.js"
10
+ "pi-research": "./pi-research.js",
11
+ "unblind-mcp": "./unblind-mcp.js"
12
12
  },
13
13
  "files": [
14
+ "bin",
14
15
  "extensions",
15
16
  "index.js",
16
17
  "lib",
17
18
  "mcp",
18
19
  "mcp-server.js",
20
+ "pi-research.js",
21
+ "unblind-mcp.js",
19
22
  "README.md",
20
23
  "THIRD_PARTY_NOTICES.md",
21
24
  "package.json"
package/pi-research.js ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { startMcpServer } from "./mcp/server.js";
4
+
5
+ startMcpServer();
package/unblind-mcp.js ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { startMcpServer } from "./mcp/server.js";
4
+
5
+ startMcpServer();