pi-research 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -72
- package/THIRD_PARTY_NOTICES.md +17 -0
- package/index.js +68 -20
- package/lib/local-logger.js +54 -0
- package/lib/page-fetch-adapter.js +180 -0
- package/lib/web-research.js +98 -19
- package/mcp/server.js +242 -0
- package/mcp-server.js +18 -0
- package/package.json +10 -2
package/README.md
CHANGED
|
@@ -1,22 +1,36 @@
|
|
|
1
1
|
# pi-research
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
|
|
3
5
|
[](https://www.npmjs.com/package/pi-research)
|
|
4
6
|
[](https://github.com/endgegnerbert-tech/pi-research)
|
|
5
7
|
[](https://pi.ai)
|
|
6
8
|
|
|
7
|
-
`pi-research` is a Pi extension for
|
|
9
|
+
`pi-research` is a Pi extension for grounded web research.
|
|
10
|
+
It searches, ranks, compares, and synthesizes sources inside the agent.
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
It does **not** require an external research API or API key, and it is not a browser automation tool.
|
|
12
|
+

|
|
11
13
|
|
|
12
14
|
## Why it exists
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
When agents answer well, they usually do three things:
|
|
17
|
+
|
|
18
|
+
1. search the right places
|
|
19
|
+
2. prefer authoritative sources
|
|
20
|
+
3. explain confidence and gaps clearly
|
|
15
21
|
|
|
16
|
-
|
|
17
|
-
2. a way to turn sources into a usable answer
|
|
22
|
+
`pi-research` does that without an external research service.
|
|
18
23
|
|
|
19
|
-
|
|
24
|
+
## Best practices
|
|
25
|
+
|
|
26
|
+
- use `fast` for short factual lookups
|
|
27
|
+
- use `deep` for comparisons, conflicts, or unclear questions
|
|
28
|
+
- use `code` for docs, repos, README-driven answers, and snippets
|
|
29
|
+
- use `academic` for paper-heavy topics
|
|
30
|
+
- set `options.requireAuthoritative: true` when source quality matters more than recall
|
|
31
|
+
- use `options.format: json` when you need machine-readable output
|
|
32
|
+
- add `options.files` when local docs matter
|
|
33
|
+
- keep questions specific; vague prompts create noisy retrieval
|
|
20
34
|
|
|
21
35
|
## What it does
|
|
22
36
|
|
|
@@ -26,7 +40,7 @@ Agents usually need two things to answer well:
|
|
|
26
40
|
- follows up when the first pass is not enough
|
|
27
41
|
- extracts code blocks for code-focused questions
|
|
28
42
|
- supports local files as additional sources
|
|
29
|
-
- returns
|
|
43
|
+
- returns structured results with citations, confidence, conflicts, and gaps
|
|
30
44
|
|
|
31
45
|
## What it is not
|
|
32
46
|
|
|
@@ -34,22 +48,6 @@ Agents usually need two things to answer well:
|
|
|
34
48
|
- not an offline knowledge base
|
|
35
49
|
- not a replacement for page navigation
|
|
36
50
|
|
|
37
|
-
## Install
|
|
38
|
-
|
|
39
|
-
### For Pi
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pi install npm:pi-research
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### For npm-based workflows
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
npm install pi-research
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
GitHub repository: https://github.com/endgegnerbert-tech/pi-research
|
|
52
|
-
|
|
53
51
|
## Quick start
|
|
54
52
|
|
|
55
53
|
```text
|
|
@@ -57,11 +55,11 @@ What are the trade-offs between B-trees and LSM-trees?
|
|
|
57
55
|
```
|
|
58
56
|
|
|
59
57
|
```text
|
|
60
|
-
|
|
58
|
+
Compare React Server Components with traditional SSR.
|
|
61
59
|
```
|
|
62
60
|
|
|
63
61
|
```text
|
|
64
|
-
|
|
62
|
+
How do I add retries to a Node.js fetch wrapper?
|
|
65
63
|
```
|
|
66
64
|
|
|
67
65
|
## Modes
|
|
@@ -73,6 +71,26 @@ Compare React Server Components with traditional SSR.
|
|
|
73
71
|
| `code` | docs, READMEs, repositories, and code snippets |
|
|
74
72
|
| `academic` | scholarly sources and paper-heavy topics |
|
|
75
73
|
|
|
74
|
+
## Output
|
|
75
|
+
|
|
76
|
+
The tool returns structured data including:
|
|
77
|
+
|
|
78
|
+
- `answer`
|
|
79
|
+
- `bullets`
|
|
80
|
+
- `sources`
|
|
81
|
+
- `citations`
|
|
82
|
+
- `codeBlocks`
|
|
83
|
+
- `confidence`
|
|
84
|
+
- `confidenceScore`
|
|
85
|
+
- `sufficient`
|
|
86
|
+
- `authoritativeSourcesFound`
|
|
87
|
+
- `openSubQuestions`
|
|
88
|
+
- `missingAspects`
|
|
89
|
+
- `conflictSummary`
|
|
90
|
+
- `unverifiedClaims`
|
|
91
|
+
- `sourceTypes`
|
|
92
|
+
- `meta`
|
|
93
|
+
|
|
76
94
|
## Public tool parameters
|
|
77
95
|
|
|
78
96
|
- `query` — research question to answer
|
|
@@ -133,44 +151,10 @@ options:
|
|
|
133
151
|
- ./docs/spec.md
|
|
134
152
|
```
|
|
135
153
|
|
|
136
|
-
## Output
|
|
137
|
-
|
|
138
|
-
The tool returns structured data including:
|
|
139
|
-
|
|
140
|
-
- `answer`
|
|
141
|
-
- `bullets`
|
|
142
|
-
- `sources`
|
|
143
|
-
- `citations`
|
|
144
|
-
- `codeBlocks`
|
|
145
|
-
- `confidence`
|
|
146
|
-
- `confidenceScore`
|
|
147
|
-
- `sufficient`
|
|
148
|
-
- `authoritativeSourcesFound`
|
|
149
|
-
- `openSubQuestions`
|
|
150
|
-
- `missingAspects`
|
|
151
|
-
- `conflictSummary`
|
|
152
|
-
- `unverifiedClaims`
|
|
153
|
-
- `sourceTypes`
|
|
154
|
-
- `meta`
|
|
155
|
-
|
|
156
|
-
## How it works
|
|
157
|
-
|
|
158
|
-
- **query-isolated caching**: repeated identical research can be skipped when the previous result was already sufficient
|
|
159
|
-
- **source scoring**: official docs, READMEs, papers, and local files are preferred over weak sources
|
|
160
|
-
- **follow-up planning**: unclear or conflicting results trigger another round of research
|
|
161
|
-
- **conflict detection**: opposing claims are surfaced explicitly
|
|
162
|
-
- **fact checking**: unsupported answer sentences are marked as unverified
|
|
163
|
-
- **local source input**: files can be added directly to the research context
|
|
164
|
-
|
|
165
|
-
## Limits
|
|
166
|
-
|
|
167
|
-
- it still depends on live web access for web research
|
|
168
|
-
- it does not browse pages like a human user
|
|
169
|
-
- it is not fully offline unless you only use local files
|
|
170
|
-
- it is not a browser interaction tool
|
|
171
|
-
|
|
172
154
|
## Domain packs
|
|
173
155
|
|
|
156
|
+
Built-in packs now steer routing and source selection:
|
|
157
|
+
|
|
174
158
|
- `web`
|
|
175
159
|
- `github`
|
|
176
160
|
- `security`
|
|
@@ -183,9 +167,14 @@ The tool returns structured data including:
|
|
|
183
167
|
|
|
184
168
|
## Community packs
|
|
185
169
|
|
|
186
|
-
You can add your own domain pack
|
|
170
|
+
You can add your own domain pack without changing the core research engine:
|
|
171
|
+
|
|
172
|
+
1. copy `lib/domains/template.js`
|
|
173
|
+
2. implement your domain-specific `run(question, options)` logic
|
|
174
|
+
3. register the pack in `lib/domains/index.js`
|
|
175
|
+
4. add eval cases in `eval/cases/<your-domain>/`
|
|
187
176
|
|
|
188
|
-
|
|
177
|
+
Starter example:
|
|
189
178
|
|
|
190
179
|
```js
|
|
191
180
|
export default {
|
|
@@ -209,16 +198,92 @@ export default {
|
|
|
209
198
|
|
|
210
199
|
Run `npm run eval` to execute the eval harness.
|
|
211
200
|
|
|
212
|
-
##
|
|
201
|
+
## Install
|
|
213
202
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
203
|
+
### Pi Coding Agent — extension
|
|
204
|
+
|
|
205
|
+
Existing Pi users should keep installing the main package:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
pi install npm:pi-research
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
This registers the Pi extension and keeps the public tool name `pi-research`.
|
|
212
|
+
|
|
213
|
+
### MCP-only — any agent
|
|
214
|
+
|
|
215
|
+
Run the MCP server directly from npm:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
npx -y pi-research
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
The MCP server identifies itself as `unblind-mcp`, but the tool it exposes is still named `pi-research`.
|
|
222
|
+
|
|
223
|
+
### Global MCP install
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
npm install -g pi-research
|
|
227
|
+
unblind-mcp
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
The global install also provides `pi-research` as a CLI alias for the same MCP server:
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
pi-research
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Local development
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
node ./mcp/server.js
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
Convenience script:
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
npm run --silent mcp
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Example MCP config:
|
|
249
|
+
|
|
250
|
+
```json
|
|
251
|
+
{
|
|
252
|
+
"mcpServers": {
|
|
253
|
+
"unblind-mcp": {
|
|
254
|
+
"command": "npx",
|
|
255
|
+
"args": ["-y", "pi-research"]
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
Local path config:
|
|
262
|
+
|
|
263
|
+
```json
|
|
264
|
+
{
|
|
265
|
+
"mcpServers": {
|
|
266
|
+
"unblind-mcp": {
|
|
267
|
+
"command": "node",
|
|
268
|
+
"args": ["/path/to/pi-research/mcp/server.js"]
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
Compatibility note: `mcp-server.js` remains as a deprecated root-level shim for older local configs.
|
|
275
|
+
|
|
276
|
+
### Future `unblind-mcp` package
|
|
277
|
+
|
|
278
|
+
A separate npm package named `unblind-mcp` can be added later as a tiny wrapper around `pi-research`. It should depend on `pi-research` and start the same MCP server, not duplicate the engine.
|
|
218
279
|
|
|
219
280
|
## Release notes
|
|
220
281
|
|
|
221
|
-
-
|
|
222
|
-
-
|
|
282
|
+
- Package name: `pi-research`
|
|
283
|
+
- Version: `1.2.0`
|
|
284
|
+
- Entry point: `extensions/pi-research.ts`
|
|
285
|
+
- MCP entry point: `mcp/server.js`
|
|
286
|
+
- MCP compatibility shim: `mcp-server.js`
|
|
287
|
+
- License: MIT
|
|
288
|
+
- Third-party notices: `THIRD_PARTY_NOTICES.md`
|
|
223
289
|
- GitHub: `https://github.com/endgegnerbert-tech/pi-research`
|
|
224
|
-
- Community packs: copy the template pack and register it in `lib/domains/index.js`
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Third-Party Notices
|
|
2
|
+
|
|
3
|
+
## Scrapling
|
|
4
|
+
|
|
5
|
+
This project includes ideas and/or adapted implementation details from Scrapling.
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2024, Karim shoair
|
|
8
|
+
|
|
9
|
+
BSD 3-Clause License
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
|
14
|
+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
|
16
|
+
|
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
package/index.js
CHANGED
|
@@ -2,12 +2,13 @@ import { Type } from "typebox";
|
|
|
2
2
|
|
|
3
3
|
import { compactResearchPayload, classifyQueryIntent, inferOfficialDocsSite } from "./lib/research.js";
|
|
4
4
|
import { clearResearchMemory, hashResearchQuery, setResearchMemory, shouldSkipResearch } from "./lib/research-memory.js";
|
|
5
|
+
import { logResearchEvent } from "./lib/local-logger.js";
|
|
5
6
|
import { runWebResearch } from "./lib/web-research.js";
|
|
6
7
|
|
|
7
8
|
const RESEARCH_STATE = new Map();
|
|
8
9
|
|
|
9
10
|
function buildWebResearchGuidance() {
|
|
10
|
-
return "Use pi-research for
|
|
11
|
+
return "Use pi-research for current facts, docs, best practices, comparisons, and citations. Search if unsure.";
|
|
11
12
|
}
|
|
12
13
|
|
|
13
14
|
function defaultMode(query) {
|
|
@@ -61,11 +62,17 @@ export default function webResearchExtension(pi) {
|
|
|
61
62
|
pi.on("before_agent_start", async (event) => {
|
|
62
63
|
RESEARCH_STATE.clear();
|
|
63
64
|
clearResearchMemory();
|
|
65
|
+
await logResearchEvent("agent_start", {
|
|
66
|
+
systemPrompt: event.systemPrompt,
|
|
67
|
+
guidance: buildWebResearchGuidance(),
|
|
68
|
+
});
|
|
64
69
|
return { systemPrompt: `${event.systemPrompt}\n\n${buildWebResearchGuidance()}` };
|
|
65
70
|
});
|
|
66
71
|
|
|
67
72
|
pi.on("tool_call", async (event) => {
|
|
68
73
|
if (event.toolName !== "pi-research") return;
|
|
74
|
+
event.input ||= {};
|
|
75
|
+
const originalInput = { ...event.input };
|
|
69
76
|
if (!event.input.mode) event.input.mode = defaultMode(event.input.query || "");
|
|
70
77
|
|
|
71
78
|
const queryHash = hashResearchQuery(event.input.query || "");
|
|
@@ -73,9 +80,26 @@ export default function webResearchExtension(pi) {
|
|
|
73
80
|
const mode = event.input.mode;
|
|
74
81
|
const isolate = Boolean(event.input.isolate || process.env.RESEARCH_ISOLATE === "1");
|
|
75
82
|
const force = Boolean(event.input.force);
|
|
83
|
+
let blocked = false;
|
|
84
|
+
let reason = "";
|
|
76
85
|
|
|
77
86
|
if (shouldSkipResearch({ queryHash, lastHash: state.lastHash, lastWasSufficient: state.lastSufficient, force, isolate })) {
|
|
78
|
-
|
|
87
|
+
blocked = true;
|
|
88
|
+
reason = "Recent pi-research result was already sufficient for this exact query.";
|
|
89
|
+
await logResearchEvent("tool_call", {
|
|
90
|
+
originalInput,
|
|
91
|
+
finalInput: { ...event.input },
|
|
92
|
+
queryHash,
|
|
93
|
+
blocked,
|
|
94
|
+
reason,
|
|
95
|
+
state: {
|
|
96
|
+
count: state.count,
|
|
97
|
+
lastHash: state.lastHash,
|
|
98
|
+
lastSufficient: state.lastSufficient,
|
|
99
|
+
fastRecoveryAllowed: state.fastRecoveryAllowed,
|
|
100
|
+
},
|
|
101
|
+
});
|
|
102
|
+
return { block: true, reason };
|
|
79
103
|
}
|
|
80
104
|
|
|
81
105
|
if (mode === "fast" && state.count === 1 && state.fastRecoveryAllowed && !force && !isolate) {
|
|
@@ -85,34 +109,58 @@ export default function webResearchExtension(pi) {
|
|
|
85
109
|
|
|
86
110
|
state.count += 1;
|
|
87
111
|
state.lastHash = queryHash;
|
|
112
|
+
await logResearchEvent("tool_call", {
|
|
113
|
+
originalInput,
|
|
114
|
+
finalInput: { ...event.input },
|
|
115
|
+
queryHash,
|
|
116
|
+
blocked,
|
|
117
|
+
state: {
|
|
118
|
+
count: state.count,
|
|
119
|
+
lastHash: state.lastHash,
|
|
120
|
+
lastSufficient: state.lastSufficient,
|
|
121
|
+
fastRecoveryAllowed: state.fastRecoveryAllowed,
|
|
122
|
+
},
|
|
123
|
+
});
|
|
88
124
|
});
|
|
89
125
|
|
|
90
126
|
pi.on("tool_result", async (event) => {
|
|
91
|
-
if (event.toolName === "pi-research"
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
127
|
+
if (event.toolName === "pi-research") {
|
|
128
|
+
if (!event.isError && event.details?.ok) {
|
|
129
|
+
const queryHash = hashResearchQuery(event.input?.query || "");
|
|
130
|
+
const state = getState(queryHash);
|
|
131
|
+
state.lastHash = queryHash;
|
|
132
|
+
state.lastSufficient = Boolean(event.details.sufficient);
|
|
133
|
+
const query = event.input?.query || "";
|
|
134
|
+
state.fastRecoveryAllowed = !event.details.sufficient
|
|
135
|
+
&& !event.details.authoritativeSourcesFound
|
|
136
|
+
&& ["best_practice", "temporal", "definition"].includes(classifyQueryIntent(query || ""));
|
|
137
|
+
setResearchMemory(`last:${queryHash}`, event.details);
|
|
138
|
+
}
|
|
139
|
+
await logResearchEvent("tool_result", {
|
|
140
|
+
toolName: event.toolName,
|
|
141
|
+
isError: event.isError,
|
|
142
|
+
input: event.input,
|
|
143
|
+
details: event.details,
|
|
144
|
+
});
|
|
101
145
|
}
|
|
102
146
|
return compactWebResearchToolResult(event) || undefined;
|
|
103
147
|
});
|
|
104
148
|
|
|
105
149
|
pi.registerTool({
|
|
106
150
|
name: "pi-research",
|
|
107
|
-
label: "
|
|
108
|
-
description: "
|
|
109
|
-
promptSnippet: "Use
|
|
110
|
-
promptGuidelines: [
|
|
151
|
+
label: "Web Research",
|
|
152
|
+
description: "Live sources, ranking, and cited answers.",
|
|
153
|
+
promptSnippet: "Use for current or uncertain answers with citations.",
|
|
154
|
+
promptGuidelines: [
|
|
155
|
+
"Use for current facts, docs, best practices, comparisons, and verification.",
|
|
156
|
+
"Search instead of guessing.",
|
|
157
|
+
"Pick fast, deep, code, or academic mode as needed.",
|
|
158
|
+
],
|
|
111
159
|
parameters: Type.Object({
|
|
112
|
-
query: Type.String({ description: "
|
|
113
|
-
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "
|
|
114
|
-
force: Type.Optional(Type.Boolean({ description: "
|
|
115
|
-
isolate: Type.Optional(Type.Boolean({ description: "
|
|
160
|
+
query: Type.String({ description: "Live web question" }),
|
|
161
|
+
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "Mode", default: "fast" })),
|
|
162
|
+
force: Type.Optional(Type.Boolean({ description: "Ignore cache" })),
|
|
163
|
+
isolate: Type.Optional(Type.Boolean({ description: "No cache reuse" })),
|
|
116
164
|
options: Type.Optional(Type.Object({
|
|
117
165
|
allowedSources: Type.Optional(Type.Array(Type.String())),
|
|
118
166
|
maxTurns: Type.Optional(Type.Number()),
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { appendFile, mkdir } from "node:fs/promises";
|
|
2
|
+
import { dirname, join } from "node:path";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
|
|
5
|
+
// Destination of the JSONL event log: the PI_RESEARCH_LOG_PATH override when it is
// set to a non-empty value, otherwise ~/.pi/logs/pi-research.jsonl.
const LOG_PATH =
  process.env.PI_RESEARCH_LOG_PATH
  || join(homedir(), ".pi", "logs", "pi-research.jsonl");

// Tail of the promise chain that log writes are appended to, so records reach the
// file in the order they were submitted.
let writeChain = Promise.resolve();
|
|
7
|
+
|
|
8
|
+
/**
 * Converts an arbitrary value into a structure that `JSON.stringify` can serialize.
 *
 * - primitives pass through; bigint becomes its decimal string; functions become
 *   `"[Function name]"`; symbols become their string form
 * - `Date` becomes an ISO string (or `"Invalid Date"`), `RegExp` its source form,
 *   `Error` a `{ name, message, stack }` record
 * - arrays, `Set`s and `Map`s become arrays (a `Map` becomes `[key, value]` pairs);
 *   other objects are copied by their own enumerable string keys
 * - a container that is its own ancestor is replaced by `"[Circular]"`, and
 *   containers nested 6 or more levels deep by `"[MaxDepth]"`
 *
 * @param {unknown} value - value to convert
 * @param {number} [depth=0] - current nesting level (internal)
 * @param {WeakSet<object>} [seen] - containers on the current ancestor path (internal)
 * @returns {unknown} JSON-serializable copy of `value`
 */
function sanitize(value, depth = 0, seen = new WeakSet()) {
  if (value === null || value === undefined) return value;

  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return value;
    case "bigint":
      return value.toString();
    case "function":
      return `[Function ${value.name || "anonymous"}]`;
    case "object":
      break;
    default:
      // Only symbols reach this branch.
      return String(value);
  }

  if (value instanceof Date) {
    // toISOString() throws a RangeError on an invalid date; logging must not throw.
    return Number.isNaN(value.getTime()) ? "Invalid Date" : value.toISOString();
  }
  if (value instanceof RegExp) return value.toString();
  if (value instanceof Error) {
    return { name: value.name, message: value.message, stack: value.stack };
  }

  // Every container type (arrays included) takes part in cycle detection, so a
  // self-referencing array is reported instead of being expanded to the depth cap.
  if (seen.has(value)) return "[Circular]";
  if (depth >= 6) return "[MaxDepth]";

  seen.add(value);
  try {
    const recurse = (item) => sanitize(item, depth + 1, seen);

    if (Array.isArray(value)) return value.map(recurse);
    if (value instanceof Set) return Array.from(value, recurse);
    if (value instanceof Map) {
      return Array.from(value, ([key, item]) => [recurse(key), recurse(item)]);
    }

    const output = {};
    for (const [key, item] of Object.entries(value)) output[key] = recurse(item);
    return output;
  } finally {
    // Track ancestors only: the same object may appear twice as a sibling.
    seen.delete(value);
  }
}
|
|
33
|
+
|
|
34
|
+
/**
 * Reports where research events are logged.
 *
 * @returns {string} the module-level `LOG_PATH` value
 */
export function getResearchLogPath() {
  const target = LOG_PATH;
  return target;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Appends one event record as a JSON line to the research log (best effort).
 *
 * The record carries a timestamp, the process id, the working directory, the
 * event `type` and a sanitized copy of `data`. Writes are queued on a shared
 * promise chain so that lines are appended in call order. Failures never
 * propagate: neither building the record (for example a throwing getter in
 * `data`, or `process.cwd()` failing) nor writing it rejects the returned promise.
 *
 * @param {string} type - event name stored in the record
 * @param {object} [data={}] - payload; passed through `sanitize` before serializing
 * @returns {Promise<void>} settles once this record and all earlier ones were attempted
 */
export async function logResearchEvent(type, data = {}) {
  let line;
  try {
    const record = {
      ts: new Date().toISOString(),
      pid: process.pid,
      cwd: process.cwd(),
      type,
      data: sanitize(data),
    };
    line = `${JSON.stringify(record)}\n`;
  } catch {
    // Logging is best effort: an unserializable payload must not break the caller.
    return writeChain;
  }

  writeChain = writeChain
    .then(async () => {
      await mkdir(dirname(LOG_PATH), { recursive: true });
      await appendFile(LOG_PATH, line, "utf8");
    })
    .catch(() => {});
  return writeChain;
}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { fileURLToPath } from "node:url";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
|
|
5
|
+
// Location of the "Scrapling" directory next to this module's parent folder; it is
// handed to the Python helper as an additional import root.
const SCRAPLING_ROOT = fileURLToPath(new URL("../Scrapling", import.meta.url));

// Case-insensitive markers of anti-bot or challenge pages.
const BLOCKED_PATTERNS = [
  /cloudflare/i, /turnstile/i, /captcha/i,
  /please enable cookies/i, /bot detection/i,
  /verify you are human/i, /security check/i,
];

// Case-insensitive markers of pages whose content is rendered client-side.
const DYNAMIC_PATTERNS = [
  /__next_data__/i, /__nuxt__/i, /data-reactroot/i, /hydrat/i,
  /window\.__INITIAL_STATE__/i,
  /id=["']app["']/i, /id=["']root["']/i,
];
|
|
24
|
+
|
|
25
|
+
/**
 * Reduces an HTML string to its visible text.
 *
 * `<script>`, `<style>` and `<noscript>` elements are dropped together with their
 * contents, the remaining tags are removed, non-breaking-space entities are turned
 * into spaces, and whitespace runs are collapsed to single spaces.
 *
 * @param {unknown} value - HTML markup; falsy values are treated as an empty string
 * @returns {string} trimmed plain text
 */
function stripHtml(value) {
  return String(value || "")
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    // Literal U+00A0 characters are already covered by \s below; the entity
    // spellings are not, and would otherwise inflate the measured text length.
    .replace(/&(?:nbsp|#160|#xa0);/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Classifies the outcome of a plain HTTP page fetch to decide whether, and how,
 * the page should be re-fetched.
 *
 * @param {object} [attempt]
 * @param {number} [attempt.status=200] - HTTP status of the response
 * @param {string} [attempt.body=""] - raw response body
 * @param {string} [attempt.contentType=""] - value of the Content-Type header
 * @param {string} [attempt.url=""] - final URL of the response
 * @returns {{blocked: boolean, dynamic: boolean, weak: boolean, mode: string, plainLength: number}}
 *   `blocked`: status 403/429, or an anti-bot marker in the body/URL while the
 *   visible text is under 1000 characters;
 *   `dynamic`: not blocked, and either a client-rendering marker is present or the
 *   page has a script tag with under 400 characters of visible text;
 *   `weak`: blocked, under 300 characters of text, or a non-text content type with
 *   under 500 characters;
 *   `mode`: `"stealthy"` when blocked, `"dynamic"` when dynamic, else `"async"`;
 *   `plainLength`: length of the visible text.
 */
export function assessPageAttempt({ status = 200, body = "", contentType = "", url = "" } = {}) {
  const text = String(body || "");
  const plain = stripHtml(text);
  const haystack = `${text}\n${url}`.toLowerCase();

  const antiBotSignal = BLOCKED_PATTERNS.some((pattern) => pattern.test(haystack));
  const blocked = status === 403 || status === 429 || (antiBotSignal && plain.length < 1000);

  // Tag names are case-insensitive in HTML, so look for the script tag in the
  // lowercased copy rather than in the raw body.
  const scriptOnlyShell = haystack.includes("<script") && plain.length < 400;
  const dynamic = !blocked
    && (DYNAMIC_PATTERNS.some((pattern) => pattern.test(haystack)) || scriptOnlyShell);

  const textualType = /text\/(html|plain)/i.test(contentType);
  const weak = blocked || plain.length < 300 || (!textualType && plain.length < 500);

  let mode = "async";
  if (blocked) mode = "stealthy";
  else if (dynamic) mode = "dynamic";

  return { blocked, dynamic, weak, mode, plainLength: plain.length };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Shortcut that returns only the `mode` field of `assessPageAttempt(input)`.
 *
 * @param {object} [input] - forwarded unchanged to `assessPageAttempt`
 * @returns {string} the fetch mode chosen by the assessment
 */
export function chooseScraplingMode(input) {
  const { mode } = assessPageAttempt(input);
  return mode;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Returns the source of the Python helper that performs one Scrapling fetch.
 *
 * The helper is run as `python -c <script> <root> <mode> <url> <payload-json>`.
 * On success it prints one JSON object
 * `{ ok: true, url, status, contentType, body, headers }` to stdout; on failure
 * it prints `{ ok: false, error, type }` and exits with status 1.
 *
 * Differences between the fetchers that the helper accounts for:
 * - the `timeout` in the payload is in milliseconds; it is converted to seconds
 *   for `AsyncFetcher` and passed through unchanged to the browser-based fetchers
 * - the browser-based fetchers are awaited through `async_fetch`, because the
 *   helper runs inside `asyncio.run` and their synchronous `fetch` cannot be used
 *   from a running event loop
 *
 * @returns {string} Python source code
 */
function pythonScript() {
  return String.raw`
import asyncio
import json
import sys

root = sys.argv[1]
mode = sys.argv[2]
url = sys.argv[3]
payload = json.loads(sys.argv[4])

sys.path.insert(0, root)

async def main():
    from scrapling.fetchers import AsyncFetcher, DynamicFetcher, StealthyFetcher

    timeout_ms = payload.get("timeout")

    if mode == "async":
        # AsyncFetcher expects seconds; the caller supplies milliseconds.
        kwargs = {"timeout": timeout_ms / 1000} if timeout_ms else {}
        response = await AsyncFetcher.get(url, **kwargs)
    else:
        # Browser-based fetchers expect milliseconds and must be awaited here.
        kwargs = {"timeout": timeout_ms} if timeout_ms else {}
        fetcher = DynamicFetcher if mode == "dynamic" else StealthyFetcher
        response = await fetcher.async_fetch(url, **kwargs)

    headers = {}
    raw_headers = getattr(response, "headers", None)
    if hasattr(raw_headers, "items"):
        headers = dict(raw_headers.items())
    else:
        try:
            headers = dict(raw_headers or {})
        except Exception:
            headers = {}

    # Header names are case-insensitive; do not rely on a lowercase key.
    content_type = ""
    for key, value in headers.items():
        if str(key).lower() == "content-type":
            content_type = value
            break

    body = getattr(response, "body", None)
    if body is None:
        candidate = getattr(response, "text", None)
        body = candidate() if callable(candidate) else candidate

    if isinstance(body, bytes):
        body = body.decode("utf-8", "replace")
    elif not isinstance(body, str):
        body = str(body or "")

    out = {
        "ok": True,
        "url": getattr(response, "url", url),
        "status": getattr(response, "status", 200),
        "contentType": content_type,
        "body": body,
        "headers": headers,
    }
    print(json.dumps(out, default=str))

try:
    asyncio.run(main())
except Exception as exc:
    print(json.dumps({"ok": False, "error": str(exc), "type": exc.__class__.__name__}))
    sys.exit(1)
`;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Fetches a page through the Python Scrapling helper in a child process.
 *
 * The interpreter is `process.env.PYTHON` or `python3`. The helper script, the
 * Scrapling root, the mode, the URL and a JSON payload with the timeout
 * (`config.pageTimeoutMs`, default 30000) are passed as arguments; no shell is
 * involved. Every failure mode resolves to `null` rather than rejecting.
 *
 * @param {string} url - page to fetch
 * @param {string} mode - fetch mode handed to the helper; a falsy value skips the fetch
 * @param {AbortSignal} [signal] - aborting kills the child and resolves to `null`
 * @param {{pageTimeoutMs?: number}} [config={}] - fetch settings
 * @returns {Promise<object|null>} the helper's parsed JSON result when it exits
 *   with status 0 and reports `ok`, otherwise `null`
 */
export async function fetchWithScrapling(url, mode, signal, config = {}) {
  if (!mode) return null;
  // Nothing to do for an already-cancelled request; avoid starting a process.
  if (signal?.aborted) return null;

  return await new Promise((resolve) => {
    let settled = false;
    let onAbort = null;

    // Resolves exactly once and detaches the abort listener, so a long-lived
    // signal shared by many fetches does not accumulate listeners.
    const finish = (value) => {
      if (settled) return;
      settled = true;
      if (onAbort) signal.removeEventListener("abort", onAbort);
      resolve(signal?.aborted ? null : value);
    };

    let child;
    try {
      const payload = JSON.stringify({ timeout: config.pageTimeoutMs || 30000 });
      child = spawn(process.env.PYTHON || "python3", ["-c", pythonScript(), SCRAPLING_ROOT, mode, url, payload], {
        env: {
          ...process.env,
          PYTHONPATH: [SCRAPLING_ROOT, process.env.PYTHONPATH].filter(Boolean).join(path.delimiter),
        },
        stdio: ["ignore", "pipe", "pipe"],
      });
    } catch {
      // spawn() throws synchronously on invalid arguments.
      finish(null);
      return;
    }

    let stdout = "";
    // Decode as a stream so multi-byte characters split across chunks survive.
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    // The helper reports errors as JSON on stdout; stderr is only drained so the
    // child can never block on a full pipe.
    child.stderr?.resume();

    child.on("error", () => finish(null));
    child.on("close", (code) => {
      if (code !== 0) {
        finish(null);
        return;
      }
      try {
        const parsed = JSON.parse(stdout.trim() || "{}");
        finish(parsed.ok ? parsed : null);
      } catch {
        finish(null);
      }
    });

    if (signal) {
      onAbort = () => {
        child.kill("SIGKILL");
        finish(null);
      };
      signal.addEventListener("abort", onAbort, { once: true });
    }
  });
}
|
|
175
|
+
|
|
176
|
+
// Default adapter object bundling this module's three entry points under their own names.
export const pageFetchAdapter = { assessPageAttempt, chooseScraplingMode, fetchWithScrapling };
|
package/lib/web-research.js
CHANGED
|
@@ -35,6 +35,7 @@ import {
|
|
|
35
35
|
scoreSourceEntry,
|
|
36
36
|
selectRelevantChunks,
|
|
37
37
|
} from "./research.js";
|
|
38
|
+
import { pageFetchAdapter } from "./page-fetch-adapter.js";
|
|
38
39
|
import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
|
|
39
40
|
import { planResearch } from "./planner.js";
|
|
40
41
|
import {
|
|
@@ -45,6 +46,7 @@ import {
|
|
|
45
46
|
setResearchMemory,
|
|
46
47
|
writeCachedResult,
|
|
47
48
|
} from "./research-memory.js";
|
|
49
|
+
import { logResearchEvent } from "./local-logger.js";
|
|
48
50
|
|
|
49
51
|
const USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";
|
|
50
52
|
const MIN_PAGE_TEXT = 300;
|
|
@@ -262,7 +264,8 @@ async function searchArxiv(query, signal, config) {
|
|
|
262
264
|
const published = entry.match(/<published>([^<]+)<\/published>/)?.[1]?.slice(0, 10);
|
|
263
265
|
return sourceFromPaper(title, url, summary, published);
|
|
264
266
|
}).filter((item) => item.url && item.title);
|
|
265
|
-
} catch {
|
|
267
|
+
} catch (error) {
|
|
268
|
+
await logResearchEvent("search_error", { provider: "arxiv", query, error });
|
|
266
269
|
return [];
|
|
267
270
|
}
|
|
268
271
|
}
|
|
@@ -272,7 +275,8 @@ async function searchSemanticScholar(query, signal, config) {
|
|
|
272
275
|
const response = await fetchTextWithRetry(`https://api.semanticscholar.org/graph/v1/paper/search?query=${encodeURIComponent(query)}&limit=${config.resultsPerQuery}&fields=title,abstract,url,year`, signal, 2, {}, config.pageTimeoutMs);
|
|
273
276
|
const data = await response.json();
|
|
274
277
|
return (data?.data || []).map((item) => sourceFromPaper(item.title, item.url || `https://www.semanticscholar.org/search?q=${encodeURIComponent(item.title)}`, item.abstract || "", item.year ? `${item.year}-01-01` : null)).filter((item) => item.title);
|
|
275
|
-
} catch {
|
|
278
|
+
} catch (error) {
|
|
279
|
+
await logResearchEvent("search_error", { provider: "semanticscholar", query, error });
|
|
276
280
|
return [];
|
|
277
281
|
}
|
|
278
282
|
}
|
|
@@ -287,7 +291,8 @@ async function searchCrossref(query, signal, config) {
|
|
|
287
291
|
const publishDate = dateParts.length ? `${String(dateParts[0]).padStart(4, "0")}-${String(dateParts[1] || 1).padStart(2, "0")}-${String(dateParts[2] || 1).padStart(2, "0")}` : null;
|
|
288
292
|
return sourceFromPaper(item.title?.[0] || "", doi, String(item.abstract || "").replace(/<[^>]+>/g, " "), publishDate);
|
|
289
293
|
}).filter((item) => item.url && item.title);
|
|
290
|
-
} catch {
|
|
294
|
+
} catch (error) {
|
|
295
|
+
await logResearchEvent("search_error", { provider: "crossref", query, error });
|
|
291
296
|
return [];
|
|
292
297
|
}
|
|
293
298
|
}
|
|
@@ -346,7 +351,11 @@ function shouldSkipUrl(url) {
|
|
|
346
351
|
}
|
|
347
352
|
|
|
348
353
|
function shouldUseJinaFirst(url) {
|
|
349
|
-
|
|
354
|
+
try {
|
|
355
|
+
return /(^|\.)medium\.com$|(^|\.)dev\.to$|(^|\.)substack\.com$/i.test(new URL(url).hostname);
|
|
356
|
+
} catch {
|
|
357
|
+
return false;
|
|
358
|
+
}
|
|
350
359
|
}
|
|
351
360
|
|
|
352
361
|
function pageFromText(title, url, text, config, extra = {}) {
|
|
@@ -377,7 +386,11 @@ function withinTimeframe(page, config) {
|
|
|
377
386
|
}
|
|
378
387
|
|
|
379
388
|
export async function fetchPageSource(url, signal, config = getResearchConfig()) {
|
|
380
|
-
if (shouldSkipUrl(url))
|
|
389
|
+
if (shouldSkipUrl(url)) {
|
|
390
|
+
await logResearchEvent("fetch_skip", { url, reason: "login_or_account_url" });
|
|
391
|
+
return null;
|
|
392
|
+
}
|
|
393
|
+
const adapter = config.fetchAdapter || pageFetchAdapter;
|
|
381
394
|
const cacheKey = `${normalizeUrl(url)}::${config.pageTextLimit}::${JSON.stringify({
|
|
382
395
|
preferRecent: config.preferRecent || false,
|
|
383
396
|
minYear: config.minYear || "",
|
|
@@ -385,10 +398,20 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
385
398
|
useJinaFallback: Boolean(config.useJinaFallback),
|
|
386
399
|
})}`;
|
|
387
400
|
const cached = config.isolate ? null : getCacheValue(pageCache, cacheKey);
|
|
388
|
-
if (cached)
|
|
401
|
+
if (cached) {
|
|
402
|
+
await logResearchEvent("fetch_cache_hit", { url, cacheKey, title: cached.title, textLength: cached.text?.length || 0 });
|
|
403
|
+
return cached;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
await logResearchEvent("fetch_start", { url, cacheKey, config: { isolate: config.isolate, useJinaFallback: Boolean(config.useJinaFallback), pageTextLimit: config.pageTextLimit } });
|
|
407
|
+
|
|
389
408
|
if (shouldUseJinaFirst(url)) {
|
|
390
409
|
const first = await fetchJinaPageSource(url, signal, config);
|
|
391
|
-
|
|
410
|
+
if (first && withinTimeframe(first, config)) {
|
|
411
|
+
const page = config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
|
|
412
|
+
await logResearchEvent("fetch_end", { url, via: "jina_first", success: Boolean(page), page: page ? { title: page.title, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 } : null });
|
|
413
|
+
return page;
|
|
414
|
+
}
|
|
392
415
|
}
|
|
393
416
|
|
|
394
417
|
try {
|
|
@@ -398,22 +421,48 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
398
421
|
}, config.pageTimeoutMs);
|
|
399
422
|
|
|
400
423
|
const contentType = response.headers.get("content-type") || "";
|
|
401
|
-
if (!contentType.includes("text/html") && !contentType.includes("text/plain"))
|
|
424
|
+
if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
|
|
425
|
+
await logResearchEvent("fetch_end", { url, success: false, reason: "unsupported_content_type", contentType });
|
|
426
|
+
return null;
|
|
427
|
+
}
|
|
402
428
|
|
|
403
429
|
const body = await response.text();
|
|
404
430
|
const snapshot = extractPageSnapshot(body, response.url || url);
|
|
405
|
-
|
|
431
|
+
let page = pageFromText(snapshot.title, snapshot.url, snapshot.text, config, {
|
|
406
432
|
publishDate: extractPublishDate(body),
|
|
407
433
|
sourceType: classifySourceType(snapshot.url, snapshot.title),
|
|
408
434
|
codeBlocks: snapshot.codeBlocks,
|
|
409
435
|
});
|
|
410
436
|
|
|
437
|
+
const assessment = adapter.assessPageAttempt?.({
|
|
438
|
+
status: response.status ?? 200,
|
|
439
|
+
body,
|
|
440
|
+
contentType,
|
|
441
|
+
url: response.url || url,
|
|
442
|
+
});
|
|
443
|
+
|
|
444
|
+
if ((!page && assessment?.weak) || assessment?.dynamic || assessment?.blocked) {
|
|
445
|
+
const scrapling = await adapter.fetchWithScrapling?.(url, assessment.mode, signal, config);
|
|
446
|
+
if (scrapling?.body) {
|
|
447
|
+
const scraplingSnapshot = extractPageSnapshot(scrapling.body, scrapling.url || url);
|
|
448
|
+
page = pageFromText(scraplingSnapshot.title, scraplingSnapshot.url, scraplingSnapshot.text, config, {
|
|
449
|
+
publishDate: extractPublishDate(scrapling.body),
|
|
450
|
+
sourceType: classifySourceType(scraplingSnapshot.url, scraplingSnapshot.title),
|
|
451
|
+
codeBlocks: scraplingSnapshot.codeBlocks,
|
|
452
|
+
});
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
|
|
411
456
|
const resolved = page || await fetchJinaPageSource(url, signal, config);
|
|
412
457
|
const finalPage = resolved && withinTimeframe(resolved, config) ? resolved : null;
|
|
413
|
-
|
|
414
|
-
|
|
458
|
+
const stored = config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS);
|
|
459
|
+
await logResearchEvent("fetch_end", { url, success: Boolean(stored), page: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
|
|
460
|
+
return stored;
|
|
461
|
+
} catch (error) {
|
|
415
462
|
const fallback = await fetchJinaPageSource(url, signal, config);
|
|
416
|
-
|
|
463
|
+
const stored = config.isolate ? fallback : setCacheValue(pageCache, cacheKey, fallback, PAGE_CACHE_TTL_MS);
|
|
464
|
+
await logResearchEvent("fetch_error", { url, error, fallback: stored ? { title: stored.title, sourceType: stored.sourceType, publishDate: stored.publishDate, textLength: stored.text?.length || 0 } : null });
|
|
465
|
+
return stored;
|
|
417
466
|
}
|
|
418
467
|
}
|
|
419
468
|
|
|
@@ -427,9 +476,10 @@ async function readLocalFiles(paths, config) {
|
|
|
427
476
|
publishDate: null,
|
|
428
477
|
local: true,
|
|
429
478
|
});
|
|
479
|
+
await logResearchEvent("local_file_read", { path, success: Boolean(page), textLength: text.length, page: page ? { title: page.title, textLength: page.text.length } : null });
|
|
430
480
|
if (page) pages.push(page);
|
|
431
|
-
} catch {
|
|
432
|
-
|
|
481
|
+
} catch (error) {
|
|
482
|
+
await logResearchEvent("local_file_error", { path, error });
|
|
433
483
|
}
|
|
434
484
|
}
|
|
435
485
|
return pages;
|
|
@@ -455,6 +505,7 @@ function fallbackSynthesis(query, pages) {
|
|
|
455
505
|
}
|
|
456
506
|
|
|
457
507
|
export async function synthesizeResearch(query, pages, ctx, signal) {
|
|
508
|
+
await logResearchEvent("synthesis_start", { query, pages: pages.map((page) => ({ title: page.title, url: page.url, sourceType: page.sourceType, textLength: page.text?.length || 0 })) });
|
|
458
509
|
const prompt = [
|
|
459
510
|
"You are a concise research synthesizer.",
|
|
460
511
|
"Answer only from the provided sources.",
|
|
@@ -489,19 +540,23 @@ export async function synthesizeResearch(query, pages, ctx, signal) {
|
|
|
489
540
|
score: typeof pages[id - 1].score === "number" ? pages[id - 1].score : scoreSourceEntry(pages[id - 1], query).total,
|
|
490
541
|
authoritative: typeof pages[id - 1].authoritative === "boolean" ? pages[id - 1].authoritative : scoreSourceEntry(pages[id - 1], query).authoritative,
|
|
491
542
|
})), query);
|
|
492
|
-
|
|
543
|
+
const result = {
|
|
493
544
|
answer: parsed.answer.trim(),
|
|
494
545
|
bullets: parsed.bullets.map((item) => String(item).trim()).filter(Boolean).slice(0, 5),
|
|
495
546
|
sources,
|
|
496
547
|
citations: Array.isArray(parsed.citations) ? parsed.citations.slice(0, 8) : sources.map((source) => ({ text: source.title, sourceIndex: source.number || 0 })),
|
|
497
548
|
};
|
|
549
|
+
await logResearchEvent("synthesis_end", { query, result });
|
|
550
|
+
return result;
|
|
498
551
|
}
|
|
499
552
|
}
|
|
500
553
|
} catch {
|
|
501
554
|
// fall through
|
|
502
555
|
}
|
|
503
556
|
|
|
504
|
-
|
|
557
|
+
const fallback = fallbackSynthesis(query, pages);
|
|
558
|
+
await logResearchEvent("synthesis_end", { query, result: fallback, fallback: true });
|
|
559
|
+
return fallback;
|
|
505
560
|
}
|
|
506
561
|
|
|
507
562
|
function planSubqueries(rootQuery, currentQuery, config, sufficiency) {
|
|
@@ -536,17 +591,28 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
536
591
|
const config = getResearchConfig(typeof mode === "object" ? { ...mode, domain } : { mode, domain });
|
|
537
592
|
const cacheKey = modeCacheKey(query, config);
|
|
538
593
|
|
|
594
|
+
await logResearchEvent("research_start", { query, mode: config.mode, domain, config });
|
|
595
|
+
|
|
539
596
|
if (!config.isolate && !config.force) {
|
|
540
597
|
const memoryHit = getResearchMemory(cacheKey);
|
|
541
|
-
if (memoryHit)
|
|
598
|
+
if (memoryHit) {
|
|
599
|
+
await logResearchEvent("research_cache_hit", { query, cacheKey, source: "memory" });
|
|
600
|
+
await logResearchEvent("research_end", { ...memoryHit, cacheHit: true, cacheSource: "memory" });
|
|
601
|
+
return memoryHit;
|
|
602
|
+
}
|
|
542
603
|
const persistentHit = readCachedResult(cacheKey);
|
|
543
604
|
if (persistentHit) {
|
|
544
605
|
setResearchMemory(cacheKey, persistentHit);
|
|
606
|
+
await logResearchEvent("research_cache_hit", { query, cacheKey, source: "disk" });
|
|
607
|
+
await logResearchEvent("research_end", { ...persistentHit, cacheHit: true, cacheSource: "disk" });
|
|
545
608
|
return persistentHit;
|
|
546
609
|
}
|
|
547
610
|
}
|
|
548
611
|
|
|
549
|
-
const emit = (stage, text) =>
|
|
612
|
+
const emit = (stage, text) => {
|
|
613
|
+
void logResearchEvent("pipeline_stage", { query, stage, text });
|
|
614
|
+
return onUpdate?.({ content: [{ type: "text", text: `[pipeline:${stage}] ${text}` }] });
|
|
615
|
+
};
|
|
550
616
|
const startedAt = Date.now();
|
|
551
617
|
const seenUrls = new Set();
|
|
552
618
|
const seenContentHashes = new Set();
|
|
@@ -585,6 +651,11 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
585
651
|
emit("search", `Searching ${queriesThisTurn.length} queries...`);
|
|
586
652
|
|
|
587
653
|
const searchGroups = await Promise.all(queriesThisTurn.map((subquery) => searchDuckDuckGo(subquery, signal, config)));
|
|
654
|
+
await logResearchEvent("search_results", {
|
|
655
|
+
query,
|
|
656
|
+
queries: queriesThisTurn,
|
|
657
|
+
results: searchGroups.flat().map((result) => ({ title: result.title, url: result.url, snippet: result.snippet, sourceType: result.sourceType, publishDate: result.publishDate })),
|
|
658
|
+
});
|
|
588
659
|
const results = rankSearchResults(searchGroups.flat(), query, config.maxPages * 2, config)
|
|
589
660
|
.filter((result) => {
|
|
590
661
|
const key = normalizeUrl(result.url);
|
|
@@ -596,6 +667,11 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
596
667
|
|
|
597
668
|
emit("fetch", `Reading ${results.length} sources...`);
|
|
598
669
|
const pageCandidates = await Promise.all(results.map((result) => fetchPageSource(result.url, signal, config)));
|
|
670
|
+
await logResearchEvent("page_fetch_results", {
|
|
671
|
+
query,
|
|
672
|
+
urls: results.map((result) => result.url),
|
|
673
|
+
pages: pageCandidates.filter(Boolean).map((page) => ({ title: page.title, url: page.url, sourceType: page.sourceType, publishDate: page.publishDate, textLength: page.text?.length || 0 })),
|
|
674
|
+
});
|
|
599
675
|
const rankedPages = rankFetchedPages(pageCandidates.filter(Boolean).map((page) => {
|
|
600
676
|
const scored = scoreSourceEntry(page, query);
|
|
601
677
|
return {
|
|
@@ -647,7 +723,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
647
723
|
}
|
|
648
724
|
|
|
649
725
|
if (mergedPages.length === 0) {
|
|
650
|
-
|
|
726
|
+
const emptyResult = {
|
|
651
727
|
ok: false,
|
|
652
728
|
action: "web_research",
|
|
653
729
|
query,
|
|
@@ -657,6 +733,8 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
657
733
|
openSubQuestions: buildFallbackQueries(query),
|
|
658
734
|
error: "No readable web sources were retrieved.",
|
|
659
735
|
};
|
|
736
|
+
await logResearchEvent("research_end", emptyResult);
|
|
737
|
+
return emptyResult;
|
|
660
738
|
}
|
|
661
739
|
|
|
662
740
|
emit("synthesis", `Synthesizing ${mergedPages.length} sources...`);
|
|
@@ -726,6 +804,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
726
804
|
|
|
727
805
|
setResearchMemory(cacheKey, result);
|
|
728
806
|
writeCachedResult(cacheKey, result, config.cacheTtlMs);
|
|
807
|
+
await logResearchEvent("research_end", result);
|
|
729
808
|
return result;
|
|
730
809
|
}
|
|
731
810
|
|
package/mcp/server.js
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { realpathSync } from "node:fs";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
|
|
6
|
+
import pkg from "../package.json" with { type: "json" };
|
|
7
|
+
import { classifyQueryIntent } from "../lib/research.js";
|
|
8
|
+
import { runWebResearch } from "../lib/web-research.js";
|
|
9
|
+
|
|
10
|
+
const SERVER_NAME = "unblind-mcp";
|
|
11
|
+
const TOOL_NAME = "pi-research";
|
|
12
|
+
|
|
13
|
+
/**
 * Produce the short usage hint that is sent to clients as the `instructions`
 * field of the initialize result.
 *
 * @returns {string} Fixed guidance text.
 */
function buildWebResearchGuidance() {
  const guidance = "Use pi-research for current facts, docs, best practices, comparisons, and citations. Search if unsure.";
  return guidance;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Pick the research mode to use when the caller did not specify one.
 *
 * @param {string} query - The user's question, passed to `classifyQueryIntent`.
 * @returns {"deep"|"academic"|"fast"} `"deep"` for comparison intents,
 *   `"academic"` for academic intent, otherwise `"fast"`.
 */
function defaultMode(query) {
  switch (classifyQueryIntent(query)) {
    case "comparison":
    case "comparative":
      return "deep";
    case "academic":
      return "academic";
    default:
      return "fast";
  }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Build the MCP tool descriptor (name, description, JSON input schema) that is
 * returned from `tools/list`.
 *
 * The schema is assembled inside-out from named pieces; key order is kept
 * stable because the object is serialized as-is.
 *
 * @returns {{name: string, description: string, inputSchema: object}}
 */
function buildToolDefinition() {
  const deepResearchConfig = {
    type: "object",
    properties: {
      depth: { type: "number", enum: [1, 2, 3] },
      breadth: { type: "number", enum: [2, 3, 4] },
      concurrency: { type: "number", enum: [1, 2, 3, 4] },
    },
    additionalProperties: false,
  };

  const options = {
    type: "object",
    properties: {
      allowedSources: { type: "array", items: { type: "string" } },
      maxTurns: { type: "number" },
      maxSites: { type: "number" },
      requireAuthoritative: { type: "boolean" },
      minYear: { type: "number" },
      maxYear: { type: "number" },
      preferRecent: { type: "boolean" },
      files: { type: "array", items: { type: "string" } },
      format: {
        type: "string",
        enum: ["markdown", "json", "table", "latex"],
      },
      deepResearchConfig,
    },
    additionalProperties: false,
  };

  const inputSchema = {
    type: "object",
    properties: {
      query: { type: "string", description: "Live web question" },
      mode: {
        type: "string",
        enum: ["fast", "deep", "code", "academic"],
        description: "Mode",
      },
      force: { type: "boolean", description: "Ignore cache" },
      isolate: { type: "boolean", description: "No cache reuse" },
      options,
    },
    required: ["query"],
    additionalProperties: false,
  };

  return {
    name: TOOL_NAME,
    description: "Live sources, ranking, and cited answers.",
    inputSchema,
  };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Build the `result` payload for an MCP `initialize` request.
 *
 * @param {string} [protocolVersion] - Version requested by the client; echoed
 *   back when truthy, otherwise `"2025-03-26"` is reported.
 * @returns {{protocolVersion: string, capabilities: {tools: object},
 *   serverInfo: {name: string, version: string}, instructions: string}}
 */
function buildInitializeResult(protocolVersion) {
  const negotiatedVersion = protocolVersion ? protocolVersion : "2025-03-26";
  const capabilities = { tools: {} };
  const serverInfo = { name: SERVER_NAME, version: pkg.version };

  return {
    protocolVersion: negotiatedVersion,
    capabilities,
    serverInfo,
    instructions: buildWebResearchGuidance(),
  };
}
|
|
86
|
+
|
|
87
|
+
/**
 * Wrap a research payload in the MCP tool-result envelope.
 *
 * - A successful payload (`payload.ok` truthy) with a non-empty string
 *   `contentText` is rendered using that text; everything else is rendered as
 *   pretty-printed JSON.
 * - `structuredContent` is attached only when the payload is a plain object,
 *   so a missing or primitive payload no longer yields `text: undefined` or a
 *   non-object `structuredContent`.
 *
 * @param {object|null|undefined} payload - Result returned by the research run.
 * @returns {{content: Array<{type: "text", text: string}>, structuredContent?: object, isError: boolean}}
 */
function buildToolResult(payload) {
  const isRecord = payload !== null && typeof payload === "object" && !Array.isArray(payload);
  const ok = Boolean(payload?.ok);

  // JSON.stringify(undefined) returns undefined, so normalise to null and fall
  // back to String() for values JSON cannot represent (e.g. functions).
  const asJson = () => JSON.stringify(payload ?? null, null, 2) ?? String(payload);

  const preferred = ok && typeof payload.contentText === "string" ? payload.contentText : "";
  const text = preferred || asJson();

  return {
    content: [{ type: "text", text }],
    ...(isRecord ? { structuredContent: payload } : {}),
    isError: !ok,
  };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Execute the research tool for one `tools/call` invocation.
 *
 * The tool schema declares `query` as required, so a missing, blank, or
 * non-string query is rejected up front with an error result instead of
 * starting a research run on an empty question.
 *
 * @param {object} [params] - Tool arguments (`query`, `mode`, `force`,
 *   `isolate`, `options`).
 * @param {Function} [run] - Research runner; called as
 *   `run(query, undefined, undefined, undefined, config)`.
 * @returns {Promise<object>} MCP tool result (`content`, `structuredContent`, `isError`).
 */
async function runResearchTool(params = {}, run = runWebResearch) {
  const args = params ?? {};
  const query = typeof args.query === "string" ? args.query : "";

  if (!query.trim()) {
    const payload = {
      ok: false,
      action: "web_research",
      query,
      error: "A non-empty string `query` is required.",
    };
    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
      structuredContent: payload,
      isError: true,
    };
  }

  // Only spread real option objects; a string or array would otherwise leak
  // index keys into the research config.
  const options = args.options !== null && typeof args.options === "object" && !Array.isArray(args.options)
    ? args.options
    : {};

  const mode = args.mode ?? defaultMode(query);
  const payload = await run(query, undefined, undefined, undefined, {
    mode,
    force: args.force,
    isolate: args.isolate,
    ...options,
  });
  return buildToolResult(payload);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Build a JSON-RPC 2.0 error response object.
 *
 * @param {string|number|null} id - Request id the error answers.
 * @param {number} code - JSON-RPC error code.
 * @param {string} message - Short error description.
 * @param {*} [data] - Optional extra detail; the `data` member is omitted
 *   entirely when this is `undefined`.
 * @returns {{jsonrpc: "2.0", id: *, error: {code: number, message: string, data?: *}}}
 */
function jsonRpcError(id, code, message, data) {
  const hasData = data !== undefined;
  return {
    jsonrpc: "2.0",
    id,
    error: hasData ? { code, message, data } : { code, message },
  };
}
|
|
112
|
+
|
|
113
|
+
/**
 * Handle one decoded JSON-RPC message and produce the response to send back.
 *
 * Supported methods: `initialize`, `ping`, `tools/list`, `tools/call`.
 * Notifications (messages without an `id`) never produce a response, as
 * required by JSON-RPC 2.0; previously any notification other than
 * `notifications/initialized` was answered with a "Method not found" error.
 *
 * @param {object} message - Parsed JSON-RPC message.
 * @param {{runWebResearchFn?: Function}} [deps] - Optional override for the
 *   research runner.
 * @returns {Promise<object|null>} JSON-RPC response object, or `null` when no
 *   response must be sent.
 */
async function handleMcpRequest(message, deps = {}) {
  const run = deps.runWebResearchFn || runWebResearch;

  if (!message || typeof message !== "object") {
    return jsonRpcError(null, -32600, "Invalid Request");
  }

  if (typeof message.method !== "string") {
    return jsonRpcError(message.id ?? null, -32600, "Invalid Request");
  }

  if (message.method === "notifications/initialized") return null;

  // A request without an id is a notification: the server must stay silent,
  // even when the method is unknown (e.g. notifications/cancelled).
  if (message.id === undefined) return null;

  try {
    if (message.method === "initialize") {
      return {
        jsonrpc: "2.0",
        id: message.id,
        result: buildInitializeResult(message.params?.protocolVersion),
      };
    }

    // Liveness check: answered with an empty result object.
    if (message.method === "ping") {
      return { jsonrpc: "2.0", id: message.id, result: {} };
    }

    if (message.method === "tools/list") {
      return {
        jsonrpc: "2.0",
        id: message.id,
        result: { tools: [buildToolDefinition()] },
      };
    }

    if (message.method === "tools/call") {
      const params = message.params || {};
      if (params.name !== TOOL_NAME) {
        return jsonRpcError(message.id ?? null, -32602, `Unknown tool: ${String(params.name || "")}`);
      }
      const toolResult = await runResearchTool(params.arguments || {}, run);
      return {
        jsonrpc: "2.0",
        id: message.id,
        result: toolResult,
      };
    }

    return jsonRpcError(message.id ?? null, -32601, `Method not found: ${message.method}`);
  } catch (error) {
    // Failures are reported as an error tool result rather than a protocol error.
    const text = error instanceof Error ? error.stack || error.message : String(error);
    return {
      jsonrpc: "2.0",
      id: message.id ?? null,
      result: {
        content: [{ type: "text", text }],
        isError: true,
      },
    };
  }
}
|
|
169
|
+
|
|
170
|
+
/**
 * Start the MCP server loop on a pair of streams.
 *
 * Two wire framings are accepted and auto-detected per message:
 *  - newline-delimited JSON (one message per line), the framing used by the
 *    MCP stdio transport;
 *  - `Content-Length: N\r\n\r\n<body>` header framing, which was the only
 *    framing previously understood.
 * Each response is written using the framing of the message it answers.
 *
 * Malformed JSON is logged to `errorOutput` and answered with a JSON-RPC
 * Parse error (-32700, id null).
 *
 * @param {object} [io]
 * @param {{on: Function}} [io.input] - Source of `data`/`end` events (default `process.stdin`).
 * @param {{write: Function}} [io.output] - Sink for responses (default `process.stdout`).
 * @param {{write: Function}} [io.errorOutput] - Sink for diagnostics (default `process.stderr`).
 * @param {Function} [io.runWebResearchFn] - Research runner forwarded to `handleMcpRequest`.
 * @returns {void}
 */
export function startMcpServer({ input = process.stdin, output = process.stdout, errorOutput = process.stderr, runWebResearchFn = runWebResearch } = {}) {
  const FRAMING_LINE = "line";
  const FRAMING_HEADER = "header";

  function send(message, framing = FRAMING_HEADER) {
    const json = JSON.stringify(message);
    if (framing === FRAMING_LINE) {
      output.write(`${json}\n`);
      return;
    }
    output.write(`Content-Length: ${Buffer.byteLength(json, "utf8")}\r\n\r\n${json}`);
  }

  // Parse one message body and route it; replies reuse the incoming framing.
  function dispatch(bodyText, framing) {
    if (!bodyText.trim()) return;

    let message;
    try {
      message = JSON.parse(bodyText);
    } catch (error) {
      errorOutput.write(`${String(error)}\n`);
      send({ jsonrpc: "2.0", id: null, error: { code: -32700, message: "Parse error" } }, framing);
      return;
    }

    void handleMcpRequest(message, { runWebResearchFn }).then((response) => {
      if (response) send(response, framing);
    }).catch((error) => {
      const text = error instanceof Error ? error.stack || error.message : String(error);
      send({ jsonrpc: "2.0", id: message?.id ?? null, result: { content: [{ type: "text", text }], isError: true } }, framing);
    });
  }

  let buffer = Buffer.alloc(0);

  const isBlankByte = (byte) => byte === 0x0a || byte === 0x0d || byte === 0x20 || byte === 0x09;

  function pump() {
    while (true) {
      // Drop separators left between messages so the first byte identifies the framing.
      let start = 0;
      while (start < buffer.length && isBlankByte(buffer[start])) start += 1;
      if (start > 0) buffer = buffer.subarray(start);
      if (buffer.length === 0) return;

      // A message that begins with "{" or "[" is a bare JSON line.
      if (buffer[0] === 0x7b || buffer[0] === 0x5b) {
        const lineEnd = buffer.indexOf(0x0a);
        if (lineEnd === -1) return; // wait for the rest of the line
        const lineText = buffer.subarray(0, lineEnd).toString("utf8");
        buffer = buffer.subarray(lineEnd + 1);
        dispatch(lineText, FRAMING_LINE);
        continue;
      }

      const headerEnd = buffer.indexOf("\r\n\r\n");
      if (headerEnd === -1) return;

      const headerText = buffer.subarray(0, headerEnd).toString("utf8");
      const match = headerText.match(/content-length:\s*(\d+)/i);
      if (!match) {
        // Header block without a length: discard it and resynchronise.
        buffer = buffer.subarray(headerEnd + 4);
        continue;
      }

      const length = Number(match[1]);
      const bodyStart = headerEnd + 4;
      const bodyEnd = bodyStart + length;
      if (buffer.length < bodyEnd) return;

      const bodyText = buffer.subarray(bodyStart, bodyEnd).toString("utf8");
      buffer = buffer.subarray(bodyEnd);
      dispatch(bodyText, FRAMING_HEADER);
    }
  }

  input.on("data", (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    pump();
  });

  input.on("end", () => {
    process.exitCode = 0;
  });
}
|
|
225
|
+
|
|
226
|
+
/**
 * Report whether the module identified by `metaUrl` is the script Node was
 * started with (symlinks resolved on both sides).
 *
 * Any failure to resolve either path (a non-existent `process.argv[1]`, or a
 * `metaUrl` that is not a `file:` URL) is treated as "not the main module"
 * so that merely importing this file can never throw.
 *
 * @param {string} metaUrl - Usually `import.meta.url`.
 * @returns {boolean}
 */
function isMainModule(metaUrl) {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    return realpathSync(entry) === realpathSync(fileURLToPath(metaUrl));
  } catch {
    return false;
  }
}
|
|
230
|
+
|
|
231
|
+
if (isMainModule(import.meta.url)) {
|
|
232
|
+
startMcpServer();
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
export {
|
|
236
|
+
buildInitializeResult,
|
|
237
|
+
buildToolDefinition,
|
|
238
|
+
buildToolResult,
|
|
239
|
+
defaultMode,
|
|
240
|
+
handleMcpRequest,
|
|
241
|
+
runResearchTool,
|
|
242
|
+
};
|
package/mcp-server.js
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { realpathSync } from "node:fs";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
|
|
6
|
+
import { startMcpServer } from "./mcp/server.js";
|
|
7
|
+
|
|
8
|
+
export * from "./mcp/server.js";
|
|
9
|
+
|
|
10
|
+
/**
 * Report whether the module identified by `metaUrl` is the script Node was
 * started with (symlinks resolved on both sides).
 *
 * Resolution failures (missing entry path, non-`file:` URL) yield `false`
 * instead of throwing, so importing this compatibility shim is always safe.
 *
 * @param {string} metaUrl - Usually `import.meta.url`.
 * @returns {boolean}
 */
function isMainModule(metaUrl) {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    return realpathSync(entry) === realpathSync(fileURLToPath(metaUrl));
  } catch {
    return false;
  }
}
|
|
14
|
+
|
|
15
|
+
if (isMainModule(import.meta.url)) {
|
|
16
|
+
process.stderr.write("mcp-server.js is deprecated; use mcp/server.js instead.\n");
|
|
17
|
+
startMcpServer();
|
|
18
|
+
}
|
package/package.json
CHANGED
|
@@ -1,16 +1,23 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-research",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"type": "module",
|
|
6
6
|
"description": "Pi extension for web research.",
|
|
7
7
|
"license": "MIT",
|
|
8
8
|
"main": "./index.js",
|
|
9
|
+
"bin": {
|
|
10
|
+
"pi-research": "./mcp/server.js",
|
|
11
|
+
"unblind-mcp": "./mcp/server.js"
|
|
12
|
+
},
|
|
9
13
|
"files": [
|
|
10
14
|
"extensions",
|
|
11
15
|
"index.js",
|
|
12
16
|
"lib",
|
|
17
|
+
"mcp",
|
|
18
|
+
"mcp-server.js",
|
|
13
19
|
"README.md",
|
|
20
|
+
"THIRD_PARTY_NOTICES.md",
|
|
14
21
|
"package.json"
|
|
15
22
|
],
|
|
16
23
|
"repository": {
|
|
@@ -26,7 +33,8 @@
|
|
|
26
33
|
],
|
|
27
34
|
"scripts": {
|
|
28
35
|
"test": "node --test",
|
|
29
|
-
"eval": "node --test test/eval-runner.test.js"
|
|
36
|
+
"eval": "node --test test/eval-runner.test.js",
|
|
37
|
+
"mcp": "node ./mcp/server.js"
|
|
30
38
|
},
|
|
31
39
|
"dependencies": {
|
|
32
40
|
"@mariozechner/pi-ai": "*",
|