pi-research 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -72
- package/THIRD_PARTY_NOTICES.md +17 -0
- package/index.js +13 -9
- package/lib/page-fetch-adapter.js +180 -0
- package/lib/web-research.js +31 -3
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,22 +1,36 @@
|
|
|
1
1
|
# pi-research
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
|
|
3
5
|
[](https://www.npmjs.com/package/pi-research)
|
|
4
6
|
[](https://github.com/endgegnerbert-tech/pi-research)
|
|
5
7
|
[](https://pi.ai)
|
|
6
8
|
|
|
7
|
-
`pi-research` is a Pi extension for
|
|
9
|
+
`pi-research` is a Pi extension for grounded web research.
|
|
10
|
+
It searches, ranks, compares, and synthesizes sources inside the agent.
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
It does **not** require an external research API or API key, and it is not a browser automation tool.
|
|
12
|
+

|
|
11
13
|
|
|
12
14
|
## Why it exists
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
When agents answer well, they usually do three things:
|
|
17
|
+
|
|
18
|
+
1. search the right places
|
|
19
|
+
2. prefer authoritative sources
|
|
20
|
+
3. explain confidence and gaps clearly
|
|
21
|
+
|
|
22
|
+
`pi-research` does that without an external research service.
|
|
15
23
|
|
|
16
|
-
|
|
17
|
-
2. a way to turn sources into a usable answer
|
|
24
|
+
## Best practices
|
|
18
25
|
|
|
19
|
-
|
|
26
|
+
- use `fast` for short factual lookups
|
|
27
|
+
- use `deep` for comparisons, conflicts, or unclear questions
|
|
28
|
+
- use `code` for docs, repos, README-driven answers, and snippets
|
|
29
|
+
- use `academic` for paper-heavy topics
|
|
30
|
+
- set `options.requireAuthoritative: true` when source quality matters more than recall
|
|
31
|
+
- use `options.format: json` when you need machine-readable output
|
|
32
|
+
- add `options.files` when local docs matter
|
|
33
|
+
- keep questions specific; vague prompts create noisy retrieval
|
|
20
34
|
|
|
21
35
|
## What it does
|
|
22
36
|
|
|
@@ -26,7 +40,7 @@ Agents usually need two things to answer well:
|
|
|
26
40
|
- follows up when the first pass is not enough
|
|
27
41
|
- extracts code blocks for code-focused questions
|
|
28
42
|
- supports local files as additional sources
|
|
29
|
-
- returns
|
|
43
|
+
- returns structured results with citations, confidence, conflicts, and gaps
|
|
30
44
|
|
|
31
45
|
## What it is not
|
|
32
46
|
|
|
@@ -34,22 +48,6 @@ Agents usually need two things to answer well:
|
|
|
34
48
|
- not an offline knowledge base
|
|
35
49
|
- not a replacement for page navigation
|
|
36
50
|
|
|
37
|
-
## Install
|
|
38
|
-
|
|
39
|
-
### For Pi
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pi install npm:pi-research
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### For npm-based workflows
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
npm install pi-research
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
GitHub repository: https://github.com/endgegnerbert-tech/pi-research
|
|
52
|
-
|
|
53
51
|
## Quick start
|
|
54
52
|
|
|
55
53
|
```text
|
|
@@ -57,11 +55,11 @@ What are the trade-offs between B-trees and LSM-trees?
|
|
|
57
55
|
```
|
|
58
56
|
|
|
59
57
|
```text
|
|
60
|
-
|
|
58
|
+
Compare React Server Components with traditional SSR.
|
|
61
59
|
```
|
|
62
60
|
|
|
63
61
|
```text
|
|
64
|
-
|
|
62
|
+
How do I add retries to a Node.js fetch wrapper?
|
|
65
63
|
```
|
|
66
64
|
|
|
67
65
|
## Modes
|
|
@@ -73,6 +71,26 @@ Compare React Server Components with traditional SSR.
|
|
|
73
71
|
| `code` | docs, READMEs, repositories, and code snippets |
|
|
74
72
|
| `academic` | scholarly sources and paper-heavy topics |
|
|
75
73
|
|
|
74
|
+
## Output
|
|
75
|
+
|
|
76
|
+
The tool returns structured data including:
|
|
77
|
+
|
|
78
|
+
- `answer`
|
|
79
|
+
- `bullets`
|
|
80
|
+
- `sources`
|
|
81
|
+
- `citations`
|
|
82
|
+
- `codeBlocks`
|
|
83
|
+
- `confidence`
|
|
84
|
+
- `confidenceScore`
|
|
85
|
+
- `sufficient`
|
|
86
|
+
- `authoritativeSourcesFound`
|
|
87
|
+
- `openSubQuestions`
|
|
88
|
+
- `missingAspects`
|
|
89
|
+
- `conflictSummary`
|
|
90
|
+
- `unverifiedClaims`
|
|
91
|
+
- `sourceTypes`
|
|
92
|
+
- `meta`
|
|
93
|
+
|
|
76
94
|
## Public tool parameters
|
|
77
95
|
|
|
78
96
|
- `query` — research question to answer
|
|
@@ -133,44 +151,10 @@ options:
|
|
|
133
151
|
- ./docs/spec.md
|
|
134
152
|
```
|
|
135
153
|
|
|
136
|
-
## Output
|
|
137
|
-
|
|
138
|
-
The tool returns structured data including:
|
|
139
|
-
|
|
140
|
-
- `answer`
|
|
141
|
-
- `bullets`
|
|
142
|
-
- `sources`
|
|
143
|
-
- `citations`
|
|
144
|
-
- `codeBlocks`
|
|
145
|
-
- `confidence`
|
|
146
|
-
- `confidenceScore`
|
|
147
|
-
- `sufficient`
|
|
148
|
-
- `authoritativeSourcesFound`
|
|
149
|
-
- `openSubQuestions`
|
|
150
|
-
- `missingAspects`
|
|
151
|
-
- `conflictSummary`
|
|
152
|
-
- `unverifiedClaims`
|
|
153
|
-
- `sourceTypes`
|
|
154
|
-
- `meta`
|
|
155
|
-
|
|
156
|
-
## How it works
|
|
157
|
-
|
|
158
|
-
- **query-isolated caching**: repeated identical research can be skipped when the previous result was already sufficient
|
|
159
|
-
- **source scoring**: official docs, READMEs, papers, and local files are preferred over weak sources
|
|
160
|
-
- **follow-up planning**: unclear or conflicting results trigger another round of research
|
|
161
|
-
- **conflict detection**: opposing claims are surfaced explicitly
|
|
162
|
-
- **fact checking**: unsupported answer sentences are marked as unverified
|
|
163
|
-
- **local source input**: files can be added directly to the research context
|
|
164
|
-
|
|
165
|
-
## Limits
|
|
166
|
-
|
|
167
|
-
- it still depends on live web access for web research
|
|
168
|
-
- it does not browse pages like a human user
|
|
169
|
-
- it is not fully offline unless you only use local files
|
|
170
|
-
- it is not a browser interaction tool
|
|
171
|
-
|
|
172
154
|
## Domain packs
|
|
173
155
|
|
|
156
|
+
Built-in packs now steer routing and source selection:
|
|
157
|
+
|
|
174
158
|
- `web`
|
|
175
159
|
- `github`
|
|
176
160
|
- `security`
|
|
@@ -183,9 +167,14 @@ The tool returns structured data including:
|
|
|
183
167
|
|
|
184
168
|
## Community packs
|
|
185
169
|
|
|
186
|
-
You can add your own domain pack
|
|
170
|
+
You can add your own domain pack without changing the core research engine:
|
|
171
|
+
|
|
172
|
+
1. copy `lib/domains/template.js`
|
|
173
|
+
2. implement your domain-specific `run(question, options)` logic
|
|
174
|
+
3. register the pack in `lib/domains/index.js`
|
|
175
|
+
4. add eval cases in `eval/cases/<your-domain>/`
|
|
187
176
|
|
|
188
|
-
|
|
177
|
+
Starter example:
|
|
189
178
|
|
|
190
179
|
```js
|
|
191
180
|
export default {
|
|
@@ -209,16 +198,25 @@ export default {
|
|
|
209
198
|
|
|
210
199
|
Run `npm run eval` to execute the eval harness.
|
|
211
200
|
|
|
212
|
-
##
|
|
201
|
+
## Install
|
|
213
202
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
203
|
+
### For Pi
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
pi install npm:pi-research
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### For npm-based workflows
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
npm install pi-research
|
|
213
|
+
```
|
|
218
214
|
|
|
219
215
|
## Release notes
|
|
220
216
|
|
|
221
|
-
-
|
|
222
|
-
-
|
|
217
|
+
- Package name: `pi-research`
|
|
218
|
+
- Version: `1.1.2`
|
|
219
|
+
- Entry point: `extensions/pi-research.ts`
|
|
220
|
+
- License: MIT
|
|
221
|
+
- Third-party notices: `THIRD_PARTY_NOTICES.md`
|
|
223
222
|
- GitHub: `https://github.com/endgegnerbert-tech/pi-research`
|
|
224
|
-
- Community packs: copy the template pack and register it in `lib/domains/index.js`
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Third-Party Notices
|
|
2
|
+
|
|
3
|
+
## Scrapling
|
|
4
|
+
|
|
5
|
+
This project includes ideas and/or adapted implementation details from Scrapling.
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2024, Karim shoair
|
|
8
|
+
|
|
9
|
+
BSD 3-Clause License
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
|
14
|
+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
|
16
|
+
|
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
package/index.js
CHANGED
|
@@ -7,7 +7,7 @@ import { runWebResearch } from "./lib/web-research.js";
|
|
|
7
7
|
const RESEARCH_STATE = new Map();
|
|
8
8
|
|
|
9
9
|
function buildWebResearchGuidance() {
|
|
10
|
-
return "Use pi-research for
|
|
10
|
+
return "Use pi-research for current facts, docs, best practices, comparisons, and citations. Search if unsure.";
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
function defaultMode(query) {
|
|
@@ -104,15 +104,19 @@ export default function webResearchExtension(pi) {
|
|
|
104
104
|
|
|
105
105
|
pi.registerTool({
|
|
106
106
|
name: "pi-research",
|
|
107
|
-
label: "
|
|
108
|
-
description: "
|
|
109
|
-
promptSnippet: "Use
|
|
110
|
-
promptGuidelines: [
|
|
107
|
+
label: "Web Research",
|
|
108
|
+
description: "Live sources, ranking, and cited answers.",
|
|
109
|
+
promptSnippet: "Use for current or uncertain answers with citations.",
|
|
110
|
+
promptGuidelines: [
|
|
111
|
+
"Use for current facts, docs, best practices, comparisons, and verification.",
|
|
112
|
+
"Search instead of guessing.",
|
|
113
|
+
"Pick fast, deep, code, or academic mode as needed.",
|
|
114
|
+
],
|
|
111
115
|
parameters: Type.Object({
|
|
112
|
-
query: Type.String({ description: "
|
|
113
|
-
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "
|
|
114
|
-
force: Type.Optional(Type.Boolean({ description: "
|
|
115
|
-
isolate: Type.Optional(Type.Boolean({ description: "
|
|
116
|
+
query: Type.String({ description: "Live web question" }),
|
|
117
|
+
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "Mode", default: "fast" })),
|
|
118
|
+
force: Type.Optional(Type.Boolean({ description: "Ignore cache" })),
|
|
119
|
+
isolate: Type.Optional(Type.Boolean({ description: "No cache reuse" })),
|
|
116
120
|
options: Type.Optional(Type.Object({
|
|
117
121
|
allowedSources: Type.Optional(Type.Array(Type.String())),
|
|
118
122
|
maxTurns: Type.Optional(Type.Number()),
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { fileURLToPath } from "node:url";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
|
|
5
|
+
/**
 * Directory expected to hold a Scrapling checkout, resolved relative to this
 * module ("../Scrapling"). It is handed to the Python subprocess so that
 * `import scrapling` can resolve against it.
 * NOTE(review): confirm this directory is actually shipped with the package;
 * if it is absent, the import can only succeed via a Scrapling installation
 * already visible to the Python interpreter.
 */
const SCRAPLING_ROOT = fileURLToPath(new URL("../Scrapling", import.meta.url));

/**
 * Phrases that hint at an anti-bot / challenge page. Frozen because the table
 * is shared module state that nothing should mutate at runtime.
 */
const BLOCKED_PATTERNS = Object.freeze([
  /cloudflare/i,
  /turnstile/i,
  /captcha/i,
  /please enable cookies/i,
  /bot detection/i,
  /verify you are human/i,
  /security check/i,
]);

/**
 * Markup markers that hint at a client-rendered page (framework state blobs,
 * mount-point ids). Frozen for the same reason as BLOCKED_PATTERNS.
 */
const DYNAMIC_PATTERNS = Object.freeze([
  /__next_data__/i,
  /__nuxt__/i,
  /data-reactroot/i,
  /hydrat/i,
  /window\.__INITIAL_STATE__/i,
  /id=["']app["']/i,
  /id=["']root["']/i,
]);
|
|
24
|
+
|
|
25
|
+
/**
 * Reduce an HTML string to its visible text.
 *
 * Script, style and noscript elements are dropped together with their
 * contents, remaining tags are removed, non-breaking-space entities become
 * ordinary spaces, and runs of whitespace collapse to a single space.
 *
 * @param {unknown} value - HTML source; falsy values are treated as "".
 * @returns {string} Trimmed plain text.
 */
function stripHtml(value) {
  return String(value || "")
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    // The entity must be spelled out: a literal space here would be a no-op
    // and leave "&nbsp;" in the text, inflating the plain-text length.
    .replace(/&nbsp;|&#160;|&#xa0;/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Classify the result of a plain HTTP fetch to decide whether a heavier
 * fetch strategy is worth trying.
 *
 * @param {object} [attempt]
 * @param {number} [attempt.status=200] - HTTP status of the attempt.
 * @param {string} [attempt.body=""] - Raw response body.
 * @param {string} [attempt.contentType=""] - Content-Type header value.
 * @param {string} [attempt.url=""] - Final URL of the attempt.
 * @returns {{blocked: boolean, dynamic: boolean, weak: boolean, mode: string, plainLength: number}}
 *   `mode` is "stealthy" when blocked, "dynamic" when client-rendered markers
 *   are present, otherwise "async"; `plainLength` is the stripped-text length.
 */
export function assessPageAttempt({ status = 200, body = "", contentType = "", url = "" } = {}) {
  const text = String(body || "");
  const plain = stripHtml(text);
  const lowerBody = text.toLowerCase();
  // Challenge pages are often identified by where we were redirected to, so
  // anti-bot phrases are matched against the body and the URL together.
  const lowerBodyAndUrl = `${lowerBody}\n${String(url || "").toLowerCase()}`;

  const antiBotSignal = BLOCKED_PATTERNS.some((pattern) => pattern.test(lowerBodyAndUrl));
  // An anti-bot phrase only counts when there is little real content around it.
  const blocked = status === 403 || status === 429 || (antiBotSignal && plain.length < 1000);

  // Framework markers live in the markup, so they are matched against the body
  // only; a URL such as ".../hydration-guide" must not flag a static page.
  const hasFrameworkMarker = DYNAMIC_PATTERNS.some((pattern) => pattern.test(lowerBody));
  // Use the lowercased body so "<SCRIPT" is detected like every other check.
  const scriptOnlyShell = lowerBody.includes("<script") && plain.length < 400;
  const dynamic = !blocked && (hasFrameworkMarker || scriptOnlyShell);

  const textualType = /text\/(html|plain)/i.test(contentType);
  const weak = blocked || plain.length < 300 || (!textualType && plain.length < 500);

  let mode = "async";
  if (blocked) mode = "stealthy";
  else if (dynamic) mode = "dynamic";

  return {
    blocked,
    dynamic,
    weak,
    mode,
    plainLength: plain.length,
  };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Convenience wrapper: run the page assessment and return only the fetch
 * mode it recommends.
 *
 * @param {object} [input] - Same argument object as assessPageAttempt.
 * @returns {string} The `mode` field of the assessment.
 */
export function chooseScraplingMode(input) {
  const { mode } = assessPageAttempt(input);
  return mode;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Source of the Python helper that performs one Scrapling fetch.
 *
 * The script reads four positional arguments (import root, mode, url, JSON
 * payload), fetches the URL with the fetcher matching `mode`, and prints a
 * single JSON object to stdout: `{ok: true, url, status, contentType, body,
 * headers}` on success, or `{ok: false, error, type}` with exit code 1 on
 * failure.
 *
 * @returns {string} Python source suitable for `python -c`.
 */
function pythonScript() {
  return String.raw`
import asyncio
import json
import sys

root = sys.argv[1]
mode = sys.argv[2]
url = sys.argv[3]
payload = json.loads(sys.argv[4])

sys.path.insert(0, root)

async def main():
    from scrapling.fetchers import AsyncFetcher, DynamicFetcher, StealthyFetcher

    # The payload carries the timeout in milliseconds.
    timeout_ms = payload.get("timeout")

    if mode == "async":
        # The HTTP fetcher takes its timeout in seconds, not milliseconds.
        kwargs = {"timeout": timeout_ms / 1000} if timeout_ms else {}
        response = await AsyncFetcher.get(url, **kwargs)
    else:
        # Browser fetchers take milliseconds. Their blocking fetch() cannot be
        # used while an asyncio loop is running, so use the coroutine variant.
        kwargs = {"timeout": timeout_ms} if timeout_ms else {}
        fetcher = DynamicFetcher if mode == "dynamic" else StealthyFetcher
        response = await fetcher.async_fetch(url, **kwargs)

    headers = {}
    raw_headers = getattr(response, "headers", None)
    if hasattr(raw_headers, "items"):
        headers = dict(raw_headers.items())
    else:
        try:
            headers = dict(raw_headers or {})
        except Exception:
            headers = {}

    # Header names are case-insensitive; do not rely on a lowercase key.
    content_type = ""
    for key, value in headers.items():
        if str(key).lower() == "content-type":
            content_type = str(value)
            break

    body = getattr(response, "body", None)
    if body is None:
        candidate = getattr(response, "text", None)
        body = candidate() if callable(candidate) else candidate

    if isinstance(body, bytes):
        body = body.decode("utf-8", "replace")
    elif not isinstance(body, str):
        body = str(body or "")

    out = {
        "ok": True,
        "url": getattr(response, "url", url),
        "status": getattr(response, "status", 200),
        "contentType": content_type,
        "body": body,
        "headers": headers,
    }
    print(json.dumps(out))

try:
    asyncio.run(main())
except Exception as exc:
    print(json.dumps({"ok": False, "error": str(exc), "type": exc.__class__.__name__}))
    sys.exit(1)
`;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Fetch a page through the Python Scrapling helper in a child process.
 *
 * Every failure mode resolves to `null` rather than rejecting: missing mode,
 * an aborted signal, a URL that is not http(s), a spawn error, a non-zero
 * exit code, unparseable output, or a payload whose `ok` flag is not truthy.
 *
 * @param {string} url - Page to fetch.
 * @param {string} mode - Fetch mode forwarded to the helper script.
 * @param {AbortSignal} [signal] - Aborting kills the child and yields `null`.
 * @param {{pageTimeoutMs?: number}} [config] - `pageTimeoutMs` defaults to 30000.
 * @returns {Promise<object|null>} Parsed helper output, or `null`.
 */
export async function fetchWithScrapling(url, mode, signal, config = {}) {
  if (!mode) return null;
  // Do not pay for a Python process that would be killed immediately.
  if (signal?.aborted) return null;

  // The URL may come from search results and some modes drive a real browser,
  // so refuse anything that is not plain http(s) (e.g. file: URLs).
  let protocol;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return null;
  }
  if (protocol !== "http:" && protocol !== "https:") return null;

  return new Promise((resolve) => {
    const payload = JSON.stringify({ timeout: config.pageTimeoutMs || 30000 });
    const child = spawn(process.env.PYTHON || "python3", ["-c", pythonScript(), SCRAPLING_ROOT, mode, url, payload], {
      env: {
        ...process.env,
        PYTHONPATH: [SCRAPLING_ROOT, process.env.PYTHONPATH].filter(Boolean).join(path.delimiter),
      },
      // stderr is never inspected; ignoring it avoids an undrained pipe.
      stdio: ["ignore", "pipe", "ignore"],
    });

    let stdout = "";
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });

    let settled = false;
    const onAbort = () => {
      child.kill("SIGKILL");
      finish(null);
    };
    const finish = (value) => {
      if (settled) return;
      settled = true;
      // Long-lived signals must not accumulate one listener per fetch.
      signal?.removeEventListener("abort", onAbort);
      resolve(signal?.aborted ? null : value);
    };

    child.on("error", () => finish(null));
    child.on("close", (code) => {
      if (code !== 0) return finish(null);
      try {
        const parsed = JSON.parse(stdout.trim() || "{}");
        return finish(parsed?.ok ? parsed : null);
      } catch {
        return finish(null);
      }
    });

    signal?.addEventListener("abort", onAbort, { once: true });
  });
}
|
|
175
|
+
|
|
176
|
+
/**
 * Default adapter bundling this module's three entry points under the
 * property names `assessPageAttempt`, `chooseScraplingMode` and
 * `fetchWithScrapling`.
 */
export const pageFetchAdapter = {
  assessPageAttempt: assessPageAttempt,
  chooseScraplingMode: chooseScraplingMode,
  fetchWithScrapling: fetchWithScrapling,
};
|
package/lib/web-research.js
CHANGED
|
@@ -35,6 +35,7 @@ import {
|
|
|
35
35
|
scoreSourceEntry,
|
|
36
36
|
selectRelevantChunks,
|
|
37
37
|
} from "./research.js";
|
|
38
|
+
import { pageFetchAdapter } from "./page-fetch-adapter.js";
|
|
38
39
|
import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
|
|
39
40
|
import { planResearch } from "./planner.js";
|
|
40
41
|
import {
|
|
@@ -346,7 +347,11 @@ function shouldSkipUrl(url) {
|
|
|
346
347
|
}
|
|
347
348
|
|
|
348
349
|
function shouldUseJinaFirst(url) {
|
|
349
|
-
|
|
350
|
+
try {
|
|
351
|
+
return /(^|\.)medium\.com$|(^|\.)dev\.to$|(^|\.)substack\.com$/i.test(new URL(url).hostname);
|
|
352
|
+
} catch {
|
|
353
|
+
return false;
|
|
354
|
+
}
|
|
350
355
|
}
|
|
351
356
|
|
|
352
357
|
function pageFromText(title, url, text, config, extra = {}) {
|
|
@@ -378,6 +383,7 @@ function withinTimeframe(page, config) {
|
|
|
378
383
|
|
|
379
384
|
export async function fetchPageSource(url, signal, config = getResearchConfig()) {
|
|
380
385
|
if (shouldSkipUrl(url)) return null;
|
|
386
|
+
const adapter = config.fetchAdapter || pageFetchAdapter;
|
|
381
387
|
const cacheKey = `${normalizeUrl(url)}::${config.pageTextLimit}::${JSON.stringify({
|
|
382
388
|
preferRecent: config.preferRecent || false,
|
|
383
389
|
minYear: config.minYear || "",
|
|
@@ -386,9 +392,12 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
386
392
|
})}`;
|
|
387
393
|
const cached = config.isolate ? null : getCacheValue(pageCache, cacheKey);
|
|
388
394
|
if (cached) return cached;
|
|
395
|
+
|
|
389
396
|
if (shouldUseJinaFirst(url)) {
|
|
390
397
|
const first = await fetchJinaPageSource(url, signal, config);
|
|
391
|
-
|
|
398
|
+
if (first && withinTimeframe(first, config)) {
|
|
399
|
+
return config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
|
|
400
|
+
}
|
|
392
401
|
}
|
|
393
402
|
|
|
394
403
|
try {
|
|
@@ -402,12 +411,31 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
402
411
|
|
|
403
412
|
const body = await response.text();
|
|
404
413
|
const snapshot = extractPageSnapshot(body, response.url || url);
|
|
405
|
-
|
|
414
|
+
let page = pageFromText(snapshot.title, snapshot.url, snapshot.text, config, {
|
|
406
415
|
publishDate: extractPublishDate(body),
|
|
407
416
|
sourceType: classifySourceType(snapshot.url, snapshot.title),
|
|
408
417
|
codeBlocks: snapshot.codeBlocks,
|
|
409
418
|
});
|
|
410
419
|
|
|
420
|
+
const assessment = adapter.assessPageAttempt?.({
|
|
421
|
+
status: response.status ?? 200,
|
|
422
|
+
body,
|
|
423
|
+
contentType,
|
|
424
|
+
url: response.url || url,
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
if ((!page && assessment?.weak) || assessment?.dynamic || assessment?.blocked) {
|
|
428
|
+
const scrapling = await adapter.fetchWithScrapling?.(url, assessment.mode, signal, config);
|
|
429
|
+
if (scrapling?.body) {
|
|
430
|
+
const scraplingSnapshot = extractPageSnapshot(scrapling.body, scrapling.url || url);
|
|
431
|
+
page = pageFromText(scraplingSnapshot.title, scraplingSnapshot.url, scraplingSnapshot.text, config, {
|
|
432
|
+
publishDate: extractPublishDate(scrapling.body),
|
|
433
|
+
sourceType: classifySourceType(scraplingSnapshot.url, scraplingSnapshot.title),
|
|
434
|
+
codeBlocks: scraplingSnapshot.codeBlocks,
|
|
435
|
+
});
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
411
439
|
const resolved = page || await fetchJinaPageSource(url, signal, config);
|
|
412
440
|
const finalPage = resolved && withinTimeframe(resolved, config) ? resolved : null;
|
|
413
441
|
return config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-research",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"private": false,
|
|
5
5
|
"type": "module",
|
|
6
6
|
"description": "Pi extension for web research.",
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
"index.js",
|
|
12
12
|
"lib",
|
|
13
13
|
"README.md",
|
|
14
|
+
"THIRD_PARTY_NOTICES.md",
|
|
14
15
|
"package.json"
|
|
15
16
|
],
|
|
16
17
|
"repository": {
|