pi-research 1.0.2 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -63
- package/THIRD_PARTY_NOTICES.md +17 -0
- package/index.js +13 -9
- package/lib/domains/changelog.js +10 -0
- package/lib/domains/forums.js +9 -0
- package/lib/domains/github.js +9 -0
- package/lib/domains/index.js +46 -0
- package/lib/domains/package-registry.js +11 -0
- package/lib/domains/papers.js +11 -0
- package/lib/domains/security.js +11 -0
- package/lib/domains/specs.js +11 -0
- package/lib/domains/template.js +26 -0
- package/lib/domains/vendor-status.js +10 -0
- package/lib/domains/web.js +7 -0
- package/lib/eval/case-loader.js +13 -0
- package/lib/eval/runner.js +8 -0
- package/lib/page-fetch-adapter.js +180 -0
- package/lib/research-evidence.js +21 -0
- package/lib/research-intent.js +20 -0
- package/lib/research-output.js +7 -0
- package/lib/research.js +44 -5
- package/lib/types.js +2 -0
- package/lib/web-research.js +57 -15
- package/package.json +7 -4
package/README.md
CHANGED
|
@@ -1,22 +1,36 @@
|
|
|
1
1
|
# pi-research
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
|
|
3
5
|
[](https://www.npmjs.com/package/pi-research)
|
|
4
|
-
[](https://github.com/endgegnerbert-tech/pi-research)
|
|
5
7
|
[](https://pi.ai)
|
|
6
8
|
|
|
7
|
-
`pi-research` is a Pi extension for
|
|
9
|
+
`pi-research` is a Pi extension for grounded web research.
|
|
10
|
+
It searches, ranks, compares, and synthesizes sources inside the agent.
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
It does **not** require an external research API or API key, and it is not a browser automation tool.
|
|
12
|
+

|
|
11
13
|
|
|
12
14
|
## Why it exists
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
When agents answer well, they usually do three things:
|
|
17
|
+
|
|
18
|
+
1. search the right places
|
|
19
|
+
2. prefer authoritative sources
|
|
20
|
+
3. explain confidence and gaps clearly
|
|
21
|
+
|
|
22
|
+
`pi-research` does that without an external research service.
|
|
15
23
|
|
|
16
|
-
|
|
17
|
-
2. a way to turn sources into a usable answer
|
|
24
|
+
## Best practices
|
|
18
25
|
|
|
19
|
-
|
|
26
|
+
- use `fast` for short factual lookups
|
|
27
|
+
- use `deep` for comparisons, conflicts, or unclear questions
|
|
28
|
+
- use `code` for docs, repos, README-driven answers, and snippets
|
|
29
|
+
- use `academic` for paper-heavy topics
|
|
30
|
+
- set `options.requireAuthoritative: true` when source quality matters more than recall
|
|
31
|
+
- use `options.format: json` when you need machine-readable output
|
|
32
|
+
- add `options.files` when local docs matter
|
|
33
|
+
- keep questions specific; vague prompts create noisy retrieval
|
|
20
34
|
|
|
21
35
|
## What it does
|
|
22
36
|
|
|
@@ -26,7 +40,7 @@ Agents usually need two things to answer well:
|
|
|
26
40
|
- follows up when the first pass is not enough
|
|
27
41
|
- extracts code blocks for code-focused questions
|
|
28
42
|
- supports local files as additional sources
|
|
29
|
-
- returns
|
|
43
|
+
- returns structured results with citations, confidence, conflicts, and gaps
|
|
30
44
|
|
|
31
45
|
## What it is not
|
|
32
46
|
|
|
@@ -34,22 +48,6 @@ Agents usually need two things to answer well:
|
|
|
34
48
|
- not an offline knowledge base
|
|
35
49
|
- not a replacement for page navigation
|
|
36
50
|
|
|
37
|
-
## Install
|
|
38
|
-
|
|
39
|
-
### For Pi
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pi install npm:pi-research
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### For npm-based workflows
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
npm install pi-research
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
GitHub repository: https://github.com/endgegnerbert-tech/pi-research
|
|
52
|
-
|
|
53
51
|
## Quick start
|
|
54
52
|
|
|
55
53
|
```text
|
|
@@ -57,11 +55,11 @@ What are the trade-offs between B-trees and LSM-trees?
|
|
|
57
55
|
```
|
|
58
56
|
|
|
59
57
|
```text
|
|
60
|
-
|
|
58
|
+
Compare React Server Components with traditional SSR.
|
|
61
59
|
```
|
|
62
60
|
|
|
63
61
|
```text
|
|
64
|
-
|
|
62
|
+
How do I add retries to a Node.js fetch wrapper?
|
|
65
63
|
```
|
|
66
64
|
|
|
67
65
|
## Modes
|
|
@@ -73,6 +71,26 @@ Compare React Server Components with traditional SSR.
|
|
|
73
71
|
| `code` | docs, READMEs, repositories, and code snippets |
|
|
74
72
|
| `academic` | scholarly sources and paper-heavy topics |
|
|
75
73
|
|
|
74
|
+
## Output
|
|
75
|
+
|
|
76
|
+
The tool returns structured data including:
|
|
77
|
+
|
|
78
|
+
- `answer`
|
|
79
|
+
- `bullets`
|
|
80
|
+
- `sources`
|
|
81
|
+
- `citations`
|
|
82
|
+
- `codeBlocks`
|
|
83
|
+
- `confidence`
|
|
84
|
+
- `confidenceScore`
|
|
85
|
+
- `sufficient`
|
|
86
|
+
- `authoritativeSourcesFound`
|
|
87
|
+
- `openSubQuestions`
|
|
88
|
+
- `missingAspects`
|
|
89
|
+
- `conflictSummary`
|
|
90
|
+
- `unverifiedClaims`
|
|
91
|
+
- `sourceTypes`
|
|
92
|
+
- `meta`
|
|
93
|
+
|
|
76
94
|
## Public tool parameters
|
|
77
95
|
|
|
78
96
|
- `query` — research question to answer
|
|
@@ -133,51 +151,72 @@ options:
|
|
|
133
151
|
- ./docs/spec.md
|
|
134
152
|
```
|
|
135
153
|
|
|
136
|
-
##
|
|
154
|
+
## Domain packs
|
|
155
|
+
|
|
156
|
+
Built-in packs now steer routing and source selection:
|
|
157
|
+
|
|
158
|
+
- `web`
|
|
159
|
+
- `github`
|
|
160
|
+
- `security`
|
|
161
|
+
- `papers`
|
|
162
|
+
- `specs`
|
|
163
|
+
- `changelog`
|
|
164
|
+
- `forums`
|
|
165
|
+
- `package-registry`
|
|
166
|
+
- `vendor-status`
|
|
167
|
+
|
|
168
|
+
## Community packs
|
|
169
|
+
|
|
170
|
+
You can add your own domain pack without changing the core research engine:
|
|
171
|
+
|
|
172
|
+
1. copy `lib/domains/template.js`
|
|
173
|
+
2. implement your domain-specific `run(question, options)` logic
|
|
174
|
+
3. register the pack in `lib/domains/index.js`
|
|
175
|
+
4. add eval cases in `eval/cases/<your-domain>/`
|
|
176
|
+
|
|
177
|
+
Starter example:
|
|
178
|
+
|
|
179
|
+
```js
|
|
180
|
+
export default {
|
|
181
|
+
name: "boxing-training",
|
|
182
|
+
sourceHints: ["web"],
|
|
183
|
+
async run(question) {
|
|
184
|
+
return {
|
|
185
|
+
claims: [
|
|
186
|
+
{
|
|
187
|
+
text: `Starter pack example for ${question}`,
|
|
188
|
+
evidence: [{ type: "web", source: "https://example.com", snippet: "Example" }],
|
|
189
|
+
confidence: "medium",
|
|
190
|
+
},
|
|
191
|
+
],
|
|
192
|
+
};
|
|
193
|
+
},
|
|
194
|
+
};
|
|
195
|
+
```
|
|
137
196
|
|
|
138
|
-
|
|
197
|
+
## Eval
|
|
139
198
|
|
|
140
|
-
|
|
141
|
-
- `bullets`
|
|
142
|
-
- `sources`
|
|
143
|
-
- `citations`
|
|
144
|
-
- `codeBlocks`
|
|
145
|
-
- `confidence`
|
|
146
|
-
- `confidenceScore`
|
|
147
|
-
- `sufficient`
|
|
148
|
-
- `authoritativeSourcesFound`
|
|
149
|
-
- `openSubQuestions`
|
|
150
|
-
- `missingAspects`
|
|
151
|
-
- `conflictSummary`
|
|
152
|
-
- `unverifiedClaims`
|
|
153
|
-
- `sourceTypes`
|
|
154
|
-
- `meta`
|
|
199
|
+
Run `npm run eval` to execute the eval harness.
|
|
155
200
|
|
|
156
|
-
##
|
|
201
|
+
## Install
|
|
157
202
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
- **local source input**: files can be added directly to the research context
|
|
203
|
+
### For Pi
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
pi install npm:pi-research
|
|
207
|
+
```
|
|
164
208
|
|
|
165
|
-
|
|
209
|
+
### For npm-based workflows
|
|
166
210
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
- it is not a browser interaction tool
|
|
211
|
+
```bash
|
|
212
|
+
npm install pi-research
|
|
213
|
+
```
|
|
171
214
|
|
|
172
|
-
##
|
|
215
|
+
## Release notes
|
|
173
216
|
|
|
174
217
|
- Package name: `pi-research`
|
|
218
|
+
- Version: `1.1.2`
|
|
175
219
|
- Entry point: `extensions/pi-research.ts`
|
|
176
|
-
- Tool name: `pi-research`
|
|
177
220
|
- License: MIT
|
|
178
|
-
|
|
179
|
-
## Release notes
|
|
180
|
-
|
|
181
|
-
- Pi install: `pi install npm:pi-research`
|
|
182
|
-
- npm install: `npm install pi-research`
|
|
221
|
+
- Third-party notices: `THIRD_PARTY_NOTICES.md`
|
|
183
222
|
- GitHub: `https://github.com/endgegnerbert-tech/pi-research`
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Third-Party Notices
|
|
2
|
+
|
|
3
|
+
## Scrapling
|
|
4
|
+
|
|
5
|
+
This project includes ideas and/or adapted implementation details from Scrapling.
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2024, Karim shoair
|
|
8
|
+
|
|
9
|
+
BSD 3-Clause License
|
|
10
|
+
|
|
11
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
|
14
|
+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
|
16
|
+
|
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
package/index.js
CHANGED
|
@@ -7,7 +7,7 @@ import { runWebResearch } from "./lib/web-research.js";
|
|
|
7
7
|
const RESEARCH_STATE = new Map();
|
|
8
8
|
|
|
9
9
|
function buildWebResearchGuidance() {
|
|
10
|
-
return "Use pi-research for
|
|
10
|
+
return "Use pi-research for current facts, docs, best practices, comparisons, and citations. Search if unsure.";
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
function defaultMode(query) {
|
|
@@ -104,15 +104,19 @@ export default function webResearchExtension(pi) {
|
|
|
104
104
|
|
|
105
105
|
pi.registerTool({
|
|
106
106
|
name: "pi-research",
|
|
107
|
-
label: "
|
|
108
|
-
description: "
|
|
109
|
-
promptSnippet: "Use
|
|
110
|
-
promptGuidelines: [
|
|
107
|
+
label: "Web Research",
|
|
108
|
+
description: "Live sources, ranking, and cited answers.",
|
|
109
|
+
promptSnippet: "Use for current or uncertain answers with citations.",
|
|
110
|
+
promptGuidelines: [
|
|
111
|
+
"Use for current facts, docs, best practices, comparisons, and verification.",
|
|
112
|
+
"Search instead of guessing.",
|
|
113
|
+
"Pick fast, deep, code, or academic mode as needed.",
|
|
114
|
+
],
|
|
111
115
|
parameters: Type.Object({
|
|
112
|
-
query: Type.String({ description: "
|
|
113
|
-
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "
|
|
114
|
-
force: Type.Optional(Type.Boolean({ description: "
|
|
115
|
-
isolate: Type.Optional(Type.Boolean({ description: "
|
|
116
|
+
query: Type.String({ description: "Live web question" }),
|
|
117
|
+
mode: Type.Optional(Type.Union([Type.Literal("fast"), Type.Literal("deep"), Type.Literal("code"), Type.Literal("academic")], { description: "Mode", default: "fast" })),
|
|
118
|
+
force: Type.Optional(Type.Boolean({ description: "Ignore cache" })),
|
|
119
|
+
isolate: Type.Optional(Type.Boolean({ description: "No cache reuse" })),
|
|
116
120
|
options: Type.Optional(Type.Object({
|
|
117
121
|
allowedSources: Type.Optional(Type.Array(Type.String())),
|
|
118
122
|
maxTurns: Type.Optional(Type.Number()),
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Domain pack for changelog and release-note questions.
 *
 * Carries search hints and source preferences only; `run` is a placeholder
 * that resolves to an object holding the pack name.
 */
const changelogPack = {
  name: "changelog",
  sourceHints: ["changelog", "release notes", "releases"],
  allowedSources: ["github.com", "docs.", "release notes"],
  queryHints: ["release notes", "changelog", "site:github.com/releases"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "changelog" };
    return result;
  },
};

export default changelogPack;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Domain pack for community Q&A and forum sources.
 *
 * Does not set `requireAuthoritative`; `run` is a placeholder that resolves
 * to an object holding the pack name.
 */
const forumsPack = {
  name: "forums",
  sourceHints: ["stackoverflow", "discourse", "reddit"],
  allowedSources: ["stackoverflow.com", "discourse", "reddit.com"],
  queryHints: ["site:stackoverflow.com", "discourse", "site:reddit.com"],
  async run() {
    const result = { name: "forums" };
    return result;
  },
};

export default forumsPack;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import web from "./web.js";
|
|
2
|
+
import github from "./github.js";
|
|
3
|
+
import forums from "./forums.js";
|
|
4
|
+
import security from "./security.js";
|
|
5
|
+
import packageRegistry from "./package-registry.js";
|
|
6
|
+
import changelog from "./changelog.js";
|
|
7
|
+
import papers from "./papers.js";
|
|
8
|
+
import specs from "./specs.js";
|
|
9
|
+
import vendorStatus from "./vendor-status.js";
|
|
10
|
+
|
|
11
|
+
// Registry of the built-in domain packs, keyed by their public pack name.
// Hyphenated names need quoted keys; the rest are spelled out for symmetry.
const PACKS = {
  "web": web,
  "github": github,
  "forums": forums,
  "security": security,
  "package-registry": packageRegistry,
  "changelog": changelog,
  "papers": papers,
  "specs": specs,
  "vendor-status": vendorStatus,
};
|
|
22
|
+
|
|
23
|
+
// Public listing order of the built-in packs (kept separately from the registry object).
const DOMAIN_NAMES = [
  "web",
  "github",
  "security",
  "papers",
  "specs",
  "changelog",
  "forums",
  "package-registry",
  "vendor-status",
];

/**
 * Lists the names of the built-in domain packs.
 *
 * @returns {string[]} A fresh array on every call, so callers may mutate it freely.
 */
export function listDomainPacks() {
  return DOMAIN_NAMES.slice();
}
|
|
28
|
+
|
|
29
|
+
/**
 * Looks up a built-in domain pack by name.
 *
 * Only own keys of the registry count as packs, so names such as
 * "constructor" or "toString" (inherited from Object.prototype) fall back to
 * the web pack instead of returning a prototype member.
 *
 * @param {string} [name="web"] - Pack name to resolve.
 * @returns {object} The matching pack, or the web pack when the name is unknown.
 */
export function getDomainPack(name = "web") {
  const isRegistered = Object.prototype.hasOwnProperty.call(PACKS, name);
  return (isRegistered && PACKS[name]) || web;
}
|
|
32
|
+
|
|
33
|
+
import { classifyQuestionDomain } from "../research-intent.js";
|
|
34
|
+
|
|
35
|
+
/**
 * Resolves the routing configuration for a pack name or a free-form question.
 *
 * When the argument is the exact name of a registered pack it is used
 * directly; otherwise it is treated as a question and classified. Only own
 * keys of the registry count as pack names, so inputs like "constructor" are
 * classified as questions instead of matching Object.prototype members.
 *
 * @param {string} [questionOrDomain="web"] - Pack name or research question.
 * @returns {{domain: string, allowedSources: string[], allowedSourceTypes: string[], queryHints: string[], requireAuthoritative: boolean, format: string}}
 *   Pack settings with empty/default values for anything the pack omits.
 */
export function resolveDomainConfig(questionOrDomain = "web") {
  const isRegistered = (key) =>
    typeof key === "string" && Object.prototype.hasOwnProperty.call(PACKS, key);

  const candidate = isRegistered(questionOrDomain)
    ? questionOrDomain
    : classifyQuestionDomain(questionOrDomain);
  // Report the pack that is actually used: an unrecognised classification
  // falls back to "web" for both the name and the settings.
  const name = isRegistered(candidate) ? candidate : "web";
  const pack = PACKS[name];

  return {
    domain: name,
    allowedSources: pack.allowedSources || [],
    allowedSourceTypes: pack.allowedSourceTypes || [],
    queryHints: pack.queryHints || [],
    requireAuthoritative: Boolean(pack.requireAuthoritative),
    format: pack.format || "markdown",
  };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Domain pack for package-registry questions (npm, PyPI, crates.io, Maven).
 *
 * `run` is a placeholder that resolves to an object holding the pack name.
 */
const packageRegistryPack = {
  name: "package-registry",
  sourceHints: ["npm", "pypi", "cargo", "maven"],
  allowedSources: ["npmjs.com", "pypi.org", "crates.io", "mvnrepository.com"],
  allowedSourceTypes: ["official_doc", "github_readme"],
  queryHints: ["site:npmjs.com", "site:pypi.org", "site:crates.io", "site:mvnrepository.com"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "package-registry" };
    return result;
  },
};

export default packageRegistryPack;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Domain pack for scholarly sources (arXiv, Semantic Scholar, DOI, PubMed).
 *
 * `run` is a placeholder that resolves to an object holding the pack name.
 */
const papersPack = {
  name: "papers",
  sourceHints: ["arxiv", "semanticscholar", "doi"],
  allowedSources: ["arxiv.org", "semanticscholar.org", "doi.org", "pubmed.ncbi.nlm.nih.gov"],
  allowedSourceTypes: ["paper"],
  queryHints: ["site:arxiv.org", "site:semanticscholar.org", "site:doi.org"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "papers" };
    return result;
  },
};

export default papersPack;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Domain pack for security advisories and CVE lookups.
 *
 * `run` is a placeholder that resolves to an object holding the pack name.
 */
const securityPack = {
  name: "security",
  sourceHints: ["cve", "advisory", "security bulletin"],
  allowedSources: ["nvd.nist.gov", "cisa.gov", "mitre.org", "ubuntu.com", "redhat.com", "debian.org", "suse.com"],
  allowedSourceTypes: ["official_doc", "paper"],
  queryHints: ["nvd", "cisa", "mitre", "advisory", "cve"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "security" };
    return result;
  },
};

export default securityPack;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Domain pack for standards and specification documents (RFCs, W3C).
 *
 * `run` is a placeholder that resolves to an object holding the pack name.
 */
const specsPack = {
  name: "specs",
  sourceHints: ["rfc", "spec", "standard"],
  allowedSources: ["rfc-editor.org", "datatracker.ietf.org", "w3.org"],
  allowedSourceTypes: ["official_doc"],
  queryHints: ["site:rfc-editor.org", "site:datatracker.ietf.org", "RFC"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "specs" };
    return result;
  },
};

export default specsPack;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Starter domain pack meant to be copied when writing a new pack.
 *
 * Unlike the built-in packs, `run` returns a complete example payload: one
 * claim with one piece of evidence, an evidence summary and source types.
 */
const templatePack = {
  name: "template",
  description: "Minimal domain pack example for pi-research",
  sourceHints: ["web"],
  queryHints: ["site:example.com"],
  /**
   * @param {string} question - Interpolated into the example claim text.
   * @param {object} [options] - Accepted for signature parity; not read here.
   * @returns {Promise<object>} Example result with `claims`, `evidenceSummary`, `sourceTypes`.
   */
  async run(question, options) {
    const exampleEvidence = {
      type: "web",
      source: "https://example.com",
      snippet: "Minimal example",
    };
    const exampleClaim = {
      text: `This is a minimal example for a domain pack: ${question}`,
      evidence: [exampleEvidence],
      confidence: "medium",
      confidenceDescription: "Just an example",
    };
    return {
      claims: [exampleClaim],
      evidenceSummary: "Starter example only.",
      sourceTypes: ["other"],
    };
  },
};

export default templatePack;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Domain pack for vendor status pages, incidents and outages.
 *
 * `run` is a placeholder that resolves to an object holding the pack name.
 */
const vendorStatusPack = {
  name: "vendor-status",
  sourceHints: ["status", "incident", "outage"],
  allowedSources: ["status", "statuspage.io", "status.github.com"],
  queryHints: ["status page", "incident", "outage"],
  requireAuthoritative: true,
  async run() {
    const result = { name: "vendor-status" };
    return result;
  },
};

export default vendorStatusPack;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
/**
 * Loads every `*.json` eval case for a domain from `<cwd>/eval/cases/<domain>`.
 *
 * Loading is best-effort: a missing directory, an unreadable file or invalid
 * JSON all result in an empty list rather than an exception.
 *
 * @param {string} domain - Sub-directory name under `eval/cases`.
 * @returns {object[]} Parsed case objects, or `[]` when nothing could be loaded.
 */
export function loadEvalCases(domain) {
  // path.join throws a TypeError for non-string segments; keep the
  // "no cases" contract for a missing or malformed domain as well.
  if (typeof domain !== "string") return [];

  try {
    const dir = join(process.cwd(), "eval", "cases", domain);
    return readdirSync(dir)
      .filter((file) => file.endsWith(".json"))
      .map((file) => JSON.parse(readFileSync(join(dir, file), "utf8")));
  } catch {
    return [];
  }
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { loadEvalCases } from "./case-loader.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Runs the eval cases of one domain and summarises the outcome.
 *
 * A case counts as passed when its `expectedDomain` equals the requested
 * domain; no research is executed here.
 *
 * @param {{domain: string}} params - Domain whose cases should be loaded.
 * @returns {Promise<{total: number, passed: number, passRate: number}>}
 *   Counts plus `passed / total` (0 when there are no cases).
 */
export async function runEvalSuite({ domain }) {
  const cases = loadEvalCases(domain);
  const total = cases.length;

  let passed = 0;
  for (const item of cases) {
    if (item.expectedDomain === domain) passed += 1;
  }

  const passRate = total === 0 ? 0 : passed / total;
  return { total, passed, passRate };
}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { fileURLToPath } from "node:url";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
|
|
5
|
+
// Absolute path of the `Scrapling` directory that sits one level above this module.
// NOTE(review): that directory is not among the files listed for this release — confirm it ships.
const SCRAPLING_ROOT = fileURLToPath(new URL("../Scrapling", import.meta.url));

// Case-insensitive markers of anti-bot / challenge pages.
const BLOCKED_PATTERNS = [
  /cloudflare/i,
  /turnstile/i,
  /captcha/i,
  /please enable cookies/i,
  /bot detection/i,
  /verify you are human/i,
  /security check/i,
];

// Case-insensitive markers of pages that are rendered client-side.
const DYNAMIC_PATTERNS = [
  /__next_data__/i,
  /__nuxt__/i,
  /data-reactroot/i,
  /hydrat/i,
  /window\.__INITIAL_STATE__/i,
  /id=["']app["']/i,
  /id=["']root["']/i,
];
|
|
24
|
+
|
|
25
|
+
/**
 * Reduces an HTML string to whitespace-normalised plain text.
 *
 * Script, style and noscript elements are dropped with their contents, the
 * remaining tags are removed, non-breaking-space entities are decoded and all
 * whitespace runs collapse to a single space.
 *
 * @param {*} value - HTML source; falsy values are treated as an empty string.
 * @returns {string} Trimmed plain text.
 */
function stripHtml(value) {
  return String(value || "")
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    // Decode the non-breaking-space entity (named and numeric forms) so it
    // neither glues words together nor pads the text-length heuristics.
    .replace(/&nbsp;|&#160;|&#xa0;/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Classifies the outcome of a page fetch so the caller can pick a retry strategy.
 *
 * @param {object} [attempt]
 * @param {number} [attempt.status=200] - HTTP status of the response.
 * @param {string} [attempt.body=""] - Raw response body.
 * @param {string} [attempt.contentType=""] - Content-Type header value.
 * @param {string} [attempt.url=""] - Final URL; scanned together with the body for markers.
 * @returns {{blocked: boolean, dynamic: boolean, weak: boolean, mode: string, plainLength: number}}
 *   `mode` is "stealthy" for blocked pages, "dynamic" for client-rendered
 *   pages and "async" otherwise; `plainLength` is the stripped text length.
 */
export function assessPageAttempt({ status = 200, body = "", contentType = "", url = "" } = {}) {
  const text = String(body || "");
  const plain = stripHtml(text);
  const lower = `${text}\n${url}`.toLowerCase();

  // Anti-bot wording only counts when the page carries little real text.
  const antiBotSignal = BLOCKED_PATTERNS.some((pattern) => pattern.test(lower));
  const blocked = status === 403 || status === 429 || (antiBotSignal && plain.length < 1000);

  // Tag names are case-insensitive in HTML, so look for "<script" in the
  // lowercased body just like every other marker.
  const hasScriptTag = text.toLowerCase().includes("<script");
  const dynamic = !blocked && (DYNAMIC_PATTERNS.some((pattern) => pattern.test(lower)) || (hasScriptTag && plain.length < 400));

  const weak = blocked || plain.length < 300 || (!/text\/(html|plain)/i.test(contentType) && plain.length < 500);

  return {
    blocked,
    dynamic,
    weak,
    mode: blocked ? "stealthy" : dynamic ? "dynamic" : "async",
    plainLength: plain.length,
  };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Picks the fetch mode for a page attempt.
 *
 * @param {object} [input] - Same shape as the argument of `assessPageAttempt`.
 * @returns {string} The `mode` field of the assessment.
 */
export function chooseScraplingMode(input) {
  const { mode } = assessPageAttempt(input);
  return mode;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Returns the Python program that performs one Scrapling fetch.
 *
 * The program reads four argv values (Scrapling root, mode, URL and a JSON
 * payload with an optional `timeout` in milliseconds) and prints exactly one
 * JSON object to stdout: `{ok: true, url, status, contentType, body, headers}`
 * on success, or `{ok: false, error, type}` with exit code 1 on failure.
 *
 * @returns {string} Python source suitable for `python -c`.
 */
function pythonScript() {
  return String.raw`
import asyncio
import json
import sys

root = sys.argv[1]
mode = sys.argv[2]
url = sys.argv[3]
payload = json.loads(sys.argv[4])

sys.path.insert(0, root)

async def main():
    from scrapling.fetchers import AsyncFetcher, DynamicFetcher, StealthyFetcher

    timeout_ms = payload.get("timeout")
    kwargs = {}

    if mode == "async":
        # The HTTP fetcher takes its timeout in seconds; the payload is in milliseconds.
        if timeout_ms:
            kwargs["timeout"] = timeout_ms / 1000
        response = await AsyncFetcher.get(url, **kwargs)
    else:
        # The browser fetchers take milliseconds. Their blocking fetch() cannot
        # run inside an active event loop, so use the coroutine variants.
        if timeout_ms:
            kwargs["timeout"] = timeout_ms
        if mode == "dynamic":
            response = await DynamicFetcher.async_fetch(url, **kwargs)
        else:
            response = await StealthyFetcher.async_fetch(url, **kwargs)

    headers = {}
    raw_headers = getattr(response, "headers", None)
    if hasattr(raw_headers, "items"):
        headers = dict(raw_headers.items())
    else:
        try:
            headers = dict(raw_headers or {})
        except Exception:
            headers = {}

    # Header names are case-insensitive; do not rely on a lowercase key.
    content_type = ""
    for key, value in headers.items():
        if str(key).lower() == "content-type":
            content_type = str(value)
            break

    body = getattr(response, "body", None)
    if body is None:
        candidate = getattr(response, "text", None)
        body = candidate() if callable(candidate) else candidate

    if isinstance(body, bytes):
        body = body.decode("utf-8", "replace")
    elif not isinstance(body, str):
        body = str(body or "")

    out = {
        "ok": True,
        "url": getattr(response, "url", url),
        "status": getattr(response, "status", 200),
        "contentType": content_type,
        "body": body,
        "headers": headers,
    }
    # default=str keeps an unexpected non-JSON header value from aborting the dump.
    print(json.dumps(out, default=str))

try:
    asyncio.run(main())
except Exception as exc:
    print(json.dumps({"ok": False, "error": str(exc), "type": exc.__class__.__name__}))
    sys.exit(1)
`;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Fetches a URL by running the Scrapling helper script in a Python child process.
 *
 * Never rejects: every failure path (spawn error, non-zero exit, unparsable
 * or `ok: false` output, cancellation) resolves to `null`.
 *
 * @param {string} url - Page to fetch.
 * @param {string} mode - Scrapling mode passed to the script; a falsy mode skips the fetch.
 * @param {AbortSignal} [signal] - Cancels the fetch; the child is killed with SIGKILL.
 * @param {object} [config] - `pageTimeoutMs` is forwarded to the script (default 30000).
 * @returns {Promise<object|null>} The parsed script output, or `null`.
 */
export async function fetchWithScrapling(url, mode, signal, config = {}) {
  if (!mode) return null;
  // Do not start a Python process for a request that is already cancelled.
  if (signal && signal.aborted) return null;

  const timeout = (config && config.pageTimeoutMs) || 30000;

  return await new Promise((resolve) => {
    const child = spawn(
      process.env.PYTHON || "python3",
      ["-c", pythonScript(), SCRAPLING_ROOT, mode, url, JSON.stringify({ timeout })],
      {
        env: {
          ...process.env,
          PYTHONPATH: [SCRAPLING_ROOT, process.env.PYTHONPATH].filter(Boolean).join(path.delimiter),
        },
        stdio: ["ignore", "pipe", "pipe"],
      },
    );

    let stdout = "";
    let settled = false;

    function finish(value) {
      if (settled) return;
      settled = true;
      // Drop the listener so a long-lived signal does not retain this closure.
      if (signal) signal.removeEventListener("abort", onAbort);
      resolve(signal && signal.aborted ? null : value);
    }

    function onAbort() {
      child.kill("SIGKILL");
      finish(null);
    }

    // Decode as UTF-8 at the stream level so multi-byte characters that
    // straddle chunk boundaries are not corrupted.
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    // stderr is not used, but it must be drained so the child cannot stall on a full pipe.
    child.stderr.resume();

    child.on("error", () => finish(null));
    child.on("close", (code) => {
      if (code !== 0) return finish(null);
      try {
        const parsed = JSON.parse(stdout.trim() || "{}");
        return finish(parsed.ok ? parsed : null);
      } catch {
        return finish(null);
      }
    });

    if (signal) {
      if (signal.aborted) onAbort();
      else signal.addEventListener("abort", onAbort, { once: true });
    }
  });
}
|
|
175
|
+
|
|
176
|
+
// Bundles the page-fetch helpers of this module into a single adapter object.
export const pageFetchAdapter = {
  assessPageAttempt: assessPageAttempt,
  chooseScraplingMode: chooseScraplingMode,
  fetchWithScrapling: fetchWithScrapling,
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Normalises one evidence record to the `{type, source, snippet}` shape.
 *
 * Non-object input (including `null`, which a parameter default does not
 * cover) is treated as an empty record instead of throwing.
 *
 * @param {object} [evidence] - Raw evidence; extra properties are dropped.
 * @returns {{type: string, source: string, snippet: string}} Record with
 *   "web" / "" / "" substituted for missing or falsy fields.
 */
export function createEvidence(evidence = {}) {
  const input = evidence !== null && typeof evidence === "object" ? evidence : {};
  return {
    type: input.type || "web",
    source: input.source || "",
    snippet: input.snippet || "",
  };
}
|
|
8
|
+
|
|
9
|
+
/**
 * Normalises one claim to the `{text, confidence, evidence}` shape.
 *
 * Non-object input (including `null`) is treated as an empty claim instead
 * of throwing; a non-array `evidence` becomes an empty list.
 *
 * @param {object} [claim] - Raw claim.
 * @returns {{text: string, confidence: string, evidence: object[]}} Claim with
 *   "" / "low" defaults and each evidence entry passed through `createEvidence`.
 */
export function createClaim(claim = {}) {
  const input = claim !== null && typeof claim === "object" ? claim : {};
  const evidence = Array.isArray(input.evidence)
    ? input.evidence.map((item) => createEvidence(item))
    : [];
  return {
    text: input.text || "",
    confidence: input.confidence || "low",
    evidence,
  };
}
|
|
16
|
+
|
|
17
|
+
/**
 * Turns a confidence label into a one-sentence explanation.
 *
 * @param {string} [confidence="low"] - "high", "medium", or anything else.
 * @param {number} [evidenceCount=0] - Number of evidence items behind the claim.
 * @returns {string} The "multiple sources" sentence for "high" with at least
 *   two items, the "some evidence" sentence for "medium", otherwise the
 *   "limited evidence" sentence.
 */
export function explainConfidence(confidence = "low", evidenceCount = 0) {
  switch (confidence) {
    case "high":
      if (evidenceCount >= 2) return "Multiple sources support this claim.";
      break;
    case "medium":
      return "Some supporting evidence was found.";
    default:
      break;
  }
  return "Limited supporting evidence was found.";
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Coerces a value to a lowercase string; falsy values become "".
 *
 * @param {*} value - Anything string-coercible.
 * @returns {string} Lowercased text.
 */
function text(value) {
  const raw = value || "";
  return String(raw).toLowerCase();
}
|
|
4
|
+
|
|
5
|
+
/**
 * Maps a free-form question to the name of the domain pack that should handle it.
 *
 * Keywords are matched as whole words (with their common plural forms) so
 * that unrelated words which merely contain a keyword — "inspect" and
 * "specific" for "spec", "tissue" for "issue", "newspaper" for "paper" — no
 * longer route a question to a restricted pack. Rules are tried in order and
 * the first match wins.
 *
 * @param {*} question - Question text; falsy values classify as "web".
 * @returns {string} One of "security", "vendor-status", "changelog", "github",
 *   "papers", "specs", "forums", "package-registry" or "web".
 */
export function classifyQuestionDomain(question) {
  const q = String(question || "").toLowerCase();

  // Order matters: it is the precedence between overlapping topics.
  const rules = [
    ["security", /\b(?:cve|advisor(?:y|ies)|(?:cyber)?security|vulnerabilit(?:y|ies)|exploits?)\b/],
    ["vendor-status", /\b(?:status(?: page)?|outages?|incidents?)\b/],
    ["changelog", /\b(?:changelogs?|release notes?|release[ds]?|version history)\b/],
    ["github", /\b(?:github|issues?|pull requests?|repos?|repositor(?:y|ies)|discussions?)\b/],
    // The lookbehind keeps the package's own name ("pi-research") from counting as "research".
    ["papers", /\b(?:arxiv|(?:white)?papers?|stud(?:y|ies)|(?<!pi-)research|scientific|scholar(?:ly|s)?)\b/],
    ["specs", /\b(?:rfcs?|rfc\d+|specs?|specifications?|standards?)\b/],
    ["forums", /\b(?:stackoverflow|stack overflow|discourse|reddit|forums?)\b/],
    ["package-registry", /\b(?:npm|pypi|cargo|maven|package registry|packages?|librar(?:y|ies))\b/],
  ];

  const match = rules.find(([, pattern]) => pattern.test(q));
  return match ? match[0] : "web";
}
|
|
17
|
+
|
|
18
|
+
/**
 * Reads the research mode from an options object.
 *
 * @param {object} [input] - Options that may carry a `mode` property.
 * @param {string} [fallback="fast"] - Returned when `input` is not an object or has no truthy `mode`.
 * @returns {*} `input.mode` when present and truthy, otherwise `fallback`.
 */
export function normalizeResearchMode(input = {}, fallback = "fast") {
  const isObject = Boolean(input) && typeof input === "object";
  if (isObject && input.mode) {
    return input.mode;
  }
  return fallback;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * Reads the output format from an options object.
 *
 * @param {object} [input] - Options that may carry a `format` property.
 * @param {string} [fallback="markdown"] - Returned when `input` is not an object or has no truthy `format`.
 * @returns {*} `input.format` when present and truthy, otherwise `fallback`.
 */
export function resolveOutputFormat(input = {}, fallback = "markdown") {
  const isObject = Boolean(input) && typeof input === "object";
  if (isObject && input.format) {
    return input.format;
  }
  return fallback;
}
|
|
4
|
+
|
|
5
|
+
/**
 * Decides whether only authoritative sources may be used.
 *
 * @param {object} [input] - Options that may carry `requireAuthoritative`.
 * @param {*} [fallback=false] - Used (coerced to boolean) when the option is absent or falsy.
 * @returns {boolean} `true` when the option is truthy on an object input, or when `fallback` is truthy.
 */
export function shouldRequireAuthoritativeSources(input = {}, fallback = false) {
  const isObject = input !== null && typeof input === "object";
  if (isObject && input.requireAuthoritative) {
    return true;
  }
  return Boolean(fallback);
}
|
package/lib/research.js
CHANGED
|
@@ -386,6 +386,25 @@ export function rankFetchedPages(pages, query, limit = pages.length, config = {}
|
|
|
386
386
|
return [...pages].sort((a, b) => scoreFetchedPage(b, query, config) - scoreFetchedPage(a, query, config)).slice(0, limit);
|
|
387
387
|
}
|
|
388
388
|
|
|
389
|
+
/**
 * Flags a set of claims as conflicting when at least one claim reads as
 * positive and a different claim reads as negative.
 *
 * Each claim gets a single polarity. Negative wording wins, so "not
 * supported" is never also counted as positive because it contains the word
 * "supported", and negated positives ("not recommended", "no longer
 * available") count as negative. One claim can therefore no longer conflict
 * with itself.
 *
 * @param {Array<string|{text?: string}>} [claims] - Claim strings or claim objects.
 * @returns {{detected: boolean, conflictSummary: string}} `conflictSummary`
 *   is "Claims conflict." when detected, otherwise "".
 */
export function detectClaimConflicts(claims = []) {
  const positive = /\b(supported|works|available|recommended|yes|stable|compatible)\b/;
  const negative = /\b(not supported|unsupported|does not|no support|broken|incompatible|removed)\b/;
  const negatedPositive = /\b(?:not|never|no longer)\s+(?:supported|available|recommended|stable|compatible)\b/;

  let hasPositive = false;
  let hasNegative = false;
  for (const claim of Array.isArray(claims) ? claims : []) {
    const raw = typeof claim === "string" ? claim : claim?.text;
    const text = String(raw || "").toLowerCase();
    if (negative.test(text) || negatedPositive.test(text)) hasNegative = true;
    else if (positive.test(text)) hasPositive = true;
  }

  const detected = hasPositive && hasNegative;
  return {
    detected,
    conflictSummary: detected ? "Claims conflict." : "",
  };
}
|
|
398
|
+
|
|
399
|
+
/**
 * Reports whether the supplied claims lack supporting evidence.
 *
 * A gap is reported only when claims were supplied and none of them carries
 * any evidence. With no claims there is nothing to assess, so no gap is
 * reported; otherwise callers that pass no claims would always be told that
 * "authoritative sources" are missing.
 *
 * @param {{claims?: Array<{evidence?: Array}>}} [input] - Payload holding the claims.
 * @returns {{detected: boolean, missingAspects: string[]}} `missingAspects`
 *   is `["authoritative sources"]` when a gap is detected, otherwise `[]`.
 */
export function detectCoverageGaps(input = {}) {
  const claims = Array.isArray(input?.claims) ? input.claims : [];
  if (claims.length === 0) {
    return { detected: false, missingAspects: [] };
  }

  const hasBackedClaim = claims.some((claim) => Array.isArray(claim?.evidence) && claim.evidence.length > 0);
  return {
    detected: !hasBackedClaim,
    missingAspects: hasBackedClaim ? [] : ["authoritative sources"],
  };
}
|
|
407
|
+
|
|
389
408
|
export function detectConflictSignals(pages) {
|
|
390
409
|
if (!Array.isArray(pages) || pages.length < 2) {
|
|
391
410
|
return { detected: false, reason: null, conflictSummary: "", conflictingSourcePairs: [] };
|
|
@@ -592,15 +611,17 @@ export function extractCodeBlocks(text) {
|
|
|
592
611
|
export function evaluateSufficiency(input, legacyPages, legacyConflictDetected = false) {
|
|
593
612
|
const payload = typeof input === "string"
|
|
594
613
|
? { query: input, sources: legacyPages || [], conflictDetected: legacyConflictDetected }
|
|
595
|
-
: { query: input?.query || "", sources: input?.sources || [], conflictDetected: Boolean(input?.conflictDetected), confidence: input?.confidence, minSources: input?.minSources };
|
|
614
|
+
: { query: input?.query || "", sources: input?.sources || [], claims: input?.claims || [], conflictDetected: Boolean(input?.conflictDetected), confidence: input?.confidence, minSources: input?.minSources };
|
|
596
615
|
|
|
597
616
|
const scoredSources = payload.sources.map((page) => scoreSourceEntry(page, payload.query || ""));
|
|
598
617
|
const authoritativeCount = scoredSources.filter((scored) => Boolean(scored.authoritative)).length;
|
|
599
618
|
const authoritativeSourcesFound = authoritativeCount > 0;
|
|
600
619
|
const conflict = detectConflictSignals(payload.sources);
|
|
601
|
-
const
|
|
620
|
+
const claimConflict = detectClaimConflicts(payload.claims);
|
|
621
|
+
const coverage = detectCoverageGaps(payload);
|
|
622
|
+
const conflictDetected = payload.conflictDetected || conflict.detected || claimConflict.detected;
|
|
602
623
|
const missingAspects = [];
|
|
603
|
-
if (!authoritativeSourcesFound) missingAspects.push("authoritative sources");
|
|
624
|
+
if (!authoritativeSourcesFound || coverage.detected) missingAspects.push("authoritative sources");
|
|
604
625
|
if (conflictDetected) missingAspects.push("conflict resolution");
|
|
605
626
|
if (!payload.sources.length) missingAspects.push("readable sources");
|
|
606
627
|
|
|
@@ -654,6 +675,16 @@ export function compactResearchPayload(payload) {
|
|
|
654
675
|
...(typeof source.local === "boolean" ? { local: source.local } : {}),
|
|
655
676
|
}))
|
|
656
677
|
: [],
|
|
678
|
+
claims: Array.isArray(payload.claims) ? payload.claims.slice(0, 8).map((claim) => ({
|
|
679
|
+
text: claim.text,
|
|
680
|
+
confidence: claim.confidence,
|
|
681
|
+
evidence: Array.isArray(claim.evidence) ? claim.evidence.slice(0, 5).map((evidence) => ({
|
|
682
|
+
type: evidence.type,
|
|
683
|
+
source: evidence.source,
|
|
684
|
+
snippet: evidence.snippet,
|
|
685
|
+
})) : [],
|
|
686
|
+
})) : [],
|
|
687
|
+
evidenceSummary: payload.evidenceSummary || "",
|
|
657
688
|
sourceTypes: Array.isArray(payload.sourceTypes) ? payload.sourceTypes.slice(0, 8) : [],
|
|
658
689
|
unverifiedClaims: Array.isArray(payload.unverifiedClaims) ? payload.unverifiedClaims.slice(0, 8) : [],
|
|
659
690
|
meta: payload.meta && typeof payload.meta === "object" ? payload.meta : undefined,
|
|
@@ -675,12 +706,20 @@ export function extractPageSnapshot(html, url) {
|
|
|
675
706
|
return { title, url, text: stripTags(body), codeBlocks: extractCodeBlocks(html) };
|
|
676
707
|
}
|
|
677
708
|
|
|
678
|
-
export function formatResearchResponse({ answer, bullets, sources, confidence }) {
|
|
709
|
+
export function formatResearchResponse({ answer, bullets, sources, confidence, format = "markdown" }) {
|
|
710
|
+
const list = Array.isArray(sources) ? sources : [];
|
|
711
|
+
if (format === "json") {
|
|
712
|
+
return JSON.stringify({ answer: String(answer || "").trim(), bullets: bullets || [], confidence: confidence || "", sources: list });
|
|
713
|
+
}
|
|
714
|
+
if (format === "table") {
|
|
715
|
+
const rows = list.map((source, index) => `| ${index + 1} | ${source.title} | ${source.url} |`).join("\n");
|
|
716
|
+
return ["| # | Title | URL |", "|---|---|---|", rows].filter(Boolean).join("\n").trim();
|
|
717
|
+
}
|
|
679
718
|
const parts = ["## Answer", "", String(answer || "").trim(), "", "## Key points"];
|
|
680
719
|
for (const bullet of bullets || []) parts.push(`- ${bullet}`);
|
|
681
720
|
if (confidence) parts.push("", "## Confidence", "", confidence);
|
|
682
721
|
parts.push("", "## Sources");
|
|
683
|
-
|
|
722
|
+
list.forEach((source, index) => {
|
|
684
723
|
const freshness = source.freshness ? ` (${source.freshness})` : "";
|
|
685
724
|
const meta = [];
|
|
686
725
|
if (source.sourceType) meta.push(source.sourceType);
|
package/lib/types.js
CHANGED
|
@@ -36,6 +36,8 @@ export function createResearchResult(result = {}) {
|
|
|
36
36
|
bullets: Array.isArray(result.bullets) ? result.bullets : [],
|
|
37
37
|
citations: Array.isArray(result.citations) ? result.citations : [],
|
|
38
38
|
sources: Array.isArray(result.sources) ? result.sources.map(createResearchSource) : [],
|
|
39
|
+
claims: Array.isArray(result.claims) ? result.claims : [],
|
|
40
|
+
evidenceSummary: result.evidenceSummary || "",
|
|
39
41
|
codeBlocks: Array.isArray(result.codeBlocks) ? result.codeBlocks : [],
|
|
40
42
|
sufficient: Boolean(result.sufficient),
|
|
41
43
|
missingAspects: Array.isArray(result.missingAspects) ? result.missingAspects : [],
|
package/lib/web-research.js
CHANGED
|
@@ -5,6 +5,8 @@ import { complete } from "@mariozechner/pi-ai";
|
|
|
5
5
|
|
|
6
6
|
import profiles from "./research-profiles.json" with { type: "json" };
|
|
7
7
|
import { createResearchResult } from "./types.js";
|
|
8
|
+
import { resolveDomainConfig } from "./domains/index.js";
|
|
9
|
+
import { classifyQuestionDomain } from "./research-intent.js";
|
|
8
10
|
import {
|
|
9
11
|
buildConfidenceSummary,
|
|
10
12
|
buildDeepQueries,
|
|
@@ -33,6 +35,8 @@ import {
|
|
|
33
35
|
scoreSourceEntry,
|
|
34
36
|
selectRelevantChunks,
|
|
35
37
|
} from "./research.js";
|
|
38
|
+
import { pageFetchAdapter } from "./page-fetch-adapter.js";
|
|
39
|
+
import { resolveOutputFormat, shouldRequireAuthoritativeSources } from "./research-output.js";
|
|
36
40
|
import { planResearch } from "./planner.js";
|
|
37
41
|
import {
|
|
38
42
|
clearResearchMemory,
|
|
@@ -79,15 +83,18 @@ export function resolveResearchConfig(input = "fast") {
|
|
|
79
83
|
const options = normalizeResearchOptions(input);
|
|
80
84
|
const base = profiles[options.mode] || profiles.fast;
|
|
81
85
|
const deep = options.deepResearchConfig || {};
|
|
86
|
+
const domainConfig = resolveDomainConfig(options.domain || "web");
|
|
82
87
|
|
|
83
88
|
return {
|
|
84
89
|
...base,
|
|
90
|
+
...domainConfig,
|
|
85
91
|
...options,
|
|
86
92
|
mode: base.mode,
|
|
87
93
|
maxTurns: options.maxTurns ?? (deep.depth ? Math.max(base.maxTurns || 1, deep.depth) : (base.maxTurns || 1)),
|
|
88
94
|
maxQueries: options.maxQueries ?? (deep.breadth ? Math.max(base.maxQueries || 2, deep.breadth * (deep.depth || 1)) : (base.maxQueries || 2)),
|
|
89
95
|
maxPages: options.maxSites ?? options.maxPages ?? base.maxPages,
|
|
90
|
-
allowedSourceTypes: options.allowedSourceTypes ?? base.allowedSourceTypes,
|
|
96
|
+
allowedSourceTypes: options.allowedSourceTypes ?? (Array.isArray(domainConfig.allowedSourceTypes) && domainConfig.allowedSourceTypes.length ? domainConfig.allowedSourceTypes : base.allowedSourceTypes),
|
|
97
|
+
allowedSources: options.allowedSources ?? (Array.isArray(domainConfig.allowedSources) && domainConfig.allowedSources.length ? domainConfig.allowedSources : base.allowedSources),
|
|
91
98
|
searchProvider: options.searchProvider ?? base.searchProvider,
|
|
92
99
|
concurrentQueries: deep.concurrency ?? options.concurrentQueries ?? 3,
|
|
93
100
|
depth: deep.depth ?? 1,
|
|
@@ -101,7 +108,10 @@ export function resolveResearchConfig(input = "fast") {
|
|
|
101
108
|
files: Array.isArray(options.files) ? options.files : [],
|
|
102
109
|
isolate: Boolean(options.isolate || process.env.RESEARCH_ISOLATE === "1"),
|
|
103
110
|
force: Boolean(options.force),
|
|
104
|
-
format: options.format
|
|
111
|
+
format: resolveOutputFormat(options, domainConfig.format || "markdown"),
|
|
112
|
+
queryHints: Array.isArray(domainConfig.queryHints) ? domainConfig.queryHints : [],
|
|
113
|
+
requireAuthoritative: Boolean(options.requireAuthoritative ?? domainConfig.requireAuthoritative),
|
|
114
|
+
domain: domainConfig.domain,
|
|
105
115
|
};
|
|
106
116
|
}
|
|
107
117
|
|
|
@@ -150,8 +160,11 @@ async function completeWithResearchModel(ctx, signal, prompt, reasoningEffort =
|
|
|
150
160
|
|
|
151
161
|
export async function buildQueries(query, mode = "fast", ctx, signal) {
|
|
152
162
|
const config = getResearchConfig(mode);
|
|
163
|
+
const hintedQueries = Array.isArray(config.queryHints) && config.queryHints.length
|
|
164
|
+
? config.queryHints.map((hint) => `${query} ${hint}`)
|
|
165
|
+
: [];
|
|
153
166
|
if (config.mode === "code") {
|
|
154
|
-
return planResearch(query, "code").subqueries.slice(0, config.maxQueries);
|
|
167
|
+
return [...new Set([...planResearch(query, "code").subqueries, ...hintedQueries])].slice(0, config.maxQueries);
|
|
155
168
|
}
|
|
156
169
|
if (config.mode === "deep" || config.mode === "academic") {
|
|
157
170
|
const prompt = [
|
|
@@ -165,15 +178,15 @@ export async function buildQueries(query, mode = "fast", ctx, signal) {
|
|
|
165
178
|
|
|
166
179
|
try {
|
|
167
180
|
const text = await completeWithResearchModel(ctx, signal, prompt, "low");
|
|
168
|
-
if (text) return parseDeepQueryPlan(text, query, config.maxQueries);
|
|
181
|
+
if (text) return [...new Set([...parseDeepQueryPlan(text, query, config.maxQueries), ...hintedQueries])].slice(0, config.maxQueries);
|
|
169
182
|
} catch {
|
|
170
183
|
// fall through
|
|
171
184
|
}
|
|
172
185
|
|
|
173
|
-
return buildDeepQueries(query, config.maxQueries);
|
|
186
|
+
return [...new Set([...buildDeepQueries(query, config.maxQueries), ...hintedQueries])].slice(0, config.maxQueries);
|
|
174
187
|
}
|
|
175
188
|
|
|
176
|
-
return buildFastQueries(query, config.maxQueries);
|
|
189
|
+
return [...new Set([...buildFastQueries(query, config.maxQueries), ...hintedQueries])].slice(0, config.maxQueries);
|
|
177
190
|
}
|
|
178
191
|
|
|
179
192
|
function withTimeoutSignal(signal, timeoutMs) {
|
|
@@ -334,7 +347,11 @@ function shouldSkipUrl(url) {
|
|
|
334
347
|
}
|
|
335
348
|
|
|
336
349
|
function shouldUseJinaFirst(url) {
|
|
337
|
-
|
|
350
|
+
try {
|
|
351
|
+
return /(^|\.)medium\.com$|(^|\.)dev\.to$|(^|\.)substack\.com$/i.test(new URL(url).hostname);
|
|
352
|
+
} catch {
|
|
353
|
+
return false;
|
|
354
|
+
}
|
|
338
355
|
}
|
|
339
356
|
|
|
340
357
|
function pageFromText(title, url, text, config, extra = {}) {
|
|
@@ -366,6 +383,7 @@ function withinTimeframe(page, config) {
|
|
|
366
383
|
|
|
367
384
|
export async function fetchPageSource(url, signal, config = getResearchConfig()) {
|
|
368
385
|
if (shouldSkipUrl(url)) return null;
|
|
386
|
+
const adapter = config.fetchAdapter || pageFetchAdapter;
|
|
369
387
|
const cacheKey = `${normalizeUrl(url)}::${config.pageTextLimit}::${JSON.stringify({
|
|
370
388
|
preferRecent: config.preferRecent || false,
|
|
371
389
|
minYear: config.minYear || "",
|
|
@@ -374,9 +392,12 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
374
392
|
})}`;
|
|
375
393
|
const cached = config.isolate ? null : getCacheValue(pageCache, cacheKey);
|
|
376
394
|
if (cached) return cached;
|
|
395
|
+
|
|
377
396
|
if (shouldUseJinaFirst(url)) {
|
|
378
397
|
const first = await fetchJinaPageSource(url, signal, config);
|
|
379
|
-
|
|
398
|
+
if (first && withinTimeframe(first, config)) {
|
|
399
|
+
return config.isolate ? first : setCacheValue(pageCache, cacheKey, first, PAGE_CACHE_TTL_MS);
|
|
400
|
+
}
|
|
380
401
|
}
|
|
381
402
|
|
|
382
403
|
try {
|
|
@@ -390,12 +411,31 @@ export async function fetchPageSource(url, signal, config = getResearchConfig())
|
|
|
390
411
|
|
|
391
412
|
const body = await response.text();
|
|
392
413
|
const snapshot = extractPageSnapshot(body, response.url || url);
|
|
393
|
-
|
|
414
|
+
let page = pageFromText(snapshot.title, snapshot.url, snapshot.text, config, {
|
|
394
415
|
publishDate: extractPublishDate(body),
|
|
395
416
|
sourceType: classifySourceType(snapshot.url, snapshot.title),
|
|
396
417
|
codeBlocks: snapshot.codeBlocks,
|
|
397
418
|
});
|
|
398
419
|
|
|
420
|
+
const assessment = adapter.assessPageAttempt?.({
|
|
421
|
+
status: response.status ?? 200,
|
|
422
|
+
body,
|
|
423
|
+
contentType,
|
|
424
|
+
url: response.url || url,
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
if ((!page && assessment?.weak) || assessment?.dynamic || assessment?.blocked) {
|
|
428
|
+
const scrapling = await adapter.fetchWithScrapling?.(url, assessment.mode, signal, config);
|
|
429
|
+
if (scrapling?.body) {
|
|
430
|
+
const scraplingSnapshot = extractPageSnapshot(scrapling.body, scrapling.url || url);
|
|
431
|
+
page = pageFromText(scraplingSnapshot.title, scraplingSnapshot.url, scraplingSnapshot.text, config, {
|
|
432
|
+
publishDate: extractPublishDate(scrapling.body),
|
|
433
|
+
sourceType: classifySourceType(scraplingSnapshot.url, scraplingSnapshot.title),
|
|
434
|
+
codeBlocks: scraplingSnapshot.codeBlocks,
|
|
435
|
+
});
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
399
439
|
const resolved = page || await fetchJinaPageSource(url, signal, config);
|
|
400
440
|
const finalPage = resolved && withinTimeframe(resolved, config) ? resolved : null;
|
|
401
441
|
return config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS);
|
|
@@ -499,8 +539,8 @@ function planSubqueries(rootQuery, currentQuery, config, sufficiency) {
|
|
|
499
539
|
return [...new Set(queries.filter(Boolean))].slice(0, Math.max(1, config.breadth || 2));
|
|
500
540
|
}
|
|
501
541
|
|
|
502
|
-
function formatResultText(result) {
|
|
503
|
-
return formatResearchResponse({ answer: result.answer, bullets: result.bullets, sources: result.sources, confidence: result.confidence });
|
|
542
|
+
function formatResultText(result, format) {
|
|
543
|
+
return formatResearchResponse({ answer: result.answer, bullets: result.bullets, sources: result.sources, confidence: result.confidence, format });
|
|
504
544
|
}
|
|
505
545
|
|
|
506
546
|
function modeCacheKey(query, config) {
|
|
@@ -520,7 +560,8 @@ function modeCacheKey(query, config) {
|
|
|
520
560
|
}
|
|
521
561
|
|
|
522
562
|
export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast") {
|
|
523
|
-
const
|
|
563
|
+
const domain = classifyQuestionDomain(query);
|
|
564
|
+
const config = getResearchConfig(typeof mode === "object" ? { ...mode, domain } : { mode, domain });
|
|
524
565
|
const cacheKey = modeCacheKey(query, config);
|
|
525
566
|
|
|
526
567
|
if (!config.isolate && !config.force) {
|
|
@@ -546,7 +587,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
546
587
|
let conflictSummary = "";
|
|
547
588
|
let conflictingSourcePairs = [];
|
|
548
589
|
let sufficiency = { sufficient: false, confidenceScore: 0.1, missingAspects: [], openSubQuestions: [] };
|
|
549
|
-
let currentQueries = await buildQueries(query, config
|
|
590
|
+
let currentQueries = await buildQueries(query, config, ctx, signal);
|
|
550
591
|
subqueries = [...currentQueries];
|
|
551
592
|
|
|
552
593
|
const localPages = await readLocalFiles(config.files || [], config);
|
|
@@ -665,7 +706,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
665
706
|
citations: synthesis.citations || [],
|
|
666
707
|
sources,
|
|
667
708
|
codeBlocks,
|
|
668
|
-
sufficient: sufficiency.sufficient && unverifiedRatio <= 0.2,
|
|
709
|
+
sufficient: sufficiency.sufficient && unverifiedRatio <= 0.2 && (!shouldRequireAuthoritativeSources(config) || sufficiency.authoritativeSourcesFound),
|
|
669
710
|
missingAspects: sufficiency.missingAspects,
|
|
670
711
|
openSubQuestions,
|
|
671
712
|
conflictSummary: conflictSummary || sufficiency.conflictSummary || "",
|
|
@@ -698,6 +739,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
698
739
|
sources: normalizedResult.sources,
|
|
699
740
|
sourceTypes,
|
|
700
741
|
codeBlocks: normalizedResult.codeBlocks,
|
|
742
|
+
format: config.format,
|
|
701
743
|
confidence,
|
|
702
744
|
meta: normalizedResult.meta,
|
|
703
745
|
confidenceScore: sufficiency.confidenceScore,
|
|
@@ -707,7 +749,7 @@ export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast"
|
|
|
707
749
|
openSubQuestions: normalizedResult.openSubQuestions,
|
|
708
750
|
missingAspects: normalizedResult.missingAspects,
|
|
709
751
|
unverifiedClaims: normalizedResult.unverifiedClaims,
|
|
710
|
-
contentText: formatResultText({ answer: normalizedResult.answer, bullets: normalizedResult.bullets, sources: normalizedResult.sources, confidence }),
|
|
752
|
+
contentText: formatResultText({ answer: normalizedResult.answer, bullets: normalizedResult.bullets, sources: normalizedResult.sources, confidence }, config.format),
|
|
711
753
|
};
|
|
712
754
|
|
|
713
755
|
setResearchMemory(cacheKey, result);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-research",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"private": false,
|
|
5
5
|
"type": "module",
|
|
6
6
|
"description": "Pi extension for web research.",
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
"index.js",
|
|
12
12
|
"lib",
|
|
13
13
|
"README.md",
|
|
14
|
+
"THIRD_PARTY_NOTICES.md",
|
|
14
15
|
"package.json"
|
|
15
16
|
],
|
|
16
17
|
"repository": {
|
|
@@ -25,11 +26,13 @@
|
|
|
25
26
|
"pi-package"
|
|
26
27
|
],
|
|
27
28
|
"scripts": {
|
|
28
|
-
"test": "node --test"
|
|
29
|
+
"test": "node --test",
|
|
30
|
+
"eval": "node --test test/eval-runner.test.js"
|
|
29
31
|
},
|
|
30
32
|
"dependencies": {
|
|
31
|
-
"@mariozechner/pi-ai": "
|
|
32
|
-
"
|
|
33
|
+
"@mariozechner/pi-ai": "*",
|
|
34
|
+
"pi-research": "^1.0.2",
|
|
35
|
+
"typebox": "*"
|
|
33
36
|
},
|
|
34
37
|
"peerDependencies": {
|
|
35
38
|
"@mariozechner/pi-ai": "*",
|