@apmantza/greedysearch-pi 1.4.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -208
- package/cdp.mjs +16 -16
- package/extractors/bing-copilot.mjs +12 -21
- package/extractors/consent.mjs +10 -3
- package/extractors/gemini.mjs +12 -53
- package/extractors/google-ai.mjs +7 -10
- package/extractors/perplexity.mjs +28 -31
- package/extractors/selectors.mjs +52 -52
- package/index.ts +623 -623
- package/launch.mjs +33 -33
- package/newfeaturesideas.md +105 -0
- package/package.json +1 -1
- package/skills/greedy-search/SKILL.md +145 -145
- package/test.sh +298 -298
package/README.md
CHANGED
|
@@ -1,208 +1,219 @@
|
|
|
1
|
-
# GreedySearch for Pi
|
|
2
|
-
|
|
3
|
-
Pi extension that adds a `greedy_search` tool — fans out queries to Perplexity, Bing Copilot, and Google AI simultaneously and returns AI-synthesized answers with deduped sources. Streams progress as each engine completes.
|
|
4
|
-
|
|
5
|
-
Forked from [GreedySearch-claude](https://github.com/apmantza/GreedySearch-claude).
|
|
6
|
-
|
|
7
|
-
##
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
##
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
|
42
|
-
|
|
43
|
-
| `
|
|
44
|
-
| `
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
**
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
```
|
|
101
|
-
greedy_search({ query: "
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
```
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
##
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
```
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
###
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
- `
|
|
200
|
-
-
|
|
201
|
-
- `
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
1
|
+
# GreedySearch for Pi
|
|
2
|
+
|
|
3
|
+
Pi extension that adds a `greedy_search` tool — fans out queries to Perplexity, Bing Copilot, and Google AI simultaneously and returns AI-synthesized answers with deduped sources. Streams progress as each engine completes.
|
|
4
|
+
|
|
5
|
+
Forked from [GreedySearch-claude](https://github.com/apmantza/GreedySearch-claude).
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pi install npm:@apmantza/greedysearch-pi
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or directly from git:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pi install git:github.com/apmantza/GreedySearch-pi
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
Once installed, Pi gains a `greedy_search` tool. The model will use it automatically for questions about current libraries, error messages, version-specific docs, etc.
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
greedy_search({ query: "What's new in React 19?", engine: "all" })
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Parameters
|
|
28
|
+
|
|
29
|
+
| Parameter | Type | Default | Description |
|
|
30
|
+
|-----------|------|---------|-------------|
|
|
31
|
+
| `query` | string | required | The search question |
|
|
32
|
+
| `engine` | string | `"all"` | Engine to use (see below) |
|
|
33
|
+
| `synthesize` | boolean | `false` | Synthesize results into one answer via Gemini |
|
|
34
|
+
| `fullAnswer` | boolean | `false` | Return complete answer (~3000+ chars) vs truncated preview (~300 chars) |
|
|
35
|
+
|
|
36
|
+
## Engines
|
|
37
|
+
|
|
38
|
+
| Engine | Alias | Latency | Best for |
|
|
39
|
+
|--------|-------|---------|----------|
|
|
40
|
+
| `all` | — | 30-90s | Highest confidence — all 3 engines in parallel (default) |
|
|
41
|
+
| `perplexity` | `p` | 15-30s | Technical Q&A, code explanations, documentation |
|
|
42
|
+
| `bing` | `b` | 15-30s | Recent news, Microsoft ecosystem |
|
|
43
|
+
| `google` | `g` | 15-30s | Broad coverage, multiple perspectives |
|
|
44
|
+
| `gemini` | `gem` | 15-30s | Google's AI with different training data |
|
|
45
|
+
|
|
46
|
+
## Streaming Progress
|
|
47
|
+
|
|
48
|
+
When using `engine: "all"`, the tool streams progress as each engine completes:
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
**Searching...** ⏳ perplexity · ⏳ bing · ⏳ google
|
|
52
|
+
**Searching...** ✅ perplexity done · ⏳ bing · ⏳ google
|
|
53
|
+
**Searching...** ✅ perplexity done · ✅ bing done · ⏳ google
|
|
54
|
+
**Searching...** ✅ perplexity done · ✅ bing done · ✅ google done
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Synthesis Mode
|
|
58
|
+
|
|
59
|
+
For complex research questions, use `synthesize: true` with `engine: "all"`:
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
greedy_search({ query: "best auth patterns for SaaS in 2026", engine: "all", synthesize: true })
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This deduplicates sources across engines, builds a normalized source registry, and feeds that context to Gemini for one clean synthesized answer. Adds ~30s but returns agreement summaries, caveats, key claims, and better-labeled top sources.
|
|
66
|
+
|
|
67
|
+
**Use synthesis when:**
|
|
68
|
+
- You need one definitive answer, not multiple perspectives
|
|
69
|
+
- You're researching a topic in order to write about it or to make a decision
|
|
70
|
+
- Token efficiency matters (one answer vs three)
|
|
71
|
+
|
|
72
|
+
**Skip synthesis when:**
|
|
73
|
+
- You want to see where engines disagree
|
|
74
|
+
- Speed matters
|
|
75
|
+
|
|
76
|
+
## Full vs Short Answers
|
|
77
|
+
|
|
78
|
+
By default, each answer is truncated to a ~300-character preview to save tokens. Use `fullAnswer: true` for complete responses:
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
greedy_search({ query: "explain the React compiler", engine: "perplexity", fullAnswer: true })
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Examples
|
|
85
|
+
|
|
86
|
+
**Quick technical lookup:**
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
greedy_search({ query: "How to use async await in Python", engine: "perplexity" })
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Compare tools (see where engines agree/disagree):**
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
greedy_search({ query: "Prisma vs Drizzle in 2026", engine: "all" })
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Research with synthesis:**
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
greedy_search({ query: "Best practices for monorepo structure", engine: "all", synthesize: true })
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Debug an error:**
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
greedy_search({ query: "Error: Cannot find module 'react-dom/client' Next.js 15", engine: "all" })
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Requirements
|
|
111
|
+
|
|
112
|
+
- **Chrome** — must be installed. The extension auto-launches a dedicated Chrome instance on port 9222 with its own isolated profile and DevTools port file, separate from your main browser session.
|
|
113
|
+
- **Node.js 22+** — for built-in `fetch` and WebSocket support.
|
|
114
|
+
|
|
115
|
+
## Setup (first time)
|
|
116
|
+
|
|
117
|
+
To pre-launch the dedicated GreedySearch Chrome instance:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
node ~/.pi/agent/git/GreedySearch-pi/launch.mjs
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Stop it when done:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
node ~/.pi/agent/git/GreedySearch-pi/launch.mjs --kill
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Check status:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
node ~/.pi/agent/git/GreedySearch-pi/launch.mjs --status
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Testing
|
|
136
|
+
|
|
137
|
+
Run the test suite to verify everything works:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
./test.sh # full suite (~3-4 min)
|
|
141
|
+
./test.sh quick # skip parallel tests (~1 min)
|
|
142
|
+
./test.sh parallel # parallel race condition tests only
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Tests verify:
|
|
146
|
+
- Single engine mode (perplexity, bing, google)
|
|
147
|
+
- Sequential "all" mode searches
|
|
148
|
+
- Parallel "all" mode (5 concurrent searches) — detects tab race conditions
|
|
149
|
+
- Synthesis mode with Gemini
|
|
150
|
+
|
|
151
|
+
## Troubleshooting
|
|
152
|
+
|
|
153
|
+
### "Chrome not found"
|
|
154
|
+
|
|
155
|
+
Set the path explicitly:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
export CHROME_PATH="/path/to/chrome"
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### "CDP timeout" or "Chrome may have crashed"
|
|
162
|
+
|
|
163
|
+
Restart GreedySearch Chrome:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
node ~/.pi/agent/git/GreedySearch-pi/launch.mjs --kill
|
|
167
|
+
node ~/.pi/agent/git/GreedySearch-pi/launch.mjs
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Google / Bing "verify you're human"
|
|
171
|
+
|
|
172
|
+
The extension auto-clicks verification buttons and Cloudflare Turnstile challenges using broad keyword matching — resilient to variations like "Verify you are human" or localised button text. For hard CAPTCHAs (image puzzles), solve manually in the Chrome window that opens.
|
|
173
|
+
|
|
174
|
+
### Parallel searches failing
|
|
175
|
+
|
|
176
|
+
Each search creates a fresh isolated browser tab that is closed after completion, so parallel searches should not conflict over tab state. To check for tab race conditions, run `./test.sh parallel`.
|
|
177
|
+
|
|
178
|
+
### Search hangs
|
|
179
|
+
|
|
180
|
+
Chrome may be unresponsive. Restart it with `launch.mjs --kill` then `launch.mjs`.
|
|
181
|
+
|
|
182
|
+
### Sources are empty or junk links
|
|
183
|
+
|
|
184
|
+
Sources are now extracted by regex-parsing Markdown links (`[title](url)`) from the clipboard text captured after each engine responds — not from DOM selectors that break when the engine's UI updates. If sources are empty, the engine's clipboard copy didn't include formatted links (Bing Copilot currently falls into this category).
|
|
185
|
+
|
|
186
|
+
## How It Works
|
|
187
|
+
|
|
188
|
+
- `index.ts` — Pi extension, registers `greedy_search` tool with streaming progress
|
|
189
|
+
- `search.mjs` — CLI runner, spawns extractors in parallel, emits `PROGRESS:` events to stderr
|
|
190
|
+
- `launch.mjs` — launches dedicated Chrome on port 9222 with isolated profile
|
|
191
|
+
- `extractors/` — per-engine CDP scrapers (Perplexity, Bing Copilot, Google AI, Gemini)
|
|
192
|
+
- `cdp.mjs` — Chrome DevTools Protocol CLI for browser automation
|
|
193
|
+
- `skills/greedy-search/SKILL.md` — skill file that guides the model on when/how to use greedy_search
|
|
194
|
+
|
|
195
|
+
## Changelog
|
|
196
|
+
|
|
197
|
+
### v1.4.2 (2026-03-25)
|
|
198
|
+
|
|
199
|
+
- **Fresh isolated tabs** — each search now always creates a new `about:blank` tab via `Target.createTarget` and refreshes the CDP page cache immediately after, preventing SPA navigation failures and stale DOM state from prior queries
|
|
200
|
+
- **Regex-based citation extraction** — all extractors (Perplexity, Bing, Gemini) now parse sources from clipboard Markdown links (`[title](url)`) instead of DOM selectors that break on UI updates
|
|
201
|
+
- **Relaxed verification detection** — `consent.mjs` now uses broad keyword matching (`includes('verify')`, `includes('human')`) instead of anchored regexes, correctly catching button text variants like "Verify you are human" across Cloudflare, Microsoft, and generic modals
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
### v1.4.1
|
|
206
|
+
|
|
207
|
+
- **Fixed parallel synthesis** — multiple `greedy_search` calls with `synthesize: true` now run safely in parallel. Each search creates a fresh Gemini tab that gets cleaned up after synthesis, preventing tab conflicts and "Uncaught" errors.
|
|
208
|
+
|
|
209
|
+
### v1.4.0
|
|
210
|
+
|
|
211
|
+
- **Grounded synthesis** — Gemini now receives a normalized source registry with stable source IDs, agreement summaries, caveats, and cited claims
|
|
212
|
+
- **Real deep research** — top sources are fetched before synthesis so deep research answers are grounded in fetched evidence, not just engine summaries
|
|
213
|
+
- **Richer source metadata** — source output now includes canonical URLs, domains, source types, per-engine attribution, and confidence metadata
|
|
214
|
+
- **Cleaner tab lifecycle** — temporary Perplexity, Bing, and Google tabs are closed after each fan-out search, and synthesis finishes on the Gemini tab
|
|
215
|
+
- **Isolated Chrome targeting** — GreedySearch now refuses to fall back to your normal Chrome session, preventing stray remote-debugging prompts
|
|
216
|
+
|
|
217
|
+
## License
|
|
218
|
+
|
|
219
|
+
MIT
|
package/cdp.mjs
CHANGED
|
@@ -37,22 +37,22 @@ function getDevToolsActivePortPath() {
|
|
|
37
37
|
return join(homedir(), '.config', 'google-chrome', 'DevToolsActivePort');
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
function getWsUrl() {
|
|
41
|
-
// If CDP_PROFILE_DIR is set (by search.mjs), prefer that profile's port file
|
|
42
|
-
// so GreedySearch targets its own Chrome, not the user's main session.
|
|
43
|
-
const profileDir = process.env.CDP_PROFILE_DIR;
|
|
44
|
-
if (profileDir) {
|
|
45
|
-
const p = profileDir.replace(/\\/g, '/') + '/DevToolsActivePort';
|
|
46
|
-
if (existsSync(p)) {
|
|
47
|
-
const lines = readFileSync(p, 'utf8').trim().split('\n');
|
|
48
|
-
return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
|
|
49
|
-
}
|
|
50
|
-
throw new Error(`GreedySearch DevToolsActivePort not found at ${p}. Refusing to fall back to the main Chrome session.`);
|
|
51
|
-
}
|
|
52
|
-
const portFile = getDevToolsActivePortPath();
|
|
53
|
-
const lines = readFileSync(portFile, 'utf8').trim().split('\n');
|
|
54
|
-
return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
|
|
55
|
-
}
|
|
40
|
+
function getWsUrl() {
|
|
41
|
+
// If CDP_PROFILE_DIR is set (by search.mjs), prefer that profile's port file
|
|
42
|
+
// so GreedySearch targets its own Chrome, not the user's main session.
|
|
43
|
+
const profileDir = process.env.CDP_PROFILE_DIR;
|
|
44
|
+
if (profileDir) {
|
|
45
|
+
const p = profileDir.replace(/\\/g, '/') + '/DevToolsActivePort';
|
|
46
|
+
if (existsSync(p)) {
|
|
47
|
+
const lines = readFileSync(p, 'utf8').trim().split('\n');
|
|
48
|
+
return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
|
|
49
|
+
}
|
|
50
|
+
throw new Error(`GreedySearch DevToolsActivePort not found at ${p}. Refusing to fall back to the main Chrome session.`);
|
|
51
|
+
}
|
|
52
|
+
const portFile = getDevToolsActivePortPath();
|
|
53
|
+
const lines = readFileSync(portFile, 'utf8').trim().split('\n');
|
|
54
|
+
return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
|
|
55
|
+
}
|
|
56
56
|
|
|
57
57
|
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
|
58
58
|
|
|
@@ -45,17 +45,14 @@ function cdp(args, timeoutMs = 30000) {
|
|
|
45
45
|
|
|
46
46
|
async function getOrOpenTab(tabPrefix) {
|
|
47
47
|
if (tabPrefix) return tabPrefix;
|
|
48
|
-
|
|
49
|
-
if (existsSync(PAGES_CACHE)) {
|
|
50
|
-
const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
|
|
51
|
-
const existing = pages.find(p => p.url.includes('copilot.microsoft.com'));
|
|
52
|
-
if (existing) return existing.targetId.slice(0, 8);
|
|
53
|
-
}
|
|
54
|
-
|
|
48
|
+
// Always open a fresh tab to avoid SPA navigation issues
|
|
55
49
|
const list = await cdp(['list']);
|
|
56
|
-
const
|
|
57
|
-
if (!
|
|
58
|
-
|
|
50
|
+
const anchor = list.split('\n')[0]?.slice(0, 8);
|
|
51
|
+
if (!anchor) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
|
|
52
|
+
const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
|
|
53
|
+
const { targetId } = JSON.parse(raw);
|
|
54
|
+
await cdp(['list']); // refresh cache
|
|
55
|
+
return targetId.slice(0, 8);
|
|
59
56
|
}
|
|
60
57
|
|
|
61
58
|
async function injectClipboardInterceptor(tab) {
|
|
@@ -101,17 +98,11 @@ async function extractAnswer(tab) {
|
|
|
101
98
|
const answer = await cdp(['eval', tab, `window.__bingClipboard || ''`]);
|
|
102
99
|
if (!answer) throw new Error('Clipboard interceptor returned empty text');
|
|
103
100
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
110
|
-
.slice(0, 10);
|
|
111
|
-
return JSON.stringify(sources);
|
|
112
|
-
})()
|
|
113
|
-
`]).catch(() => '[]');
|
|
114
|
-
const sources = JSON.parse(raw);
|
|
101
|
+
// Regex parse Markdown links from clipboard — robust against DOM changes
|
|
102
|
+
const sources = Array.from(answer.matchAll(/\[([^\]]+)\]\((https?:\/\/[^\s\)]+)\)/g))
|
|
103
|
+
.map(m => ({ title: m[1], url: m[2] }))
|
|
104
|
+
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
105
|
+
.slice(0, 10);
|
|
115
106
|
|
|
116
107
|
return { answer: answer.trim(), sources };
|
|
117
108
|
}
|
package/extractors/consent.mjs
CHANGED
|
@@ -102,8 +102,12 @@ const VERIFY_DETECT_JS = `
|
|
|
102
102
|
|
|
103
103
|
// --- Generic verification buttons (catch-all) ---
|
|
104
104
|
var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
|
|
105
|
-
var verify = btns.find(b =>
|
|
106
|
-
|
|
105
|
+
var verify = btns.find(b => {
|
|
106
|
+
var t = (b.innerText?.trim() || b.value || '').toLowerCase();
|
|
107
|
+
return (t.includes('verify') || t.includes('human') || t.includes('robot') || t.includes('continue') || t.includes('proceed')) &&
|
|
108
|
+
!t.includes('verified') && !document.querySelector('iframe[src*="recaptcha"]');
|
|
109
|
+
});
|
|
110
|
+
if (verify) {
|
|
107
111
|
verify.click();
|
|
108
112
|
return 'clicked-verify:' + (verify.innerText?.trim() || verify.value);
|
|
109
113
|
}
|
|
@@ -131,7 +135,10 @@ const VERIFY_RETRY_JS = `
|
|
|
131
135
|
|
|
132
136
|
// Try clicking any verify/continue button again
|
|
133
137
|
var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
|
|
134
|
-
var btn = btns.find(b =>
|
|
138
|
+
var btn = btns.find(b => {
|
|
139
|
+
var t = (b.innerText?.trim() || b.value || '').toLowerCase();
|
|
140
|
+
return t.includes('verify') || t.includes('human') || t.includes('robot') || t.includes('continue') || t.includes('next') || t.includes('submit');
|
|
141
|
+
});
|
|
135
142
|
if (btn) { btn.click(); return 'clicked:' + (btn.innerText?.trim() || btn.value); }
|
|
136
143
|
|
|
137
144
|
// Try Turnstile checkbox
|
package/extractors/gemini.mjs
CHANGED
|
@@ -44,13 +44,14 @@ function cdp(args, timeoutMs = 30000) {
|
|
|
44
44
|
|
|
45
45
|
async function getOrOpenTab(tabPrefix) {
|
|
46
46
|
if (tabPrefix) return tabPrefix;
|
|
47
|
-
|
|
48
|
-
const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
|
|
49
|
-
const existing = pages.find(p => p.url.includes('gemini.google.com'));
|
|
50
|
-
if (existing) return existing.targetId.slice(0, 8);
|
|
51
|
-
}
|
|
47
|
+
// Always open a fresh tab to avoid SPA navigation issues
|
|
52
48
|
const list = await cdp(['list']);
|
|
53
|
-
|
|
49
|
+
const anchor = list.split('\n')[0]?.slice(0, 8);
|
|
50
|
+
if (!anchor) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
|
|
51
|
+
const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
|
|
52
|
+
const { targetId } = JSON.parse(raw);
|
|
53
|
+
await cdp(['list']); // refresh cache
|
|
54
|
+
return targetId.slice(0, 8);
|
|
54
55
|
}
|
|
55
56
|
|
|
56
57
|
async function typeIntoGemini(tab, text) {
|
|
@@ -111,53 +112,11 @@ async function extractAnswer(tab) {
|
|
|
111
112
|
const answer = await cdp(['eval', tab, `window.__geminiClipboard || ''`]);
|
|
112
113
|
if (!answer) throw new Error('Clipboard interceptor returned empty text');
|
|
113
114
|
|
|
114
|
-
//
|
|
115
|
-
const
|
|
116
|
-
|
|
117
|
-
(
|
|
118
|
-
|
|
119
|
-
if (!btn) btn = Array.from(document.querySelectorAll('button')).find(b => b.innerText?.trim() === 'Sources');
|
|
120
|
-
if (btn) { btn.click(); return 'clicked'; }
|
|
121
|
-
return 'not-found';
|
|
122
|
-
})()
|
|
123
|
-
`]).catch(() => 'not-found');
|
|
124
|
-
|
|
125
|
-
// Wait for the sources sidebar to populate
|
|
126
|
-
await new Promise(r => setTimeout(r, 1500));
|
|
127
|
-
|
|
128
|
-
// Extract sources from the sidebar panel (has proper URLs + titles)
|
|
129
|
-
const raw = await cdp(['eval', tab, `
|
|
130
|
-
(function() {
|
|
131
|
-
// Find the Sources sidebar container by heading
|
|
132
|
-
var headings = Array.from(document.querySelectorAll('h1, h2, h3, [class*="header"]'));
|
|
133
|
-
var sourceHeading = headings.find(h => h.innerText?.trim() === 'Sources');
|
|
134
|
-
if (sourceHeading) {
|
|
135
|
-
var container = sourceHeading.closest('.container') || sourceHeading.parentElement;
|
|
136
|
-
var links = Array.from(container.querySelectorAll('a[href^="http"]'))
|
|
137
|
-
.map(a => ({ url: a.href.split('#')[0], title: a.innerText?.trim().split('\\n')[0] || '' }))
|
|
138
|
-
.filter(s => s.url && ${sourceExcludeFilter})
|
|
139
|
-
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
140
|
-
.slice(0, 8);
|
|
141
|
-
return JSON.stringify(links);
|
|
142
|
-
}
|
|
143
|
-
// Fallback: inline source cards with aria-labels
|
|
144
|
-
var cards = Array.from(document.querySelectorAll('${S.citationButtonPattern}'));
|
|
145
|
-
if (cards.length) {
|
|
146
|
-
return JSON.stringify(cards.map(b => {
|
|
147
|
-
var label = b.getAttribute('aria-label') || '';
|
|
148
|
-
var name = label.match(${S.citationNameRegex})?.[1] || label;
|
|
149
|
-
return { url: '', title: name };
|
|
150
|
-
}));
|
|
151
|
-
}
|
|
152
|
-
// Last resort: page-wide links (may include footer junk)
|
|
153
|
-
return JSON.stringify(Array.from(document.querySelectorAll('a[href^="http"]'))
|
|
154
|
-
.map(a => ({ url: a.href.split('#')[0], title: a.innerText?.trim().split('\\n')[0] || '' }))
|
|
155
|
-
.filter(s => s.url && !s.url.includes('gemini.google') && !s.url.includes('gstatic') && !s.url.includes('google.com/search'))
|
|
156
|
-
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
157
|
-
.slice(0, 8));
|
|
158
|
-
})()
|
|
159
|
-
`]).catch(() => '[]');
|
|
160
|
-
const sources = JSON.parse(raw);
|
|
115
|
+
// Regex parse Markdown links from clipboard — robust against DOM changes
|
|
116
|
+
const sources = Array.from(answer.matchAll(/\[([^\]]+)\]\((https?:\/\/[^\s\)]+)\)/g))
|
|
117
|
+
.map(m => ({ title: m[1], url: m[2] }))
|
|
118
|
+
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
119
|
+
.slice(0, 10);
|
|
161
120
|
|
|
162
121
|
return { answer: answer.trim(), sources };
|
|
163
122
|
}
|