@apmantza/greedysearch-pi 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +11 -1
- package/bin/launch-visible.mjs +65 -0
- package/bin/launch.mjs +442 -417
- package/bin/search.mjs +757 -679
- package/extractors/bing-copilot.mjs +490 -374
- package/extractors/common.mjs +703 -596
- package/extractors/consent.mjs +421 -388
- package/extractors/selectors.mjs +55 -54
- package/index.ts +176 -177
- package/package.json +8 -3
- package/skills/greedy-search/skill.md +5 -19
- package/src/fetcher.mjs +666 -652
- package/src/formatters/synthesis.ts +1 -5
- package/src/search/output.mjs +23 -1
- package/src/search/research.mjs +1581 -0
- package/src/search/sources.mjs +488 -466
- package/src/search/synthesis-runner.mjs +52 -46
- package/src/tools/greedy-search-handler.ts +298 -124
- package/test.mjs +971 -534
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,52 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
### Fixed
|
|
8
|
+
|
|
9
|
+
### Changed
|
|
10
|
+
|
|
11
|
+
### Removed
|
|
12
|
+
|
|
13
|
+
## [1.9.2] — 2026-05-25
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- **Iterative research mode** (`bin/search.mjs`, `src/search/research.mjs`) — Added `--research` / `--depth research` and `greedy_search({ depth: "research" })`. The new mode plans focused follow-up queries, runs fast multi-engine searches, fetches and deduplicates sources, extracts compact learnings/gaps with Gemini, and writes a final cited report. Optional knobs: `breadth` (1-5), `iterations` (1-3), and `maxSources` (3-12). Research mode now fills under-planned breadth with deterministic fallback query angles so `breadth: 3` actually fans out even when Gemini is conservative.
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
|
|
21
|
+
- **Pi update dependency install is leaner** (`package.json`, `package-lock.json`) — Moved the direct `@sinclair/typebox` import into runtime dependencies and marked the Pi host peer as optional so npm does not auto-install a full nested `@earendil-works/pi-coding-agent` tree during git-package updates. This keeps `pi update` focused on GreedySearch runtime deps (`jsdom`, `@mozilla/readability`, `turndown`) and avoids partial installs that leave `jsdom/package.json` missing.
|
|
22
|
+
|
|
23
|
+
- **Pi TUI peer import no longer required at load time** (`src/tools/greedy-search-handler.ts`) — Replaced the direct `@earendil-works/pi-tui` runtime import with a tiny local `Text` component implementation so Pi/jiti extension import works even when optional TUI peer packages are not installed locally.
|
|
24
|
+
|
|
25
|
+
- **Research unit tests no longer require fetcher dependencies at import time** (`src/search/research.mjs`) — Research mode now lazy-loads source fetching/file-output helpers only during live research execution, keeping pure planning/normalization unit tests runnable in CI's tarball install simulation without local `node_modules`.
|
|
26
|
+
|
|
27
|
+
- **Research query sanitizer avoids ReDoS hotspot** (`src/search/research.mjs`) — Replaced markdown-link cleanup regexes with bounded string scanning and manual whitespace collapse, resolving the SonarCloud super-linear regex hotspot while preserving `site:[label](url)` query cleanup.
|
|
28
|
+
|
|
29
|
+
- **Research source quality cleanup** (`src/search/sources.mjs`, `src/search/research.mjs`) — Social/login-wall domains (`facebook.com`, `linkedin.com`, `x.com`, etc.) now receive a strong ranking penalty unless the query explicitly targets that platform. Research source dedupe now uses the same composite score as normal source ranking, per-round learning extraction errors are recorded in `_research.rounds[].learningError`, child-search stderr forwarding is filtered so noisy page CSS/HTML cannot flood research logs, and markdown links in Gemini-generated follow-up queries are sanitized before search.
|
|
30
|
+
|
|
31
|
+
- **Bing headless stealth hardening** (`extractors/common.mjs`, `bin/launch.mjs`) — Adopted low-risk ideas from Obscura's stealth model: `navigator.webdriver` now resolves to `undefined` instead of `false`, navigator plugins/mimeTypes/mediaDevices/connection/pdfViewer/platform/vendor are made more Chrome-like, patched functions stringify as `[native code]`, canvas noise is stable per page instead of random on each call, and Chrome launches with `--lang=en-US` plus `--force-color-profile=srgb`. Live Bing headless smoke passed after the change without visible recovery.
|
|
32
|
+
|
|
33
|
+
- **Research/Bing false recovery fixed** (`bin/search.mjs`, `extractors/bing-copilot.mjs`, `extractors/consent.mjs`) — Research child searches no longer mark Bing/Perplexity failed before visible recovery has a final status, Bing fast-mode keeps a bounded 40s parent budget, and Bing's short-mode stream wait caps at 25s so research can extract rendered partial answers before timing out. Bing verification detection now reuses the DOM-based `handleVerification` detector instead of scanning accessibility text for generic words like “Cloudflare” or “challenge”, preventing false visible-recovery trips when the user query/answer is about anti-bot systems. Added locale-agnostic DOM/accessibility fallback extraction that picks the assistant article without relying solely on English “Copilot said” labels.
|
|
34
|
+
|
|
35
|
+
## [1.9.1] — 2026-05-23
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
|
|
39
|
+
- **Visible Chrome launches minimized** (`bin/launch-visible.mjs`) — After Chrome's CDP endpoint becomes ready, `minimizeViaCDP` sends `Browser.setWindowBounds { windowState: "minimized" }` via the browser-level WebSocket. Chrome lands in the taskbar immediately instead of stealing focus from the user's active window. Closes [#20](https://github.com/apmantza/GreedySearch-pi/issues/20).
|
|
40
|
+
|
|
41
|
+
- **Recovery path always returns to headless** (`bin/search.mjs`) — After a visible-mode retry (triggered by Cloudflare blocking headless), the pipeline now unconditionally kills visible Chrome and relaunches headless before running Gemini synthesis. Previously the switch-back only happened when zero engines were recovered (`recovered === 0`), so a partial recovery left visible Chrome alive and caused synthesis to open the Gemini tab in the visible window.
|
|
42
|
+
|
|
43
|
+
- **ReDoS hotspots fixed** (`bin/launch.mjs`, `extractors/selectors.mjs`, `src/fetcher.mjs`, `src/search/sources.mjs`) — Four SonarCloud `javasecurity:S5852` hotspots resolved: (1) Chrome version directory regex bounded (`\d+` → `\d{1,10}` ×4 groups); (2) Perplexity citation name regex bounded (`\s+` → `\s{1,20}`, `[^.]+` → `[^.]{1,200}`); (3) seven suspicious-content regex patterns in `checkContentQuality` replaced with `String.includes` checks (faster and immune to backtracking on adversarial input); (4) trailing-slash removal regex bounded (`\/+$` → `\/{1,10}$`). Follow-up: string checks lowercased via a single `markdown.toLowerCase()` call to restore the case-insensitive matching the original regexes provided.
|
|
44
|
+
|
|
45
|
+
- **Collapsed tool rendering: consensus label fixed** (`src/tools/greedy-search-handler.ts`) — The collapsed summary was reading `synthesis.consensus` which does not exist in the schema; the field is `synthesis.agreement.level`. Collapsed view now correctly shows e.g. `→ Synthesized · 5 sources · high`.
|
|
46
|
+
|
|
47
|
+
- **`minimizeViaCDP` guard inverted in `launch.mjs`** (`bin/launch.mjs`) — The early-return guard was `if (isVisible()) return` which caused the function to exit immediately in the only case it was ever called (visible Chrome launch via `GREEDY_SEARCH_VISIBLE=1`). Changed to `if (isHeadless()) return`. Also removed the unnecessary 1s sleep (Chrome is already confirmed ready via `writePortFile()` before this is called) and applied the SonarCloud S8480 fix (`wsPath` extracted from `webSocketDebuggerUrl`, WebSocket URL reconstructed as `ws://localhost:${PORT}${wsPath}`).
|
|
48
|
+
|
|
49
|
+
- **Gemini tab no longer steals focus during synthesis** (`bin/search.mjs`) — Removed the `activateTab` call on the pre-navigated Gemini tab. `Target.activateTarget` was restoring the minimized Chrome window mid-search; CDP synthesis operates on the target ID directly and has no need for the tab to be Chrome's active tab.
|
|
50
|
+
|
|
5
51
|
## [1.9.0] — 2026-05-22
|
|
6
52
|
|
|
7
53
|
### Added
|
package/README.md
CHANGED
|
@@ -37,6 +37,12 @@ greedy_search({
|
|
|
37
37
|
engine: "all",
|
|
38
38
|
depth: "deep",
|
|
39
39
|
});
|
|
40
|
+
greedy_search({
|
|
41
|
+
query: "Evaluate browser automation options for AI agents",
|
|
42
|
+
depth: "research",
|
|
43
|
+
breadth: 3,
|
|
44
|
+
iterations: 2,
|
|
45
|
+
});
|
|
40
46
|
// Headless is the default — no window. To force visible Chrome:
|
|
41
47
|
greedy_search({ query: "Bing captcha setup", engine: "bing", visible: true });
|
|
42
48
|
```
|
|
@@ -45,7 +51,10 @@ greedy_search({ query: "Bing captcha setup", engine: "bing", visible: true });
|
|
|
45
51
|
|
|
46
52
|
- `query` (required)
|
|
47
53
|
- `engine`: `all` (default), `perplexity`, `bing`, `google`, `gemini`
|
|
48
|
-
- `depth`: `standard` (default), `fast`, `deep`
|
|
54
|
+
- `depth`: `standard` (default), `fast`, `deep`, `research`
|
|
55
|
+
- `breadth`: research mode query breadth, 1-5 (default 3)
|
|
56
|
+
- `iterations`: research mode rounds, 1-3 (default 2)
|
|
57
|
+
- `maxSources`: research mode fetched source cap, 3-12
|
|
49
58
|
- `fullAnswer`: return full single-engine output instead of preview
|
|
50
59
|
- `headless`: set to `false` to show Chrome window (default: `true`)
|
|
51
60
|
- `visible` / `alwaysVisible`: set to `true` to always use visible Chrome for this search
|
|
@@ -65,6 +74,7 @@ greedy_search({ query: "Bing captcha setup", engine: "bing", visible: true });
|
|
|
65
74
|
- `fast` - quickest, no synthesis/source fetching
|
|
66
75
|
- `standard` - balanced default for `engine: "all"` (synthesis + fetched sources)
|
|
67
76
|
- `deep` - strongest grounding and confidence metadata
|
|
77
|
+
- `research` - slowest; iterative query planning, fast multi-engine searches, source fetching, learning extraction, and a final cited report
|
|
68
78
|
|
|
69
79
|
## Runtime commands
|
|
70
80
|
|
package/bin/launch-visible.mjs
CHANGED
|
@@ -106,6 +106,69 @@ function httpGet(url, timeoutMs = 1000) {
|
|
|
106
106
|
});
|
|
107
107
|
}
|
|
108
108
|
|
|
109
|
+
/**
 * Best-effort: minimize the Chrome window that owns the first "page" target,
 * using the browser-level DevTools (CDP) WebSocket.
 *
 * Flow: read `/json/version` and `/json/list` from the DevTools HTTP endpoint,
 * send `Browser.getWindowForTarget` for the first page target, then send
 * `Browser.setWindowBounds` with `windowState: "minimized"` for that window.
 *
 * Any failure (HTTP error, malformed JSON, unexpected debugger URL, socket
 * error, CDP error reply, or no reply within 5 s) ends the attempt quietly.
 *
 * @param {number|string} port - DevTools remote-debugging port on localhost.
 * @returns {Promise<void>} Always resolves; never rejects.
 */
async function minimizeViaCDP(port) {
  try {
    // The two probes are independent, so fetch them concurrently.
    const [version, targets] = await Promise.all([
      httpGet(`http://localhost:${port}/json/version`).then((r) =>
        JSON.parse(r.body),
      ),
      httpGet(`http://localhost:${port}/json/list`).then((r) =>
        JSON.parse(r.body),
      ),
    ]);
    const targetId = targets.find((t) => t.type === "page")?.id;
    if (!targetId) return;

    // Validate browser WebSocket URL to prevent SSRF (SonarCloud javasecurity:S5335).
    // Only the path of the reported URL is used; the connection itself is
    // pinned to localhost and the caller-supplied port below. The reported
    // host merely has to be a loopback name, so both accepted spellings work.
    const wsUrlStr = version.webSocketDebuggerUrl;
    if (typeof wsUrlStr !== "string") return;
    const wsUrl = new URL(wsUrlStr);
    if (wsUrl.hostname !== "localhost" && wsUrl.hostname !== "127.0.0.1") {
      return;
    }

    const ws = new WebSocket(`ws://localhost:${port}${wsUrl.pathname}`);
    await new Promise((resolve) => {
      // Single exit path: stop the watchdog, close the socket, settle once.
      // Safe to call repeatedly (close/resolve/clearTimeout are idempotent).
      const finish = () => {
        clearTimeout(timer);
        try {
          ws.close();
        } catch {
          // socket already closed or never opened — nothing to clean up
        }
        resolve();
      };
      // Watchdog so a silent browser cannot block the launcher indefinitely.
      const timer = setTimeout(finish, 5000);

      ws.onopen = () =>
        ws.send(
          JSON.stringify({
            id: 1,
            method: "Browser.getWindowForTarget",
            params: { targetId },
          }),
        );
      ws.onmessage = (ev) => {
        let msg;
        try {
          msg = JSON.parse(ev.data);
        } catch {
          // Not a JSON frame; throwing here would escape the outer try/catch.
          return;
        }
        if (msg.id === 1) {
          const windowId = msg.result?.windowId;
          if (!windowId) {
            // Error reply or no window for this target: give up immediately
            // instead of waiting for the watchdog.
            finish();
            return;
          }
          ws.send(
            JSON.stringify({
              id: 2,
              method: "Browser.setWindowBounds",
              params: {
                windowId,
                bounds: { windowState: "minimized" },
              },
            }),
          );
        } else if (msg.id === 2) {
          finish();
        }
      };
      ws.onerror = finish;
      // The browser may drop the connection before replying.
      ws.onclose = finish;
    });
  } catch {
    // best-effort — Chrome is still usable if minimize fails
  }
}
|
|
171
|
+
|
|
109
172
|
async function waitForPort(timeoutMs = 15000) {
|
|
110
173
|
const deadline = Date.now() + timeoutMs;
|
|
111
174
|
while (Date.now() < deadline) {
|
|
@@ -226,6 +289,8 @@ async function main() {
|
|
|
226
289
|
process.exit(1);
|
|
227
290
|
}
|
|
228
291
|
|
|
292
|
+
await minimizeViaCDP(PORT);
|
|
293
|
+
|
|
229
294
|
console.log("Visible Chrome ready on port 9222.");
|
|
230
295
|
console.log("Keep this terminal open to keep Chrome alive.");
|
|
231
296
|
}
|