@apmantza/greedysearch-pi 2.0.0 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@apmantza/greedysearch-pi",
3
- "version": "2.0.0",
3
+ "version": "2.1.3",
4
4
  "description": "Headless multi-engine AI search (Perplexity, Google AI, ChatGPT, Gemini) via browser automation. NO API KEYS needed. Grounded all-engine search fetches sources by default, with optional configurable synthesis and deep research.",
5
5
  "type": "module",
6
6
  "keywords": [
@@ -48,13 +48,13 @@
48
48
  },
49
49
  "dependencies": {
50
50
  "@mozilla/readability": "^0.6.0",
51
- "@sinclair/typebox": "^0.34.48",
52
- "jsdom": "^24.0.0",
51
+ "@sinclair/typebox": "^0.34.49",
52
+ "jsdom": "^29.1.1",
53
53
  "pdf-parse": "^2.4.5",
54
54
  "turndown": "^7.1.2"
55
55
  },
56
56
  "peerDependencies": {
57
- "@earendil-works/pi-coding-agent": "*"
57
+ "@earendil-works/pi-coding-agent": "^0.79.0"
58
58
  },
59
59
  "peerDependenciesMeta": {
60
60
  "@earendil-works/pi-coding-agent": {
@@ -1,18 +1,20 @@
1
- ---
2
- name: greedy-search
3
- description: Web/search plus opt-in research via Perplexity, Google AI, ChatGPT, Gemini, Semantic Scholar, and Logically. Grounded all-engine search fetches sources by default; optional configurable synthesis; deep research as separate workflow. Configurable via ~/.pi/greedyconfig. Bing Copilot available for signed-in users. Current docs, recent changes, dependency choices. NOT codebase search.
4
- ---
5
-
6
- `greedy_search({ query, engine: "all"|"perplexity"|"google"|"chatgpt"|"gemini"|"semantic-scholar"|"logically"|"bing", synthesize?: bool, synthesizer?: "gemini"|"chatgpt", depth?: "research", breadth: 1-5, iterations: 1-3, maxSources: 3-12, researchOutDir?: string, writeResearchBundle?: bool, visible: bool })`
7
-
8
- **Modes:** individual engine search · grounded `engine:"all"` search with fetched sources · optional `synthesize:true` using the configured synthesizer over all-engine results · `depth:"research"` for the iterative deep-research workflow.
9
-
10
- **Config:** `~/.pi/greedyconfig` supports `{ "engines": ["perplexity", "google", "chatgpt", "gemini", "semantic-scholar", "logically"], "synthesizer": "gemini" }`. Gemini is a normal search engine; Semantic Scholar and Logically are opt-in research engines. Any configured engine can participate in `engine:"all"`; deep research child searches reuse the same configured `engines` list and stdin-safe query passing. Normal all-search synthesis remains controlled separately by `synthesizer`; research planning/final synthesis uses Gemini.
11
-
12
- **Compatibility:** legacy `depth:"fast"|"standard"|"deep"` is still accepted. `fast` skips source fetching; `standard`/`deep` alias `synthesize:true`. Prefer `synthesize:true`, optional `synthesizer`, and `depth:"research"` going forward.
13
-
14
- **Research output:** `depth:"research"` writes a dataroom-style bundle by default under `.pi/greedysearch-research/<timestamp>_<query>/` with `STATUS.md`, `OUTLINE.md`, `reports/SUMMARY.md`, `reports/CLAIMS.md`, `reports/GAPS.md`, `sources/`, and `data/manifest.json`. Pass `researchOutDir` to choose the directory or `writeResearchBundle:false` to disable disk output.
15
-
16
- **Auto-recovery:** Headless default. Bing/Perplexity auto-retry visible on CF block. Manual CAPTCHA visible stays open; solve then rerun.
17
-
18
- **CDP safety:** Use `bin/cdp-greedy.mjs` only. Never raw `bin/cdp.mjs`.
1
+ ---
2
+ name: greedy-search
3
+ description: Web/search plus opt-in research via Perplexity, Google AI, ChatGPT, Gemini, Semantic Scholar, and Logically. Grounded all-engine search fetches sources by default; optional configurable synthesis; deep research as separate workflow. Configurable via ~/.pi/greedyconfig. Bing Copilot available for signed-in users. Current docs, recent changes, dependency choices. NOT codebase search.
4
+ ---
5
+
6
+ `greedy_search({ query, engine: "all"|"perplexity"|"google"|"chatgpt"|"gemini"|"semantic-scholar"|"logically"|"bing", synthesize?: bool, synthesizer?: "gemini"|"chatgpt", depth?: "research", breadth: 1-5, iterations: 1-3, maxSources: 3-12, researchOutDir?: string, writeResearchBundle?: bool, visible: bool })`
7
+
8
+ **Modes:** individual engine search · grounded `engine:"all"` search with fetched sources · optional `synthesize:true` using the configured synthesizer over all-engine results · `depth:"research"` for the iterative deep-research workflow.
9
+
10
+ **Config:** `~/.pi/greedyconfig` supports `{ "engines": ["perplexity", "google", "chatgpt", "gemini"], "synthesizer": "gemini" }` by default. `semantic-scholar` and `logically` are opt-in academic/research engines — add them to `engines` only when you want academic paper discovery or research-assistant workflows in the normal all-search fan-out. Without explicit opt-in, `engine:"all"` excludes them because their results are noisy for casual web search; they shine in `depth:"research"` mode. Any configured engine can participate in `engine:"all"`; deep research child searches reuse the same configured `engines` list and stdin-safe query passing. Normal all-search synthesis remains controlled separately by `synthesizer`; research planning/final synthesis uses Gemini.
11
+
12
+ **Compatibility:** legacy `depth:"fast"|"standard"|"deep"` is still accepted. `fast` skips source fetching; `standard`/`deep` alias `synthesize:true`. Prefer `synthesize:true`, optional `synthesizer`, and `depth:"research"` going forward.
13
+
14
+ **Research output:** `depth:"research"` writes a dataroom-style bundle by default under `.pi/greedysearch-research/<timestamp>_<query>/` with `STATUS.md`, `OUTLINE.md`, `reports/SUMMARY.md`, `reports/CLAIMS.md`, `reports/GAPS.md`, `sources/`, and `data/manifest.json`. Pass `researchOutDir` to choose the directory or `writeResearchBundle:false` to disable disk output.
15
+
16
+ **Scale-aware research:** When `breadth` and `iterations` are not explicitly set, the classifier auto-detects query complexity. Simple queries ("what is X") use a fast single-pass path (~70% faster). Moderate queries get tighter breadth/iterations. Complex queries use the full loop. Explicit `breadth`/`iterations` always override the classifier.
17
+
18
+ **Auto-recovery:** Headless default. Bing/Perplexity auto-retry visible on CF block. Manual CAPTCHA → visible stays open; solve then rerun.
19
+
20
+ **CDP safety:** Use `bin/cdp-greedy.mjs` only. Never raw `bin/cdp.mjs`.
package/src/fetcher.mjs CHANGED
@@ -195,6 +195,21 @@ export async function fetchSourceHttp(url, options = {}) {
195
195
  const finalUrl = response.url;
196
196
  const lastModified = response.headers.get("last-modified") || "";
197
197
 
198
+ // SSRF defense: re-validate the post-redirect finalUrl. A malicious
199
+ // server could redirect our fetch to a private IP, bypassing the
200
+ // initial isPrivateUrl() check on the original URL.
201
+ const finalPrivateCheck = isPrivateUrl(finalUrl);
202
+ if (finalPrivateCheck.blocked) {
203
+ return {
204
+ ok: false,
205
+ url,
206
+ finalUrl,
207
+ status: response.status,
208
+ error: `Blocked: ${finalPrivateCheck.reason}`,
209
+ needsBrowser: false,
210
+ };
211
+ }
212
+
198
213
  // Handle raw text/plain from GitHub (raw file content)
199
214
  let isRawGitHub = false;
200
215
  try {
@@ -6,6 +6,28 @@
6
6
  import { formatEngineName } from "../utils/helpers.js";
7
7
  import { renderSynthesis } from "./synthesis.js";
8
8
 
9
+ /**
10
+ * Maximum line length for any text passed to the TUI. Lines longer than
11
+ * this are truncated with an ellipsis. The TUI's Text.render wraps at the
12
+ * terminal width, but it crashes with
13
+ * "Rendered line N exceeds terminal width (W > W-4)"
14
+ * when a single line is wider than its own internal render width. Long
15
+ * lines (e.g. a markdown table row inside a chatgpt synthesis answer) that
16
+ * don't have a \n break would otherwise produce this crash. The safety
17
+ * net below trims individual lines before they reach the TUI.
18
+ */
19
+ const MAX_LINE_WIDTH = 800;
20
+ function _truncateLongLines(text: string): string {
21
+ return text
22
+ .split("\n")
23
+ .map((line) =>
24
+ line.length > MAX_LINE_WIDTH
25
+ ? line.slice(0, MAX_LINE_WIDTH - 1) + "…"
26
+ : line,
27
+ )
28
+ .join("\n");
29
+ }
30
+
9
31
  /**
10
32
  * Format search results based on engine type
11
33
  */
@@ -16,10 +38,10 @@ export function formatResults(
16
38
  const lines: string[] = [];
17
39
 
18
40
  if (engine === "all") {
19
- return formatAllEnginesResult(data, lines);
41
+ return _truncateLongLines(formatAllEnginesResult(data, lines));
20
42
  }
21
43
 
22
- return formatSingleEngineResult(data, lines);
44
+ return _truncateLongLines(formatSingleEngineResult(data, lines));
23
45
  }
24
46
 
25
47
  /**
@@ -0,0 +1,205 @@
1
+ // src/search/challenge-detect.mjs — Detect when a Cloudflare/Turnstile/captcha
2
+ // challenge has been solved so the extractor can auto-resume.
3
+ //
4
+ // Polls page state (URL, title, DOM markers, cookie presence) instead of waiting
5
+ // for a hard timeout. Resolves once the engine-specific "challenge cleared"
6
+ // signal is observed, or resolves with { cleared: false, reason } if the
7
+ // polling budget is exhausted or the caller's abort signal fires.
8
+ //
9
+ // Usage:
10
+ // const { cleared } = await waitForChallengeCleared({ tab, engine: "chatgpt", timeoutMs: 300000 });
11
+ // if (!cleared) emit _needsHumanVerification; else re-run extractor.
12
+
13
+ import { cdp } from "../../extractors/common.mjs";
14
+
15
+ const DEFAULT_TIMEOUT_MS = Number.parseInt(
16
+ process.env.GREEDY_SEARCH_CHALLENGE_WAIT_MS || "300000",
17
+ 10,
18
+ ); // 5 minutes default
19
+ const POLL_INTERVAL_MS = 3000;
20
+
21
+ const ENGINE_SIGNALS = {
22
+ chatgpt: {
23
+ // After Cloudflare clearance, chatgpt.com shows the chat UI.
24
+ // Title changes from "Περιμένετε..." / "Just a moment..." → "ChatGPT"
25
+ // and div.ProseMirror renders.
26
+ name: "chatgpt",
27
+ isCleared: async (tab) => {
28
+ const probe = await cdp([
29
+ "eval",
30
+ tab,
31
+ `(() => {
32
+ const title = document.title;
33
+ const onChatGPT = location.hostname === "chatgpt.com";
34
+ const hasProseMirror = !!document.querySelector("div.ProseMirror");
35
+ const hasTurnstileInput =
36
+ !!document.querySelector("input[name=\\"cf-turnstile-response\\"]") ||
37
+ !!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]");
38
+ // Body innerText is empty while on the Turnstile page.
39
+ const bodyText = (document.body && document.body.innerText) || "";
40
+ return JSON.stringify({
41
+ title,
42
+ url: location.href,
43
+ hasProseMirror,
44
+ hasTurnstileInput,
45
+ bodyLen: bodyText.length,
46
+ onChatGPT,
47
+ });
48
+ })()`,
49
+ ]).catch(() => null);
50
+ if (!probe) return false;
51
+ let info;
52
+ try {
53
+ info = JSON.parse(probe);
54
+ } catch {
55
+ return false;
56
+ }
57
+ // Cleared when we're on chatgpt.com, the title is no longer the
58
+ // "Please wait…" placeholder, and either the chat UI rendered or
59
+ // the Turnstile marker is gone.
60
+ if (!info.onChatGPT) return false;
61
+ if (
62
+ info.title &&
63
+ /περιμένετε|please wait|just a moment|verifying|checking/i.test(
64
+ info.title,
65
+ )
66
+ ) {
67
+ return false;
68
+ }
69
+ if (info.hasTurnstileInput) return false;
70
+ // Either the chat UI appeared OR the body has real text (it is empty on the Turnstile page).
71
+ return info.hasProseMirror || info.bodyLen > 50;
72
+ },
73
+ },
74
+ bing: {
75
+ // Copilot shows "Verify you are human" challenge, then transitions to the chat UI.
76
+ // Cleared signals: URL on copilot.microsoft.com (no /challenge), textarea/input exists,
77
+ // or the Turnstile iframe is gone.
78
+ name: "bing",
79
+ isCleared: async (tab) => {
80
+ const probe = await cdp([
81
+ "eval",
82
+ tab,
83
+ `(() => {
84
+ const url = location.href;
85
+ const title = document.title;
86
+ const onCopilot = /copilot\\.microsoft\\.com/.test(location.hostname);
87
+ const onChallenge =
88
+ /challenge|turnstile|cdn-cgi\\/challenge/i.test(url) ||
89
+ /verify|human|robot/i.test(title);
90
+ const hasTextarea =
91
+ !!document.querySelector("textarea") ||
92
+ !!document.querySelector("div[contenteditable=\\"true\\"]");
93
+ const hasTurnstileInput =
94
+ !!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]") ||
95
+ !!document.querySelector("input[name=\\"cf-turnstile-response\\"]");
96
+ const bodyText = (document.body && document.body.innerText) || "";
97
+ return JSON.stringify({
98
+ url,
99
+ title,
100
+ onCopilot,
101
+ onChallenge,
102
+ hasTextarea,
103
+ hasTurnstileInput,
104
+ bodyLen: bodyText.length,
105
+ });
106
+ })()`,
107
+ ]).catch(() => null);
108
+ if (!probe) return false;
109
+ let info;
110
+ try {
111
+ info = JSON.parse(probe);
112
+ } catch {
113
+ return false;
114
+ }
115
+ if (!info.onCopilot) return false;
116
+ if (info.onChallenge) return false;
117
+ if (info.hasTurnstileInput) return false;
118
+ // Either chat input appeared OR we're past the challenge.
119
+ return info.hasTextarea || info.bodyLen > 50;
120
+ },
121
+ },
122
+ };
123
+
124
+ /**
125
+ * Generic fallback: probe document.cookie once for a Cloudflare cookie (cf_clearance or __cf_bm).
126
+ * Used when the engine doesn't have specific DOM signals defined.
127
+ */
128
+ async function pollForCfClearanceCookie(tab) {
129
+ const probe = await cdp([
130
+ "eval",
131
+ tab,
132
+ `(() => {
133
+ const cookies = document.cookie || "";
134
+ return JSON.stringify({
135
+ hasCfClearance: /(?:^|;\\s*)cf_clearance=/.test(cookies),
136
+ hasCfBm: /(?:^|;\\s*)__cf_bm=/.test(cookies),
137
+ cookiesLength: cookies.length,
138
+ });
139
+ })()`,
140
+ ]).catch(() => null);
141
+ if (!probe) return false;
142
+ try {
143
+ const info = JSON.parse(probe);
144
+ return info.hasCfClearance || info.hasCfBm;
145
+ } catch {
146
+ return false;
147
+ }
148
+ }
149
+
150
+ /**
151
+ * Poll page state until a Cloudflare/Turnstile challenge is cleared.
152
+ *
153
+ * Returns:
154
+ * { cleared: true, signal: "..." } — challenge cleared; safe to re-extract.
155
+ * { cleared: false, reason: "..." } — timeout or unrecoverable.
156
+ */
157
+ export async function waitForChallengeCleared({
158
+ tab,
159
+ engine,
160
+ timeoutMs = DEFAULT_TIMEOUT_MS,
161
+ intervalMs = POLL_INTERVAL_MS,
162
+ signal: externalSignal,
163
+ log = () => {},
164
+ }) {
165
+ const def = ENGINE_SIGNALS[engine];
166
+ const start = Date.now();
167
+ let lastState = null;
168
+
169
+ while (Date.now() - start < timeoutMs) {
170
+ if (externalSignal?.aborted) {
171
+ return { cleared: false, reason: "aborted" };
172
+ }
173
+ const elapsed = Math.floor((Date.now() - start) / 1000);
174
+
175
+ let cleared = false;
176
+ if (def) {
177
+ cleared = await def.isCleared(tab).catch(() => false);
178
+ } else {
179
+ cleared = await pollForCfClearanceCookie(tab).catch(() => false);
180
+ }
181
+ if (cleared) {
182
+ log(
183
+ `[greedysearch] ✅ ${engine} challenge cleared after ${elapsed}s — auto-resuming extraction.`,
184
+ );
185
+ return { cleared: true, signal: def ? "dom-marker" : "cookie" };
186
+ }
187
+
188
+ // Periodic heartbeat via the caller-supplied log() so the user knows we're still polling
189
+ if (elapsed > 0 && elapsed % 30 === 0 && lastState !== elapsed) {
190
+ lastState = elapsed;
191
+ log(
192
+ `[greedysearch] ⏳ Waiting for ${engine} challenge to clear (${elapsed}s/${Math.floor(timeoutMs / 1000)}s)...`,
193
+ );
194
+ }
195
+
196
+ await new Promise((r) => setTimeout(r, intervalMs));
197
+ }
198
+
199
+ return {
200
+ cleared: false,
201
+ reason: `Challenge not cleared within ${Math.floor(timeoutMs / 1000)}s`,
202
+ };
203
+ }
204
+
205
+ export const CHALLENGE_ENGINES = Object.keys(ENGINE_SIGNALS);
@@ -20,6 +20,11 @@ export const VISIBLE_RECOVERY_LOG = `${tmpdir().replaceAll("\\", "/")}/greedysea
20
20
  const CONFIG_DIR = join(homedir(), ".pi");
21
21
  const CONFIG_FILE = join(CONFIG_DIR, "greedyconfig");
22
22
 
23
+ // Default engines that participate in the "all" fan-out for normal
24
+ // (non-research) searches. Opt-in research/academic engines like
25
+ // `semantic-scholar` are deliberately excluded — they belong in research
26
+ // mode, not casual web search. Users who want them in normal `engine:all`
27
+ // runs can add them via ~/.pi/greedyconfig (see ensureDefaultConfig()).
23
28
  export const DEFAULT_ENGINES = ["perplexity", "google", "chatgpt"];
24
29
  export const DEFAULT_SYNTHESIZER = "gemini";
25
30
 
@@ -0,0 +1,145 @@
1
+ // src/search/progress.mjs — Progress bar with ETA for long-running research
2
+ //
3
+ // Tracks per-action and per-round timing, prints a progress bar to stderr
4
+ // after each step with an ETA based on rolling average. Inspired by pi-webaio's
5
+ // streaming progress output.
6
+ //
7
+ // Usage:
8
+ // const tracker = createProgressTracker({ totalActions: 6, totalRounds: 2 });
9
+ // tracker.startRound(1);
10
+ // tracker.startAction('search', 'what is X');
11
+ // ... do work ...
12
+ // tracker.endAction();
13
+ // tracker.startAction('fetch', 'https://...');
14
+ // ... do work ...
15
+ // tracker.endAction();
16
+ // tracker.endRound();
17
+ // tracker.print(); // prints bar to stderr
18
+
19
+ const BAR_WIDTH = 20;
20
+
21
+ /**
22
+ * Format a duration given in milliseconds as a human-readable string (e.g. "1m 23s", "45s", "0s")
23
+ */
24
+ function formatDuration(ms) {
25
+ if (ms < 1000) return "0s";
26
+ const totalSeconds = Math.round(ms / 1000);
27
+ if (totalSeconds < 60) return `${totalSeconds}s`;
28
+ const minutes = Math.floor(totalSeconds / 60);
29
+ const seconds = totalSeconds % 60;
30
+ return `${minutes}m ${seconds}s`;
31
+ }
32
+
33
+ /**
34
+ * Render a progress bar string.
35
+ * Example: renderBar(0.6) → "[████████████░░░░░░░░]" (counts and ETA are appended by buildStatus)
36
+ */
37
+ function renderBar(progress, width = BAR_WIDTH) {
38
+ const filled = Math.round(progress * width);
39
+ const empty = width - filled;
40
+ return "[" + "█".repeat(filled) + "░".repeat(empty) + "]";
41
+ }
42
+
43
+ /**
44
+ * Create a progress tracker.
45
+ * @param {object} opts
46
+ * @param {number} opts.totalActions - Total expected actions across all rounds
47
+ * @param {number} opts.totalRounds - Total expected rounds
48
+ * @param {number} opts.totalFetches - Total expected source fetches
49
+ * @param {boolean} [opts.silent] - Suppress stderr output (for tests)
50
+ */
51
+ export function createProgressTracker({
52
+ totalActions = 0,
53
+ totalRounds = 0,
54
+ totalFetches = 0,
55
+ silent = false,
56
+ } = {}) {
57
+ const startedAt = Date.now();
58
+ let completedActions = 0;
59
+ let completedRounds = 0;
60
+ let completedFetches = 0;
61
+ const actionTimings = []; // rolling window of recent action durations
62
+ let currentActionStart = null;
63
+ let currentActionLabel = null;
64
+ let lastPrintAt = 0;
65
+ const MIN_PRINT_INTERVAL_MS = 500; // throttle to avoid spam
66
+
67
+ function recordAction(durationMs) {
68
+ actionTimings.push(durationMs);
69
+ // keep only last 5 for rolling average
70
+ if (actionTimings.length > 5) actionTimings.shift();
71
+ }
72
+
73
+ function avgActionMs() {
74
+ if (actionTimings.length === 0) return null;
75
+ return actionTimings.reduce((a, b) => a + b, 0) / actionTimings.length;
76
+ }
77
+
78
+ function buildStatus(phase) {
79
+ const elapsed = Date.now() - startedAt;
80
+ const total = totalActions + totalFetches + totalRounds;
81
+ const done = completedActions + completedFetches + completedRounds;
82
+ const progress = total > 0 ? Math.min(1, done / total) : 0;
83
+ const bar = renderBar(progress);
84
+ const avg = avgActionMs();
85
+ const remaining = Math.max(0, total - done);
86
+ const etaMs = avg ? avg * remaining : null;
87
+ const eta = etaMs ? formatDuration(etaMs) : "—";
88
+ const label = currentActionLabel ? ` ${currentActionLabel}` : "";
89
+ return `${bar} ${done}/${total} (${phase}${label}, ETA ${eta})`;
90
+ }
91
+
92
+ function print(phase) {
93
+ if (silent) return;
94
+ const now = Date.now();
95
+ // throttle to avoid spamming
96
+ if (now - lastPrintAt < MIN_PRINT_INTERVAL_MS && phase !== "done") return;
97
+ lastPrintAt = now;
98
+ process.stderr.write(`[greedysearch] ${buildStatus(phase)}\n`);
99
+ }
100
+
101
+ return {
102
+ startRound(n) {
103
+ completedRounds = n - 1; // will be incremented when endRound fires
104
+ },
105
+ endRound() {
106
+ completedRounds++;
107
+ print("round");
108
+ },
109
+ startAction(type, label) {
110
+ currentActionStart = Date.now();
111
+ currentActionLabel = `${type}:${(label || "").slice(0, 40)}`;
112
+ print(type);
113
+ },
114
+ endAction() {
115
+ if (currentActionStart) {
116
+ recordAction(Date.now() - currentActionStart);
117
+ currentActionStart = null;
118
+ }
119
+ completedActions++;
120
+ print("action");
121
+ },
122
+ startFetch(label) {
123
+ currentActionStart = Date.now();
124
+ currentActionLabel = `fetch:${(label || "").slice(0, 40)}`;
125
+ print("fetch");
126
+ },
127
+ endFetch(ok = true) {
128
+ if (currentActionStart) {
129
+ recordAction(Date.now() - currentActionStart);
130
+ currentActionStart = null;
131
+ }
132
+ completedFetches++;
133
+ print(ok ? "fetch" : "fetch-failed");
134
+ },
135
+ print() {
136
+ print("progress");
137
+ },
138
+ finish() {
139
+ print("done");
140
+ },
141
+ getElapsedMs() {
142
+ return Date.now() - startedAt;
143
+ },
144
+ };
145
+ }
@@ -10,8 +10,14 @@ export const HEADLESS_RECOVERY_ENGINES = [
10
10
  "logically",
11
11
  ];
12
12
 
13
+ // blockedBy values that indicate visible-mode cookies CANNOT bypass the block.
14
+ // These still match the "headless blocked" shape but should NOT trigger
15
+ // visible recovery — the block is account-level (rate limit, ban) or
16
+ // structural (page redesign), not session-level.
17
+ const NON_RECOVERABLE_BLOCKED_BY = new Set(["rate-limit"]);
18
+
13
19
  const HEADLESS_BLOCKED_PATTERN =
14
- /timed out|timeout|verification|captcha|cloudflare|turnstile|input not found|ask-input|clipboard|copy button hidden|sign.in|login required/i;
20
+ /timed out|timeout|verification|captcha|cloudflare|turnstile|input not found|ask-input|copy button hidden|sign.in|login required/i;
15
21
 
16
22
  const MANUAL_VERIFICATION_PATTERN =
17
23
  /needs-human|verification required|please solve|captcha|cloudflare|turnstile|could not be completed automatically|manual intervention|sign.in|login required/i;
@@ -24,12 +30,24 @@ export function isManualVerificationError(error) {
24
30
  return MANUAL_VERIFICATION_PATTERN.test(String(error || ""));
25
31
  }
26
32
 
33
+ /**
34
+ * Check if a blockedBy value is non-recoverable (visible retry won't help).
35
+ */
36
+ export function isNonRecoverableBlockedBy(blockedBy) {
37
+ return NON_RECOVERABLE_BLOCKED_BY.has(blockedBy);
38
+ }
39
+
27
40
  export function findHeadlessBlockedEngines(resultsByEngine) {
28
41
  return HEADLESS_RECOVERY_ENGINES.filter((engine) => {
29
42
  const result = resultsByEngine?.[engine];
30
43
  if (!result) return false;
31
44
  // Data-driven: check envelope first (zero regex cost)
32
- if (result._envelope?.blockedBy) return true;
45
+ const blockedBy = result._envelope?.blockedBy;
46
+ if (blockedBy) {
47
+ // Skip non-recoverable blocks (rate-limit, ban, etc.)
48
+ if (NON_RECOVERABLE_BLOCKED_BY.has(blockedBy)) return false;
49
+ return true;
50
+ }
33
51
  if (result._envelope?.verificationResult === "needs-human") return true;
34
52
  // Fallback: legacy string matching for errors passed as plain strings
35
53
  const error = result.error;
@@ -45,7 +63,11 @@ export function findHeadlessBlockedEngines(resultsByEngine) {
45
63
  export function isHeadlessBlockedResult(error) {
46
64
  if (!error) return false;
47
65
  const env = error.envelope;
48
- if (env?.blockedBy) return true;
66
+ if (env?.blockedBy) {
67
+ // Skip non-recoverable blocks (rate-limit, ban, etc.)
68
+ if (NON_RECOVERABLE_BLOCKED_BY.has(env.blockedBy)) return false;
69
+ return true;
70
+ }
49
71
  if (env?.verificationResult === "needs-human") return true;
50
72
  return isHeadlessBlockedError(error.message);
51
73
  }