@apmantza/greedysearch-pi 1.9.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,436 @@
1
+ #!/usr/bin/env node
2
+
3
+ // extractors/chatgpt.mjs
4
+ // Navigate chatgpt.com, submit query, wait for answer, extract answer + sources.
5
+ //
6
+ // Usage:
7
+ // node extractors/chatgpt.mjs "<query>" [--tab <prefix>]
8
+ //
9
+ // Output (stdout): JSON { answer, sources, query, url }
10
+ // Errors go to stderr only — stdout is always clean JSON for piping.
11
+
12
+ import {
13
+ buildEnvelope,
14
+ cdp,
15
+ cdpWithInput,
16
+ formatAnswer,
17
+ getOrOpenTab,
18
+ handleError,
19
+ injectClipboardInterceptor,
20
+ jitter,
21
+ logStage,
22
+ outputJson,
23
+ parseArgs,
24
+ parseSourcesFromMarkdown,
25
+ parseSourcesFromMarkdownRefStyle,
26
+ prepareArgs,
27
+ validateQuery,
28
+ waitForSelector,
29
+ waitForStreamComplete,
30
+ } from "./common.mjs";
31
+ import { dismissConsent, handleVerification } from "./consent.mjs";
32
+
33
// Name of the window-level global that the injected clipboard interceptor
// writes captured text to; extractAnswer reads it back via `window.<name>`.
const GLOBAL_VAR = "__chatgptClipboard";
// CSS selector for the prompt editor: clicked to focus it before typing and
// awaited in main() as the "input ready" signal.
const PROSE_SELECTOR = "div.ProseMirror";
// CSS selector for the button that submits the typed prompt.
const SEND_SELECTOR = 'button[data-testid="send-button"]';
// CSS selector for the copy button looked up inside assistant message elements.
const COPY_SELECTOR = 'button[data-testid="copy-turn-action-button"]';
37
+
38
+ // ============================================================================
39
+ // ChatGPT-specific helpers
40
+ // ============================================================================
41
+
42
/**
 * Focus the prompt editor, insert the query text, and click the send button.
 *
 * @param {string} tab - Tab identifier forwarded to the cdp helpers.
 * @param {string} query - Prompt text to submit.
 * @throws {Error} When the send button cannot be found in the page.
 */
async function typeAndSubmit(tab, query) {
  // Small randomized pause between UI steps.
  const pause = (baseMs) =>
    new Promise((resolve) => setTimeout(resolve, jitter(baseMs)));

  // Give the ProseMirror editor focus first.
  await cdp(["click", tab, PROSE_SELECTOR]);
  await pause(200);

  // The text travels over stdin rather than argv so that long synthesis
  // prompts do not hit Windows command-line length limits.
  await cdpWithInput(["type", tab, "--stdin"], query);
  await pause(300);

  // In-page script: press the send button, reporting whether it existed.
  const clickSendScript = `
    (() => {
      const btn = document.querySelector('${SEND_SELECTOR}');
      if (!btn) return 'no-send';
      btn.click();
      return 'ok';
    })()
  `;
  const outcome = await cdp(["eval", tab, clickSendScript]);
  if (outcome === "no-send") {
    throw new Error("ChatGPT send button not found");
  }
  await pause(300);
}
66
+
67
/**
 * JavaScript expression (an IIFE kept as a string) that is passed to
 * waitForStreamComplete as `selector` and embedded in the node-side poll.
 *
 * Evaluated in the page, it scans every `[data-message-author-role]` element,
 * remembers the index of the LAST user message, and then returns the
 * assistant message after it with the longest `innerText`. It yields null
 * when there is no user message, or when no assistant message after it has
 * any text yet.
 *
 * Anchoring on the last user message skips chatgpt.com's static pre-rendered
 * greeting card (which is `data-turn-start-message="true"` and lives on the
 * homepage before any conversation), so short answers like "Hello! 👋" don't
 * get confused with the 32-char placeholder.
 */
const CHATGPT_RESPONSE_SELECTOR = String.raw`(() => {
  const all = document.querySelectorAll('[data-message-author-role]');
  let lastUserIdx = -1;
  for (let i = 0; i < all.length; i++) {
    if (all[i].getAttribute('data-message-author-role') === 'user') lastUserIdx = i;
  }
  if (lastUserIdx < 0) return null;
  let bestEl = null;
  let bestLen = 0;
  for (let i = lastUserIdx + 1; i < all.length; i++) {
    if (all[i].getAttribute('data-message-author-role') === 'assistant') {
      const len = (all[i].innerText || '').length;
      if (len > bestLen) { bestLen = len; bestEl = all[i]; }
    }
  }
  return bestEl;
})()`;
92
+
93
/**
 * Wait for ChatGPT's reply to finish streaming by delegating to the shared
 * waitForStreamComplete helper from common.mjs.
 *
 * The custom selector ignores the static homepage greeting card, and
 * `minLength: 1` lets any non-empty reply count as "started" so very short
 * answers (e.g. "Hello! 👋") are not held back by a larger length threshold.
 *
 * @param {string} tab - Tab identifier forwarded to waitForStreamComplete.
 * @param {number} [timeoutMs=20000] - Value passed as the `timeout` option.
 * @returns {Promise<*>} Whatever waitForStreamComplete resolves with.
 */
async function waitForResponse(tab, timeoutMs = 20000) {
  const streamOptions = {
    timeout: timeoutMs,
    interval: 600,
    stableRounds: 3,
    minLength: 1,
    selector: CHATGPT_RESPONSE_SELECTOR,
  };
  return waitForStreamComplete(tab, streamOptions);
}
109
+
110
/**
 * Node-side fallback for chatgpt stream completion. Used when the in-browser
 * poll fails (typically because Chrome throttles background tabs when several
 * extractors run in parallel in `all` mode). Polls the same
 * greeting-card-skipping selector through short, independent eval calls so
 * the connection is free between polls.
 *
 * The response is considered complete once its text length is non-zero and
 * unchanged for three consecutive polls after the first sighting.
 *
 * A probe that fails or times out is treated as "no sample": it neither
 * resets the stability counter nor overwrites the last observed length, so a
 * single transient CDP error cannot make a finished response look empty.
 *
 * @param {string} tab - Tab identifier forwarded to cdp.
 * @param {number} [maxMs=15000] - Overall polling budget in milliseconds.
 * @returns {Promise<number>} The stable text length, or the last observed
 *   length (possibly 0) when the budget runs out first.
 */
async function pollForResponseNodeSide(tab, maxMs = 15000) {
  const POLL_INTERVAL_MS = 1200;
  const PROBE_TIMEOUT_MS = 4000;
  const REQUIRED_STABLE_ROUNDS = 3;
  const lengthProbe = `${CHATGPT_RESPONSE_SELECTOR}?.innerText?.length ?? 0`;

  const deadline = Date.now() + maxMs;
  let lastLen = 0;
  let stableRounds = 0;

  while (Date.now() < deadline) {
    let raw = null;
    try {
      raw = await cdp(["eval", tab, lengthProbe], PROBE_TIMEOUT_MS);
    } catch {
      // Best-effort poll: keep the previous observation and try again.
    }

    if (raw !== null) {
      const len = Number.parseInt(raw, 10) || 0;
      if (len >= 1 && len === lastLen) {
        stableRounds++;
        if (stableRounds >= REQUIRED_STABLE_ROUNDS) return len;
      } else {
        lastLen = len;
        stableRounds = 0;
      }
    }

    // Never sleep past the deadline; the caller has its own outer budget.
    const remaining = deadline - Date.now();
    if (remaining <= 0) break;
    await new Promise((r) =>
      setTimeout(r, Math.min(POLL_INTERVAL_MS, remaining)),
    );
  }
  return lastLen;
}
138
+
139
/**
 * DOM-based answer extraction, used by extractAnswer when the clipboard
 * capture is empty.
 *
 * Evaluates a script in the page that picks the LAST assistant message
 * following the last user message and serializes its trimmed text together
 * with up to 10 de-duplicated `a[href]` links found inside it.
 *
 * @param {string} tab - Tab identifier forwarded to cdp.
 * @returns {Promise<object>} The parsed JSON produced by the page script:
 *   `{ answer, sources }`, plus `skipped` set to 'no-user-message' or
 *   'no-assistant-response' when nothing could be extracted. If the eval
 *   output is not valid JSON, `{ answer: "", sources: [], skipped:
 *   "parse-error" }` is returned instead.
 */
async function extractAnswerFromDom(tab) {
  const raw = await cdp([
    "eval",
    tab,
    String.raw`
      (() => {
        // Find the assistant message that comes AFTER the last user message,
        // not the absolute last assistant element. The chatgpt.com homepage
        // has a static pre-rendered greeting card that renders as a
        // [data-message-author-role="assistant"] element with
        // data-turn-start-message="true" — it must be skipped or the
        // static "Hello! How can I help you today?" placeholder gets
        // returned as the answer to a query the assistant never answered.
        const all = Array.from(document.querySelectorAll('[data-message-author-role]'));
        let lastUserIdx = -1;
        for (let i = 0; i < all.length; i++) {
          if (all[i].getAttribute('data-message-author-role') === 'user') {
            lastUserIdx = i;
          }
        }
        if (lastUserIdx < 0) {
          // No user message at all — page is still on the homepage.
          return JSON.stringify({
            answer: '',
            sources: [],
            skipped: 'no-user-message',
          });
        }
        let assistant = null;
        for (let i = lastUserIdx + 1; i < all.length; i++) {
          if (all[i].getAttribute('data-message-author-role') === 'assistant') {
            assistant = all[i];
          }
        }
        if (!assistant) {
          return JSON.stringify({
            answer: '',
            sources: [],
            skipped: 'no-assistant-response',
          });
        }
        const answer = (assistant.innerText || assistant.textContent || '').trim();
        const seen = new Set();
        const sources = [];
        for (const link of assistant.querySelectorAll('a[href]')) {
          const url = link.href;
          if (!url || seen.has(url)) continue;
          seen.add(url);
          const title = (link.innerText || link.textContent || '').replace(/\s+/g, ' ').trim();
          sources.push({ title, url });
          if (sources.length >= 10) break;
        }
        return JSON.stringify({ answer, sources });
      })()
    `,
  ]);
  // The page script always returns a JSON string; anything else (e.g. an
  // error text from the eval) is reported as a parse-error skip.
  try {
    return JSON.parse(raw);
  } catch {
    return { answer: "", sources: [], skipped: "parse-error" };
  }
}
201
+
202
/**
 * Click the copy button of the assistant message that follows the last user
 * message, if that message already has one.
 *
 * Picking the absolute last copy button on the page would hit the USER
 * message's button while the assistant response is still empty, copying the
 * query itself. When the assistant message has no copy button yet (still
 * streaming, or not rendered), nothing is clicked on purpose: an empty
 * clipboard routes the caller to the DOM fallback.
 *
 * @param {string} tab - Tab identifier forwarded to cdp.
 * @returns {Promise<string>} The eval output: 'clicked', 'no-user' or
 *   'no-assistant-copy'.
 */
async function clickAssistantCopyButton(tab) {
  return cdp([
    "eval",
    tab,
    `(() => {
      const all = document.querySelectorAll('[data-message-author-role]');
      let lastUserIdx = -1;
      for (let i = 0; i < all.length; i++) {
        if (all[i].getAttribute('data-message-author-role') === 'user') lastUserIdx = i;
      }
      if (lastUserIdx < 0) return 'no-user';
      let assistantCopy = null;
      for (let i = lastUserIdx + 1; i < all.length; i++) {
        if (all[i].getAttribute('data-message-author-role') === 'assistant') {
          const btn = all[i].querySelector('${COPY_SELECTOR}');
          if (btn) assistantCopy = btn;
        }
      }
      if (assistantCopy) { assistantCopy.click(); return 'clicked'; }
      return 'no-assistant-copy';
    })()`,
  ]);
}

/**
 * Read the text captured by the clipboard interceptor, trimmed so that a
 * whitespace-only value counts as empty.
 *
 * @param {string} tab - Tab identifier forwarded to cdp.
 * @returns {Promise<string>} Captured text, or "" when nothing was captured.
 */
async function readCapturedClipboard(tab) {
  const captured = await cdp(["eval", tab, `window.${GLOBAL_VAR} || ''`]);
  return String(captured ?? "").trim();
}

/**
 * Extract the assistant's answer and its sources.
 *
 * Strategy: click the assistant's copy button and read the intercepted
 * clipboard text; retry once after 2s; finally fall back to reading the DOM.
 * Sources are merged from DOM links (when the fallback ran) and from
 * reference-style and inline markdown links in the answer, de-duplicated by
 * URL and capped at 10.
 *
 * When no answer can be obtained, an empty answer is returned together with
 * the DOM fallback's `skipped` reason instead of throwing a generic error, so
 * the caller can report why (still on homepage vs. no assistant response).
 *
 * @param {string} tab - Tab identifier forwarded to cdp.
 * @param {object} env - Mutable diagnostics object; `clipboardEmpty` and
 *   `fallbackUsed` are updated here.
 * @returns {Promise<{answer: string, sources: Array<{title: string, url: string}>, skipped?: (string|null)}>}
 */
async function extractAnswer(tab, env) {
  await clickAssistantCopyButton(tab);
  await new Promise((r) => setTimeout(r, 600));

  let answer = await readCapturedClipboard(tab);
  env.clipboardEmpty = !answer;

  // Retry once if clipboard is empty — the assistant message may have
  // finished streaming and the copy button may have rendered in the
  // meantime.
  if (!answer) {
    console.error("[chatgpt] Clipboard empty, retrying in 2s...");
    await clickAssistantCopyButton(tab);
    await new Promise((r) => setTimeout(r, 2000));
    answer = await readCapturedClipboard(tab);
    env.clipboardEmpty = !answer;
  }

  let domFallback = null;
  if (!answer) {
    domFallback = await extractAnswerFromDom(tab);
    // Guard against a fallback result that is not the expected object.
    answer = String(domFallback?.answer ?? "").trim();
    env.fallbackUsed = answer ? "dom" : null;
  }

  if (!answer) {
    return {
      answer: "",
      sources: [],
      skipped: domFallback?.skipped ?? null,
    };
  }

  // Parse sources from both inline/reference-style markdown links and DOM links
  // (DOM fallback preserves sources even when native clipboard copy fails).
  const sourcesInline = parseSourcesFromMarkdown(answer);
  const sourcesRef = parseSourcesFromMarkdownRefStyle(answer);
  const sourceMap = new Map();
  for (const s of [
    ...(domFallback?.sources || []),
    ...sourcesRef,
    ...sourcesInline,
  ]) {
    if (s?.url && !sourceMap.has(s.url)) sourceMap.set(s.url, s);
  }
  const sources = Array.from(sourceMap.values()).slice(0, 10);

  return { answer, sources };
}
298
+
299
+ // ============================================================================
300
+ // Main
301
+ // ============================================================================
302
+
303
const USAGE = 'Usage: node extractors/chatgpt.mjs "<query>" [--tab <prefix>]\n';

/**
 * Extractor entry point: parse CLI arguments, make sure a chatgpt.com tab is
 * ready, submit the query, wait for the reply, and print the result as JSON
 * via outputJson. Progress is recorded with logStage on a shared `env`
 * object whose envelope is attached to the output or handed to handleError.
 *
 * Failures inside the try block are logged to stderr with the last recorded
 * stage and passed to handleError together with the envelope.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short } = parseArgs(args);
  const startTime = Date.now();
  const mode =
    process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless";

  const env = {
    engine: "chatgpt",
    mode,
    clipboardEmpty: null,
    fallbackUsed: null,
    blockedBy: null,
    verificationResult: null,
    inputReady: null,
  };

  try {
    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // Only navigate when the tab is not already on chatgpt.com.
    const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    let onChatGPT = false;
    try {
      onChatGPT = new URL(currentUrl).hostname.toLowerCase() === "chatgpt.com";
    } catch {
      // Unparseable or empty URL: treat the tab as not being on chatgpt.com.
    }

    if (!onChatGPT) {
      logStage(env, "nav", startTime);
      await cdp(["nav", tab, "https://chatgpt.com"], 20000);
      await new Promise((r) => setTimeout(r, 600));
    }
    logStage(env, "consent", startTime);
    await dismissConsent(tab, cdp);
    logStage(env, "verification", startTime);
    await handleVerification(tab, cdp, 10000);

    logStage(env, "input-wait", startTime);
    const inputReady = await waitForSelector(tab, PROSE_SELECTOR, 8000, 400);
    env.inputReady = inputReady;
    if (!inputReady) {
      // Distinguish a sign-in wall from any other unexpected page state.
      const bodyText = await cdp([
        "eval",
        tab,
        `document.body?.innerText || ''`,
      ]).catch(() => "");
      if (
        /sign in|log in|sign up|\u03a3\u03cd\u03bd\u03b4\u03b5\u03c3\u03b7|login/i.test(
          bodyText,
        )
      ) {
        throw new Error(
          "ChatGPT requires sign-in — please sign in in the visible browser window",
        );
      }
      throw new Error(
        "ChatGPT input not found — page may be blocked or in unexpected state",
      );
    }

    logStage(env, "clipboard-inject", startTime);
    await injectClipboardInterceptor(tab, GLOBAL_VAR);
    logStage(env, "type-and-submit", startTime);
    await typeAndSubmit(tab, query);

    logStage(env, "stream-wait", startTime);
    // waitForStreamComplete handles the in-browser poll in a single
    // Runtime.evaluate call. If the response is still streaming past
    // 20s (slow under tab throttling in `all` mode), fall back to
    // node-side polls that release the WebSocket between each call.
    // Together they stay well within the engine's 80s outer budget.
    let asstLen = 0;
    try {
      asstLen = await waitForResponse(tab, 20000);
    } catch (e) {
      // Record why the in-browser wait failed instead of discarding it.
      console.error(
        `[chatgpt] in-browser stream wait failed: ${e?.message ?? e}`,
      );
      logStage(env, "stream-poll-fallback", startTime);
      asstLen = await pollForResponseNodeSide(tab, 15000);
    }
    env.assistantTextLen = asstLen;
    if (asstLen < 1) {
      console.error(
        "[chatgpt] Warning: assistant response may not have completed",
      );
    }

    logStage(env, "extract", startTime);
    const { answer, sources, skipped } = await extractAnswer(tab, env);
    // If extraction produced no answer (no real assistant message after
    // the user's query), surface a clear error so the caller doesn't
    // silently consume the static homepage greeting card as a real
    // answer. The static card lives on chatgpt.com before any
    // conversation.
    if (!answer) {
      env.blockedBy = "no-response";
      env.skipped = skipped || null;
      throw new Error(
        skipped === "no-user-message"
          ? "ChatGPT still on homepage — query was not submitted"
          : skipped === "no-assistant-response"
            ? "ChatGPT did not return an assistant response after submit"
            : "ChatGPT returned no answer — assistant never responded",
      );
    }
    logStage(env, "done", startTime);

    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "https://chatgpt.com",
    );
    env.durationMs = Date.now() - startTime;
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
      _envelope: buildEnvelope(env),
    });
  } catch (e) {
    env.durationMs = Date.now() - startTime;
    console.error(
      `[chatgpt] error during stage '${env.lastStage || "unknown"}': ${e.message}`,
    );
    handleError(e, buildEnvelope(env));
  }
}

// Argument handling runs outside main()'s try block, and the catch block's
// own helpers could also throw. Handle such a rejection explicitly so the
// promise is never left floating: report on stderr only (stdout stays clean
// for piping) and signal failure through the exit code.
main().catch((err) => {
  console.error(`[chatgpt] fatal: ${err?.message ?? err}`);
  process.exitCode = 1;
});
@@ -20,10 +20,18 @@ const CDP = join(__dir, "..", "bin", "cdp.mjs");
20
20
  * @returns {Promise<string>} Command output
21
21
  */
22
22
 export function cdp(args, timeoutMs = 30000) {
   // Thin wrapper: the same call as cdpWithInput, with no stdin payload
   // (input = null).
   return cdpWithInput(args, null, timeoutMs);
 }
25
+
26
+ export function cdpWithInput(args, input = null, timeoutMs = 30000) {
23
27
  return new Promise((resolve, reject) => {
24
28
  const proc = spawn(process.execPath, [CDP, ...args], {
25
- stdio: ["ignore", "pipe", "pipe"],
29
+ stdio: [input == null ? "ignore" : "pipe", "pipe", "pipe"],
26
30
  });
31
+ if (input != null) {
32
+ proc.stdin.write(input);
33
+ proc.stdin.end();
34
+ }
27
35
  let out = "";
28
36
  let err = "";
29
37
  proc.stdout.on("data", (d) => (out += d));
@@ -67,8 +75,20 @@ export async function getOrOpenTab(tabPrefix) {
67
75
  const { targetId } = JSON.parse(raw);
68
76
  await cdp(["list"]); // refresh cache
69
77
  const tid = targetId.slice(0, 8);
70
- // Inject stealth patches for anti-detection coverage (both headless + visible)
71
- injectHeadlessStealth(tid).catch(() => {});
78
+ // Inject stealth patches for anti-detection coverage (both headless + visible).
79
+ // MUST be awaited: the daemon processes commands concurrently, so a
80
+ // fire-and-forget registration races the next Page.navigate and the
81
+ // script may not be in place when the new document is created.
82
+ // Sites like consensus.app use this race to detect automation — the
83
+ // script's Navigator/webdriver overrides are absent on first paint,
84
+ // fingerprinting fires, and the user is bounced to a sign-up wall.
85
+ try {
86
+ await injectHeadlessStealth(tid);
87
+ } catch (e) {
88
+ process.stderr.write(
89
+ `[getOrOpenTab] stealth injection failed: ${e.message}\n`,
90
+ );
91
+ }
72
92
  return tid;
73
93
  }
74
94
 
@@ -84,25 +104,42 @@ export async function getOrOpenTab(tabPrefix) {
84
104
  */
85
105
  export async function injectClipboardInterceptor(tab, globalVar) {
86
106
  const code = `
87
- window.${globalVar} = null;
88
- const _origWriteText = navigator.clipboard.writeText.bind(navigator.clipboard);
89
- navigator.clipboard.writeText = function(text) {
90
- window.${globalVar} = text;
91
- return _origWriteText(text);
92
- };
93
- const _origWrite = navigator.clipboard.write.bind(navigator.clipboard);
94
- navigator.clipboard.write = async function(items) {
95
- try {
96
- for (const item of items) {
97
- if (item.types && item.types.includes('text/plain')) {
98
- const blob = await item.getType('text/plain');
99
- window.${globalVar} = await blob.text();
100
- break;
107
+ (() => {
108
+ window.${globalVar} = null;
109
+ const _clipboard = navigator.clipboard;
110
+ if (!_clipboard) return;
111
+ const _origWriteText = typeof _clipboard.writeText === 'function'
112
+ ? _clipboard.writeText.bind(_clipboard)
113
+ : null;
114
+ const _origWrite = typeof _clipboard.write === 'function'
115
+ ? _clipboard.write.bind(_clipboard)
116
+ : null;
117
+
118
+ _clipboard.writeText = function(text) {
119
+ window.${globalVar} = String(text ?? '');
120
+ if (!_origWriteText) return Promise.resolve();
121
+ // The OS/browser clipboard write may be denied in automated Chrome or
122
+ // when the tab is not focused. We only need the captured text; returning
123
+ // a resolved promise prevents the page from surfacing a misleading
124
+ // "failed to copy" toast after our interceptor already succeeded.
125
+ return Promise.resolve(_origWriteText(text)).catch(() => undefined);
126
+ };
127
+
128
+ _clipboard.write = async function(items) {
129
+ try {
130
+ for (const item of items || []) {
131
+ if (item.types && item.types.includes('text/plain')) {
132
+ const blob = await item.getType('text/plain');
133
+ window.${globalVar} = await blob.text();
134
+ break;
135
+ }
101
136
  }
102
- }
103
- } catch(e) {}
104
- return _origWrite(items);
105
- };
137
+ } catch(e) {}
138
+ if (!_origWrite) return undefined;
139
+ try { return await _origWrite(items); }
140
+ catch (_) { return undefined; }
141
+ };
142
+ })();
106
143
  `;
107
144
  await cdp(["eval", tab, code]);
108
145
  }
@@ -379,6 +416,79 @@ export function parseSourcesFromMarkdown(text) {
379
416
  return results;
380
417
  }
381
418
 
419
/**
 * Linear-time "is this a non-empty digit string?" check.
 * Equivalent to /^\d+$/ without the regex — used to keep the
 * parseSourcesFromMarkdownRefStyle inline scan free of any regex
 * (SonarCloud hotspot js:S5852).
 * @param {string} s
 * @returns {boolean}
 */
function isAllDigits(s) {
  if (!s) return false;
  for (let k = 0; k < s.length; k++) {
    const c = s.charCodeAt(k);
    // 48..57 are the char codes of '0'..'9'.
    if (c < 48 || c > 57) return false;
  }
  return true;
}

/**
 * Parse reference-style markdown links: [text][num] with [num]: url "title" at bottom.
 * ChatGPT uses this format for its inline citations.
 *
 * Only the two-bracket form `[text][num]` is recognized as an inline
 * reference; a bare `[num]` is not. Each URL is reported once, in order of
 * first use, titled with the link text or (when that is empty) the title from
 * the reference definition.
 *
 * @param {string} text - Markdown text
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdownRefStyle(text) {
  if (!text) return [];

  // Collect reference definitions: [num]: url "title"
  const refMap = new Map();
  const refRegex = /^\[(\d+)\]:\s*(https?:\/\/[^\s"]+)(?:\s+"([^"]*)")?/gm;
  for (const m of text.matchAll(refRegex)) {
    refMap.set(m[1], { url: m[2], title: m[3] || "" });
  }
  // Without definitions no inline reference can resolve.
  if (refMap.size === 0) return [];

  // Find inline references [text][num] with a linear indexOf scan, avoiding
  // the ReDoS-prone /\[([^\]]*)\]\[(\d+)\]/g pattern (SonarCloud hotspot
  // js:S5852).
  //
  // Every "[" between `open` and `close` shares the same first "]" (`close`),
  // so when a candidate fails, all of those openers fail for the same reason
  // and the scan can resume at `close + 1`. That keeps the scan linear on
  // inputs such as `[[[[[[[[...]`, and — unlike jumping past the second
  // bracket — still lets that second bracket start a match of its own
  // (`[a][b][1]` resolves `[b][1]`).
  const results = [];
  const seenUrls = new Set();
  let cursor = 0;
  while (cursor < text.length) {
    const open = text.indexOf("[", cursor);
    if (open === -1) break;
    const close = text.indexOf("]", open + 1);
    if (close === -1) break;
    if (text[close + 1] !== "[") {
      cursor = close + 1;
      continue;
    }
    const close2 = text.indexOf("]", close + 2);
    if (close2 === -1) break;

    const numStr = text.slice(close + 2, close2);
    if (!isAllDigits(numStr)) {
      cursor = close + 1;
      continue;
    }

    const ref = refMap.get(numStr);
    if (ref && !seenUrls.has(ref.url)) {
      seenUrls.add(ref.url);
      const inner = text.slice(open + 1, close);
      results.push({
        title: inner.trim() || ref.title || "",
        url: ref.url,
      });
    }
    // A well-formed [text][num] is consumed whole, resolved or not.
    cursor = close2 + 1;
  }

  return results;
}
491
+
382
492
  // ============================================================================
383
493
  // Timing constants
384
494
  // ============================================================================
@@ -658,6 +768,26 @@ export function outputJson(data) {
658
768
  process.stdout.write(`${JSON.stringify(data, null, 2)}\n`);
659
769
  }
660
770
 
771
/**
 * Mark the extractor's current stage for debugging and timeout diagnostics.
 *
 * Sets `env.lastStage`, appends `{ stage, at }` to `env.stages` (creating the
 * array when it is missing or not an array), and writes
 * `[engine] stage: <name>` to stderr — followed by ` (+<ms>ms)` when a start
 * time is given. Does nothing when `env` is not an object.
 *
 * @param {object} env - The mutable env object the extractor is filling in.
 * @param {string} stage - Short stage name (e.g. "nav", "stream-wait").
 * @param {number} [startTime] - Optional extractor start time for elapsed-ms logging.
 */
export function logStage(env, stage, startTime = null) {
  if (env === null || typeof env !== "object") {
    return;
  }

  let elapsedSuffix = "";
  if (startTime) {
    elapsedSuffix = ` (+${Date.now() - startTime}ms)`;
  }

  env.lastStage = stage;
  if (!Array.isArray(env.stages)) {
    env.stages = [];
  }
  env.stages.push({ stage, at: Date.now() });

  const label = env.engine || "extractor";
  console.error(`[${label}] stage: ${stage}${elapsedSuffix}`);
}
790
+
661
791
  /**
662
792
  * Build a lightweight result envelope from data already collected during extraction.
663
793
  * Zero additional CDP calls — everything here is already known.
@@ -673,6 +803,8 @@ export function buildEnvelope({
673
803
  verificationResult = null,
674
804
  inputReady = null,
675
805
  durationMs = null,
806
+ lastStage = null,
807
+ stages = null,
676
808
  } = {}) {
677
809
  return {
678
810
  engine,
@@ -683,6 +815,8 @@ export function buildEnvelope({
683
815
  verificationResult,
684
816
  inputReady,
685
817
  durationMs,
818
+ lastStage,
819
+ stages,
686
820
  };
687
821
  }
688
822