@apmantza/greedysearch-pi 1.9.0 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,374 +1,490 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/bing-copilot.mjs
4
- // Navigate copilot.microsoft.com, wait for answer to complete, return clean answer + sources.
5
- //
6
- // Usage:
7
- // node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]
8
- //
9
- // Output (stdout): JSON { answer, sources, query, url }
10
- // Errors go to stderr only — stdout is always clean JSON for piping.
11
-
12
- import {
13
- buildEnvelope,
14
- cdp,
15
- formatAnswer,
16
- getOrOpenTab,
17
- handleError,
18
- injectClipboardInterceptor,
19
- jitter,
20
- outputJson,
21
- parseArgs,
22
- parseSourcesFromMarkdown,
23
- prepareArgs,
24
- TIMING,
25
- validateQuery,
26
- waitForCopyButton,
27
- waitForSelector,
28
- waitForStreamComplete,
29
- } from "./common.mjs";
30
- import { dismissConsent, handleVerification } from "./consent.mjs";
31
- import { SELECTORS } from "./selectors.mjs";
32
-
33
- const S = SELECTORS.bing;
34
- const GLOBAL_VAR = "__bingClipboard";
35
-
36
- // ============================================================================
37
- // Bing Copilot-specific helpers
38
- // ============================================================================
39
-
40
- async function extractAnswer(tab, env) {
41
- // In headless mode: snap the accessibility tree before spending ~18s on
42
- // clipboard polls. Copilot loads its input fine in headless but renders
43
- // responses behind a Cloudflare-protected iframe — detecting that here
44
- // fast-fails to the visible retry instead of burning all the poll time.
45
- if (process.env.GREEDY_SEARCH_HEADLESS === "1") {
46
- const snap = await cdp(["snap", tab]).catch(() => "");
47
- if (/cloudflare|challenge|security check/i.test(snap)) {
48
- console.error("[bing] Cloudflare challenge in snap — fast-failing to visible retry");
49
- env.blockedBy = "cloudflare";
50
- throw new Error("Cloudflare challenge detected — headless blocked");
51
- }
52
- }
53
-
54
- // Wait for the assistant copy button to exist. On fresh Copilot
55
- // sessions the answer text can render before the button handler is
56
- // fully hydrated. Wait for the button + a small hydration delay.
57
- // 2s is enough — the CF snap check above ensures we only reach here
58
- // on a clean response, where the button appears within ~1s.
59
- await waitForCopyButton(tab, S.copyButton, { timeout: 2000 }).catch(
60
- () => null,
61
- );
62
- // Give React time to hydrate the click handler on the button
63
- await new Promise((r) => setTimeout(r, 800));
64
-
65
- let answer = await clickCopyAndPollClipboard(tab, 5000);
66
- let clipboardEmpty = !answer;
67
-
68
- // Retry once if clipboard is empty (Copilot might be slow to wire the handler)
69
- if (!answer) {
70
- console.error("[bing] Clipboard empty, retrying copy/poll...");
71
- answer = await clickCopyAndPollClipboard(tab, 8000);
72
- clipboardEmpty = !answer;
73
- }
74
-
75
- // DOM fallback: visible Copilot can render a valid response while the copy
76
- // action/clipboard interceptor remains empty. Extract the last assistant
77
- // answer from page text before treating this as a headless/iframe block.
78
- if (!answer) {
79
- answer = await extractFromVisibleDom(tab);
80
- if (answer) env.fallbackUsed = "visibleDom";
81
- }
82
-
83
- // DOM fallback: if clipboard still empty, extract text directly from response DOM.
84
- // This handles headless mode where Copilot renders the AI reply inside nested
85
- // iframes (copilot.microsoft.com → copilot.fun → blob:…) and hides the copy button.
86
- if (!answer) {
87
- const iframeResult = await extractFromIframes(tab, env);
88
- answer = iframeResult.answer;
89
- if (answer) env.fallbackUsed = "iframeDom";
90
- }
91
-
92
- if (!answer) throw new Error("Clipboard interceptor returned empty text");
93
-
94
- env.clipboardEmpty = clipboardEmpty;
95
- const sources = parseSourcesFromMarkdown(answer);
96
- return { answer: answer.trim(), sources };
97
- }
98
-
99
- async function clickCopyAndPollClipboard(tab, timeoutMs) {
100
- await cdp([
101
- "eval",
102
- tab,
103
- `(() => {
104
- window.${GLOBAL_VAR} = '';
105
- const buttons = document.querySelectorAll('${S.copyButton}');
106
- buttons[buttons.length - 1]?.click();
107
- })()`,
108
- ]);
109
-
110
- const deadline = Date.now() + timeoutMs;
111
- while (Date.now() < deadline) {
112
- const answer = await cdp(["eval", tab, `window.${GLOBAL_VAR} || ''`]).catch(
113
- () => "",
114
- );
115
- if (answer) return answer;
116
- await new Promise((r) => setTimeout(r, 300));
117
- }
118
- return "";
119
- }
120
-
121
- /**
122
- * Visible-page DOM fallback. Copilot often exposes the completed assistant
123
- * message in document.body.innerText even when the copy button/clipboard path
124
- * fails. Keep this conservative: require a "Copilot said" marker and strip
125
- * known composer/action text after the answer.
126
- */
127
- async function extractFromVisibleDom(tab) {
128
- try {
129
- const bodyText = await cdp([
130
- "eval",
131
- tab,
132
- "document.body?.innerText || ''",
133
- ]).catch(() => "");
134
- if (!bodyText || !bodyText.includes("Copilot said")) return "";
135
-
136
- const answer = bodyText
137
- .split(/Copilot said\s*/i)
138
- .pop()
139
- .split(
140
- /\n[^\S\n]*(?:Good response|Bad response|Share message|Copy message|Read aloud|Regenerate|Edit in a page|Message Copilot|Smart)(?![\w])/i,
141
- )[0]
142
- .trim();
143
-
144
- if (answer.length < 20) return "";
145
- console.error(
146
- `[bing] Visible DOM extraction succeeded (${answer.length} chars)`,
147
- );
148
- return answer;
149
- } catch (e) {
150
- console.error(`[bing] Visible DOM extraction failed: ${e.message}`);
151
- return "";
152
- }
153
- }
154
-
155
- /**
156
- * DOM fallback: check if Copilot is blocked by Cloudflare in headless mode.
157
- * When blocked, the copilot.fun iframe shows a challenge instead of the chat UI.
158
- * Returns the extracted text or empty string on failure (caller falls through to error
159
- * which triggers the visible Chrome auto-retry in search.mjs).
160
- */
161
- async function extractFromIframes(mainTab, env) {
162
- try {
163
- // Check if the AI copy button exists — if it does, we're in visible mode
164
- // and clipboard should have worked. This is a different issue.
165
- const hasCopyBtn = await cdp([
166
- "eval",
167
- mainTab,
168
- `!!document.querySelector('${S.copyButton}')`,
169
- ]).catch(() => "false");
170
- if (hasCopyBtn === "true") return { answer: "" }; // not a headless/iframe issue
171
-
172
- // Check for Cloudflare challenge in the accessibility tree.
173
- // If present, Copilot content is blocked entirely — no DOM extraction possible.
174
- const snap = await cdp(["snap", mainTab]).catch(() => "");
175
- if (/cloudflare|challenge|security|verification/i.test(snap)) {
176
- console.error(
177
- "[bing] Cloudflare challenge detected — content blocked in headless",
178
- );
179
- env.blockedBy = "cloudflare";
180
- return { answer: "" }; // Let caller throw → triggers visible auto-retry
181
- }
182
-
183
- console.error(
184
- "[bing] Copy button hidden, no Cloudflare — trying DOM extraction...",
185
- );
186
-
187
- // Get CDP targets to find the copilot.fun iframe
188
- const targetsRaw = await cdp([
189
- "evalraw",
190
- mainTab,
191
- "Target.getTargets",
192
- "{}",
193
- ]);
194
- const targets = JSON.parse(targetsRaw);
195
- const targetInfos = targets.targetInfos || [];
196
- const funFrame = targetInfos.find(
197
- (t) => t.type === "iframe" && t.url.includes("copilot.fun"),
198
- );
199
- if (!funFrame) {
200
- console.error("[bing] No copilot.fun iframe target found");
201
- return { answer: "" };
202
- }
203
-
204
- // Try to extract from the nested blob iframe (rarely succeeds due to Cloudflare)
205
- const funTabId = funFrame.targetId.slice(0, 8);
206
- const innerText = await cdp([
207
- "eval",
208
- funTabId,
209
- `(()=>{const iframe=document.querySelector('iframe'); if(!iframe) return''; try{const doc=iframe.contentDocument||iframe.contentWindow.document; return doc?.body?.innerText?.trim()||''}catch(e){return''}})()`,
210
- ]).catch(() => "");
211
-
212
- if (innerText) {
213
- console.error(
214
- `[bing] DOM extraction succeeded (${innerText.length} chars)`,
215
- );
216
- return { answer: innerText };
217
- }
218
-
219
- console.error(
220
- "[bing] DOM extraction returned empty — falling through to visible retry",
221
- );
222
- } catch (e) {
223
- console.error(`[bing] DOM extraction failed: ${e.message}`);
224
- }
225
- return { answer: "" };
226
- }
227
-
228
- // ============================================================================
229
- // Main
230
- // ============================================================================
231
-
232
- const USAGE =
233
- 'Usage: node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]\n';
234
-
235
- async function main() {
236
- const args = await prepareArgs(process.argv.slice(2));
237
- validateQuery(args, USAGE);
238
-
239
- const { query, tabPrefix, short } = parseArgs(args);
240
- const startTime = Date.now();
241
- const mode =
242
- process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless";
243
-
244
- // Lightweight envelope — no extra CDP calls, just tracks what we already know
245
- const env = {
246
- engine: "bing",
247
- mode,
248
- clipboardEmpty: null,
249
- fallbackUsed: null,
250
- blockedBy: null,
251
- verificationResult: null,
252
- inputReady: null,
253
- };
254
-
255
- try {
256
- // Only refresh page list when creating a fresh tab (no prefix provided)
257
- if (!tabPrefix) await cdp(["list"]);
258
- const tab = await getOrOpenTab(tabPrefix);
259
-
260
- // Skip navigation if already on Copilot domain (tab was seeded by search.mjs)
261
- const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
262
- () => "",
263
- );
264
- let onCopilot = false;
265
- try {
266
- const host = new URL(currentUrl).hostname.toLowerCase();
267
- onCopilot =
268
- host === "copilot.microsoft.com" ||
269
- host.endsWith(".copilot.microsoft.com");
270
- } catch {}
271
-
272
- if (!onCopilot) {
273
- await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
274
- await new Promise((r) => setTimeout(r, 600));
275
- }
276
- await dismissConsent(tab, cdp);
277
-
278
- // Handle verification challenges (Cloudflare Turnstile, Microsoft auth, etc.)
279
- const verifyResult = await handleVerification(tab, cdp, 10000);
280
- env.verificationResult = verifyResult;
281
- if (verifyResult === "needs-human") {
282
- throw new Error(
283
- "Copilot verification required — please solve it manually in the browser window",
284
- );
285
- }
286
-
287
- // After verification, page may have redirected or reloaded — wait for it to settle
288
- if (verifyResult === "clicked") {
289
- await new Promise((r) => setTimeout(r, TIMING.afterVerify));
290
-
291
- // Re-navigate if we got redirected
292
- const currentUrl = await cdp([
293
- "eval",
294
- tab,
295
- "document.location.href",
296
- ]).catch(() => "");
297
- let onCopilot = false;
298
- try {
299
- const host = new URL(currentUrl).hostname.toLowerCase();
300
- onCopilot =
301
- host === "copilot.microsoft.com" ||
302
- host.endsWith(".copilot.microsoft.com");
303
- } catch {}
304
- if (!onCopilot) {
305
- await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
306
- await new Promise((r) => setTimeout(r, 600));
307
- await dismissConsent(tab, cdp);
308
- }
309
- }
310
-
311
- // Wait for React app to mount input (up to 15s, longer after verification)
312
- const inputReady = await waitForSelector(tab, S.input, 15000, 500);
313
- env.inputReady = inputReady;
314
- await new Promise((r) => setTimeout(r, jitter(300)));
315
-
316
- if (!inputReady) {
317
- throw new Error(
318
- "Copilot input not found — verification may have failed or page is in unexpected state",
319
- );
320
- }
321
-
322
- await injectClipboardInterceptor(tab, GLOBAL_VAR);
323
- await cdp(["click", tab, S.input]);
324
- await new Promise((r) => setTimeout(r, TIMING.postClick));
325
- await cdp(["type", tab, query]);
326
- await new Promise((r) => setTimeout(r, TIMING.postType));
327
-
328
- // Submit with Enter (most reliable across locales and Chrome instances)
329
- await cdp([
330
- "eval",
331
- tab,
332
- `document.querySelector('${S.input}')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`,
333
- ]);
334
-
335
- // Post-submit: Bing's antibot sometimes appears AFTER the query is sent.
336
- // Fire-and-forget verification check — runs in parallel with stream wait.
337
- // Zero added latency to the critical path; if it finds and clicks the
338
- // challenge, the stream unblocks instead of timing out at 60s.
339
- setTimeout(() => {
340
- handleVerification(tab, cdp, 10000)
341
- .then((v) => {
342
- if (v === "clicked") {
343
- console.error("[bing] Post-submit verification clicked");
344
- env.verificationResult = "post-submit-clicked";
345
- }
346
- })
347
- .catch(() => {});
348
- }, 2000);
349
-
350
- // Wait for Bing Copilot's response to finish streaming before extracting.
351
- await waitForStreamComplete(tab, { timeout: 60000, minLength: 50 });
352
-
353
- const { answer, sources } = await extractAnswer(tab, env);
354
- if (!answer)
355
- throw new Error("No answer extracted — Copilot may not have responded");
356
-
357
- const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
358
- () => "",
359
- );
360
- env.durationMs = Date.now() - startTime;
361
- outputJson({
362
- query,
363
- url: finalUrl,
364
- answer: formatAnswer(answer, short),
365
- sources,
366
- _envelope: buildEnvelope(env),
367
- });
368
- } catch (e) {
369
- env.durationMs = Date.now() - startTime;
370
- handleError(e, buildEnvelope(env));
371
- }
372
- }
373
-
374
- main();
1
+ #!/usr/bin/env node
2
+
3
+ // extractors/bing-copilot.mjs
4
+ // Navigate copilot.microsoft.com, wait for answer to complete, return clean answer + sources.
5
+ //
6
+ // Usage:
7
+ // node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]
8
+ //
9
+ // Output (stdout): JSON { answer, sources, query, url }
10
+ // Errors go to stderr only — stdout is always clean JSON for piping.
11
+
12
+ import {
13
+ buildEnvelope,
14
+ cdp,
15
+ formatAnswer,
16
+ getOrOpenTab,
17
+ handleError,
18
+ injectClipboardInterceptor,
19
+ jitter,
20
+ outputJson,
21
+ parseArgs,
22
+ parseSourcesFromMarkdown,
23
+ prepareArgs,
24
+ TIMING,
25
+ validateQuery,
26
+ waitForCopyButton,
27
+ waitForSelector,
28
+ waitForStreamComplete,
29
+ } from "./common.mjs";
30
+ import {
31
+ detectVerificationChallenge,
32
+ dismissConsent,
33
+ handleVerification,
34
+ } from "./consent.mjs";
35
+ import { SELECTORS } from "./selectors.mjs";
36
+
37
+ const S = SELECTORS.bing;
38
+ const GLOBAL_VAR = "__bingClipboard";
39
+
40
+ // ============================================================================
41
+ // Bing Copilot-specific helpers
42
+ // ============================================================================
43
+
44
+ async function extractAnswer(tab, env, query = "") {
45
+ // In headless mode: snap the accessibility tree before spending ~18s on
46
+ // clipboard polls. Copilot loads its input fine in headless but renders
47
+ // responses behind a Cloudflare-protected iframe — detecting that here
48
+ // fast-fails to the visible retry instead of burning all the poll time.
49
+ if (process.env.GREEDY_SEARCH_HEADLESS === "1") {
50
+ const verification = await detectVerificationChallenge(tab, cdp);
51
+ if (verification) {
52
+ console.error(
53
+ "[bing] Verification challenge detected — fast-failing to visible retry",
54
+ );
55
+ env.blockedBy = "verification";
56
+ throw new Error("Verification challenge detected — headless blocked");
57
+ }
58
+ }
59
+
60
+ // Wait for the assistant copy button to exist. On fresh Copilot
61
+ // sessions the answer text can render before the button handler is
62
+ // fully hydrated. Wait for the button + a small hydration delay.
63
+ // 2s is enough — the CF snap check above ensures we only reach here
64
+ // on a clean response, where the button appears within ~1s.
65
+ await waitForCopyButton(tab, S.copyButton, { timeout: 2000 }).catch(
66
+ () => null,
67
+ );
68
+ // Give React time to hydrate the click handler on the button
69
+ await new Promise((r) => setTimeout(r, 800));
70
+
71
+ let answer = await clickCopyAndPollClipboard(tab, 5000);
72
+ let clipboardEmpty = !answer;
73
+
74
+ // Retry once if clipboard is empty (Copilot might be slow to wire the handler)
75
+ if (!answer) {
76
+ console.error("[bing] Clipboard empty, retrying copy/poll...");
77
+ answer = await clickCopyAndPollClipboard(tab, 8000);
78
+ clipboardEmpty = !answer;
79
+ }
80
+
81
+ // DOM fallback: visible Copilot can render a valid response while the copy
82
+ // action/clipboard interceptor remains empty. Extract the last assistant
83
+ // answer from page text before treating this as a headless/iframe block.
84
+ if (!answer) {
85
+ answer = await extractFromVisibleDom(tab, query);
86
+ if (answer) env.fallbackUsed = "visibleDom";
87
+ }
88
+
89
+ // Accessibility fallback: if Copilot visibly rendered an answer but the
90
+ // clipboard/DOM selectors missed it, the accessibility tree often still has
91
+ // the assistant article text. This prevents false "blocked" reports when a
92
+ // human can plainly see Bing answered in the browser.
93
+ if (!answer) {
94
+ answer = await extractFromAccessibilityTree(tab, query);
95
+ if (answer) env.fallbackUsed = "accessibilityTree";
96
+ }
97
+
98
+ // DOM fallback: if clipboard still empty, extract text directly from response DOM.
99
+ // This handles headless mode where Copilot renders the AI reply inside nested
100
+ // iframes (copilot.microsoft.com → copilot.fun → blob:…) and hides the copy button.
101
+ if (!answer) {
102
+ const iframeResult = await extractFromIframes(tab, env);
103
+ answer = iframeResult.answer;
104
+ if (answer) env.fallbackUsed = "iframeDom";
105
+ }
106
+
107
+ if (!answer) throw new Error("Clipboard interceptor returned empty text");
108
+
109
+ env.clipboardEmpty = clipboardEmpty;
110
+ const sources = parseSourcesFromMarkdown(answer);
111
+ return { answer: answer.trim(), sources };
112
+ }
113
+
114
+ async function clickCopyAndPollClipboard(tab, timeoutMs) {
115
+ await cdp([
116
+ "eval",
117
+ tab,
118
+ `(() => {
119
+ window.${GLOBAL_VAR} = '';
120
+ const buttons = document.querySelectorAll('${S.copyButton}');
121
+ buttons[buttons.length - 1]?.click();
122
+ })()`,
123
+ ]);
124
+
125
+ const deadline = Date.now() + timeoutMs;
126
+ while (Date.now() < deadline) {
127
+ const answer = await cdp(["eval", tab, `window.${GLOBAL_VAR} || ''`]).catch(
128
+ () => "",
129
+ );
130
+ if (answer) return answer;
131
+ await new Promise((r) => setTimeout(r, 300));
132
+ }
133
+ return "";
134
+ }
135
+
136
+ /**
137
+ * Visible-page DOM fallback. Copilot often exposes the completed assistant
138
+ * message in document.body.innerText even when the copy button/clipboard path
139
+ * fails. Keep this conservative: require a "Copilot said" marker and strip
140
+ * known composer/action text after the answer.
141
+ */
142
+ async function extractFromVisibleDom(tab, query = "") {
143
+ try {
144
+ const bodyText = await cdp([
145
+ "eval",
146
+ tab,
147
+ "document.body?.innerText || ''",
148
+ ]).catch(() => "");
149
+
150
+ let answer = "";
151
+ if (bodyText && bodyText.includes("Copilot said")) {
152
+ // safe linear extraction — no ReDoS-vulnerable regex split
153
+ const copilotSplit = bodyText.split(/Copilot said\s*/i);
154
+ const afterCopilot = copilotSplit.pop() || "";
155
+ answer = cleanCopilotArticleText(truncateAtBoilerplate(afterCopilot));
156
+ }
157
+
158
+ if (!answer) {
159
+ const articlesJson = await cdp([
160
+ "eval",
161
+ tab,
162
+ `JSON.stringify(Array.from(document.querySelectorAll('article')).map(a => a.innerText || '').filter(Boolean))`,
163
+ ]).catch(() => "[]");
164
+ const articles = JSON.parse(articlesJson || "[]");
165
+ answer = pickAnswerArticle(articles, query);
166
+ }
167
+
168
+ if (answer.length < 20) return "";
169
+ console.error(
170
+ `[bing] Visible DOM extraction succeeded (${answer.length} chars)`,
171
+ );
172
+ return answer;
173
+ } catch (e) {
174
+ console.error(`[bing] Visible DOM extraction failed: ${e.message}`);
175
+ return "";
176
+ }
177
+ }
178
+
179
+ async function extractFromAccessibilityTree(tab, query = "") {
180
+ try {
181
+ const snap = await cdp(["snap", tab]).catch(() => "");
182
+ if (!snap || (await detectVerificationChallenge(tab, cdp))) return "";
183
+
184
+ const articleLines = snap
185
+ .split("\n")
186
+ .map((line) => line.match(/^\s*\[article\]\s+(.+)$/i)?.[1])
187
+ .filter(Boolean);
188
+ if (articleLines.length === 0) return "";
189
+
190
+ const answer = pickAnswerArticle(articleLines, query);
191
+ if (answer.length < 50) return "";
192
+ console.error(
193
+ `[bing] Accessibility extraction succeeded (${answer.length} chars)`,
194
+ );
195
+ return answer;
196
+ } catch (e) {
197
+ console.error(`[bing] Accessibility extraction failed: ${e.message}`);
198
+ return "";
199
+ }
200
+ }
201
+
202
+ function pickAnswerArticle(articles, query = "") {
203
+ const normalizedQuery = normalizeForCompare(query);
204
+ const candidates = articles
205
+ .map((text) => cleanCopilotArticleText(text))
206
+ .filter((text) => text.length >= 50)
207
+ .filter((text) => {
208
+ if (!normalizedQuery) return true;
209
+ const normalizedText = normalizeForCompare(text);
210
+ return (
211
+ !normalizedText.includes(normalizedQuery) ||
212
+ text.length > query.length * 3
213
+ );
214
+ });
215
+ return candidates.at(-1) || "";
216
+ }
217
+
218
+ function normalizeForCompare(text = "") {
219
+ return String(text).toLocaleLowerCase().replace(/\s+/g, " ").trim();
220
+ }
221
+
222
+ /** Boilerplate markers that appear after Copilot answers — safe linear search, no ReDoS */
223
+ const BOILERPLATE_MARKERS = [
224
+ "Good response",
225
+ "Bad response",
226
+ "Share message",
227
+ "Copy message",
228
+ "Read aloud",
229
+ "Regenerate",
230
+ "Edit in a page",
231
+ "Message Copilot",
232
+ "Smart",
233
+ ];
234
+
235
+ /**
236
+ * Linear-time truncation at the first boilerplate marker preceded by whitespace
237
+ * and NOT followed by a word character (matches the intent of the original regex
238
+ * without catastrophic backtracking).
239
+ */
240
+ function truncateAtBoilerplate(text) {
241
+ let earliest = text.length;
242
+ for (const marker of BOILERPLATE_MARKERS) {
243
+ let searchFrom = 0;
244
+ while (searchFrom < text.length) {
245
+ const idx = text.indexOf(marker, searchFrom);
246
+ if (idx === -1) break;
247
+ // Preceding char must be whitespace (equivalent to \s+ in original)
248
+ const before = idx > 0 ? text[idx - 1] : "";
249
+ const precededByWhitespace = !before || /\s/.test(before);
250
+ // Negative lookahead equivalent: marker NOT followed by a word char
251
+ const after = text[idx + marker.length] || "";
252
+ const notFollowedByWord = !after || !/\w/.test(after);
253
+ if (precededByWhitespace && notFollowedByWord) {
254
+ if (idx < earliest) earliest = idx;
255
+ break;
256
+ }
257
+ searchFrom = idx + marker.length;
258
+ }
259
+ }
260
+ return earliest < text.length ? text.slice(0, earliest) : text;
261
+ }
262
+
263
+ function cleanCopilotArticleText(text = "") {
264
+ return truncateAtBoilerplate(String(text).replace(/\s+/g, " ")).trim();
265
+ }
266
+
267
+ /**
268
+ * DOM fallback: check if Copilot is blocked by Cloudflare in headless mode.
269
+ * When blocked, the copilot.fun iframe shows a challenge instead of the chat UI.
270
+ * Returns the extracted text or empty string on failure (caller falls through to error
271
+ * which triggers the visible Chrome auto-retry in search.mjs).
272
+ */
273
+ async function extractFromIframes(mainTab, env) {
274
+ try {
275
+ // Check if the AI copy button exists — if it does, we're in visible mode
276
+ // and clipboard should have worked. This is a different issue.
277
+ const hasCopyBtn = await cdp([
278
+ "eval",
279
+ mainTab,
280
+ `!!document.querySelector('${S.copyButton}')`,
281
+ ]).catch(() => "false");
282
+ if (hasCopyBtn === "true") return { answer: "" }; // not a headless/iframe issue
283
+
284
+ // Check for Cloudflare challenge in the accessibility tree.
285
+ // If present, Copilot content is blocked entirely — no DOM extraction possible.
286
+ if (await detectVerificationChallenge(mainTab, cdp)) {
287
+ console.error(
288
+ "[bing] Verification challenge detected — content blocked in headless",
289
+ );
290
+ env.blockedBy = "verification";
291
+ return { answer: "" }; // Let caller throw → triggers visible auto-retry
292
+ }
293
+
294
+ console.error(
295
+ "[bing] Copy button hidden, no Cloudflare — trying DOM extraction...",
296
+ );
297
+
298
+ // Get CDP targets to find the copilot.fun iframe
299
+ const targetsRaw = await cdp([
300
+ "evalraw",
301
+ mainTab,
302
+ "Target.getTargets",
303
+ "{}",
304
+ ]);
305
+ const targets = JSON.parse(targetsRaw);
306
+ const targetInfos = targets.targetInfos || [];
307
+ const funFrame = targetInfos.find(
308
+ (t) => t.type === "iframe" && t.url.includes("copilot.fun"),
309
+ );
310
+ if (!funFrame) {
311
+ console.error("[bing] No copilot.fun iframe target found");
312
+ return { answer: "" };
313
+ }
314
+
315
+ // Try to extract from the nested blob iframe (rarely succeeds due to Cloudflare)
316
+ const funTabId = funFrame.targetId.slice(0, 8);
317
+ const innerText = await cdp([
318
+ "eval",
319
+ funTabId,
320
+ `(()=>{const iframe=document.querySelector('iframe'); if(!iframe) return''; try{const doc=iframe.contentDocument||iframe.contentWindow.document; return doc?.body?.innerText?.trim()||''}catch(e){return''}})()`,
321
+ ]).catch(() => "");
322
+
323
+ if (innerText) {
324
+ console.error(
325
+ `[bing] DOM extraction succeeded (${innerText.length} chars)`,
326
+ );
327
+ return { answer: innerText };
328
+ }
329
+
330
+ console.error(
331
+ "[bing] DOM extraction returned empty — falling through to visible retry",
332
+ );
333
+ } catch (e) {
334
+ console.error(`[bing] DOM extraction failed: ${e.message}`);
335
+ }
336
+ return { answer: "" };
337
+ }
338
+
339
+ // ============================================================================
340
+ // Main
341
+ // ============================================================================
342
+
343
+ const USAGE =
344
+ 'Usage: node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]\n';
345
+
346
+ async function main() {
347
+ const args = await prepareArgs(process.argv.slice(2));
348
+ validateQuery(args, USAGE);
349
+
350
+ const { query, tabPrefix, short } = parseArgs(args);
351
+ const startTime = Date.now();
352
+ const mode =
353
+ process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless";
354
+
355
+ // Lightweight envelope — no extra CDP calls, just tracks what we already know
356
+ const env = {
357
+ engine: "bing",
358
+ mode,
359
+ clipboardEmpty: null,
360
+ fallbackUsed: null,
361
+ blockedBy: null,
362
+ verificationResult: null,
363
+ inputReady: null,
364
+ };
365
+
366
+ try {
367
+ // Only refresh page list when creating a fresh tab (no prefix provided)
368
+ if (!tabPrefix) await cdp(["list"]);
369
+ const tab = await getOrOpenTab(tabPrefix);
370
+
371
+ // Skip navigation if already on Copilot domain (tab was seeded by search.mjs)
372
+ const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
373
+ () => "",
374
+ );
375
+ let onCopilot = false;
376
+ try {
377
+ const host = new URL(currentUrl).hostname.toLowerCase();
378
+ onCopilot =
379
+ host === "copilot.microsoft.com" ||
380
+ host.endsWith(".copilot.microsoft.com");
381
+ } catch {}
382
+
383
+ if (!onCopilot) {
384
+ await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
385
+ await new Promise((r) => setTimeout(r, 600));
386
+ }
387
+ await dismissConsent(tab, cdp);
388
+
389
+ // Handle verification challenges (Cloudflare Turnstile, Microsoft auth, etc.)
390
+ const verifyResult = await handleVerification(tab, cdp, 10000);
391
+ env.verificationResult = verifyResult;
392
+ if (verifyResult === "needs-human") {
393
+ throw new Error(
394
+ "Copilot verification required — please solve it manually in the browser window",
395
+ );
396
+ }
397
+
398
+ // After verification, page may have redirected or reloaded — wait for it to settle
399
+ if (verifyResult === "clicked") {
400
+ await new Promise((r) => setTimeout(r, TIMING.afterVerify));
401
+
402
+ // Re-navigate if we got redirected
403
+ const currentUrl = await cdp([
404
+ "eval",
405
+ tab,
406
+ "document.location.href",
407
+ ]).catch(() => "");
408
+ let onCopilot = false;
409
+ try {
410
+ const host = new URL(currentUrl).hostname.toLowerCase();
411
+ onCopilot =
412
+ host === "copilot.microsoft.com" ||
413
+ host.endsWith(".copilot.microsoft.com");
414
+ } catch {}
415
+ if (!onCopilot) {
416
+ await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
417
+ await new Promise((r) => setTimeout(r, 600));
418
+ await dismissConsent(tab, cdp);
419
+ }
420
+ }
421
+
422
+ // Wait for React app to mount input (up to 15s, longer after verification)
423
+ const inputReady = await waitForSelector(tab, S.input, 15000, 500);
424
+ env.inputReady = inputReady;
425
+ await new Promise((r) => setTimeout(r, jitter(300)));
426
+
427
+ if (!inputReady) {
428
+ throw new Error(
429
+ "Copilot input not found — verification may have failed or page is in unexpected state",
430
+ );
431
+ }
432
+
433
+ await injectClipboardInterceptor(tab, GLOBAL_VAR);
434
+ await cdp(["click", tab, S.input]);
435
+ await new Promise((r) => setTimeout(r, TIMING.postClick));
436
+ await cdp(["type", tab, query]);
437
+ await new Promise((r) => setTimeout(r, TIMING.postType));
438
+
439
+ // Submit with Enter (most reliable across locales and Chrome instances)
440
+ await cdp([
441
+ "eval",
442
+ tab,
443
+ `document.querySelector('${S.input}')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`,
444
+ ]);
445
+
446
+ // Post-submit: Bing's antibot sometimes appears AFTER the query is sent.
447
+ // Fire-and-forget verification check — runs in parallel with stream wait.
448
+ // Zero added latency to the critical path; if it finds and clicks the
449
+ // challenge, the stream unblocks instead of timing out at 60s.
450
+ setTimeout(() => {
451
+ handleVerification(tab, cdp, 10000)
452
+ .then((v) => {
453
+ if (v === "clicked") {
454
+ console.error("[bing] Post-submit verification clicked");
455
+ env.verificationResult = "post-submit-clicked";
456
+ }
457
+ })
458
+ .catch(() => {});
459
+ }, 2000);
460
+
461
+ // Wait for Bing Copilot's response to finish streaming before extracting.
462
+ // In --short/fast mode, cap this below the parent 40s budget and extract
463
+ // whatever has rendered so research child searches stay fast.
464
+ await waitForStreamComplete(tab, {
465
+ timeout: short ? 25000 : 60000,
466
+ minLength: 50,
467
+ });
468
+
469
+ const { answer, sources } = await extractAnswer(tab, env, query);
470
+ if (!answer)
471
+ throw new Error("No answer extracted — Copilot may not have responded");
472
+
473
+ const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
474
+ () => "",
475
+ );
476
+ env.durationMs = Date.now() - startTime;
477
+ outputJson({
478
+ query,
479
+ url: finalUrl,
480
+ answer: formatAnswer(answer, short),
481
+ sources,
482
+ _envelope: buildEnvelope(env),
483
+ });
484
+ } catch (e) {
485
+ env.durationMs = Date.now() - startTime;
486
+ handleError(e, buildEnvelope(env));
487
+ }
488
+ }
489
+
490
+ main();