@apmantza/greedysearch-pi 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +11 -1
- package/bin/launch-visible.mjs +65 -0
- package/bin/launch.mjs +442 -417
- package/bin/search.mjs +757 -679
- package/extractors/bing-copilot.mjs +490 -374
- package/extractors/common.mjs +703 -596
- package/extractors/consent.mjs +421 -388
- package/extractors/selectors.mjs +55 -54
- package/index.ts +176 -177
- package/package.json +8 -3
- package/skills/greedy-search/skill.md +5 -19
- package/src/fetcher.mjs +666 -652
- package/src/formatters/synthesis.ts +1 -5
- package/src/search/output.mjs +23 -1
- package/src/search/research.mjs +1581 -0
- package/src/search/sources.mjs +488 -466
- package/src/search/synthesis-runner.mjs +52 -46
- package/src/tools/greedy-search-handler.ts +298 -124
- package/test.mjs +971 -534
|
@@ -1,374 +1,490 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// extractors/bing-copilot.mjs
|
|
4
|
-
// Navigate copilot.microsoft.com, wait for answer to complete, return clean answer + sources.
|
|
5
|
-
//
|
|
6
|
-
// Usage:
|
|
7
|
-
// node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]
|
|
8
|
-
//
|
|
9
|
-
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
-
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
-
|
|
12
|
-
import {
|
|
13
|
-
buildEnvelope,
|
|
14
|
-
cdp,
|
|
15
|
-
formatAnswer,
|
|
16
|
-
getOrOpenTab,
|
|
17
|
-
handleError,
|
|
18
|
-
injectClipboardInterceptor,
|
|
19
|
-
jitter,
|
|
20
|
-
outputJson,
|
|
21
|
-
parseArgs,
|
|
22
|
-
parseSourcesFromMarkdown,
|
|
23
|
-
prepareArgs,
|
|
24
|
-
TIMING,
|
|
25
|
-
validateQuery,
|
|
26
|
-
waitForCopyButton,
|
|
27
|
-
waitForSelector,
|
|
28
|
-
waitForStreamComplete,
|
|
29
|
-
} from "./common.mjs";
|
|
30
|
-
import {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
//
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
//
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
//
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
tab,
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
const
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
//
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// extractors/bing-copilot.mjs
|
|
4
|
+
// Navigate copilot.microsoft.com, wait for answer to complete, return clean answer + sources.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]
|
|
8
|
+
//
|
|
9
|
+
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
+
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
buildEnvelope,
|
|
14
|
+
cdp,
|
|
15
|
+
formatAnswer,
|
|
16
|
+
getOrOpenTab,
|
|
17
|
+
handleError,
|
|
18
|
+
injectClipboardInterceptor,
|
|
19
|
+
jitter,
|
|
20
|
+
outputJson,
|
|
21
|
+
parseArgs,
|
|
22
|
+
parseSourcesFromMarkdown,
|
|
23
|
+
prepareArgs,
|
|
24
|
+
TIMING,
|
|
25
|
+
validateQuery,
|
|
26
|
+
waitForCopyButton,
|
|
27
|
+
waitForSelector,
|
|
28
|
+
waitForStreamComplete,
|
|
29
|
+
} from "./common.mjs";
|
|
30
|
+
import {
|
|
31
|
+
detectVerificationChallenge,
|
|
32
|
+
dismissConsent,
|
|
33
|
+
handleVerification,
|
|
34
|
+
} from "./consent.mjs";
|
|
35
|
+
import { SELECTORS } from "./selectors.mjs";
|
|
36
|
+
|
|
37
|
+
// Bing Copilot entry of the shared selector table; this file reads `S.input`
// and `S.copyButton` from it.
const S = SELECTORS.bing;
// Name of the `window` property handed to injectClipboardInterceptor() and
// polled after the copy button is clicked.
const GLOBAL_VAR = "__bingClipboard";
|
|
39
|
+
|
|
40
|
+
// ============================================================================
|
|
41
|
+
// Bing Copilot-specific helpers
|
|
42
|
+
// ============================================================================
|
|
43
|
+
|
|
44
|
+
/**
 * Pull the finished Copilot answer out of `tab`.
 *
 * Strategies are tried in order and the first non-empty result wins:
 *   1. click the copy button and read the intercepted clipboard text (two tries)
 *   2. visible page text (`extractFromVisibleDom`)
 *   3. accessibility-tree article text (`extractFromAccessibilityTree`)
 *   4. nested iframe DOM (`extractFromIframes`)
 *
 * Mutates `env`: may set `blockedBy`, `clipboardEmpty` and `fallbackUsed`.
 *
 * @param {string} tab - CDP tab identifier.
 * @param {object} env - Mutable diagnostics envelope for this run.
 * @param {string} [query=""] - Original query, forwarded to the text fallbacks.
 * @returns {Promise<{answer: string, sources: *}>} Trimmed answer plus the
 *   sources parsed from its markdown.
 * @throws {Error} If a verification challenge is detected in headless mode,
 *   or if every strategy comes back empty.
 */
async function extractAnswer(tab, env, query = "") {
  // In headless mode: snap the accessibility tree before spending ~18s on
  // clipboard polls. Copilot loads its input fine in headless but renders
  // responses behind a Cloudflare-protected iframe — detecting that here
  // fast-fails to the visible retry instead of burning all the poll time.
  if (process.env.GREEDY_SEARCH_HEADLESS === "1") {
    const verification = await detectVerificationChallenge(tab, cdp);
    if (verification) {
      console.error(
        "[bing] Verification challenge detected — fast-failing to visible retry",
      );
      env.blockedBy = "verification";
      throw new Error("Verification challenge detected — headless blocked");
    }
  }

  // On fresh Copilot sessions the answer text can render before the copy
  // button handler is hydrated. Wait (best effort, max 2s) for the button,
  // then give React a short moment to wire the click handler.
  await waitForCopyButton(tab, S.copyButton, { timeout: 2000 }).catch(
    () => null,
  );
  await new Promise((r) => setTimeout(r, 800));

  let answer = await clickCopyAndPollClipboard(tab, 5000);

  // Retry once if clipboard is empty (Copilot might be slow to wire the handler)
  if (!answer) {
    console.error("[bing] Clipboard empty, retrying copy/poll...");
    answer = await clickCopyAndPollClipboard(tab, 8000);
  }

  // Record the clipboard outcome right away so it is also present in the
  // envelope when all fallbacks below fail and this function throws.
  env.clipboardEmpty = !answer;

  // DOM fallback: visible Copilot can render a valid response while the copy
  // action/clipboard interceptor remains empty. Extract the last assistant
  // answer from page text before treating this as a headless/iframe block.
  if (!answer) {
    answer = await extractFromVisibleDom(tab, query);
    if (answer) env.fallbackUsed = "visibleDom";
  }

  // Accessibility fallback: the accessibility tree often still has the
  // assistant article text when the clipboard/DOM paths missed it. This
  // prevents false "blocked" reports when Bing visibly answered.
  if (!answer) {
    answer = await extractFromAccessibilityTree(tab, query);
    if (answer) env.fallbackUsed = "accessibilityTree";
  }

  // Iframe fallback: headless Copilot renders the reply inside nested iframes
  // (copilot.microsoft.com → copilot.fun → blob:…) and hides the copy button.
  if (!answer) {
    const iframeResult = await extractFromIframes(tab, env);
    answer = iframeResult.answer;
    if (answer) env.fallbackUsed = "iframeDom";
  }

  // Message kept verbatim: callers may match on it to trigger the visible retry.
  if (!answer) throw new Error("Clipboard interceptor returned empty text");

  const sources = parseSourcesFromMarkdown(answer);
  return { answer: answer.trim(), sources };
}
|
|
113
|
+
|
|
114
|
+
/**
 * Clear the interceptor global, click the last copy button on the page, then
 * poll the global every 300 ms until it holds text or the time budget is spent.
 *
 * @param {string} tab - CDP tab identifier.
 * @param {number} timeoutMs - Maximum time to keep polling, in milliseconds.
 * @returns {Promise<string>} The captured text, or "" if nothing arrived in time.
 */
async function clickCopyAndPollClipboard(tab, timeoutMs) {
  const clickScript = `(() => {
    window.${GLOBAL_VAR} = '';
    const buttons = document.querySelectorAll('${S.copyButton}');
    buttons[buttons.length - 1]?.click();
  })()`;
  await cdp(["eval", tab, clickScript]);

  const readScript = `window.${GLOBAL_VAR} || ''`;
  const stopAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= stopAt) return "";
    // A failed eval is treated like "nothing captured yet".
    const captured = await cdp(["eval", tab, readScript]).catch(() => "");
    if (captured) return captured;
    await new Promise((resolve) => setTimeout(resolve, 300));
  }
}
|
|
135
|
+
|
|
136
|
+
/**
 * Fallback that reads the answer from the rendered page text.
 *
 * First choice: everything after the last "Copilot said" marker in
 * `document.body.innerText`, with trailing action/composer text removed.
 * Second choice: the best `<article>` text as chosen by pickAnswerArticle().
 * Results shorter than 20 characters are rejected.
 *
 * @param {string} tab - CDP tab identifier.
 * @param {string} [query=""] - Original query, forwarded to pickAnswerArticle().
 * @returns {Promise<string>} Extracted answer, or "" on any failure.
 */
async function extractFromVisibleDom(tab, query = "") {
  try {
    const pageText = await cdp([
      "eval",
      tab,
      "document.body?.innerText || ''",
    ]).catch(() => "");

    let extracted = "";
    const hasMarker = Boolean(pageText) && pageText.includes("Copilot said");
    if (hasMarker) {
      // Keep only what follows the final marker, i.e. the latest assistant turn.
      const pieces = pageText.split(/Copilot said\s*/i);
      const tail = pieces[pieces.length - 1] || "";
      extracted = cleanCopilotArticleText(truncateAtBoilerplate(tail));
    }

    if (!extracted) {
      const serialized = await cdp([
        "eval",
        tab,
        `JSON.stringify(Array.from(document.querySelectorAll('article')).map(a => a.innerText || '').filter(Boolean))`,
      ]).catch(() => "[]");
      const articleTexts = JSON.parse(serialized || "[]");
      extracted = pickAnswerArticle(articleTexts, query);
    }

    if (extracted.length < 20) {
      return "";
    }
    console.error(
      `[bing] Visible DOM extraction succeeded (${extracted.length} chars)`,
    );
    return extracted;
  } catch (e) {
    console.error(`[bing] Visible DOM extraction failed: ${e.message}`);
    return "";
  }
}
|
|
178
|
+
|
|
179
|
+
/**
 * Fallback that reads the answer from the accessibility snapshot of the tab.
 *
 * Collects the text of every `[article] …` line in the snapshot and lets
 * pickAnswerArticle() choose one. Gives up when the snapshot is empty, when a
 * verification challenge is detected, or when the result is under 50 chars.
 *
 * @param {string} tab - CDP tab identifier.
 * @param {string} [query=""] - Original query, forwarded to pickAnswerArticle().
 * @returns {Promise<string>} Extracted answer, or "" on any failure.
 */
async function extractFromAccessibilityTree(tab, query = "") {
  try {
    const snapshot = await cdp(["snap", tab]).catch(() => "");
    if (!snapshot) return "";
    if (await detectVerificationChallenge(tab, cdp)) return "";

    const articleTexts = [];
    for (const line of snapshot.split("\n")) {
      const captured = line.match(/^\s*\[article\]\s+(.+)$/i)?.[1];
      if (captured) articleTexts.push(captured);
    }
    if (articleTexts.length === 0) return "";

    const picked = pickAnswerArticle(articleTexts, query);
    if (picked.length < 50) return "";

    console.error(
      `[bing] Accessibility extraction succeeded (${picked.length} chars)`,
    );
    return picked;
  } catch (e) {
    console.error(`[bing] Accessibility extraction failed: ${e.message}`);
    return "";
  }
}
|
|
201
|
+
|
|
202
|
+
/**
 * Choose the assistant answer from a list of article texts.
 *
 * Each text is cleaned, then discarded if it is under 50 characters or if it
 * contains the query while being no longer than three times the query length
 * (such a text is most likely the user's own message). The last survivor wins.
 *
 * @param {string[]} articles - Raw article texts, in page order.
 * @param {string} [query=""] - Original query; "" disables the echo filter.
 * @returns {string} The chosen cleaned text, or "" when nothing qualifies.
 */
function pickAnswerArticle(articles, query = "") {
  const queryKey = normalizeForCompare(query);
  let chosen = "";

  for (const raw of articles) {
    const cleaned = cleanCopilotArticleText(raw);
    if (cleaned.length < 50) continue;

    if (queryKey) {
      const echoesQuery = normalizeForCompare(cleaned).includes(queryKey);
      if (echoesQuery && cleaned.length <= query.length * 3) continue;
    }

    // Later articles overwrite earlier ones: the final match is returned.
    chosen = cleaned;
  }

  return chosen;
}
|
|
217
|
+
|
|
218
|
+
/**
 * Canonical form used for loose text comparison: stringified, lower-cased,
 * whitespace runs collapsed to a single space, and trimmed.
 *
 * @param {*} [text=""] - Value to normalize.
 * @returns {string} Normalized text.
 */
function normalizeForCompare(text = "") {
  const lowered = String(text).toLocaleLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
221
|
+
|
|
222
|
+
/** UI strings that follow a Copilot answer (action buttons, composer text). */
const BOILERPLATE_MARKERS = [
  "Good response",
  "Bad response",
  "Share message",
  "Copy message",
  "Read aloud",
  "Regenerate",
  "Edit in a page",
  "Message Copilot",
  "Smart",
];

/**
 * Cut `text` at the earliest boilerplate marker that stands on its own.
 *
 * A marker occurrence counts when it is at the very start of the text or
 * directly after a whitespace character, and is at the end of the text or
 * followed by a non-word character. Matching uses plain `indexOf` scans only.
 *
 * @param {string} text - Text that may end in UI boilerplate.
 * @returns {string} Text up to (not including) the earliest qualifying marker,
 *   or the input unchanged when no marker qualifies.
 */
function truncateAtBoilerplate(text) {
  const standsAlone = (start, marker) => {
    const prev = start > 0 ? text[start - 1] : "";
    const next = text[start + marker.length] || "";
    const leadOk = prev === "" || /\s/.test(prev);
    const trailOk = next === "" || !/\w/.test(next);
    return leadOk && trailOk;
  };

  let cut = text.length;
  for (const marker of BOILERPLATE_MARKERS) {
    for (
      let at = text.indexOf(marker);
      at !== -1;
      at = text.indexOf(marker, at + marker.length)
    ) {
      if (standsAlone(at, marker)) {
        // First qualifying hit is the earliest one for this marker.
        cut = Math.min(cut, at);
        break;
      }
    }
  }

  return cut < text.length ? text.slice(0, cut) : text;
}
|
|
262
|
+
|
|
263
|
+
/**
 * Normalize a raw article/page text into a clean answer candidate: collapse
 * whitespace runs to single spaces, drop trailing UI boilerplate, then trim.
 *
 * @param {*} [text=""] - Raw text.
 * @returns {string} Cleaned text.
 */
function cleanCopilotArticleText(text = "") {
  const singleSpaced = String(text).replace(/\s+/g, " ");
  const withoutTrailer = truncateAtBoilerplate(singleSpaced);
  return withoutTrailer.trim();
}
|
|
266
|
+
|
|
267
|
+
/**
 * Last-resort fallback for the case where the copy button is absent: read the
 * reply text from the iframe nested inside the `copilot.fun` iframe target.
 *
 * Bails out with an empty answer when the copy button is present (then the
 * problem is not the iframe layout), when a verification challenge is
 * detected (also sets `env.blockedBy = "verification"`), when no
 * `copilot.fun` iframe target exists, or when anything throws.
 *
 * @param {string} mainTab - CDP tab identifier of the top-level Copilot page.
 * @param {object} env - Mutable diagnostics envelope.
 * @returns {Promise<{answer: string}>} `answer` is "" on failure; an empty
 *   answer makes the caller throw (per the original note, that triggers the
 *   visible-Chrome retry in search.mjs).
 */
async function extractFromIframes(mainTab, env) {
  const nothing = () => ({ answer: "" });

  try {
    // A present copy button means the clipboard path should have worked;
    // whatever went wrong is not a headless/iframe issue.
    const copyButtonPresent = await cdp([
      "eval",
      mainTab,
      `!!document.querySelector('${S.copyButton}')`,
    ]).catch(() => "false");
    if (copyButtonPresent === "true") return nothing();

    // With a challenge on screen the Copilot content is blocked entirely.
    const challenged = await detectVerificationChallenge(mainTab, cdp);
    if (challenged) {
      console.error(
        "[bing] Verification challenge detected — content blocked in headless",
      );
      env.blockedBy = "verification";
      return nothing();
    }

    console.error(
      "[bing] Copy button hidden, no Cloudflare — trying DOM extraction...",
    );

    // Locate the copilot.fun iframe among the browser's CDP targets.
    const rawTargets = await cdp([
      "evalraw",
      mainTab,
      "Target.getTargets",
      "{}",
    ]);
    const infos = JSON.parse(rawTargets).targetInfos || [];
    const copilotFrame = infos.find(
      (info) => info.type === "iframe" && info.url.includes("copilot.fun"),
    );
    if (!copilotFrame) {
      console.error("[bing] No copilot.fun iframe target found");
      return nothing();
    }

    // Address the frame by the first 8 chars of its target id and read the
    // body text of its first nested iframe (rarely succeeds due to Cloudflare).
    const frameTabId = copilotFrame.targetId.slice(0, 8);
    const nestedText = await cdp([
      "eval",
      frameTabId,
      `(()=>{const iframe=document.querySelector('iframe'); if(!iframe) return''; try{const doc=iframe.contentDocument||iframe.contentWindow.document; return doc?.body?.innerText?.trim()||''}catch(e){return''}})()`,
    ]).catch(() => "");

    if (!nestedText) {
      console.error(
        "[bing] DOM extraction returned empty — falling through to visible retry",
      );
      return nothing();
    }

    console.error(
      `[bing] DOM extraction succeeded (${nestedText.length} chars)`,
    );
    return { answer: nestedText };
  } catch (e) {
    console.error(`[bing] DOM extraction failed: ${e.message}`);
    return nothing();
  }
}
|
|
338
|
+
|
|
339
|
+
// ============================================================================
|
|
340
|
+
// Main
|
|
341
|
+
// ============================================================================
|
|
342
|
+
|
|
343
|
+
/** Usage text handed to validateQuery(). */
const USAGE =
  'Usage: node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]\n';

/**
 * Tell whether `url` points at copilot.microsoft.com or one of its subdomains.
 *
 * @param {string} url - Absolute URL; anything unparseable yields false.
 * @returns {boolean}
 */
function isCopilotUrl(url) {
  try {
    const host = new URL(url).hostname.toLowerCase();
    return (
      host === "copilot.microsoft.com" ||
      host.endsWith(".copilot.microsoft.com")
    );
  } catch {
    return false;
  }
}

/**
 * Navigate `tab` to the Copilot home page unless it is already on the Copilot
 * domain (e.g. the tab was seeded by search.mjs).
 *
 * @param {string} tab - CDP tab identifier.
 * @returns {Promise<boolean>} true when a navigation was performed.
 */
async function ensureOnCopilot(tab) {
  const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
    () => "",
  );
  if (isCopilotUrl(currentUrl)) return false;

  await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
  await new Promise((r) => setTimeout(r, 600));
  return true;
}

/**
 * Script entry point: submit the query to Copilot in a browser tab, wait for
 * the reply to finish streaming, and print `{ query, url, answer, sources,
 * _envelope }` via outputJson(). Any failure inside the run is passed to
 * handleError() together with the diagnostics envelope.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short } = parseArgs(args);
  const startTime = Date.now();
  const mode =
    process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless";

  // Lightweight envelope — no extra CDP calls, just tracks what we already know
  const env = {
    engine: "bing",
    mode,
    clipboardEmpty: null,
    fallbackUsed: null,
    blockedBy: null,
    verificationResult: null,
    inputReady: null,
  };

  try {
    // Only refresh page list when creating a fresh tab (no prefix provided)
    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    await ensureOnCopilot(tab);
    await dismissConsent(tab, cdp);

    // Handle verification challenges (Cloudflare Turnstile, Microsoft auth, etc.)
    const verifyResult = await handleVerification(tab, cdp, 10000);
    env.verificationResult = verifyResult;
    if (verifyResult === "needs-human") {
      throw new Error(
        "Copilot verification required — please solve it manually in the browser window",
      );
    }

    // After verification, page may have redirected or reloaded — wait for it
    // to settle, then go back to Copilot if we ended up somewhere else.
    if (verifyResult === "clicked") {
      await new Promise((r) => setTimeout(r, TIMING.afterVerify));
      if (await ensureOnCopilot(tab)) {
        await dismissConsent(tab, cdp);
      }
    }

    // Wait for React app to mount input (up to 15s, longer after verification)
    const inputReady = await waitForSelector(tab, S.input, 15000, 500);
    env.inputReady = inputReady;
    await new Promise((r) => setTimeout(r, jitter(300)));

    if (!inputReady) {
      throw new Error(
        "Copilot input not found — verification may have failed or page is in unexpected state",
      );
    }

    await injectClipboardInterceptor(tab, GLOBAL_VAR);
    await cdp(["click", tab, S.input]);
    await new Promise((r) => setTimeout(r, TIMING.postClick));
    await cdp(["type", tab, query]);
    await new Promise((r) => setTimeout(r, TIMING.postType));

    // Submit with Enter (most reliable across locales and Chrome instances)
    await cdp([
      "eval",
      tab,
      `document.querySelector('${S.input}')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`,
    ]);

    // Post-submit: Bing's antibot sometimes appears AFTER the query is sent.
    // Deliberately fire-and-forget — runs in parallel with the stream wait so
    // it adds no latency; if it finds and clicks the challenge, the stream
    // unblocks instead of timing out. Failures here are intentionally ignored.
    setTimeout(() => {
      handleVerification(tab, cdp, 10000)
        .then((v) => {
          if (v === "clicked") {
            console.error("[bing] Post-submit verification clicked");
            env.verificationResult = "post-submit-clicked";
          }
        })
        .catch(() => {});
    }, 2000);

    // Wait for Bing Copilot's response to finish streaming before extracting.
    // In --short/fast mode, cap this below the parent 40s budget and extract
    // whatever has rendered so research child searches stay fast.
    await waitForStreamComplete(tab, {
      timeout: short ? 25000 : 60000,
      minLength: 50,
    });

    const { answer, sources } = await extractAnswer(tab, env, query);
    if (!answer)
      throw new Error("No answer extracted — Copilot may not have responded");

    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    env.durationMs = Date.now() - startTime;
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
      _envelope: buildEnvelope(env),
    });
  } catch (e) {
    env.durationMs = Date.now() - startTime;
    handleError(e, buildEnvelope(env));
  }
}
|
|
489
|
+
|
|
490
|
+
// Run the extractor immediately when this module is loaded.
main();
|