@apmantza/greedysearch-pi 1.8.9 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,539 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/bing-aria.mjs — ARIA-tree-based Bing Copilot extractor
4
- //
5
- // Instead of copy button → clipboard polling → DOM fallback → iframe spelunking,
6
- // this extractor builds an ARIA accessibility tree of the page, finds the
7
- // Copilot answer region, and extracts structured text + sources directly.
8
- //
9
- // Inspiration: browser-use-rs extract_dom.js (Playwright's ariaSnapshot)
10
- //
11
- // Usage:
12
- // node extractors/bing-aria.mjs "<query>" [--tab <prefix>]
13
- //
14
- // Output (stdout): JSON { answer, sources, query, url }
15
- // Errors to stderr only — stdout is always clean JSON for piping.
16
-
17
- import {
18
- cdp,
19
- formatAnswer,
20
- getOrOpenTab,
21
- handleError,
22
- injectClipboardInterceptor,
23
- jitter,
24
- outputJson,
25
- parseArgs,
26
- parseSourcesFromMarkdown,
27
- prepareArgs,
28
- TIMING,
29
- validateQuery,
30
- waitForSelector,
31
- waitForStreamComplete,
32
- } from "./common.mjs";
33
- import { dismissConsent, handleVerification } from "./consent.mjs";
34
- import { SELECTORS } from "./selectors.mjs";
35
-
36
// Selector table for the Bing/Copilot extractor; this file reads S.input and
// S.copyButton from it.
const S = SELECTORS.bing;
37
-
38
- // ============================================================================
39
- // ARIA-tree answer extraction
40
- // ============================================================================
41
-
42
/**
 * Browser-side extraction script, passed as source text to the page via CDP eval.
 *
 * The script is an async IIFE that resolves to a JSON string of the shape
 * { answer, sources } on success or { error, answer: '', sources: [] } on failure.
 *
 * Authoring rules for the template body (it is a String.raw template literal):
 *   - it must not contain a literal backtick (use \x60) or a dollar-brace sequence,
 *     otherwise the template would end early or start interpolating;
 *   - escapes such as \n and \s are NOT processed here; they reach the page verbatim
 *     and are interpreted there as ordinary string / regex escapes.
 */
const EXTRACT_ARIA_JS = String.raw`
(async function() {
  'use strict';

  // ── visibility helpers ──────────────────────────

  // True when the element (and therefore its subtree) must be left out of the tree.
  function isHidden(el) {
    if (['STYLE','SCRIPT','NOSCRIPT','TEMPLATE'].includes(el.tagName)) return true;
    const s = window.getComputedStyle(el);
    if (s.visibility !== 'visible') return true;
    if (s.display === 'none') return true;
    if (el.getAttribute('aria-hidden') === 'true') return true;
    return false;
  }

  // True when the element has a non-empty layout box.
  function isVisible(el) {
    const r = el.getBoundingClientRect();
    return r.width > 0 && r.height > 0;
  }

  // Explicit role attribute (first token) or a role derived from the tag name.
  // Unknown tags map to 'generic'.
  function getRole(el) {
    const explicit = el.getAttribute('role');
    if (explicit) return explicit.split(' ')[0];
    const tag = el.tagName;
    const map = {
      BUTTON:'button', A: el.hasAttribute('href')?'link':null,
      INPUT: (()=>{ const t=(el.type||'text').toLowerCase(); return {button:'button',checkbox:'checkbox',radio:'radio',range:'slider',search:'searchbox',text:'textbox',email:'textbox',tel:'textbox',url:'textbox',number:'spinbutton'}[t]||'textbox'; })(),
      TEXTAREA:'textbox', SELECT: el.hasAttribute('multiple')||el.size>1?'listbox':'combobox',
      H1:'heading',H2:'heading',H3:'heading',H4:'heading',H5:'heading',H6:'heading',
      IMG: el.getAttribute('alt')===''?'presentation':'img',
      NAV:'navigation', MAIN:'main', ARTICLE:'article',
      HEADER:'banner', FOOTER:'contentinfo', ASIDE:'complementary',
      FORM:'form', TABLE:'table', UL:'list', OL:'list', LI:'listitem',
      P:'paragraph', DIALOG:'dialog', IFRAME:'iframe'
    };
    return map[tag] || 'generic';
  }

  // Accessible-name lookup: aria-label, aria-labelledby, form labels/placeholder,
  // img alt, then title. Returns '' when nothing applies.
  function getName(el) {
    const label = el.getAttribute('aria-label');
    if (label) return label;
    const labelledBy = el.getAttribute('aria-labelledby');
    if (labelledBy) {
      const texts = labelledBy.split(/\s+/).map(id => {
        const e = document.getElementById(id);
        return e ? e.textContent : '';
      }).filter(Boolean);
      if (texts.length) return texts.join(' ');
    }
    if (['INPUT','TEXTAREA','SELECT'].includes(el.tagName)) {
      const id = el.id;
      if (id) {
        const lbl = document.querySelector('label[for="'+id+'"]');
        if (lbl) return lbl.textContent || '';
      }
      const parentLbl = el.closest('label');
      if (parentLbl) return parentLbl.textContent || '';
      const ph = el.getAttribute('placeholder');
      if (ph) return ph;
    }
    if (el.tagName === 'IMG') return el.getAttribute('alt') || '';
    const title = el.getAttribute('title');
    if (title) return title;
    return '';
  }

  // ── Build ARIA tree ──────────────────────────
  let indexCounter = 0;

  // Converts a DOM subtree into a plain tree: text nodes become strings, elements
  // become { role, name, children, visible, cursor, tag, [index, href, id] }.
  // Hidden and presentational elements are dropped together with their subtree.
  // The visited set guards against revisiting nodes reached twice (aria-owns).
  function buildTree(node, visited = new Set()) {
    if (visited.has(node)) return null;
    visited.add(node);

    if (node.nodeType === 3) { // text node
      return node.nodeValue;
    }
    if (node.nodeType !== 1) return null;

    const el = node;
    if (isHidden(el)) return null;

    const role = getRole(el);
    if (!role || role === 'presentation' || role === 'none') return null;

    const name = (getName(el) || '').replace(/\s+/g, ' ').trim();
    const visible = isVisible(el);
    const cursor = window.getComputedStyle(el).cursor;

    const result = {
      role,
      name,
      children: [],
      visible,
      cursor,
      tag: el.tagName,
    };

    // index visible interactive + pointer-cursor elements
    if (visible && (cursor === 'pointer' || ['button','link','textbox','searchbox',
        'checkbox','radio','combobox','listbox','option','menuitem',
        'slider','spinbutton','switch','tab','heading'].includes(role))) {
      result.index = indexCounter++;
      if (el.tagName === 'A' && el.href) result.href = el.href;
      if (el.id) result.id = el.id;
    }

    // shadow DOM
    if (el.shadowRoot) {
      for (let c = el.shadowRoot.firstChild; c; c = c.nextSibling) {
        const child = buildTree(c, visited);
        if (child) result.children.push(child);
      }
    }

    // regular children (slotted nodes are skipped here)
    for (let c = el.firstChild; c; c = c.nextSibling) {
      if (c.assignedSlot) continue;
      const child = buildTree(c, visited);
      if (child) result.children.push(child);
    }

    // aria-owns
    if (el.hasAttribute('aria-owns')) {
      for (const id of el.getAttribute('aria-owns').split(/\s+/)) {
        const owned = document.getElementById(id);
        if (owned) {
          const child = buildTree(owned, visited);
          if (child) result.children.push(child);
        }
      }
    }

    return result;
  }

  // ── Find answer region ──────────────────────────
  // Locale-agnostic: takes the LAST element in DOM order that matches one of the
  // message-like class patterns and carries more than 100 characters of text.
  // Falls back to the last role=region/article (or ac-container) element with
  // substantial text, and finally to document.body.
  function findAnswerRegion() {
    const allDivs = document.querySelectorAll('div[class*="ai-message"], div[class*="response-content"], div[class*="message"]');
    let best = null;
    for (const el of allDivs) {
      const text = (el.innerText || '');
      if (text.length > 100) best = el; // take last one with substantial text
    }
    if (best) return best;

    const containers = [];
    (function walk(el) {
      if (!el || el.nodeType !== 1) return;
      const role = el.getAttribute('role');
      const cls = (el.className || '').toString();
      if (role === 'region' || role === 'article' ||
          cls.includes('ac-container')) {
        const text = (el.innerText || '');
        if (text.length > 100) containers.push(el);
      }
      for (const c of el.children) walk(c);
    })(document.body);
    if (containers.length > 0) return containers[containers.length - 1];

    return document.body;
  }

  // ── Extract text from tree ──────────────────────

  // Concatenated text of a subtree with no markdown decoration; buttons skipped.
  // Used for code content and link titles, where nested markup must not be re-rendered.
  function plainText(node) {
    if (typeof node === 'string') return node;
    if (!node || node.role === 'button') return '';
    return node.children.map(plainText).join('');
  }

  // Wraps text in a fenced code block (the fence is three \x60 characters).
  function fenced(text) {
    return '\n\x60\x60\x60\n' + text + '\n\x60\x60\x60\n';
  }

  // Renders a tree node as markdown-ish text. Block elements get newlines.
  // Locale-agnostic: filters by role/structure only, never by text content.
  function extractText(node) {
    if (typeof node === 'string') return node;
    if (!node) return '';

    // Skip UI buttons entirely
    if (node.role === 'button') return '';

    const parts = [];

    // heading → markdown heading with surrounding newlines (block)
    if (node.role === 'heading') {
      const level = Number.parseInt(node.tag?.[1], 10) || 2;
      const inner = node.children.map(c => extractText(c)).join('');
      if (inner.trim()) parts.push('\n' + '#'.repeat(level) + ' ' + inner.trim() + '\n');
    }

    // link → markdown link when the target is http(s), plain text otherwise
    else if (node.role === 'link' && node.href) {
      const text = node.children.map(c => extractText(c)).join('').trim();
      if (text && node.href.startsWith('http')) {
        parts.push('[' + text + '](' + node.href + ')');
      } else if (text) {
        parts.push(text);
      }
    }

    // listitem → preserve structure (block)
    else if (node.role === 'listitem') {
      const text = node.children.map(c => extractText(c)).join('').trim();
      if (text) parts.push('\n- ' + text);
    }

    // PRE → exactly one fenced block. The content is taken as plain text so a
    // nested CODE element does not add a second pair of fences.
    else if (node.tag === 'PRE') {
      const text = plainText(node).replace(/^\n+|\s+$/g, '');
      if (text) parts.push(fenced(text));
    }

    // CODE outside PRE → inline code span; fenced only when it spans several lines
    else if (node.tag === 'CODE') {
      const text = plainText(node).trim();
      if (text) parts.push(text.includes('\n') ? fenced(text) : '\x60' + text + '\x60');
    }

    // paragraph — block level, newlines around
    else if (node.role === 'paragraph') {
      const text = node.children.map(c => extractText(c)).join('').trim();
      if (text) parts.push('\n' + text + '\n');
    }

    // generic/inline — flow text, join tight (whitespace already in text nodes)
    else {
      for (const child of node.children) {
        parts.push(extractText(child));
      }
    }

    return parts.join('');
  }

  // ── Deduplicate lines ──────────────────────────
  // Removes repeated prose lines (duplicate DOM for responsive variants) and lone
  // bullet/dash lines. Fenced code blocks are handled as a unit: lines inside a
  // fence are never deduplicated individually (repeated code lines and the fences
  // themselves are legitimate), but a block identical to an earlier one is dropped.
  function dedupeLines(text) {
    const FENCE = '\x60\x60\x60';
    const seenLines = new Set();
    const seenBlocks = new Set();
    const out = [];
    let fenceBuf = null; // lines of the currently open fenced block, or null

    for (const line of text.split('\n')) {
      const normalized = line.trim();

      if (fenceBuf) {
        fenceBuf.push(line);
        if (normalized === FENCE) {
          const key = fenceBuf.join('\n');
          if (!seenBlocks.has(key)) {
            seenBlocks.add(key);
            out.push(...fenceBuf);
          }
          fenceBuf = null;
        }
        continue;
      }

      if (normalized === FENCE) { fenceBuf = [line]; continue; }
      if (!normalized) { out.push(''); continue; }
      if (normalized.length <= 2 && /^[-–—•·]$/.test(normalized)) continue;
      if (seenLines.has(normalized)) continue;
      seenLines.add(normalized);
      out.push(line);
    }

    // Unterminated fence: keep what was collected rather than losing content.
    if (fenceBuf) out.push(...fenceBuf);
    return out.join('\n');
  }

  // ── Collect sources ──────────────────────────
  // Gathers external http(s) links from the tree, skipping Copilot/Bing/privacy
  // links, deduplicated by URL. The title is the accessible name, falling back to
  // the link's own text when the element has no aria-label/title.
  function collectLinks(node) {
    const links = [];
    function walk(n) {
      if (typeof n === 'string') return;
      if (!n) return;
      if (n.role === 'link' && n.href && n.href.startsWith('http') &&
          !n.href.includes('copilot.microsoft.com') &&
          !n.href.includes('bing.com') &&
          !n.href.includes('microsoft.com/privacy')) {
        const title = n.name || plainText(n).replace(/\s+/g, ' ').trim();
        links.push({ title, url: n.href });
      }
      for (const c of n.children) walk(c);
    }
    walk(node);
    const seen = new Set();
    return links.filter(l => { if (seen.has(l.url)) return false; seen.add(l.url); return true; });
  }

  // ── Execute ──────────────────────────────────
  try {
    // The stream may be reported complete before the message is painted, so give
    // the page a moment and then poll (up to 8s) for a region with enough text.
    await new Promise(r => setTimeout(r, 400));

    const deadline = Date.now() + 8000;
    let answerEl = null;
    while (Date.now() < deadline) {
      const candidate = findAnswerRegion();
      if (candidate && (candidate.innerText || '').length > 200) {
        answerEl = candidate;
        break;
      }
      await new Promise(r => setTimeout(r, 500));
    }

    if (!answerEl) {
      return JSON.stringify({ error: 'No answer region found (content too short or not rendered)', answer: '', sources: [] });
    }

    const tree = buildTree(answerEl);
    if (!tree) {
      return JSON.stringify({ error: 'ARIA tree build failed', answer: '', sources: [] });
    }

    // Structural normalization only (locale-agnostic).
    let clean = extractText(tree)
      .replace(/\n{3,}/g, '\n\n')
      .trim();

    // Strip a leading heading (the "X said" label); checks markdown syntax only.
    clean = clean.replace(/^#{1,6}\s+.+?\n\n/, '');

    clean = dedupeLines(clean).replace(/\n{3,}/g, '\n\n').trim();

    const sources = collectLinks(tree).slice(0, 10);

    return JSON.stringify({ answer: clean, sources });
  } catch (e) {
    return JSON.stringify({ error: e.toString(), answer: '', sources: [] });
  }
})()
`;
355
-
356
/**
 * Runs the ARIA extraction script in the tab and merges its link sources with
 * the sources parsed from the copy-button clipboard markdown.
 *
 * @param {string} tab - tab identifier passed through to cdp().
 * @returns {Promise<{answer: string, sources: Array<{title: string, url: string}>}>}
 *   trimmed answer text and at most 10 sources, unique by url (ARIA sources first).
 * @throws {Error} when the script result is not valid JSON, reports an error,
 *   or carries an answer shorter than 10 characters.
 */
async function extractAnswer(tab) {
  console.error("[bing-aria] Extracting answer via ARIA tree...");

  const resultRaw = await cdp(["eval", tab, EXTRACT_ARIA_JS], 45000);

  let result;
  try {
    result = JSON.parse(resultRaw);
  } catch (cause) {
    // String() keeps this message usable even when cdp() yields a non-string.
    throw new Error(
      `ARIA extraction returned invalid JSON: ${String(resultRaw).slice(0, 200)}`,
      { cause },
    );
  }

  // JSON.parse accepts "null" and bare scalars; treat those as malformed output.
  if (result === null || typeof result !== "object") {
    throw new Error(
      `ARIA extraction returned invalid JSON: ${String(resultRaw).slice(0, 200)}`,
    );
  }

  if (result.error) {
    throw new Error(`ARIA extraction failed: ${result.error}`);
  }

  const { answer } = result;
  const ariaSources = Array.isArray(result.sources) ? result.sources : [];

  if (typeof answer !== "string" || answer.length < 10) {
    throw new Error(
      `ARIA extraction returned insufficient content (${answer?.length || 0} chars)`,
    );
  }

  // Hybrid: the answer text comes from the ARIA tree; the copy button is clicked
  // only to harvest markdown sources. This is best-effort: grabClipboardSources
  // returns [] when the click or clipboard read does not work.
  const GLOBAL_VAR = "__bingAriaClipboard";
  await injectClipboardInterceptor(tab, GLOBAL_VAR);
  const clipSources = await grabClipboardSources(tab, GLOBAL_VAR);
  console.error(`[bing-aria] Clipboard sources: ${clipSources.length}`);

  // Merge ARIA DOM sources with clipboard sources, first occurrence of a url wins.
  const seenUrls = new Set();
  const allSources = [];
  for (const source of [...ariaSources, ...clipSources]) {
    if (seenUrls.has(source.url)) continue;
    seenUrls.add(source.url);
    allSources.push(source);
    if (allSources.length === 10) break;
  }

  console.error(
    `[bing-aria] Extracted ${answer.length} chars, ${allSources.length} sources`,
  );
  return { answer: answer.trim(), sources: allSources };
}
399
-
400
/**
 * Clicks the last element matching S.copyButton, polls the intercepted clipboard
 * value for up to 2 seconds, and parses the [title](url) sources out of it.
 * Best-effort: any failure is logged to stderr and yields an empty list.
 *
 * @param {string} tab - tab identifier passed through to cdp().
 * @param {string} globalVar - name of the window property the clipboard
 *   interceptor writes the copied text to.
 * @returns {Promise<Array>} result of parseSourcesFromMarkdown, or [] when no
 *   clipboard text longer than 20 characters shows up in time.
 */
async function grabClipboardSources(tab, globalVar) {
  // JSON.stringify yields valid JS string literals, so quotes or backslashes in
  // the selector or property name cannot break the evaluated source.
  const selectorLiteral = JSON.stringify(S.copyButton);
  const slot = `window[${JSON.stringify(globalVar)}]`;

  try {
    // Reset the slot, then click the last copy button (most recent AI message).
    await cdp([
      "eval",
      tab,
      `(() => {
        ${slot} = '';
        const buttons = document.querySelectorAll(${selectorLiteral});
        buttons[buttons.length - 1]?.click();
      })()`,
    ]);

    // Poll clipboard briefly (2s max — if it doesn't work, no big deal)
    const deadline = Date.now() + 2000;
    while (Date.now() < deadline) {
      const text = await cdp(["eval", tab, `${slot} || ''`]).catch(() => "");
      if (text && text.length > 20) {
        return parseSourcesFromMarkdown(text);
      }
      await new Promise((r) => setTimeout(r, 200));
    }
  } catch (e) {
    console.error(`[bing-aria] Clipboard source grab failed: ${e.message}`);
  }
  return [];
}
434
-
435
- // ============================================================================
436
- // Main
437
- // ============================================================================
438
-
439
/** Usage banner handed to validateQuery() together with the parsed arguments. */
const USAGE =
  'Usage: node extractors/bing-aria.mjs "<query>" [--tab <prefix>]\n';
441
-
442
/** Resolves after roughly ms milliseconds. */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * True when url's hostname is copilot.microsoft.com or one of its subdomains.
 * Unparsable input (including the empty string) counts as "not on Copilot".
 */
function isCopilotUrl(url) {
  try {
    const host = new URL(url).hostname.toLowerCase();
    return (
      host === "copilot.microsoft.com" ||
      host.endsWith(".copilot.microsoft.com")
    );
  } catch {
    return false;
  }
}

/**
 * Navigates the tab to the Copilot home page unless it is already on a Copilot
 * host. A failed URL read is treated as "not on Copilot".
 *
 * @returns {Promise<boolean>} true when a navigation was performed.
 */
async function ensureOnCopilot(tab) {
  const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
    () => "",
  );
  if (isCopilotUrl(currentUrl)) return false;

  await cdp(["nav", tab, "https://copilot.microsoft.com/"], 20000);
  await delay(600);
  return true;
}

/**
 * CLI entry point: opens (or reuses) a tab on Copilot, submits the query, waits
 * for the streamed response, and prints { query, url, answer, sources } via
 * outputJson. Failures inside the browser flow are routed to handleError.
 *
 * Argument preparation and validation run before the try block, so a rejection
 * from prepareArgs/validateQuery propagates to the caller of main().
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short } = parseArgs(args);

  try {
    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    await ensureOnCopilot(tab);
    await dismissConsent(tab, cdp);

    const verifyResult = await handleVerification(tab, cdp, 10000);
    if (verifyResult === "needs-human") {
      throw new Error(
        "Copilot verification required — please solve it manually in the browser window",
      );
    }

    if (verifyResult === "clicked") {
      // The verification click may have moved the tab elsewhere; re-check and,
      // after a fresh navigation, dismiss the consent dialog again.
      await delay(TIMING.afterVerify);
      if (await ensureOnCopilot(tab)) {
        await dismissConsent(tab, cdp);
      }
    }

    const inputReady = await waitForSelector(tab, S.input, 15000, 500);
    if (!inputReady) {
      throw new Error(
        "Copilot input not found — verification may have failed or page is in unexpected state",
      );
    }
    await delay(jitter(300));

    // No clipboard interceptor is installed before submitting: the answer text
    // is read from the DOM, and extractAnswer() injects its own interceptor later.
    await cdp(["click", tab, S.input]);
    await delay(TIMING.postClick);
    await cdp(["type", tab, query]);
    await delay(TIMING.postType);

    // Submit with a synthetic Enter keydown. The selector is embedded with
    // JSON.stringify so quotes inside it cannot break the evaluated source;
    // the trailing 'ok' gives the eval a plain string result.
    await cdp([
      "eval",
      tab,
      `document.querySelector(${JSON.stringify(S.input)})?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`,
    ]);

    // Wait for Copilot's response to finish streaming
    await waitForStreamComplete(tab, { timeout: 60000, minLength: 50 });

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) {
      throw new Error("No answer extracted — Copilot may not have responded");
    }

    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}
538
-
539
// Script entry: run the extractor. Errors raised inside main()'s try block are
// passed to handleError there.
main();