@apmantza/greedysearch-pi 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env node
2
+ // extractors/bing-copilot.mjs
3
+ // Navigate copilot.microsoft.com, wait for answer to complete, return clean answer + sources.
4
+ //
5
+ // Usage:
6
+ // node extractors/bing-copilot.mjs "<query>" [--tab <prefix>] [--short]
7
+ //
8
+ // Output (stdout): JSON { answer, sources, query, url }
9
+ // Errors go to stderr only — stdout is always clean JSON for piping.
10
+
11
+ import { readFileSync, existsSync } from 'fs';
12
+ import { spawn } from 'child_process';
13
+ import { tmpdir, homedir } from 'os';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { dismissConsent } from './consent.mjs';
17
+
18
+ const CDP = join(dirname(fileURLToPath(import.meta.url)), '..', 'cdp.mjs'); // cdp.mjs lives one directory above this file
19
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`; // tab list cache in the OS temp dir (backslashes normalised); presumably written by cdp.mjs — confirm
20
+
21
+ const STREAM_POLL_INTERVAL = 700; // ms to sleep between answer-length polls
22
+ const STREAM_STABLE_ROUNDS = 3; // consecutive equal-length polls that count as "done"
23
+ const STREAM_TIMEOUT = 60000; // ms; stop polling after 60s
24
+ const MIN_ANSWER_LENGTH = 50; // characters of innerText below which an answer is not accepted
25
+
26
+ // ---------------------------------------------------------------------------
27
+
28
/**
 * Run the sibling `cdp.mjs` helper as a child process and collect its output.
 *
 * @param {string[]} args - Arguments forwarded to cdp.mjs; the first one is the command name.
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many milliseconds.
 * @returns {Promise<string>} Trimmed stdout once the child exits with code 0.
 *   Rejects with an Error on timeout, on a non-zero exit (message is the child's
 *   stderr, or `cdp exit <code>` when stderr is empty), or when the child cannot
 *   be spawned at all.
 */
function cdp(args, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    proc.stdout.on('data', (d) => { out += d; });
    proc.stderr.on('data', (d) => { err += d; });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);

    // Without an 'error' listener a failed spawn (e.g. `node` missing from PATH)
    // surfaces as an uncaught exception instead of a rejection the caller can handle.
    proc.on('error', (spawnErr) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${spawnErr.message}`));
    });

    // Settling twice is harmless: only the first resolve/reject takes effect.
    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
      else resolve(out.trim());
    });
  });
}
43
+
44
/**
 * Choose the Chrome tab to drive.
 *
 * Resolution order: the caller's explicit prefix, then a cached tab whose URL
 * contains `copilot.microsoft.com`, then the first line of `cdp list`.
 *
 * @param {string|null|undefined} tabPrefix - Tab id prefix supplied via `--tab`, if any.
 * @returns {Promise<string>} The caller's prefix, or the first 8 characters of the chosen tab id / list line.
 * @throws {Error} When `cdp list` returns no tabs (or itself fails).
 */
async function getOrOpenTab(tabPrefix) {
  if (tabPrefix) return tabPrefix;

  // The cache is only a hint: a missing, half-written or malformed file must
  // not abort the run, so any failure here falls through to the live tab list.
  try {
    if (existsSync(PAGES_CACHE)) {
      const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
      const existing = Array.isArray(pages)
        ? pages.find((p) => typeof p?.url === 'string'
            && typeof p?.targetId === 'string'
            && p.url.includes('copilot.microsoft.com'))
        : undefined;
      if (existing) return existing.targetId.slice(0, 8);
    }
  } catch {
    // Deliberately ignored — fall back to `cdp list` below.
  }

  const list = await cdp(['list']);
  const firstLine = list.split('\n')[0];
  if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
  return firstLine.slice(0, 8);
}
58
+
59
/**
 * Poll the Copilot page until the answer text stops growing.
 *
 * The answer counts as complete once its length is at least MIN_ANSWER_LENGTH
 * and has been identical for STREAM_STABLE_ROUNDS consecutive polls. A failed
 * poll is treated as length 0, which restarts the stability count.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<number>} The stable length, or — after STREAM_TIMEOUT — the
 *   last observed length when it is at least MIN_ANSWER_LENGTH.
 * @throws {Error} When the timeout elapses without an acceptable length.
 */
async function waitForStreamComplete(tab) {
  // Measures the last non-empty ai-message-item (item[1] is always an empty placeholder).
  const lengthProbe = `(function(){
    var items = Array.from(document.querySelectorAll('[class*="ai-message-item"]'));
    var filled = items.filter(el => (el.innerText?.length || 0) > 0);
    var last = filled[filled.length - 1];
    return (last?.innerText?.length || 0) + '';
  })()`;

  const stopAt = Date.now() + STREAM_TIMEOUT;
  let previous = -1;
  let unchangedPolls = 0;

  while (Date.now() < stopAt) {
    await new Promise((wake) => setTimeout(wake, STREAM_POLL_INTERVAL));

    const reported = await cdp(['eval', tab, lengthProbe]).catch(() => '0');
    const current = parseInt(reported) || 0;

    const unchanged = current >= MIN_ANSWER_LENGTH && current === previous;
    if (!unchanged) {
      unchangedPolls = 0;
      previous = current;
      continue;
    }

    unchangedPolls += 1;
    if (unchangedPolls >= STREAM_STABLE_ROUNDS) return current;
  }

  if (previous < MIN_ANSWER_LENGTH) {
    throw new Error(`Copilot answer did not stabilise within ${STREAM_TIMEOUT}ms`);
  }
  return previous;
}
91
+
92
/**
 * Read the finished answer and its outbound links from the Copilot page.
 *
 * The in-page script takes the text of the last non-empty ai-message-item and
 * up to 10 de-duplicated external `target="_blank"` links that do not point at
 * copilot.microsoft.com, and returns them as a JSON string.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is '' and `sources` is [] when no message element has text.
 */
async function extractAnswer(tab) {
  const pageScript = `
    (function() {
      var items = Array.from(document.querySelectorAll('[class*="ai-message-item"]'));
      var el = items.filter(e => (e.innerText?.length || 0) > 0).pop();
      if (!el) return JSON.stringify({ answer: '', sources: [] });
      var answer = el.innerText.trim();
      var sources = Array.from(document.querySelectorAll('a[href^="http"][target="_blank"]'))
        .map(a => ({ url: a.href, title: a.innerText?.trim().split('\\n')[0] || a.title || '' }))
        .filter(s => s.url && !s.url.includes('copilot.microsoft.com'))
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;
  const payload = await cdp(['eval', tab, pageScript]);
  return JSON.parse(payload);
}
109
+
110
+ // ---------------------------------------------------------------------------
111
+
112
/**
 * CLI entry point: submit the command-line query to Copilot and print
 * `{ query, url, answer, sources }` as JSON on stdout.
 *
 * Recognised arguments: the query words, `--tab <prefix>` to target a specific
 * tab, `--short` to cap the answer at roughly 300 characters, and `--help`.
 * Usage problems and runtime failures are written to stderr and end the
 * process with status 1.
 */
async function main() {
  const USAGE = 'Usage: node extractors/bing-copilot.mjs "<query>" [--tab <prefix>]\n';
  const SHORT_LIMIT = 300;
  const pause = (ms) => new Promise(r => setTimeout(r, ms));

  const args = process.argv.slice(2);
  if (!args.length || args[0] === '--help') {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  const short = args.includes('--short');
  const rest = args.filter(a => a !== '--short');
  const tabFlagIdx = rest.indexOf('--tab');
  const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
  const query = tabFlagIdx !== -1
    ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
    : rest.join(' ');

  // Flags alone are not a query; fail fast instead of submitting an empty prompt.
  if (!query.trim()) {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  try {
    await cdp(['list']);
    const tab = await getOrOpenTab(tabPrefix);

    // Navigate to Copilot homepage and use the chat input
    await cdp(['nav', tab, 'https://copilot.microsoft.com/'], 35000);
    await dismissConsent(tab, cdp);

    // Wait for React app to mount #userInput (up to 8s)
    const deadline = Date.now() + 8000;
    while (Date.now() < deadline) {
      const found = await cdp(['eval', tab, `!!document.querySelector('#userInput')`]).catch(() => 'false');
      if (found === 'true') break;
      await pause(400);
    }
    await pause(300);

    // Find input and type query
    await cdp(['click', tab, '#userInput']);
    await pause(400);
    await cdp(['type', tab, query]);
    await pause(400);

    // Submit with Enter (most reliable across locales and Chrome instances)
    await cdp(['eval', tab,
      `document.querySelector('#userInput')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`
    ]);

    await waitForStreamComplete(tab);

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) throw new Error('No answer extracted — Copilot may not have responded');

    // Only truncate when the answer is actually over the limit; otherwise a
    // short answer would lose its final word and gain a misleading ellipsis.
    const out = short && answer.length > SHORT_LIMIT
      ? answer.slice(0, SHORT_LIMIT).replace(/\s+\S*$/, '') + '…'
      : answer;

    const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
    process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
  } catch (e) {
    process.stderr.write(`Error: ${e.message}\n`);
    process.exit(1);
  }
}
168
+
169
+ main();
@@ -0,0 +1,29 @@
1
+ // consent.mjs — auto-dismiss common cookie/consent banners
2
+ // Call dismissConsent(tab, cdpFn) after navigating to any page.
3
+
4
// In-page script: clicks the first consent control it recognises and returns a
// label for what it clicked ('google', 'onetrust', 'generic:<button text>'),
// or null when nothing matched.
const CONSENT_JS = `
  (function() {
    // Google consent page (consent.google.com)
    var g = document.querySelector('#L2AGLb, button[jsname="b3VHJd"], .tHlp8d');
    if (g) { g.click(); return 'google'; }

    // OneTrust (used by many sites including Stack Overflow)
    var ot = document.querySelector('#onetrust-accept-btn-handler, .onetrust-accept-btn-handler');
    if (ot) { ot.click(); return 'onetrust'; }

    // Generic "accept all" / "agree" buttons
    var btns = Array.from(document.querySelectorAll('button, a[role=button]'));
    var accept = btns.find(b => /^(accept all|accept cookies|agree|i agree|got it|allow all|allow cookies)$/i.test(b.innerText?.trim()));
    if (accept) { accept.click(); return 'generic:' + accept.innerText.trim(); }

    return null;
  })()
`;

/**
 * Best-effort dismissal of a cookie/consent banner in the given tab.
 *
 * Evaluates CONSENT_JS through the supplied cdp function; evaluation errors are
 * ignored. When the script reports that it clicked something, waits 1.5s so
 * the page can react before the caller continues.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @param {(args: string[]) => Promise<string>} cdp - The caller's cdp runner.
 * @returns {Promise<void>}
 */
export async function dismissConsent(tab, cdp) {
  const outcome = await cdp(['eval', tab, CONSENT_JS]).catch(() => null);

  const clickedSomething = Boolean(outcome) && outcome !== 'null';
  if (!clickedSomething) return;

  // Give page a moment to react after dismissal
  await new Promise((done) => setTimeout(done, 1500));
}
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env node
2
+ // extractors/google-ai.mjs
3
+ // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
4
+ //
5
+ // Usage:
6
+ // node extractors/google-ai.mjs "<query>" [--tab <prefix>] [--short]
7
+ //
8
+ // Output (stdout): JSON { answer, sources, query, url }
9
+ // Errors go to stderr only — stdout is always clean JSON for piping.
10
+
11
+ import { readFileSync, existsSync } from 'fs';
12
+ import { spawn } from 'child_process';
13
+ import { tmpdir, homedir } from 'os';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { dismissConsent } from './consent.mjs';
17
+ const __dir = dirname(fileURLToPath(import.meta.url)); // directory of this module (CDP below recomputes the same value)
18
+
19
+ const CDP = join(dirname(fileURLToPath(import.meta.url)), '..', 'cdp.mjs'); // cdp.mjs lives one directory above this file
20
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`; // tab list cache in the OS temp dir (backslashes normalised); presumably written by cdp.mjs — confirm
21
+
22
+ const STREAM_POLL_INTERVAL = 600; // ms to sleep between answer-length polls
23
+ const STREAM_STABLE_ROUNDS = 3; // consecutive equal-length polls that count as "done"
24
+ const STREAM_TIMEOUT = 45000; // ms; stop polling after 45s
25
+ const MIN_ANSWER_LENGTH = 50; // characters of innerText below which an answer is not accepted
26
+
27
+ // ---------------------------------------------------------------------------
28
+
29
/**
 * Run the sibling `cdp.mjs` helper as a child process and collect its output.
 *
 * @param {string[]} args - Arguments forwarded to cdp.mjs; the first one is the command name.
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many milliseconds.
 * @returns {Promise<string>} Trimmed stdout once the child exits with code 0.
 *   Rejects with an Error on timeout, on a non-zero exit (message is the child's
 *   stderr, or `cdp exit <code>` when stderr is empty), or when the child cannot
 *   be spawned at all.
 */
function cdp(args, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    proc.stdout.on('data', (d) => { out += d; });
    proc.stderr.on('data', (d) => { err += d; });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);

    // Without an 'error' listener a failed spawn (e.g. `node` missing from PATH)
    // surfaces as an uncaught exception instead of a rejection the caller can handle.
    proc.on('error', (spawnErr) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${spawnErr.message}`));
    });

    // Settling twice is harmless: only the first resolve/reject takes effect.
    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
      else resolve(out.trim());
    });
  });
}
44
+
45
/**
 * Choose the Chrome tab to drive.
 *
 * Resolution order: the caller's explicit prefix, then a cached tab whose URL
 * contains `google.com`, then the first line of `cdp list`.
 *
 * @param {string|null|undefined} tabPrefix - Tab id prefix supplied via `--tab`, if any.
 * @returns {Promise<string>} The caller's prefix, or the first 8 characters of the chosen tab id / list line.
 * @throws {Error} When `cdp list` returns no tabs (or itself fails).
 */
async function getOrOpenTab(tabPrefix) {
  if (tabPrefix) return tabPrefix;

  // The cache is only a hint: a missing, half-written or malformed file must
  // not abort the run, so any failure here falls through to the live tab list.
  try {
    if (existsSync(PAGES_CACHE)) {
      const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
      const existing = Array.isArray(pages)
        ? pages.find((p) => typeof p?.url === 'string'
            && typeof p?.targetId === 'string'
            && p.url.includes('google.com'))
        : undefined;
      if (existing) return existing.targetId.slice(0, 8);
    }
  } catch {
    // Deliberately ignored — fall back to `cdp list` below.
  }

  const list = await cdp(['list']);
  const firstLine = list.split('\n')[0];
  if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
  return firstLine.slice(0, 8);
}
59
+
60
/**
 * Poll the Google AI Mode page until the answer text stops growing.
 *
 * The answer counts as complete once the `.pWvJNd` element's text length is at
 * least MIN_ANSWER_LENGTH and has been identical for STREAM_STABLE_ROUNDS
 * consecutive polls. A failed poll is treated as length 0, which restarts the
 * stability count.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<number>} The stable length, or — after STREAM_TIMEOUT — the
 *   last observed length when it is at least MIN_ANSWER_LENGTH.
 * @throws {Error} When the timeout elapses without an acceptable length.
 */
async function waitForStreamComplete(tab) {
  const lengthProbe = `(document.querySelector('.pWvJNd')?.innerText?.length || 0) + ''`;

  const stopAt = Date.now() + STREAM_TIMEOUT;
  let previous = -1;
  let unchangedPolls = 0;

  while (Date.now() < stopAt) {
    await new Promise((wake) => setTimeout(wake, STREAM_POLL_INTERVAL));

    const reported = await cdp(['eval', tab, lengthProbe]).catch(() => '0');
    const current = parseInt(reported) || 0;

    const unchanged = current >= MIN_ANSWER_LENGTH && current === previous;
    if (!unchanged) {
      unchangedPolls = 0;
      previous = current;
      continue;
    }

    unchangedPolls += 1;
    if (unchangedPolls >= STREAM_STABLE_ROUNDS) return current;
  }

  if (previous < MIN_ANSWER_LENGTH) {
    throw new Error(`Google AI answer did not stabilise within ${STREAM_TIMEOUT}ms`);
  }
  return previous;
}
86
+
87
/**
 * Read the finished answer and its outbound links from the Google AI Mode page.
 *
 * The in-page script takes the text of `.pWvJNd` and up to 10 de-duplicated
 * links whose href does not contain `google.`, `gstatic` or `googleapis`
 * (fragment stripped, title capped at 100 characters), and returns them as a
 * JSON string.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is '' and `sources` is [] when the answer element is missing.
 */
async function extractAnswer(tab) {
  const pageScript = `
    (function() {
      var el = document.querySelector('.pWvJNd');
      if (!el) return JSON.stringify({ answer: '', sources: [] });
      var answer = el.innerText.trim();
      var sources = Array.from(document.querySelectorAll('a[href^="http"]'))
        .filter(a => !a.href.includes('google.') && !a.href.includes('gstatic') && !a.href.includes('googleapis'))
        .map(a => ({ url: a.href.split('#')[0], title: (a.closest('[data-snhf]')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\\n')[0] || '').slice(0, 100) }))
        .filter(s => s.url && s.url.length > 10)
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;
  const payload = await cdp(['eval', tab, pageScript]);
  return JSON.parse(payload);
}
104
+
105
+ // ---------------------------------------------------------------------------
106
+
107
/**
 * CLI entry point: open Google AI Mode (udm=50) for the command-line query and
 * print `{ query, url, answer, sources }` as JSON on stdout.
 *
 * Recognised arguments: the query words, `--tab <prefix>` to target a specific
 * tab, `--short` to cap the answer at roughly 300 characters, and `--help`.
 * Usage problems and runtime failures are written to stderr and end the
 * process with status 1.
 */
async function main() {
  const USAGE = 'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
  const SHORT_LIMIT = 300;
  const pause = (ms) => new Promise(r => setTimeout(r, ms));

  const args = process.argv.slice(2);
  if (!args.length || args[0] === '--help') {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  const short = args.includes('--short');
  const rest = args.filter(a => a !== '--short');
  const tabFlagIdx = rest.indexOf('--tab');
  const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
  const query = tabFlagIdx !== -1
    ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
    : rest.join(' ');

  // Flags alone are not a query; fail fast instead of searching for nothing.
  if (!query.trim()) {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  try {
    await cdp(['list']);
    const tab = await getOrOpenTab(tabPrefix);

    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50`;
    await cdp(['nav', tab, url], 35000);
    await pause(1500);
    await dismissConsent(tab, cdp);

    // If consent redirected us away, navigate back
    const currentUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
    if (!currentUrl.includes('google.com/search')) {
      await cdp(['nav', tab, url], 35000);
      await pause(1500);
    }

    await waitForStreamComplete(tab);

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) throw new Error('No answer extracted — Google AI Mode may not have responded');

    // Only truncate when the answer is actually over the limit; otherwise a
    // short answer would lose its final word and gain a misleading ellipsis.
    const out = short && answer.length > SHORT_LIMIT
      ? answer.slice(0, SHORT_LIMIT).replace(/\s+\S*$/, '') + '…'
      : answer;

    const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => url);
    process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
  } catch (e) {
    process.stderr.write(`Error: ${e.message}\n`);
    process.exit(1);
  }
}
151
+
152
+ main();
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env node
2
+ // extractors/perplexity.mjs
3
+ // Navigate Perplexity, wait for streaming to complete, return clean answer + sources.
4
+ //
5
+ // Usage:
6
+ // node extractors/perplexity.mjs "<query>" [--tab <prefix>] [--short]
7
+ //
8
+ // Output (stdout): JSON { answer, sources, query, url }
9
+ // Errors go to stderr only — stdout is always clean JSON for piping.
10
+
11
+ import { readFileSync, existsSync } from 'fs';
12
+ import { spawn } from 'child_process';
13
+ import { tmpdir, homedir } from 'os';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { dismissConsent } from './consent.mjs';
17
+
18
+ const CDP = join(dirname(fileURLToPath(import.meta.url)), '..', 'cdp.mjs'); // cdp.mjs lives one directory above this file
19
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`; // tab list cache in the OS temp dir (backslashes normalised); presumably written by cdp.mjs — confirm
20
+
21
+ const STREAM_POLL_INTERVAL = 600; // ms to sleep between answer-length polls
22
+ const STREAM_STABLE_ROUNDS = 3; // consecutive equal-length polls that count as "done"
23
+ const STREAM_TIMEOUT = 30000; // ms; stop polling after 30s regardless
24
+ const MIN_ANSWER_LENGTH = 50; // characters of innerText below which an answer is not accepted
25
+
26
+ // ---------------------------------------------------------------------------
27
+
28
/**
 * Run the sibling `cdp.mjs` helper as a child process and collect its output.
 *
 * @param {string[]} args - Arguments forwarded to cdp.mjs; the first one is the command name.
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many milliseconds.
 * @returns {Promise<string>} Trimmed stdout once the child exits with code 0.
 *   Rejects with an Error on timeout, on a non-zero exit (message is the child's
 *   stderr, or `cdp exit <code>` when stderr is empty), or when the child cannot
 *   be spawned at all.
 */
function cdp(args, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    proc.stdout.on('data', (d) => { out += d; });
    proc.stderr.on('data', (d) => { err += d; });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);

    // Without an 'error' listener a failed spawn (e.g. `node` missing from PATH)
    // surfaces as an uncaught exception instead of a rejection the caller can handle.
    proc.on('error', (spawnErr) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${spawnErr.message}`));
    });

    // Settling twice is harmless: only the first resolve/reject takes effect.
    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
      else resolve(out.trim());
    });
  });
}
43
+
44
/**
 * Choose the Chrome tab to drive.
 *
 * Resolution order: the caller's explicit prefix, then a cached tab whose URL
 * contains `perplexity.ai`, then the first line of `cdp list`.
 *
 * @param {string|null|undefined} tabPrefix - Tab id prefix supplied via `--tab`, if any.
 * @returns {Promise<string>} The caller's prefix, or the first 8 characters of the chosen tab id / list line.
 * @throws {Error} When `cdp list` returns no tabs (or itself fails).
 */
async function getOrOpenTab(tabPrefix) {
  // If caller specified a tab, use it
  if (tabPrefix) return tabPrefix;

  // Otherwise look for an existing Perplexity tab. The cache is only a hint:
  // a missing, half-written or malformed file must not abort the run, so any
  // failure here falls through to the live tab list.
  try {
    if (existsSync(PAGES_CACHE)) {
      const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
      const existing = Array.isArray(pages)
        ? pages.find((p) => typeof p?.url === 'string'
            && typeof p?.targetId === 'string'
            && p.url.includes('perplexity.ai'))
        : undefined;
      if (existing) return existing.targetId.slice(0, 8);
    }
  } catch {
    // Deliberately ignored — fall back to `cdp list` below.
  }

  // Fall back to first available tab
  const list = await cdp(['list']);
  const firstLine = list.split('\n')[0];
  if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
  return firstLine.slice(0, 8);
}
61
+
62
/**
 * Poll the Perplexity page until the answer text stops growing.
 *
 * The answer counts as complete once the first `.prose` element's text length
 * is at least MIN_ANSWER_LENGTH and has been identical for STREAM_STABLE_ROUNDS
 * consecutive polls. A failed poll is treated as length 0, which restarts the
 * stability count.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<number>} The stable length, or — after STREAM_TIMEOUT — the
 *   last observed length when it is at least MIN_ANSWER_LENGTH.
 * @throws {Error} When the timeout elapses without an acceptable length.
 */
async function waitForStreamComplete(tab) {
  const lengthProbe = `(document.querySelector('.prose')?.innerText?.length || 0) + ''`;

  const stopAt = Date.now() + STREAM_TIMEOUT;
  let previous = -1;
  let unchangedPolls = 0;

  while (Date.now() < stopAt) {
    await new Promise((wake) => setTimeout(wake, STREAM_POLL_INTERVAL));

    const reported = await cdp(['eval', tab, lengthProbe]).catch(() => '0');
    const current = parseInt(reported) || 0;

    const unchanged = current >= MIN_ANSWER_LENGTH && current === previous;
    if (!unchanged) {
      unchangedPolls = 0;
      previous = current;
      continue;
    }

    unchangedPolls += 1;
    if (unchangedPolls >= STREAM_STABLE_ROUNDS) return current;
  }

  // Timed out: accept what is there only if it meets the minimum length.
  if (previous < MIN_ANSWER_LENGTH) {
    throw new Error(`Perplexity answer did not stabilise within ${STREAM_TIMEOUT}ms`);
  }
  return previous;
}
89
+
90
/**
 * Read the finished answer and its citations from the Perplexity page.
 *
 * The in-page script takes the text of the first `.prose` element and up to 10
 * de-duplicated `data-pplx-citation-url` values (titled by the text of the
 * element's first nested link), and returns them as a JSON string.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is '' and `sources` is [] when no `.prose` element exists.
 */
async function extractAnswer(tab) {
  const pageScript = `
    (function() {
      var prose = document.querySelector('.prose');
      if (!prose) return JSON.stringify({ answer: '', sources: [] });
      var answer = prose.innerText.trim();
      var sources = Array.from(document.querySelectorAll('[data-pplx-citation-url]'))
        .map(el => ({ url: el.getAttribute('data-pplx-citation-url'), title: el.querySelector('a')?.innerText?.trim() || '' }))
        .filter(s => s.url)
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;
  const payload = await cdp(['eval', tab, pageScript]);
  return JSON.parse(payload);
}
106
+
107
+ // ---------------------------------------------------------------------------
108
+
109
/**
 * CLI entry point: submit the command-line query to Perplexity and print
 * `{ query, url, answer, sources }` as JSON on stdout.
 *
 * Recognised arguments: the query words, `--tab <prefix>` to target a specific
 * tab, `--short` to cap the answer at roughly 300 characters, and `--help`.
 * Usage problems and runtime failures are written to stderr and end the
 * process with status 1.
 */
async function main() {
  const USAGE = 'Usage: node extractors/perplexity.mjs "<query>" [--tab <prefix>]\n';
  const SHORT_LIMIT = 300;
  const pause = (ms) => new Promise(r => setTimeout(r, ms));

  const args = process.argv.slice(2);
  if (!args.length || args[0] === '--help') {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  const short = args.includes('--short');
  const rest = args.filter(a => a !== '--short');
  const tabFlagIdx = rest.indexOf('--tab');
  const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
  const query = tabFlagIdx !== -1
    ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
    : rest.join(' ');

  // Flags alone are not a query; fail fast instead of submitting an empty prompt.
  if (!query.trim()) {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  try {
    // Refresh page list so cache is current
    await cdp(['list']);

    const tab = await getOrOpenTab(tabPrefix);

    // Navigate to homepage and use the search box (direct ?q= URLs trigger bot redirect)
    await cdp(['nav', tab, 'https://www.perplexity.ai/'], 35000);
    await dismissConsent(tab, cdp);

    // Wait for React app to mount #ask-input (up to 8s)
    const deadline = Date.now() + 8000;
    while (Date.now() < deadline) {
      const found = await cdp(['eval', tab, `!!document.querySelector('#ask-input')`]).catch(() => 'false');
      if (found === 'true') break;
      await pause(400);
    }
    await pause(300);

    await cdp(['click', tab, '#ask-input']);
    await pause(400);
    await cdp(['type', tab, query]);
    await pause(400);
    // Submit with Enter (most reliable across Chrome instances)
    await cdp(['eval', tab,
      `document.querySelector('#ask-input')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`
    ]);

    // Wait for streaming answer to complete
    await waitForStreamComplete(tab);

    // Extract
    const { answer, sources } = await extractAnswer(tab);

    if (!answer) throw new Error('No answer extracted — Perplexity may not have responded');

    // Only truncate when the answer is actually over the limit; otherwise a
    // short answer would lose its final word and gain a misleading ellipsis.
    const out = short && answer.length > SHORT_LIMIT
      ? answer.slice(0, SHORT_LIMIT).replace(/\s+\S*$/, '') + '…'
      : answer;

    const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
    process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
  } catch (e) {
    process.stderr.write(`Error: ${e.message}\n`);
    process.exit(1);
  }
}
169
+
170
+ main();
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env node
2
+ // extractors/stackoverflow-ai.mjs
3
+ // Navigate Stack Overflow AI Assist, wait for answer, return clean answer + sources.
4
+ //
5
+ // Usage:
6
+ // node extractors/stackoverflow-ai.mjs "<query>" [--tab <prefix>]
7
+ //
8
+ // Output (stdout): JSON { answer, sources, query, url }
9
+ // Errors go to stderr only — stdout is always clean JSON for piping.
10
+
11
+ import { readFileSync, existsSync } from 'fs';
12
+ import { spawn } from 'child_process';
13
+ import { tmpdir, homedir } from 'os';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { dismissConsent } from './consent.mjs';
17
+
18
+ const CDP = join(dirname(fileURLToPath(import.meta.url)), '..', 'cdp.mjs'); // cdp.mjs lives one directory above this file
19
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`; // tab list cache in the OS temp dir (backslashes normalised); presumably written by cdp.mjs — confirm
20
+
21
+ const STREAM_POLL_INTERVAL = 700; // ms to sleep between answer-length polls
22
+ const STREAM_STABLE_ROUNDS = 3; // consecutive equal-length polls that count as "done"
23
+ const STREAM_TIMEOUT = 60000; // ms; stop polling after 60s
24
+ const MIN_ANSWER_LENGTH = 50; // characters of innerText below which an answer is not accepted
25
+
26
+ // ---------------------------------------------------------------------------
27
+
28
/**
 * Run the sibling `cdp.mjs` helper as a child process and collect its output.
 *
 * @param {string[]} args - Arguments forwarded to cdp.mjs; the first one is the command name.
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many milliseconds.
 * @returns {Promise<string>} Trimmed stdout once the child exits with code 0.
 *   Rejects with an Error on timeout, on a non-zero exit (message is the child's
 *   stderr, or `cdp exit <code>` when stderr is empty), or when the child cannot
 *   be spawned at all.
 */
function cdp(args, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    proc.stdout.on('data', (d) => { out += d; });
    proc.stderr.on('data', (d) => { err += d; });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);

    // Without an 'error' listener a failed spawn (e.g. `node` missing from PATH)
    // surfaces as an uncaught exception instead of a rejection the caller can handle.
    proc.on('error', (spawnErr) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${spawnErr.message}`));
    });

    // Settling twice is harmless: only the first resolve/reject takes effect.
    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
      else resolve(out.trim());
    });
  });
}
43
+
44
/**
 * Choose the Chrome tab to drive.
 *
 * Resolution order: the caller's explicit prefix, then a cached tab whose URL
 * contains `stackoverflow.com`, then the first line of `cdp list`.
 *
 * @param {string|null|undefined} tabPrefix - Tab id prefix supplied via `--tab`, if any.
 * @returns {Promise<string>} The caller's prefix, or the first 8 characters of the chosen tab id / list line.
 * @throws {Error} When `cdp list` returns no tabs (or itself fails).
 */
async function getOrOpenTab(tabPrefix) {
  if (tabPrefix) return tabPrefix;

  // The cache is only a hint: a missing, half-written or malformed file must
  // not abort the run, so any failure here falls through to the live tab list.
  try {
    if (existsSync(PAGES_CACHE)) {
      const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
      const existing = Array.isArray(pages)
        ? pages.find((p) => typeof p?.url === 'string'
            && typeof p?.targetId === 'string'
            && p.url.includes('stackoverflow.com'))
        : undefined;
      if (existing) return existing.targetId.slice(0, 8);
    }
  } catch {
    // Deliberately ignored — fall back to `cdp list` below.
  }

  const list = await cdp(['list']);
  const firstLine = list.split('\n')[0];
  if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
  return firstLine.slice(0, 8);
}
58
+
59
/**
 * Poll the Stack Overflow AI Assist page until the answer text stops growing.
 *
 * The answer counts as complete once the last `.s-prose.assistantMessage`
 * element's text length is at least MIN_ANSWER_LENGTH and has been identical
 * for STREAM_STABLE_ROUNDS consecutive polls. A failed poll is treated as
 * length 0, which restarts the stability count.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<number>} The stable length, or — after STREAM_TIMEOUT — the
 *   last observed length when it is at least MIN_ANSWER_LENGTH.
 * @throws {Error} When the timeout elapses without an acceptable length.
 */
async function waitForStreamComplete(tab) {
  const lengthProbe = `
    (function(){
      var msgs = Array.from(document.querySelectorAll('.s-prose.assistantMessage'));
      var last = msgs[msgs.length - 1];
      return (last?.innerText?.length || 0) + '';
    })()
  `;

  const stopAt = Date.now() + STREAM_TIMEOUT;
  let previous = -1;
  let unchangedPolls = 0;

  while (Date.now() < stopAt) {
    await new Promise((wake) => setTimeout(wake, STREAM_POLL_INTERVAL));

    const reported = await cdp(['eval', tab, lengthProbe]).catch(() => '0');
    const current = parseInt(reported) || 0;

    const unchanged = current >= MIN_ANSWER_LENGTH && current === previous;
    if (!unchanged) {
      unchangedPolls = 0;
      previous = current;
      continue;
    }

    unchangedPolls += 1;
    if (unchangedPolls >= STREAM_STABLE_ROUNDS) return current;
  }

  if (previous < MIN_ANSWER_LENGTH) {
    throw new Error(`Stack Overflow AI answer did not stabilise within ${STREAM_TIMEOUT}ms`);
  }
  return previous;
}
89
+
90
/**
 * Read the finished answer and its source links from the Stack Overflow AI
 * Assist page.
 *
 * The in-page script takes the text of the last `.s-prose.assistantMessage`
 * element and up to 10 de-duplicated http(s) links from the source-card
 * selectors, skipping `/users/` and `/questions/ask` URLs, and returns them as
 * a JSON string.
 *
 * @param {string} tab - Tab id prefix passed to `cdp eval`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is '' and `sources` is [] when no assistant message exists.
 */
async function extractAnswer(tab) {
  const pageScript = `
    (function() {
      var msgs = Array.from(document.querySelectorAll('.s-prose.assistantMessage'));
      var el = msgs[msgs.length - 1];
      if (!el) return JSON.stringify({ answer: '', sources: [] });
      var answer = el.innerText.trim();
      // Source cards appear as sibling elements with links to SO questions/docs
      var sources = Array.from(document.querySelectorAll('.d-flex.g16.px2 a[href], .s-card a[href]'))
        .map(a => ({ url: a.href, title: a.innerText?.trim().split('\\n')[0]?.slice(0, 100) || '' }))
        .filter(s => s.url && s.url.startsWith('http'))
        .filter(s => !s.url.includes('/users/') && !s.url.includes('/questions/ask'))
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;
  const payload = await cdp(['eval', tab, pageScript]);
  return JSON.parse(payload);
}
109
+
110
+ // ---------------------------------------------------------------------------
111
+
112
/**
 * CLI entry point: submit the command-line query to Stack Overflow AI Assist
 * and print `{ query, url, answer, sources }` as JSON on stdout.
 *
 * Recognised arguments: the query words, `--tab <prefix>` to target a specific
 * tab, `--short` to cap the answer at roughly 300 characters (handled the same
 * way as in the sibling extractors, so the flag no longer leaks into the
 * query text), and `--help`.
 * Usage problems and runtime failures are written to stderr and end the
 * process with status 1.
 */
async function main() {
  const USAGE = 'Usage: node extractors/stackoverflow-ai.mjs "<query>" [--tab <prefix>]\n';
  const SHORT_LIMIT = 300;
  const pause = (ms) => new Promise(r => setTimeout(r, ms));

  const args = process.argv.slice(2);
  if (!args.length || args[0] === '--help') {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  const short = args.includes('--short');
  const rest = args.filter(a => a !== '--short');
  const tabFlagIdx = rest.indexOf('--tab');
  const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
  const query = tabFlagIdx !== -1
    ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
    : rest.join(' ');

  // Flags alone are not a query; fail fast instead of submitting an empty prompt.
  if (!query.trim()) {
    process.stderr.write(USAGE);
    process.exit(1);
  }

  try {
    await cdp(['list']);
    const tab = await getOrOpenTab(tabPrefix);

    await cdp(['nav', tab, 'https://stackoverflow.com/ai-assist'], 35000);
    await dismissConsent(tab, cdp);

    // Wait for React app to mount the textarea (up to 8s)
    const deadline = Date.now() + 8000;
    while (Date.now() < deadline) {
      const found = await cdp(['eval', tab, `!!document.querySelector('textarea.s-textarea')`]).catch(() => 'false');
      if (found === 'true') break;
      await pause(400);
    }
    await pause(800); // extra settle time for SO's React app

    // Set value and submit in one eval — prevents React re-render clearing the value between calls
    await cdp(['eval', tab, `
      (function(){
        var ta = document.querySelector('textarea.s-textarea');
        if (!ta) return;
        ta.focus();
        var setter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
        setter.call(ta, ${JSON.stringify(query)});
        ta.dispatchEvent(new Event('input', { bubbles: true }));
        ta.dispatchEvent(new Event('change', { bubbles: true }));
        // Submit immediately before React can re-render and clear the field
        ta.dispatchEvent(new KeyboardEvent('keydown', { key: 'Enter', bubbles: true, keyCode: 13 }));
      })()
    `]);

    await waitForStreamComplete(tab);

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) throw new Error('No answer extracted — Stack Overflow AI may not have responded');

    // Truncate only when requested and only when the answer exceeds the limit.
    const out = short && answer.length > SHORT_LIMIT
      ? answer.slice(0, SHORT_LIMIT).replace(/\s+\S*$/, '') + '…'
      : answer;

    const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
    process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
  } catch (e) {
    process.stderr.write(`Error: ${e.message}\n`);
    process.exit(1);
  }
}
168
+
169
+ main();