@apmantza/greedysearch-pi 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,165 +1,165 @@
1
- #!/usr/bin/env node
2
- // extractors/google-ai.mjs
3
- // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
4
- //
5
- // Usage:
6
- // node extractors/google-ai.mjs "<query>" [--tab <prefix>]
7
- //
8
- // Output (stdout): JSON { answer, sources, query, url }
9
- // Errors go to stderr only — stdout is always clean JSON for piping.
10
-
11
- import { readFileSync, existsSync } from 'fs';
12
- import { spawn } from 'child_process';
13
- import { tmpdir } from 'os';
14
- import { join, dirname } from 'path';
15
- import { fileURLToPath } from 'url';
16
- import { dismissConsent, handleVerification } from './consent.mjs';
17
- import { SELECTORS } from './selectors.mjs';
18
-
19
- const __dir = dirname(fileURLToPath(import.meta.url));
20
- const CDP = join(__dir, '..', 'cdp.mjs');
21
- const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`;
22
-
23
- const STREAM_POLL_INTERVAL = 600;
24
- const STREAM_STABLE_ROUNDS = 3;
25
- const STREAM_TIMEOUT = 45000;
26
- const MIN_ANSWER_LENGTH = 50;
27
-
28
- const S = SELECTORS.google;
29
-
30
- // ---------------------------------------------------------------------------
31
-
32
- function cdp(args, timeoutMs = 30000) {
33
- return new Promise((resolve, reject) => {
34
- const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
35
- let out = '';
36
- let err = '';
37
- proc.stdout.on('data', d => out += d);
38
- proc.stderr.on('data', d => err += d);
39
- const timer = setTimeout(() => { proc.kill(); reject(new Error(`cdp timeout: ${args[0]}`)); }, timeoutMs);
40
- proc.on('close', code => {
41
- clearTimeout(timer);
42
- if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
43
- else resolve(out.trim());
44
- });
45
- });
46
- }
47
-
48
- async function getOrOpenTab(tabPrefix) {
49
- if (tabPrefix) return tabPrefix;
50
-
51
- if (existsSync(PAGES_CACHE)) {
52
- const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
53
- const existing = pages.find(p => p.url.includes('google.com'));
54
- if (existing) return existing.targetId.slice(0, 8);
55
- }
56
-
57
- const list = await cdp(['list']);
58
- const firstLine = list.split('\n')[0];
59
- if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
60
- return firstLine.slice(0, 8);
61
- }
62
-
63
- async function waitForStreamComplete(tab) {
64
- const deadline = Date.now() + STREAM_TIMEOUT;
65
- let stableCount = 0;
66
- let lastLen = -1;
67
-
68
- while (Date.now() < deadline) {
69
- await new Promise(r => setTimeout(r, STREAM_POLL_INTERVAL));
70
-
71
- const lenStr = await cdp(['eval', tab,
72
- `(document.querySelector('${S.answerContainer}')?.innerText?.length || 0) + ''`
73
- ]).catch(() => '0');
74
-
75
- const len = parseInt(lenStr) || 0;
76
-
77
- if (len >= MIN_ANSWER_LENGTH && len === lastLen) {
78
- stableCount++;
79
- if (stableCount >= STREAM_STABLE_ROUNDS) return len;
80
- } else {
81
- stableCount = 0;
82
- lastLen = len;
83
- }
84
- }
85
-
86
- if (lastLen >= MIN_ANSWER_LENGTH) return lastLen;
87
- throw new Error(`Google AI answer did not stabilise within ${STREAM_TIMEOUT}ms`);
88
- }
89
-
90
- async function extractAnswer(tab) {
91
- const excludeFilter = S.sourceExclude.map(e => `!a.href.includes('${e}')`).join(' && ');
92
- const raw = await cdp(['eval', tab, `
93
- (function() {
94
- var el = document.querySelector('${S.answerContainer}');
95
- if (!el) return JSON.stringify({ answer: '', sources: [] });
96
- var answer = el.innerText.trim();
97
- var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
98
- .filter(a => ${excludeFilter})
99
- .map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\\n')[0] || '').slice(0, 100) }))
100
- .filter(s => s.url && s.url.length > 10)
101
- .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
102
- .slice(0, 10);
103
- return JSON.stringify({ answer, sources });
104
- })()
105
- `]);
106
- return JSON.parse(raw);
107
- }
108
-
109
- // ---------------------------------------------------------------------------
110
-
111
- async function main() {
112
- const args = process.argv.slice(2);
113
- if (!args.length || args[0] === '--help') {
114
- process.stderr.write('Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n');
115
- process.exit(1);
116
- }
117
-
118
- const short = args.includes('--short');
119
- const rest = args.filter(a => a !== '--short');
120
- const tabFlagIdx = rest.indexOf('--tab');
121
- const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
122
- const query = tabFlagIdx !== -1
123
- ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
124
- : rest.join(' ');
125
-
126
- try {
127
- await cdp(['list']);
128
- const tab = await getOrOpenTab(tabPrefix);
129
-
130
- const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50`;
131
- await cdp(['nav', tab, url], 35000);
132
- await new Promise(r => setTimeout(r, 1500));
133
- await dismissConsent(tab, cdp);
134
-
135
- // If consent redirected us away, navigate back
136
- const currentUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
137
- if (!currentUrl.includes('google.com/search')) {
138
- await cdp(['nav', tab, url], 35000);
139
- await new Promise(r => setTimeout(r, 1500));
140
- }
141
-
142
- // Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
143
- const verifyResult = await handleVerification(tab, cdp, 60000);
144
- if (verifyResult === 'needs-human') throw new Error('Google verification required — could not be completed automatically');
145
- if (verifyResult === 'clicked' || verifyResult === 'cleared-by-user') {
146
- // Re-navigate to the search URL after verification
147
- await cdp(['nav', tab, url], 35000);
148
- await new Promise(r => setTimeout(r, 1500));
149
- }
150
-
151
- await waitForStreamComplete(tab);
152
-
153
- const { answer, sources } = await extractAnswer(tab);
154
- if (!answer) throw new Error('No answer extracted — Google AI Mode may not have responded');
155
- const out = short ? answer.slice(0, 300).replace(/\s+\S*$/, '') + '…' : answer;
156
-
157
- const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => url);
158
- process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
159
- } catch (e) {
160
- process.stderr.write(`Error: ${e.message}\n`);
161
- process.exit(1);
162
- }
163
- }
164
-
165
- main();
1
+ #!/usr/bin/env node
2
+ // extractors/google-ai.mjs
3
+ // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
4
+ //
5
+ // Usage:
6
+ // node extractors/google-ai.mjs "<query>" [--tab <prefix>]
7
+ //
8
+ // Output (stdout): JSON { answer, sources, query, url }
9
+ // Errors go to stderr only — stdout is always clean JSON for piping.
10
+
11
+ import { readFileSync, existsSync } from 'fs';
12
+ import { spawn } from 'child_process';
13
+ import { tmpdir } from 'os';
14
+ import { join, dirname } from 'path';
15
+ import { fileURLToPath } from 'url';
16
+ import { dismissConsent, handleVerification } from './consent.mjs';
17
+ import { SELECTORS } from './selectors.mjs';
18
+
19
+ const __dir = dirname(fileURLToPath(import.meta.url));
20
+ const CDP = join(__dir, '..', 'cdp.mjs');
21
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`;
22
+
23
+ const STREAM_POLL_INTERVAL = 600;
24
+ const STREAM_STABLE_ROUNDS = 3;
25
+ const STREAM_TIMEOUT = 45000;
26
+ const MIN_ANSWER_LENGTH = 50;
27
+
28
+ const S = SELECTORS.google;
29
+
30
+ // ---------------------------------------------------------------------------
31
+
32
+ function cdp(args, timeoutMs = 30000) {
33
+ return new Promise((resolve, reject) => {
34
+ const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
35
+ let out = '';
36
+ let err = '';
37
+ proc.stdout.on('data', d => out += d);
38
+ proc.stderr.on('data', d => err += d);
39
+ const timer = setTimeout(() => { proc.kill(); reject(new Error(`cdp timeout: ${args[0]}`)); }, timeoutMs);
40
+ proc.on('close', code => {
41
+ clearTimeout(timer);
42
+ if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
43
+ else resolve(out.trim());
44
+ });
45
+ });
46
+ }
47
+
48
+ async function getOrOpenTab(tabPrefix) {
49
+ if (tabPrefix) return tabPrefix;
50
+
51
+ if (existsSync(PAGES_CACHE)) {
52
+ const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
53
+ const existing = pages.find(p => p.url.includes('google.com'));
54
+ if (existing) return existing.targetId.slice(0, 8);
55
+ }
56
+
57
+ const list = await cdp(['list']);
58
+ const firstLine = list.split('\n')[0];
59
+ if (!firstLine) throw new Error('No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?');
60
+ return firstLine.slice(0, 8);
61
+ }
62
+
63
+ async function waitForStreamComplete(tab) {
64
+ const deadline = Date.now() + STREAM_TIMEOUT;
65
+ let stableCount = 0;
66
+ let lastLen = -1;
67
+
68
+ while (Date.now() < deadline) {
69
+ await new Promise(r => setTimeout(r, STREAM_POLL_INTERVAL));
70
+
71
+ const lenStr = await cdp(['eval', tab,
72
+ `(document.querySelector('${S.answerContainer}')?.innerText?.length || 0) + ''`
73
+ ]).catch(() => '0');
74
+
75
+ const len = parseInt(lenStr) || 0;
76
+
77
+ if (len >= MIN_ANSWER_LENGTH && len === lastLen) {
78
+ stableCount++;
79
+ if (stableCount >= STREAM_STABLE_ROUNDS) return len;
80
+ } else {
81
+ stableCount = 0;
82
+ lastLen = len;
83
+ }
84
+ }
85
+
86
+ if (lastLen >= MIN_ANSWER_LENGTH) return lastLen;
87
+ throw new Error(`Google AI answer did not stabilise within ${STREAM_TIMEOUT}ms`);
88
+ }
89
+
90
+ async function extractAnswer(tab) {
91
+ const excludeFilter = S.sourceExclude.map(e => `!a.href.includes('${e}')`).join(' && ');
92
+ const raw = await cdp(['eval', tab, `
93
+ (function() {
94
+ var el = document.querySelector('${S.answerContainer}');
95
+ if (!el) return JSON.stringify({ answer: '', sources: [] });
96
+ var answer = el.innerText.trim();
97
+ var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
98
+ .filter(a => ${excludeFilter})
99
+ .map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\\n')[0] || '').slice(0, 100) }))
100
+ .filter(s => s.url && s.url.length > 10)
101
+ .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
102
+ .slice(0, 10);
103
+ return JSON.stringify({ answer, sources });
104
+ })()
105
+ `]);
106
+ return JSON.parse(raw);
107
+ }
108
+
109
+ // ---------------------------------------------------------------------------
110
+
111
+ async function main() {
112
+ const args = process.argv.slice(2);
113
+ if (!args.length || args[0] === '--help') {
114
+ process.stderr.write('Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n');
115
+ process.exit(1);
116
+ }
117
+
118
+ const short = args.includes('--short');
119
+ const rest = args.filter(a => a !== '--short');
120
+ const tabFlagIdx = rest.indexOf('--tab');
121
+ const tabPrefix = tabFlagIdx !== -1 ? rest[tabFlagIdx + 1] : null;
122
+ const query = tabFlagIdx !== -1
123
+ ? rest.filter((_, i) => i !== tabFlagIdx && i !== tabFlagIdx + 1).join(' ')
124
+ : rest.join(' ');
125
+
126
+ try {
127
+ await cdp(['list']);
128
+ const tab = await getOrOpenTab(tabPrefix);
129
+
130
+ const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50`;
131
+ await cdp(['nav', tab, url], 35000);
132
+ await new Promise(r => setTimeout(r, 1500));
133
+ await dismissConsent(tab, cdp);
134
+
135
+ // If consent redirected us away, navigate back
136
+ const currentUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => '');
137
+ if (!currentUrl.includes('google.com/search')) {
138
+ await cdp(['nav', tab, url], 35000);
139
+ await new Promise(r => setTimeout(r, 1500));
140
+ }
141
+
142
+ // Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
143
+ const verifyResult = await handleVerification(tab, cdp, 60000);
144
+ if (verifyResult === 'needs-human') throw new Error('Google verification required — could not be completed automatically');
145
+ if (verifyResult === 'clicked' || verifyResult === 'cleared-by-user') {
146
+ // Re-navigate to the search URL after verification
147
+ await cdp(['nav', tab, url], 35000);
148
+ await new Promise(r => setTimeout(r, 1500));
149
+ }
150
+
151
+ await waitForStreamComplete(tab);
152
+
153
+ const { answer, sources } = await extractAnswer(tab);
154
+ if (!answer) throw new Error('No answer extracted — Google AI Mode may not have responded');
155
+ const out = short ? answer.slice(0, 300).replace(/\s+\S*$/, '') + '…' : answer;
156
+
157
+ const finalUrl = await cdp(['eval', tab, 'document.location.href']).catch(() => url);
158
+ process.stdout.write(JSON.stringify({ query, url: finalUrl, answer: out, sources }, null, 2) + '\n');
159
+ } catch (e) {
160
+ process.stderr.write(`Error: ${e.message}\n`);
161
+ process.exit(1);
162
+ }
163
+ }
164
+
165
+ main();