@apmantza/greedysearch-pi 1.6.2 → 1.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,156 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/google-ai.mjs
4
- // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
5
- //
6
- // Usage:
7
- // node extractors/google-ai.mjs "<query>" [--tab <prefix>]
8
- //
9
- // Output (stdout): JSON { answer, sources, query, url }
10
- // Errors go to stderr only — stdout is always clean JSON for piping.
11
-
12
- import {
13
- cdp,
14
- formatAnswer,
15
- getOrOpenTab,
16
- handleError,
17
- outputJson,
18
- parseArgs,
19
- validateQuery,
20
- } from "./common.mjs";
21
- import { dismissConsent, handleVerification } from "./consent.mjs";
22
- import { SELECTORS } from "./selectors.mjs";
23
-
24
// Selector set for Google AI Mode, taken from the shared selectors module.
const { google: S } = SELECTORS;

// Tuning knobs for waitForGoogleStreamComplete().
const STREAM_POLL_INTERVAL = 600; // ms to sleep between length probes
const STREAM_STABLE_ROUNDS = 3; // consecutive unchanged probes required
const STREAM_TIMEOUT = 45_000; // ms before the wait gives up
const MIN_ANSWER_LENGTH = 50; // shortest text length accepted as an answer
30
-
31
- // ============================================================================
32
- // Google AI-specific helpers
33
- // ============================================================================
34
-
35
/**
 * Poll the answer container until its text length stops changing.
 *
 * Every STREAM_POLL_INTERVAL ms the length of the container's innerText is
 * read through `cdp eval`. The wait succeeds once the same length (at least
 * MIN_ANSWER_LENGTH) has been seen on STREAM_STABLE_ROUNDS consecutive
 * comparisons. A failed probe is treated as length 0.
 *
 * @param {string} tab - Tab identifier handed to `cdp`.
 * @returns {Promise<number>} The settled (or last observed) text length.
 * @throws {Error} When STREAM_TIMEOUT elapses and the last observed length
 *   is still below MIN_ANSWER_LENGTH.
 */
async function waitForGoogleStreamComplete(tab) {
  const probe = `(document.querySelector('${S.answerContainer}')?.innerText?.length || 0) + ''`;
  const sleep = (ms) => new Promise((done) => setTimeout(done, ms));
  const readLength = async () => {
    const text = await cdp(["eval", tab, probe]).catch(() => "0");
    return parseInt(text, 10) || 0;
  };

  const stopAt = Date.now() + STREAM_TIMEOUT;
  let previous = -1;
  let unchangedRounds = 0;

  while (Date.now() < stopAt) {
    await sleep(STREAM_POLL_INTERVAL);
    const current = await readLength();

    const unchanged = current >= MIN_ANSWER_LENGTH && current === previous;
    if (!unchanged) {
      // Still growing (or too short): restart the stability count.
      unchangedRounds = 0;
      previous = current;
      continue;
    }

    unchangedRounds += 1;
    if (unchangedRounds >= STREAM_STABLE_ROUNDS) {
      return current;
    }
  }

  // Timed out: accept whatever was last seen if it is long enough.
  if (previous < MIN_ANSWER_LENGTH) {
    throw new Error(
      `Google AI answer did not stabilise within ${STREAM_TIMEOUT}ms`,
    );
  }
  return previous;
}
65
-
66
/**
 * Read the answer text and up to ten de-duplicated source links from the page.
 *
 * The extraction runs inside the tab via `cdp eval`; the page script returns
 * a JSON string which is parsed here.
 *
 * Selectors and exclusion substrings are embedded with JSON.stringify so that
 * quotes or backslashes in them cannot break the generated script, and an
 * empty exclusion list yields a valid (always-true) filter instead of a
 * syntax error.
 *
 * @param {string} tab - Tab identifier handed to `cdp`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is '' and `sources` is [] when the answer container is absent.
 * @throws {SyntaxError} If the value returned by `cdp` is not valid JSON.
 */
async function extractAnswer(tab) {
  const lit = (value) => JSON.stringify(value);

  // Accept either a single substring or a list of substrings.
  const exclusions = [].concat(S.sourceExclude ?? []);
  const excludeFilter =
    exclusions.length > 0
      ? exclusions.map((e) => `!a.href.includes(${lit(e)})`).join(" && ")
      : "true";

  const script = `
    (function() {
      var el = document.querySelector(${lit(S.answerContainer)});
      if (!el) return JSON.stringify({ answer: '', sources: [] });
      var answer = el.innerText.trim();
      var sources = Array.from(document.querySelectorAll(${lit(S.sourceLink)}))
        .filter(a => ${excludeFilter})
        .map(a => ({ url: a.href.split('#')[0], title: (a.closest(${lit(S.sourceHeadingParent)})?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\\n')[0] || '').slice(0, 100) }))
        .filter(s => s.url && s.url.length > 10)
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;

  const raw = await cdp(["eval", tab, script]);
  return JSON.parse(raw);
}
90
-
91
- // ============================================================================
92
- // Main
93
- // ============================================================================
94
-
95
const USAGE =
  'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';

/**
 * CLI entry point for the Google AI Mode extractor.
 *
 * Flow: validate/parse argv, open the AI Mode search URL (udm=50) in a tab,
 * run consent and verification handling (re-navigating when either moved the
 * tab away), wait for the answer to settle, extract it, and hand
 * { query, url, answer, sources } to outputJson. Any error raised inside the
 * try block is passed to handleError.
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  validateQuery(cliArgs, USAGE);
  const { query, tabPrefix, short } = parseArgs(cliArgs);

  const pause = (ms) => new Promise((done) => setTimeout(done, ms));

  try {
    await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50`;

    // Navigate to the search URL and give the page a moment to start rendering.
    const openSearch = async () => {
      await cdp(["nav", tab, url], 35000);
      await pause(1500);
    };
    const readLocation = () => cdp(["eval", tab, "document.location.href"]);

    await openSearch();
    await dismissConsent(tab, cdp);

    // Consent handling may have redirected the tab away from the results page.
    const currentUrl = await readLocation().catch(() => "");
    if (!currentUrl.includes("google.com/search")) {
      await openSearch();
    }

    // "Verify you're human": bail out when a person is required, otherwise
    // reload the search once the check has been passed.
    const verifyResult = await handleVerification(tab, cdp, 60000);
    if (verifyResult === "needs-human") {
      throw new Error(
        "Google verification required — could not be completed automatically",
      );
    }
    if (["clicked", "cleared-by-user"].includes(verifyResult)) {
      await openSearch();
    }

    await waitForGoogleStreamComplete(tab);

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) {
      throw new Error(
        "No answer extracted — Google AI Mode may not have responded",
      );
    }

    // Fall back to the requested URL when the location cannot be read.
    const finalUrl = await readLocation().catch(() => url);
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}

main();
@@ -1,128 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/perplexity.mjs
4
- // Navigate Perplexity, wait for streaming to complete, return clean answer + sources.
5
- //
6
- // Usage:
7
- // node extractors/perplexity.mjs "<query>" [--tab <prefix>]
8
- //
9
- // Output (stdout): JSON { answer, sources, query, url }
10
- // Errors go to stderr only — stdout is always clean JSON for piping.
11
- //
12
- // TODO: Refactor - this file has 42 lines duplicated with google-ai.mjs (line 28)
13
-
14
- import {
15
- cdp,
16
- formatAnswer,
17
- getOrOpenTab,
18
- handleError,
19
- injectClipboardInterceptor,
20
- outputJson,
21
- parseArgs,
22
- parseSourcesFromMarkdown,
23
- validateQuery,
24
- waitForStreamComplete,
25
- } from "./common.mjs";
26
- import { dismissConsent } from "./consent.mjs";
27
- import { SELECTORS } from "./selectors.mjs";
28
-
29
// Selector set for Perplexity, taken from the shared selectors module.
const { perplexity: S } = SELECTORS;

// Name of the window property that is passed to injectClipboardInterceptor()
// and read back after the copy button has been clicked.
const GLOBAL_VAR = "__pplxClipboard";
31
-
32
- // ============================================================================
33
- // Extraction
34
- // ============================================================================
35
-
36
/**
 * Click the page's copy button and read back the text stored on
 * window[GLOBAL_VAR], then derive the source list from that text.
 *
 * @param {string} tab - Tab identifier handed to `cdp`.
 * @returns {Promise<{answer: string, sources: *}>} Trimmed answer text plus
 *   whatever parseSourcesFromMarkdown returns for the untrimmed text.
 * @throws {Error} When the captured text is empty.
 */
async function extractAnswer(tab) {
  const clickCopyButton = `document.querySelector('${S.copyButton}')?.click()`;
  const readCaptured = `window.${GLOBAL_VAR} || ''`;

  await cdp(["eval", tab, clickCopyButton]);
  // Fixed delay between the click and reading the captured value.
  await new Promise((done) => setTimeout(done, 400));

  const captured = await cdp(["eval", tab, readCaptured]);
  if (!captured) {
    throw new Error("Clipboard interceptor returned empty text");
  }

  const sources = parseSourcesFromMarkdown(captured);
  return { answer: captured.trim(), sources };
}
50
-
51
- // ============================================================================
52
- // Main
53
- // ============================================================================
54
-
55
const USAGE =
  'Usage: node extractors/perplexity.mjs "<query>" [--tab <prefix>]\n';

/**
 * CLI entry point for the Perplexity extractor.
 *
 * Flow: validate/parse argv, open the Perplexity homepage, wait (up to 8 s)
 * for the search input to appear, type the query and press Enter, wait for
 * the page to stop changing, extract the answer, and hand
 * { query, url, answer, sources } to outputJson. Any error raised inside the
 * try block is passed to handleError.
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  validateQuery(cliArgs, USAGE);
  const { query, tabPrefix, short } = parseArgs(cliArgs);

  const pause = (ms) => new Promise((done) => setTimeout(done, ms));

  try {
    // Refresh the page list before picking a tab.
    await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // The homepage search box is used because direct ?q= URLs trigger a bot
    // redirect.
    await cdp(["nav", tab, "https://www.perplexity.ai/"], 35000);
    await dismissConsent(tab, cdp);

    // Poll until the input element exists; continue regardless after 8 s.
    const inputProbe = `!!document.querySelector('${S.input}')`;
    const giveUpAt = Date.now() + 8000;
    while (Date.now() < giveUpAt) {
      const found = await cdp(["eval", tab, inputProbe]).catch(() => "false");
      if (found === "true") {
        break;
      }
      await pause(400);
    }
    await pause(300);

    await injectClipboardInterceptor(tab, GLOBAL_VAR);
    await cdp(["click", tab, S.input]);
    await pause(400);
    await cdp(["type", tab, query]);
    await pause(400);

    // Submit by dispatching an Enter keydown on the input.
    const pressEnter = `document.querySelector('${S.input}')?.dispatchEvent(new KeyboardEvent('keydown',{key:'Enter',bubbles:true,keyCode:13})), 'ok'`;
    await cdp(["eval", tab, pressEnter]);

    await waitForStreamComplete(tab, {
      timeout: 30000,
      interval: 600,
      stableRounds: 3,
      selector: "document.body",
    });

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) {
      throw new Error(
        "No answer extracted — Perplexity may not have responded",
      );
    }

    const readLocation = cdp(["eval", tab, "document.location.href"]);
    const finalUrl = await readLocation.catch(() => "");
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}

main();
@@ -1,52 +0,0 @@
1
- // extractors/selectors.mjs
2
- // Centralized CSS selectors for all engines.
3
- // Update selectors here when a site changes its UI.
4
-
5
// Selector strings that appear under more than one engine.
const ONETRUST_ACCEPT_BUTTON = "#onetrust-accept-btn-handler";
const ARIA_COPY_BUTTON = 'button[aria-label="Copy"]';

/**
 * CSS selectors (and related match data) grouped by engine.
 * Edit the entries here when a site changes its UI.
 *
 * NOTE(review): the exclusion lists are not uniform across engines — `bing`
 * uses a single string under `sourceExclude`, `google` an array under
 * `sourceExclude`, and `gemini` an array under `sourcesExclude`. Key names
 * and value shapes are kept as-is because extractors read them by name.
 */
export const SELECTORS = {
  /** Perplexity (perplexity.ai) */
  perplexity: {
    input: "#ask-input",
    copyButton: ARIA_COPY_BUTTON,
    sourceItem: "[data-pplx-citation-url]",
    sourceLink: "a",
    consent: ONETRUST_ACCEPT_BUTTON,
  },

  /** Bing Copilot (copilot.microsoft.com) */
  bing: {
    input: "#userInput",
    copyButton: 'button[data-testid="copy-ai-message-button"]',
    sourceLink: 'a[href^="http"][target="_blank"]',
    sourceExclude: "copilot.microsoft.com",
    consent: ONETRUST_ACCEPT_BUTTON,
  },

  /** Google AI Mode (google.com/search?udm=50) */
  google: {
    answerContainer: ".pWvJNd",
    sourceLink: 'a[href^="http"]',
    sourceExclude: ["google.", "gstatic", "googleapis"],
    sourceHeadingParent: "[data-snhf]",
    consent: '#L2AGLb, button[jsname="b3VHJd"], .tHlp8d',
  },

  /** Gemini (gemini.google.com/app) */
  gemini: {
    input: "rich-textarea .ql-editor",
    copyButton: ARIA_COPY_BUTTON,
    sendButton: 'button[aria-label*="Send"]',
    sourcesSidebarButton: "button.legacy-sources-sidebar-button",
    sourcesExclude: ["gemini.google", "gstatic", "google.com/search"],
    citationButtonPattern: 'button[aria-label*="citation from"]',
    // Captures {name} from aria-labels shaped like:
    // "View source details for citation from {name}. Opens side panel."
    citationNameRegex: /from\s+(.+?)\.\s/,
  },
};
package/launch.mjs DELETED
@@ -1,288 +0,0 @@
1
- #!/usr/bin/env node
2
- // launch.mjs — start a dedicated Chrome instance for GreedySearch
3
- //
4
- // This Chrome instance uses --disable-features=DevToolsPrivacyUI which suppresses
5
- // the "Allow remote debugging?" dialog entirely. It runs on port 9222 so it doesn't
6
- // conflict with your main Chrome session (which may use port 9223).
7
- //
8
- // search.mjs passes CDP_PROFILE_DIR so cdp.mjs targets this dedicated Chrome
9
- // without ever touching the user's main Chrome DevToolsActivePort file.
10
- //
11
- // Usage:
12
- // node launch.mjs — launch (or report if already running)
13
- // node launch.mjs --kill — stop the dedicated GreedySearch Chrome instance
14
- // node launch.mjs --status — check if running
15
-
16
- import { execSync, spawn } from "node:child_process";
17
- import {
18
- existsSync,
19
- mkdirSync,
20
- readFileSync,
21
- unlinkSync,
22
- writeFileSync,
23
- } from "node:fs";
24
- import http from "node:http";
25
- import { platform, tmpdir } from "node:os";
26
- import { join } from "node:path";
27
-
28
// Remote-debugging port used by the dedicated Chrome instance.
const PORT = 9222;

// Everything this launcher creates lives under the OS temp directory.
const TEMP_DIR = tmpdir();
// Chrome user-data directory for the dedicated instance.
const PROFILE_DIR = join(TEMP_DIR, "greedysearch-chrome-profile");
// DevToolsActivePort file inside that profile (port on line 1, ws path on line 2).
const ACTIVE_PORT = join(PROFILE_DIR, "DevToolsActivePort");
// File holding the PID of the Chrome process this launcher spawned.
const PID_FILE = join(TEMP_DIR, "greedysearch-chrome.pid");
32
-
33
/**
 * Locate a Chrome/Chromium binary by probing well-known install paths for
 * the current platform, in priority order.
 *
 * @returns {string|null} The first candidate path that exists, or null when
 *   none of them do.
 */
function findChrome() {
  let candidates;
  switch (platform()) {
    case "win32":
      candidates = [
        "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
        "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
      ];
      break;
    case "darwin":
      candidates = [
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
      ];
      break;
    default:
      // Every other platform is probed with the Linux-style paths.
      candidates = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
        "/usr/bin/chromium-browser",
        "/usr/bin/chromium",
        "/snap/bin/chromium",
      ];
  }

  for (const candidate of candidates) {
    if (existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
55
-
56
// Command-line arguments passed to the dedicated Chrome instance, in order.
const CHROME_FLAGS = [
  // Debugging endpoint on the fixed GreedySearch port.
  `--remote-debugging-port=${PORT}`,
  // Suppresses the "Allow remote debugging?" dialog.
  "--disable-features=DevToolsPrivacyUI",
  // First-run / default-browser / default-apps switches.
  "--no-first-run",
  "--no-default-browser-check",
  "--disable-default-apps",
  // Isolated profile so the user's main Chrome profile is not touched.
  `--user-data-dir=${PROFILE_DIR}`,
  "--profile-directory=Default",
  // Initial page.
  "about:blank",
];
66
-
67
- // ---------------------------------------------------------------------------
68
-
69
/**
 * Check whether the process recorded in PID_FILE is still alive.
 *
 * @returns {number|false} The recorded PID when signal 0 can be delivered to
 *   it; false when the PID file is missing, holds no usable number, or the
 *   signal cannot be delivered.
 */
function isRunning() {
  if (!existsSync(PID_FILE)) {
    return false;
  }

  const recordedPid = parseInt(readFileSync(PID_FILE, "utf8").trim(), 10);
  if (!recordedPid) {
    return false;
  }

  // Signal 0 performs the existence/permission check without sending a signal.
  let alive = true;
  try {
    process.kill(recordedPid, 0);
  } catch {
    alive = false;
  }
  return alive ? recordedPid : false;
}
80
-
81
/**
 * Get the PID of the process LISTENING on a TCP port (Windows + Unix).
 *
 * Changes from the previous version:
 *  - the port is validated (integer 1–65535) before it is placed in a shell
 *    command line;
 *  - `lsof` is restricted to listening sockets (`-sTCP:LISTEN`); without it,
 *    processes that merely hold a client connection to the port are reported
 *    too, and the caller would kill them;
 *  - the `ss` fallback matches the port exactly (":9222" no longer matches
 *    ":92220").
 *
 * @param {number|string} port - TCP port to look up.
 * @returns {number|null} PID of the listener, or null when the port is
 *   invalid, nothing is listening, or the lookup tools fail.
 */
function getPortPid(port) {
  const portNumber = Number(port);
  if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
    return null;
  }

  const toPid = (text) => {
    const pid = Number.parseInt(text, 10);
    return Number.isNaN(pid) ? null : pid;
  };

  try {
    if (platform() === "win32") {
      // netstat -ano prints the owning PID in the last column, e.g.
      //   TCP  127.0.0.1:9222  0.0.0.0:0  LISTENING  12345
      const out = execSync("netstat -ano -p TCP 2>nul", { encoding: "utf8" });
      const listening = new RegExp(
        `TCP\\s+\\S*:${portNumber}\\s+\\S*:0\\s+LISTENING\\s+(\\d+)`,
        "i",
      );
      const match = out.match(listening);
      return match ? toPid(match[1]) : null;
    }

    // Unix: prefer lsof, fall back to ss when lsof is missing or finds nothing
    // (lsof exits non-zero in both cases).
    try {
      const out = execSync(
        `lsof -nP -iTCP:${portNumber} -sTCP:LISTEN -t 2>/dev/null`,
        { encoding: "utf8" },
      ).trim();
      return out ? toPid(out.split("\n")[0]) : null;
    } catch {
      // -l limits ss to listening sockets; the trailing [[:space:]] anchors
      // the match to the end of the local-address column.
      const out = execSync(
        `ss -tlnp 2>/dev/null | grep -E ":${portNumber}[[:space:]]"`,
        { encoding: "utf8" },
      );
      const match = out.match(/pid=(\d+)/);
      return match ? toPid(match[1]) : null;
    }
  } catch {
    // Lookup tools unavailable or no match: report "unknown".
    return null;
  }
}
114
-
115
/**
 * Kill a process by PID (with Windows/Unix compatibility).
 *
 * Changes from the previous version:
 *  - Windows now runs `taskkill /F /PID <pid>`. The former `//F //PID`
 *    spelling is the MSYS/Git-Bash escape for slashes; execSync runs the
 *    command through cmd.exe, where the doubled slashes are not valid
 *    options.
 *  - PIDs that are not positive integers are rejected. On Unix,
 *    process.kill(0, ...) targets the caller's own process group and
 *    negative values target groups / every process, so they must never be
 *    forwarded.
 *
 * @param {number|string} pid - Process ID to terminate.
 * @returns {boolean} true when the kill command was issued without error,
 *   false when the PID is invalid or the kill failed.
 */
function killProcess(pid) {
  const targetPid = Number(pid);
  if (!Number.isInteger(targetPid) || targetPid <= 0) {
    return false;
  }

  try {
    if (platform() === "win32") {
      execSync(`taskkill /F /PID ${targetPid}`, { stdio: "ignore" });
    } else {
      process.kill(targetPid, "SIGTERM");
    }
    return true;
  } catch {
    // Process already gone, or not permitted to signal it.
    return false;
  }
}
128
-
129
/**
 * Kill whatever is listening on PORT when it is not the process recorded in
 * our PID file, then remove the PID and DevToolsActivePort files.
 *
 * Does nothing when no listener is found on PORT, or when the listener's PID
 * matches the tracked PID.
 */
function cleanupGhostChrome() {
  const listenerPid = getPortPid(PORT);
  if (!listenerPid) {
    return; // nothing on the port
  }

  const ourPid = isRunning();
  if (ourPid && listenerPid === ourPid) {
    return; // the listener is the Chrome we launched
  }

  // Past this point the listener is not ours: either the PID file names a
  // different live process, or there is no usable PID file.
  const reason = ourPid
    ? `Ghost Chrome detected: port ${PORT} has pid ${listenerPid}, but our PID file says ${ourPid}.`
    : `Ghost Chrome detected: unknown process ${listenerPid} on port ${PORT} (no PID file).`;
  console.log(reason);

  console.log(`Killing ghost Chrome (pid ${listenerPid})...`);
  killProcess(listenerPid);

  // Best-effort removal; a file that is already missing is fine.
  for (const staleFile of [PID_FILE, ACTIVE_PORT]) {
    try {
      unlinkSync(staleFile);
    } catch {}
  }
  console.log("Cleaned up stale Chrome files.");
}
163
-
164
/**
 * Minimal HTTP GET that never rejects.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [timeoutMs=1000] - Timeout passed to req.setTimeout, after
 *   which the request is destroyed.
 * @returns {Promise<{ok: boolean, body?: string}>} `ok` is true only for a
 *   200 response; `body` is present only when a response was received.
 */
function httpGet(url, timeoutMs = 1000) {
  return new Promise((resolve) => {
    const onResponse = (res) => {
      let text = "";
      res.on("data", (chunk) => {
        text += chunk;
      });
      res.on("end", () => {
        resolve({ ok: res.statusCode === 200, body: text });
      });
    };

    const req = http.get(url, onResponse);

    req.on("error", () => {
      resolve({ ok: false });
    });
    req.setTimeout(timeoutMs, () => {
      req.destroy();
      resolve({ ok: false });
    });
  });
}
178
-
179
/**
 * Wait for Chrome's HTTP debugging endpoint and write a DevToolsActivePort
 * file for it (port on line 1, browser WebSocket path on line 2).
 *
 * The file is built from /json/version because Chrome on Windows doesn't
 * write DevToolsActivePort itself.
 *
 * Changes from the previous version: only a malformed endpoint response is
 * retried. Filesystem failures used to be swallowed by the same catch and
 * retried until the deadline, which made callers report a stale PID or a
 * startup timeout; they are now logged and end the wait immediately. The
 * profile directory is also created if missing, since this function can run
 * before the caller has created it.
 *
 * @param {number} [timeoutMs=15000] - How long to keep polling the endpoint.
 * @returns {Promise<boolean>} true once the file has been written; false on
 *   timeout or when the file could not be written.
 */
async function writePortFile(timeoutMs = 15000) {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const { ok, body } = await httpGet(
      `http://localhost:${PORT}/json/version`,
      1500,
    );

    if (ok) {
      let wsPath = null;
      try {
        // webSocketDebuggerUrl looks like "ws://localhost:<PORT>/devtools/browser/..."
        const { webSocketDebuggerUrl } = JSON.parse(body);
        wsPath = new URL(webSocketDebuggerUrl).pathname;
      } catch {
        /* malformed response, retry */
      }

      if (wsPath) {
        try {
          mkdirSync(PROFILE_DIR, { recursive: true });
          writeFileSync(ACTIVE_PORT, `${PORT}\n${wsPath}`, "utf8");
          return true;
        } catch (err) {
          console.error(`Could not write ${ACTIVE_PORT}: ${err.message}`);
          return false;
        }
      }
    }

    await new Promise((r) => setTimeout(r, 400));
  }

  return false;
}
204
-
205
- // ---------------------------------------------------------------------------
206
-
207
/**
 * CLI entry point.
 *
 *   (no argument) — launch the dedicated Chrome, or report that it is
 *                   already running
 *   --kill        — stop the tracked Chrome and remove its tracking files
 *   --status      — report whether the tracked Chrome is running
 *
 * Changes from the previous version:
 *  - a failed spawn (e.g. bad CHROME_PATH) is reported and exits with code 1
 *    instead of surfacing as an uncaught 'error' event, and the PID file is
 *    only written when a PID was actually obtained;
 *  - --kill removes the PID and DevToolsActivePort files after a successful
 *    kill so they do not linger and later match an unrelated process;
 *  - the promise returned by main() has a rejection handler.
 */
async function main() {
  const arg = process.argv[2];

  // Clean up any ghost Chrome on the port before doing anything else.
  cleanupGhostChrome();

  if (arg === "--kill") {
    const pid = isRunning();
    if (!pid) {
      console.log("GreedySearch Chrome is not running.");
      return;
    }
    if (killProcess(pid)) {
      // Best-effort removal of the now-stale tracking files.
      for (const staleFile of [PID_FILE, ACTIVE_PORT]) {
        try {
          unlinkSync(staleFile);
        } catch {}
      }
      console.log(`Stopped Chrome (pid ${pid}).`);
    } else {
      console.error(`Failed to stop pid ${pid}.`);
    }
    return;
  }

  if (arg === "--status") {
    const pid = isRunning();
    if (pid) {
      console.log(
        `Running — pid ${pid}, port ${PORT}, DevToolsActivePort redirected.`,
      );
    } else {
      console.log("Not running.");
    }
    return;
  }

  // Already running?
  const existing = isRunning();
  if (existing) {
    const ready = await writePortFile(5000);
    if (ready) {
      console.log(
        `GreedySearch Chrome already running (pid ${existing}, port ${PORT}).`,
      );
      console.log("Dedicated GreedySearch DevToolsActivePort is ready.");
      return;
    }
    // Stale PID — the process is alive but the debugging endpoint on PORT did
    // not answer. Fall through to a fresh launch.
    console.log(
      `Stale PID ${existing} detected (not Chrome on port ${PORT}) — launching fresh.`,
    );
    try {
      unlinkSync(PID_FILE);
    } catch {}
  }

  const CHROME_EXE = process.env.CHROME_PATH || findChrome();
  if (!CHROME_EXE) {
    console.error("Chrome not found. Tried standard paths for your OS.");
    console.error(
      "Set the CHROME_PATH environment variable to point to your Chrome binary.",
    );
    process.exit(1);
  }

  mkdirSync(PROFILE_DIR, { recursive: true });

  console.log(`Launching GreedySearch Chrome on port ${PORT}...`);
  const proc = spawn(CHROME_EXE, CHROME_FLAGS, {
    detached: true,
    stdio: "ignore",
    windowsHide: false,
  });
  // spawn reports launch failures asynchronously through the 'error' event;
  // without a listener that event would be thrown as an uncaught exception.
  proc.on("error", (err) => {
    console.error(`Failed to launch Chrome (${CHROME_EXE}): ${err.message}`);
    process.exit(1);
  });
  proc.unref();
  if (proc.pid) {
    writeFileSync(PID_FILE, String(proc.pid));
  }

  // Wait for Chrome HTTP endpoint and build the dedicated DevToolsActivePort file
  const portFileReady = await writePortFile();
  if (!portFileReady) {
    console.error("Chrome did not become ready within 15s.");
    process.exit(1);
  }

  console.log(`Ready. No more "Allow remote debugging?" dialogs.`);
  console.log(
    "GreedySearch now uses its own isolated DevToolsActivePort file.",
  );
}

main().catch((err) => {
  console.error(err?.message ?? err);
  process.exit(1);
});