@apmantza/greedysearch-pi 1.8.9 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,234 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/google-search.mjs
4
- // Navigate google.com, type query into search box, submit, wait for results,
5
- // return clean list of results (title, url, snippet).
6
- //
7
- // Usage:
8
- // node extractors/google-search.mjs "<query>" [--tab <prefix>] [--max <n>]
9
- //
10
- // Output (stdout): JSON { query, url, results: [{ title, url, snippet }] }
11
- // Errors go to stderr only — stdout is always clean JSON for piping.
12
-
13
- import {
14
- cdp,
15
- formatAnswer,
16
- getOrOpenTab,
17
- handleError,
18
- outputJson,
19
- parseArgs,
20
- prepareArgs,
21
- TIMING,
22
- validateQuery,
23
- } from "./common.mjs";
24
- import { dismissConsent } from "./consent.mjs";
25
-
26
- // ─── Locale-agnostic selectors ──────────────────────────────────────
27
-
28
/**
 * CSS selector for Google's query field. Both the `<textarea name="q">` and
 * the `<input name="q">` variants are listed so either markup is matched.
 */
const SEARCH_BOX = ['textarea[name="q"]', 'input[name="q"]'].join(", ");

/**
 * Candidate selectors for the element that wraps a single search result.
 * Submission itself is done with the Enter key rather than a button, so no
 * submit-button selector is needed here.
 */
const RESULT_SELECTORS = [
	// classic result container
	".g",
	// newer layout
	"[data-sokoban-container]",
	// mobile-first layout
	".MjjYud",
	// catch-all: div containing a link with heading
	"div:has(> a > h3)",
];
38
-
39
- // ─── Type into search box (locale-agnostic) ─────────────────────────
40
-
41
/**
 * Focus the search box in the given tab and insert `text` into it.
 *
 * The work happens inside the page: a small script is sent through
 * `cdp(["eval", tab, script])`. Both the selector and the text are embedded
 * with `JSON.stringify`, so quotes and backslashes cannot break out of the
 * generated source.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {string} text - Text to insert into the search box.
 * @returns {Promise<boolean>} `true` when the page script reported that it
 *   found the search box and inserted the text, `false` otherwise. (Earlier
 *   versions resolved to `undefined`, so existing callers that ignore the
 *   result are unaffected.)
 */
async function typeIntoSearchBox(tab, text) {
	const script = `
		(function(selector, t) {
			var el = document.querySelector(selector);
			if (!el) return false;
			el.focus();
			// Clear any previous value, then insert through execCommand so the
			// page observes the change as an editing action.
			el.value = '';
			document.execCommand('insertText', false, t);
			return true;
		})(${JSON.stringify(SEARCH_BOX)}, ${JSON.stringify(text)})
	`;

	const outcome = await cdp(["eval", tab, script]);
	// The eval result is compared as text elsewhere in this file
	// (`ready === "true"`), so normalise to a string before checking.
	return String(outcome).trim() === "true";
}
57
-
58
- // ─── Submit search (press Enter — locale agnostic) ──────────────────
59
-
60
/**
 * Submit the search form in the given tab.
 *
 * The page script dispatches a synthetic `keydown` Enter event on the search
 * box and, when the box sits inside a `<form>`, also schedules
 * `form.submit()` 100 ms later as a fallback.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @returns {Promise<boolean>} `true` when the page script reported that it
 *   found the search box, `false` otherwise. (Earlier versions resolved to
 *   `undefined`, so existing callers that ignore the result are unaffected.)
 */
async function submitSearch(tab) {
	// The selector is embedded with JSON.stringify so it never needs manual
	// quote/backslash escaping.
	const script = `
		(function(selector) {
			var el = document.querySelector(selector);
			if (!el) return false;
			el.dispatchEvent(new KeyboardEvent('keydown', {key:'Enter', code:'Enter', keyCode:13, which:13, bubbles:true}));
			// Also try form submission as fallback
			var form = el.closest('form');
			if (form) {
				setTimeout(function() { form.submit(); }, 100);
			}
			return true;
		})(${JSON.stringify(SEARCH_BOX)})
	`;

	const outcome = await cdp(["eval", tab, script]);
	return String(outcome).trim() === "true";
}
80
-
81
- // ─── Extract results ────────────────────────────────────────────────
82
-
83
/**
 * Collect search results from the page currently loaded in `tab`.
 *
 * A script is evaluated in the page through `cdp(["eval", ...])`. It walks
 * every `<h3>` nested in an `http(s)` link, de-duplicates by URL, looks for a
 * snippet inside the surrounding result container, and returns the list as a
 * JSON string which is parsed here.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {number} [maxResults=10] - Upper bound on the number of results.
 *   Values that are not a positive number fall back to 10, so nothing but a
 *   numeric literal is ever spliced into the evaluated script.
 * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
 *   The extracted results; an empty array when the page output is not a JSON
 *   array.
 */
async function extractResults(tab, maxResults = 10) {
	// Coerce before interpolating: the limit becomes part of executable page
	// code, so it must be a plain number. Math.ceil keeps the original
	// `length < n` semantics for fractional inputs.
	const requested = Math.ceil(Number(maxResults));
	const limit = requested > 0 ? requested : 10;

	// Reuse the module-level selector list instead of repeating it inline.
	const containerSelector = JSON.stringify(RESULT_SELECTORS.join(", "));

	const raw = await cdp([
		"eval",
		tab,
		String.raw`
		(function() {
			var results = [];
			// Strategy: find all h3 headings inside links, then find their container for snippet
			var headings = document.querySelectorAll('a[href^="http"] h3');
			var seen = new Set();

			for (var i = 0; i < headings.length && results.length < ${limit}; i++) {
				var h3 = headings[i];
				var a = h3.closest('a');
				if (!a) continue;

				var url = a.href;
				// Skip google.com internal links
				if (url.includes('google.com') && !url.includes('/search?')) continue;
				if (seen.has(url)) continue;
				seen.add(url);

				var title = h3.innerText.trim();
				if (!title) continue;

				// Find the containing block for the snippet
				var container = a.closest(${containerSelector});
				if (!container) container = a.parentElement;

				// Try multiple snippet selectors
				var snippet = '';
				var snippetEl = container.querySelector('.VwiC3b, [data-sncf], span.aCOpRe, .lEBKkf, div[style*="-webkit-line-clamp"]');
				if (!snippetEl) {
					// Fallback: find the largest text block that's not the title
					var textNodes = Array.from(container.querySelectorAll('span, div'))
						.filter(function(el) {
							var t = el.innerText?.trim();
							return t && t.length > 30 && t !== title && !el.querySelector('h3');
						})
						.sort(function(a,b) { return b.innerText.length - a.innerText.length; });
					if (textNodes[0]) snippetEl = textNodes[0];
				}
				snippet = snippetEl ? snippetEl.innerText.trim().slice(0, 300) : '';

				results.push({ title: title, url: url, snippet: snippet });
			}

			return JSON.stringify(results);
		})()
	`,
	]);

	try {
		const parsed = JSON.parse(raw);
		// Valid JSON that is not a list (e.g. "null") is treated like
		// unparseable output so callers always receive an array.
		return Array.isArray(parsed) ? parsed : [];
	} catch {
		// Best effort: unparseable page output means "no results".
		return [];
	}
}
141
-
142
- // ─── Wait for search results to load ───────────────────────────────
143
-
144
/**
 * Poll the page in `tab` until enough result headings are present or the
 * timeout elapses.
 *
 * A "result heading" is an `<h3>` nested inside a link whose `href` starts
 * with `http`. Each poll sleeps first and then counts; a failed `cdp` call is
 * treated as a count of 0.
 *
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {number} [timeoutMs=15000] - Maximum time to keep polling.
 * @param {number} [pollMs=600] - Delay before each poll.
 * @param {number} [minResults=3] - Heading count that ends the wait early.
 * @returns {Promise<number>} The heading count that satisfied `minResults`,
 *   or whatever count is present once the timeout has passed (possibly 0).
 */
async function waitForResults(
	tab,
	timeoutMs = 15000,
	pollMs = 600,
	minResults = 3,
) {
	const countExpression =
		"document.querySelectorAll('a[href^=\"http\"] h3').length";

	// Single place that asks the page for the current heading count.
	const countHeadings = async () => {
		const found = await cdp(["eval", tab, countExpression]).catch(() => "0");
		return Number.parseInt(found, 10) || 0;
	};

	const deadline = Date.now() + timeoutMs;
	while (Date.now() < deadline) {
		// Never sleep past the deadline.
		const remaining = Math.max(deadline - Date.now(), 0);
		await new Promise((resolve) =>
			setTimeout(resolve, Math.min(pollMs, remaining)),
		);
		const count = await countHeadings();
		if (count >= minResults) return count;
	}

	// Timed out: report whatever is on the page right now.
	return countHeadings();
}
163
-
164
- // ============================================================================
165
- // Main
166
- // ============================================================================
167
-
168
const USAGE =
	'Usage: node extractors/google-search.mjs "<query>" [--tab <prefix>] [--max <n>]\n';

/**
 * Pull the `--max <n>` option out of an argument list.
 *
 * The token after `--max` is consumed only when it exists and is not itself
 * another `--flag`, so `--max --tab foo` no longer swallows `--tab`.
 *
 * @param {string[]} args - Raw CLI arguments.
 * @param {number} [fallback=10] - Value used when `--max` is absent or its
 *   value is not a positive integer.
 * @returns {{ maxResults: number, rest: string[] }} The limit and a copy of
 *   `args` without the `--max` option.
 */
function takeMaxResultsFlag(args, fallback = 10) {
	const rest = [...args];
	const flagIdx = rest.indexOf("--max");
	if (flagIdx === -1) return { maxResults: fallback, rest };

	const value = rest[flagIdx + 1];
	const hasValue = value !== undefined && !String(value).startsWith("--");
	const parsed = hasValue ? Number.parseInt(value, 10) : Number.NaN;
	rest.splice(flagIdx, hasValue ? 2 : 1);

	return { maxResults: parsed > 0 ? parsed : fallback, rest };
}

/**
 * CLI entry point: open google.com in a tab, type the query, submit it, wait
 * for results and emit `{ query, url, results }` through `outputJson`.
 *
 * Anything thrown after argument parsing is passed to `handleError`.
 *
 * @returns {Promise<void>}
 */
async function main() {
	const args = await prepareArgs(process.argv.slice(2));
	validateQuery(args, USAGE);

	// Strip --max BEFORE parseArgs so it doesn't leak into the query.
	const { maxResults, rest: cleanArgs } = takeMaxResultsFlag(args);
	const { query, tabPrefix } = parseArgs(cleanArgs);

	const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

	try {
		await cdp(["list"]);
		const tab = await getOrOpenTab(tabPrefix);

		// Navigate to google.com
		await cdp(["nav", tab, "https://www.google.com"], 35000);
		await pause(TIMING.postNavSlow);
		await dismissConsent(tab, cdp);

		// Wait (up to 8 s) for the search box to be ready; a failed probe
		// counts as "not yet". If it never appears the flow still continues
		// and fails later at the results check.
		const searchBoxProbe = `!!document.querySelector(${JSON.stringify(SEARCH_BOX)})`;
		const deadline = Date.now() + 8000;
		while (Date.now() < deadline) {
			const ready = await cdp(["eval", tab, searchBoxProbe]).catch(
				() => "false",
			);
			if (ready === "true") break;
			await pause(TIMING.inputPoll);
		}

		// Type query and submit
		await typeIntoSearchBox(tab, query);
		await pause(TIMING.postType);
		await submitSearch(tab);

		// Wait for results
		const count = await waitForResults(tab, 15000);
		if (count === 0) {
			throw new Error("No search results found on page");
		}

		// Extract results
		const results = await extractResults(tab, maxResults);
		// If the page URL cannot be read, fall back to the canonical search URL.
		const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
			() => `https://www.google.com/search?q=${encodeURIComponent(query)}`,
		);

		outputJson({
			query,
			url: finalUrl,
			results,
		});
	} catch (e) {
		handleError(e);
	}
}
233
-
234
- main();