@apmantza/greedysearch-pi 1.8.9 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +503 -446
- package/bin/cdp.mjs +15 -2
- package/bin/search.mjs +679 -668
- package/extractors/bing-copilot.mjs +68 -11
- package/extractors/common.mjs +37 -2
- package/extractors/consent.mjs +388 -294
- package/extractors/gemini.mjs +217 -150
- package/extractors/perplexity.mjs +56 -7
- package/package.json +1 -1
- package/src/search/chrome.mjs +62 -1
- package/src/search/constants.mjs +1 -6
- package/src/search/engines.mjs +76 -67
- package/src/search/file-sources.mjs +46 -0
- package/src/search/query.mjs +49 -0
- package/src/search/recovery.mjs +20 -1
- package/src/search/sources.mjs +37 -21
- package/src/search/synthesis.mjs +27 -16
- package/extractors/bing-aria.mjs +0 -539
- package/extractors/google-search.mjs +0 -234
|
@@ -1,234 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// extractors/google-search.mjs
|
|
4
|
-
// Navigate google.com, type query into search box, submit, wait for results,
|
|
5
|
-
// return clean list of results (title, url, snippet).
|
|
6
|
-
//
|
|
7
|
-
// Usage:
|
|
8
|
-
// node extractors/google-search.mjs "<query>" [--tab <prefix>] [--max <n>]
|
|
9
|
-
//
|
|
10
|
-
// Output (stdout): JSON { query, url, results: [{ title, url, snippet }] }
|
|
11
|
-
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
12
|
-
|
|
13
|
-
import {
|
|
14
|
-
cdp,
|
|
15
|
-
formatAnswer,
|
|
16
|
-
getOrOpenTab,
|
|
17
|
-
handleError,
|
|
18
|
-
outputJson,
|
|
19
|
-
parseArgs,
|
|
20
|
-
prepareArgs,
|
|
21
|
-
TIMING,
|
|
22
|
-
validateQuery,
|
|
23
|
-
} from "./common.mjs";
|
|
24
|
-
import { dismissConsent } from "./consent.mjs";
|
|
25
|
-
|
|
26
|
-
// ─── Locale-agnostic selectors ──────────────────────────────────────
|
|
27
|
-
|
|
28
|
-
// Search box: textarea[name="q"] works across all Google locales; the
// input[name="q"] alternative is listed second in the same selector.
const SEARCH_BOX = 'textarea[name="q"], input[name="q"]';
// Submit: form button or keyboard Enter (we'll use Enter which is universal)
// Result containers: try multiple selectors that work across Google layouts.
// Frozen so the shared list cannot be mutated by accident at runtime.
const RESULT_SELECTORS = Object.freeze([
  ".g", // classic result container
  "[data-sokoban-container]", // newer layout
  ".MjjYud", // mobile-first layout
  "div:has(> a > h3)", // catch-all: div containing a link with heading
]);
|
|
38
|
-
|
|
39
|
-
// ─── Type into search box (locale-agnostic) ─────────────────────────
|
|
40
|
-
|
|
41
|
-
/**
 * Focus the search box in the given tab and type `text` into it.
 *
 * Both the selector and the text are embedded into the evaluated page script
 * with JSON.stringify, so quotes, backslashes and newlines cannot break out
 * of the generated string literals.
 *
 * @param {string} tab - Tab identifier passed through to `cdp`.
 * @param {string} text - Query text to insert into the search box.
 * @returns {Promise<boolean>} `true` when the page script reported that the
 *   search box was found and filled, `false` otherwise.
 */
async function typeIntoSearchBox(tab, text) {
  const script = `
    (function(selector, t) {
      var el = document.querySelector(selector);
      if (!el) return false;
      el.focus();
      el.value = '';
      // execCommand is deprecated and returns false when the command is
      // unsupported or disabled; in that case set the value directly and
      // notify listeners with an input event.
      if (!document.execCommand('insertText', false, t)) {
        el.value = t;
        el.dispatchEvent(new Event('input', { bubbles: true }));
      }
      return true;
    })(${JSON.stringify(SEARCH_BOX)}, ${JSON.stringify(text)})
  `;
  const outcome = await cdp(["eval", tab, script]);
  // NOTE(review): the eval command appears to hand results back as text
  // (main() compares against the string "true") — confirm against cdp.
  return String(outcome).trim() === "true";
}
|
|
57
|
-
|
|
58
|
-
// ─── Submit search (press Enter — locale agnostic) ──────────────────
|
|
59
|
-
|
|
60
|
-
/**
 * Submit the search by dispatching an Enter keydown on the search box and,
 * shortly afterwards, submitting the enclosing form as a fallback.
 *
 * The selector is embedded with JSON.stringify rather than hand-escaped
 * quotes, so any selector text is a valid string literal in the page script.
 *
 * @param {string} tab - Tab identifier passed through to `cdp`.
 * @returns {Promise<boolean>} `true` when the page script found the search
 *   box and dispatched the key event, `false` otherwise.
 */
async function submitSearch(tab) {
  const script = `
    (function(selector) {
      var el = document.querySelector(selector);
      if (!el) return false;
      // Press Enter key on the search box
      el.dispatchEvent(new KeyboardEvent('keydown', {key:'Enter', code:'Enter', keyCode:13, which:13, bubbles:true}));
      // Also try form submission as fallback
      var form = el.closest('form');
      if (form) {
        setTimeout(function() { form.submit(); }, 100);
      }
      return true;
    })(${JSON.stringify(SEARCH_BOX)})
  `;
  const outcome = await cdp(["eval", tab, script]);
  // NOTE(review): the eval command appears to hand results back as text
  // (main() compares against the string "true") — confirm against cdp.
  return String(outcome).trim() === "true";
}
|
|
80
|
-
|
|
81
|
-
// ─── Extract results ────────────────────────────────────────────────
|
|
82
|
-
|
|
83
|
-
/**
 * Collect organic results (title, url, snippet) from the results page that
 * is currently loaded in `tab`.
 *
 * The page script looks for <h3> headings nested in absolute links, skips
 * links on Google's own hosts (except `/search?` links), de-duplicates by
 * URL, and takes the snippet from the surrounding result container.
 *
 * @param {string} tab - Tab identifier passed through to `cdp`.
 * @param {number} [maxResults=10] - Upper bound on returned results. Values
 *   that are not finite numbers fall back to 10 so that nothing but a number
 *   is ever spliced into the evaluated script.
 * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
 *   Parsed results, or an empty array when the page output is not a JSON
 *   array.
 */
async function extractResults(tab, maxResults = 10) {
  const requested = Number(maxResults);
  const limit = Number.isFinite(requested)
    ? Math.max(0, Math.ceil(requested))
    : 10;
  // Reuse the module-level container list instead of repeating it inline.
  const containerSelector = RESULT_SELECTORS.join(", ");

  const raw = await cdp([
    "eval",
    tab,
    String.raw`
      (function(limit, containerSelector) {
        var results = [];
        var seen = new Set();

        // Decide by hostname, not by substring of the whole URL, so that
        // external pages mentioning "google.com" in a path or query are kept.
        function isGoogleHost(href) {
          try {
            var host = new URL(href).hostname;
            return ('.' + host + '.').includes('.google.com.');
          } catch (e) {
            return false;
          }
        }

        // Strategy: find all h3 headings inside links, then find their container for snippet
        var headings = document.querySelectorAll('a[href^="http"] h3');

        for (var i = 0; i < headings.length && results.length < limit; i++) {
          var h3 = headings[i];
          var a = h3.closest('a');
          if (!a) continue;

          var url = a.href;
          // Skip Google-internal links, but keep /search? links
          if (isGoogleHost(url) && !url.includes('/search?')) continue;
          if (seen.has(url)) continue;
          seen.add(url);

          var title = h3.innerText.trim();
          if (!title) continue;

          // Find the containing block for the snippet
          var container = a.closest(containerSelector);
          if (!container) container = a.parentElement;

          // Try multiple snippet selectors
          var snippetEl = container.querySelector('.VwiC3b, [data-sncf], span.aCOpRe, .lEBKkf, div[style*="-webkit-line-clamp"]');
          if (!snippetEl) {
            // Fallback: find the largest text block that's not the title
            var candidates = Array.from(container.querySelectorAll('span, div'))
              .filter(function(el) {
                var t = el.innerText?.trim();
                return t && t.length > 30 && t !== title && !el.querySelector('h3');
              })
              .sort(function(x, y) { return y.innerText.length - x.innerText.length; });
            if (candidates[0]) snippetEl = candidates[0];
          }
          var snippet = snippetEl ? snippetEl.innerText.trim().slice(0, 300) : '';

          results.push({ title: title, url: url, snippet: snippet });
        }

        return JSON.stringify(results);
      })(${limit}, ${JSON.stringify(containerSelector)})
    `,
  ]);

  try {
    const parsed = JSON.parse(raw);
    return Array.isArray(parsed) ? parsed : [];
  } catch {
    // Unparseable page output is treated as "no results".
    return [];
  }
}
|
|
141
|
-
|
|
142
|
-
// ─── Wait for search results to load ───────────────────────────────
|
|
143
|
-
|
|
144
|
-
// Count the result headings currently on the page; a failed eval or a
// non-numeric reply counts as 0.
async function countResultHeadings(tab) {
  const found = await cdp([
    "eval",
    tab,
    "document.querySelectorAll('a[href^=\"http\"] h3').length",
  ]).catch(() => "0");
  return Number.parseInt(found, 10) || 0;
}

/**
 * Poll the tab until enough result headings are present or the timeout
 * elapses.
 *
 * Each round sleeps first and then counts, so the page gets time to start
 * loading. The sleep is clamped to the time remaining, so a short timeout is
 * not overshot by a full poll interval. After the deadline one last count is
 * taken and returned as-is.
 *
 * @param {string} tab - Tab identifier passed through to `cdp`.
 * @param {number} [timeoutMs=15000] - Maximum time to keep polling.
 * @param {number} [pollMs=600] - Delay between polls.
 * @param {number} [minResults=3] - Heading count that ends the wait early.
 * @returns {Promise<number>} Number of result headings seen (0 when none).
 */
async function waitForResults(
  tab,
  timeoutMs = 15000,
  pollMs = 600,
  minResults = 3,
) {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    const remaining = deadline - Date.now();
    if (remaining <= 0) break;
    await new Promise((resolve) =>
      setTimeout(resolve, Math.min(pollMs, remaining)),
    );
    const count = await countResultHeadings(tab);
    if (count >= minResults) return count;
  }
  // Last chance: report whatever is on the page once the deadline passed.
  return countResultHeadings(tab);
}
|
|
163
|
-
|
|
164
|
-
// ============================================================================
|
|
165
|
-
// Main
|
|
166
|
-
// ============================================================================
|
|
167
|
-
|
|
168
|
-
const USAGE =
  'Usage: node extractors/google-search.mjs "<query>" [--tab <prefix>] [--max <n>]\n';

/**
 * CLI entry point: open google.com in a tab, type the query, submit it, wait
 * for results and emit them through `outputJson` as
 * `{ query, url, results }`.
 *
 * `--max <n>` is removed from the argument list before `parseArgs` runs so it
 * cannot leak into the query. Only a positive integer is accepted as the
 * limit; anything else keeps the default of 10. A following token that is
 * itself a flag (starts with `--`) is left in place rather than swallowed as
 * the value of `--max`.
 *
 * Failures inside the browser workflow are passed to `handleError`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  // Parse --max flag BEFORE parseArgs so it doesn't leak into query
  let maxResults = 10;
  const cleanArgs = [...args];
  const maxIdx = cleanArgs.indexOf("--max");
  if (maxIdx !== -1) {
    const value = cleanArgs[maxIdx + 1];
    const hasValue = value !== undefined && !String(value).startsWith("--");
    const parsed = hasValue ? Number.parseInt(value, 10) : Number.NaN;
    if (Number.isInteger(parsed) && parsed > 0) {
      maxResults = parsed;
    }
    // Remove --max and, when present, its value
    cleanArgs.splice(maxIdx, hasValue ? 2 : 1);
  }

  const { query, tabPrefix } = parseArgs(cleanArgs);

  try {
    await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // Navigate to google.com
    await cdp(["nav", tab, "https://www.google.com"], 35000);
    await new Promise((r) => setTimeout(r, TIMING.postNavSlow));
    await dismissConsent(tab, cdp);

    // Wait for search box to be ready
    const readyProbe = `!!document.querySelector(${JSON.stringify(SEARCH_BOX)})`;
    const deadline = Date.now() + 8000;
    let searchBoxReady = false;
    while (Date.now() < deadline) {
      const ready = await cdp(["eval", tab, readyProbe]).catch(() => "false");
      if (ready === "true") {
        searchBoxReady = true;
        break;
      }
      await new Promise((r) => setTimeout(r, TIMING.inputPoll));
    }
    if (!searchBoxReady) {
      // Keep going as before, but leave a trace on stderr so a later
      // "no results" failure can be attributed to the missing search box.
      process.stderr.write(
        "Warning: Google search box was not detected within 8s; continuing anyway\n",
      );
    }

    // Type query and submit
    await typeIntoSearchBox(tab, query);
    await new Promise((r) => setTimeout(r, TIMING.postType));
    await submitSearch(tab);

    // Wait for results
    const count = await waitForResults(tab, 15000);
    if (count === 0) {
      throw new Error("No search results found on page");
    }

    // Extract results
    const results = await extractResults(tab, maxResults);
    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => `https://www.google.com/search?q=${encodeURIComponent(query)}`,
    );

    outputJson({
      query,
      url: finalUrl,
      results,
    });
  } catch (e) {
    handleError(e);
  }
}
|
|
233
|
-
|
|
234
|
-
main();
|