@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/extractors/google-ai.mjs
CHANGED
|
@@ -1,128 +1,129 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// extractors/google-ai.mjs
|
|
4
|
-
// Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
|
|
5
|
-
//
|
|
6
|
-
// Usage:
|
|
7
|
-
// node extractors/google-ai.mjs "<query>" [--tab <prefix>]
|
|
8
|
-
//
|
|
9
|
-
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
-
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
-
|
|
12
|
-
import {
|
|
13
|
-
cdp,
|
|
14
|
-
formatAnswer,
|
|
15
|
-
getOrOpenTab,
|
|
16
|
-
handleError,
|
|
17
|
-
jitter,
|
|
18
|
-
outputJson,
|
|
19
|
-
parseArgs,
|
|
20
|
-
prepareArgs,
|
|
21
|
-
TIMING,
|
|
22
|
-
validateQuery,
|
|
23
|
-
waitForStreamComplete,
|
|
24
|
-
} from "./common.mjs";
|
|
25
|
-
import { dismissConsent, handleVerification } from "./consent.mjs";
|
|
26
|
-
import { SELECTORS } from "./selectors.mjs";
|
|
27
|
-
|
|
28
|
-
const S = SELECTORS.google;
|
|
29
|
-
|
|
30
|
-
const MIN_ANSWER_LENGTH = 50;
|
|
31
|
-
|
|
32
|
-
async function extractAnswer(tab) {
|
|
33
|
-
const excludeFilter = S.sourceExclude
|
|
34
|
-
.map((e) => `!a.href.includes('${e}')`)
|
|
35
|
-
.join(" && ");
|
|
36
|
-
const raw = await cdp([
|
|
37
|
-
"eval",
|
|
38
|
-
tab,
|
|
39
|
-
String.raw`
|
|
40
|
-
(function() {
|
|
41
|
-
var el = document.querySelector('${S.answerContainer}');
|
|
42
|
-
if (!el) return JSON.stringify({ answer: '', sources: [] });
|
|
43
|
-
var answer = el.innerText.trim();
|
|
44
|
-
var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
|
|
45
|
-
.filter(a => ${excludeFilter})
|
|
46
|
-
.map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
|
|
47
|
-
.filter(s => s.url && s.url.length > 10)
|
|
48
|
-
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
49
|
-
.slice(0, 10);
|
|
50
|
-
return JSON.stringify({ answer, sources });
|
|
51
|
-
})()
|
|
52
|
-
`,
|
|
53
|
-
]);
|
|
54
|
-
return JSON.parse(raw);
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
// ============================================================================
|
|
58
|
-
// Main
|
|
59
|
-
// ============================================================================
|
|
60
|
-
|
|
61
|
-
const USAGE =
|
|
62
|
-
'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
|
|
63
|
-
|
|
64
|
-
async function main() {
|
|
65
|
-
const args = await prepareArgs(process.argv.slice(2));
|
|
66
|
-
validateQuery(args, USAGE);
|
|
67
|
-
|
|
68
|
-
const { query, tabPrefix, short, locale } = parseArgs(args);
|
|
69
|
-
|
|
70
|
-
try {
|
|
71
|
-
// Only refresh page list when creating a fresh tab (no prefix provided)
|
|
72
|
-
if (!tabPrefix) await cdp(["list"]);
|
|
73
|
-
const tab = await getOrOpenTab(tabPrefix);
|
|
74
|
-
|
|
75
|
-
// Build URL with language parameter (default to English)
|
|
76
|
-
const langParam = locale ? `&hl=${encodeURIComponent(locale)}` : "&hl=en";
|
|
77
|
-
const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50${langParam}`;
|
|
78
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
79
|
-
await dismissConsent(tab, cdp);
|
|
80
|
-
|
|
81
|
-
// If consent redirected us away, navigate back
|
|
82
|
-
const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
|
|
83
|
-
() => "",
|
|
84
|
-
);
|
|
85
|
-
if (!currentUrl.includes("google.com/search")) {
|
|
86
|
-
await cdp(["nav", tab, url], 20000);
|
|
87
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
// Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
|
|
91
|
-
const verifyResult = await handleVerification(tab, cdp, 10000);
|
|
92
|
-
if (verifyResult === "needs-human")
|
|
93
|
-
throw new Error(
|
|
94
|
-
"Google verification required — could not be completed automatically",
|
|
95
|
-
);
|
|
96
|
-
if (verifyResult === "clicked" || verifyResult === "cleared-by-user") {
|
|
97
|
-
// Re-navigate to the search URL after verification
|
|
98
|
-
await cdp(["nav", tab, url], 20000);
|
|
99
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
await waitForStreamComplete(tab, {
|
|
103
|
-
timeout: 30000,
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// extractors/google-ai.mjs
|
|
4
|
+
// Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node extractors/google-ai.mjs "<query>" [--tab <prefix>]
|
|
8
|
+
//
|
|
9
|
+
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
+
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
cdp,
|
|
14
|
+
formatAnswer,
|
|
15
|
+
getOrOpenTab,
|
|
16
|
+
handleError,
|
|
17
|
+
jitter,
|
|
18
|
+
outputJson,
|
|
19
|
+
parseArgs,
|
|
20
|
+
prepareArgs,
|
|
21
|
+
TIMING,
|
|
22
|
+
validateQuery,
|
|
23
|
+
waitForStreamComplete,
|
|
24
|
+
} from "./common.mjs";
|
|
25
|
+
import { dismissConsent, handleVerification } from "./consent.mjs";
|
|
26
|
+
import { SELECTORS } from "./selectors.mjs";
|
|
27
|
+
|
|
28
|
+
const S = SELECTORS.google;
|
|
29
|
+
|
|
30
|
+
const MIN_ANSWER_LENGTH = 50;
|
|
31
|
+
|
|
32
|
+
/**
 * Read the AI Mode answer text and its cited links out of the page.
 *
 * Builds a self-contained script from the `SELECTORS.google` entries (`S`),
 * runs it in the tab through `cdp(["eval", ...])`, and parses the JSON string
 * that the script returns.
 *
 * @param {string} tab - Tab identifier forwarded unchanged to `cdp`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 *   `answer` is the trimmed `innerText` of the answer container; both fields
 *   are empty (`''` / `[]`) when the container is not found. `sources` holds at
 *   most 10 de-duplicated links, with titles capped at 100 characters.
 * @throws {SyntaxError} If the value returned by `cdp` is not valid JSON.
 */
async function extractAnswer(tab) {
  // Emit a single-quoted JS string literal. Backslashes and single quotes are
  // escaped so a selector or exclude entry containing either cannot terminate
  // the literal early or lose characters when the page parses the script.
  const quote = (value) => `'${String(value).replace(/[\\']/g, "\\$&")}'`;

  // One `!a.href.includes(...)` clause per excluded substring. An empty list
  // must still yield a valid predicate, hence the `true` fallback.
  const excludeFilter =
    S.sourceExclude.map((e) => `!a.href.includes(${quote(e)})`).join(" && ") ||
    "true";

  // String.raw keeps `\n` below as a literal backslash-n, so it is the page's
  // JS engine (not this template) that interprets it as a newline.
  const script = String.raw`
(function() {
  var el = document.querySelector(${quote(S.answerContainer)});
  if (!el) return JSON.stringify({ answer: '', sources: [] });
  var answer = el.innerText.trim();
  var sources = Array.from(document.querySelectorAll(${quote(S.sourceLink)}))
    .filter(a => ${excludeFilter})
    .map(a => ({ url: a.href.split('#')[0], title: (a.closest(${quote(S.sourceHeadingParent)})?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
    .filter(s => s.url && s.url.length > 10)
    .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
    .slice(0, 10);
  return JSON.stringify({ answer, sources });
})()
`;

  const raw = await cdp(["eval", tab, script]);
  return JSON.parse(raw);
}
|
|
56
|
+
|
|
57
|
+
// ============================================================================
|
|
58
|
+
// Main
|
|
59
|
+
// ============================================================================
|
|
60
|
+
|
|
61
|
+
const USAGE =
|
|
62
|
+
'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
|
|
63
|
+
|
|
64
|
+
/**
 * CLI entry point: run one Google AI Mode query and emit the result as JSON.
 *
 * Flow: parse arguments, obtain a tab, navigate to the `udm=50` search URL,
 * clear consent / verification interstitials, wait for the streamed answer to
 * settle, extract answer and sources, then hand
 * `{ query, url, answer, sources }` to `outputJson`.
 *
 * Argument preparation and validation run outside the `try`, so failures there
 * reject the returned promise. Everything after is routed to `handleError`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short, locale } = parseArgs(args);

  try {
    // Only refresh page list when creating a fresh tab (no prefix provided)
    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // Build URL with language parameter (default to English)
    const langParam = locale ? `&hl=${encodeURIComponent(locale)}` : "&hl=en";
    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50${langParam}`;

    // Always load the page for THIS query first. Without this, a reused tab
    // that is still on an earlier google.com/search page passes the URL check
    // below and the previous query's answer would be extracted.
    await cdp(["nav", tab, url], 20000);
    await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    await dismissConsent(tab, cdp);

    // If consent redirected us away, navigate back
    const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    if (!currentUrl.includes("google.com/search")) {
      await cdp(["nav", tab, url], 20000);
      await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    }

    // Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
    const verifyResult = await handleVerification(tab, cdp, 10000);
    if (verifyResult === "needs-human") {
      throw new Error(
        "Google verification required — could not be completed automatically",
      );
    }
    if (verifyResult === "clicked" || verifyResult === "cleared-by-user") {
      // Re-navigate to the search URL after verification
      await cdp(["nav", tab, url], 20000);
      await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    }

    // Wait on the answer container itself, using the same selector that
    // extractAnswer reads, before extracting.
    await waitForStreamComplete(tab, {
      timeout: 30000,
      stableRounds: 5,
      selector: `document.querySelector('${S.answerContainer}')`,
      minLength: MIN_ANSWER_LENGTH,
    });

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) {
      throw new Error(
        "No answer extracted — Google AI Mode may not have responded",
      );
    }

    // Report the URL the tab actually ended on; fall back to the requested
    // URL if it cannot be read.
    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => url,
    );
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}
|
|
128
|
+
|
|
129
|
+
main();
|