@apmantza/greedysearch-pi 1.9.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -1
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +197 -68
- package/extractors/bing-copilot.mjs +42 -4
- package/extractors/chatgpt.mjs +436 -0
- package/extractors/common.mjs +155 -21
- package/extractors/consensus.mjs +655 -0
- package/extractors/gemini.mjs +335 -217
- package/extractors/logically.mjs +567 -0
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +7 -3
- package/skills/greedy-search/skill.md +9 -3
- package/src/fetcher.mjs +8 -1
- package/src/formatters/results.ts +163 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +150 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/recovery.mjs +51 -45
- package/src/search/research.mjs +1059 -61
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +124 -52
- package/src/tools/shared.ts +187 -186
- package/src/types.ts +110 -104
- package/test.mjs +377 -6
package/bin/launch.mjs
CHANGED
|
@@ -76,6 +76,15 @@ const BASE_CHROME_FLAGS = [
|
|
|
76
76
|
"--window-size=1920,1080",
|
|
77
77
|
"--lang=en-US",
|
|
78
78
|
"--force-color-profile=srgb",
|
|
79
|
+
// Background-tab throttling kills parallel extractions: Chrome clamps
|
|
80
|
+
// setTimeout to ~1Hz in unfocused tabs, so a streaming response that
|
|
81
|
+
// finishes in 5s solo takes 60s+ when 4 engines share one Chrome.
|
|
82
|
+
// The trio below restores full-speed JS in every tab. Safe for our
|
|
83
|
+
// anti-bot stealth — Cloudflare detects CDP/webdriver artifacts, not
|
|
84
|
+
// timer-throttling behavior. Same flags Playwright/Puppeteer add.
|
|
85
|
+
"--disable-background-timer-throttling",
|
|
86
|
+
"--disable-renderer-backgrounding",
|
|
87
|
+
"--disable-backgrounding-occluded-windows",
|
|
79
88
|
];
|
|
80
89
|
|
|
81
90
|
function getChromeVersion(chromePath) {
|
package/bin/search.mjs
CHANGED
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
// node search.mjs gem "latest React features"
|
|
21
21
|
// node search.mjs all "how does TCP congestion control work"
|
|
22
22
|
|
|
23
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
23
|
+
import { appendFileSync, existsSync, readFileSync } from "node:fs";
|
|
24
24
|
// Config file for user defaults
|
|
25
25
|
import { homedir } from "node:os";
|
|
26
26
|
import { join } from "node:path";
|
|
@@ -33,7 +33,12 @@ import {
|
|
|
33
33
|
openNewTab,
|
|
34
34
|
touchActivity,
|
|
35
35
|
} from "../src/search/chrome.mjs";
|
|
36
|
-
import {
|
|
36
|
+
import {
|
|
37
|
+
ALL_ENGINES,
|
|
38
|
+
ENGINES,
|
|
39
|
+
SYNTHESIZER,
|
|
40
|
+
VISIBLE_RECOVERY_LOG,
|
|
41
|
+
} from "../src/search/constants.mjs";
|
|
37
42
|
import { runExtractor } from "../src/search/engines.mjs";
|
|
38
43
|
import {
|
|
39
44
|
fetchMultipleSources,
|
|
@@ -51,7 +56,11 @@ import {
|
|
|
51
56
|
mergeFetchDataIntoSources,
|
|
52
57
|
} from "../src/search/sources.mjs";
|
|
53
58
|
import { buildConfidence } from "../src/search/synthesis.mjs";
|
|
54
|
-
import {
|
|
59
|
+
import {
|
|
60
|
+
getSynthesisStartUrl,
|
|
61
|
+
normalizeSynthesizer,
|
|
62
|
+
synthesizeResults,
|
|
63
|
+
} from "../src/search/synthesis-runner.mjs";
|
|
55
64
|
import { normalizeQuery } from "../src/search/query.mjs";
|
|
56
65
|
import { runResearchMode } from "../src/search/research.mjs";
|
|
57
66
|
|
|
@@ -69,6 +78,18 @@ function loadUserConfig() {
|
|
|
69
78
|
return {};
|
|
70
79
|
}
|
|
71
80
|
|
|
81
|
+
function logVisibleRecovery(event) {
|
|
82
|
+
try {
|
|
83
|
+
appendFileSync(
|
|
84
|
+
VISIBLE_RECOVERY_LOG,
|
|
85
|
+
`${JSON.stringify({ at: new Date().toISOString(), ...event })}\n`,
|
|
86
|
+
"utf8",
|
|
87
|
+
);
|
|
88
|
+
} catch {
|
|
89
|
+
// Best-effort diagnostics only. Never fail a search because logging failed.
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
72
93
|
/** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
|
|
73
94
|
async function readStdin() {
|
|
74
95
|
return new Promise((resolve) => {
|
|
@@ -89,16 +110,20 @@ async function main() {
|
|
|
89
110
|
`${[
|
|
90
111
|
'Usage: node search.mjs <engine> "<query>"',
|
|
91
112
|
"",
|
|
92
|
-
"Engines: perplexity (p),
|
|
113
|
+
"Engines: all, perplexity (p), google (g), chatgpt (gpt), gemini (gem), semantic-scholar (s2), logically (log), bing (b)",
|
|
93
114
|
"",
|
|
94
115
|
"Flags:",
|
|
95
|
-
" --
|
|
96
|
-
" --
|
|
97
|
-
" --
|
|
116
|
+
" --synthesize For engine=all: synthesize fetched sources",
|
|
117
|
+
" --synthesizer <engine> Synthesis engine (default from ~/.pi/greedyconfig)",
|
|
118
|
+
" --fast Legacy quick mode: no source fetching or synthesis",
|
|
119
|
+
" --depth <mode> Legacy: fast|standard|deep aliases, or research",
|
|
120
|
+
" --deep-research Deprecated alias for --research",
|
|
98
121
|
" --research Iterative query/learnings loop (alias: --depth research)",
|
|
99
122
|
" --breadth <n> Research mode query breadth, 1-5 (default: 3)",
|
|
100
123
|
" --iterations <n> Research mode rounds, 1-3 (default: 2)",
|
|
101
124
|
" --max-sources <n> Research mode fetched source cap, 3-12",
|
|
125
|
+
" --research-out-dir <dir> Write research bundle to a specific directory",
|
|
126
|
+
" --no-research-bundle Disable the default .pi/greedysearch-research bundle",
|
|
102
127
|
" --fetch-top-source Fetch content from top source",
|
|
103
128
|
" --inline Output JSON to stdout (for piping)",
|
|
104
129
|
" --locale <lang> Force results language (en, de, fr, etc.)",
|
|
@@ -112,10 +137,11 @@ async function main() {
|
|
|
112
137
|
" GREEDY_SEARCH_LOCALE Default locale (default: en)",
|
|
113
138
|
"",
|
|
114
139
|
"Examples:",
|
|
115
|
-
' node search.mjs all "Node.js streams"
|
|
116
|
-
' node search.mjs all "
|
|
140
|
+
' node search.mjs all "Node.js streams" # Grounded: engines + fetched sources',
|
|
141
|
+
' node search.mjs all "Node.js streams" --synthesize # Add Gemini synthesis',
|
|
142
|
+
' node search.mjs all "quick check" --fast # Legacy fast: no sources/synthesis',
|
|
117
143
|
' node search.mjs all "browser automation" --research --breadth 3 --iterations 2',
|
|
118
|
-
' node search.mjs p "what is memoization"
|
|
144
|
+
' node search.mjs p "what is memoization" # Single engine search',
|
|
119
145
|
].join("\n")}\n`,
|
|
120
146
|
);
|
|
121
147
|
process.exit(1);
|
|
@@ -129,6 +155,11 @@ async function main() {
|
|
|
129
155
|
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
130
156
|
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
131
157
|
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
158
|
+
} else if (process.env.GREEDY_SEARCH_VISIBLE !== "1") {
|
|
159
|
+
// Establish the desired mode BEFORE ensureChrome() so a stale visible
|
|
160
|
+
// recovery browser is switched back to headless before research planning
|
|
161
|
+
// and Gemini synthesis tabs are opened.
|
|
162
|
+
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
132
163
|
}
|
|
133
164
|
|
|
134
165
|
await ensureChrome();
|
|
@@ -136,41 +167,44 @@ async function main() {
|
|
|
136
167
|
// Track activity for headless idle timeout
|
|
137
168
|
touchActivity();
|
|
138
169
|
|
|
139
|
-
// Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
|
|
140
170
|
const depthIdx = args.indexOf("--depth");
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
} else if (args.includes("--fast")) {
|
|
146
|
-
depth = "fast"; // Explicit fast mode requested
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
// For single engine (not "all"), default to fast unless explicit
|
|
171
|
+
const legacyDepth =
|
|
172
|
+
depthIdx !== -1 && args[depthIdx + 1]
|
|
173
|
+
? args[depthIdx + 1].toLowerCase()
|
|
174
|
+
: null;
|
|
150
175
|
const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
176
|
+
const researchMode =
|
|
177
|
+
args.includes("--research") ||
|
|
178
|
+
args.includes("--deep-research") ||
|
|
179
|
+
legacyDepth === "research";
|
|
180
|
+
const legacyFast = args.includes("--fast") || legacyDepth === "fast";
|
|
181
|
+
const legacySynthesisDepth =
|
|
182
|
+
legacyDepth === "standard" ||
|
|
183
|
+
legacyDepth === "deep" ||
|
|
184
|
+
args.includes("--deep");
|
|
185
|
+
const shouldFetchSources = engineArg === "all" && !legacyFast;
|
|
186
|
+
const shouldSynthesize =
|
|
187
|
+
engineArg === "all" &&
|
|
188
|
+
!legacyFast &&
|
|
189
|
+
(args.includes("--synthesize") || legacySynthesisDepth);
|
|
190
|
+
const groundedSynthesis = legacyDepth === "deep" || args.includes("--deep");
|
|
154
191
|
|
|
155
|
-
// --deep-research / --deep flags map to deep mode (backward compat)
|
|
156
192
|
if (args.includes("--deep-research")) {
|
|
157
|
-
depth = "standard";
|
|
158
193
|
process.stderr.write(
|
|
159
|
-
"[greedysearch] --deep-research is deprecated; use --
|
|
194
|
+
"[greedysearch] --deep-research is deprecated; use --research or --depth research\n",
|
|
160
195
|
);
|
|
161
196
|
}
|
|
162
|
-
if (
|
|
163
|
-
depth = "deep";
|
|
164
|
-
}
|
|
165
|
-
if (args.includes("--research")) {
|
|
166
|
-
depth = "research";
|
|
167
|
-
}
|
|
168
|
-
if (args.includes("--synthesize")) {
|
|
197
|
+
if (legacySynthesisDepth) {
|
|
169
198
|
process.stderr.write(
|
|
170
|
-
"[greedysearch]
|
|
199
|
+
"[greedysearch] depth fast|standard|deep is deprecated; use default grounded search plus --synthesize when needed\n",
|
|
171
200
|
);
|
|
172
201
|
}
|
|
173
202
|
|
|
203
|
+
const synthesizerIdx = args.indexOf("--synthesizer");
|
|
204
|
+
const synthesizer = normalizeSynthesizer(
|
|
205
|
+
synthesizerIdx === -1 ? SYNTHESIZER : args[synthesizerIdx + 1],
|
|
206
|
+
);
|
|
207
|
+
|
|
174
208
|
const full = args.includes("--full");
|
|
175
209
|
const short = !full;
|
|
176
210
|
const fetchSource = args.includes("--fetch-top-source");
|
|
@@ -183,9 +217,10 @@ async function main() {
|
|
|
183
217
|
iterationsIdx === -1 ? undefined : args[iterationsIdx + 1];
|
|
184
218
|
const researchMaxSources =
|
|
185
219
|
maxSourcesIdx === -1 ? undefined : args[maxSourcesIdx + 1];
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
220
|
+
const researchOutDirIdx = args.indexOf("--research-out-dir");
|
|
221
|
+
const researchOutDir =
|
|
222
|
+
researchOutDirIdx === -1 ? undefined : args[researchOutDirIdx + 1];
|
|
223
|
+
const writeResearchBundle = !args.includes("--no-research-bundle");
|
|
189
224
|
const outIdx = args.indexOf("--out");
|
|
190
225
|
const outFile = outIdx === -1 ? null : args[outIdx + 1];
|
|
191
226
|
|
|
@@ -218,18 +253,23 @@ async function main() {
|
|
|
218
253
|
a !== "--visible" &&
|
|
219
254
|
a !== "--always-visible" &&
|
|
220
255
|
a !== "--depth" &&
|
|
256
|
+
a !== "--synthesizer" &&
|
|
221
257
|
a !== "--out" &&
|
|
222
258
|
a !== "--locale" &&
|
|
223
259
|
a !== "--breadth" &&
|
|
224
260
|
a !== "--iterations" &&
|
|
225
261
|
a !== "--max-sources" &&
|
|
262
|
+
a !== "--research-out-dir" &&
|
|
263
|
+
a !== "--no-research-bundle" &&
|
|
226
264
|
a !== "--help" &&
|
|
227
265
|
(depthIdx === -1 || i !== depthIdx + 1) &&
|
|
266
|
+
(synthesizerIdx === -1 || i !== synthesizerIdx + 1) &&
|
|
228
267
|
(outIdx === -1 || i !== outIdx + 1) &&
|
|
229
268
|
(localeIdx === -1 || i !== localeIdx + 1) &&
|
|
230
269
|
(breadthIdx === -1 || i !== breadthIdx + 1) &&
|
|
231
270
|
(iterationsIdx === -1 || i !== iterationsIdx + 1) &&
|
|
232
|
-
(maxSourcesIdx === -1 || i !== maxSourcesIdx + 1)
|
|
271
|
+
(maxSourcesIdx === -1 || i !== maxSourcesIdx + 1) &&
|
|
272
|
+
(researchOutDirIdx === -1 || i !== researchOutDirIdx + 1),
|
|
233
273
|
);
|
|
234
274
|
const engine = rest[0]?.toLowerCase();
|
|
235
275
|
// Read query from stdin when --stdin flag is set (avoids leaking query in process table)
|
|
@@ -241,7 +281,7 @@ async function main() {
|
|
|
241
281
|
query = rest.slice(1).join(" ");
|
|
242
282
|
}
|
|
243
283
|
|
|
244
|
-
if (
|
|
284
|
+
if (researchMode) {
|
|
245
285
|
if (engine !== "all") {
|
|
246
286
|
process.stderr.write(
|
|
247
287
|
`[greedysearch] Research mode uses all engines; ignoring engine "${engine}".\n`,
|
|
@@ -254,6 +294,8 @@ async function main() {
|
|
|
254
294
|
maxSources: researchMaxSources,
|
|
255
295
|
locale,
|
|
256
296
|
short,
|
|
297
|
+
writeBundle: writeResearchBundle,
|
|
298
|
+
researchOutDir,
|
|
257
299
|
});
|
|
258
300
|
writeOutput(out, outFile, {
|
|
259
301
|
inline,
|
|
@@ -270,8 +312,11 @@ async function main() {
|
|
|
270
312
|
// engine homepage so extractors can skip the initial navigation.
|
|
271
313
|
const ENGINE_START_URLS = {
|
|
272
314
|
perplexity: "https://www.perplexity.ai/",
|
|
273
|
-
bing: "https://copilot.microsoft.com/",
|
|
274
315
|
google: "https://www.google.com/",
|
|
316
|
+
"semantic-scholar": "https://www.semanticscholar.org/",
|
|
317
|
+
semanticscholar: "https://www.semanticscholar.org/",
|
|
318
|
+
s2: "https://www.semanticscholar.org/",
|
|
319
|
+
logically: "https://logically.app/research-assistant/",
|
|
275
320
|
};
|
|
276
321
|
const engineTabs = await Promise.all(
|
|
277
322
|
ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
|
|
@@ -280,11 +325,10 @@ async function main() {
|
|
|
280
325
|
await cdp(["list"]);
|
|
281
326
|
|
|
282
327
|
// Time-bounded per-engine extraction so slow engines don't stall the batch.
|
|
283
|
-
// Bing can take a little longer than Google/Perplexity under CDP contention;
|
|
284
|
-
// keep fast mode bounded while avoiding most false recovery trips.
|
|
285
328
|
const engineTimeoutFor = (engineName) => {
|
|
286
|
-
if (
|
|
287
|
-
|
|
329
|
+
if (!legacyFast) return 70000;
|
|
330
|
+
// ChatGPT needs ~25-30s solo; under CDP contention needs more headroom
|
|
331
|
+
return engineName === "chatgpt" ? 60000 : 35000;
|
|
288
332
|
};
|
|
289
333
|
|
|
290
334
|
try {
|
|
@@ -316,7 +360,19 @@ async function main() {
|
|
|
316
360
|
if (r.status === "fulfilled") {
|
|
317
361
|
out[r.value.engine] = r.value;
|
|
318
362
|
} else {
|
|
319
|
-
|
|
363
|
+
const err = r.reason;
|
|
364
|
+
const msg = err?.message || "unknown error";
|
|
365
|
+
out[ALL_ENGINES[i]] = { error: msg };
|
|
366
|
+
if (err?.lastStage) {
|
|
367
|
+
process.stderr.write(
|
|
368
|
+
`[greedysearch] ${ALL_ENGINES[i]} failed at stage '${err.lastStage}': ${msg}\n`,
|
|
369
|
+
);
|
|
370
|
+
}
|
|
371
|
+
if (err?.partialErr) {
|
|
372
|
+
process.stderr.write(
|
|
373
|
+
`[greedysearch] ${ALL_ENGINES[i]} tail stderr:\n${err.partialErr}\n`,
|
|
374
|
+
);
|
|
375
|
+
}
|
|
320
376
|
}
|
|
321
377
|
}
|
|
322
378
|
|
|
@@ -331,6 +387,20 @@ async function main() {
|
|
|
331
387
|
recoveryCandidates.length > 0 &&
|
|
332
388
|
process.env.GREEDY_SEARCH_VISIBLE !== "1"
|
|
333
389
|
) {
|
|
390
|
+
logVisibleRecovery({
|
|
391
|
+
scope: "all",
|
|
392
|
+
phase: "start",
|
|
393
|
+
engines: recoveryCandidates,
|
|
394
|
+
reasons: Object.fromEntries(
|
|
395
|
+
recoveryCandidates.map((engineName) => [
|
|
396
|
+
engineName,
|
|
397
|
+
{
|
|
398
|
+
error: out[engineName]?.error || null,
|
|
399
|
+
envelope: out[engineName]?._envelope || null,
|
|
400
|
+
},
|
|
401
|
+
]),
|
|
402
|
+
),
|
|
403
|
+
});
|
|
334
404
|
process.stderr.write(
|
|
335
405
|
`[greedysearch] 🔓 Headless ${recoveryCandidates.join(", ")} search hit timeout/verification/antibot signals — retrying visible to establish cookies...\n`,
|
|
336
406
|
);
|
|
@@ -431,6 +501,23 @@ async function main() {
|
|
|
431
501
|
stillBlocked.push(...secondStillBlocked);
|
|
432
502
|
}
|
|
433
503
|
|
|
504
|
+
logVisibleRecovery({
|
|
505
|
+
scope: "all",
|
|
506
|
+
phase: stillBlocked.length > 0 ? "needs-human" : "success",
|
|
507
|
+
engines: recoveryCandidates,
|
|
508
|
+
results: Object.fromEntries(
|
|
509
|
+
recoveryCandidates.map((engineName) => [
|
|
510
|
+
engineName,
|
|
511
|
+
{
|
|
512
|
+
mode: out[engineName]?._envelope?.mode || null,
|
|
513
|
+
durationMs: out[engineName]?._envelope?.durationMs || null,
|
|
514
|
+
lastStage: out[engineName]?._envelope?.lastStage || null,
|
|
515
|
+
error: out[engineName]?.error || null,
|
|
516
|
+
},
|
|
517
|
+
]),
|
|
518
|
+
),
|
|
519
|
+
});
|
|
520
|
+
|
|
434
521
|
if (stillBlocked.length > 0) {
|
|
435
522
|
for (const blockedEngine of stillBlocked) {
|
|
436
523
|
process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
|
|
@@ -488,18 +575,9 @@ async function main() {
|
|
|
488
575
|
// Build a canonical source registry across all engines
|
|
489
576
|
out._sources = buildSourceRegistry(out, query);
|
|
490
577
|
|
|
491
|
-
// Pre-navigate Gemini tab in parallel with source fetch so the page
|
|
492
|
-
// is already loaded when synthesis starts — saves ~4s of nav time.
|
|
493
|
-
let geminiTabPromise = null;
|
|
494
|
-
if (depth !== "fast") {
|
|
495
|
-
geminiTabPromise = openNewTab("https://gemini.google.com/app").catch(
|
|
496
|
-
() => null,
|
|
497
|
-
);
|
|
498
|
-
}
|
|
499
|
-
|
|
500
578
|
// Source fetching: default for all "all" searches
|
|
501
579
|
// Fetch all sources in a single batch (concurrency = source count).
|
|
502
|
-
if (
|
|
580
|
+
if (shouldFetchSources && out._sources.length > 0) {
|
|
503
581
|
process.stderr.write("PROGRESS:source-fetch:start\n");
|
|
504
582
|
const fetchedSources = await fetchMultipleSources(
|
|
505
583
|
out._sources,
|
|
@@ -512,29 +590,41 @@ async function main() {
|
|
|
512
590
|
process.stderr.write("PROGRESS:source-fetch:done\n");
|
|
513
591
|
}
|
|
514
592
|
|
|
515
|
-
//
|
|
516
|
-
|
|
593
|
+
// Optional engine-agnostic synthesis for multi-engine searches.
|
|
594
|
+
// Open the synthesizer tab HERE (after source fetch) instead of
|
|
595
|
+
// pre-opening before source fetch. Pre-opening was fragile: Chrome could
|
|
596
|
+
// be killed during visible recovery or idle-timeout between source fetch
|
|
597
|
+
// and synthesis, leaving a stale tab ID that causes "No target matching prefix".
|
|
598
|
+
if (shouldSynthesize) {
|
|
517
599
|
process.stderr.write("PROGRESS:synthesis:start\n");
|
|
518
600
|
process.stderr.write(
|
|
519
|
-
|
|
601
|
+
`[greedysearch] Synthesizing results with ${synthesizer}...\n`,
|
|
520
602
|
);
|
|
603
|
+
let synthesisTab = null;
|
|
521
604
|
try {
|
|
522
|
-
|
|
523
|
-
const synthesis = await
|
|
524
|
-
grounded:
|
|
525
|
-
tabPrefix:
|
|
605
|
+
synthesisTab = await openNewTab(getSynthesisStartUrl(synthesizer));
|
|
606
|
+
const synthesis = await synthesizeResults(query, out, {
|
|
607
|
+
grounded: groundedSynthesis,
|
|
608
|
+
tabPrefix: synthesisTab,
|
|
609
|
+
visible: process.env.GREEDY_SEARCH_VISIBLE === "1",
|
|
610
|
+
synthesizer,
|
|
526
611
|
});
|
|
527
612
|
out._synthesis = {
|
|
528
613
|
...synthesis,
|
|
529
614
|
synthesized: true,
|
|
530
615
|
};
|
|
531
|
-
await closeTab(geminiTab);
|
|
532
616
|
process.stderr.write("PROGRESS:synthesis:done\n");
|
|
533
617
|
} catch (e) {
|
|
534
618
|
process.stderr.write(
|
|
535
619
|
`[greedysearch] Synthesis failed: ${e.message}\n`,
|
|
536
620
|
);
|
|
537
|
-
out._synthesis = {
|
|
621
|
+
out._synthesis = {
|
|
622
|
+
error: e.message,
|
|
623
|
+
synthesized: false,
|
|
624
|
+
synthesizedBy: synthesizer,
|
|
625
|
+
};
|
|
626
|
+
} finally {
|
|
627
|
+
if (synthesisTab) await closeTab(synthesisTab);
|
|
538
628
|
}
|
|
539
629
|
}
|
|
540
630
|
|
|
@@ -544,12 +634,12 @@ async function main() {
|
|
|
544
634
|
out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
|
|
545
635
|
}
|
|
546
636
|
|
|
547
|
-
//
|
|
548
|
-
if (
|
|
637
|
+
// Include confidence metrics for grounded multi-engine searches.
|
|
638
|
+
if (!legacyFast) out._confidence = buildConfidence(out);
|
|
549
639
|
|
|
550
640
|
writeOutput(out, outFile, {
|
|
551
641
|
inline,
|
|
552
|
-
synthesize:
|
|
642
|
+
synthesize: shouldSynthesize,
|
|
553
643
|
query,
|
|
554
644
|
});
|
|
555
645
|
return;
|
|
@@ -585,13 +675,31 @@ async function main() {
|
|
|
585
675
|
? "bing"
|
|
586
676
|
: script.includes("perplexity")
|
|
587
677
|
? "perplexity"
|
|
588
|
-
:
|
|
678
|
+
: script.includes("chatgpt")
|
|
679
|
+
? "chatgpt"
|
|
680
|
+
: script.includes("semantic-scholar")
|
|
681
|
+
? "semantic-scholar"
|
|
682
|
+
: script.includes("logically")
|
|
683
|
+
? "logically"
|
|
684
|
+
: null;
|
|
589
685
|
const canRetryVisible =
|
|
590
686
|
recoveryEngine &&
|
|
591
687
|
process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
|
|
592
688
|
isHeadlessBlockedResult(e);
|
|
593
689
|
|
|
594
690
|
if (canRetryVisible) {
|
|
691
|
+
logVisibleRecovery({
|
|
692
|
+
scope: "single",
|
|
693
|
+
phase: "start",
|
|
694
|
+
engines: [recoveryEngine],
|
|
695
|
+
reasons: {
|
|
696
|
+
[recoveryEngine]: {
|
|
697
|
+
error: e.message || null,
|
|
698
|
+
envelope: e.envelope || null,
|
|
699
|
+
lastStage: e.lastStage || null,
|
|
700
|
+
},
|
|
701
|
+
},
|
|
702
|
+
});
|
|
595
703
|
process.stderr.write(
|
|
596
704
|
`[greedysearch] 🔓 ${recoveryEngine} blocked in headless — retrying visible to establish cookies...\n`,
|
|
597
705
|
);
|
|
@@ -612,12 +720,33 @@ async function main() {
|
|
|
612
720
|
null,
|
|
613
721
|
locale,
|
|
614
722
|
);
|
|
723
|
+
logVisibleRecovery({
|
|
724
|
+
scope: "single",
|
|
725
|
+
phase: "success",
|
|
726
|
+
engines: [recoveryEngine],
|
|
727
|
+
result: {
|
|
728
|
+
engine: recoveryEngine,
|
|
729
|
+
mode: result._envelope?.mode || null,
|
|
730
|
+
durationMs: result._envelope?.durationMs || null,
|
|
731
|
+
lastStage: result._envelope?.lastStage || null,
|
|
732
|
+
},
|
|
733
|
+
});
|
|
615
734
|
if (fetchSource && result.sources?.length > 0) {
|
|
616
735
|
result.topSource = await fetchTopSource(result.sources[0].url);
|
|
617
736
|
}
|
|
618
737
|
writeOutput(result, outFile, { inline, synthesize: false, query });
|
|
619
738
|
return;
|
|
620
739
|
} catch (retryErr) {
|
|
740
|
+
logVisibleRecovery({
|
|
741
|
+
scope: "single",
|
|
742
|
+
phase: "needs-human",
|
|
743
|
+
engines: [recoveryEngine],
|
|
744
|
+
result: {
|
|
745
|
+
engine: recoveryEngine,
|
|
746
|
+
error: retryErr.message || String(retryErr),
|
|
747
|
+
envelope: retryErr.envelope || null,
|
|
748
|
+
},
|
|
749
|
+
});
|
|
621
750
|
// Any visible retry failure: keep Chrome open so user can solve Turnstile.
|
|
622
751
|
// Once solved, cookies are stored in the shared profile for future headless runs.
|
|
623
752
|
keepVisibleForHuman = true;
|
package/extractors/bing-copilot.mjs
CHANGED
|
@@ -41,6 +41,24 @@ const GLOBAL_VAR = "__bingClipboard";
|
|
|
41
41
|
// Bing Copilot-specific helpers
|
|
42
42
|
// ============================================================================
|
|
43
43
|
|
|
44
|
+
async function detectSignInWall(tab) {
|
|
45
|
+
// Language-agnostic: if the chat input is absent but the page hosts
|
|
46
|
+
// known OAuth provider endpoints, we're on the Copilot login wall.
|
|
47
|
+
const code = `(() => {
|
|
48
|
+
if (document.querySelector('#userInput')) return false;
|
|
49
|
+
const links = Array.from(document.querySelectorAll('a[href], button'));
|
|
50
|
+
const hasOAuth = links.some(el => {
|
|
51
|
+
const h = (el.href || el.getAttribute('formaction') || '').toLowerCase();
|
|
52
|
+
return h.includes('login.microsoftonline.com')
|
|
53
|
+
|| h.includes('appleid.apple.com')
|
|
54
|
+
|| h.includes('accounts.google.com');
|
|
55
|
+
});
|
|
56
|
+
return hasOAuth;
|
|
57
|
+
})()`;
|
|
58
|
+
const result = await cdp(["eval", tab, code]).catch(() => "false");
|
|
59
|
+
return result === "true";
|
|
60
|
+
}
|
|
61
|
+
|
|
44
62
|
async function extractAnswer(tab, env, query = "") {
|
|
45
63
|
// In headless mode: snap the accessibility tree before spending ~18s on
|
|
46
64
|
// clipboard polls. Copilot loads its input fine in headless but renders
|
|
@@ -181,10 +199,15 @@ async function extractFromAccessibilityTree(tab, query = "") {
|
|
|
181
199
|
const snap = await cdp(["snap", tab]).catch(() => "");
|
|
182
200
|
if (!snap || (await detectVerificationChallenge(tab, cdp))) return "";
|
|
183
201
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
202
|
+
// Linear article extraction — no regex. Avoids the ReDoS-prone
|
|
203
|
+
// /^\s*\[article\]\s+(.+)$/i pattern (SonarCloud hotspot js:S5852).
|
|
204
|
+
const articleLines = [];
|
|
205
|
+
for (const line of snap.split("\n")) {
|
|
206
|
+
const trimmed = line.trimStart();
|
|
207
|
+
if (!trimmed.toLowerCase().startsWith("[article]")) continue;
|
|
208
|
+
const after = trimmed.slice("[article]".length).trimStart();
|
|
209
|
+
if (after) articleLines.push(after);
|
|
210
|
+
}
|
|
188
211
|
if (articleLines.length === 0) return "";
|
|
189
212
|
|
|
190
213
|
const answer = pickAnswerArticle(articleLines, query);
|
|
@@ -419,12 +442,27 @@ async function main() {
|
|
|
419
442
|
}
|
|
420
443
|
}
|
|
421
444
|
|
|
445
|
+
// Detect sign-in wall before burning time waiting for an input that
|
|
446
|
+
// will never appear. Copilot now gates the chat behind Microsoft/Apple/Google
|
|
447
|
+
// login on fresh sessions.
|
|
448
|
+
if (await detectSignInWall(tab)) {
|
|
449
|
+
throw new Error(
|
|
450
|
+
"Copilot requires sign-in — please sign in with Microsoft, Apple, or Google in the visible browser window. Once signed in, cookies persist for future runs.",
|
|
451
|
+
);
|
|
452
|
+
}
|
|
453
|
+
|
|
422
454
|
// Wait for React app to mount input (up to 15s, longer after verification)
|
|
423
455
|
const inputReady = await waitForSelector(tab, S.input, 15000, 500);
|
|
424
456
|
env.inputReady = inputReady;
|
|
425
457
|
await new Promise((r) => setTimeout(r, jitter(300)));
|
|
426
458
|
|
|
427
459
|
if (!inputReady) {
|
|
460
|
+
// If input still missing, double-check we didn't land on the login wall
|
|
461
|
+
if (await detectSignInWall(tab)) {
|
|
462
|
+
throw new Error(
|
|
463
|
+
"Copilot requires sign-in — please sign in with Microsoft, Apple, or Google in the visible browser window. Once signed in, cookies persist for future runs.",
|
|
464
|
+
);
|
|
465
|
+
}
|
|
428
466
|
throw new Error(
|
|
429
467
|
"Copilot input not found — verification may have failed or page is in unexpected state",
|
|
430
468
|
);
|