@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/src/search/sources.mjs
CHANGED
|
@@ -132,6 +132,16 @@ export function classifySourceType(domain, title = "", rawUrl = "") {
|
|
|
132
132
|
const lowerUrl = rawUrl.toLowerCase();
|
|
133
133
|
|
|
134
134
|
if (domain === "github.com" || domain === "gitlab.com") return "repo";
|
|
135
|
+
if (
|
|
136
|
+
domain === "arxiv.org" ||
|
|
137
|
+
domain === "doi.org" ||
|
|
138
|
+
domain === "semanticscholar.org" ||
|
|
139
|
+
domain.endsWith(".semanticscholar.org") ||
|
|
140
|
+
lowerUrl.includes("/paper/") ||
|
|
141
|
+
lowerUrl.includes("/pdf/")
|
|
142
|
+
) {
|
|
143
|
+
return "academic";
|
|
144
|
+
}
|
|
135
145
|
if (matchesDomain(domain, SOCIAL_HOSTS)) return "social";
|
|
136
146
|
if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
|
|
137
147
|
if (matchesDomain(domain, NEWS_HOSTS)) return "news";
|
|
@@ -160,6 +170,8 @@ export function sourceTypePriority(sourceType) {
|
|
|
160
170
|
return 5;
|
|
161
171
|
case "repo":
|
|
162
172
|
return 4;
|
|
173
|
+
case "academic":
|
|
174
|
+
return 4;
|
|
163
175
|
case "maintainer-blog":
|
|
164
176
|
return 3;
|
|
165
177
|
case "website":
|
|
@@ -335,7 +347,9 @@ export function domainMatches(hostname, candidate) {
|
|
|
335
347
|
|
|
336
348
|
export function buildSourceRegistry(out, query = "") {
|
|
337
349
|
const seen = new Map();
|
|
338
|
-
const engineOrder =
|
|
350
|
+
const engineOrder = Object.keys(out || {}).filter(
|
|
351
|
+
(key) => !key.startsWith("_"),
|
|
352
|
+
);
|
|
339
353
|
|
|
340
354
|
// Get preferred domains for this query
|
|
341
355
|
const preferredDomains = inferPreferredDomains(query);
|
|
@@ -379,11 +393,16 @@ export function buildSourceRegistry(out, query = "") {
|
|
|
379
393
|
// Penalize discussion/social sites for technical queries — high noise,
|
|
380
394
|
// hard to fetch cleanly, and rarely canonical. Q&A sites (StackOverflow,
|
|
381
395
|
// StackExchange) are excluded from the community penalty.
|
|
396
|
+
//
|
|
397
|
+
// Social penalty is now −20 (was −12). The original −12 wasn't enough
|
|
398
|
+
// to overcome the +10 preferred-domain boost + clean rank, so a single
|
|
399
|
+
// social citation could land as S1. The post-sort demotion below
|
|
400
|
+
// is the hard guardrail on top.
|
|
382
401
|
const queryTargetsSocialHost = preferredDomains.some((pd) =>
|
|
383
402
|
domainMatches(domain, pd),
|
|
384
403
|
);
|
|
385
404
|
if (sourceType === "social" && !queryTargetsSocialHost) {
|
|
386
|
-
smartScore -=
|
|
405
|
+
smartScore -= 20;
|
|
387
406
|
}
|
|
388
407
|
if (preferredDomains.length > 0) {
|
|
389
408
|
if (matchesDomain(domain, DISCUSSION_HOSTS)) {
|
|
@@ -432,27 +451,38 @@ export function buildSourceRegistry(out, query = "") {
|
|
|
432
451
|
}
|
|
433
452
|
}
|
|
434
453
|
|
|
435
|
-
const sources = Array.from(seen.values())
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
+
const sources = Array.from(seen.values()).map((source) => ({
|
|
455
|
+
...source,
|
|
456
|
+
engineCount: source.engines.length,
|
|
457
|
+
}));
|
|
458
|
+
|
|
459
|
+
// Social hard guardrail: when the query doesn't explicitly target a
|
|
460
|
+
// social host (rare — only happens for queries like "latest twitter
|
|
461
|
+
// announcement"), keep social sources OUT of the composite sort and
|
|
462
|
+
// pin them to the end of the registry. The smartScore −20 penalty
|
|
463
|
+
// above handles the "bare social gets a +10 boost" case, but a
|
|
464
|
+
// clean multi-engine social citation can still occasionally outscore
|
|
465
|
+
// a noisy single-engine academic source. This sort is the final say
|
|
466
|
+
// on what becomes S1, S2, etc.
|
|
467
|
+
const nonSocial = sources.filter((s) => s.sourceType !== "social");
|
|
468
|
+
const socialSources = sources.filter((s) => s.sourceType === "social");
|
|
469
|
+
nonSocial.sort((a, b) => {
|
|
470
|
+
const diff = computeCompositeScore(b) - computeCompositeScore(a);
|
|
471
|
+
if (diff !== 0) return diff;
|
|
472
|
+
return a.domain.localeCompare(b.domain);
|
|
473
|
+
});
|
|
474
|
+
socialSources.sort((a, b) => {
|
|
475
|
+
const diff = computeCompositeScore(b) - computeCompositeScore(a);
|
|
476
|
+
if (diff !== 0) return diff;
|
|
477
|
+
return a.domain.localeCompare(b.domain);
|
|
478
|
+
});
|
|
479
|
+
const ordered = [...nonSocial, ...socialSources];
|
|
454
480
|
|
|
455
|
-
return
|
|
481
|
+
return ordered.slice(0, 12).map((source, index) => ({
|
|
482
|
+
...source,
|
|
483
|
+
id: `S${index + 1}`,
|
|
484
|
+
title: source.title || source.domain || source.canonicalUrl,
|
|
485
|
+
}));
|
|
456
486
|
}
|
|
457
487
|
|
|
458
488
|
export function mergeFetchDataIntoSources(sources, fetchedSources) {
|
|
@@ -1,36 +1,81 @@
|
|
|
1
|
-
// src/search/synthesis-runner.mjs —
|
|
1
|
+
// src/search/synthesis-runner.mjs — Engine-agnostic synthesis via CDP extractors
|
|
2
2
|
//
|
|
3
|
-
//
|
|
3
|
+
// The all-search synthesis layer builds a neutral prompt and can route it to a
|
|
4
|
+
// configured browser engine. Gemini remains the default synthesizer; ChatGPT is
|
|
5
|
+
// supported for users who opt in via ~/.pi/greedyconfig or --synthesizer.
|
|
4
6
|
|
|
5
7
|
import { spawn } from "node:child_process";
|
|
6
8
|
import { join } from "node:path";
|
|
7
|
-
import { GREEDY_PROFILE_DIR } from "./constants.mjs";
|
|
9
|
+
import { GREEDY_PROFILE_DIR, SUPPORTED_SYNTHESIZERS } from "./constants.mjs";
|
|
8
10
|
import {
|
|
9
11
|
buildSynthesisPrompt,
|
|
10
12
|
normalizeSynthesisPayload,
|
|
11
13
|
parseStructuredJson,
|
|
12
14
|
} from "./synthesis.mjs";
|
|
15
|
+
import { buildSourceRegistry } from "./sources.mjs";
|
|
13
16
|
|
|
14
17
|
const __dir =
|
|
15
18
|
import.meta.dirname ||
|
|
16
19
|
new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
// Extractor script, resolved under the package's extractors/ directory, that
// drives each synthesizer. Frozen because this is a shared lookup table, not
// mutable state.
const SYNTHESIS_EXTRACTORS = Object.freeze({
  gemini: "gemini.mjs",
  chatgpt: "chatgpt.mjs",
});

// Start URL reported by getSynthesisStartUrl() for each synthesizer.
const SYNTHESIS_START_URLS = Object.freeze({
  gemini: "https://gemini.google.com/app",
  chatgpt: "https://chatgpt.com/",
});
|
|
30
|
+
|
|
31
|
+
/**
 * Canonicalize a synthesizer name.
 *
 * The value is stringified, stripped of surrounding whitespace and
 * lower-cased. Missing, falsy or blank input falls back to "gemini". The short
 * aliases "gem" and "gpt" expand to "gemini" and "chatgpt"; any other name is
 * returned in its normalized form so the caller can validate it.
 *
 * @param {string} [synthesizer="gemini"] - Name or alias, in any letter case.
 * @returns {string} The normalized synthesizer name.
 */
export function normalizeSynthesizer(synthesizer = "gemini") {
  // Trim before matching so values such as " GPT\n" (e.g. read from a config
  // file) still hit the aliases instead of being rejected as unsupported.
  const normalized =
    String(synthesizer || "")
      .trim()
      .toLowerCase() || "gemini";
  if (normalized === "gem") return "gemini";
  if (normalized === "gpt") return "chatgpt";
  return normalized;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Look up the start URL registered for a synthesizer.
 *
 * @param {string} [synthesizer="gemini"] - Name or alias; normalized via
 *   normalizeSynthesizer() before the lookup.
 * @returns {string} The registered URL, or "about:blank" when the name has no
 *   entry in SYNTHESIS_START_URLS.
 */
export function getSynthesisStartUrl(synthesizer = "gemini") {
  const key = normalizeSynthesizer(synthesizer);
  // Own-property check: a bare `table[key] || fallback` returns inherited
  // members (e.g. Object.prototype.constructor for the name "constructor")
  // instead of the fallback URL.
  const url = Object.hasOwn(SYNTHESIS_START_URLS, key)
    ? SYNTHESIS_START_URLS[key]
    : undefined;
  return url || "about:blank";
}
|
|
43
|
+
|
|
44
|
+
export async function runSynthesisPrompt(
|
|
45
|
+
synthesizer,
|
|
19
46
|
prompt,
|
|
20
|
-
{ tabPrefix = null, timeoutMs = 180000 } = {},
|
|
47
|
+
{ tabPrefix = null, timeoutMs = 180000, visible = null } = {},
|
|
21
48
|
) {
|
|
49
|
+
const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
|
|
50
|
+
const script = SYNTHESIS_EXTRACTORS[normalizedSynthesizer];
|
|
51
|
+
if (!script || !SUPPORTED_SYNTHESIZERS.includes(normalizedSynthesizer)) {
|
|
52
|
+
throw new Error(
|
|
53
|
+
`Unsupported synthesizer "${synthesizer}". Supported: ${SUPPORTED_SYNTHESIZERS.join(", ")}`,
|
|
54
|
+
);
|
|
55
|
+
}
|
|
56
|
+
|
|
22
57
|
return new Promise((resolve, reject) => {
|
|
23
58
|
const extraArgs = tabPrefix ? ["--tab", String(tabPrefix)] : [];
|
|
59
|
+
// Strip inherited visible-mode flags so a stale GREEDY_SEARCH_VISIBLE=1
|
|
60
|
+
// in the parent process doesn't force visible Chrome. Callers that
|
|
61
|
+
// genuinely want visible synthesis should pass visible: true explicitly.
|
|
62
|
+
const childEnv = {
|
|
63
|
+
...process.env,
|
|
64
|
+
CDP_PROFILE_DIR: GREEDY_PROFILE_DIR,
|
|
65
|
+
};
|
|
66
|
+
if (visible !== true) {
|
|
67
|
+
delete childEnv.GREEDY_SEARCH_VISIBLE;
|
|
68
|
+
delete childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE;
|
|
69
|
+
} else {
|
|
70
|
+
childEnv.GREEDY_SEARCH_VISIBLE = "1";
|
|
71
|
+
childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
72
|
+
}
|
|
24
73
|
const proc = spawn(
|
|
25
74
|
process.execPath,
|
|
26
|
-
[
|
|
27
|
-
join(__dir, "..", "..", "extractors", "gemini.mjs"),
|
|
28
|
-
"--stdin",
|
|
29
|
-
...extraArgs,
|
|
30
|
-
],
|
|
75
|
+
[join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
|
|
31
76
|
{
|
|
32
77
|
stdio: ["pipe", "pipe", "pipe"],
|
|
33
|
-
env:
|
|
78
|
+
env: childEnv,
|
|
34
79
|
},
|
|
35
80
|
);
|
|
36
81
|
// Pipe prompts via stdin to avoid leaking them in process tables.
|
|
@@ -42,40 +87,63 @@ export async function runGeminiPrompt(
|
|
|
42
87
|
proc.stderr.on("data", (d) => (err += d));
|
|
43
88
|
const t = setTimeout(() => {
|
|
44
89
|
proc.kill();
|
|
45
|
-
reject(
|
|
90
|
+
reject(
|
|
91
|
+
new Error(
|
|
92
|
+
`${normalizedSynthesizer} prompt timed out after ${timeoutMs / 1000}s`,
|
|
93
|
+
),
|
|
94
|
+
);
|
|
46
95
|
}, timeoutMs);
|
|
47
96
|
proc.on("close", (code) => {
|
|
48
97
|
clearTimeout(t);
|
|
49
98
|
if (code !== 0) {
|
|
50
|
-
reject(
|
|
99
|
+
reject(
|
|
100
|
+
new Error(err.trim() || `${normalizedSynthesizer} extractor failed`),
|
|
101
|
+
);
|
|
51
102
|
return;
|
|
52
103
|
}
|
|
53
104
|
try {
|
|
54
105
|
resolve(JSON.parse(out.trim()));
|
|
55
106
|
} catch {
|
|
56
|
-
reject(
|
|
107
|
+
reject(
|
|
108
|
+
new Error(
|
|
109
|
+
`bad JSON from ${normalizedSynthesizer}: ${out.slice(0, 100)}`,
|
|
110
|
+
),
|
|
111
|
+
);
|
|
57
112
|
}
|
|
58
113
|
});
|
|
59
114
|
});
|
|
60
115
|
}
|
|
61
116
|
|
|
62
|
-
|
|
117
|
+
/**
 * Backward-compatible Gemini helper used by research mode internals.
 *
 * Forwards to runSynthesisPrompt() with the synthesizer fixed to "gemini".
 *
 * @param {string} prompt - Prompt text passed through unchanged.
 * @param {object} [options={}] - Options passed through unchanged.
 * @returns {Promise<*>} Whatever runSynthesisPrompt() resolves with.
 */
export async function runGeminiPrompt(prompt, options = {}) {
  const geminiResult = await runSynthesisPrompt("gemini", prompt, options);
  return geminiResult;
}
|
|
121
|
+
|
|
122
|
+
export async function synthesizeResults(
|
|
63
123
|
query,
|
|
64
124
|
results,
|
|
65
|
-
{
|
|
125
|
+
{
|
|
126
|
+
grounded = false,
|
|
127
|
+
tabPrefix = null,
|
|
128
|
+
visible = null,
|
|
129
|
+
synthesizer = "gemini",
|
|
130
|
+
} = {},
|
|
66
131
|
) {
|
|
132
|
+
const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
|
|
67
133
|
const sources = Array.isArray(results._sources)
|
|
68
134
|
? results._sources
|
|
69
135
|
: buildSourceRegistry(results);
|
|
70
136
|
const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
|
|
71
137
|
|
|
72
|
-
const raw = await
|
|
138
|
+
const raw = await runSynthesisPrompt(normalizedSynthesizer, prompt, {
|
|
139
|
+
tabPrefix,
|
|
140
|
+
timeoutMs: 180000,
|
|
141
|
+
visible,
|
|
142
|
+
});
|
|
73
143
|
let structured = parseStructuredJson(raw.answer || "");
|
|
74
144
|
|
|
75
|
-
// Detect if
|
|
76
|
-
//
|
|
77
|
-
// echoes the prompt JSON. The engine summary JSON has per-engine keys
|
|
78
|
-
// (perplexity/bing/google) but no synthesis fields (answer/agreement).
|
|
145
|
+
// Detect if the synthesizer echoed back the engine summaries instead of a
|
|
146
|
+
// synthesis. This can happen when it can't synthesize and mirrors prompt JSON.
|
|
79
147
|
const SYNTHESIS_FIELDS = [
|
|
80
148
|
"answer",
|
|
81
149
|
"agreement",
|
|
@@ -86,17 +154,28 @@ export async function synthesizeWithGemini(
|
|
|
86
154
|
const hasSynthesisFields =
|
|
87
155
|
structured && SYNTHESIS_FIELDS.some((f) => f in structured);
|
|
88
156
|
const hasEngineKeys =
|
|
89
|
-
structured &&
|
|
157
|
+
structured &&
|
|
158
|
+
["perplexity", "bing", "google", "chatgpt", "gemini"].some(
|
|
159
|
+
(e) => e in structured,
|
|
160
|
+
);
|
|
90
161
|
if (hasEngineKeys && !hasSynthesisFields) {
|
|
91
|
-
structured = null; // Treat as parse failure —
|
|
162
|
+
structured = null; // Treat as parse failure — synthesizer echoed input
|
|
92
163
|
}
|
|
93
164
|
|
|
94
165
|
return {
|
|
95
166
|
...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
|
|
96
167
|
rawAnswer: raw.answer || "",
|
|
97
|
-
|
|
168
|
+
synthesizedBy: normalizedSynthesizer,
|
|
169
|
+
synthesizerSources: raw.sources || [],
|
|
170
|
+
// Backward-compatible field for existing consumers.
|
|
171
|
+
geminiSources: normalizedSynthesizer === "gemini" ? raw.sources || [] : [],
|
|
98
172
|
};
|
|
99
173
|
}
|
|
100
174
|
|
|
101
|
-
//
|
|
102
|
-
|
|
175
|
+
/**
 * Backward-compatible all-search synthesis helper.
 *
 * Delegates to synthesizeResults() with the synthesizer forced to "gemini".
 *
 * @param {*} query - Passed through to synthesizeResults().
 * @param {*} results - Passed through to synthesizeResults().
 * @param {object} [options={}] - Extra options; any `synthesizer` entry is
 *   overridden.
 * @returns {Promise<*>} Whatever synthesizeResults() resolves with.
 */
export async function synthesizeWithGemini(query, results, options = {}) {
  // Spread the caller's options first so the fixed synthesizer always wins.
  const geminiOptions = { ...options, synthesizer: "gemini" };
  return synthesizeResults(query, results, geminiOptions);
}
|