@apmantza/greedysearch-pi 1.9.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +110 -14
- package/README.md +86 -41
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +11 -0
- package/bin/search.mjs +886 -674
- package/extractors/bing-copilot.mjs +528 -374
- package/extractors/chatgpt.mjs +436 -0
- package/extractors/common.mjs +837 -645
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +421 -388
- package/extractors/gemini.mjs +335 -217
- package/extractors/logically.mjs +567 -0
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/index.ts +2 -1
- package/package.json +14 -6
- package/skills/greedy-search/skill.md +9 -12
- package/src/fetcher.mjs +8 -1
- package/src/formatters/results.ts +163 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +150 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/recovery.mjs +51 -45
- package/src/search/research.mjs +2579 -0
- package/src/search/sources.mjs +77 -25
- package/src/search/synthesis-runner.mjs +142 -57
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +189 -45
- package/src/tools/shared.ts +187 -186
- package/src/types.ts +110 -104
- package/test.mjs +1342 -534
package/src/search/sources.mjs
CHANGED
|
@@ -37,6 +37,16 @@ export const NEWS_HOSTS = [
|
|
|
37
37
|
"zdnet.com",
|
|
38
38
|
];
|
|
39
39
|
|
|
40
|
+
export const SOCIAL_HOSTS = [
|
|
41
|
+
"facebook.com",
|
|
42
|
+
"instagram.com",
|
|
43
|
+
"linkedin.com",
|
|
44
|
+
"pinterest.com",
|
|
45
|
+
"tiktok.com",
|
|
46
|
+
"twitter.com",
|
|
47
|
+
"x.com",
|
|
48
|
+
];
|
|
49
|
+
|
|
40
50
|
export function trimText(text = "", maxChars = 240) {
|
|
41
51
|
const clean = String(text).replaceAll(/\s+/g, " ").trim();
|
|
42
52
|
if (clean.length <= maxChars) return clean;
|
|
@@ -122,6 +132,17 @@ export function classifySourceType(domain, title = "", rawUrl = "") {
|
|
|
122
132
|
const lowerUrl = rawUrl.toLowerCase();
|
|
123
133
|
|
|
124
134
|
if (domain === "github.com" || domain === "gitlab.com") return "repo";
|
|
135
|
+
if (
|
|
136
|
+
domain === "arxiv.org" ||
|
|
137
|
+
domain === "doi.org" ||
|
|
138
|
+
domain === "semanticscholar.org" ||
|
|
139
|
+
domain.endsWith(".semanticscholar.org") ||
|
|
140
|
+
lowerUrl.includes("/paper/") ||
|
|
141
|
+
lowerUrl.includes("/pdf/")
|
|
142
|
+
) {
|
|
143
|
+
return "academic";
|
|
144
|
+
}
|
|
145
|
+
if (matchesDomain(domain, SOCIAL_HOSTS)) return "social";
|
|
125
146
|
if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
|
|
126
147
|
if (matchesDomain(domain, NEWS_HOSTS)) return "news";
|
|
127
148
|
if (
|
|
@@ -149,6 +170,8 @@ export function sourceTypePriority(sourceType) {
|
|
|
149
170
|
return 5;
|
|
150
171
|
case "repo":
|
|
151
172
|
return 4;
|
|
173
|
+
case "academic":
|
|
174
|
+
return 4;
|
|
152
175
|
case "maintainer-blog":
|
|
153
176
|
return 3;
|
|
154
177
|
case "website":
|
|
@@ -157,6 +180,8 @@ export function sourceTypePriority(sourceType) {
|
|
|
157
180
|
return 1;
|
|
158
181
|
case "news":
|
|
159
182
|
return 0;
|
|
183
|
+
case "social":
|
|
184
|
+
return -6;
|
|
160
185
|
default:
|
|
161
186
|
return 0;
|
|
162
187
|
}
|
|
@@ -308,6 +333,10 @@ export function inferPreferredDomains(query) {
|
|
|
308
333
|
if (normalized.includes("gemini") || normalized.includes("google ai")) {
|
|
309
334
|
matches.push("ai.google.dev", "developers.google.com");
|
|
310
335
|
}
|
|
336
|
+
for (const socialHost of SOCIAL_HOSTS) {
|
|
337
|
+
const bareName = socialHost.replace(/\.com$/, "");
|
|
338
|
+
if (normalized.includes(bareName)) matches.push(socialHost);
|
|
339
|
+
}
|
|
311
340
|
|
|
312
341
|
return [...new Set(matches)];
|
|
313
342
|
}
|
|
@@ -318,7 +347,9 @@ export function domainMatches(hostname, candidate) {
|
|
|
318
347
|
|
|
319
348
|
export function buildSourceRegistry(out, query = "") {
|
|
320
349
|
const seen = new Map();
|
|
321
|
-
const engineOrder =
|
|
350
|
+
const engineOrder = Object.keys(out || {}).filter(
|
|
351
|
+
(key) => !key.startsWith("_"),
|
|
352
|
+
);
|
|
322
353
|
|
|
323
354
|
// Get preferred domains for this query
|
|
324
355
|
const preferredDomains = inferPreferredDomains(query);
|
|
@@ -359,10 +390,20 @@ export function buildSourceRegistry(out, query = "") {
|
|
|
359
390
|
smartScore += 2;
|
|
360
391
|
}
|
|
361
392
|
|
|
362
|
-
// Penalize discussion
|
|
363
|
-
//
|
|
364
|
-
//
|
|
365
|
-
//
|
|
393
|
+
// Penalize discussion/social sites for technical queries — high noise,
|
|
394
|
+
// hard to fetch cleanly, and rarely canonical. Q&A sites (StackOverflow,
|
|
395
|
+
// StackExchange) are excluded from the community penalty.
|
|
396
|
+
//
|
|
397
|
+
// Social penalty is now −20 (was −12). The original −12 wasn't enough
|
|
398
|
+
// to overcome the +10 preferred-domain boost + clean rank, so a single
|
|
399
|
+
// social citation could land as S1. The post-sort demotion below
|
|
400
|
+
// is the hard guardrail on top.
|
|
401
|
+
const queryTargetsSocialHost = preferredDomains.some((pd) =>
|
|
402
|
+
domainMatches(domain, pd),
|
|
403
|
+
);
|
|
404
|
+
if (sourceType === "social" && !queryTargetsSocialHost) {
|
|
405
|
+
smartScore -= 20;
|
|
406
|
+
}
|
|
366
407
|
if (preferredDomains.length > 0) {
|
|
367
408
|
if (matchesDomain(domain, DISCUSSION_HOSTS)) {
|
|
368
409
|
smartScore -= 3;
|
|
@@ -410,27 +451,38 @@ export function buildSourceRegistry(out, query = "") {
|
|
|
410
451
|
}
|
|
411
452
|
}
|
|
412
453
|
|
|
413
|
-
const sources = Array.from(seen.values())
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
454
|
+
const sources = Array.from(seen.values()).map((source) => ({
|
|
455
|
+
...source,
|
|
456
|
+
engineCount: source.engines.length,
|
|
457
|
+
}));
|
|
458
|
+
|
|
459
|
+
// Social hard guardrail: when the query doesn't explicitly target a
|
|
460
|
+
// social host (rare — only happens for queries like "latest twitter
|
|
461
|
+
// announcement"), keep social sources OUT of the composite sort and
|
|
462
|
+
// pin them to the end of the registry. The smartScore −20 penalty
|
|
463
|
+
// above handles the "bare social gets a +10 boost" case, but a
|
|
464
|
+
// clean multi-engine social citation can still occasionally outscore
|
|
465
|
+
// a noisy single-engine academic source. This sort is the final say
|
|
466
|
+
// on what becomes S1, S2, etc.
|
|
467
|
+
const nonSocial = sources.filter((s) => s.sourceType !== "social");
|
|
468
|
+
const socialSources = sources.filter((s) => s.sourceType === "social");
|
|
469
|
+
nonSocial.sort((a, b) => {
|
|
470
|
+
const diff = computeCompositeScore(b) - computeCompositeScore(a);
|
|
471
|
+
if (diff !== 0) return diff;
|
|
472
|
+
return a.domain.localeCompare(b.domain);
|
|
473
|
+
});
|
|
474
|
+
socialSources.sort((a, b) => {
|
|
475
|
+
const diff = computeCompositeScore(b) - computeCompositeScore(a);
|
|
476
|
+
if (diff !== 0) return diff;
|
|
477
|
+
return a.domain.localeCompare(b.domain);
|
|
478
|
+
});
|
|
479
|
+
const ordered = [...nonSocial, ...socialSources];
|
|
432
480
|
|
|
433
|
-
return
|
|
481
|
+
return ordered.slice(0, 12).map((source, index) => ({
|
|
482
|
+
...source,
|
|
483
|
+
id: `S${index + 1}`,
|
|
484
|
+
title: source.title || source.domain || source.canonicalUrl,
|
|
485
|
+
}));
|
|
434
486
|
}
|
|
435
487
|
|
|
436
488
|
export function mergeFetchDataIntoSources(sources, fetchedSources) {
|
|
@@ -1,45 +1,84 @@
|
|
|
1
|
-
// src/search/synthesis-runner.mjs —
|
|
1
|
+
// src/search/synthesis-runner.mjs — Engine-agnostic synthesis via CDP extractors
|
|
2
2
|
//
|
|
3
|
-
//
|
|
3
|
+
// The all-search synthesis layer builds a neutral prompt and can route it to a
|
|
4
|
+
// configured browser engine. Gemini remains the default synthesizer; ChatGPT is
|
|
5
|
+
// supported for users who opt in via ~/.pi/greedyconfig or --synthesizer.
|
|
4
6
|
|
|
5
7
|
import { spawn } from "node:child_process";
|
|
6
8
|
import { join } from "node:path";
|
|
7
|
-
import { GREEDY_PROFILE_DIR } from "./constants.mjs";
|
|
9
|
+
import { GREEDY_PROFILE_DIR, SUPPORTED_SYNTHESIZERS } from "./constants.mjs";
|
|
8
10
|
import {
|
|
9
11
|
buildSynthesisPrompt,
|
|
10
12
|
normalizeSynthesisPayload,
|
|
11
13
|
parseStructuredJson,
|
|
12
14
|
} from "./synthesis.mjs";
|
|
15
|
+
import { buildSourceRegistry } from "./sources.mjs";
|
|
13
16
|
|
|
14
17
|
const __dir =
|
|
15
18
|
import.meta.dirname ||
|
|
16
19
|
new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
|
|
17
20
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
21
|
+
const SYNTHESIS_EXTRACTORS = {
|
|
22
|
+
gemini: "gemini.mjs",
|
|
23
|
+
chatgpt: "chatgpt.mjs",
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
const SYNTHESIS_START_URLS = {
|
|
27
|
+
gemini: "https://gemini.google.com/app",
|
|
28
|
+
chatgpt: "https://chatgpt.com/",
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
export function normalizeSynthesizer(synthesizer = "gemini") {
|
|
32
|
+
const normalized = String(synthesizer || "gemini").toLowerCase();
|
|
33
|
+
if (normalized === "gem") return "gemini";
|
|
34
|
+
if (normalized === "gpt") return "chatgpt";
|
|
35
|
+
return normalized;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
export function getSynthesisStartUrl(synthesizer = "gemini") {
|
|
39
|
+
return (
|
|
40
|
+
SYNTHESIS_START_URLS[normalizeSynthesizer(synthesizer)] || "about:blank"
|
|
41
|
+
);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export async function runSynthesisPrompt(
|
|
45
|
+
synthesizer,
|
|
46
|
+
prompt,
|
|
47
|
+
{ tabPrefix = null, timeoutMs = 180000, visible = null } = {},
|
|
22
48
|
) {
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
49
|
+
const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
|
|
50
|
+
const script = SYNTHESIS_EXTRACTORS[normalizedSynthesizer];
|
|
51
|
+
if (!script || !SUPPORTED_SYNTHESIZERS.includes(normalizedSynthesizer)) {
|
|
52
|
+
throw new Error(
|
|
53
|
+
`Unsupported synthesizer "${synthesizer}". Supported: ${SUPPORTED_SYNTHESIZERS.join(", ")}`,
|
|
54
|
+
);
|
|
55
|
+
}
|
|
27
56
|
|
|
28
57
|
return new Promise((resolve, reject) => {
|
|
29
58
|
const extraArgs = tabPrefix ? ["--tab", String(tabPrefix)] : [];
|
|
59
|
+
// Strip inherited visible-mode flags so a stale GREEDY_SEARCH_VISIBLE=1
|
|
60
|
+
// in the parent process doesn't force visible Chrome. Callers that
|
|
61
|
+
// genuinely want visible synthesis should pass visible: true explicitly.
|
|
62
|
+
const childEnv = {
|
|
63
|
+
...process.env,
|
|
64
|
+
CDP_PROFILE_DIR: GREEDY_PROFILE_DIR,
|
|
65
|
+
};
|
|
66
|
+
if (visible !== true) {
|
|
67
|
+
delete childEnv.GREEDY_SEARCH_VISIBLE;
|
|
68
|
+
delete childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE;
|
|
69
|
+
} else {
|
|
70
|
+
childEnv.GREEDY_SEARCH_VISIBLE = "1";
|
|
71
|
+
childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
72
|
+
}
|
|
30
73
|
const proc = spawn(
|
|
31
74
|
process.execPath,
|
|
32
|
-
[
|
|
33
|
-
join(__dir, "..", "..", "extractors", "gemini.mjs"),
|
|
34
|
-
"--stdin",
|
|
35
|
-
...extraArgs,
|
|
36
|
-
],
|
|
75
|
+
[join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
|
|
37
76
|
{
|
|
38
77
|
stdio: ["pipe", "pipe", "pipe"],
|
|
39
|
-
env:
|
|
78
|
+
env: childEnv,
|
|
40
79
|
},
|
|
41
80
|
);
|
|
42
|
-
// Pipe
|
|
81
|
+
// Pipe prompts via stdin to avoid leaking them in process tables.
|
|
43
82
|
proc.stdin.write(prompt);
|
|
44
83
|
proc.stdin.end();
|
|
45
84
|
let out = "";
|
|
@@ -48,49 +87,95 @@ export async function synthesizeWithGemini(
|
|
|
48
87
|
proc.stderr.on("data", (d) => (err += d));
|
|
49
88
|
const t = setTimeout(() => {
|
|
50
89
|
proc.kill();
|
|
51
|
-
reject(
|
|
52
|
-
|
|
90
|
+
reject(
|
|
91
|
+
new Error(
|
|
92
|
+
`${normalizedSynthesizer} prompt timed out after ${timeoutMs / 1000}s`,
|
|
93
|
+
),
|
|
94
|
+
);
|
|
95
|
+
}, timeoutMs);
|
|
53
96
|
proc.on("close", (code) => {
|
|
54
97
|
clearTimeout(t);
|
|
55
|
-
if (code !== 0)
|
|
56
|
-
reject(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
"claims",
|
|
70
|
-
"differences",
|
|
71
|
-
"caveats",
|
|
72
|
-
];
|
|
73
|
-
const hasSynthesisFields =
|
|
74
|
-
structured && SYNTHESIS_FIELDS.some((f) => f in structured);
|
|
75
|
-
const hasEngineKeys =
|
|
76
|
-
structured &&
|
|
77
|
-
["perplexity", "bing", "google"].some((e) => e in structured);
|
|
78
|
-
if (hasEngineKeys && !hasSynthesisFields) {
|
|
79
|
-
structured = null; // Treat as parse failure — Gemini echoed input
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
resolve({
|
|
83
|
-
...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
|
|
84
|
-
rawAnswer: raw.answer || "",
|
|
85
|
-
geminiSources: raw.sources || [],
|
|
86
|
-
});
|
|
87
|
-
} catch {
|
|
88
|
-
reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
|
|
89
|
-
}
|
|
98
|
+
if (code !== 0) {
|
|
99
|
+
reject(
|
|
100
|
+
new Error(err.trim() || `${normalizedSynthesizer} extractor failed`),
|
|
101
|
+
);
|
|
102
|
+
return;
|
|
103
|
+
}
|
|
104
|
+
try {
|
|
105
|
+
resolve(JSON.parse(out.trim()));
|
|
106
|
+
} catch {
|
|
107
|
+
reject(
|
|
108
|
+
new Error(
|
|
109
|
+
`bad JSON from ${normalizedSynthesizer}: ${out.slice(0, 100)}`,
|
|
110
|
+
),
|
|
111
|
+
);
|
|
90
112
|
}
|
|
91
113
|
});
|
|
92
114
|
});
|
|
93
115
|
}
|
|
94
116
|
|
|
95
|
-
//
|
|
96
|
-
|
|
117
|
+
// Backward-compatible Gemini helper used by research mode internals.
|
|
118
|
+
export async function runGeminiPrompt(prompt, options = {}) {
|
|
119
|
+
return runSynthesisPrompt("gemini", prompt, options);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
export async function synthesizeResults(
|
|
123
|
+
query,
|
|
124
|
+
results,
|
|
125
|
+
{
|
|
126
|
+
grounded = false,
|
|
127
|
+
tabPrefix = null,
|
|
128
|
+
visible = null,
|
|
129
|
+
synthesizer = "gemini",
|
|
130
|
+
} = {},
|
|
131
|
+
) {
|
|
132
|
+
const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
|
|
133
|
+
const sources = Array.isArray(results._sources)
|
|
134
|
+
? results._sources
|
|
135
|
+
: buildSourceRegistry(results);
|
|
136
|
+
const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
|
|
137
|
+
|
|
138
|
+
const raw = await runSynthesisPrompt(normalizedSynthesizer, prompt, {
|
|
139
|
+
tabPrefix,
|
|
140
|
+
timeoutMs: 180000,
|
|
141
|
+
visible,
|
|
142
|
+
});
|
|
143
|
+
let structured = parseStructuredJson(raw.answer || "");
|
|
144
|
+
|
|
145
|
+
// Detect if the synthesizer echoed back the engine summaries instead of a
|
|
146
|
+
// synthesis. This can happen when it can't synthesize and mirrors prompt JSON.
|
|
147
|
+
const SYNTHESIS_FIELDS = [
|
|
148
|
+
"answer",
|
|
149
|
+
"agreement",
|
|
150
|
+
"claims",
|
|
151
|
+
"differences",
|
|
152
|
+
"caveats",
|
|
153
|
+
];
|
|
154
|
+
const hasSynthesisFields =
|
|
155
|
+
structured && SYNTHESIS_FIELDS.some((f) => f in structured);
|
|
156
|
+
const hasEngineKeys =
|
|
157
|
+
structured &&
|
|
158
|
+
["perplexity", "bing", "google", "chatgpt", "gemini"].some(
|
|
159
|
+
(e) => e in structured,
|
|
160
|
+
);
|
|
161
|
+
if (hasEngineKeys && !hasSynthesisFields) {
|
|
162
|
+
structured = null; // Treat as parse failure — synthesizer echoed input
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
return {
|
|
166
|
+
...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
|
|
167
|
+
rawAnswer: raw.answer || "",
|
|
168
|
+
synthesizedBy: normalizedSynthesizer,
|
|
169
|
+
synthesizerSources: raw.sources || [],
|
|
170
|
+
// Backward-compatible field for existing consumers.
|
|
171
|
+
geminiSources: normalizedSynthesizer === "gemini" ? raw.sources || [] : [],
|
|
172
|
+
};
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
// Backward-compatible all-search synthesis helper.
|
|
176
|
+
export async function synthesizeWithGemini(query, results, options = {}) {
|
|
177
|
+
return synthesizeResults(query, results, {
|
|
178
|
+
...options,
|
|
179
|
+
synthesizer: "gemini",
|
|
180
|
+
});
|
|
181
|
+
}
|