@apmantza/greedysearch-pi 1.9.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,246 +1,286 @@
1
- // src/search/synthesis.mjs — Synthesis prompt building, structured JSON parsing,
2
- // confidence metrics, and payload normalization
3
- //
4
- // Extracted from search.mjs to reduce file complexity.
5
-
6
- import { ALL_ENGINES } from "./constants.mjs";
7
- import { trimText } from "./sources.mjs";
8
-
9
- export function parseStructuredJson(text) {
10
- if (!text) return null;
11
- let trimmed = String(text).trim();
12
-
13
- // Look for BEGIN_JSON/END_JSON markers first
14
- const beginIdx = trimmed.indexOf("BEGIN_JSON");
15
- const endIdx = trimmed.indexOf("END_JSON");
16
- if (beginIdx !== -1 && endIdx !== -1 && beginIdx < endIdx) {
17
- trimmed = trimmed.slice(beginIdx + "BEGIN_JSON".length, endIdx).trim();
18
- } else {
19
- // Strip out common LLM preamble text before the actual JSON
20
- const jsonStart = trimmed.indexOf("{");
21
- if (jsonStart > 0) {
22
- trimmed = trimmed.slice(jsonStart);
23
- }
24
- }
25
-
26
- const candidates = [
27
- trimmed,
28
- trimmed
29
- .replace(/^```json\s*/i, "")
30
- .replace(/^```\s*/i, "")
31
- .replace(/```$/i, "")
32
- .trim(),
33
- ];
34
-
35
- // Find the outermost JSON object via brace matching (avoids ReDoS-prone .* patterns)
36
- const firstBrace = trimmed.indexOf("{");
37
- const lastBrace = trimmed.lastIndexOf("}");
38
- if (firstBrace !== -1 && lastBrace !== -1 && firstBrace < lastBrace) {
39
- candidates.push(trimmed.slice(firstBrace, lastBrace + 1));
40
- }
41
-
42
- for (const candidate of candidates) {
43
- try {
44
- return JSON.parse(candidate);
45
- } catch {
46
- // try next candidate
47
- }
48
- }
49
- return null;
50
- }
51
-
52
- export function normalizeSynthesisPayload(
53
- payload,
54
- sources,
55
- fallbackAnswer = "",
56
- ) {
57
- const sourceIds = new Set(sources.map((source) => source.id));
58
- const agreementLevel = [
59
- "high",
60
- "medium",
61
- "low",
62
- "mixed",
63
- "conflicting",
64
- ].includes(payload?.agreement?.level)
65
- ? payload.agreement.level
66
- : "mixed";
67
- const claims = Array.isArray(payload?.claims)
68
- ? payload.claims
69
- .map((claim) => ({
70
- claim: trimText(claim?.claim || "", 260),
71
- support: ["strong", "moderate", "weak", "conflicting"].includes(
72
- claim?.support,
73
- )
74
- ? claim.support
75
- : "moderate",
76
- sourceIds: Array.isArray(claim?.sourceIds)
77
- ? claim.sourceIds.filter((id) => sourceIds.has(id))
78
- : [],
79
- }))
80
- .filter((claim) => claim.claim)
81
- : [];
82
- const recommendedSources = Array.isArray(payload?.recommendedSources)
83
- ? payload.recommendedSources.filter((id) => sourceIds.has(id)).slice(0, 6)
84
- : [];
85
-
86
- // Clean up fallback answer if it contains preamble text
87
- // Use indexOf/lastIndexOf instead of [\s\S]* patterns to avoid ReDoS
88
- let cleanFallback = "";
89
- if (fallbackAnswer) {
90
- const firstBrace = fallbackAnswer.indexOf("{");
91
- const lastBrace = fallbackAnswer.lastIndexOf("}");
92
- if (firstBrace !== -1 && lastBrace !== -1 && firstBrace < lastBrace) {
93
- cleanFallback = fallbackAnswer.slice(firstBrace, lastBrace + 1);
94
- } else {
95
- cleanFallback = fallbackAnswer;
96
- }
97
- }
98
-
99
- return {
100
- answer: trimText(payload?.answer || cleanFallback || fallbackAnswer, 4000),
101
- agreement: {
102
- level: agreementLevel,
103
- summary: trimText(payload?.agreement?.summary || "", 280),
104
- },
105
- differences: Array.isArray(payload?.differences)
106
- ? payload.differences
107
- .map((item) => trimText(item, 220))
108
- .filter(Boolean)
109
- .slice(0, 5)
110
- : [],
111
- caveats: Array.isArray(payload?.caveats)
112
- ? payload.caveats
113
- .map((item) => trimText(item, 220))
114
- .filter(Boolean)
115
- .slice(0, 5)
116
- : [],
117
- claims,
118
- recommendedSources,
119
- };
120
- }
121
-
122
- export function buildSynthesisPrompt(
123
- query,
124
- results,
125
- sources,
126
- { grounded = false } = {},
127
- ) {
128
- const engineSummaries = {};
129
- for (const engine of ["perplexity", "bing", "google"]) {
130
- const result = results[engine];
131
- if (!result) continue;
132
- if (result.error) {
133
- engineSummaries[engine] = {
134
- status: "error",
135
- error: String(result.error),
136
- };
137
- continue;
138
- }
139
-
140
- engineSummaries[engine] = {
141
- status: "ok",
142
- answer: trimText(result.answer || "", grounded ? 4500 : 2200),
143
- sourceIds: sources
144
- .filter((source) => source.engines.includes(engine))
145
- .sort(
146
- (a, b) =>
147
- (a.perEngine[engine]?.rank || 99) -
148
- (b.perEngine[engine]?.rank || 99),
149
- )
150
- .map((source) => source.id)
151
- .slice(0, 6),
152
- };
153
- }
154
-
155
- // Snippet budget: always include content for fetched sources so Gemini can
156
- // make citation decisions based on what the sources actually say, not just
157
- // their metadata. Grounded mode gets a larger budget per source.
158
- const snippetChars = grounded ? 700 : 300;
159
- const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
160
- id: source.id,
161
- title: source.title,
162
- domain: source.domain,
163
- canonicalUrl: source.canonicalUrl,
164
- sourceType: source.sourceType,
165
- isOfficial: source.isOfficial,
166
- engines: source.engines,
167
- engineCount: source.engineCount,
168
- fetch: source.fetch?.attempted
169
- ? {
170
- ok: source.fetch.ok,
171
- publishedTime: source.fetch.publishedTime || "",
172
- byline: source.fetch.byline || "",
173
- snippet: trimText(source.fetch.snippet || "", snippetChars),
174
- }
175
- : undefined,
176
- }));
177
-
178
- return [
179
- "You are a research synthesizer. Combine these search engine results into a single authoritative answer.",
180
- "",
181
- `Query: ${query}`,
182
- "",
183
- `Engine summaries:\n${JSON.stringify(engineSummaries, null, 2)}`,
184
- "",
185
- `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
186
- "",
187
- "Instructions:",
188
- "- Write a clear, direct answer in markdown (use headers/bullets where they help readability)",
189
- "- Cite sources inline as [S1], [S2] etc. when making specific claims",
190
- "- Prefer sources with content (fetch.ok=true and non-empty snippet) for citations",
191
- "- Note where the engines agree or meaningfully disagree",
192
- "- List any important caveats or limitations",
193
- "- recommendedSources: the 2-4 source IDs most worth reading for this query",
194
- "",
195
- "Respond ONLY with a JSON object wrapped in BEGIN_JSON / END_JSON markers:",
196
- "",
197
- "BEGIN_JSON",
198
- JSON.stringify({
199
- answer: "<your markdown answer here>",
200
- agreement: { level: "high|medium|mixed|conflicting", summary: "<one sentence>" },
201
- differences: ["<notable difference between engines, if any>"],
202
- caveats: ["<important caveat or limitation>"],
203
- recommendedSources: ["S1", "S2"],
204
- }, null, 2),
205
- "END_JSON",
206
- ].join("\n");
207
- }
208
-
209
- export function buildConfidence(out) {
210
- const sources = Array.isArray(out._sources) ? out._sources : [];
211
- const topConsensus = sources.length > 0 ? sources[0]?.engineCount || 0 : 0;
212
- const officialSourceCount = sources.filter(
213
- (source) => source.isOfficial,
214
- ).length;
215
- const firstPartySourceCount = sources.filter(
216
- (source) => source.isOfficial || source.sourceType === "maintainer-blog",
217
- ).length;
218
- const fetchedAttempted = sources.filter(
219
- (source) => source.fetch?.attempted,
220
- ).length;
221
- const fetchedSucceeded = sources.filter((source) => source.fetch?.ok).length;
222
- const sourceTypeBreakdown = sources.reduce((acc, source) => {
223
- acc[source.sourceType] = (acc[source.sourceType] || 0) + 1;
224
- return acc;
225
- }, {});
226
- const synthesisLevel = out._synthesis?.agreement?.level;
227
-
228
- return {
229
- sourcesCount: sources.length,
230
- topSourceConsensus: topConsensus,
231
- agreementLevel:
232
- synthesisLevel ||
233
- (topConsensus >= 3 ? "high" : topConsensus >= 2 ? "medium" : "low"),
234
- enginesResponded: ALL_ENGINES.filter(
235
- (engine) => out[engine]?.answer && !out[engine]?.error,
236
- ),
237
- enginesFailed: ALL_ENGINES.filter((engine) => out[engine]?.error),
238
- officialSourceCount,
239
- firstPartySourceCount,
240
- fetchedSourceSuccessRate:
241
- fetchedAttempted > 0
242
- ? Number((fetchedSucceeded / fetchedAttempted).toFixed(2))
243
- : 0,
244
- sourceTypeBreakdown,
245
- };
246
- }
1
+ // src/search/synthesis.mjs — Synthesis prompt building, structured JSON parsing,
2
+ // confidence metrics, and payload normalization
3
+ //
4
+ // Extracted from search.mjs to reduce file complexity.
5
+
6
+ import { ALL_ENGINES } from "./constants.mjs";
7
+ import { trimText } from "./sources.mjs";
8
+
9
/**
 * Escapes raw control characters found inside JSON string literals so that
 * otherwise well-formed JSON-like text can be handed to `JSON.parse`.
 *
 * The text is scanned once while tracking whether the cursor is inside a
 * double-quoted string. Characters outside strings (including structural
 * whitespace) and characters that directly follow a backslash are copied
 * through unchanged.
 *
 * @param {unknown} text - Value to repair; coerced with `String()`.
 * @returns {string} The text with every raw U+0000–U+001F character that sits
 *   inside a string literal replaced by its JSON escape sequence.
 */
function escapeControlCharsInsideJsonStrings(text) {
  // Two-character escapes defined by the JSON grammar; every other control
  // character is written as \u00XX.
  const shortEscapes = {
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
    "\b": "\\b",
    "\f": "\\f",
  };

  let out = "";
  let inString = false;
  let escaped = false;
  for (const char of String(text)) {
    if (escaped) {
      // Character after a backslash belongs to an existing escape sequence.
      out += char;
      escaped = false;
      continue;
    }
    if (char === "\\") {
      out += char;
      escaped = true;
      continue;
    }
    if (char === '"') {
      inString = !inString;
      out += char;
      continue;
    }
    const code = char.charCodeAt(0);
    if (inString && code < 0x20) {
      // JSON.parse rejects any raw control character inside a string.
      out +=
        shortEscapes[char] ?? `\\u${code.toString(16).padStart(4, "0")}`;
    } else {
      out += char;
    }
  }
  return out;
}
36
+
37
/**
 * Extracts and parses a JSON value from free-form model output.
 *
 * Extraction order:
 *  1. The text between a BEGIN_JSON marker and the END_JSON marker that
 *     follows it. When the markers occur more than once (for example because
 *     the model echoed the instruction sentence before answering), the last
 *     BEGIN_JSON that has a closing END_JSON is used.
 *  2. Otherwise, everything from the first "{" onwards.
 *
 * The extracted text is then tried as-is, with Markdown code fences removed,
 * and as the span between its first "{" and last "}". If none of those parse,
 * the same candidates are retried after escaping raw control characters
 * inside string literals.
 *
 * @param {unknown} text - Raw model output; coerced with `String()`.
 * @returns {any} The first successfully parsed value, or `null` when the
 *   input is falsy or no candidate parses.
 */
export function parseStructuredJson(text) {
  if (!text) return null;

  const BEGIN_MARKER = "BEGIN_JSON";
  const END_MARKER = "END_JSON";
  let trimmed = String(text).trim();

  // Anchor on the last END marker, step back to the BEGIN marker preceding it,
  // then take the first END marker after that BEGIN. This skips marker names
  // that were merely mentioned in preamble text.
  const lastEndIdx = trimmed.lastIndexOf(END_MARKER);
  const beginIdx =
    lastEndIdx === -1 ? -1 : trimmed.lastIndexOf(BEGIN_MARKER, lastEndIdx);
  const endIdx = beginIdx === -1 ? -1 : trimmed.indexOf(END_MARKER, beginIdx);

  if (beginIdx !== -1 && endIdx !== -1) {
    trimmed = trimmed.slice(beginIdx + BEGIN_MARKER.length, endIdx).trim();
  } else {
    // Strip out common LLM preamble text before the actual JSON
    const jsonStart = trimmed.indexOf("{");
    if (jsonStart > 0) {
      trimmed = trimmed.slice(jsonStart);
    }
  }

  const candidates = [
    trimmed,
    trimmed
      .replace(/^```json\s*/i, "")
      .replace(/^```\s*/i, "")
      .replace(/```$/i, "")
      .trim(),
  ];

  // Find the outermost JSON object via brace matching (avoids ReDoS-prone .* patterns)
  const firstBrace = trimmed.indexOf("{");
  const lastBrace = trimmed.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace !== -1 && firstBrace < lastBrace) {
    candidates.push(trimmed.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // try next candidate
    }
  }

  // Only pay for the repair scan once every untouched candidate has failed;
  // the order in which candidates are attempted is unchanged.
  for (const candidate of candidates) {
    const repaired = escapeControlCharsInsideJsonStrings(candidate);
    if (repaired === candidate) continue;
    try {
      return JSON.parse(repaired);
    } catch {
      // try next candidate
    }
  }
  return null;
}
84
+
85
/**
 * Coerces a parsed synthesis payload into a fixed shape, dropping anything
 * that is malformed or refers to unknown sources.
 *
 * @param {object|null|undefined} payload - Parsed model response; every field
 *   is read defensively, so a missing or partial payload is accepted.
 * @param {Array<{id: string}>} sources - Known sources; only their `id` values
 *   are used, to validate source references in the payload. A non-array value
 *   is treated as an empty list.
 * @param {string} [fallbackAnswer=""] - Text used for `answer` when the payload
 *   has none.
 * @returns {{
 *   answer: string,
 *   agreement: {level: string, summary: string},
 *   differences: string[],
 *   caveats: string[],
 *   claims: Array<{claim: string, support: string, sourceIds: string[]}>,
 *   recommendedSources: string[],
 * }} Normalized payload. String fields are passed through `trimText` with the
 *   limits visible below.
 */
export function normalizeSynthesisPayload(
  payload,
  sources,
  fallbackAnswer = "",
) {
  // Tolerate a missing source list instead of throwing on `.map`.
  const sourceList = Array.isArray(sources) ? sources : [];
  const sourceIds = new Set(sourceList.map((source) => source.id));

  // Unknown or missing agreement levels collapse to "mixed".
  const agreementLevel = [
    "high",
    "medium",
    "low",
    "mixed",
    "conflicting",
  ].includes(payload?.agreement?.level)
    ? payload.agreement.level
    : "mixed";

  const claims = Array.isArray(payload?.claims)
    ? payload.claims
        .map((claim) => ({
          claim: trimText(claim?.claim || "", 260),
          // Unknown or missing support values collapse to "moderate".
          support: ["strong", "moderate", "weak", "conflicting"].includes(
            claim?.support,
          )
            ? claim.support
            : "moderate",
          sourceIds: Array.isArray(claim?.sourceIds)
            ? claim.sourceIds.filter((id) => sourceIds.has(id))
            : [],
        }))
        .filter((claim) => claim.claim)
    : [];

  const recommendedSources = Array.isArray(payload?.recommendedSources)
    ? payload.recommendedSources.filter((id) => sourceIds.has(id)).slice(0, 6)
    : [];

  // Clean up fallback answer if it contains preamble text: when it holds a
  // "{" ... "}" pair, keep only the span from the first "{" to the last "}".
  // Use indexOf/lastIndexOf instead of [\s\S]* patterns to avoid ReDoS
  let cleanFallback = "";
  if (fallbackAnswer) {
    const firstBrace = fallbackAnswer.indexOf("{");
    const lastBrace = fallbackAnswer.lastIndexOf("}");
    if (firstBrace !== -1 && lastBrace !== -1 && firstBrace < lastBrace) {
      cleanFallback = fallbackAnswer.slice(firstBrace, lastBrace + 1);
    } else {
      cleanFallback = fallbackAnswer;
    }
  }

  // Shared shaping for the free-text lists: trim each entry, drop empties,
  // keep at most five.
  const toShortList = (items) =>
    Array.isArray(items)
      ? items
          .map((item) => trimText(item, 220))
          .filter(Boolean)
          .slice(0, 5)
      : [];

  return {
    answer: trimText(payload?.answer || cleanFallback || fallbackAnswer, 4000),
    agreement: {
      level: agreementLevel,
      summary: trimText(payload?.agreement?.summary || "", 280),
    },
    differences: toShortList(payload?.differences),
    caveats: toShortList(payload?.caveats),
    claims,
    recommendedSources,
  };
}
154
+
155
/**
 * Builds the text prompt that asks a model to merge per-engine search results
 * into one structured answer.
 *
 * The prompt contains the query, a JSON summary per engine (perplexity, bing,
 * google), a JSON registry of the leading sources, writing instructions, and
 * a response template wrapped in BEGIN_JSON / END_JSON markers.
 *
 * @param {string} query - The user's search query, interpolated verbatim.
 * @param {object} results - Per-engine results keyed by engine name; each
 *   entry carries either `error` or `answer`. Engines with no entry are
 *   skipped.
 * @param {Array<object>} sources - Sources in the order they should be
 *   presented. Fields read here: `id`, `title`, `domain`, `canonicalUrl`,
 *   `sourceType`, `isOfficial`, `engines`, `engineCount`, `perEngine`, `fetch`.
 * @param {{grounded?: boolean}} [options] - `grounded` raises the per-engine
 *   answer budget (4500 vs 2200), the number of registry sources (10 vs 8)
 *   and the snippet budget (700 vs 300).
 * @returns {string} The prompt, with sections joined by newlines.
 */
export function buildSynthesisPrompt(
  query,
  results,
  sources,
  { grounded = false } = {},
) {
  const engineSummaries = {};
  for (const engine of ["perplexity", "bing", "google"]) {
    const result = results[engine];
    if (!result) continue;
    if (result.error) {
      engineSummaries[engine] = {
        status: "error",
        error: String(result.error),
      };
      continue;
    }

    engineSummaries[engine] = {
      status: "ok",
      answer: trimText(result.answer || "", grounded ? 4500 : 2200),
      // Up to six source ids for this engine, best rank first; sources with
      // no rank recorded for the engine sort as rank 99.
      sourceIds: sources
        .filter((source) => source.engines.includes(engine))
        .sort(
          (a, b) =>
            (a.perEngine?.[engine]?.rank || 99) -
            (b.perEngine?.[engine]?.rank || 99),
        )
        .map((source) => source.id)
        .slice(0, 6),
    };
  }

  // Snippet budget: always include content for fetched sources so Gemini can
  // make citation decisions based on what the sources actually say, not just
  // their metadata. Grounded mode gets a larger budget per source.
  const snippetChars = grounded ? 700 : 300;
  const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
    id: source.id,
    title: source.title,
    domain: source.domain,
    canonicalUrl: source.canonicalUrl,
    sourceType: source.sourceType,
    isOfficial: source.isOfficial,
    engines: source.engines,
    engineCount: source.engineCount,
    // `undefined` is dropped by JSON.stringify, so sources that were never
    // fetched carry no `fetch` key in the prompt.
    fetch: source.fetch?.attempted
      ? {
          ok: source.fetch.ok,
          publishedTime: source.fetch.publishedTime || "",
          byline: source.fetch.byline || "",
          snippet: trimText(source.fetch.snippet || "", snippetChars),
        }
      : undefined,
  }));

  return [
    "You are a research synthesizer. Combine these search engine results into a single authoritative answer.",
    "",
    `Query: ${query}`,
    "",
    `Engine summaries:\n${JSON.stringify(engineSummaries, null, 2)}`,
    "",
    `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
    "",
    "Instructions:",
    "- Write a clear, direct answer in markdown (use headers/bullets where they help readability)",
    "- Cite sources inline as [S1], [S2] etc. when making specific claims",
    "- Prefer sources with content (fetch.ok=true and non-empty snippet) for citations",
    "- Note where the engines agree or meaningfully disagree",
    "- List any important caveats or limitations",
    "- recommendedSources: the 2-4 source IDs most worth reading for this query",
    "",
    "Respond ONLY with a JSON object wrapped in BEGIN_JSON / END_JSON markers:",
    "",
    "BEGIN_JSON",
    JSON.stringify(
      {
        answer: "<your markdown answer here>",
        agreement: {
          // Lists every level the payload normalizer in this file accepts.
          level: "high|medium|low|mixed|conflicting",
          summary: "<one sentence>",
        },
        differences: ["<notable difference between engines, if any>"],
        caveats: ["<important caveat or limitation>"],
        recommendedSources: ["S1", "S2"],
      },
      null,
      2,
    ),
    "END_JSON",
  ].join("\n");
}
248
+
249
/**
 * Summarises how much trust the aggregated search output deserves.
 *
 * @param {object} out - Aggregated search output. Reads `_sources` (list of
 *   source records), `_synthesis.agreement.level`, and one entry per engine
 *   name listed in `ALL_ENGINES`.
 * @returns {object} Metrics: source count, engine count of the first source,
 *   agreement level, engines that responded or failed, official and
 *   first-party source counts, fetch success rate (two decimals, 0 when
 *   nothing was attempted) and a count of sources per `sourceType`.
 */
export function buildConfidence(out) {
  const sourceList = Array.isArray(out._sources) ? out._sources : [];

  // Single pass over the sources, collecting every counter at once.
  const tally = { official: 0, firstParty: 0, attempted: 0, succeeded: 0 };
  const sourceTypeBreakdown = {};
  sourceList.forEach((entry) => {
    if (entry.isOfficial) tally.official += 1;
    if (entry.isOfficial || entry.sourceType === "maintainer-blog") {
      tally.firstParty += 1;
    }
    if (entry.fetch?.attempted) tally.attempted += 1;
    if (entry.fetch?.ok) tally.succeeded += 1;
    sourceTypeBreakdown[entry.sourceType] =
      (sourceTypeBreakdown[entry.sourceType] || 0) + 1;
  });

  // Engine count of the first listed source; 0 when there is none.
  const topConsensus = sourceList[0]?.engineCount || 0;

  // Prefer the synthesized agreement level; otherwise derive one from the
  // first source's engine count.
  let agreementLevel = out._synthesis?.agreement?.level;
  if (!agreementLevel) {
    if (topConsensus >= 3) {
      agreementLevel = "high";
    } else if (topConsensus >= 2) {
      agreementLevel = "medium";
    } else {
      agreementLevel = "low";
    }
  }

  let fetchedSourceSuccessRate = 0;
  if (tally.attempted > 0) {
    fetchedSourceSuccessRate = Number(
      (tally.succeeded / tally.attempted).toFixed(2),
    );
  }

  const hasAnswer = (engine) => out[engine]?.answer && !out[engine]?.error;
  const hasError = (engine) => out[engine]?.error;

  return {
    sourcesCount: sourceList.length,
    topSourceConsensus: topConsensus,
    agreementLevel,
    enginesResponded: ALL_ENGINES.filter(hasAnswer),
    enginesFailed: ALL_ENGINES.filter(hasError),
    officialSourceCount: tally.official,
    firstPartySourceCount: tally.firstParty,
    fetchedSourceSuccessRate,
    sourceTypeBreakdown,
  };
}