@gmickel/gno 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,368 @@
1
+ /**
2
+ * Bounded graph expansion for hybrid retrieval.
3
+ *
4
+ * @module src/pipeline/graph-retrieval
5
+ */
6
+
7
+ import type {
8
+ ChunkRow,
9
+ DocumentRow,
10
+ GraphEdgeConfidence,
11
+ GraphLink,
12
+ StorePort,
13
+ } from "../store/types";
14
+ import type { FusionCandidate } from "./types";
15
+
16
+ import { isWithinTemporalRange } from "./temporal";
17
+
18
/**
 * Diagnostics describing a single graph-expansion pass.
 */
export interface GraphRetrievalMeta {
  /** Set once the disabled / no-seed / store-capability pre-checks have passed. */
  attempted: boolean;
  /** True when the pass produced at least one candidate. */
  enabled: boolean;
  /** Number of distinct seed documents resolved from the top fused candidates. */
  seedCount: number;
  /** Number of graph candidates returned by the pass. */
  candidateCount: number;
  /** Upper bound applied to the number of returned candidates. */
  maxCandidates: number;
  /** How many seed-to-neighbor edges were scored, keyed by edge confidence. */
  edgeConfidence: Record<GraphEdgeConfidence, number>;
  /** Reason codes recorded whenever expansion was skipped or produced nothing. */
  fallbackReasons: string[];
}
27
+
28
/**
 * Outcome of a graph-expansion pass.
 */
export interface GraphRetrievalResult {
  /** Chunk references (content hash plus chunk sequence) contributed by graph neighbors. */
  candidates: Array<{ mirrorHash: string; seq: number }>;
  /** Diagnostics for the pass that produced `candidates`. */
  meta: GraphRetrievalMeta;
}
32
+
33
/** How many of the top fused candidates are used as expansion seeds. */
const GRAPH_SEED_LIMIT = 5;
/** Hard ceiling on the number of graph candidates returned per pass. */
const GRAPH_CANDIDATE_LIMIT = 20;
/** Node cap passed to the store's graph query. */
const GRAPH_NODE_LIMIT = 2_000;
/** Edge cap passed to the store's graph query. */
const GRAPH_EDGE_LIMIT = 10_000;

/**
 * Zeroed per-confidence edge counters.
 * Used as a template: callers spread it into a fresh object before counting.
 */
const EMPTY_EDGE_CONFIDENCE: Record<GraphEdgeConfidence, number> = {
  explicit: 0,
  inferred: 0,
  ambiguous: 0,
  similarity: 0,
};
44
+
45
/**
 * Map an edge confidence level to the multiplier applied to its score.
 *
 * @param confidence - Confidence label carried by a graph link.
 * @returns 1 for explicit, 0.65 for inferred, 0.35 for ambiguous and 0.25 for
 *   similarity edges; 0 for any value outside the known set.
 */
const confidenceWeight = (confidence: GraphEdgeConfidence): number => {
  switch (confidence) {
    case "explicit":
      return 1;
    case "inferred":
      return 0.65;
    case "ambiguous":
      return 0.35;
    case "similarity":
      return 0.25;
    default: {
      // Compile-time exhaustiveness guard. At runtime an unrecognised label
      // (e.g. persisted by a newer schema) contributes nothing instead of
      // returning undefined, which would turn every downstream score into NaN.
      const unexpected: never = confidence;
      void unexpected;
      return 0;
    }
  }
};
57
+
58
/**
 * Decide whether a document passes the temporal, author, and category
 * filters of the current query.
 *
 * The temporal decision is delegated to `isWithinTemporalRange`; the author
 * and category checks are skipped when the corresponding option is absent
 * or empty.
 *
 * @param doc - Document row under consideration.
 * @param options - Filters taken from the search request.
 * @returns `true` when no supplied filter rejects the document.
 */
const matchesDocumentFilters = (
  doc: DocumentRow,
  options: {
    since?: string;
    until?: string;
    categories?: string[];
    author?: string;
  }
): boolean => {
  const { since, until, categories, author } = options;

  const inRange = isWithinTemporalRange(doc.sourceMtime, { since, until });
  if (!inRange) {
    return false;
  }

  // Case-insensitive substring match; a document without an author can
  // never satisfy an author filter.
  if (author) {
    const docAuthor = doc.author?.toLowerCase();
    const authorMatches = docAuthor?.includes(author.toLowerCase()) ?? false;
    if (!authorMatches) {
      return false;
    }
  }

  if (!categories?.length) {
    return true;
  }

  // A requested category may match either the content type or any of the
  // document's own categories, ignoring case.
  const wanted = new Set(categories.map((name) => name.toLowerCase()));
  const typeHit = doc.contentType
    ? wanted.has(doc.contentType.toLowerCase())
    : false;
  const categoryHit = (doc.categories ?? []).some((name) =>
    wanted.has(name.toLowerCase())
  );
  return typeHit || categoryHit;
};
95
+
96
/**
 * Keep only the documents that satisfy the query's tag constraints.
 *
 * @param store - Store used to load the tags of all documents in one batch.
 * @param docs - Documents to filter.
 * @param options - `tagsAll` must all be present on a document; at least one
 *   of `tagsAny` must be present. Empty or missing lists impose no constraint.
 * @returns The input array itself when no tag constraint is set, an empty
 *   array when the tag lookup fails, otherwise the matching documents in
 *   their original order.
 */
const filterDocsByTags = async (
  store: StorePort,
  docs: DocumentRow[],
  options: { tagsAll?: string[]; tagsAny?: string[] }
): Promise<DocumentRow[]> => {
  const requiredTags = options.tagsAll ?? [];
  const alternativeTags = options.tagsAny ?? [];
  if (requiredTags.length === 0 && alternativeTags.length === 0) {
    return docs;
  }

  const lookup = await store.getTagsBatch(docs.map((doc) => doc.id));
  if (!lookup.ok) {
    // Without tag data the constraints cannot be verified, so nothing passes.
    return [];
  }
  const tagsByDocId = lookup.value;

  return docs.filter((doc) => {
    const tagRows = tagsByDocId.get(doc.id) ?? [];
    const names = new Set(tagRows.map((row) => row.tag));
    // `every` over an empty list is vacuously true, matching "no constraint".
    const satisfiesAll = requiredTags.every((name) => names.has(name));
    const satisfiesAny =
      alternativeTags.length === 0 ||
      alternativeTags.some((name) => names.has(name));
    return satisfiesAll && satisfiesAny;
  });
};
129
+
130
/**
 * Pick which chunk of a neighbor document should represent it.
 *
 * Preference order: the chunk sequence already recorded for this content
 * hash in `preferredSeqByHash` (if that chunk exists and passes the language
 * filter), then the first chunk that passes the language filter.
 *
 * @param doc - Neighbor document.
 * @param chunks - Chunks stored for the document's content hash.
 * @param preferredSeqByHash - Chunk sequence previously chosen per content hash.
 * @param lang - Optional language a chunk must match.
 * @returns The chosen chunk sequence, or `null` when the document has no
 *   content hash or no eligible chunk.
 */
const chooseCandidateSeq = (
  doc: DocumentRow,
  chunks: ChunkRow[],
  preferredSeqByHash: Map<string, number>,
  lang?: string
): number | null => {
  const hash = doc.mirrorHash;
  if (!hash) {
    return null;
  }

  const languageOk = (chunk: ChunkRow): boolean =>
    !lang || chunk.language === lang;

  const wantedSeq = preferredSeqByHash.get(hash);
  if (wantedSeq !== undefined) {
    const wantedChunk = chunks.find((chunk) => chunk.seq === wantedSeq);
    if (wantedChunk && languageOk(wantedChunk)) {
      return wantedSeq;
    }
  }

  // Without a language filter every chunk is eligible, so this yields the
  // first chunk; otherwise the first chunk in the requested language.
  const fallback = chunks.find(languageOk);
  return fallback?.seq ?? null;
};
153
+
154
/**
 * Build a diagnostics record with neutral defaults.
 *
 * @param overrides - Fields that replace the corresponding defaults.
 * @returns A new meta object; the counters and reason list are fresh
 *   instances unless supplied through `overrides`.
 */
const createMeta = (
  overrides: Partial<GraphRetrievalMeta> = {}
): GraphRetrievalMeta => {
  const defaults: GraphRetrievalMeta = {
    attempted: false,
    enabled: false,
    seedCount: 0,
    candidateCount: 0,
    maxCandidates: GRAPH_CANDIDATE_LIMIT,
    // Copy the template so callers can mutate the counters safely.
    edgeConfidence: { ...EMPTY_EDGE_CONFIDENCE },
    fallbackReasons: [],
  };
  return { ...defaults, ...overrides };
};
166
+
167
/**
 * Credit a neighbor document with the score of one seed-to-neighbor edge.
 *
 * The contribution is `confidenceWeight(confidence) * edgeWeight / seedRank`
 * and is added to whatever the neighbor has accumulated so far.
 *
 * @param scores - Accumulated score per neighbor docid (mutated).
 * @param link - Graph edge connecting a seed to the neighbor.
 * @param neighborDocid - Docid of the non-seed end of the edge.
 * @param seedRank - 1-based rank of the seed the edge is attached to.
 * @param edgeConfidence - Per-confidence edge counters (mutated).
 */
const addEdgeCandidate = (
  scores: Map<string, number>,
  link: GraphLink,
  neighborDocid: string,
  seedRank: number,
  edgeConfidence: Record<GraphEdgeConfidence, number>
): void => {
  // `?? 0` keeps the counter numeric even if the key has not been seeded.
  edgeConfidence[link.confidence] = (edgeConfidence[link.confidence] ?? 0) + 1;

  const rawWeight = typeof link.weight === "number" ? link.weight : Number.NaN;
  let edgeWeight: number;
  if (link.confidence === "similarity") {
    // Similarity weights are clamped to [0, 1]; an unusable weight counts as 0.
    edgeWeight = Number.isNaN(rawWeight)
      ? 0
      : Math.max(0, Math.min(1, rawWeight));
  } else {
    // Other edges never weigh less than 1. A missing, NaN, or infinite weight
    // falls back to that floor: Math.max(1, NaN) is NaN and would poison the
    // accumulated score, and infinite scores break the ranking comparator.
    edgeWeight = Number.isFinite(rawWeight) ? Math.max(1, rawWeight) : 1;
  }

  const seedWeight = 1 / seedRank;
  const contribution =
    confidenceWeight(link.confidence) * edgeWeight * seedWeight;
  scores.set(neighborDocid, (scores.get(neighborDocid) ?? 0) + contribution);
};
184
+
185
/**
 * Expand top retrieval candidates through one-hop graph neighbors.
 *
 * Steps:
 * 1. Resolve the first `GRAPH_SEED_LIMIT` fused candidates to seed documents.
 * 2. Load the link graph and score every non-seed document that shares an
 *    edge with a seed.
 * 3. Look up the scored neighbors, apply the metadata and tag filters, and
 *    keep the best `maxCandidates` distinct content hashes.
 * 4. Choose one chunk per kept document.
 *
 * Every early exit records a reason code in `meta.fallbackReasons` and
 * returns an empty candidate list.
 *
 * @param store - Store providing document, graph, tag, and chunk lookups.
 * @param fusedCandidates - Candidates already ranked by fusion, best first.
 * @param options - Scope (`collection`), limits, the `disabled` switch, and
 *   the filters neighbors must satisfy.
 * @returns Distinct graph candidates in descending neighbor score, plus
 *   diagnostics.
 */
export async function expandGraphCandidates(
  store: StorePort,
  fusedCandidates: FusionCandidate[],
  options: {
    collection?: string;
    includeSimilar?: boolean;
    limit?: number;
    candidateLimit?: number;
    disabled?: boolean;
    lang?: string;
    tagsAll?: string[];
    tagsAny?: string[];
    since?: string;
    until?: string;
    categories?: string[];
    author?: string;
  } = {}
): Promise<GraphRetrievalResult> {
  // Clamp the requested limit to [1, GRAPH_CANDIDATE_LIMIT]. A non-finite
  // request (NaN would make `slice(0, NaN)` return nothing) falls back to
  // the ceiling; fractional requests are floored so meta reports what is
  // actually applied.
  const requestedLimit =
    options.candidateLimit ?? options.limit ?? GRAPH_CANDIDATE_LIMIT;
  const maxCandidates = Number.isFinite(requestedLimit)
    ? Math.max(1, Math.min(GRAPH_CANDIDATE_LIMIT, Math.floor(requestedLimit)))
    : GRAPH_CANDIDATE_LIMIT;
  const meta = createMeta({ maxCandidates });

  const bail = (reason: string): GraphRetrievalResult => {
    meta.fallbackReasons.push(reason);
    return { candidates: [], meta };
  };

  if (options.disabled) {
    return bail("graph_disabled");
  }
  if (fusedCandidates.length === 0) {
    return bail("graph_no_seed_candidates");
  }
  if (typeof store.getGraph !== "function") {
    return bail("graph_unavailable");
  }

  meta.attempted = true;

  // ── Seeds ────────────────────────────────────────────────────────────────
  const seedCandidates = fusedCandidates.slice(0, GRAPH_SEED_LIMIT);
  // 1-based rank of the first seed candidate carrying each content hash.
  const seedRankByHash = new Map<string, number>();
  seedCandidates.forEach((candidate, index) => {
    if (!seedRankByHash.has(candidate.mirrorHash)) {
      seedRankByHash.set(candidate.mirrorHash, index + 1);
    }
  });
  // Chunk already chosen by fusion for each content hash (first one wins).
  const preferredSeqByHash = new Map<string, number>();
  for (const candidate of fusedCandidates) {
    if (!preferredSeqByHash.has(candidate.mirrorHash)) {
      preferredSeqByHash.set(candidate.mirrorHash, candidate.seq);
    }
  }

  const seedDocsResult = await store.getDocumentsByMirrorHashes(
    [...seedRankByHash.keys()],
    {
      collection: options.collection,
      activeOnly: true,
    }
  );
  if (!seedDocsResult.ok || seedDocsResult.value.length === 0) {
    return bail("graph_seed_lookup_empty");
  }

  const seedByDocid = new Map<string, { doc: DocumentRow; rank: number }>();
  for (const doc of seedDocsResult.value) {
    if (!doc.mirrorHash) {
      continue;
    }
    const rank = seedRankByHash.get(doc.mirrorHash);
    if (rank === undefined) {
      continue;
    }
    seedByDocid.set(doc.docid, { doc, rank });
  }
  meta.seedCount = seedByDocid.size;
  if (seedByDocid.size === 0) {
    return bail("graph_seed_lookup_empty");
  }

  // ── Graph traversal ──────────────────────────────────────────────────────
  const graphResult = await store.getGraph({
    collection: options.collection,
    limitNodes: GRAPH_NODE_LIMIT,
    limitEdges: GRAPH_EDGE_LIMIT,
    includeSimilar: options.includeSimilar ?? false,
    linkedOnly: true,
  });
  if (!graphResult.ok) {
    return bail("graph_query_failed");
  }
  if (graphResult.value.links.length === 0) {
    return bail("graph_empty");
  }

  const neighborScores = new Map<string, number>();
  const edgeConfidence = { ...EMPTY_EDGE_CONFIDENCE };
  for (const link of graphResult.value.links) {
    const sourceSeed = seedByDocid.get(link.source);
    const targetSeed = seedByDocid.get(link.target);
    // Edges are followed in both directions; seed-to-seed edges add nothing.
    if (sourceSeed && !seedByDocid.has(link.target)) {
      addEdgeCandidate(
        neighborScores,
        link,
        link.target,
        sourceSeed.rank,
        edgeConfidence
      );
    }
    if (targetSeed && !seedByDocid.has(link.source)) {
      addEdgeCandidate(
        neighborScores,
        link,
        link.source,
        targetSeed.rank,
        edgeConfidence
      );
    }
  }
  meta.edgeConfidence = edgeConfidence;

  // Highest score first; docid breaks ties so the order is deterministic.
  const rankedNeighborDocids = [...neighborScores.entries()]
    .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
    .map(([docid]) => docid);
  if (rankedNeighborDocids.length === 0) {
    return bail("graph_no_new_candidates");
  }

  // ── Neighbor resolution and filtering ────────────────────────────────────
  const docsResult = await store.getDocumentsByDocids(rankedNeighborDocids, {
    collection: options.collection,
    activeOnly: true,
  });
  if (!docsResult.ok) {
    return bail("graph_neighbor_lookup_failed");
  }

  const metadataFilteredDocs = docsResult.value.filter(
    (doc) => Boolean(doc.mirrorHash) && matchesDocumentFilters(doc, options)
  );
  const docs = await filterDocsByTags(store, metadataFilteredDocs, options);
  const docByDocid = new Map(docs.map((doc) => [doc.docid, doc]));

  // Walk neighbors best-first, keeping one document per content hash.
  // Documents with identical content share a hash and would otherwise emit
  // the same { mirrorHash, seq } twice and use up candidate slots.
  const rankedDocs: DocumentRow[] = [];
  const seenHashes = new Set<string>();
  for (const docid of rankedNeighborDocids) {
    if (rankedDocs.length >= maxCandidates) {
      break;
    }
    const doc = docByDocid.get(docid);
    if (!doc?.mirrorHash || seenHashes.has(doc.mirrorHash)) {
      continue;
    }
    seenHashes.add(doc.mirrorHash);
    rankedDocs.push(doc);
  }

  const chunksResult = await store.getChunksBatch([...seenHashes]);
  if (!chunksResult.ok) {
    return bail("graph_neighbor_chunks_failed");
  }

  // ── Chunk selection ──────────────────────────────────────────────────────
  const candidates: Array<{ mirrorHash: string; seq: number }> = [];
  for (const doc of rankedDocs) {
    const mirrorHash = doc.mirrorHash;
    if (!mirrorHash) {
      continue;
    }
    const seq = chooseCandidateSeq(
      doc,
      chunksResult.value.get(mirrorHash) ?? [],
      preferredSeqByHash,
      options.lang
    );
    if (seq !== null) {
      candidates.push({ mirrorHash, seq });
    }
  }

  meta.enabled = candidates.length > 0;
  meta.candidateCount = candidates.length;
  if (candidates.length === 0) {
    meta.fallbackReasons.push("graph_neighbor_lookup_empty");
  }

  return { candidates, meta };
}
@@ -37,6 +37,7 @@ import {
37
37
  explainVector,
38
38
  } from "./explain";
39
39
  import { type RankedInput, rrfFuse, toRankedInput } from "./fusion";
40
+ import { expandGraphCandidates } from "./graph-retrieval";
40
41
  import { selectBestChunkForSteering } from "./intent";
41
42
  import { detectQueryLanguage } from "./query-language";
42
43
  import {
@@ -269,6 +270,7 @@ export async function searchHybrid(
269
270
  expansionMs: 0,
270
271
  bm25Ms: 0,
271
272
  vectorMs: 0,
273
+ graphMs: 0,
272
274
  fusionMs: 0,
273
275
  rerankMs: 0,
274
276
  assemblyMs: 0,
@@ -541,8 +543,42 @@ export async function searchHybrid(
541
543
  // 3. RRF Fusion
542
544
  // ─────────────────────────────────────────────────────────────────────────
543
545
  const fusionStartedAt = performance.now();
544
- const fusedCandidates = rrfFuse(rankedInputs, pipelineConfig.rrf);
546
+ const candidateLimit =
547
+ options.candidateLimit ?? pipelineConfig.rerankCandidates;
548
+ let fusedCandidates = rrfFuse(rankedInputs, pipelineConfig.rrf);
549
+
545
550
  timings.fusionMs = performance.now() - fusionStartedAt;
551
+ const graphStartedAt = performance.now();
552
+ const graphExpansion = await expandGraphCandidates(store, fusedCandidates, {
553
+ collection: options.collection,
554
+ includeSimilar: vectorAvailable,
555
+ limit,
556
+ candidateLimit,
557
+ disabled: options.noGraph,
558
+ lang: options.lang,
559
+ tagsAll: options.tagsAll,
560
+ tagsAny: options.tagsAny,
561
+ since: temporalRange.since,
562
+ until: temporalRange.until,
563
+ categories: options.categories,
564
+ author: options.author,
565
+ });
566
+ timings.graphMs = performance.now() - graphStartedAt;
567
+ if (graphExpansion.candidates.length > 0) {
568
+ const graphFusionStartedAt = performance.now();
569
+ rankedInputs.push(toRankedInput("graph", graphExpansion.candidates));
570
+ fusedCandidates = rrfFuse(rankedInputs, pipelineConfig.rrf);
571
+ timings.fusionMs += performance.now() - graphFusionStartedAt;
572
+ }
573
+ if (graphExpansion.meta.fallbackReasons.length > 0) {
574
+ counters.fallbackEvents.push(...graphExpansion.meta.fallbackReasons);
575
+ }
576
+ explainLines.push({
577
+ stage: "graph",
578
+ message: graphExpansion.meta.attempted
579
+ ? `seeds=${graphExpansion.meta.seedCount}, candidates=${graphExpansion.meta.candidateCount}/${graphExpansion.meta.maxCandidates}, explicit=${graphExpansion.meta.edgeConfidence.explicit}, inferred=${graphExpansion.meta.edgeConfidence.inferred}, ambiguous=${graphExpansion.meta.edgeConfidence.ambiguous}, similarity=${graphExpansion.meta.edgeConfidence.similarity}`
580
+ : `skipped (${graphExpansion.meta.fallbackReasons.join(", ") || "disabled"})`,
581
+ });
546
582
  explainLines.push(
547
583
  explainFusion(pipelineConfig.rrf.k, fusedCandidates.length)
548
584
  );
@@ -551,8 +587,6 @@ export async function searchHybrid(
551
587
  // 4. Reranking
552
588
  // ─────────────────────────────────────────────────────────────────────────
553
589
  const rerankStartedAt = performance.now();
554
- const candidateLimit =
555
- options.candidateLimit ?? pipelineConfig.rerankCandidates;
556
590
  const rerankResult = await rerankCandidates(
557
591
  { rerankPort: options.noRerank ? null : rerankPort, store },
558
592
  query,
@@ -888,6 +922,14 @@ export async function searchHybrid(
888
922
  categories: options.categories,
889
923
  author: options.author,
890
924
  candidateLimit,
925
+ graphExpansion: {
926
+ enabled: graphExpansion.meta.enabled,
927
+ seedCount: graphExpansion.meta.seedCount,
928
+ candidateCount: graphExpansion.meta.candidateCount,
929
+ maxCandidates: graphExpansion.meta.maxCandidates,
930
+ edgeConfidence: graphExpansion.meta.edgeConfidence,
931
+ fallbackReasons: graphExpansion.meta.fallbackReasons,
932
+ },
891
933
  queryLanguage,
892
934
  queryModes: queryModeSummary,
893
935
  explain: explainData,
@@ -81,6 +81,20 @@ export interface SearchMeta {
81
81
  author?: string;
82
82
  /** Rerank candidate limit used */
83
83
  candidateLimit?: number;
84
+ /** Bounded graph expansion summary, when hybrid query evaluates graph neighbors */
85
+ graphExpansion?: {
86
+ enabled: boolean;
87
+ seedCount: number;
88
+ candidateCount: number;
89
+ maxCandidates: number;
90
+ edgeConfidence: {
91
+ explicit: number;
92
+ inferred: number;
93
+ ambiguous: number;
94
+ similarity: number;
95
+ };
96
+ fallbackReasons: string[];
97
+ };
84
98
  /** Explicit exclusion terms applied */
85
99
  exclude?: string[];
86
100
  /** Explain data (when --explain is used) */
@@ -160,6 +174,8 @@ export type HybridSearchOptions = SearchOptions & {
160
174
  candidateLimit?: number;
161
175
  /** Enable explain output */
162
176
  explain?: boolean;
177
+ /** Disable bounded one-hop graph candidate expansion */
178
+ noGraph?: boolean;
163
179
  /** Language hint for prompt selection (does NOT filter retrieval, only affects expansion prompts) */
164
180
  queryLanguageHint?: string;
165
181
  };
@@ -225,7 +241,8 @@ export type FusionSource =
225
241
  | "vector"
226
242
  | "bm25_variant"
227
243
  | "vector_variant"
228
- | "hyde";
244
+ | "hyde"
245
+ | "graph";
229
246
 
230
247
  /** Fusion candidate with ranks from different sources */
231
248
  export interface FusionCandidate {