@knolo/core 3.2.0 → 3.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -2
- package/dist/builder.d.ts +4 -0
- package/dist/builder.js +23 -1
- package/dist/graph/build_claim_graph.d.ts +5 -0
- package/dist/graph/build_claim_graph.js +88 -0
- package/dist/graph/claim_graph.d.ts +34 -0
- package/dist/graph/claim_graph.js +65 -0
- package/dist/graph/log.d.ts +33 -0
- package/dist/graph/log.js +106 -0
- package/dist/graph/query_expand.d.ts +6 -0
- package/dist/graph/query_expand.js +57 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +8 -0
- package/dist/pack.runtime.d.ts +7 -0
- package/dist/pack.runtime.js +54 -9
- package/dist/query.d.ts +15 -0
- package/dist/query.js +108 -7
- package/dist/semantic/cosine.d.ts +2 -0
- package/dist/semantic/cosine.js +20 -0
- package/dist/semantic/provider.d.ts +3 -0
- package/dist/semantic/provider.js +13 -0
- package/dist/semantic/rerank.d.ts +23 -0
- package/dist/semantic/rerank.js +42 -0
- package/dist/semantic/sidecar.d.ts +10 -0
- package/dist/semantic/sidecar.js +32 -0
- package/dist/semantic/types.d.ts +44 -0
- package/dist/semantic/types.js +1 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -319,10 +319,73 @@ Properties:
|
|
|
319
319
|
|
|
320
320
|
---
|
|
321
321
|
|
|
322
|
-
#
|
|
322
|
+
# 🕸 ClaimGraph API
|
|
323
323
|
|
|
324
|
-
|
|
324
|
+
`@knolo/core` includes a deterministic ClaimGraph subsystem.
|
|
325
|
+
|
|
326
|
+
## Build-time config
|
|
327
|
+
|
|
328
|
+
```ts
|
|
329
|
+
type BuildPackOptions = {
|
|
330
|
+
graph?: {
|
|
331
|
+
enabled?: boolean; // default true
|
|
332
|
+
maxEdgesPerDoc?: number; // default 500
|
|
333
|
+
};
|
|
334
|
+
};
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Query-time config
|
|
338
|
+
|
|
339
|
+
```ts
|
|
340
|
+
type QueryOptions = {
|
|
341
|
+
graph?: {
|
|
342
|
+
expand?: boolean; // default false
|
|
343
|
+
maxExtraTerms?: number; // default 12
|
|
344
|
+
predicates?: string[]; // default ['defined_as', 'is', 'mentions', 'ref']
|
|
345
|
+
};
|
|
346
|
+
};
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
## Exports
|
|
325
350
|
|
|
351
|
+
```ts
|
|
352
|
+
import {
|
|
353
|
+
buildClaimGraph,
|
|
354
|
+
getClaimGraph,
|
|
355
|
+
applyClaimGraphLog,
|
|
356
|
+
mergeClaimGraphLogs,
|
|
357
|
+
expandQueryWithGraph,
|
|
358
|
+
createGraphLog,
|
|
359
|
+
appendOp,
|
|
360
|
+
} from '@knolo/core';
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
Types:
|
|
364
|
+
|
|
365
|
+
* `ClaimNode`
|
|
366
|
+
* `ClaimEdge`
|
|
367
|
+
* `ClaimGraph`
|
|
368
|
+
* `ClaimOp`
|
|
369
|
+
* `ClaimGraphLog`
|
|
370
|
+
|
|
371
|
+
## Notes on determinism and bounds
|
|
372
|
+
|
|
373
|
+
* Node IDs are hash-derived from normalized labels.
|
|
374
|
+
* Edge IDs are hash-derived from `(from, predicate, to, evidence)`.
|
|
375
|
+
* Node labels are normalized and deterministically truncated.
|
|
376
|
+
* Evidence arrays are sorted in ascending order and de-duplicated.
|
|
377
|
+
* Node/edge arrays are sorted by ID in final graph.
|
|
378
|
+
* Extraction is bounded with `maxEdgesPerDoc`.
|
|
379
|
+
* Query expansion is bounded with `maxExtraTerms` and stable ordering.
|
|
326
380
|
|
|
381
|
+
## Pack format note
|
|
327
382
|
|
|
383
|
+
`.knolo` binary layout now supports an optional trailing ClaimGraph JSON section after existing sections.
|
|
384
|
+
Runtimes that ignore unknown trailing bytes remain compatible.
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
# 📄 License
|
|
389
|
+
|
|
390
|
+
Apache-2.0
|
|
328
391
|
|
package/dist/builder.d.ts
CHANGED
package/dist/builder.js
CHANGED
|
@@ -9,6 +9,7 @@ import { tokenize } from './tokenize.js';
|
|
|
9
9
|
import { getTextEncoder } from './utils/utf8.js';
|
|
10
10
|
import { encodeScaleF16, quantizeEmbeddingInt8L2Norm } from './semantic.js';
|
|
11
11
|
import { validateAgentRegistry } from './agent.js';
|
|
12
|
+
import { buildClaimGraph } from './graph/build_claim_graph.js';
|
|
12
13
|
export async function buildPack(docs, opts = {}) {
|
|
13
14
|
const normalizedDocs = validateDocs(docs);
|
|
14
15
|
// Prepare blocks (strip MD) and carry heading/docId for optional boosts.
|
|
@@ -23,6 +24,10 @@ export async function buildPack(docs, opts = {}) {
|
|
|
23
24
|
const totalTokens = blockTokenLens.reduce((sum, len) => sum + len, 0);
|
|
24
25
|
const avgBlockLen = blocks.length ? totalTokens / blocks.length : 1;
|
|
25
26
|
const agents = normalizeAgents(opts.agents);
|
|
27
|
+
const graphEnabled = opts.graph?.enabled ?? true;
|
|
28
|
+
const claimGraph = graphEnabled
|
|
29
|
+
? buildClaimGraph(normalizedDocs, { maxEdgesPerDoc: opts.graph?.maxEdgesPerDoc })
|
|
30
|
+
: null;
|
|
26
31
|
const meta = {
|
|
27
32
|
version: 3,
|
|
28
33
|
stats: {
|
|
@@ -32,6 +37,15 @@ export async function buildPack(docs, opts = {}) {
|
|
|
32
37
|
avgBlockLen,
|
|
33
38
|
},
|
|
34
39
|
...(agents ? { agents } : {}),
|
|
40
|
+
...(claimGraph
|
|
41
|
+
? {
|
|
42
|
+
claimGraph: {
|
|
43
|
+
version: 1,
|
|
44
|
+
nodes: claimGraph.nodes.length,
|
|
45
|
+
edges: claimGraph.edges.length,
|
|
46
|
+
},
|
|
47
|
+
}
|
|
48
|
+
: {}),
|
|
35
49
|
};
|
|
36
50
|
// Persist blocks as objects to optionally carry heading/docId/token length.
|
|
37
51
|
const blocksPayload = blocks.map((b, i) => ({
|
|
@@ -54,6 +68,7 @@ export async function buildPack(docs, opts = {}) {
|
|
|
54
68
|
? enc.encode(JSON.stringify(semanticSection.semJson))
|
|
55
69
|
: undefined;
|
|
56
70
|
const semBlob = semanticSection?.semBlob;
|
|
71
|
+
const graphBytes = claimGraph ? enc.encode(JSON.stringify(claimGraph)) : undefined;
|
|
57
72
|
const totalLength = 4 +
|
|
58
73
|
metaBytes.length +
|
|
59
74
|
4 +
|
|
@@ -64,7 +79,8 @@ export async function buildPack(docs, opts = {}) {
|
|
|
64
79
|
blocksBytes.length +
|
|
65
80
|
(semanticEnabled && semBytes && semBlob
|
|
66
81
|
? 4 + semBytes.length + 4 + semBlob.length
|
|
67
|
-
: 0)
|
|
82
|
+
: 0) +
|
|
83
|
+
(graphBytes ? 4 + graphBytes.length : 0);
|
|
68
84
|
const out = new Uint8Array(totalLength);
|
|
69
85
|
const dv = new DataView(out.buffer);
|
|
70
86
|
let offset = 0;
|
|
@@ -98,6 +114,12 @@ export async function buildPack(docs, opts = {}) {
|
|
|
98
114
|
dv.setUint32(offset, semBlob.length, true);
|
|
99
115
|
offset += 4;
|
|
100
116
|
out.set(semBlob, offset);
|
|
117
|
+
offset += semBlob.length;
|
|
118
|
+
}
|
|
119
|
+
if (graphBytes) {
|
|
120
|
+
dv.setUint32(offset, graphBytes.length, true);
|
|
121
|
+
offset += 4;
|
|
122
|
+
out.set(graphBytes, offset);
|
|
101
123
|
}
|
|
102
124
|
return out;
|
|
103
125
|
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { canonicalEvidence, computeEdgeId, computeNodeId, finalizeGraph, normalizeClaimLabel, } from './claim_graph.js';
|
|
2
|
+
const DEF_RE = /^([A-Za-z0-9 _-]{2,80})\s+(is|are)\s+(.{2,120})[.?!]/;
|
|
3
|
+
const MD_LINK_RE = /\[([^\]]{1,200})\]\(([^)\s]{1,200})\)/g;
|
|
4
|
+
const WIKI_RE = /\[\[([^\]]{1,200})\]\]/g;
|
|
5
|
+
const HEADING_RE = /^(#{1,3})\s+(.+)$/gm;
|
|
6
|
+
const STOPWORDS = new Set(['a', 'an', 'and', 'or', 'the', 'it', 'they', 'this', 'that', 'these', 'those']);
|
|
7
|
+
/**
 * Extracts a ClaimGraph from a list of documents.
 *
 * Per document, edges are collected from four sources: markdown links
 * (`ref`), wiki links (`mentions`, from the document label), headings
 * followed by a sentence (`defined_as`) and "<subject> is/are <object>"
 * sentences (`is`). The per-document edge list is sorted by id and capped
 * at `maxEdgesPerDoc` before being merged into the global graph.
 *
 * @param docs Documents exposing `text` and optionally `id` / `heading`.
 * @param opts `maxEdgesPerDoc` caps edges kept per document (default 500,
 *   minimum 1; a value that is not a number falls back to 500).
 * @returns The finalized graph produced by `finalizeGraph`.
 */
export function buildClaimGraph(docs, opts = {}) {
    // Math.max(1, NaN) is NaN and `slice(0, NaN)` keeps nothing, which would
    // silently discard every edge; fall back to the default instead.
    const requestedCap = Math.max(1, opts.maxEdgesPerDoc ?? 500);
    const maxEdgesPerDoc = Number.isNaN(requestedCap) ? 500 : requestedCap;
    // Ordinal comparison: `localeCompare` depends on the runtime locale, which
    // would make the set of edges surviving the cap environment-dependent.
    const byEdgeId = (a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0);
    const nodeById = new Map();
    const edgeById = new Map();
    for (let i = 0; i < docs.length; i++) {
        const doc = docs[i];
        const docLabel = normalizeLabel(doc.id || doc.heading || `doc_${i}`);
        const local = [];
        // Nodes are staged per document so that endpoints of edges dropped by
        // the cap do not leak into the final graph as orphan nodes.
        const docNodes = new Map();
        for (const m of doc.text.matchAll(MD_LINK_RE)) {
            addEdge(local, docNodes, normalizeLabel(m[1]), 'ref', normalizeLabel(m[2]), [i]);
        }
        for (const m of doc.text.matchAll(WIKI_RE)) {
            addEdge(local, docNodes, docLabel, 'mentions', normalizeLabel(m[1]), [i]);
        }
        const headingMatches = Array.from(doc.text.matchAll(HEADING_RE));
        for (const h of headingMatches) {
            const headingLabel = normalizeLabel(h[2] || '');
            const headingStart = h.index ?? 0;
            const sentence = firstSentenceAfter(doc.text, headingStart + h[0].length);
            if (sentence) {
                addEdge(local, docNodes, headingLabel, 'defined_as', normalizeLabel(sentence), [i]);
            }
        }
        for (const sentence of splitSentences(doc.text)) {
            const m = sentence.match(DEF_RE);
            if (!m)
                continue;
            const subject = normalizeLabel(m[1]);
            if (!subject || isStopwordOnly(subject))
                continue;
            const objectSnippet = normalizeLabel(m[3]);
            addEdge(local, docNodes, subject, 'is', objectSnippet, [i]);
        }
        local.sort(byEdgeId);
        for (const edge of local.slice(0, maxEdgesPerDoc)) {
            // Promote only the endpoints of edges that were actually kept;
            // the first label registered for an id wins, as before.
            for (const nodeId of [edge.from, edge.to]) {
                if (!nodeById.has(nodeId))
                    nodeById.set(nodeId, docNodes.get(nodeId));
            }
            const existing = edgeById.get(edge.id);
            if (existing) {
                existing.evidence = canonicalEvidence([...(existing.evidence ?? []), ...(edge.evidence ?? [])]);
            }
            else {
                edgeById.set(edge.id, edge);
            }
        }
    }
    return finalizeGraph({ version: 1, nodes: [...nodeById.values()], edges: [...edgeById.values()] });
}
|
|
53
|
+
function addEdge(local, nodeById, fromLabel, p, toLabel, evidence) {
|
|
54
|
+
if (!fromLabel || !toLabel)
|
|
55
|
+
return;
|
|
56
|
+
const fromId = ensureNode(nodeById, fromLabel);
|
|
57
|
+
const toId = ensureNode(nodeById, toLabel);
|
|
58
|
+
const edgeEvidence = canonicalEvidence(evidence);
|
|
59
|
+
const id = computeEdgeId(fromId, p, toId, edgeEvidence);
|
|
60
|
+
local.push({ id, from: fromId, p, to: toId, evidence: edgeEvidence });
|
|
61
|
+
}
|
|
62
|
+
/**
 * Returns the node id for `label`, inserting `{ id, label }` into
 * `nodeById` the first time that id is seen. An existing entry is kept.
 */
function ensureNode(nodeById, label) {
    const nodeId = computeNodeId(label);
    if (nodeById.has(nodeId)) {
        return nodeId;
    }
    nodeById.set(nodeId, { id: nodeId, label });
    return nodeId;
}
|
|
68
|
+
/** Normalizes a label via `normalizeClaimLabel` with a 200-character limit. */
function normalizeLabel(input) {
    const maxLabelLength = 200;
    return normalizeClaimLabel(input, maxLabelLength);
}
|
|
71
|
+
/**
 * Splits text into trimmed, non-empty sentences. A boundary is any run of
 * whitespace that directly follows `.`, `?` or `!`; the punctuation stays
 * attached to the sentence it ends. CRLF pairs are converted to LF first.
 */
function splitSentences(text) {
    const unixNewlines = text.replace(/\r\n/g, '\n');
    const sentences = [];
    for (const piece of unixNewlines.split(/(?<=[.?!])\s+/)) {
        const trimmed = piece.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
|
|
78
|
+
/**
 * Returns the first sentence found after `startIdx`, skipping the remainder
 * of the line `startIdx` sits on. The result is capped at 240 characters;
 * an empty string is returned when nothing but whitespace follows.
 */
function firstSentenceAfter(text, startIdx) {
    const remainder = text.slice(startIdx);
    // Drop the rest of the current line plus the newline(s) that end it.
    const body = remainder.replace(/^[^\n]*\n+/, '').trim();
    if (body.length === 0) {
        return '';
    }
    const [lead = ''] = splitSentences(body);
    return lead.slice(0, 240);
}
|
|
85
|
+
/**
 * True when `subject` contains at least one word and every
 * whitespace-separated word is in `STOPWORDS`.
 */
function isStopwordOnly(subject) {
    const tokens = subject.split(/\s+/).filter((word) => Boolean(word));
    if (tokens.length === 0) {
        return false;
    }
    for (const token of tokens) {
        if (!STOPWORDS.has(token)) {
            return false;
        }
    }
    return true;
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { Pack } from '../pack.runtime.js';
|
|
2
|
+
export type ClaimNode = {
|
|
3
|
+
id: string;
|
|
4
|
+
label: string;
|
|
5
|
+
props?: Record<string, string>;
|
|
6
|
+
};
|
|
7
|
+
export type ClaimEdge = {
|
|
8
|
+
id: string;
|
|
9
|
+
from: string;
|
|
10
|
+
p: string;
|
|
11
|
+
to: string;
|
|
12
|
+
evidence?: number[];
|
|
13
|
+
actor?: string;
|
|
14
|
+
ts?: number;
|
|
15
|
+
};
|
|
16
|
+
export type ClaimGraph = {
|
|
17
|
+
version: 1;
|
|
18
|
+
nodes: ClaimNode[];
|
|
19
|
+
edges: ClaimEdge[];
|
|
20
|
+
index?: {
|
|
21
|
+
labelToId?: Record<string, string>;
|
|
22
|
+
out?: Record<string, string[]>;
|
|
23
|
+
in?: Record<string, string[]>;
|
|
24
|
+
};
|
|
25
|
+
};
|
|
26
|
+
export declare function normalizeClaimLabel(label: string, maxLen?: number): string;
|
|
27
|
+
export declare function computeNodeId(label: string): string;
|
|
28
|
+
export declare function computeEdgeId(from: string, p: string, to: string, evidence?: number[]): string;
|
|
29
|
+
export declare function canonicalEvidence(evidence?: number[]): number[];
|
|
30
|
+
export declare function buildGraphIndex(graph: ClaimGraph): ClaimGraph['index'];
|
|
31
|
+
export declare function finalizeGraph(graph: ClaimGraph): ClaimGraph;
|
|
32
|
+
export declare function getClaimGraph(pack: Pack): ClaimGraph | null;
|
|
33
|
+
export declare function validateClaimGraph(input: unknown): ClaimGraph | null;
|
|
34
|
+
export declare function expandLabelToTerms(label: string): string[];
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { normalize, tokenize } from '../tokenize.js';
|
|
2
|
+
/**
 * Produces the canonical form of a claim label: passed through `normalize`,
 * whitespace runs collapsed to single spaces, trimmed, then truncated to
 * `maxLen` characters.
 *
 * @param label Raw label text.
 * @param maxLen Maximum length of the result (default 200).
 * @returns The canonical label, never ending in whitespace.
 */
export function normalizeClaimLabel(label, maxLen = 200) {
    const compact = normalize(label).replace(/\s+/g, ' ').trim();
    // Truncation can land right after a space; trim again so the result does
    // not change when it is normalized a second time (as computeNodeId does).
    return compact.slice(0, maxLen).trimEnd();
}
|
|
6
|
+
/** Derives a node id: `n_` followed by the 32-bit hex hash of the normalized label. */
export function computeNodeId(label) {
    const canonicalLabel = normalizeClaimLabel(label);
    return 'n_' + hash32Hex(canonicalLabel);
}
|
|
9
|
+
/**
 * Derives an edge id: `e_` followed by the 32-bit hex hash of the
 * newline-joined source id, predicate, target id and the comma-joined
 * canonical evidence list.
 */
export function computeEdgeId(from, p, to, evidence) {
    const evidenceKey = canonicalEvidence(evidence).join(',');
    const hashInput = `${from}\n${p}\n${to}\n${evidenceKey}`;
    return 'e_' + hash32Hex(hashInput);
}
|
|
13
|
+
/**
 * Returns the canonical form of an evidence list: only non-negative
 * integers are kept, duplicates are removed and the result is sorted
 * ascending. A missing or empty input yields an empty array.
 */
export function canonicalEvidence(evidence) {
    const hasEntries = Boolean(evidence?.length);
    if (!hasEntries) {
        return [];
    }
    const valid = evidence.filter((value) => Number.isInteger(value) && value >= 0);
    const unique = [...new Set(valid)];
    unique.sort((left, right) => left - right);
    return unique;
}
|
|
18
|
+
/**
 * Builds the lookup index for a graph.
 *
 * @param graph Object exposing `nodes` and `edges` arrays.
 * @returns `labelToId` (normalized label -> node id), `out` (node id ->
 *   sorted ids of edges leaving it) and `in` (node id -> sorted ids of
 *   edges entering it).
 */
export function buildGraphIndex(graph) {
    // Prototype-less dictionaries: node ids and labels can come from untrusted
    // pack data, and keys such as "constructor" or "toString" would otherwise
    // resolve to inherited Object.prototype members and break `.push`.
    const labelToId = Object.create(null);
    const out = Object.create(null);
    const inbound = Object.create(null);
    for (const node of graph.nodes) {
        labelToId[normalizeClaimLabel(node.label)] = node.id;
    }
    for (const edge of graph.edges) {
        (out[edge.from] ||= []).push(edge.id);
        (inbound[edge.to] ||= []).push(edge.id);
    }
    // Default sort (UTF-16 code-unit order) keeps edge id lists stable.
    for (const key of Object.keys(out))
        out[key].sort();
    for (const key of Object.keys(inbound))
        inbound[key].sort();
    return { labelToId, out, in: inbound };
}
|
|
35
|
+
/**
 * Returns a canonical copy of `graph`: nodes and edges sorted by id, every
 * edge's evidence canonicalized, and a freshly built `index` attached.
 * The input arrays are not mutated.
 *
 * @param graph Object exposing `nodes` and `edges` arrays.
 * @returns A new `{ version: 1, nodes, edges, index }` object.
 */
export function finalizeGraph(graph) {
    // Compare ids by UTF-16 code unit rather than `localeCompare`: collation
    // depends on the runtime locale, so it cannot guarantee the same ordering
    // everywhere. This also matches the default `.sort()` used for the index.
    const byId = (a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0);
    const nodes = [...graph.nodes].sort(byId);
    const edges = [...graph.edges]
        .map((e) => ({ ...e, evidence: canonicalEvidence(e.evidence) }))
        .sort(byId);
    const out = { version: 1, nodes, edges };
    out.index = buildGraphIndex(out);
    return out;
}
|
|
44
|
+
/** Returns the pack's `claimGraph`, or `null` when it is absent (null/undefined). */
export function getClaimGraph(pack) {
    const graph = pack.claimGraph;
    if (graph === undefined || graph === null) {
        return null;
    }
    return graph;
}
|
|
47
|
+
/**
 * Validates an untrusted value as a ClaimGraph.
 *
 * @param input Any value, typically freshly parsed JSON.
 * @returns A finalized graph rebuilt from `nodes` and `edges` (any supplied
 *   `index` is discarded), or `null` when the value is not a version-1
 *   graph with well-formed nodes and edges.
 */
export function validateClaimGraph(input) {
    if (!input || typeof input !== 'object')
        return null;
    const g = input;
    if (g.version !== 1 || !Array.isArray(g.nodes) || !Array.isArray(g.edges))
        return null;
    // Check element shapes up front: finalizeGraph sorts by `id` and
    // canonicalizes `evidence`, so malformed entries would otherwise throw
    // instead of yielding the documented `null`.
    const isNode = (n) => Boolean(n) &&
        typeof n === 'object' &&
        typeof n.id === 'string' &&
        typeof n.label === 'string';
    const isEdge = (e) => Boolean(e) &&
        typeof e === 'object' &&
        typeof e.id === 'string' &&
        typeof e.from === 'string' &&
        typeof e.p === 'string' &&
        typeof e.to === 'string' &&
        (e.evidence === undefined || e.evidence === null || Array.isArray(e.evidence));
    if (!g.nodes.every(isNode) || !g.edges.every(isEdge))
        return null;
    return finalizeGraph({ version: 1, nodes: g.nodes, edges: g.edges });
}
|
|
55
|
+
/** Tokenizes the normalized label and returns the `term` of each token, in order. */
export function expandLabelToTerms(label) {
    const canonicalLabel = normalizeClaimLabel(label);
    const tokens = tokenize(canonicalLabel);
    return tokens.map((token) => token.term);
}
|
|
58
|
+
/**
 * Hashes a string to 8 lowercase hex digits using a 32-bit FNV-1a style
 * xor-then-multiply fold over the string's UTF-16 code units.
 */
function hash32Hex(input) {
    const offsetBasis = 0x811c9dc5;
    const prime = 0x01000193;
    let state = offsetBasis;
    let pos = 0;
    while (pos < input.length) {
        // Math.imul keeps the multiplication within 32-bit integer range.
        state = Math.imul(state ^ input.charCodeAt(pos), prime);
        pos += 1;
    }
    const unsigned = state >>> 0;
    return unsigned.toString(16).padStart(8, '0');
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { Pack } from '../pack.runtime.js';
|
|
2
|
+
import type { ClaimGraph } from './claim_graph.js';
|
|
3
|
+
export type ClaimOp = {
|
|
4
|
+
op: 'upsert_node';
|
|
5
|
+
id?: string;
|
|
6
|
+
label: string;
|
|
7
|
+
props?: Record<string, string>;
|
|
8
|
+
ts: number;
|
|
9
|
+
actor: string;
|
|
10
|
+
} | {
|
|
11
|
+
op: 'add_edge';
|
|
12
|
+
from: string;
|
|
13
|
+
p: string;
|
|
14
|
+
to: string;
|
|
15
|
+
evidence?: number[];
|
|
16
|
+
ts: number;
|
|
17
|
+
actor: string;
|
|
18
|
+
} | {
|
|
19
|
+
op: 'tombstone_edge';
|
|
20
|
+
edgeId: string;
|
|
21
|
+
ts: number;
|
|
22
|
+
actor: string;
|
|
23
|
+
};
|
|
24
|
+
export type ClaimGraphLog = {
|
|
25
|
+
version: 1;
|
|
26
|
+
ops: ClaimOp[];
|
|
27
|
+
};
|
|
28
|
+
export declare function createGraphLog(): ClaimGraphLog;
|
|
29
|
+
export declare function appendOp(log: ClaimGraphLog, op: ClaimOp): ClaimGraphLog;
|
|
30
|
+
export declare function mergeClaimGraphLogs(a: ClaimGraphLog, b: ClaimGraphLog): ClaimGraphLog;
|
|
31
|
+
export declare function serializeClaimGraphLog(log: ClaimGraphLog): Uint8Array;
|
|
32
|
+
export declare function deserializeClaimGraphLog(data: Uint8Array): ClaimGraphLog;
|
|
33
|
+
export declare function applyClaimGraphLog(graphOrPack: ClaimGraph | Pack, log: ClaimGraphLog): ClaimGraph;
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { canonicalEvidence, computeEdgeId, computeNodeId, finalizeGraph, normalizeClaimLabel, } from './claim_graph.js';
|
|
2
|
+
/** Creates an empty version-1 ClaimGraph log with a fresh `ops` array. */
export function createGraphLog() {
    const ops = [];
    return { version: 1, ops };
}
|
|
5
|
+
/**
 * Returns a new log whose ops are the ops of `log` followed by `op`.
 * The input log and its `ops` array are left untouched.
 */
export function appendOp(log, op) {
    const ops = Array.from(log.ops);
    ops.push(op);
    return { version: 1, ops };
}
|
|
8
|
+
/**
 * Merges two logs into a new log holding the ops of both, ordered with
 * `compareOps`. Neither input is mutated; duplicate ops are kept.
 */
export function mergeClaimGraphLogs(a, b) {
    const combined = Array.from(a.ops);
    for (const op of b.ops) {
        combined.push(op);
    }
    combined.sort(compareOps);
    return { version: 1, ops: combined };
}
|
|
11
|
+
/**
 * Serializes a log to UTF-8 encoded JSON. Ops are written in `compareOps`
 * order; the input log is not mutated.
 */
export function serializeClaimGraphLog(log) {
    const orderedOps = Array.from(log.ops);
    orderedOps.sort(compareOps);
    const json = JSON.stringify({ version: 1, ops: orderedOps });
    return new TextEncoder().encode(json);
}
|
|
14
|
+
export function deserializeClaimGraphLog(data) {
|
|
15
|
+
const parsed = JSON.parse(new TextDecoder().decode(data));
|
|
16
|
+
if (!parsed || parsed.version !== 1 || !Array.isArray(parsed.ops)) {
|
|
17
|
+
throw new Error('Invalid ClaimGraphLog payload');
|
|
18
|
+
}
|
|
19
|
+
return { version: 1, ops: parsed.ops.sort(compareOps) };
|
|
20
|
+
}
|
|
21
|
+
/**
 * Replays a log on top of a base graph and returns the finalized result.
 *
 * The base is the pack's `claimGraph` (or an empty graph) when a pack is
 * passed, otherwise the graph itself; it is copied, not mutated. Ops are
 * applied in `compareOps` order:
 *  - `upsert_node` replaces the node unless a newer upsert for the same id
 *    was already applied;
 *  - `add_edge` writes the edge and records its (ts, actor) stamp;
 *  - any other op is treated as a tombstone for `op.edgeId`.
 * Afterwards an edge is removed when its tombstone stamp is strictly newer
 * than its add stamp, and missing endpoint nodes of surviving edges are
 * created with the node id as label.
 */
export function applyClaimGraphLog(graphOrPack, log) {
    let baseGraph = graphOrPack;
    if (isPack(graphOrPack)) {
        baseGraph = graphOrPack.claimGraph ?? { version: 1, nodes: [], edges: [] };
    }
    const nodeById = new Map();
    for (const node of baseGraph.nodes) {
        const props = node.props ? { ...node.props } : undefined;
        nodeById.set(node.id, { ...node, props });
    }
    const edgeById = new Map();
    for (const edge of baseGraph.edges) {
        edgeById.set(edge.id, { ...edge, evidence: canonicalEvidence(edge.evidence) });
    }
    const nodeStamp = new Map();
    const addStamp = new Map();
    const tombstoneStamp = new Map();
    // Stores `stamp` under `key` unless a newer stamp is already recorded;
    // reports whether the stamp was stored.
    const recordUnlessOlder = (stamps, key, stamp) => {
        const previous = stamps.get(key);
        if (!previous || compareStamp(stamp, previous) >= 0) {
            stamps.set(key, stamp);
            return true;
        }
        return false;
    };
    const orderedOps = [...log.ops].sort(compareOps);
    for (const op of orderedOps) {
        const stamp = [op.ts, op.actor];
        if (op.op === 'upsert_node') {
            const label = normalizeClaimLabel(op.label);
            const id = op.id || computeNodeId(label);
            if (recordUnlessOlder(nodeStamp, id, stamp)) {
                nodeById.set(id, { id, label, props: op.props ? { ...op.props } : undefined });
            }
        }
        else if (op.op === 'add_edge') {
            const evidence = canonicalEvidence(op.evidence);
            const edgeId = computeEdgeId(op.from, op.p, op.to, evidence);
            recordUnlessOlder(addStamp, edgeId, stamp);
            const priorEvidence = edgeById.get(edgeId)?.evidence ?? [];
            // The edge record is always rewritten with this op's actor/ts.
            edgeById.set(edgeId, {
                id: edgeId,
                from: op.from,
                p: op.p,
                to: op.to,
                evidence: canonicalEvidence([...priorEvidence, ...evidence]),
                actor: op.actor,
                ts: op.ts,
            });
        }
        else {
            recordUnlessOlder(tombstoneStamp, op.edgeId, stamp);
        }
    }
    for (const [edgeId, edge] of edgeById) {
        // Edges that only exist in the base graph have no add stamp and lose
        // to any tombstone.
        const added = addStamp.get(edgeId) ?? [-Infinity, ''];
        const tomb = tombstoneStamp.get(edgeId);
        if (tomb && compareStamp(tomb, added) > 0) {
            edgeById.delete(edgeId);
            continue;
        }
        for (const endpoint of [edge.from, edge.to]) {
            if (!nodeById.has(endpoint)) {
                nodeById.set(endpoint, { id: endpoint, label: endpoint });
            }
        }
    }
    return finalizeGraph({ version: 1, nodes: [...nodeById.values()], edges: [...edgeById.values()] });
}
|
|
82
|
+
function compareOps(a, b) {
|
|
83
|
+
if (a.ts !== b.ts)
|
|
84
|
+
return a.ts - b.ts;
|
|
85
|
+
const actorCmp = a.actor.localeCompare(b.actor);
|
|
86
|
+
if (actorCmp !== 0)
|
|
87
|
+
return actorCmp;
|
|
88
|
+
return stableSerializeOp(a).localeCompare(stableSerializeOp(b));
|
|
89
|
+
}
|
|
90
|
+
function stableSerializeOp(op) {
|
|
91
|
+
if (op.op === 'upsert_node') {
|
|
92
|
+
return `upsert_node|${op.id || ''}|${normalizeClaimLabel(op.label)}|${JSON.stringify(op.props || {})}`;
|
|
93
|
+
}
|
|
94
|
+
if (op.op === 'add_edge') {
|
|
95
|
+
return `add_edge|${op.from}|${op.p}|${op.to}|${canonicalEvidence(op.evidence).join(',')}`;
|
|
96
|
+
}
|
|
97
|
+
return `tombstone_edge|${op.edgeId}`;
|
|
98
|
+
}
|
|
99
|
+
function compareStamp(a, b) {
|
|
100
|
+
if (a[0] !== b[0])
|
|
101
|
+
return a[0] - b[0];
|
|
102
|
+
return a[1].localeCompare(b[1]);
|
|
103
|
+
}
|
|
104
|
+
/** Duck-type check: a value counts as a pack when both `meta` and `blocks` are truthy. */
function isPack(input) {
    if (!input.meta) {
        return false;
    }
    return Boolean(input.blocks);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { normalize, tokenize } from '../tokenize.js';
|
|
2
|
+
import { expandLabelToTerms } from './claim_graph.js';
|
|
3
|
+
/**
 * Expands a query string with terms taken from the pack's ClaimGraph.
 *
 * Nodes whose normalized label equals a query token, or starts with one,
 * become candidates. For each candidate (in sorted id order) the outgoing
 * edges with an allowed predicate are followed, and the terms of the target
 * node's label that are not already in the query are collected.
 *
 * @param pack Pack whose optional `claimGraph` drives the expansion.
 * @param queryString The user's query.
 * @param opts `maxExtraTerms` (default 12, minimum 1) caps the added terms;
 *   `predicates` (default defined_as / is / mentions / ref) selects which
 *   edge predicates may be followed.
 * @returns `queryString` unchanged when there is no usable graph, no query
 *   token or no new term; otherwise the query followed by the sorted extra
 *   terms, space-separated.
 */
export function expandQueryWithGraph(pack, queryString, opts = {}) {
    const graph = pack.claimGraph;
    if (!graph || graph.nodes.length === 0 || graph.edges.length === 0)
        return queryString;
    const maxExtraTerms = Math.max(1, opts.maxExtraTerms ?? 12);
    const predicates = new Set((opts.predicates ?? ['defined_as', 'is', 'mentions', 'ref']).map((p) => normalize(p)));
    const qTokens = tokenize(queryString).map((t) => t.term);
    if (qTokens.length === 0)
        return queryString;
    const qSet = new Set(qTokens);
    // Candidate collection stops once four times the term budget is reached.
    const candidateCap = maxExtraTerms * 4;
    const candidateNodeIds = new Set();
    // NOTE(review): label ordering still uses localeCompare, as before; it
    // only influences which candidates are picked when the cap is hit.
    const labelEntries = Object.entries(graph.index?.labelToId ?? {}).sort((a, b) => a[0].localeCompare(b[0]));
    for (const [labelNorm, nodeId] of labelEntries) {
        if (qSet.has(labelNorm))
            candidateNodeIds.add(nodeId);
    }
    // Sort a copy so the token list itself is not reordered in place.
    for (const token of [...qTokens].sort()) {
        for (const [labelNorm, nodeId] of labelEntries) {
            if (labelNorm.startsWith(token))
                candidateNodeIds.add(nodeId);
            if (candidateNodeIds.size >= candidateCap)
                break;
        }
        if (candidateNodeIds.size >= candidateCap)
            break;
    }
    const edgeById = new Map(graph.edges.map((e) => [e.id, e]));
    // One pass to index nodes by id replaces a linear scan per followed edge.
    // The first node wins on duplicate ids, matching Array.prototype.find.
    const nodeById = new Map();
    for (const node of graph.nodes) {
        if (!nodeById.has(node.id))
            nodeById.set(node.id, node);
    }
    const outIdx = graph.index?.out ?? {};
    const extraTerms = new Set();
    const sortedNodeIds = [...candidateNodeIds].sort();
    for (const nodeId of sortedNodeIds) {
        const edgeIds = [...(outIdx[nodeId] ?? [])].sort();
        for (const edgeId of edgeIds) {
            const edge = edgeById.get(edgeId);
            if (!edge || !predicates.has(normalize(edge.p)))
                continue;
            const target = nodeById.get(edge.to);
            if (!target)
                continue;
            for (const term of expandLabelToTerms(target.label)) {
                if (!qSet.has(term))
                    extraTerms.add(term);
                if (extraTerms.size >= maxExtraTerms)
                    break;
            }
            if (extraTerms.size >= maxExtraTerms)
                break;
        }
        if (extraTerms.size >= maxExtraTerms)
            break;
    }
    if (extraTerms.size === 0)
        return queryString;
    return `${queryString} ${[...extraTerms].sort().join(' ')}`.trim();
}
|
package/dist/index.d.ts
CHANGED
|
@@ -3,12 +3,23 @@ export { query, lexConfidence, validateQueryOptions, validateSemanticQueryOption
|
|
|
3
3
|
export { makeContextPatch } from './patch.js';
|
|
4
4
|
export { buildPack } from './builder.js';
|
|
5
5
|
export { quantizeEmbeddingInt8L2Norm, encodeScaleF16, decodeScaleF16, } from './semantic.js';
|
|
6
|
+
export { cosineSimilarity, normalizeVector } from './semantic/cosine.js';
|
|
7
|
+
export { createPackFingerprint, serializeSidecar, parseSidecar, validateSidecarForPack, } from './semantic/sidecar.js';
|
|
8
|
+
export { rerankCandidates } from './semantic/rerank.js';
|
|
9
|
+
export { assertProviderCompatible, ensureProviderModelId } from './semantic/provider.js';
|
|
6
10
|
export { listAgents, getAgent, resolveAgent, buildSystemPrompt, isToolAllowed, assertToolAllowed, validateAgentRegistry, validateAgentDefinition, } from './agent.js';
|
|
11
|
+
export { getClaimGraph, validateClaimGraph, } from './graph/claim_graph.js';
|
|
12
|
+
export { buildClaimGraph } from './graph/build_claim_graph.js';
|
|
13
|
+
export { createGraphLog, appendOp, applyClaimGraphLog, mergeClaimGraphLogs, serializeClaimGraphLog, deserializeClaimGraphLog, } from './graph/log.js';
|
|
14
|
+
export { expandQueryWithGraph } from './graph/query_expand.js';
|
|
7
15
|
export type { MountOptions, PackMeta, Pack } from './pack.runtime.js';
|
|
8
16
|
export type { QueryOptions, Hit } from './query.js';
|
|
17
|
+
export type { EmbeddingProvider, SemanticSidecar, SemanticQueryOptions, RetrievalEvidence } from './semantic/types.js';
|
|
9
18
|
export type { ContextPatch } from './patch.js';
|
|
10
19
|
export type { BuildInputDoc, BuildPackOptions } from './builder.js';
|
|
11
20
|
export type { AgentPromptTemplate, AgentToolPolicy, AgentRetrievalDefaults, AgentDefinitionV1, AgentRegistry, ResolveAgentInput, ResolvedAgent, } from './agent.js';
|
|
21
|
+
export type { ClaimGraph, ClaimNode, ClaimEdge } from './graph/claim_graph.js';
|
|
22
|
+
export type { ClaimGraphLog, ClaimOp } from './graph/log.js';
|
|
12
23
|
export { parseToolCallV1FromText } from './tool_parse.js';
|
|
13
24
|
export { nowIso, createTrace } from './trace.js';
|
|
14
25
|
export { assertToolCallAllowed } from './tool_gate.js';
|
package/dist/index.js
CHANGED
|
@@ -4,7 +4,15 @@ export { query, lexConfidence, validateQueryOptions, validateSemanticQueryOption
|
|
|
4
4
|
export { makeContextPatch } from './patch.js';
|
|
5
5
|
export { buildPack } from './builder.js';
|
|
6
6
|
export { quantizeEmbeddingInt8L2Norm, encodeScaleF16, decodeScaleF16, } from './semantic.js';
|
|
7
|
+
export { cosineSimilarity, normalizeVector } from './semantic/cosine.js';
|
|
8
|
+
export { createPackFingerprint, serializeSidecar, parseSidecar, validateSidecarForPack, } from './semantic/sidecar.js';
|
|
9
|
+
export { rerankCandidates } from './semantic/rerank.js';
|
|
10
|
+
export { assertProviderCompatible, ensureProviderModelId } from './semantic/provider.js';
|
|
7
11
|
export { listAgents, getAgent, resolveAgent, buildSystemPrompt, isToolAllowed, assertToolAllowed, validateAgentRegistry, validateAgentDefinition, } from './agent.js';
|
|
12
|
+
export { getClaimGraph, validateClaimGraph, } from './graph/claim_graph.js';
|
|
13
|
+
export { buildClaimGraph } from './graph/build_claim_graph.js';
|
|
14
|
+
export { createGraphLog, appendOp, applyClaimGraphLog, mergeClaimGraphLogs, serializeClaimGraphLog, deserializeClaimGraphLog, } from './graph/log.js';
|
|
15
|
+
export { expandQueryWithGraph } from './graph/query_expand.js';
|
|
8
16
|
export { parseToolCallV1FromText } from './tool_parse.js';
|
|
9
17
|
export { nowIso, createTrace } from './trace.js';
|
|
10
18
|
export { assertToolCallAllowed } from './tool_gate.js';
|
package/dist/pack.runtime.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { AgentRegistry } from './agent.js';
|
|
2
|
+
import type { ClaimGraph } from './graph/claim_graph.js';
|
|
2
3
|
export type MountOptions = {
|
|
3
4
|
src: string | ArrayBufferLike | Uint8Array;
|
|
4
5
|
};
|
|
@@ -11,6 +12,11 @@ export type PackMeta = {
|
|
|
11
12
|
avgBlockLen?: number;
|
|
12
13
|
};
|
|
13
14
|
agents?: AgentRegistry;
|
|
15
|
+
claimGraph?: {
|
|
16
|
+
version: 1;
|
|
17
|
+
nodes: number;
|
|
18
|
+
edges: number;
|
|
19
|
+
};
|
|
14
20
|
};
|
|
15
21
|
export type Pack = {
|
|
16
22
|
meta: PackMeta;
|
|
@@ -30,6 +36,7 @@ export type Pack = {
|
|
|
30
36
|
vecs: Int8Array;
|
|
31
37
|
scales?: Uint16Array;
|
|
32
38
|
};
|
|
39
|
+
claimGraph?: ClaimGraph;
|
|
33
40
|
};
|
|
34
41
|
export declare function hasSemantic(pack: Pack): boolean;
|
|
35
42
|
export declare function mountPack(opts: MountOptions): Promise<Pack>;
|
package/dist/pack.runtime.js
CHANGED
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { getTextDecoder } from './utils/utf8.js';
|
|
8
8
|
import { validateAgentRegistry } from './agent.js';
|
|
9
|
+
import { validateClaimGraph } from './graph/claim_graph.js';
|
|
9
10
|
export function hasSemantic(pack) {
|
|
10
11
|
return Boolean(pack.semantic && pack.semantic.dims > 0 && pack.semantic.vecs.length > 0);
|
|
11
12
|
}
|
|
@@ -75,16 +76,50 @@ export function mountPackFromBuffer(buf) {
|
|
|
75
76
|
}
|
|
76
77
|
}
|
|
77
78
|
let semantic;
|
|
78
|
-
|
|
79
|
-
|
|
79
|
+
let claimGraph;
|
|
80
|
+
while (offset < buf.byteLength) {
|
|
81
|
+
const sectionStart = offset;
|
|
82
|
+
if (buf.byteLength - offset < 4)
|
|
83
|
+
break;
|
|
84
|
+
const jsonLen = dv.getUint32(offset, true);
|
|
80
85
|
offset += 4;
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
86
|
+
if (jsonLen < 0 || offset + jsonLen > buf.byteLength) {
|
|
87
|
+
offset = sectionStart;
|
|
88
|
+
break;
|
|
89
|
+
}
|
|
90
|
+
let parsed;
|
|
91
|
+
try {
|
|
92
|
+
const json = dec.decode(new Uint8Array(buf, offset, jsonLen));
|
|
93
|
+
parsed = JSON.parse(json);
|
|
94
|
+
}
|
|
95
|
+
catch {
|
|
96
|
+
offset = sectionStart;
|
|
97
|
+
break;
|
|
98
|
+
}
|
|
99
|
+
offset += jsonLen;
|
|
100
|
+
if (!semantic && looksLikeSemanticJson(parsed)) {
|
|
101
|
+
if (buf.byteLength - offset < 4) {
|
|
102
|
+
offset = sectionStart;
|
|
103
|
+
break;
|
|
104
|
+
}
|
|
105
|
+
const semBlobLen = dv.getUint32(offset, true);
|
|
106
|
+
offset += 4;
|
|
107
|
+
if (semBlobLen < 0 || offset + semBlobLen > buf.byteLength) {
|
|
108
|
+
offset = sectionStart;
|
|
109
|
+
break;
|
|
110
|
+
}
|
|
111
|
+
const semBlob = new Uint8Array(buf, offset, semBlobLen);
|
|
112
|
+
offset += semBlobLen;
|
|
113
|
+
semantic = parseSemanticSection(parsed, semBlob);
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
const graph = validateClaimGraph(parsed);
|
|
117
|
+
if (!claimGraph && graph) {
|
|
118
|
+
claimGraph = graph;
|
|
119
|
+
continue;
|
|
120
|
+
}
|
|
121
|
+
offset = sectionStart;
|
|
122
|
+
break;
|
|
88
123
|
}
|
|
89
124
|
return {
|
|
90
125
|
meta,
|
|
@@ -96,8 +131,18 @@ export function mountPackFromBuffer(buf) {
|
|
|
96
131
|
namespaces,
|
|
97
132
|
blockTokenLens,
|
|
98
133
|
semantic,
|
|
134
|
+
claimGraph,
|
|
99
135
|
};
|
|
100
136
|
}
|
|
137
|
+
/**
 * Heuristic that recognizes a parsed JSON section as semantic metadata:
 * an object with `version === 1`, `encoding === 'int8_l2norm'` and numeric
 * `blocks.vectors.byteOffset` / `blocks.vectors.length`.
 */
function looksLikeSemanticJson(parsed) {
    const isObject = Boolean(parsed) && typeof parsed === 'object';
    if (!isObject) {
        return false;
    }
    if (parsed.version !== 1 || parsed.encoding !== 'int8_l2norm') {
        return false;
    }
    const vectors = parsed.blocks?.vectors;
    return typeof vectors?.byteOffset === 'number' && typeof vectors?.length === 'number';
}
|
|
101
146
|
function parseSemanticSection(sem, blob) {
|
|
102
147
|
const vectors = sem?.blocks?.vectors;
|
|
103
148
|
const scales = sem?.blocks?.scales;
|
package/dist/query.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { Pack } from "./pack.js";
|
|
2
|
+
import type { RetrievalEvidence, SemanticSidecar } from "./semantic/types.js";
|
|
2
3
|
export type QueryOptions = {
|
|
3
4
|
topK?: number;
|
|
4
5
|
minScore?: number;
|
|
@@ -12,6 +13,11 @@ export type QueryOptions = {
|
|
|
12
13
|
weight?: number;
|
|
13
14
|
minTermLength?: number;
|
|
14
15
|
};
|
|
16
|
+
graph?: {
|
|
17
|
+
expand?: boolean;
|
|
18
|
+
maxExtraTerms?: number;
|
|
19
|
+
predicates?: string[];
|
|
20
|
+
};
|
|
15
21
|
semantic?: {
|
|
16
22
|
enabled?: boolean;
|
|
17
23
|
mode?: "rerank";
|
|
@@ -23,6 +29,14 @@ export type QueryOptions = {
|
|
|
23
29
|
wSem?: number;
|
|
24
30
|
};
|
|
25
31
|
queryEmbedding?: Float32Array;
|
|
32
|
+
sidecar?: SemanticSidecar;
|
|
33
|
+
provider?: {
|
|
34
|
+
type: "ollama";
|
|
35
|
+
modelId: string;
|
|
36
|
+
endpoint?: string;
|
|
37
|
+
};
|
|
38
|
+
sidecarPath?: string;
|
|
39
|
+
minSemanticScore?: number;
|
|
26
40
|
force?: boolean;
|
|
27
41
|
};
|
|
28
42
|
};
|
|
@@ -34,6 +48,7 @@ export type Hit = {
|
|
|
34
48
|
text: string;
|
|
35
49
|
source?: string;
|
|
36
50
|
namespace?: string;
|
|
51
|
+
evidence?: RetrievalEvidence;
|
|
37
52
|
};
|
|
38
53
|
export declare function query(pack: Pack, q: string, opts?: QueryOptions): Hit[];
|
|
39
54
|
export declare function lexConfidence(hits: Array<{
|
package/dist/query.js
CHANGED
|
@@ -14,6 +14,9 @@ import { minCoverSpan, proximityMultiplier } from "./quality/proximity.js";
|
|
|
14
14
|
import { diversifyAndDedupe } from "./quality/diversify.js";
|
|
15
15
|
import { knsSignature, knsDistance } from "./quality/signature.js";
|
|
16
16
|
import { decodeScaleF16, quantizeEmbeddingInt8L2Norm } from "./semantic.js";
|
|
17
|
+
import { expandQueryWithGraph } from "./graph/query_expand.js";
|
|
18
|
+
import { rerankCandidates } from "./semantic/rerank.js";
|
|
19
|
+
import { parseSidecar } from "./semantic/sidecar.js";
|
|
17
20
|
export function validateQueryOptions(opts) {
|
|
18
21
|
if (!opts)
|
|
19
22
|
return;
|
|
@@ -46,6 +49,17 @@ export function validateQueryOptions(opts) {
|
|
|
46
49
|
throw new Error("query(...): queryExpansion.minTermLength must be a positive integer.");
|
|
47
50
|
}
|
|
48
51
|
}
|
|
52
|
+
if (opts.graph) {
|
|
53
|
+
if (opts.graph.expand !== undefined && typeof opts.graph.expand !== "boolean") {
|
|
54
|
+
throw new Error("query(...): graph.expand must be a boolean when provided.");
|
|
55
|
+
}
|
|
56
|
+
if (opts.graph.maxExtraTerms !== undefined && (!Number.isInteger(opts.graph.maxExtraTerms) || opts.graph.maxExtraTerms < 1)) {
|
|
57
|
+
throw new Error("query(...): graph.maxExtraTerms must be a positive integer.");
|
|
58
|
+
}
|
|
59
|
+
if (opts.graph.predicates !== undefined && (!Array.isArray(opts.graph.predicates) || opts.graph.predicates.some((p) => typeof p !== "string"))) {
|
|
60
|
+
throw new Error("query(...): graph.predicates must be an array of strings when provided.");
|
|
61
|
+
}
|
|
62
|
+
}
|
|
49
63
|
validateSemanticQueryOptions(opts.semantic);
|
|
50
64
|
}
|
|
51
65
|
export function validateSemanticQueryOptions(options) {
|
|
@@ -66,6 +80,19 @@ export function validateSemanticQueryOptions(options) {
|
|
|
66
80
|
if (options.queryEmbedding !== undefined && !(options.queryEmbedding instanceof Float32Array)) {
|
|
67
81
|
throw new Error("query(...): semantic.queryEmbedding must be a Float32Array.");
|
|
68
82
|
}
|
|
83
|
+
if (options.sidecarPath !== undefined && typeof options.sidecarPath !== "string") {
|
|
84
|
+
throw new Error("query(...): semantic.sidecarPath must be a string when provided.");
|
|
85
|
+
}
|
|
86
|
+
if (options.minSemanticScore !== undefined && (!Number.isFinite(options.minSemanticScore) || options.minSemanticScore < 0 || options.minSemanticScore > 1)) {
|
|
87
|
+
throw new Error("query(...): semantic.minSemanticScore must be a finite number between 0 and 1.");
|
|
88
|
+
}
|
|
89
|
+
if (options.provider) {
|
|
90
|
+
if (options.provider.type !== "ollama")
|
|
91
|
+
throw new Error('query(...): semantic.provider.type must be "ollama".');
|
|
92
|
+
if (typeof options.provider.modelId !== "string" || !options.provider.modelId.trim()) {
|
|
93
|
+
throw new Error("query(...): semantic.provider.modelId must be a non-empty string.");
|
|
94
|
+
}
|
|
95
|
+
}
|
|
69
96
|
if (options.blend) {
|
|
70
97
|
if (options.blend.enabled !== undefined && typeof options.blend.enabled !== "boolean") {
|
|
71
98
|
throw new Error("query(...): semantic.blend.enabled must be a boolean when provided.");
|
|
@@ -103,10 +130,19 @@ export function query(pack, q, opts = {}) {
|
|
|
103
130
|
wSem: Math.max(0, opts.semantic?.blend?.wSem ?? 0.25),
|
|
104
131
|
},
|
|
105
132
|
queryEmbedding: opts.semantic?.queryEmbedding,
|
|
133
|
+
sidecar: resolveSemanticSidecar(opts.semantic?.sidecar, opts.semantic?.sidecarPath),
|
|
134
|
+
provider: opts.semantic?.provider,
|
|
135
|
+
minSemanticScore: opts.semantic?.minSemanticScore,
|
|
106
136
|
force: opts.semantic?.force ?? false,
|
|
107
137
|
};
|
|
138
|
+
const graphQuery = opts.graph?.expand === true
|
|
139
|
+
? expandQueryWithGraph(pack, q, {
|
|
140
|
+
maxExtraTerms: opts.graph?.maxExtraTerms,
|
|
141
|
+
predicates: opts.graph?.predicates,
|
|
142
|
+
})
|
|
143
|
+
: q;
|
|
108
144
|
// --- Query parsing
|
|
109
|
-
const normTokens = tokenize(
|
|
145
|
+
const normTokens = tokenize(graphQuery).map((t) => t.term);
|
|
110
146
|
// Normalize quoted phrases from q
|
|
111
147
|
const quotedRaw = parsePhrases(q);
|
|
112
148
|
const quoted = quotedRaw.map((seq) => seq.map((t) => normalize(t)).flatMap((s) => s.split(/\s+/)).filter(Boolean));
|
|
@@ -266,9 +302,16 @@ export function query(pack, q, opts = {}) {
|
|
|
266
302
|
return [];
|
|
267
303
|
}
|
|
268
304
|
const confidence = lexConfidence(prelim);
|
|
305
|
+
let semanticScores;
|
|
306
|
+
let blendedScores;
|
|
307
|
+
const originalLexicalScores = new Map(prelim.map((item) => [item.blockId, item.score]));
|
|
269
308
|
if (shouldRerankWithSemantic(pack, semanticOpts, confidence)) {
|
|
270
|
-
|
|
309
|
+
const semanticResult = rerankLexicalHitsWithSemantic(pack, prelim, semanticOpts);
|
|
310
|
+
prelim = semanticResult.hits;
|
|
311
|
+
semanticScores = semanticResult.semanticScores;
|
|
312
|
+
blendedScores = semanticResult.blendedScores;
|
|
271
313
|
}
|
|
314
|
+
const retrievalMode = semanticScores ? "hybrid" : "lexical";
|
|
272
315
|
// --- KNS tie-breaker + de-dup/MMR
|
|
273
316
|
const qSig = knsSignature(normalize(q));
|
|
274
317
|
const pool = prelim.slice(0, topK * 5).map((r) => {
|
|
@@ -280,6 +323,13 @@ export function query(pack, q, opts = {}) {
|
|
|
280
323
|
text,
|
|
281
324
|
source: pack.docIds?.[r.blockId] ?? undefined,
|
|
282
325
|
namespace: pack.namespaces?.[r.blockId] ?? undefined,
|
|
326
|
+
evidence: {
|
|
327
|
+
retrieval: retrievalMode,
|
|
328
|
+
lexicalScore: originalLexicalScores.get(r.blockId) ?? r.score,
|
|
329
|
+
semanticScore: semanticScores?.get(r.blockId),
|
|
330
|
+
blendedScore: blendedScores?.get(r.blockId),
|
|
331
|
+
modelId: semanticOpts.provider?.modelId ?? semanticOpts.sidecar?.modelId,
|
|
332
|
+
},
|
|
283
333
|
};
|
|
284
334
|
});
|
|
285
335
|
const finalHits = diversifyAndDedupe(pool, { k: topK });
|
|
@@ -297,19 +347,66 @@ export function lexConfidence(hits) {
|
|
|
297
347
|
function shouldRerankWithSemantic(pack, opts, confidence) {
|
|
298
348
|
if (!opts.enabled || opts.mode !== "rerank")
|
|
299
349
|
return false;
|
|
300
|
-
if (!pack.semantic)
|
|
350
|
+
if (!pack.semantic && !opts.sidecar)
|
|
301
351
|
return false;
|
|
302
352
|
if (!opts.queryEmbedding) {
|
|
303
353
|
throw new Error("query(...): semantic.queryEmbedding (Float32Array) is required when semantic.enabled=true.");
|
|
304
354
|
}
|
|
305
355
|
return opts.force || confidence < opts.minLexConfidence;
|
|
306
356
|
}
|
|
357
|
+
/**
 * Resolve the semantic sidecar to use for a query.
 *
 * Resolution order:
 *  1. An already-materialised `sidecar` object is returned as-is.
 *  2. A `sidecarPath` string starting with `{` is parsed as inline JSON.
 *  3. A `data:` URL is decoded (base64 when its metadata contains `;base64`,
 *     otherwise percent-decoding) and then parsed.
 *
 * Any other string, including an ordinary file path, yields `undefined`;
 * nothing is read from disk here.
 *
 * @param sidecar Optional pre-parsed sidecar object.
 * @param sidecarPath Optional inline JSON or `data:` URL.
 * @returns The parsed sidecar, or `undefined` when none can be resolved.
 */
function resolveSemanticSidecar(sidecar, sidecarPath) {
    if (sidecar) {
        return sidecar;
    }
    const raw = sidecarPath ? sidecarPath.trim() : "";
    if (!raw) {
        return undefined;
    }
    if (raw.startsWith("{")) {
        return parseSidecar(raw);
    }
    if (!raw.startsWith("data:")) {
        return undefined;
    }
    // data:[<meta>],<payload> — a URL without a comma carries no payload.
    const comma = raw.indexOf(",");
    if (comma <= 0) {
        return undefined;
    }
    const meta = raw.slice(5, comma).toLowerCase();
    const payload = raw.slice(comma + 1);
    let decoded;
    if (meta.includes(";base64")) {
        decoded = decodeBase64(payload);
    }
    else {
        decoded = decodeURIComponent(payload);
    }
    return decoded.trim() ? parseSidecar(decoded) : undefined;
}
|
|
383
|
+
/**
 * Decode a base64 payload into a UTF-8 string.
 *
 * Whitespace inside the payload is removed before decoding. `atob` is tried
 * first, then `Buffer`; if neither exists an error is thrown.
 *
 * `atob` returns a "binary string" with one character per byte, so its output
 * is re-interpreted as UTF-8. Without that step multi-byte characters come
 * out as mojibake, and the two branches disagree on non-ASCII input.
 *
 * @param input Base64 text, possibly containing whitespace.
 * @returns The decoded text.
 * @throws Error when the runtime offers neither `atob` nor `Buffer`.
 */
function decodeBase64(input) {
    const normalized = input.replace(/\s+/g, "");
    const atobFn = globalThis.atob;
    if (typeof atobFn === "function") {
        const binary = atobFn(normalized);
        if (typeof TextDecoder !== "function") {
            // No UTF-8 decoder available: fall back to the raw binary string.
            return binary;
        }
        const bytes = new Uint8Array(binary.length);
        for (let i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
        }
        return new TextDecoder("utf-8").decode(bytes);
    }
    const maybeBufferCtor = globalThis.Buffer;
    if (maybeBufferCtor?.from) {
        return maybeBufferCtor.from(normalized, "base64").toString("utf8");
    }
    throw new Error("query(...): Unable to decode semantic.sidecarPath base64 payload in this runtime.");
}
|
|
307
393
|
function rerankLexicalHitsWithSemantic(pack, prelim, opts) {
|
|
394
|
+
if (opts.sidecar && opts.queryEmbedding) {
|
|
395
|
+
const sidecarResult = rerankCandidates({
|
|
396
|
+
lexical: prelim,
|
|
397
|
+
sidecar: opts.sidecar,
|
|
398
|
+
queryEmbedding: opts.queryEmbedding,
|
|
399
|
+
topN: opts.topN,
|
|
400
|
+
blend: opts.blend,
|
|
401
|
+
minSemanticScore: opts.minSemanticScore,
|
|
402
|
+
});
|
|
403
|
+
return { hits: sidecarResult.reranked, semanticScores: sidecarResult.semanticScores, blendedScores: sidecarResult.blendedScores };
|
|
404
|
+
}
|
|
308
405
|
const sem = pack.semantic;
|
|
309
406
|
if (!sem || !opts.queryEmbedding)
|
|
310
|
-
return prelim;
|
|
407
|
+
return { hits: prelim };
|
|
311
408
|
if (sem.dims <= 0 || sem.vecs.length === 0 || sem.dims !== opts.queryEmbedding.length)
|
|
312
|
-
return prelim;
|
|
409
|
+
return { hits: prelim };
|
|
313
410
|
const topN = Math.min(opts.topN, prelim.length);
|
|
314
411
|
const rerankSlice = prelim.slice(0, topN);
|
|
315
412
|
const tail = prelim.slice(topN);
|
|
@@ -324,15 +421,19 @@ function rerankLexicalHitsWithSemantic(pack, prelim, opts) {
|
|
|
324
421
|
const wLex = denom > 0 ? opts.blend.wLex / denom : 0.5;
|
|
325
422
|
const wSem = denom > 0 ? opts.blend.wSem / denom : 0.5;
|
|
326
423
|
const reranked = new Array(topN);
|
|
424
|
+
const semanticScores = new Map();
|
|
425
|
+
const blendedScores = new Map();
|
|
327
426
|
for (let i = 0; i < topN; i++) {
|
|
328
427
|
const hit = rerankSlice[i];
|
|
428
|
+
semanticScores.set(hit.blockId, normSem[i]);
|
|
429
|
+
blendedScores.set(hit.blockId, opts.blend.enabled ? wLex * normLex[i] + wSem * normSem[i] : semScores[i]);
|
|
329
430
|
reranked[i] = {
|
|
330
431
|
blockId: hit.blockId,
|
|
331
|
-
score:
|
|
432
|
+
score: blendedScores.get(hit.blockId) ?? hit.score,
|
|
332
433
|
};
|
|
333
434
|
}
|
|
334
435
|
reranked.sort((a, b) => b.score - a.score || a.blockId - b.blockId);
|
|
335
|
-
return [...reranked, ...tail];
|
|
436
|
+
return { hits: [...reranked, ...tail], semanticScores, blendedScores };
|
|
336
437
|
}
|
|
337
438
|
function scoreSemanticInt8(queryQ, queryScale, semantic, hits) {
|
|
338
439
|
const scores = new Float64Array(hits.length);
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
 * Scale a vector to unit L2 length.
 *
 * @param vector Array-like of numbers (plain array or typed array).
 * @returns A new `Float32Array` of the same length. A zero vector is returned
 *   when the norm is zero, NaN or infinite, so callers never receive NaN
 *   components (an infinite component would otherwise yield Infinity/Infinity).
 */
export function normalizeVector(vector) {
    const out = new Float32Array(vector.length);
    let normSq = 0;
    for (let i = 0; i < vector.length; i++) {
        normSq += vector[i] * vector[i];
    }
    const norm = Math.sqrt(normSq);
    // `!(norm > 0)` also catches NaN; the finiteness check catches overflow.
    if (!(norm > 0) || !Number.isFinite(norm)) {
        return out;
    }
    for (let i = 0; i < vector.length; i++) {
        out[i] = vector[i] / norm;
    }
    return out;
}
|
|
13
|
+
/**
 * Cosine similarity between two equal-length vectors.
 *
 * The dot product is divided by the product of both L2 norms, so the result
 * is correct even when an input has not been unit-normalized beforehand. For
 * inputs that are already unit length this matches the plain dot product up
 * to floating-point rounding.
 *
 * @param a First vector (array-like of numbers).
 * @param b Second vector (array-like of numbers).
 * @returns A value in [-1, 1]. Returns 0 when the lengths differ, the vectors
 *   are empty, or either norm is zero or non-finite.
 */
export function cosineSimilarity(a, b) {
    const len = a.length;
    if (len === 0 || len !== b.length) {
        return 0;
    }
    let dot = 0;
    let normSqA = 0;
    let normSqB = 0;
    for (let i = 0; i < len; i++) {
        const x = a[i];
        const y = b[i];
        dot += x * y;
        normSqA += x * x;
        normSqB += y * y;
    }
    const denom = Math.sqrt(normSqA) * Math.sqrt(normSqB);
    // `!(denom > 0)` also rejects NaN produced by NaN components.
    if (!(denom > 0) || !Number.isFinite(denom)) {
        return 0;
    }
    // Clamp away rounding overshoot such as 1.0000000000000002.
    return Math.min(1, Math.max(-1, dot / denom));
}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import type { EmbeddingProvider, SemanticQueryOptions } from './types.js';
|
|
2
|
+
export declare function ensureProviderModelId(options?: SemanticQueryOptions): string | undefined;
|
|
3
|
+
export declare function assertProviderCompatible(options?: SemanticQueryOptions, provider?: EmbeddingProvider): void;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Read the provider model id out of semantic query options.
 *
 * @param options Optional semantic query options.
 * @returns `options.provider.modelId`, or `undefined` when the options or
 *   their `provider` entry are absent.
 */
export function ensureProviderModelId(options) {
    const provider = options?.provider;
    return provider == null ? undefined : provider.modelId;
}
|
|
4
|
+
/**
 * Verify that semantic options and an embedding provider can work together.
 *
 * Nothing is checked unless `options.enabled` is truthy. When enabled, the
 * caller must supply either an embedding provider or a precomputed
 * `options.queryEmbedding`; and if both the options and the provider name a
 * model id, the two must be equal.
 *
 * @param options Optional semantic query options.
 * @param provider Optional embedding provider exposing `modelId`.
 * @throws Error when no embedding source is available, or when the model ids
 *   disagree.
 */
export function assertProviderCompatible(options, provider) {
    if (!options?.enabled) {
        return;
    }
    if (!provider) {
        if (options.queryEmbedding) {
            return;
        }
        throw new Error('semantic.enabled=true requires either semantic.queryEmbedding or an EmbeddingProvider.');
    }
    const requested = options.provider?.modelId;
    if (!requested || requested === provider.modelId) {
        return;
    }
    throw new Error(`Semantic provider model mismatch: options requested ${requested}, provider exposes ${provider.modelId}.`);
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { SemanticSidecar } from './types.js';
|
|
2
|
+
export declare function rerankCandidates(params: {
|
|
3
|
+
lexical: Array<{
|
|
4
|
+
blockId: number;
|
|
5
|
+
score: number;
|
|
6
|
+
}>;
|
|
7
|
+
sidecar: SemanticSidecar;
|
|
8
|
+
queryEmbedding: Float32Array;
|
|
9
|
+
topN: number;
|
|
10
|
+
blend: {
|
|
11
|
+
enabled: boolean;
|
|
12
|
+
wLex: number;
|
|
13
|
+
wSem: number;
|
|
14
|
+
};
|
|
15
|
+
minSemanticScore?: number;
|
|
16
|
+
}): {
|
|
17
|
+
reranked: Array<{
|
|
18
|
+
blockId: number;
|
|
19
|
+
score: number;
|
|
20
|
+
}>;
|
|
21
|
+
semanticScores: Map<number, number>;
|
|
22
|
+
blendedScores: Map<number, number>;
|
|
23
|
+
};
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { cosineSimilarity, normalizeVector } from './cosine.js';
|
|
2
|
+
/**
 * Re-rank the leading lexical hits using vectors from a semantic sidecar.
 *
 * Only the first `topN` lexical hits (the "head") are rescored; the remaining
 * hits are appended unchanged after the re-sorted head. Lexical and semantic
 * scores of the head are each min-max normalized before blending.
 *
 * @param params.lexical Lexical hits (`{ blockId, score }`) in ranked order.
 * @param params.sidecar Sidecar whose `blocks` carry one vector per block id.
 * @param params.queryEmbedding Query vector; normalized here before use.
 * @param params.topN Maximum number of leading hits to rescore.
 * @param params.blend `enabled` toggles blending; `wLex`/`wSem` are relative
 *   weights (0.7/0.3 is used when their sum is not positive).
 * @param params.minSemanticScore Hits whose normalized semantic score is below
 *   this value keep their normalized lexical score instead of a blend.
 * @returns `reranked` (head re-sorted by score, then tail), plus per-block
 *   `semanticScores` and `blendedScores` maps for the head.
 */
export function rerankCandidates(params) {
    const topN = Math.min(params.topN, params.lexical.length);
    const head = params.lexical.slice(0, topN);
    const tail = params.lexical.slice(topN);
    const q = normalizeVector(params.queryEmbedding);
    const semanticScores = new Map();
    const blendedScores = new Map();
    const lexNorm = minMax(head.map((h) => h.score));
    // Index the sidecar once instead of scanning it for every head item.
    // The first record for a block id wins, matching Array.prototype.find.
    const vectorsById = new Map();
    for (const block of params.sidecar.blocks) {
        if (!vectorsById.has(block.blockId)) {
            vectorsById.set(block.blockId, block.vector);
        }
    }
    const semRaw = head.map((item) => {
        const stored = vectorsById.get(item.blockId);
        // Stored vectors are normalized so the dot product is a true cosine;
        // blocks without a vector score against the zero vector (similarity 0).
        const vec = stored ? normalizeVector(stored) : new Float32Array(q.length);
        return cosineSimilarity(q, vec);
    });
    const semNorm = minMax(semRaw);
    const denom = params.blend.wLex + params.blend.wSem;
    const wLex = denom > 0 ? params.blend.wLex / denom : 0.7;
    const wSem = denom > 0 ? params.blend.wSem / denom : 0.3;
    const minSemanticScore = params.minSemanticScore ?? 0;
    const reranked = head.map((item, idx) => {
        const sem = semNorm[idx];
        semanticScores.set(item.blockId, sem);
        if (minSemanticScore > sem) {
            // Semantic signal too weak: fall back to the lexical score alone.
            blendedScores.set(item.blockId, lexNorm[idx]);
            return { blockId: item.blockId, score: lexNorm[idx] };
        }
        const blended = params.blend.enabled ? wLex * lexNorm[idx] + wSem * sem : sem;
        blendedScores.set(item.blockId, blended);
        return { blockId: item.blockId, score: blended };
    });
    // Highest score first; ties resolved by ascending block id for determinism.
    reranked.sort((a, b) => b.score - a.score || a.blockId - b.blockId);
    return { reranked: [...reranked, ...tail], semanticScores, blendedScores };
}
|
|
34
|
+
/**
 * Min-max normalize a list of scores into [0, 1].
 *
 * The extremes are found in a single pass rather than via
 * `Math.min(...values)` / `Math.max(...values)`, which can throw a
 * `RangeError` when the array is large enough to exceed the engine's
 * argument limit.
 *
 * @param values Scores to normalize.
 * @returns A new array of the same length. Every entry is 1 when the input
 *   contains NaN, has non-finite extremes, or has no spread (max <= min).
 *   An empty input yields an empty array.
 */
function minMax(values) {
    if (values.length === 0) {
        return [];
    }
    let min = Infinity;
    let max = -Infinity;
    for (const v of values) {
        // Math.min/Math.max propagate NaN; keep that "degenerate input" outcome.
        if (Number.isNaN(+v)) {
            return values.map(() => 1);
        }
        if (v < min) {
            min = v;
        }
        if (v > max) {
            max = v;
        }
    }
    if (!Number.isFinite(min) || !Number.isFinite(max) || max <= min) {
        return values.map(() => 1);
    }
    const span = max - min;
    return values.map((v) => Math.min(1, Math.max(0, (v - min) / span)));
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { Pack } from '../pack.runtime.js';
|
|
2
|
+
import type { SemanticSidecar } from './types.js';
|
|
3
|
+
export declare function createPackFingerprint(pack: Pick<Pack, 'blocks' | 'docIds' | 'meta'>): string;
|
|
4
|
+
export declare function serializeSidecar(sidecar: SemanticSidecar): string;
|
|
5
|
+
export declare function parseSidecar(raw: string): SemanticSidecar;
|
|
6
|
+
export declare function validateSidecarForPack(input: {
|
|
7
|
+
sidecar: SemanticSidecar;
|
|
8
|
+
pack: Pick<Pack, 'blocks' | 'docIds' | 'meta'>;
|
|
9
|
+
modelId: string;
|
|
10
|
+
}): void;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
 * Compute a short fingerprint string for a pack.
 *
 * The pack's `meta.version` (0 when absent), every `docIds` entry and every
 * `blocks` entry are stringified in that order and fed, one UTF-16 code unit
 * at a time, through a 32-bit FNV-1a style xor-and-multiply loop. No
 * separator is hashed between parts.
 *
 * @param pack Object exposing `blocks`, and optionally `docIds` and `meta`.
 * @returns `fnv1a-` followed by the hash as 8 lowercase hex digits.
 */
export function createPackFingerprint(pack) {
    const FNV_OFFSET_BASIS = 2166136261;
    const FNV_PRIME = 16777619;
    let hash = FNV_OFFSET_BASIS;
    const absorb = (value) => {
        // Nullish entries contribute nothing to the hash.
        const text = String(value ?? '');
        for (let pos = 0; pos < text.length; pos++) {
            hash ^= text.charCodeAt(pos);
            hash = Math.imul(hash, FNV_PRIME);
        }
    };
    const version = String(pack.meta?.version ?? 0);
    const docIds = pack.docIds ?? [];
    const blocks = [...pack.blocks];
    absorb(version);
    for (const docId of docIds) {
        absorb(docId);
    }
    for (const block of blocks) {
        absorb(block);
    }
    // `>>> 0` converts the signed 32-bit accumulator to unsigned before formatting.
    const hex = (hash >>> 0).toString(16).padStart(8, '0');
    return `fnv1a-${hex}`;
}
|
|
13
|
+
/**
 * Serialize a semantic sidecar to text.
 *
 * @param sidecar Sidecar object to serialize.
 * @returns JSON pretty-printed with two-space indentation, followed by a
 *   single trailing newline.
 */
export function serializeSidecar(sidecar) {
    const json = JSON.stringify(sidecar, null, 2);
    return json + "\n";
}
|
|
16
|
+
/**
 * Parse and minimally validate a serialized semantic sidecar.
 *
 * The shape is checked up front so malformed input fails here with a
 * descriptive message, rather than as a `TypeError` on property access or
 * later when the sidecar's `blocks` are iterated.
 *
 * @param raw JSON text of the sidecar.
 * @returns The parsed sidecar object.
 * @throws SyntaxError when `raw` is not valid JSON.
 * @throws Error when the value is not a JSON object, its `version` is not 1,
 *   its `metric` is not `'cosine'`, or `blocks` is not an array.
 */
export function parseSidecar(raw) {
    const parsed = JSON.parse(raw);
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new Error('Invalid semantic sidecar: expected a JSON object.');
    }
    if (parsed.version !== 1) {
        throw new Error(`Unsupported semantic sidecar version: ${parsed.version}`);
    }
    if (parsed.metric !== 'cosine') {
        throw new Error(`Unsupported semantic metric: ${parsed.metric}`);
    }
    if (!Array.isArray(parsed.blocks)) {
        throw new Error('Invalid semantic sidecar: "blocks" must be an array.');
    }
    return parsed;
}
|
|
24
|
+
/**
 * Ensure a semantic sidecar belongs to the given pack and embedding model.
 *
 * The pack's fingerprint is recomputed and compared with the one recorded in
 * the sidecar; the sidecar's model id is then compared with `input.modelId`.
 *
 * @param input.sidecar Sidecar to validate.
 * @param input.pack Pack the sidecar is expected to describe.
 * @param input.modelId Model id the query side intends to use.
 * @throws Error when the fingerprints differ or the model ids differ.
 */
export function validateSidecarForPack(input) {
    const { sidecar, pack, modelId } = input;
    const expectedFingerprint = createPackFingerprint(pack);
    const actualFingerprint = sidecar.packFingerprint;
    if (actualFingerprint !== expectedFingerprint) {
        throw new Error(`Semantic sidecar pack fingerprint mismatch: expected ${expectedFingerprint}, got ${actualFingerprint}. Regenerate the sidecar for this pack.`);
    }
    const sidecarModel = sidecar.modelId;
    if (sidecarModel !== modelId) {
        throw new Error(`Semantic model mismatch: sidecar model is ${sidecarModel}, but query provider is ${modelId}. Use the same embedding model or regenerate the sidecar.`);
    }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
export interface EmbeddingProvider {
|
|
2
|
+
readonly modelId: string;
|
|
3
|
+
embedQuery(text: string): Promise<Float32Array>;
|
|
4
|
+
embedTexts(texts: string[]): Promise<Float32Array[]>;
|
|
5
|
+
}
|
|
6
|
+
export interface SemanticSidecar {
|
|
7
|
+
version: 1;
|
|
8
|
+
packFingerprint: string;
|
|
9
|
+
modelId: string;
|
|
10
|
+
dimension: number;
|
|
11
|
+
metric: 'cosine';
|
|
12
|
+
createdAt: string;
|
|
13
|
+
blocks: Array<{
|
|
14
|
+
blockId: number;
|
|
15
|
+
vector: number[];
|
|
16
|
+
}>;
|
|
17
|
+
}
|
|
18
|
+
export type SemanticQueryOptions = {
|
|
19
|
+
enabled?: boolean;
|
|
20
|
+
mode?: 'rerank';
|
|
21
|
+
topN?: number;
|
|
22
|
+
minLexConfidence?: number;
|
|
23
|
+
minSemanticScore?: number;
|
|
24
|
+
blend?: {
|
|
25
|
+
enabled?: boolean;
|
|
26
|
+
wLex?: number;
|
|
27
|
+
wSem?: number;
|
|
28
|
+
};
|
|
29
|
+
provider?: {
|
|
30
|
+
type: 'ollama';
|
|
31
|
+
modelId: string;
|
|
32
|
+
endpoint?: string;
|
|
33
|
+
};
|
|
34
|
+
sidecarPath?: string;
|
|
35
|
+
queryEmbedding?: Float32Array;
|
|
36
|
+
force?: boolean;
|
|
37
|
+
};
|
|
38
|
+
export type RetrievalEvidence = {
|
|
39
|
+
retrieval: 'lexical' | 'hybrid';
|
|
40
|
+
lexicalScore?: number;
|
|
41
|
+
semanticScore?: number;
|
|
42
|
+
blendedScore?: number;
|
|
43
|
+
modelId?: string;
|
|
44
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|