@oscharko-dev/keiko-local-knowledge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -0
- package/dist/bounded-document-extraction.d.ts +27 -0
- package/dist/bounded-document-extraction.d.ts.map +1 -0
- package/dist/bounded-document-extraction.js +214 -0
- package/dist/capsule-lifecycle.d.ts +33 -0
- package/dist/capsule-lifecycle.d.ts.map +1 -0
- package/dist/capsule-lifecycle.js +292 -0
- package/dist/capsule-set-lifecycle.d.ts +15 -0
- package/dist/capsule-set-lifecycle.d.ts.map +1 -0
- package/dist/capsule-set-lifecycle.js +158 -0
- package/dist/chunking/chunker-persist.d.ts +36 -0
- package/dist/chunking/chunker-persist.d.ts.map +1 -0
- package/dist/chunking/chunker-persist.js +74 -0
- package/dist/chunking/chunker-runner.d.ts +9 -0
- package/dist/chunking/chunker-runner.d.ts.map +1 -0
- package/dist/chunking/chunker-runner.js +218 -0
- package/dist/chunking/chunker.d.ts +7 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +139 -0
- package/dist/chunking/citation-mapper.d.ts +4 -0
- package/dist/chunking/citation-mapper.d.ts.map +1 -0
- package/dist/chunking/citation-mapper.js +180 -0
- package/dist/chunking/index.d.ts +6 -0
- package/dist/chunking/index.d.ts.map +1 -0
- package/dist/chunking/index.js +8 -0
- package/dist/chunking/token-estimator.d.ts +3 -0
- package/dist/chunking/token-estimator.d.ts.map +1 -0
- package/dist/chunking/token-estimator.js +26 -0
- package/dist/chunking/types.d.ts +49 -0
- package/dist/chunking/types.d.ts.map +1 -0
- package/dist/chunking/types.js +26 -0
- package/dist/composition.d.ts +57 -0
- package/dist/composition.d.ts.map +1 -0
- package/dist/composition.js +310 -0
- package/dist/conversation/citation-attacher.d.ts +8 -0
- package/dist/conversation/citation-attacher.d.ts.map +1 -0
- package/dist/conversation/citation-attacher.js +55 -0
- package/dist/conversation/citation-excerpts.d.ts +4 -0
- package/dist/conversation/citation-excerpts.d.ts.map +1 -0
- package/dist/conversation/citation-excerpts.js +41 -0
- package/dist/conversation/grounded-answer-runner.d.ts +9 -0
- package/dist/conversation/grounded-answer-runner.d.ts.map +1 -0
- package/dist/conversation/grounded-answer-runner.js +61 -0
- package/dist/conversation/index.d.ts +5 -0
- package/dist/conversation/index.d.ts.map +1 -0
- package/dist/conversation/index.js +7 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts +28 -0
- package/dist/conversation/model-gateway-answer-generator.d.ts.map +1 -0
- package/dist/conversation/model-gateway-answer-generator.js +105 -0
- package/dist/conversation/types.d.ts +35 -0
- package/dist/conversation/types.d.ts.map +1 -0
- package/dist/conversation/types.js +24 -0
- package/dist/discovery/discovery-runner.d.ts +23 -0
- package/dist/discovery/discovery-runner.d.ts.map +1 -0
- package/dist/discovery/discovery-runner.js +109 -0
- package/dist/discovery/extract-progressive.d.ts +17 -0
- package/dist/discovery/extract-progressive.d.ts.map +1 -0
- package/dist/discovery/extract-progressive.js +522 -0
- package/dist/discovery/extract.d.ts +26 -0
- package/dist/discovery/extract.d.ts.map +1 -0
- package/dist/discovery/extract.js +906 -0
- package/dist/discovery/glob.d.ts +10 -0
- package/dist/discovery/glob.d.ts.map +1 -0
- package/dist/discovery/glob.js +72 -0
- package/dist/discovery/index.d.ts +6 -0
- package/dist/discovery/index.d.ts.map +1 -0
- package/dist/discovery/index.js +8 -0
- package/dist/discovery/media-type.d.ts +4 -0
- package/dist/discovery/media-type.d.ts.map +1 -0
- package/dist/discovery/media-type.js +62 -0
- package/dist/discovery/persist.d.ts +63 -0
- package/dist/discovery/persist.d.ts.map +1 -0
- package/dist/discovery/persist.js +345 -0
- package/dist/discovery/test-support.d.ts +16 -0
- package/dist/discovery/test-support.d.ts.map +1 -0
- package/dist/discovery/test-support.js +127 -0
- package/dist/discovery/types.d.ts +63 -0
- package/dist/discovery/types.d.ts.map +1 -0
- package/dist/discovery/types.js +28 -0
- package/dist/discovery/walk.d.ts +12 -0
- package/dist/discovery/walk.d.ts.map +1 -0
- package/dist/discovery/walk.js +302 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +22 -0
- package/dist/evaluations/dimensions.d.ts +14 -0
- package/dist/evaluations/dimensions.d.ts.map +1 -0
- package/dist/evaluations/dimensions.js +191 -0
- package/dist/evaluations/fixtures.d.ts +18 -0
- package/dist/evaluations/fixtures.d.ts.map +1 -0
- package/dist/evaluations/fixtures.js +858 -0
- package/dist/evaluations/index.d.ts +7 -0
- package/dist/evaluations/index.d.ts.map +1 -0
- package/dist/evaluations/index.js +10 -0
- package/dist/evaluations/report.d.ts +3 -0
- package/dist/evaluations/report.d.ts.map +1 -0
- package/dist/evaluations/report.js +31 -0
- package/dist/evaluations/runner-seed.d.ts +12 -0
- package/dist/evaluations/runner-seed.d.ts.map +1 -0
- package/dist/evaluations/runner-seed.js +175 -0
- package/dist/evaluations/runner.d.ts +8 -0
- package/dist/evaluations/runner.d.ts.map +1 -0
- package/dist/evaluations/runner.js +205 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts +13 -0
- package/dist/evaluations/scripted-embedding-adapter.d.ts.map +1 -0
- package/dist/evaluations/scripted-embedding-adapter.js +163 -0
- package/dist/evaluations/types.d.ts +116 -0
- package/dist/evaluations/types.d.ts.map +1 -0
- package/dist/evaluations/types.js +27 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -0
- package/dist/indexing/bounded-indexing.d.ts +41 -0
- package/dist/indexing/bounded-indexing.d.ts.map +1 -0
- package/dist/indexing/bounded-indexing.js +240 -0
- package/dist/indexing/checkpoint-persist.d.ts +8 -0
- package/dist/indexing/checkpoint-persist.d.ts.map +1 -0
- package/dist/indexing/checkpoint-persist.js +135 -0
- package/dist/indexing/checkpoint-resume.d.ts +20 -0
- package/dist/indexing/checkpoint-resume.d.ts.map +1 -0
- package/dist/indexing/checkpoint-resume.js +50 -0
- package/dist/indexing/embedding-batcher.d.ts +3 -0
- package/dist/indexing/embedding-batcher.d.ts.map +1 -0
- package/dist/indexing/embedding-batcher.js +390 -0
- package/dist/indexing/index.d.ts +7 -0
- package/dist/indexing/index.d.ts.map +1 -0
- package/dist/indexing/index.js +11 -0
- package/dist/indexing/job-persist.d.ts +46 -0
- package/dist/indexing/job-persist.d.ts.map +1 -0
- package/dist/indexing/job-persist.js +157 -0
- package/dist/indexing/job-resume.d.ts +4 -0
- package/dist/indexing/job-resume.d.ts.map +1 -0
- package/dist/indexing/job-resume.js +14 -0
- package/dist/indexing/orchestrator.d.ts +3 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +1151 -0
- package/dist/indexing/types.d.ts +156 -0
- package/dist/indexing/types.d.ts.map +1 -0
- package/dist/indexing/types.js +30 -0
- package/dist/indexing/vector-persist.d.ts +32 -0
- package/dist/indexing/vector-persist.d.ts.map +1 -0
- package/dist/indexing/vector-persist.js +105 -0
- package/dist/parsers/_internal.d.ts +20 -0
- package/dist/parsers/_internal.d.ts.map +1 -0
- package/dist/parsers/_internal.js +122 -0
- package/dist/parsers/csv-parser.d.ts +3 -0
- package/dist/parsers/csv-parser.d.ts.map +1 -0
- package/dist/parsers/csv-parser.js +202 -0
- package/dist/parsers/docx-parser.d.ts +3 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +390 -0
- package/dist/parsers/html-parser.d.ts +3 -0
- package/dist/parsers/html-parser.d.ts.map +1 -0
- package/dist/parsers/html-parser.js +310 -0
- package/dist/parsers/index.d.ts +15 -0
- package/dist/parsers/index.d.ts.map +1 -0
- package/dist/parsers/index.js +41 -0
- package/dist/parsers/json-parser.d.ts +3 -0
- package/dist/parsers/json-parser.d.ts.map +1 -0
- package/dist/parsers/json-parser.js +192 -0
- package/dist/parsers/large-document/capability-discovery.d.ts +27 -0
- package/dist/parsers/large-document/capability-discovery.d.ts.map +1 -0
- package/dist/parsers/large-document/capability-discovery.js +76 -0
- package/dist/parsers/large-document/diagnostics.d.ts +3 -0
- package/dist/parsers/large-document/diagnostics.d.ts.map +1 -0
- package/dist/parsers/large-document/diagnostics.js +11 -0
- package/dist/parsers/large-document/index.d.ts +15 -0
- package/dist/parsers/large-document/index.d.ts.map +1 -0
- package/dist/parsers/large-document/index.js +10 -0
- package/dist/parsers/large-document/legacy-format.d.ts +5 -0
- package/dist/parsers/large-document/legacy-format.d.ts.map +1 -0
- package/dist/parsers/large-document/legacy-format.js +25 -0
- package/dist/parsers/large-document/preflight.d.ts +9 -0
- package/dist/parsers/large-document/preflight.d.ts.map +1 -0
- package/dist/parsers/large-document/preflight.js +43 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts +55 -0
- package/dist/parsers/large-document/progressive-extraction.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-extraction.js +123 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts +20 -0
- package/dist/parsers/large-document/progressive-pdf.d.ts.map +1 -0
- package/dist/parsers/large-document/progressive-pdf.js +145 -0
- package/dist/parsers/large-document/synthetic-source.d.ts +9 -0
- package/dist/parsers/large-document/synthetic-source.d.ts.map +1 -0
- package/dist/parsers/large-document/synthetic-source.js +101 -0
- package/dist/parsers/large-document/window-builder.d.ts +24 -0
- package/dist/parsers/large-document/window-builder.d.ts.map +1 -0
- package/dist/parsers/large-document/window-builder.js +75 -0
- package/dist/parsers/ocr/index.d.ts +4 -0
- package/dist/parsers/ocr/index.d.ts.map +1 -0
- package/dist/parsers/ocr/index.js +4 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts +3 -0
- package/dist/parsers/ocr/null-ocr-adapter.d.ts.map +1 -0
- package/dist/parsers/ocr/null-ocr-adapter.js +14 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts +8 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.d.ts.map +1 -0
- package/dist/parsers/ocr/ocr-pipeline-parser.js +147 -0
- package/dist/parsers/ocr/types.d.ts +16 -0
- package/dist/parsers/ocr/types.d.ts.map +1 -0
- package/dist/parsers/ocr/types.js +4 -0
- package/dist/parsers/parser-test-fixtures.d.ts +28 -0
- package/dist/parsers/parser-test-fixtures.d.ts.map +1 -0
- package/dist/parsers/parser-test-fixtures.js +139 -0
- package/dist/parsers/pdf-parser.d.ts +43 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +388 -0
- package/dist/parsers/registry.d.ts +8 -0
- package/dist/parsers/registry.d.ts.map +1 -0
- package/dist/parsers/registry.js +57 -0
- package/dist/parsers/text-parser.d.ts +3 -0
- package/dist/parsers/text-parser.d.ts.map +1 -0
- package/dist/parsers/text-parser.js +214 -0
- package/dist/parsers/types.d.ts +53 -0
- package/dist/parsers/types.d.ts.map +1 -0
- package/dist/parsers/types.js +21 -0
- package/dist/parsers/unsupported-parser.d.ts +4 -0
- package/dist/parsers/unsupported-parser.d.ts.map +1 -0
- package/dist/parsers/unsupported-parser.js +97 -0
- package/dist/parsers/xlsx-parser.d.ts +3 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +425 -0
- package/dist/privacy/audit-emitter.d.ts +5 -0
- package/dist/privacy/audit-emitter.d.ts.map +1 -0
- package/dist/privacy/audit-emitter.js +93 -0
- package/dist/privacy/diagnostic-redactor.d.ts +2 -0
- package/dist/privacy/diagnostic-redactor.d.ts.map +1 -0
- package/dist/privacy/diagnostic-redactor.js +153 -0
- package/dist/privacy/index.d.ts +5 -0
- package/dist/privacy/index.d.ts.map +1 -0
- package/dist/privacy/index.js +6 -0
- package/dist/privacy/retention-applier.d.ts +5 -0
- package/dist/privacy/retention-applier.d.ts.map +1 -0
- package/dist/privacy/retention-applier.js +88 -0
- package/dist/privacy/types.d.ts +98 -0
- package/dist/privacy/types.d.ts.map +1 -0
- package/dist/privacy/types.js +12 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts +27 -0
- package/dist/qualityIntelligence/capsuleCorpus.d.ts.map +1 -0
- package/dist/qualityIntelligence/capsuleCorpus.js +58 -0
- package/dist/qualityIntelligence/index.d.ts +3 -0
- package/dist/qualityIntelligence/index.d.ts.map +1 -0
- package/dist/qualityIntelligence/index.js +5 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts +36 -0
- package/dist/qualityIntelligence/qiHandoff.d.ts.map +1 -0
- package/dist/qualityIntelligence/qiHandoff.js +82 -0
- package/dist/retrieval/answer-grounding.d.ts +9 -0
- package/dist/retrieval/answer-grounding.d.ts.map +1 -0
- package/dist/retrieval/answer-grounding.js +31 -0
- package/dist/retrieval/context-pack-assembler.d.ts +24 -0
- package/dist/retrieval/context-pack-assembler.d.ts.map +1 -0
- package/dist/retrieval/context-pack-assembler.js +50 -0
- package/dist/retrieval/index.d.ts +6 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/retrieval/index.js +9 -0
- package/dist/retrieval/retrieval-runner.d.ts +10 -0
- package/dist/retrieval/retrieval-runner.d.ts.map +1 -0
- package/dist/retrieval/retrieval-runner.js +163 -0
- package/dist/retrieval/scoped-vector-search.d.ts +24 -0
- package/dist/retrieval/scoped-vector-search.d.ts.map +1 -0
- package/dist/retrieval/scoped-vector-search.js +864 -0
- package/dist/retrieval/types.d.ts +28 -0
- package/dist/retrieval/types.d.ts.map +1 -0
- package/dist/retrieval/types.js +33 -0
- package/dist/section-path-hash.d.ts +3 -0
- package/dist/section-path-hash.d.ts.map +1 -0
- package/dist/section-path-hash.js +9 -0
- package/dist/source-lifecycle.d.ts +14 -0
- package/dist/source-lifecycle.d.ts.map +1 -0
- package/dist/source-lifecycle.js +155 -0
- package/dist/source-routing-validation.d.ts +11 -0
- package/dist/source-routing-validation.d.ts.map +1 -0
- package/dist/source-routing-validation.js +140 -0
- package/dist/store-content-cipher.d.ts +11 -0
- package/dist/store-content-cipher.d.ts.map +1 -0
- package/dist/store-content-cipher.js +67 -0
- package/dist/store-content-encryption.d.ts +12 -0
- package/dist/store-content-encryption.d.ts.map +1 -0
- package/dist/store-content-encryption.js +275 -0
- package/dist/store-paths.d.ts +6 -0
- package/dist/store-paths.d.ts.map +1 -0
- package/dist/store-paths.js +61 -0
- package/dist/store.d.ts +30 -0
- package/dist/store.d.ts.map +1 -0
- package/dist/store.js +219 -0
- package/dist/testing.d.ts +47 -0
- package/dist/testing.d.ts.map +1 -0
- package/dist/testing.js +170 -0
- package/dist/version.d.ts +2 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +4 -0
- package/package.json +43 -0
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
// Scoped vector search (Epic #189, Issue #199). Given a list of capsule ids and a
|
|
2
|
+
// pre-embedded query vector per capsule, returns the ranked top-K `RetrievalReference`
|
|
3
|
+
// across the scope. The "no global pool" invariant lives in the SQL: every SELECT
|
|
4
|
+
// filters by `capsule_id` and we never join across capsules — so a bug in caller
|
|
5
|
+
// composition can never silently leak rows from a capsule outside scope.
|
|
6
|
+
//
|
|
7
|
+
// Vector blob layout: each row's `embedding` is `vectorDimensions * 4` bytes encoded as
|
|
8
|
+
// a little-endian Float32 array (see `floatToBytes` in `../indexing/embedding-batcher.ts`).
|
|
9
|
+
// We decode to a `Float32Array` view and compute similarity in-process. This is a
|
|
10
|
+
// brute-force O(N·D) scan — that is intentional for the first cut, since capsules are
|
|
11
|
+
// expected to be small (≤ a few thousand vectors) and adding an ANN index pulls in a
|
|
12
|
+
// native dependency we have explicitly avoided in `@oscharko-dev/keiko-local-knowledge`.
|
|
13
|
+
import { getCapsule } from "../capsule-lifecycle.js";
|
|
14
|
+
import { RetrievalError } from "./types.js";
|
|
15
|
+
// Size limits (characters / term counts) shared by the search helpers in this module.
const SEARCH_EXCERPT_MAX_CHARS = 1_600;
const SEARCH_CONTEXT_BEFORE_CHARS = 420;
const LEXICAL_RECALL_EXCERPT_CHARS = 900;
const LEXICAL_RECALL_MAX_TERMS = 12;
const LEXICAL_RECALL_MIN_TOKEN_LENGTH = 3;
// Matches tokens of three or more characters: one letter or digit followed by at least
// two letters, digits, or any of `. _ : / # -`. The `g` flag makes `lastIndex` stateful
// when the pattern is used with `.test()` / `.exec()`.
const EXACT_TERM_PATTERN = /[\p{L}\p{N}][\p{L}\p{N}._:/#-]{2,}/gu;
// Whole-word match for "broad" request verbs/nouns (English, French, German spellings).
// `\b` only understands ASCII word characters, even with the `u` flag, so it can never
// match directly in front of a leading `ü`; that made the `überblick` alternative
// unreachable for a standalone word. Unicode-aware lookarounds are used instead: the
// keyword must not be preceded or followed by a letter, digit, or underscore.
const BROAD_QUERY_PATTERN = /(?<![\p{L}\p{N}_])(compare|comparez|summari[sz]e|overview|explain|describe|analyse|analyze|erkl[aä]re|ueberblick|überblick|vergleiche|zusammenfassung)(?![\p{L}\p{N}_])/iu;
// Lower-case English and German function words.
const SEARCH_STOPWORDS = new Set([
    "a", "about", "and", "are", "auf", "aus", "bei",
    "das", "der", "die",
    "ein", "eine", "einen", "einer", "eines",
    "for", "from", "how", "in", "ist", "mit",
    "of", "on", "oder",
    "sagen", "steht",
    "the", "to",
    "und", "uber", "ueber", "über",
    "von", "was", "what", "wie",
    "zu", "zum", "zur",
]);
|
|
63
|
+
// ─── Compose a scope object from either `ComposedRetrievalScope` or a single capsule ────
|
|
64
|
+
// Normalises a scope argument into `{ capsuleIds, sourceFilter }` form. An object that
// carries a `capsuleId` key becomes a one-element capsule list with no source filter;
// any other object is read as `{ capsuleIds, sourceIds }`.
export function toScopeInput(scope) {
    const isSingleCapsule = "capsuleId" in scope;
    return isSingleCapsule
        ? { capsuleIds: [scope.capsuleId] }
        : { capsuleIds: scope.capsuleIds, sourceFilter: scope.sourceIds };
}
|
|
70
|
+
// Base query for all vector rows of one capsule (`:c`). Callers may append further
// `AND ...` predicates. The doubled space after `embedding,` is part of the original text.
const SELECT_VECTORS_FOR_CAPSULE_SQL = "SELECT chunk_id, capsule_id, source_id, document_id, embedding,  vector_dimensions, vector_metric FROM vectors WHERE capsule_id = :c";
|
|
76
|
+
// Reads every vector row of `capsuleId`, optionally restricted to `sourceFilter`.
// An empty filter array short-circuits to `[]` without touching the database; an
// `undefined` filter means "all sources". Source ids are bound as `:s0`, `:s1`, ...
function readVectorsForCapsule(store, capsuleId, sourceFilter) {
    if (sourceFilter?.length === 0) {
        return [];
    }
    const bindings = { c: String(capsuleId) };
    let sourcePredicate = "";
    if (sourceFilter !== undefined) {
        const placeholders = sourceFilter.map((_, index) => `:s${String(index)}`);
        sourcePredicate = ` AND source_id IN (${placeholders.join(", ")})`;
        for (let index = 0; index < sourceFilter.length; index += 1) {
            bindings[`s${String(index)}`] = String(sourceFilter[index]);
        }
    }
    const statement = store._internal.db.prepare(`${SELECT_VECTORS_FOR_CAPSULE_SQL}${sourcePredicate}`);
    return statement.all(bindings);
}
|
|
92
|
+
// Loads citation metadata for the given chunk ids of one capsule. Every join is keyed
// on `capsule_id`, and the outer WHERE pins `c.capsule_id = :cap`. Page number/label
// come from the parsed unit when present, otherwise from the first page whose character
// range covers the chunk. Returns `[]` without querying when `chunkIds` is empty.
function readCitationRows(store, capsuleId, chunkIds) {
    if (chunkIds.length === 0) {
        return [];
    }
    // Fallback sub-select shared by `page_number` and `page_label`; both order by page_number.
    const pageLookup = (column) => [
        ` COALESCE(pu.${column}, (`,
        ` SELECT p.${column} FROM pages p`,
        " WHERE p.capsule_id = c.capsule_id AND p.document_id = c.document_id",
        " AND p.character_start <= COALESCE(c.character_start, pu.character_start)",
        " AND p.character_end >= COALESCE(c.character_end, pu.character_end)",
        " ORDER BY p.page_number ASC LIMIT 1",
        ` )) AS ${column},`,
    ];
    const chunkPlaceholders = chunkIds.map((_, index) => `:c${String(index)}`).join(", ");
    const statementText = [
        "SELECT c.id AS chunk_id, c.capsule_id, c.source_id, c.document_id,",
        " d.safe_display_name AS safe_display_name,",
        ...pageLookup("page_number"),
        ...pageLookup("page_label"),
        " COALESCE(pu.section_path_json, pu.heading_path_json) AS section_path_json,",
        " pu.json_pointer, pu.table_name, pu.row_index,",
        " COALESCE(c.character_start, pu.character_start) AS character_start,",
        " COALESCE(c.character_end, pu.character_end) AS character_end",
        "FROM chunks c",
        "LEFT JOIN documents d ON d.capsule_id = c.capsule_id AND d.id = c.document_id",
        "LEFT JOIN parsed_units pu",
        " ON pu.capsule_id = c.capsule_id AND pu.id = c.parsed_unit_id",
        `WHERE c.capsule_id = :cap AND c.id IN (${chunkPlaceholders})`,
    ].join(" ");
    const bindings = { cap: String(capsuleId) };
    for (const [index, chunkId] of chunkIds.entries()) {
        // Absent entries are bound as the empty string.
        bindings[`c${String(index)}`] = chunkId ?? "";
    }
    return store._internal.db.prepare(statementText).all(bindings);
}
|
|
130
|
+
// ─── Similarity primitives ───────────────────────────────────────────────────
|
|
131
|
+
// Float32 decode. The row blob is a fresh-copied Uint8Array; we wrap it in a Float32Array
|
|
132
|
+
// view backed by the same ArrayBuffer. The byteLength must be exactly `dims * 4` — a
|
|
133
|
+
// length mismatch indicates DB corruption and we surface a `RetrievalError`.
|
|
134
|
+
// Opens the stored embedding through `cipher.openVector` and exposes it as a
// `Float32Array` of `row.vector_dimensions` lanes. Throws `RetrievalError`
// (`STORE_READ_FAILED`) when the opened blob is not exactly four bytes per dimension.
function decodeEmbedding(row, cipher) {
    const dimensions = row.vector_dimensions;
    const expectedByteLength = dimensions * 4;
    const opened = cipher.openVector(row.embedding, expectedByteLength);
    if (opened.byteLength !== expectedByteLength) {
        throw new RetrievalError("STORE_READ_FAILED", "vector blob length does not match vector_dimensions");
    }
    // Copy the bytes before building the float view so the result does not alias the
    // sqlite row buffer / decrypted envelope.
    const detached = new Uint8Array(opened);
    return new Float32Array(detached.buffer, detached.byteOffset, dimensions);
}
|
|
142
|
+
// `noUncheckedIndexedAccess` widens `Float32Array[i]` to `number | undefined`; the loop
|
|
143
|
+
// stays in-bounds by construction (`i < a.length`), so we narrow with `?? 0` rather than
|
|
144
|
+
// a `!` assertion (forbidden by the project's lint rule) — at this index the value is
|
|
145
|
+
// always a real Float32 lane, never absent.
|
|
146
|
+
// Cosine of the angle between `a` and `b`, iterating over `a.length` lanes. Lanes that
// are missing in either input count as 0. Returns 0 when either vector has zero norm.
function cosineSimilarity(a, b) {
    const lanes = a.length;
    let inner = 0;
    let normSqA = 0;
    let normSqB = 0;
    let lane = 0;
    while (lane < lanes) {
        const left = a[lane] ?? 0;
        const right = b[lane] ?? 0;
        inner += left * right;
        normSqA += left * left;
        normSqB += right * right;
        lane += 1;
    }
    const degenerate = normSqA === 0 || normSqB === 0;
    return degenerate ? 0 : inner / (Math.sqrt(normSqA) * Math.sqrt(normSqB));
}
|
|
161
|
+
// Inner product of `a` and `b` over `a.length` lanes; missing lanes count as 0.
function dotProduct(a, b) {
    const lanes = a.length;
    let total = 0;
    for (let lane = 0; lane < lanes; lane += 1) {
        const left = a[lane] ?? 0;
        const right = b[lane] ?? 0;
        total += left * right;
    }
    return total;
}
|
|
168
|
+
// Negated Euclidean distance so higher = closer (uniform "score-desc" sort with the
|
|
169
|
+
// other two metrics). Documented in the function name; consumers never see the raw
|
|
170
|
+
// distance — only the unified score.
|
|
171
|
+
// Euclidean distance between `a` and `b` over `a.length` lanes, negated so that a larger
// value means "closer". Missing lanes count as 0. Identical inputs yield `-0`.
function negativeEuclideanDistance(a, b) {
    const lanes = a.length;
    let squaredDistance = 0;
    let lane = 0;
    while (lane < lanes) {
        const delta = (a[lane] ?? 0) - (b[lane] ?? 0);
        squaredDistance += delta * delta;
        lane += 1;
    }
    return -Math.sqrt(squaredDistance);
}
|
|
179
|
+
// Dispatches to the similarity function for `metric`. `"cosine"` and `"dot"` are matched
// explicitly; every other value falls through to negated Euclidean distance.
function scoreFor(metric, query, vector) {
    switch (metric) {
        case "cosine":
            return cosineSimilarity(query, vector);
        case "dot":
            return dotProduct(query, vector);
        default:
            return negativeEuclideanDistance(query, vector);
    }
}
|
|
186
|
+
// Builds the `provider|modelId|vectorDimensions|vectorMetric` key for an embedding identity.
function identityKey(identity) {
    // modelRevision intentionally excluded — two capsules sharing structural identity
    // tuple share an embedding even if one has been re-validated with a new revision.
    const { provider, modelId, vectorDimensions, vectorMetric } = identity;
    const parts = [provider, modelId, String(vectorDimensions), vectorMetric];
    return parts.join("|");
}
|
|
196
|
+
// Sends `text` to the embedding adapter using `identity.modelId`. `apiKeyHeaderName` and
// `signal` are only included in the request when they are defined. On success resolves to
// `{ vector, dimensions }`; on a non-ok outcome resolves to (does not throw) a
// `RetrievalError` with code `EMBEDDING_ADAPTER_FAILED`.
async function embedQueryFor(adapter, identity, text, signal) {
    // Properties are added in the same order the request has always carried them.
    const request = { endpoint: adapter.endpoint, apiKey: adapter.apiKey };
    if (adapter.apiKeyHeaderName !== undefined) {
        request.apiKeyHeaderName = adapter.apiKeyHeaderName;
    }
    request.modelId = identity.modelId;
    request.input = text;
    if (signal !== undefined) {
        request.signal = signal;
    }
    const outcome = await adapter.request(request);
    if (outcome.ok) {
        const { vector } = outcome.value;
        return { vector, dimensions: vector.length };
    }
    return new RetrievalError("EMBEDDING_ADAPTER_FAILED", `embedding adapter returned ${outcome.kind}`);
}
|
|
212
|
+
// ─── Citation builder ────────────────────────────────────────────────────────
|
|
213
|
+
// Opens the stored section-path JSON via `cipher.openText` and returns it as a fresh array
// of strings. Yields `undefined` for a `null` column, unparseable JSON, a non-array value,
// or an array containing any non-string element.
function parseSectionPath(json, cipher) {
    if (json === null) {
        return undefined;
    }
    const plaintext = cipher.openText(json);
    let decoded;
    try {
        decoded = JSON.parse(plaintext);
    }
    catch {
        return undefined;
    }
    if (!Array.isArray(decoded)) {
        return undefined;
    }
    const onlyStrings = decoded.every((segment) => typeof segment === "string");
    return onlyStrings ? [...decoded] : undefined;
}
|
|
233
|
+
// Converts a citation row into a citation object. Required ids are always present;
// `safeDisplayName` falls back to the document id. Optional fields are added only when
// the column is not `null` (or, for `sectionPath`, when parsing produced a value), so the
// result never carries a key for a field whose column was `null`.
function rowToCitation(row, cipher) {
    const sectionPath = parseSectionPath(row.section_path_json, cipher);
    const citation = {
        documentId: row.document_id,
        capsuleId: row.capsule_id,
        sourceId: row.source_id,
        chunkId: row.chunk_id,
        safeDisplayName: row.safe_display_name ?? row.document_id,
    };
    // Optional fields are appended in a fixed order so the key order stays stable.
    if (row.page_number !== null) {
        citation.pageNumber = row.page_number;
    }
    if (row.page_label !== null) {
        citation.pageLabel = row.page_label;
    }
    if (sectionPath !== undefined) {
        citation.sectionPath = sectionPath;
    }
    if (row.json_pointer !== null) {
        citation.jsonPointer = row.json_pointer;
    }
    if (row.table_name !== null) {
        citation.tableName = row.table_name;
    }
    if (row.row_index !== null) {
        citation.rowIndex = row.row_index;
    }
    if (row.character_start !== null) {
        citation.characterStart = row.character_start;
    }
    if (row.character_end !== null) {
        citation.characterEnd = row.character_end;
    }
    return citation;
}
|
|
254
|
+
// Scores every vector row of one capsule against `queryVector` using the capsule's
// configured metric, drops rows scoring below `minScore` (when given), sorts with
// `scoreDesc`, and returns at most `candidateLimit` entries of
// `{ chunkId, capsuleId, score }`.
function scoreCapsuleVectors(rows, capsule, queryVector, candidateLimit, minScore, cipher) {
    const metric = capsule.embeddingModelIdentity.vectorMetric;
    const expectedCapsuleId = String(capsule.id);
    const candidates = [];
    for (const row of rows) {
        // Belt-and-braces: the SQL filter already restricts to this capsule, but the id is
        // re-checked here so a row from another capsule can never be scored.
        const inScope = row.capsule_id === expectedCapsuleId;
        // Rows whose dimension count differs from the query are skipped, not decoded.
        const comparable = row.vector_dimensions === queryVector.length;
        if (!inScope || !comparable) {
            continue;
        }
        const score = scoreFor(metric, queryVector, decodeEmbedding(row, cipher));
        const belowThreshold = minScore !== undefined && score < minScore;
        if (!belowThreshold) {
            candidates.push({ chunkId: row.chunk_id, capsuleId: capsule.id, score });
        }
    }
    candidates.sort(scoreDesc);
    return candidates.slice(0, candidateLimit);
}
|
|
273
|
+
// Number of lexical-recall candidates to collect for a given `topK`. Returns 1 when any
// exact term satisfies `isStrongLexicalRecallTerm`. Otherwise the limit is
// `topK * multiplier`, capped at `topK + 144` ("exact") or `topK + 96` (other
// strategies), and never less than `topK`.
function lexicalRecallLimit(topK, profile) {
    const hasStrongTerm = profile.exactTerms.some(isStrongLexicalRecallTerm);
    if (hasStrongTerm) {
        return 1;
    }
    let multiplier;
    let ceiling;
    switch (profile.strategy) {
        case "exact":
            multiplier = 16;
            ceiling = topK + 144;
            break;
        case "broad":
            multiplier = 8;
            ceiling = topK + 96;
            break;
        default:
            multiplier = 10;
            ceiling = topK + 96;
            break;
    }
    const scaled = Math.min(topK * multiplier, ceiling);
    return Math.max(topK, scaled);
}
|
|
280
|
+
// Base score for a lexical hit by strategy: 0.88 for "exact", 0.68 for "broad",
// 0.78 for anything else.
function lexicalBaseScore(profile) {
    switch (profile.strategy) {
        case "exact":
            return 0.88;
        case "broad":
            return 0.68;
        default:
            return 0.78;
    }
}
|
|
287
|
+
// Score for a lexical candidate: the strategy base score, plus up to 0.24 for the share
// of recall terms found in `searchText`, plus 0.04 per exact term found (capped at 0.16).
// Returns 0 when the profile has no recall terms. Matching is plain substring
// containment, with no case folding performed here.
function lexicalCandidateScore(searchText, profile) {
    const recallTerms = profile.lexicalRecallTerms;
    if (recallTerms.length === 0) {
        return 0;
    }
    const countContained = (terms) => {
        let found = 0;
        for (const candidate of terms) {
            if (searchText.includes(candidate)) {
                found += 1;
            }
        }
        return found;
    };
    const termHits = countContained(recallTerms);
    const exactHits = countContained(profile.exactTerms);
    const coverage = termHits / recallTerms.length;
    const coverageBonus = Math.min(0.24, coverage * 0.24);
    const exactBonus = Math.min(0.16, exactHits * 0.04);
    return lexicalBaseScore(profile) + coverageBonus + exactBonus;
}
|
|
303
|
+
// SQL fragment restricting `<qualifier>source_id` to the filter. `undefined` adds no
// restriction, an empty array adds the always-false ` AND 0`, and otherwise an `IN (...)`
// list of `:source0`, `:source1`, ... placeholders is produced.
function sourceFilterClause(sourceFilter, qualifier) {
    if (sourceFilter === undefined) {
        return "";
    }
    if (sourceFilter.length === 0) {
        return " AND 0";
    }
    const placeholders = sourceFilter.map((_, index) => `:source${String(index)}`);
    const inList = placeholders.join(", ");
    return ` AND ${qualifier}source_id IN (${inList})`;
}
|
|
312
|
+
// Named bindings (`source0`, `source1`, ...) for the placeholders produced by
// `sourceFilterClause`. Values are stringified. An `undefined` filter yields `{}`.
function sourceParams(sourceFilter) {
    const bindings = {};
    if (sourceFilter === undefined) {
        return bindings;
    }
    for (const [index, sourceId] of sourceFilter.entries()) {
        bindings[`source${String(index)}`] = String(sourceId);
    }
    return bindings;
}
|
|
321
|
+
// SQL selecting each document of one capsule (`:capsule_id`) joined to its
// `document_texts` row, optionally restricted by source, ordered by display name then id.
function lexicalDocumentSql(sourceFilter) {
    const selectClause = "SELECT d.id AS document_id, d.source_id, d.safe_display_name, dt.normalized_text";
    const fromClause = "FROM documents AS d";
    const joinClause = "JOIN document_texts AS dt ON dt.capsule_id = d.capsule_id AND dt.document_id = d.id";
    const whereClause = `WHERE d.capsule_id = :capsule_id${sourceFilterClause(sourceFilter, "d.")}`;
    const orderClause = "ORDER BY d.safe_display_name ASC, d.id ASC";
    return `${selectClause} ${fromClause} ${joinClause} ${whereClause} ${orderClause}`;
}
|
|
330
|
+
// Loads the documents (with normalized text) of one capsule for the lexical scan. When
// the store's content cipher reports `isEncrypted`, each row's `normalized_text` is
// opened via `cipher.openText` on a copy of the row; otherwise rows are returned as read.
function readLexicalDocuments(store, capsuleId, sourceFilter) {
    const statement = store._internal.db.prepare(lexicalDocumentSql(sourceFilter));
    const bindings = { capsule_id: String(capsuleId), ...sourceParams(sourceFilter) };
    const rows = statement.all(bindings);
    // Decrypt at the store boundary, before the lexical scan sees the text. Per the
    // original note, this join only matches small documents (large ones keep their text
    // in document_text_windows), which bounds the per-row decrypt cost.
    const cipher = store._internal.contentCipher;
    if (cipher.isEncrypted) {
        return rows.map((row) => {
            const normalized_text = cipher.openText(row.normalized_text);
            return { ...row, normalized_text };
        });
    }
    return rows;
}
|
|
343
|
+
// SQL returning up to three chunk ids of one document that have a vector row. In
// `"contains"` mode only chunks whose character range covers `:position` qualify,
// ordered by `order_index`. In any other mode chunks are ordered by the distance of
// their start offset from `:position`. The chunk range falls back to the parsed unit's
// range, then to `0` / `start + 1`.
function lexicalChunkSql(sourceFilter, mode) {
    const startExpr = "COALESCE(c.character_start, pu.character_start, 0)";
    const endExpr = "COALESCE(c.character_end, pu.character_end, COALESCE(c.character_start, pu.character_start, 0) + 1)";
    let rangePredicate;
    let ordering;
    if (mode === "contains") {
        rangePredicate = `AND ${startExpr} <= :position AND ${endExpr} >= :position`;
        ordering = "ORDER BY c.order_index ASC, c.id ASC";
    }
    else {
        // The empty slot is kept in the clause list so the joined text is unchanged.
        rangePredicate = "";
        ordering = `ORDER BY ABS(${startExpr} - :position) ASC, c.order_index ASC, c.id ASC`;
    }
    const whereClause = `WHERE c.capsule_id = :capsule_id AND c.document_id = :document_id${sourceFilterClause(sourceFilter, "c.")}`;
    const clauses = [
        "SELECT c.id AS chunk_id, c.capsule_id AS capsule_id",
        "FROM chunks AS c",
        "JOIN vectors AS v ON v.capsule_id = c.capsule_id AND v.chunk_id = c.id",
        "LEFT JOIN parsed_units AS pu ON pu.capsule_id = c.capsule_id AND pu.id = c.parsed_unit_id",
        whereClause,
        rangePredicate,
        ordering,
        "LIMIT 3",
    ];
    return clauses.join(" ");
}
|
|
361
|
+
/**
 * Resolves a lexical hit position to chunk rows.
 *
 * First asks for chunks whose span contains the position; only when that yields nothing does
 * it fall back to the chunks nearest to the position.
 *
 * @param store Store handle; uses `store._internal.db`.
 * @param capsuleId Capsule identifier; bound as a string.
 * @param sourceFilter Source filter used for the SQL clause and its parameters.
 * @param documentId Document the hit was found in.
 * @param position Character offset of the hit.
 * @returns Rows produced by the statement built by `lexicalChunkSql`.
 */
function chunkRowsForHit(store, capsuleId, sourceFilter, documentId, position) {
    const bindings = {
        capsule_id: String(capsuleId),
        document_id: documentId,
        position,
        ...sourceParams(sourceFilter),
    };
    const query = (mode) => store._internal.db.prepare(lexicalChunkSql(sourceFilter, mode)).all(bindings);
    const containing = query("contains");
    return containing.length > 0 ? containing : query("nearest");
}
|
|
377
|
+
/**
 * Cuts a lower-cased window of `text` around a hit position.
 *
 * The window starts `profile.contextBeforeChars` before the position and ends
 * `LEXICAL_RECALL_EXCERPT_CHARS` after it, clamped to the bounds of the text.
 *
 * @param text Text to cut from.
 * @param position Character offset of the hit.
 * @param profile Query profile supplying `contextBeforeChars`.
 * @returns The lower-cased excerpt.
 */
function lexicalSearchExcerpt(text, position, profile) {
    const windowStart = Math.max(0, position - profile.contextBeforeChars);
    const windowEnd = Math.min(text.length, position + LEXICAL_RECALL_EXCERPT_CHARS);
    const excerpt = text.slice(windowStart, windowEnd);
    return excerpt.toLowerCase();
}
|
|
382
|
+
/**
 * Finds lexical hits for the profile's recall terms inside one document.
 *
 * For every term the first occurrence in the lower-cased text is used; if the term is absent
 * from the text but present in the normalised display name, the hit is placed at offset 0.
 * Hits are de-duplicated per bucket of `LEXICAL_RECALL_EXCERPT_CHARS` characters, and the
 * scan stops once `limit` hits have been collected (the check runs after each push).
 *
 * NOTE(review): the text is only lower-cased while the display name goes through
 * `normaliseForSearch`; if the recall terms are normalised too, accented text may not match
 * them. Confirm whether that is intended before changing it, since offsets must stay aligned
 * with the stored text.
 *
 * @param doc Row with `normalized_text` and optional `safe_display_name`.
 * @param profile Query profile supplying `lexicalRecallTerms` and excerpt settings.
 * @param limit Maximum number of hits to collect.
 * @returns Array of `{ position, searchText }` hits in term order.
 */
function lexicalHitsForDocument(doc, profile, limit) {
    const haystack = doc.normalized_text.toLowerCase();
    const displayName = normaliseForSearch(doc.safe_display_name ?? "");
    const bucketSize = Math.max(1, LEXICAL_RECALL_EXCERPT_CHARS);
    const usedBuckets = new Set();
    const found = [];
    for (const term of profile.lexicalRecallTerms) {
        const textIndex = haystack.indexOf(term);
        let position;
        if (textIndex >= 0) {
            position = textIndex;
        }
        else if (displayName.includes(term)) {
            // Name-only match: anchor the hit at the start of the document.
            position = 0;
        }
        else {
            continue;
        }
        const bucket = Math.floor(position / bucketSize);
        if (usedBuckets.has(bucket)) {
            continue;
        }
        usedBuckets.add(bucket);
        found.push({ position, searchText: lexicalSearchExcerpt(haystack, position, profile) });
        if (found.length >= limit) {
            break;
        }
    }
    return found;
}
|
|
403
|
+
/**
 * Collects lexical-recall candidates for a single capsule.
 *
 * Walks the capsule's documents, finds term hits in each, maps every hit to chunk rows and
 * scores it with `lexicalCandidateScore`. Collection stops as soon as the limit returned by
 * `lexicalRecallLimit` is reached. Candidates with a non-positive score are dropped afterwards
 * and the remainder is sorted with `scoreDesc`.
 *
 * @param store Store handle passed through to the read helpers.
 * @param capsule Capsule whose `id` scopes the reads.
 * @param sourceFilter Source filter already narrowed to this capsule.
 * @param profile Query profile; an empty `lexicalRecallTerms` short-circuits to `[]`.
 * @param topK Requested result count, forwarded to `lexicalRecallLimit`.
 * @returns Array of `{ chunkId, capsuleId, score }`.
 */
function lexicalRecallCandidatesForCapsule(store, capsule, sourceFilter, profile, topK) {
    if (profile.lexicalRecallTerms.length === 0) {
        return [];
    }
    const cap = lexicalRecallLimit(topK, profile);
    const collected = [];
    collect: for (const doc of readLexicalDocuments(store, capsule.id, sourceFilter)) {
        if (collected.length >= cap) {
            break;
        }
        const budget = cap - collected.length;
        for (const hit of lexicalHitsForDocument(doc, profile, budget)) {
            const chunkRows = chunkRowsForHit(store, capsule.id, sourceFilter, doc.document_id, hit.position);
            for (const row of chunkRows) {
                collected.push({
                    chunkId: row.chunk_id,
                    capsuleId: capsule.id,
                    score: lexicalCandidateScore(hit.searchText, profile),
                });
                if (collected.length >= cap) {
                    // Limit reached: abandon the remaining rows, hits and documents.
                    break collect;
                }
            }
        }
    }
    const positive = collected.filter((candidate) => candidate.score > 0);
    return positive.sort(scoreDesc);
}
|
|
429
|
+
/**
 * Merges vector candidates with lexical candidates, keyed by `capsuleId|chunkId`.
 *
 * A lexical candidate replaces an existing entry only when its score is strictly higher;
 * unseen keys are appended. The original position of a replaced entry is kept.
 *
 * @param candidates Candidates from the vector search.
 * @param lexicalCandidates Candidates from lexical recall.
 * @returns `candidates` itself when there are no lexical candidates, otherwise a new array.
 */
function mergeCandidates(candidates, lexicalCandidates) {
    if (lexicalCandidates.length === 0) {
        return candidates;
    }
    const keyOf = (entry) => `${String(entry.capsuleId)}|${entry.chunkId}`;
    const merged = new Map(Array.from(candidates, (entry) => [keyOf(entry), entry]));
    for (const lexical of lexicalCandidates) {
        const key = keyOf(lexical);
        const current = merged.get(key);
        const lexicalWins = current === undefined || lexical.score > current.score;
        if (lexicalWins) {
            merged.set(key, lexical);
        }
    }
    return Array.from(merged.values());
}
|
|
445
|
+
/**
 * Gathers lexical-recall candidates across all capsules in scope.
 *
 * The scope's source filter is narrowed per capsule before each capsule is searched; results
 * are concatenated in capsule order.
 *
 * @param store Store handle passed through to the per-capsule search.
 * @param capsules Capsules to search.
 * @param scope Search scope; only `scope.sourceFilter` is read here.
 * @param profile Query profile; an empty `lexicalRecallTerms` short-circuits to `[]`.
 * @param topK Requested result count, forwarded unchanged.
 * @returns Flat array of candidates.
 */
function collectLexicalRecallCandidates(store, capsules, scope, profile, topK) {
    if (profile.lexicalRecallTerms.length === 0) {
        return [];
    }
    const collected = [];
    for (const capsule of capsules) {
        const scopedFilter = sourceFilterForCapsule(scope.sourceFilter, capsule);
        const found = lexicalRecallCandidatesForCapsule(store, capsule, scopedFilter, profile, topK);
        collected.push(...found);
    }
    return collected;
}
|
|
454
|
+
/**
 * Computes how many candidates to keep before reranking.
 *
 * `topK` is scaled by a strategy-dependent multiplier (exact 12, broad 10, otherwise 8) and
 * capped at `topK` plus headroom (96 for exact, otherwise 64). The result is never below
 * `topK`.
 *
 * @param topK Requested result count.
 * @param profile Query profile; only `strategy` is read.
 * @returns The oversampled candidate count.
 */
function oversampleTopK(topK, profile) {
    let multiplier = 8;
    let headroom = 64;
    if (profile.strategy === "exact") {
        multiplier = 12;
        headroom = 96;
    }
    else if (profile.strategy === "broad") {
        multiplier = 10;
    }
    const scaled = topK * multiplier;
    const ceiling = topK + headroom;
    return Math.max(topK, Math.min(scaled, ceiling));
}
|
|
459
|
+
/**
 * Comparator that orders candidates by descending `score`, breaking ties by ascending
 * `chunkId`.
 *
 * The tiebreak compares the ids by UTF-16 code units (`<` / `>`) rather than with
 * `localeCompare`. Without an explicit locale, `localeCompare` uses the host's default
 * collation, which can differ between machines and ICU builds; a code-unit comparison gives
 * the same order everywhere, which is what the snapshot tests in #200 rely on.
 *
 * @param a First candidate (`score`, `chunkId`).
 * @param b Second candidate (`score`, `chunkId`).
 * @returns Negative when `a` sorts first, positive when `b` sorts first, `0` when equal.
 */
function scoreDesc(a, b) {
    if (b.score !== a.score)
        return b.score - a.score;
    const left = String(a.chunkId);
    const right = String(b.chunkId);
    if (left < right)
        return -1;
    if (left > right)
        return 1;
    return 0;
}
|
|
466
|
+
/**
 * Creates the mutable accumulator used while the capsules of one search are processed.
 *
 * @returns A fresh state object: an empty `candidates` array and the three progress flags
 *          (`anyVectorSeen`, `anyDimensionCompatible`, `embeddingFailed`) all `false`.
 */
function emptyState() {
    const state = {
        candidates: [],
        anyVectorSeen: false,
        anyDimensionCompatible: false,
        embeddingFailed: false,
    };
    return state;
}
|
|
474
|
+
/**
 * Runs the vector search for one capsule and records the outcome on `state`.
 *
 * Steps, each of which may end processing for this capsule:
 *  1. read the capsule's vectors; none means nothing to do;
 *  2. embed the query for the capsule's embedding identity (cached per identity);
 *  3. verify that the embedding's dimensions match the capsule's pinned dimensions;
 *  4. score the vectors and append the resulting candidates to `state.candidates`.
 *
 * @param store Store handle.
 * @param embeddingAdapter Adapter used to embed the query.
 * @param capsule Capsule being searched (`id`, `embeddingModelIdentity`).
 * @param sourceFilter Source filter already narrowed to this capsule.
 * @param query Query text.
 * @param options Search options (`signal`, `topK`, `minScore`).
 * @param profile Query profile used for oversampling.
 * @param cache Per-search cache of query embeddings.
 * @param state Accumulator updated in place.
 * @returns A promise that resolves once the capsule has been processed.
 */
async function processCapsule(store, embeddingAdapter, capsule, sourceFilter, query, options, profile, cache, state) {
    const vectorRows = readVectorsForCapsule(store, capsule.id, sourceFilter);
    if (vectorRows.length === 0) {
        return;
    }
    state.anyVectorSeen = true;
    const queryEmbedding = await ensureQueryEmbedded(embeddingAdapter, capsule.embeddingModelIdentity, query, options.signal, cache);
    if (queryEmbedding === undefined) {
        state.embeddingFailed = true;
        return;
    }
    const dimensionsMatch = queryEmbedding.dimensions === capsule.embeddingModelIdentity.vectorDimensions;
    if (!dimensionsMatch) {
        // The adapter produced a dimension that differs from the capsule's pinned identity.
        // This is the same failure surface as #192's `INCOMPATIBLE_EMBEDDING_IDENTITY`, so
        // the capsule is skipped.
        return;
    }
    state.anyDimensionCompatible = true;
    const keep = oversampleTopK(options.topK, profile);
    const scored = scoreCapsuleVectors(vectorRows, capsule, queryEmbedding.vector, keep, options.minScore, store._internal.contentCipher);
    state.candidates.push(...scored);
}
|
|
493
|
+
/**
 * Returns the query embedding for an embedding identity, computing it at most once per
 * successful call.
 *
 * Successful embeddings are stored in `cache` under `identityKey(identity)`. A result that is
 * a `RetrievalError` is not cached and is reported as `undefined`.
 *
 * @param adapter Embedding adapter.
 * @param identity Embedding model identity of the capsule.
 * @param query Query text.
 * @param signal Abort signal forwarded to the embedding call.
 * @param cache Map from identity key to embedding.
 * @returns The embedding, or `undefined` when embedding failed.
 */
async function ensureQueryEmbedded(adapter, identity, query, signal, cache) {
    const cacheKey = identityKey(identity);
    const remembered = cache.get(cacheKey);
    if (remembered !== undefined) {
        return remembered;
    }
    const outcome = await embedQueryFor(adapter, identity, query, signal);
    if (!(outcome instanceof RetrievalError)) {
        cache.set(cacheKey, outcome);
        return outcome;
    }
    return undefined;
}
|
|
504
|
+
/**
 * Turns the accumulated search state into either a ranked candidate list or a
 * no-evidence reason.
 *
 * Failure reasons are checked in this order: `no-vectors`, `embedding-failed` (only when no
 * candidates exist), `incompatible-embedding-identity`, and finally `below-min-score` when
 * the ranked list is empty.
 *
 * @param state Accumulated search state.
 * @param options Search options; `topK` drives the oversampled cut-off.
 * @param profile Query profile.
 * @param candidates Candidates to rank; defaults to `state.candidates`.
 * @returns `{ ok: true, top }` or `{ ok: false, reason }`.
 */
function selectTopCandidates(state, options, profile, candidates = state.candidates) {
    const failure = (reason) => ({ ok: false, reason });
    if (!state.anyVectorSeen) {
        return failure("no-vectors");
    }
    if (state.embeddingFailed && candidates.length === 0) {
        return failure("embedding-failed");
    }
    if (!state.anyDimensionCompatible) {
        return failure("incompatible-embedding-identity");
    }
    // Sort a copy so the caller's array is left untouched.
    const ranked = Array.from(candidates).sort(scoreDesc);
    const top = ranked.slice(0, oversampleTopK(options.topK, profile));
    return top.length === 0 ? failure("below-min-score") : { ok: true, top };
}
|
|
519
|
+
/**
 * Searches every capsule in `scope` for evidence matching `query`.
 *
 * Capsules are processed one after another (vector search), lexical recall is optionally
 * merged in, the best candidates are selected and finally turned into cited references.
 *
 * @param store Store handle.
 * @param embeddingAdapter Adapter used to embed the query.
 * @param scope Search scope (`capsuleIds`, optional `sourceFilter`).
 * @param query Query text.
 * @param options Search options (`strategy`, `topK`, `minScore`, `signal`).
 * @returns `{ references }`, plus `embeddingDegraded: true` when at least one capsule could
 *          not be embedded, or `{ references: [], noEvidenceReason }` when nothing qualifies.
 */
export async function searchVectorsForScope(store, embeddingAdapter, scope, query, options) {
    const capsules = loadCapsules(store, scope.capsuleIds);
    if (capsules.length === 0) {
        return { references: [], noEvidenceReason: "no-vectors" };
    }
    const profile = profileQuery(query, options.strategy);
    const embeddingCache = new Map();
    const state = emptyState();
    for (const capsule of capsules) {
        const scopedFilter = sourceFilterForCapsule(scope.sourceFilter, capsule);
        await processCapsule(store, embeddingAdapter, capsule, scopedFilter, query, options, profile, embeddingCache, state);
    }
    // GRD-002 / GRD-024: `minScore` is a DENSE relevance floor. Lexical-recall candidates carry
    // a lexical base score (0.68–0.88) that has nothing to do with vector similarity, so they
    // would slip past the floor: a chunk with ~0 cosine that merely shares a query token could
    // surface above a 0.9 floor. Whenever the caller sets `minScore`, lexical recall is
    // therefore suppressed, leaving only vector candidates that already cleared the cosine
    // floor in scoreCapsuleVectors — which also makes the `below-min-score` no-evidence reason
    // reachable when none do. Without `minScore`, hybrid lexical recall behaves as before.
    let lexicalCandidates = [];
    if (state.anyDimensionCompatible && options.minScore === undefined) {
        lexicalCandidates = collectLexicalRecallCandidates(store, capsules, scope, profile, options.topK);
    }
    const merged = mergeCandidates(state.candidates, lexicalCandidates);
    const selection = selectTopCandidates(state, options, profile, merged);
    if (!selection.ok) {
        return { references: [], noEvidenceReason: selection.reason };
    }
    const result = { references: buildReferences(store, selection.top, query, options.topK, profile) };
    if (state.embeddingFailed) {
        result.embeddingDegraded = true;
    }
    return result;
}
|
|
548
|
+
/**
 * Narrows a scope-wide source filter to the sources that belong to one capsule.
 *
 * Ids are compared by their string form; the returned entries keep their original values
 * and order.
 *
 * @param sourceFilter Source ids requested by the caller, or `undefined` for "no filter".
 * @param capsule Capsule whose `sourceIds` define membership.
 * @returns `undefined` when no filter was given, otherwise the matching subset.
 */
function sourceFilterForCapsule(sourceFilter, capsule) {
    if (sourceFilter === undefined) {
        return undefined;
    }
    const ownedSourceIds = new Set(capsule.sourceIds.map((id) => String(id)));
    const belongsToCapsule = (sourceId) => ownedSourceIds.has(String(sourceId));
    return sourceFilter.filter(belongsToCapsule);
}
|
|
554
|
+
/**
 * Looks up each capsule id and keeps the ones that exist.
 *
 * @param store Store handle passed to `getCapsule`.
 * @param ids Capsule ids to resolve.
 * @returns The capsules for which `getCapsule` returned a value, in input order.
 */
function loadCapsules(store, ids) {
    const lookedUp = Array.from(ids, (id) => getCapsule(store, id));
    return lookedUp.filter((capsule) => capsule !== undefined);
}
|
|
563
|
+
/**
 * Converts ranked candidates into cited references.
 *
 * Citation rows are read once per capsule, each candidate is paired with its citation, the
 * score is raised by the metadata and content bonuses, and the list is sorted and then
 * diversified down to `limit`.
 *
 * @param store Store handle.
 * @param candidates Ranked candidates (`capsuleId`, `chunkId`, `score`).
 * @param query Query text (not read by this function).
 * @param limit Maximum number of references to return.
 * @param profile Query profile used for bonuses and diversification.
 * @returns Array of `{ chunkId, capsuleId, score, citation }`.
 */
function buildReferences(store, candidates, query, limit, profile) {
    // Collect the chunk ids per capsule so that one citation read per capsule suffices.
    const chunkIdsByCapsule = new Map();
    for (const candidate of candidates) {
        const capsuleKey = String(candidate.capsuleId);
        if (!chunkIdsByCapsule.has(capsuleKey)) {
            chunkIdsByCapsule.set(capsuleKey, []);
        }
        chunkIdsByCapsule.get(capsuleKey).push(candidate.chunkId);
    }
    const citations = new Map();
    for (const [capsuleKey, chunkIds] of chunkIdsByCapsule) {
        for (const row of readCitationRows(store, capsuleKey, chunkIds)) {
            // Chunk ids are globally unique by construction (the chunks table has its PK on
            // `id`). The map is nevertheless keyed by `capsule|chunk`, so that a future
            // schema change can never let a citation row of one capsule stand in for a chunk
            // of another capsule that happens to share the same chunkId.
            citations.set(`${row.capsule_id}|${row.chunk_id}`, rowToCitation(row, store._internal.contentCipher));
        }
    }
    const references = [];
    for (const candidate of candidates) {
        const citation = citations.get(`${String(candidate.capsuleId)}|${candidate.chunkId}`);
        if (citation !== undefined) {
            const metadataBonus = lexicalMetadataBonus(citation, profile);
            const contentBonus = lexicalContentBonus(store, candidate.capsuleId, citation, profile);
            references.push({
                chunkId: citation.chunkId,
                capsuleId: candidate.capsuleId,
                score: candidate.score + metadataBonus + contentBonus,
                citation,
            });
        }
        // Otherwise (defensive): the chunk row disappeared between the vectors read and the
        // citations read. The candidate is dropped rather than given a fabricated citation.
    }
    references.sort(referenceScoreDesc);
    return diversifyReferences(references, limit, profile);
}
|
|
607
|
+
/**
 * Comparator that orders references by descending `score`, breaking ties by ascending
 * `chunkId` (compared by string form).
 *
 * The tiebreak compares UTF-16 code units (`<` / `>`) instead of calling `localeCompare`.
 * Without an explicit locale, `localeCompare` follows the host's default collation, so
 * equal-score references could be ordered differently on different machines. A code-unit
 * comparison is the same everywhere.
 *
 * @param a First reference (`score`, `chunkId`).
 * @param b Second reference (`score`, `chunkId`).
 * @returns Negative when `a` sorts first, positive when `b` sorts first, `0` when equal.
 */
function referenceScoreDesc(a, b) {
    if (b.score !== a.score)
        return b.score - a.score;
    const left = String(a.chunkId);
    const right = String(b.chunkId);
    if (left < right)
        return -1;
    if (left > right)
        return 1;
    return 0;
}
|
|
612
|
+
/**
 * Picks up to `limit` references, preferring variety across documents and sections.
 *
 * When the input already fits within `limit` it is returned unchanged. Otherwise references
 * are chosen greedily, one at a time, via `pickNextReference`, and the chosen entries (as
 * returned by `pickNextReference`) are sorted with `referenceScoreDesc`.
 *
 * @param references Score-sorted references.
 * @param limit Maximum number of references to return.
 * @param profile Query profile supplying the diversity penalties.
 * @returns The input array itself, or a new array of at most `limit` entries.
 */
function diversifyReferences(references, limit, profile) {
    if (references.length <= limit) {
        return references;
    }
    const pool = references.slice();
    const chosen = [];
    while (chosen.length < limit && pool.length > 0) {
        const { reference, index } = pickNextReference(pool, chosen, profile);
        chosen.push(reference);
        pool.splice(index, 1);
    }
    return chosen.sort(referenceScoreDesc);
}
|
|
625
|
+
/**
 * Finds the entry of `remaining` that ranks first once diversity penalties against the
 * already `selected` references are applied.
 *
 * @param remaining Non-empty pool of references still available.
 * @param selected References chosen so far.
 * @param profile Query profile supplying the diversity penalties.
 * @returns `{ reference, index }`, where `reference` is the penalty-adjusted entry produced
 *          by `withDiversityScore` and `index` is its position in `remaining`.
 */
function pickNextReference(remaining, selected, profile) {
    let winner = withDiversityScore(remaining[0], selected, profile);
    let winnerIndex = 0;
    for (let index = 1; index < remaining.length; index += 1) {
        const challenger = withDiversityScore(remaining[index], selected, profile);
        const challengerRanksFirst = referenceScoreDesc(challenger, winner) < 0;
        if (challengerRanksFirst) {
            winner = challenger;
            winnerIndex = index;
        }
    }
    return { reference: winner, index: winnerIndex };
}
|
|
637
|
+
/**
 * Applies the diversity penalty to a reference's score.
 *
 * @param reference Reference to adjust.
 * @param selected References chosen so far.
 * @param profile Query profile supplying the diversity penalties.
 * @returns The same object when the penalty is zero, otherwise a shallow copy with the
 *          reduced `score`.
 * @throws RetrievalError with code `STORE_READ_FAILED` when `reference` is `undefined`.
 */
function withDiversityScore(reference, selected, profile) {
    if (reference === undefined) {
        throw new RetrievalError("STORE_READ_FAILED", "missing reference");
    }
    const deduction = diversityPenalty(reference, selected, profile);
    return deduction === 0
        ? reference
        : { ...reference, score: reference.score - deduction };
}
|
|
645
|
+
/**
 * Computes how much a reference is penalised for repeating documents and sections that are
 * already represented in `selected`.
 *
 * Every prior reference from the same document adds `profile.documentDiversityPenalty`; every
 * prior reference with the same non-empty section key adds `profile.sectionDiversityPenalty`.
 *
 * @param reference Reference being evaluated.
 * @param selected References chosen so far.
 * @param profile Query profile supplying both penalty weights.
 * @returns The total penalty.
 */
function diversityPenalty(reference, selected, profile) {
    const ownSection = referenceSectionKey(reference);
    const ownDocument = String(reference.citation.documentId);
    let documentRepeats = 0;
    let sectionRepeats = 0;
    for (const prior of selected) {
        if (String(prior.citation.documentId) === ownDocument) {
            documentRepeats += 1;
        }
        // An empty key means "no section information" and never counts as a repeat.
        if (ownSection !== "" && referenceSectionKey(prior) === ownSection) {
            sectionRepeats += 1;
        }
    }
    return (documentRepeats * profile.documentDiversityPenalty + sectionRepeats * profile.sectionDiversityPenalty);
}
|
|
659
|
+
/**
 * Derives a key identifying the document section a reference points at.
 *
 * @param reference Reference whose `citation` carries `documentId` and optional `sectionPath`.
 * @returns `"<documentId>:<path joined by '>'>"`, or `""` when the citation has no
 *          `sectionPath`.
 */
function referenceSectionKey(reference) {
    const { sectionPath } = reference.citation;
    if (sectionPath === undefined || sectionPath === null) {
        return "";
    }
    const joinedPath = sectionPath.join(">");
    return `${String(reference.citation.documentId)}:${joinedPath}`;
}
|
|
663
|
+
/**
 * Scores how well the query tokens match a citation's metadata.
 *
 * The metadata text is built from the display name, page label, section path, JSON pointer,
 * table name, row index and page number; empty or non-string parts are skipped. The bonus is
 * the fraction of query tokens found among the metadata tokens, multiplied by
 * `profile.metadataWeight`.
 *
 * @param citation Citation whose metadata is inspected.
 * @param profile Query profile supplying `tokens` and `metadataWeight`.
 * @returns The bonus, or `0` when there are no query tokens, no metadata tokens or no match.
 */
function lexicalMetadataBonus(citation, profile) {
    const queryTokens = profile.tokens;
    if (queryTokens.length === 0) {
        return 0;
    }
    const fields = [
        citation.safeDisplayName,
        citation.pageLabel,
        ...(citation.sectionPath ?? []),
        citation.jsonPointer,
        citation.tableName,
        citation.rowIndex === undefined ? undefined : String(citation.rowIndex),
        String(citation.pageNumber ?? ""),
    ];
    const isNonEmptyString = (value) => typeof value === "string" && value.length > 0;
    const metadataTokens = tokenise(fields.filter(isNonEmptyString).join(" "));
    if (metadataTokens.length === 0) {
        return 0;
    }
    const matched = countTokenHits(queryTokens, new Set(metadataTokens));
    if (matched === 0) {
        return 0;
    }
    return (matched / queryTokens.length) * profile.metadataWeight;
}
|
|
685
|
+
/**
 * Scores how well the query matches the text around a citation.
 *
 * The bonus is the sum of three capped parts:
 *  - token coverage × `profile.lexicalWeight`, capped at 0.24;
 *  - adjacent-token phrase hits × `profile.phraseWeight`, capped at 0.16;
 *  - exact-term hits × 0.06, capped at 0.18.
 *
 * @param store Store handle used to read the excerpt.
 * @param capsuleId Capsule the citation belongs to.
 * @param citation Citation locating the excerpt.
 * @param profile Query profile supplying tokens, exact terms and weights.
 * @returns The bonus, or `0` when there are no query tokens or no usable excerpt.
 */
function lexicalContentBonus(store, capsuleId, citation, profile) {
    if (profile.tokens.length === 0) {
        return 0;
    }
    const excerpt = readCitationSearchExcerpt(store, capsuleId, citation, SEARCH_EXCERPT_MAX_CHARS, profile.contextBeforeChars);
    if (excerpt.length === 0) {
        return 0;
    }
    const excerptTokens = tokenise(excerpt);
    if (excerptTokens.length === 0) {
        return 0;
    }
    const searchable = normaliseForSearch(excerpt);
    const coverage = countTokenHits(profile.tokens, new Set(excerptTokens)) / profile.tokens.length;
    const coveragePart = Math.min(0.24, coverage * profile.lexicalWeight);
    const phrasePart = Math.min(0.16, countAdjacentPhraseHits(profile.tokens, searchable) * profile.phraseWeight);
    const exactPart = Math.min(0.18, countExactTermHits(profile.exactTerms, searchable) * 0.06);
    return (coveragePart + phrasePart + exactPart);
}
|
|
702
|
+
/**
 * Reads the stored text of the cited document and cuts an excerpt around the citation.
 *
 * The excerpt starts up to `beforeChars` before the citation's start and runs to the
 * citation's end plus whatever part of `maxChars` the leading context did not use. All
 * offsets are clamped to the text, and the result is trimmed.
 *
 * @param store Store handle; uses `store._internal.db` and `store._internal.contentCipher`.
 * @param capsuleId Capsule identifier; bound as a string.
 * @param citation Citation with `documentId` and optional `characterStart` / `characterEnd`.
 * @param maxChars Character budget used for the default end and the trailing context.
 * @param beforeChars Leading context to include before the citation start.
 * @returns The excerpt, or `""` when no text is stored for the document.
 */
function readCitationSearchExcerpt(store, capsuleId, citation, maxChars, beforeChars) {
    const sql = "SELECT normalized_text FROM document_texts WHERE capsule_id = :capsule_id AND document_id = :document_id";
    const row = store._internal.db.prepare(sql).get({
        capsule_id: String(capsuleId),
        document_id: String(citation.documentId),
    });
    const stored = row?.normalized_text;
    if (typeof stored !== "string") {
        return "";
    }
    const text = store._internal.contentCipher.openText(stored);
    if (typeof text !== "string" || text.length === 0) {
        return "";
    }
    const length = text.length;
    const focusStart = Math.max(0, Math.min(length, citation.characterStart ?? 0));
    const requestedEnd = citation.characterEnd ?? focusStart + maxChars;
    const focusEnd = Math.max(focusStart, Math.min(length, requestedEnd));
    const excerptStart = Math.max(0, focusStart - beforeChars);
    // Whatever the leading context did not consume is granted after the focus span.
    const trailingBudget = Math.max(0, maxChars - (focusStart - excerptStart));
    const excerptEnd = Math.min(length, focusEnd + trailingBudget);
    return text.slice(excerptStart, excerptEnd).trim();
}
|
|
720
|
+
/**
 * Counts how many entries of `tokens` are members of `haystack`.
 *
 * Repeated tokens are counted every time they occur.
 *
 * @param tokens Tokens to look up.
 * @param haystack Collection exposing `has(token)` (for example a `Set`).
 * @returns Number of tokens found.
 */
function countTokenHits(tokens, haystack) {
    let matched = 0;
    for (const token of tokens) {
        matched += haystack.has(token) ? 1 : 0;
    }
    return matched;
}
|
|
728
|
+
/**
 * Counts the pairs of neighbouring tokens that occur in the haystack as
 * `"<first> <second>"`.
 *
 * Matching is a plain substring test; pairs containing an `undefined` entry are skipped.
 *
 * @param tokens Query tokens in order.
 * @param normalisedHaystack Text to search in.
 * @returns Number of neighbouring pairs found.
 */
function countAdjacentPhraseHits(tokens, normalisedHaystack) {
    let matched = 0;
    for (let index = 1; index < tokens.length; index += 1) {
        const left = tokens[index - 1];
        const right = tokens[index];
        const pairIsComplete = left !== undefined && right !== undefined;
        if (pairIsComplete && normalisedHaystack.includes(`${left} ${right}`)) {
            matched += 1;
        }
    }
    return matched;
}
|
|
740
|
+
/**
 * Counts how many of the given terms occur in the haystack as substrings.
 *
 * @param terms Terms to look for.
 * @param normalisedHaystack Text to search in.
 * @returns Number of terms found.
 */
function countExactTermHits(terms, normalisedHaystack) {
    let matched = 0;
    for (const term of terms) {
        matched += normalisedHaystack.includes(term) ? 1 : 0;
    }
    return matched;
}
|
|
748
|
+
/**
 * Analyses a query and returns the ranking profile for the strategy that applies to it.
 *
 * @param query Query text.
 * @param requested Strategy requested by the caller, forwarded to `resolveQueryStrategy`.
 * @returns The exact, broad or balanced profile built from the query's unique tokens and
 *          exact terms.
 */
function profileQuery(query, requested) {
    const tokens = uniqueTokens(tokenise(query));
    const exactTerms = extractExactTerms(query);
    switch (resolveQueryStrategy(query, tokens, exactTerms, requested)) {
        case "exact":
            return exactQueryProfile(tokens, exactTerms);
        case "broad":
            return broadQueryProfile(tokens, exactTerms);
        default:
            return balancedQueryProfile(tokens, exactTerms);
    }
}
|
|
758
|
+
/**
 * Decides which ranking strategy a query should use.
 *
 * An explicit request other than `"auto"` always wins. Otherwise the query is `"exact"` when
 * any exact term is a strong lexical-recall term, `"broad"` when it has at least eight tokens
 * or matches `BROAD_QUERY_PATTERN`, and `"balanced"` in every other case.
 *
 * @param query Query text.
 * @param tokens Unique query tokens.
 * @param exactTerms Exact terms extracted from the query.
 * @param requested Strategy requested by the caller, `"auto"`, or `undefined`.
 * @returns The strategy to use.
 */
function resolveQueryStrategy(query, tokens, exactTerms, requested) {
    const callerChoseStrategy = requested !== undefined && requested !== "auto";
    if (callerChoseStrategy) {
        return requested;
    }
    if (exactTerms.some((term) => isStrongLexicalRecallTerm(term))) {
        return "exact";
    }
    const looksBroad = tokens.length >= 8 || BROAD_QUERY_PATTERN.test(query);
    return looksBroad ? "broad" : "balanced";
}
|
|
767
|
+
/**
 * Builds the ranking profile for the `"exact"` strategy.
 *
 * Compared with the other profiles in this file it uses the highest lexical, phrase and
 * metadata weights, twice the leading context, and the smallest diversity penalties.
 *
 * @param tokens Unique query tokens.
 * @param exactTerms Exact terms extracted from the query.
 * @returns The profile object.
 */
function exactQueryProfile(tokens, exactTerms) {
    const lexicalRecallTerms = buildLexicalRecallTerms(tokens, exactTerms);
    const contextBeforeChars = SEARCH_CONTEXT_BEFORE_CHARS * 2;
    return {
        strategy: "exact",
        tokens,
        exactTerms,
        lexicalRecallTerms,
        lexicalWeight: 0.22,
        phraseWeight: 0.06,
        metadataWeight: 0.16,
        contextBeforeChars,
        documentDiversityPenalty: 0.018,
        sectionDiversityPenalty: 0.01,
    };
}
|
|
781
|
+
/**
 * Builds the ranking profile for the `"broad"` strategy.
 *
 * Compared with the other profiles in this file it uses the lowest lexical, phrase and
 * metadata weights and the largest diversity penalties.
 *
 * @param tokens Unique query tokens.
 * @param exactTerms Exact terms extracted from the query.
 * @returns The profile object.
 */
function broadQueryProfile(tokens, exactTerms) {
    const lexicalRecallTerms = buildLexicalRecallTerms(tokens, exactTerms);
    const contextBeforeChars = SEARCH_CONTEXT_BEFORE_CHARS;
    return {
        strategy: "broad",
        tokens,
        exactTerms,
        lexicalRecallTerms,
        lexicalWeight: 0.16,
        phraseWeight: 0.04,
        metadataWeight: 0.1,
        contextBeforeChars,
        documentDiversityPenalty: 0.085,
        sectionDiversityPenalty: 0.035,
    };
}
|
|
795
|
+
/**
 * Builds the ranking profile for the `"balanced"` strategy.
 *
 * Its weights and diversity penalties sit between those of the exact and broad profiles in
 * this file.
 *
 * @param tokens Unique query tokens.
 * @param exactTerms Exact terms extracted from the query.
 * @returns The profile object.
 */
function balancedQueryProfile(tokens, exactTerms) {
    const lexicalRecallTerms = buildLexicalRecallTerms(tokens, exactTerms);
    const contextBeforeChars = SEARCH_CONTEXT_BEFORE_CHARS;
    return {
        strategy: "balanced",
        tokens,
        exactTerms,
        lexicalRecallTerms,
        lexicalWeight: 0.18,
        phraseWeight: 0.045,
        metadataWeight: 0.12,
        contextBeforeChars,
        documentDiversityPenalty: 0.045,
        sectionDiversityPenalty: 0.02,
    };
}
|
|
809
|
+
/**
 * Extracts the exact terms of a query.
 *
 * Every match of `EXACT_TERM_PATTERN` that `isExactTerm` accepts is normalised with
 * `normaliseForSearch`; empty results are dropped and duplicates removed, keeping first
 * occurrence order.
 *
 * @param value Query text.
 * @returns Unique normalised exact terms.
 */
function extractExactTerms(value) {
    const normalisedTerms = Array.from(value.matchAll(EXACT_TERM_PATTERN), (match) => match[0])
        .filter((raw) => isExactTerm(raw))
        .map((raw) => normaliseForSearch(raw))
        .filter((term) => term.length > 0);
    return uniqueTokens(normalisedTerms);
}
|
|
822
|
+
/**
 * Tells whether a raw query fragment should be treated as an exact term.
 *
 * A fragment qualifies when it contains a digit, one of the characters `. _ : / # -`, or a
 * lower-case ASCII letter directly followed by an upper-case one. Failing that, it qualifies
 * when it is at least three characters long, unchanged by upper-casing, and contains a letter.
 *
 * @param value Raw fragment (not yet normalised).
 * @returns `true` when the fragment is an exact term.
 */
function isExactTerm(value) {
    const structuralSignals = [/\d/u, /[._:/#-]/u, /[a-z][A-Z]/u];
    if (structuralSignals.some((signal) => signal.test(value))) {
        return true;
    }
    const isLongEnough = value.length >= 3;
    return isLongEnough && value === value.toUpperCase() && /\p{L}/u.test(value);
}
|
|
831
|
+
/**
 * Tells whether a string contains at least one digit and at least one Unicode letter.
 *
 * @param value String to inspect.
 * @returns `true` when both a digit and a letter are present.
 */
function hasDigitAndLetter(value) {
    const containsDigit = /\d/u.test(value);
    return containsDigit && /\p{L}/u.test(value);
}
|
|
834
|
+
/**
 * Tells whether a string contains a Unicode letter and is left unchanged by upper-casing.
 *
 * @param value String to inspect.
 * @returns `true` when the string has a letter and equals its own upper-case form.
 */
function isUppercaseLetterTerm(value) {
    const containsLetter = /\p{L}/u.test(value);
    return value.toUpperCase() === value && containsLetter;
}
|
|
837
|
+
/**
 * Tells whether a term is distinctive enough to switch the query to the exact strategy.
 *
 * A term is strong when any of these holds:
 *  - length ≥ 4 and it contains one of `. _ : / # -`;
 *  - length ≥ 4 and it mixes digits with letters;
 *  - length ≥ 8 and it contains a letter;
 *  - length ≥ 6 and it is an upper-case letter term.
 *
 * NOTE(review): `extractExactTerms` lower-cases its terms via `normaliseForSearch`, so for
 * terms coming from there the upper-case rule may rarely apply — confirm the intended input.
 *
 * @param value Term to inspect.
 * @returns `true` when the term is strong.
 */
function isStrongLexicalRecallTerm(value) {
    const size = value.length;
    if (size >= 4 && /[._:/#-]/u.test(value)) {
        return true;
    }
    if (size >= 4 && hasDigitAndLetter(value)) {
        return true;
    }
    if (size >= 8 && /\p{L}/u.test(value)) {
        return true;
    }
    return size >= 6 && isUppercaseLetterTerm(value);
}
|
|
846
|
+
/**
 * Removes duplicates from a token list, keeping the first occurrence of each token.
 *
 * @param tokens Tokens, possibly with repeats.
 * @returns A new array of distinct tokens in first-seen order.
 */
function uniqueTokens(tokens) {
    const distinct = new Set(tokens);
    return Array.from(distinct);
}
|
|
849
|
+
/**
 * Assembles the terms used for lexical recall.
 *
 * Exact terms come first, followed by tokens of at least
 * `LEXICAL_RECALL_MIN_TOKEN_LENGTH` characters; duplicates are removed and the list is cut to
 * `LEXICAL_RECALL_MAX_TERMS` entries.
 *
 * @param tokens Unique query tokens.
 * @param exactTerms Exact terms extracted from the query.
 * @returns The recall terms in priority order.
 */
function buildLexicalRecallTerms(tokens, exactTerms) {
    const isLongEnough = (token) => token.length >= LEXICAL_RECALL_MIN_TOKEN_LENGTH;
    const prioritised = [...exactTerms, ...tokens.filter(isLongEnough)];
    return uniqueTokens(prioritised).slice(0, LEXICAL_RECALL_MAX_TERMS);
}
|
|
853
|
+
/**
 * Splits text into search tokens.
 *
 * The text is normalised with `normaliseForSearch`, split on every run of characters that
 * are neither letters nor numbers, and filtered to tokens of at least two characters that
 * are not in `SEARCH_STOPWORDS`.
 *
 * @param value Text to tokenise.
 * @returns Tokens in order of appearance (duplicates are kept).
 */
function tokenise(value) {
    const pieces = normaliseForSearch(value).split(/[^\p{L}\p{N}]+/u);
    const isSearchable = (piece) => piece.length >= 2 && !SEARCH_STOPWORDS.has(piece);
    return pieces.filter(isSearchable);
}
|
|
858
|
+
/**
 * Normalises text for accent- and case-insensitive comparison.
 *
 * The text is decomposed with NFKD, all combining marks are removed, the result is
 * lower-cased and every `ß` is replaced by `ss`.
 *
 * @param value Text to normalise.
 * @returns The normalised text.
 */
function normaliseForSearch(value) {
    const decomposed = value.normalize("NFKD");
    const withoutMarks = decomposed.replace(/\p{Mark}+/gu, "");
    const lowered = withoutMarks.toLowerCase();
    return lowered.replace(/ß/gu, "ss");
}
|