knolo-core 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +196 -61
- package/dist/builder.d.ts +0 -5
- package/dist/builder.js +38 -37
- package/dist/pack.d.ts +3 -29
- package/dist/pack.js +50 -44
- package/dist/quality/diversify.d.ts +13 -0
- package/dist/quality/diversify.js +41 -0
- package/dist/quality/proximity.d.ts +2 -0
- package/dist/quality/proximity.js +31 -0
- package/dist/quality/signature.d.ts +3 -0
- package/dist/quality/signature.js +24 -0
- package/dist/quality/similarity.d.ts +3 -0
- package/dist/quality/similarity.js +27 -0
- package/dist/query.d.ts +1 -7
- package/dist/query.js +129 -70
- package/dist/rank.d.ts +7 -6
- package/dist/rank.js +6 -19
- package/dist/utils/utf8.d.ts +8 -0
- package/dist/utils/utf8.js +72 -0
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -1,23 +1,29 @@
|
|
|
1
1
|
|
|
2
|
-
|
|
3
2
|
# 🧠 KnoLo Core
|
|
4
3
|
|
|
5
4
|
[](https://www.npmjs.com/package/knolo-core)
|
|
6
5
|
[](https://www.npmjs.com/package/knolo-core)
|
|
7
6
|
[](./LICENSE)
|
|
8
7
|
|
|
9
|
-
**KnoLo Core** is a **local-first knowledge base
|
|
10
|
-
|
|
8
|
+
**KnoLo Core** is a **local-first knowledge base** for small LLMs.
|
|
9
|
+
Package documents into a compact `.knolo` file and query them deterministically —
|
|
10
|
+
**no embeddings, no vector DB, no cloud**. Ideal for **on‑device / offline** assistants.
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
|
-
## ✨
|
|
14
|
+
## ✨ Highlights (v0.2.0)
|
|
15
|
+
|
|
16
|
+
* 🔎 **Stronger relevance:**
|
|
15
17
|
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
18
|
+
* **Required phrase enforcement** (quoted & `requirePhrases`)
|
|
19
|
+
* **Proximity bonus** using minimal term-span cover
|
|
20
|
+
* **Optional heading boosts** when headings are present
|
|
21
|
+
* 🌀 **Duplicate-free results:** **near-duplicate suppression** + **MMR diversity**
|
|
22
|
+
* 🧮 **KNS tie‑breaker:** lightweight numeric signature to stabilize close ties
|
|
23
|
+
* ⚡ **Faster & leaner:** precomputed `avgBlockLen` in pack metadata
|
|
24
|
+
* 📱 **Works in Expo/React Native:** safe TextEncoder/TextDecoder ponyfills
|
|
25
|
+
* 📑 **Context Patches:** LLM‑friendly snippets for prompts
|
|
26
|
+
* 🔒 **Local & private:** everything runs on device
|
|
21
27
|
|
|
22
28
|
---
|
|
23
29
|
|
|
@@ -27,7 +33,7 @@ It lets you package your own documents into a compact `.knolo` file and query th
|
|
|
27
33
|
npm install knolo-core
|
|
28
34
|
```
|
|
29
35
|
|
|
30
|
-
|
|
36
|
+
Dev from source:
|
|
31
37
|
|
|
32
38
|
```bash
|
|
33
39
|
git clone https://github.com/yourname/knolo-core.git
|
|
@@ -38,113 +44,242 @@ npm run build
|
|
|
38
44
|
|
|
39
45
|
---
|
|
40
46
|
|
|
41
|
-
## 🚀 Usage
|
|
47
|
+
## 🚀 Usage
|
|
42
48
|
|
|
43
|
-
### 1
|
|
49
|
+
### 1) Node.js (build → mount → query → patch)
|
|
44
50
|
|
|
45
|
-
```
|
|
51
|
+
```ts
|
|
46
52
|
import { buildPack, mountPack, query, makeContextPatch } from "knolo-core";
|
|
47
53
|
|
|
48
54
|
const docs = [
|
|
49
|
-
{ heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events
|
|
50
|
-
{ heading: "Throttling",
|
|
51
|
-
{ heading: "Debounce vs Throttle", text: "Debounce waits for silence
|
|
55
|
+
{ id: "guide", heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events..." },
|
|
56
|
+
{ id: "throttle", heading: "Throttling", text: "Throttling reduces frequency of events to avoid flooding the bridge." },
|
|
57
|
+
{ id: "dvst", heading: "Debounce vs Throttle", text: "Debounce waits for silence; throttle guarantees a max rate." }
|
|
52
58
|
];
|
|
53
59
|
|
|
54
|
-
|
|
55
|
-
const
|
|
60
|
+
const bytes = await buildPack(docs); // build .knolo bytes
|
|
61
|
+
const kb = await mountPack({ src: bytes }); // mount in-memory
|
|
62
|
+
const hits = query(kb, '“react native” throttle', // quotes enforce phrase
|
|
63
|
+
{ topK: 5, requirePhrases: ["max rate"] });
|
|
56
64
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
65
|
+
console.log(hits);
|
|
66
|
+
/*
|
|
67
|
+
[
|
|
68
|
+
{ blockId: 2, score: 6.73, text: "...", source: "dvst" },
|
|
69
|
+
...
|
|
70
|
+
]
|
|
71
|
+
*/
|
|
63
72
|
|
|
64
|
-
// Turn into an LLM-friendly context patch
|
|
65
73
|
const patch = makeContextPatch(hits, { budget: "small" });
|
|
66
|
-
console.log(
|
|
74
|
+
console.log(patch);
|
|
67
75
|
```
|
|
68
76
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
### 2. CLI (build `.knolo` file)
|
|
77
|
+
### 2) CLI (build a `.knolo` file)
|
|
72
78
|
|
|
73
|
-
|
|
79
|
+
Create `docs.json`:
|
|
74
80
|
|
|
75
81
|
```json
|
|
76
82
|
[
|
|
77
|
-
{ "heading": "Guide", "text": "Install deps
|
|
78
|
-
{ "heading": "FAQ",
|
|
83
|
+
{ "id": "guide", "heading": "Guide", "text": "Install deps...\n\n## Throttle\nLimit frequency of events." },
|
|
84
|
+
{ "id": "faq", "heading": "FAQ", "text": "What is throttling? It reduces event frequency." }
|
|
79
85
|
]
|
|
80
86
|
```
|
|
81
87
|
|
|
82
|
-
|
|
88
|
+
Build:
|
|
83
89
|
|
|
84
90
|
```bash
|
|
85
|
-
# writes
|
|
86
|
-
npx knolo docs.json
|
|
91
|
+
# writes knowledge.knolo
|
|
92
|
+
npx knolo docs.json knowledge.knolo
|
|
87
93
|
```
|
|
88
94
|
|
|
89
|
-
|
|
95
|
+
Then load it in your app:
|
|
90
96
|
|
|
91
|
-
```
|
|
97
|
+
```ts
|
|
92
98
|
import { mountPack, query } from "knolo-core";
|
|
93
|
-
|
|
94
|
-
const kb = await mountPack({ src: "./mypack.knolo" });
|
|
99
|
+
const kb = await mountPack({ src: "./knowledge.knolo" });
|
|
95
100
|
const hits = query(kb, "throttle events", { topK: 3 });
|
|
96
|
-
console.log(hits);
|
|
97
101
|
```
|
|
98
102
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
### 3. React / Expo (load from asset)
|
|
103
|
+
### 3) React / Expo
|
|
102
104
|
|
|
103
105
|
```ts
|
|
104
106
|
import { Asset } from "expo-asset";
|
|
105
107
|
import * as FileSystem from "expo-file-system";
|
|
106
|
-
import { mountPack, query
|
|
108
|
+
import { mountPack, query } from "knolo-core";
|
|
107
109
|
|
|
108
|
-
async function
|
|
109
|
-
const asset = Asset.fromModule(require("./assets/
|
|
110
|
+
async function loadKB() {
|
|
111
|
+
const asset = Asset.fromModule(require("./assets/knowledge.knolo"));
|
|
110
112
|
await asset.downloadAsync();
|
|
111
113
|
|
|
112
114
|
const base64 = await FileSystem.readAsStringAsync(asset.localUri!, { encoding: FileSystem.EncodingType.Base64 });
|
|
113
115
|
const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
|
|
114
116
|
|
|
115
117
|
const kb = await mountPack({ src: bytes.buffer });
|
|
116
|
-
|
|
117
|
-
return makeContextPatch(hits, { budget: "mini" });
|
|
118
|
+
return query(kb, `“react native” throttling`, { topK: 5 });
|
|
118
119
|
}
|
|
119
120
|
```
|
|
120
121
|
|
|
121
122
|
---
|
|
122
123
|
|
|
123
|
-
## 📑 API
|
|
124
|
+
## 📑 API
|
|
125
|
+
|
|
126
|
+
### `buildPack(docs) -> Promise<Uint8Array>`
|
|
127
|
+
|
|
128
|
+
Builds a pack from an array of documents.
|
|
124
129
|
|
|
125
130
|
```ts
|
|
126
|
-
|
|
127
|
-
|
|
131
|
+
type BuildInputDoc = {
|
|
132
|
+
id?: string; // optional doc id (exposed as hit.source)
|
|
133
|
+
heading?: string; // optional heading (used for boosts)
|
|
134
|
+
text: string; // raw markdown accepted (lightly stripped)
|
|
135
|
+
};
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
* Stores optional `heading` and `id` alongside each block.
|
|
139
|
+
* Computes and persists `meta.stats.avgBlockLen` for faster queries.
|
|
140
|
+
|
|
141
|
+
### `mountPack({ src }) -> Promise<Pack>`
|
|
142
|
+
|
|
143
|
+
Loads a pack from a URL, `Uint8Array`, or `ArrayBuffer`.
|
|
144
|
+
|
|
145
|
+
```ts
|
|
146
|
+
type Pack = {
|
|
147
|
+
meta: { version: number; stats: { docs: number; blocks: number; terms: number; avgBlockLen?: number } };
|
|
148
|
+
lexicon: Map<string, number>;
|
|
149
|
+
postings: Uint32Array;
|
|
150
|
+
blocks: string[];
|
|
151
|
+
headings?: (string | null)[];
|
|
152
|
+
docIds?: (string | null)[];
|
|
153
|
+
};
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
> **Compatibility:** v0.2.0 reads both v1 packs (string-only blocks) and v2 packs (objects with `text/heading/docId`).
|
|
128
157
|
|
|
129
|
-
|
|
130
|
-
mountPack({ src: string | Uint8Array | ArrayBuffer }) -> Promise<Pack>
|
|
158
|
+
### `query(pack, q, opts) -> Hit[]`
|
|
131
159
|
|
|
132
|
-
|
|
133
|
-
query(pack, "your query", { topK?: number, requirePhrases?: string[] }) -> Hit[]
|
|
160
|
+
Deterministic lexical search with phrase enforcement, proximity, and de‑duplication.
|
|
134
161
|
|
|
135
|
-
|
|
136
|
-
|
|
162
|
+
```ts
|
|
163
|
+
type QueryOptions = {
|
|
164
|
+
topK?: number; // default 10
|
|
165
|
+
requirePhrases?: string[]; // additional phrases to require (unquoted)
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
type Hit = {
|
|
169
|
+
blockId: number;
|
|
170
|
+
score: number;
|
|
171
|
+
text: string;
|
|
172
|
+
source?: string; // docId if provided at build time
|
|
173
|
+
};
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**What happens under the hood (v0.2.0):**
|
|
177
|
+
|
|
178
|
+
* Tokenize + **enforce all phrases** (quoted in `q` and `requirePhrases`)
|
|
179
|
+
* Candidate generation via inverted index
|
|
180
|
+
* **Proximity bonus** using minimal window covering all query terms
|
|
181
|
+
* Optional **heading overlap boost** (when headings are present)
|
|
182
|
+
* Tiny **KNS** numeric-signature tie‑breaker (\~±2% influence)
|
|
183
|
+
* **Near-duplicate suppression** (5‑gram Jaccard) + **MMR** diversity for top‑K
|
|
184
|
+
|
|
185
|
+
### `makeContextPatch(hits, { budget }) -> ContextPatch`
|
|
186
|
+
|
|
187
|
+
Create structured snippets for LLM prompts.
|
|
188
|
+
|
|
189
|
+
```ts
|
|
190
|
+
type ContextPatch = {
|
|
191
|
+
background: string[];
|
|
192
|
+
snippets: Array<{ text: string; source?: string }>;
|
|
193
|
+
definitions: Array<{ term: string; def: string; evidence?: number[] }>;
|
|
194
|
+
facts: Array<{ s: string; p: string; o: string; evidence?: number[] }>;
|
|
195
|
+
};
|
|
137
196
|
```
|
|
138
197
|
|
|
198
|
+
Budgets: `"mini" | "small" | "full"`.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## 🧠 Relevance & De‑dupe Details
|
|
203
|
+
|
|
204
|
+
* **Phrases:**
|
|
205
|
+
Quoted phrases in the query (e.g., `“react native”`) and any `requirePhrases` **must appear** in results. Candidates failing this are dropped before ranking.
|
|
206
|
+
|
|
207
|
+
* **Proximity:**
|
|
208
|
+
We compute the **minimum span** that covers all query terms and apply a gentle multiplier:
|
|
209
|
+
`1 + 0.15 / (1 + span)` (bounded, stable).
|
|
210
|
+
|
|
211
|
+
* **Heading Boost:**
|
|
212
|
+
If you provide headings at build time, overlap with query terms boosts the score proportionally to the fraction of unique query terms present in the heading.
|
|
213
|
+
|
|
214
|
+
* **Duplicate Control:**
|
|
215
|
+
We use **5‑gram Jaccard** to filter near‑duplicates and **MMR** (λ≈0.8) to promote diversity within the top‑K.
|
|
216
|
+
|
|
217
|
+
* **KNS Signature (optional spice):**
|
|
218
|
+
A tiny numeric signature provides deterministic tie‑breaking without changing the overall retrieval behavior.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## 🛠 Input Format & Pack Layout
|
|
223
|
+
|
|
224
|
+
**Input docs:**
|
|
225
|
+
`{ id?: string, heading?: string, text: string }`
|
|
226
|
+
|
|
227
|
+
**Pack layout (binary):**
|
|
228
|
+
`[metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]`
|
|
229
|
+
|
|
230
|
+
* `meta.stats.avgBlockLen` is persisted (v2).
|
|
231
|
+
* `blocks JSON` may be:
|
|
232
|
+
|
|
233
|
+
* **v1:** `string[]` (text only)
|
|
234
|
+
* **v2:** `{ text, heading?, docId? }[]`
|
|
235
|
+
|
|
236
|
+
The runtime auto‑detects either format.
|
|
237
|
+
|
|
139
238
|
---
|
|
140
239
|
|
|
141
|
-
##
|
|
240
|
+
## 🔁 Migration (0.1.x → 0.2.0)
|
|
241
|
+
|
|
242
|
+
* **No API breaks.** `buildPack`, `mountPack`, `query`, `makeContextPatch` unchanged.
|
|
243
|
+
* Packs built with 0.1.x still load and query fine.
|
|
244
|
+
* If you want heading boosts and `hit.source`, pass `heading` and `id` to `buildPack`.
|
|
245
|
+
* React Native/Expo users no longer need polyfills—ponyfills are included.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## ⚡ Performance Tips
|
|
250
|
+
|
|
251
|
+
* Prefer multiple smaller blocks (≈512 tokens) over giant ones for better recall + proximity.
|
|
252
|
+
* Provide `heading` for each block: cheap, high‑signal boost.
|
|
253
|
+
* For large corpora, consider sharding packs by domain/topic to keep per‑pack size modest.
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## ❓ FAQ
|
|
258
|
+
|
|
259
|
+
**Q: Does this use embeddings?**
|
|
260
|
+
No. Pure lexical retrieval (index, positions, BM25L, proximity, phrases).
|
|
261
|
+
|
|
262
|
+
**Q: Can I run this offline?**
|
|
263
|
+
Yes. Everything is local.
|
|
264
|
+
|
|
265
|
+
**Q: How do I prevent duplicates?**
|
|
266
|
+
It’s built in (Jaccard + MMR). You can tune λ and similarity threshold in code if you fork.
|
|
267
|
+
|
|
268
|
+
**Q: Is RN/Expo supported?**
|
|
269
|
+
Yes—TextEncoder/TextDecoder ponyfills are included.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## 🗺️ Roadmap
|
|
142
274
|
|
|
143
275
|
* Multi-resolution packs (summaries + facts)
|
|
144
|
-
* Overlay
|
|
145
|
-
* WASM core for
|
|
276
|
+
* Overlay layers (user annotations)
|
|
277
|
+
* WASM core for big-browser indexing
|
|
278
|
+
* Delta updates / append-only patch packs
|
|
146
279
|
|
|
147
280
|
---
|
|
148
281
|
|
|
149
|
-
|
|
282
|
+
## 📄 License
|
|
283
|
+
|
|
284
|
+
MIT — see [LICENSE](./LICENSE).
|
|
150
285
|
|
package/dist/builder.d.ts
CHANGED
|
@@ -3,9 +3,4 @@ export type BuildInputDoc = {
|
|
|
3
3
|
heading?: string;
|
|
4
4
|
text: string;
|
|
5
5
|
};
|
|
6
|
-
/** Build a `.knolo` pack from an array of input documents. At present each
|
|
7
|
-
* document becomes a single block. Future versions may split documents into
|
|
8
|
-
* multiple blocks based on headings or token count to improve retrieval
|
|
9
|
-
* granularity.
|
|
10
|
-
*/
|
|
11
6
|
export declare function buildPack(docs: BuildInputDoc[]): Promise<Uint8Array>;
|
package/dist/builder.js
CHANGED
|
@@ -1,34 +1,44 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* builder.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* `id`, `heading` and `text` fields. The builder performs simple
|
|
7
|
-
* Markdown stripping and calls the indexer to generate the inverted index. The
|
|
8
|
-
* resulting pack binary can be persisted to disk or served directly to
|
|
9
|
-
* clients.
|
|
4
|
+
* Build `.knolo` packs from input docs. Now persists optional headings/docIds
|
|
5
|
+
* and stores avgBlockLen in meta for faster/easier normalization at query-time.
|
|
10
6
|
*/
|
|
11
7
|
import { buildIndex } from './indexer.js';
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
* multiple blocks based on headings or token count to improve retrieval
|
|
15
|
-
* granularity.
|
|
16
|
-
*/
|
|
8
|
+
import { tokenize } from './tokenize.js';
|
|
9
|
+
import { getTextEncoder } from './utils/utf8.js';
|
|
17
10
|
export async function buildPack(docs) {
|
|
18
|
-
//
|
|
19
|
-
|
|
20
|
-
|
|
11
|
+
// Prepare blocks (strip MD) and carry heading/docId for optional boosts.
|
|
12
|
+
const blocks = docs.map((d, i) => ({
|
|
13
|
+
id: i,
|
|
14
|
+
text: stripMd(d.text),
|
|
15
|
+
heading: d.heading,
|
|
16
|
+
}));
|
|
17
|
+
// Build index
|
|
21
18
|
const { lexicon, postings } = buildIndex(blocks);
|
|
19
|
+
// Compute avg token length once (store in meta)
|
|
20
|
+
const totalTokens = blocks.reduce((sum, b) => sum + tokenize(b.text).length, 0);
|
|
21
|
+
const avgBlockLen = blocks.length ? totalTokens / blocks.length : 1;
|
|
22
22
|
const meta = {
|
|
23
|
-
version:
|
|
24
|
-
stats: {
|
|
23
|
+
version: 2,
|
|
24
|
+
stats: {
|
|
25
|
+
docs: docs.length,
|
|
26
|
+
blocks: blocks.length,
|
|
27
|
+
terms: lexicon.length,
|
|
28
|
+
avgBlockLen,
|
|
29
|
+
},
|
|
25
30
|
};
|
|
26
|
-
//
|
|
27
|
-
const
|
|
31
|
+
// Persist blocks as objects to optionally carry heading/docId
|
|
32
|
+
const blocksPayload = blocks.map((b, i) => ({
|
|
33
|
+
text: b.text,
|
|
34
|
+
heading: b.heading ?? null,
|
|
35
|
+
docId: docs[i]?.id ?? null,
|
|
36
|
+
}));
|
|
37
|
+
// Encode sections
|
|
38
|
+
const enc = getTextEncoder();
|
|
28
39
|
const metaBytes = enc.encode(JSON.stringify(meta));
|
|
29
40
|
const lexBytes = enc.encode(JSON.stringify(lexicon));
|
|
30
|
-
const blocksBytes = enc.encode(JSON.stringify(
|
|
31
|
-
// Compute lengths and allocate output
|
|
41
|
+
const blocksBytes = enc.encode(JSON.stringify(blocksPayload));
|
|
32
42
|
const totalLength = 4 + metaBytes.length +
|
|
33
43
|
4 + lexBytes.length +
|
|
34
44
|
4 + postings.length * 4 +
|
|
@@ -46,36 +56,27 @@ export async function buildPack(docs) {
|
|
|
46
56
|
offset += 4;
|
|
47
57
|
out.set(lexBytes, offset);
|
|
48
58
|
offset += lexBytes.length;
|
|
49
|
-
// postings
|
|
59
|
+
// postings (alignment-safe via DataView)
|
|
50
60
|
dv.setUint32(offset, postings.length, true);
|
|
51
61
|
offset += 4;
|
|
52
|
-
|
|
53
|
-
|
|
62
|
+
for (let i = 0; i < postings.length; i++) {
|
|
63
|
+
dv.setUint32(offset, postings[i], true);
|
|
64
|
+
offset += 4;
|
|
65
|
+
}
|
|
54
66
|
// blocks
|
|
55
67
|
dv.setUint32(offset, blocksBytes.length, true);
|
|
56
68
|
offset += 4;
|
|
57
69
|
out.set(blocksBytes, offset);
|
|
58
70
|
return out;
|
|
59
71
|
}
|
|
60
|
-
/** Strip Markdown syntax
|
|
61
|
-
* `marked` library is used for parsing and rendering. A very naive HTML tag
|
|
62
|
-
* stripper removes tags by dropping anything between `<` and `>`. This is
|
|
63
|
-
* simplistic but adequate for plain text extraction.
|
|
64
|
-
*/
|
|
72
|
+
/** Strip Markdown syntax with lightweight regexes (no deps). */
|
|
65
73
|
function stripMd(md) {
|
|
66
|
-
|
|
67
|
-
let text = md.replace(/```[^```]*```/g, ' ');
|
|
68
|
-
// Remove inline code backticks
|
|
74
|
+
let text = md.replace(/```[\s\S]*?```/g, ' ');
|
|
69
75
|
text = text.replace(/`[^`]*`/g, ' ');
|
|
70
|
-
// Remove emphasis markers (*, _, ~)
|
|
71
76
|
text = text.replace(/[\*_~]+/g, ' ');
|
|
72
|
-
// Remove headings (#)
|
|
73
77
|
text = text.replace(/^#+\s*/gm, '');
|
|
74
|
-
|
|
75
|
-
text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1');
|
|
76
|
-
// Remove any remaining brackets
|
|
78
|
+
text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
|
|
77
79
|
text = text.replace(/[\[\]()]/g, ' ');
|
|
78
|
-
// Collapse whitespace
|
|
79
80
|
text = text.replace(/\s+/g, ' ').trim();
|
|
80
81
|
return text;
|
|
81
82
|
}
|
package/dist/pack.d.ts
CHANGED
|
@@ -1,47 +1,21 @@
|
|
|
1
1
|
export type MountOptions = {
|
|
2
2
|
src: string | ArrayBufferLike | Uint8Array;
|
|
3
3
|
};
|
|
4
|
-
/** Metadata about the pack. Version numbers should increment with format
|
|
5
|
-
* changes, allowing the runtime to adapt accordingly. */
|
|
6
4
|
export type PackMeta = {
|
|
7
5
|
version: number;
|
|
8
6
|
stats: {
|
|
9
7
|
docs: number;
|
|
10
8
|
blocks: number;
|
|
11
9
|
terms: number;
|
|
10
|
+
avgBlockLen?: number;
|
|
12
11
|
};
|
|
13
12
|
};
|
|
14
|
-
/**
|
|
15
|
-
* A mounted pack exposing the inverted index, block text and optional field
|
|
16
|
-
* metadata. The core runtime reads from these structures directly at query
|
|
17
|
-
* time.
|
|
18
|
-
*/
|
|
19
13
|
export type Pack = {
|
|
20
14
|
meta: PackMeta;
|
|
21
|
-
/** Map of token to term identifier used in the postings list. */
|
|
22
15
|
lexicon: Map<string, number>;
|
|
23
|
-
/** Flattened postings list where each term section starts with the termId
|
|
24
|
-
* followed by (blockId, positions..., 0) tuples, ending with a 0.
|
|
25
|
-
*/
|
|
26
16
|
postings: Uint32Array;
|
|
27
|
-
/** Array of block texts. Each block corresponds to a chunk of the original
|
|
28
|
-
* documents. The blockId used in the postings list indexes into this array.
|
|
29
|
-
*/
|
|
30
17
|
blocks: string[];
|
|
18
|
+
headings?: (string | null)[];
|
|
19
|
+
docIds?: (string | null)[];
|
|
31
20
|
};
|
|
32
|
-
/**
|
|
33
|
-
* Load a `.knolo` pack from a variety of sources. The pack binary layout is
|
|
34
|
-
* currently:
|
|
35
|
-
*
|
|
36
|
-
* [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
|
|
37
|
-
*
|
|
38
|
-
* All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
|
|
39
|
-
* the byte lengths of the subsequent JSON sections. `postCount` is the number
|
|
40
|
-
* of 32‑bit integers in the postings array. This simple layout is sufficient
|
|
41
|
-
* for v0 and avoids any additional dependencies beyond standard typed arrays.
|
|
42
|
-
*
|
|
43
|
-
* @param opts Options specifying how to load the pack. Accepts a URL string,
|
|
44
|
-
* ArrayBuffer, or Uint8Array.
|
|
45
|
-
* @returns A Promise resolving to a mounted pack with the index and blocks.
|
|
46
|
-
*/
|
|
47
21
|
export declare function mountPack(opts: MountOptions): Promise<Pack>;
|
package/dist/pack.js
CHANGED
|
@@ -1,77 +1,83 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* pack.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* portable across Node.js and browser environments.
|
|
9
|
-
*/
|
|
10
|
-
/**
|
|
11
|
-
* Load a `.knolo` pack from a variety of sources. The pack binary layout is
|
|
12
|
-
* currently:
|
|
13
|
-
*
|
|
14
|
-
* [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
|
|
15
|
-
*
|
|
16
|
-
* All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
|
|
17
|
-
* the byte lengths of the subsequent JSON sections. `postCount` is the number
|
|
18
|
-
* of 32‑bit integers in the postings array. This simple layout is sufficient
|
|
19
|
-
* for v0 and avoids any additional dependencies beyond standard typed arrays.
|
|
20
|
-
*
|
|
21
|
-
* @param opts Options specifying how to load the pack. Accepts a URL string,
|
|
22
|
-
* ArrayBuffer, or Uint8Array.
|
|
23
|
-
* @returns A Promise resolving to a mounted pack with the index and blocks.
|
|
4
|
+
* Mount `.knolo` packs across Node, browsers, and RN/Expo. Now tolerant of:
|
|
5
|
+
* - blocks as string[] (v1) or object[] with { text, heading?, docId? } (v2)
|
|
6
|
+
* - meta.stats.avgBlockLen (optional)
|
|
7
|
+
* Includes RN/Expo-safe TextDecoder via ponyfill.
|
|
24
8
|
*/
|
|
9
|
+
import { getTextDecoder } from './utils/utf8.js';
|
|
25
10
|
export async function mountPack(opts) {
|
|
26
11
|
const buf = await resolveToBuffer(opts.src);
|
|
27
12
|
const dv = new DataView(buf);
|
|
13
|
+
const dec = getTextDecoder();
|
|
28
14
|
let offset = 0;
|
|
29
|
-
//
|
|
15
|
+
// meta
|
|
30
16
|
const metaLen = dv.getUint32(offset, true);
|
|
31
17
|
offset += 4;
|
|
32
|
-
const metaJson =
|
|
18
|
+
const metaJson = dec.decode(new Uint8Array(buf, offset, metaLen));
|
|
33
19
|
offset += metaLen;
|
|
34
20
|
const meta = JSON.parse(metaJson);
|
|
35
|
-
//
|
|
21
|
+
// lexicon
|
|
36
22
|
const lexLen = dv.getUint32(offset, true);
|
|
37
23
|
offset += 4;
|
|
38
|
-
const lexJson =
|
|
24
|
+
const lexJson = dec.decode(new Uint8Array(buf, offset, lexLen));
|
|
39
25
|
offset += lexLen;
|
|
40
26
|
const lexEntries = JSON.parse(lexJson);
|
|
41
27
|
const lexicon = new Map(lexEntries);
|
|
42
|
-
//
|
|
28
|
+
// postings
|
|
43
29
|
const postCount = dv.getUint32(offset, true);
|
|
44
30
|
offset += 4;
|
|
45
|
-
const postings = new Uint32Array(
|
|
46
|
-
|
|
47
|
-
|
|
31
|
+
const postings = new Uint32Array(postCount);
|
|
32
|
+
for (let i = 0; i < postCount; i++) {
|
|
33
|
+
postings[i] = dv.getUint32(offset, true);
|
|
34
|
+
offset += 4;
|
|
35
|
+
}
|
|
36
|
+
// blocks (v1: string[]; v2: {text, heading?, docId?}[])
|
|
48
37
|
const blocksLen = dv.getUint32(offset, true);
|
|
49
38
|
offset += 4;
|
|
50
|
-
const blocksJson =
|
|
51
|
-
const
|
|
52
|
-
|
|
39
|
+
const blocksJson = dec.decode(new Uint8Array(buf, offset, blocksLen));
|
|
40
|
+
const parsed = JSON.parse(blocksJson);
|
|
41
|
+
let blocks = [];
|
|
42
|
+
let headings;
|
|
43
|
+
let docIds;
|
|
44
|
+
if (Array.isArray(parsed) && parsed.length && typeof parsed[0] === 'string') {
|
|
45
|
+
// v1
|
|
46
|
+
blocks = parsed;
|
|
47
|
+
}
|
|
48
|
+
else if (Array.isArray(parsed)) {
|
|
49
|
+
blocks = [];
|
|
50
|
+
headings = [];
|
|
51
|
+
docIds = [];
|
|
52
|
+
for (const it of parsed) {
|
|
53
|
+
if (it && typeof it === 'object') {
|
|
54
|
+
blocks.push(String(it.text ?? ''));
|
|
55
|
+
headings.push(it.heading ?? null);
|
|
56
|
+
docIds.push(it.docId ?? null);
|
|
57
|
+
}
|
|
58
|
+
else {
|
|
59
|
+
blocks.push(String(it ?? ''));
|
|
60
|
+
headings.push(null);
|
|
61
|
+
docIds.push(null);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
else {
|
|
66
|
+
blocks = [];
|
|
67
|
+
}
|
|
68
|
+
return { meta, lexicon, postings, blocks, headings, docIds };
|
|
53
69
|
}
|
|
54
|
-
/** Resolve the `src` field of MountOptions into an ArrayBuffer. Supports:
|
|
55
|
-
* - strings interpreted as URLs (via fetch)
|
|
56
|
-
* - Uint8Array and ArrayBuffer inputs
|
|
57
|
-
*/
|
|
58
70
|
async function resolveToBuffer(src) {
|
|
59
71
|
if (typeof src === 'string') {
|
|
60
|
-
// Use fetch for browser and Node environments. For Node this requires the
|
|
61
|
-
// global fetch API (available since Node 18). Error handling is delegated
|
|
62
|
-
// to the caller.
|
|
63
72
|
const res = await fetch(src);
|
|
64
|
-
|
|
65
|
-
return ab;
|
|
73
|
+
return await res.arrayBuffer();
|
|
66
74
|
}
|
|
67
75
|
if (src instanceof Uint8Array) {
|
|
68
|
-
// If the view covers the whole buffer, return it directly (cast to ArrayBuffer).
|
|
69
76
|
if (src.byteOffset === 0 && src.byteLength === src.buffer.byteLength) {
|
|
70
77
|
return src.buffer;
|
|
71
78
|
}
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
return copy.buffer; // typed as ArrayBuffer
|
|
79
|
+
const copy = src.slice();
|
|
80
|
+
return copy.buffer;
|
|
75
81
|
}
|
|
76
82
|
return src;
|
|
77
83
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export type HitLike = {
|
|
2
|
+
blockId: number;
|
|
3
|
+
score: number;
|
|
4
|
+
text: string;
|
|
5
|
+
source?: string;
|
|
6
|
+
};
|
|
7
|
+
export type DiversifyOptions = {
|
|
8
|
+
k: number;
|
|
9
|
+
lambda?: number;
|
|
10
|
+
simThreshold?: number;
|
|
11
|
+
sim?: (a: HitLike, b: HitLike) => number;
|
|
12
|
+
};
|
|
13
|
+
export declare function diversifyAndDedupe(hits: HitLike[], opts: DiversifyOptions): HitLike[];
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// src/quality/diversify.ts
|
|
2
|
+
import { jaccard5 } from './similarity.js';
|
|
3
|
+
export function diversifyAndDedupe(hits, opts) {
|
|
4
|
+
const { k, lambda = 0.8, simThreshold = 0.92, sim = (a, b) => jaccard5(a.text, b.text) } = opts;
|
|
5
|
+
const pool = [...hits].sort((a, b) => b.score - a.score);
|
|
6
|
+
const kept = [];
|
|
7
|
+
while (pool.length && kept.length < k) {
|
|
8
|
+
// compute MMR for current pool against kept
|
|
9
|
+
let bestIdx = 0;
|
|
10
|
+
let bestMMR = -Infinity;
|
|
11
|
+
for (let i = 0; i < pool.length; i++) {
|
|
12
|
+
const h = pool[i];
|
|
13
|
+
let maxSim = 0;
|
|
14
|
+
for (const s of kept) {
|
|
15
|
+
const v = sim(h, s);
|
|
16
|
+
if (v > maxSim)
|
|
17
|
+
maxSim = v;
|
|
18
|
+
if (v >= simThreshold) {
|
|
19
|
+
maxSim = v;
|
|
20
|
+
break;
|
|
21
|
+
} // early out
|
|
22
|
+
}
|
|
23
|
+
// skip near-duplicates
|
|
24
|
+
if (maxSim >= simThreshold)
|
|
25
|
+
continue;
|
|
26
|
+
const mmr = lambda * h.score - (1 - lambda) * maxSim;
|
|
27
|
+
if (mmr > bestMMR) {
|
|
28
|
+
bestMMR = mmr;
|
|
29
|
+
bestIdx = i;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
// if everything was a near-duplicate, just take the next best by score
|
|
33
|
+
const pick = pool.splice(bestMMR === -Infinity ? 0 : bestIdx, 1)[0];
|
|
34
|
+
if (!pick)
|
|
35
|
+
break;
|
|
36
|
+
// final dedupe check before push
|
|
37
|
+
if (!kept.some((x) => sim(x, pick) >= simThreshold))
|
|
38
|
+
kept.push(pick);
|
|
39
|
+
}
|
|
40
|
+
return kept;
|
|
41
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
// src/quality/proximity.ts
|
|
2
|
+
// Map<termId, positions[]>
|
|
3
|
+
export function minCoverSpan(posMap) {
|
|
4
|
+
const lists = posMap ? [...posMap.values()].map(arr => arr.slice().sort((a, b) => a - b)) : [];
|
|
5
|
+
if (lists.length === 0)
|
|
6
|
+
return null;
|
|
7
|
+
const idx = new Array(lists.length).fill(0);
|
|
8
|
+
let best = null;
|
|
9
|
+
while (true) {
|
|
10
|
+
const cur = [];
|
|
11
|
+
for (let i = 0; i < lists.length; i++) {
|
|
12
|
+
const val = lists[i][idx[i]];
|
|
13
|
+
if (val === undefined)
|
|
14
|
+
return best;
|
|
15
|
+
cur.push(val);
|
|
16
|
+
}
|
|
17
|
+
const min = Math.min(...cur);
|
|
18
|
+
const max = Math.max(...cur);
|
|
19
|
+
const span = max - min;
|
|
20
|
+
if (best === null || span < best)
|
|
21
|
+
best = span;
|
|
22
|
+
// advance list with current min
|
|
23
|
+
const minList = cur.indexOf(min);
|
|
24
|
+
idx[minList]++;
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
export function proximityMultiplier(span, strength = 0.15) {
|
|
28
|
+
if (span === null)
|
|
29
|
+
return 1;
|
|
30
|
+
return 1 + strength / (1 + span); // gentle, bounded
|
|
31
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
// src/quality/signature.ts
|
|
2
|
+
// "KNS" — simple, deterministic lexical numeric signature for tie-breaking.
|
|
3
|
+
const PRIMES = [257, 263, 269];
|
|
4
|
+
export function knsSignature(s) {
|
|
5
|
+
let s1 = 0, s2 = 0, s3 = 0;
|
|
6
|
+
for (let i = 0; i < s.length; i++) {
|
|
7
|
+
const code = s.charCodeAt(i);
|
|
8
|
+
s1 = (s1 + code) % PRIMES[0];
|
|
9
|
+
s2 = (s2 + code * (i + 1)) % PRIMES[1];
|
|
10
|
+
s3 = (s3 + ((code << 1) ^ (i + 7))) % PRIMES[2];
|
|
11
|
+
}
|
|
12
|
+
return [s1, s2, s3];
|
|
13
|
+
}
|
|
14
|
+
export function knsDistance(a, b) {
|
|
15
|
+
// circular distance on a mod prime, averaged & normalized to 0..1
|
|
16
|
+
let acc = 0;
|
|
17
|
+
for (let i = 0; i < PRIMES.length; i++) {
|
|
18
|
+
const p = PRIMES[i];
|
|
19
|
+
const diff = Math.abs(a[i] - b[i]);
|
|
20
|
+
const circ = Math.min(diff, p - diff) / p;
|
|
21
|
+
acc += circ;
|
|
22
|
+
}
|
|
23
|
+
return acc / PRIMES.length;
|
|
24
|
+
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// src/quality/similarity.ts
|
|
2
|
+
import { normalize } from '../tokenize.js';
|
|
3
|
+
export function ngramSet(s, n = 5) {
|
|
4
|
+
const t = normalize(s);
|
|
5
|
+
const out = new Set();
|
|
6
|
+
if (t.length < n) {
|
|
7
|
+
if (t)
|
|
8
|
+
out.add(t);
|
|
9
|
+
return out;
|
|
10
|
+
}
|
|
11
|
+
for (let i = 0; i <= t.length - n; i++)
|
|
12
|
+
out.add(t.slice(i, i + n));
|
|
13
|
+
return out;
|
|
14
|
+
}
|
|
15
|
+
export function jaccardFromSets(a, b) {
|
|
16
|
+
if (a.size === 0 && b.size === 0)
|
|
17
|
+
return 1;
|
|
18
|
+
let inter = 0;
|
|
19
|
+
for (const x of a)
|
|
20
|
+
if (b.has(x))
|
|
21
|
+
inter++;
|
|
22
|
+
const uni = a.size + b.size - inter;
|
|
23
|
+
return uni ? inter / uni : 0;
|
|
24
|
+
}
|
|
25
|
+
export function jaccard5(s1, s2) {
|
|
26
|
+
return jaccardFromSets(ngramSet(s1, 5), ngramSet(s2, 5));
|
|
27
|
+
}
|
package/dist/query.d.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import type { Pack } from
|
|
1
|
+
import type { Pack } from "./pack.js";
|
|
2
2
|
export type QueryOptions = {
|
|
3
3
|
topK?: number;
|
|
4
|
-
/** Additional phrases (unquoted) that must be present in results. */
|
|
5
4
|
requirePhrases?: string[];
|
|
6
5
|
};
|
|
7
6
|
export type Hit = {
|
|
@@ -10,9 +9,4 @@ export type Hit = {
|
|
|
10
9
|
text: string;
|
|
11
10
|
source?: string;
|
|
12
11
|
};
|
|
13
|
-
/** Execute a search against a mounted pack. The query string can contain
|
|
14
|
-
* quoted phrases; unquoted terms are treated individually. The `topK`
|
|
15
|
-
* parameter controls how many results are returned. If `requirePhrases`
|
|
16
|
-
* contains strings, those phrases must appear verbatim in candidate blocks.
|
|
17
|
-
*/
|
|
18
12
|
export declare function query(pack: Pack, q: string, opts?: QueryOptions): Hit[];
|
package/dist/query.js
CHANGED
|
@@ -1,95 +1,154 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* query.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* Deterministic, embedding-free retrieval with:
|
|
5
|
+
* - REQUIRED phrase enforcement (quoted and requirePhrases)
|
|
6
|
+
* - Proximity bonus based on min cover span
|
|
7
|
+
* - Optional heading overlap boost
|
|
8
|
+
* - KNS numeric-signature tie-breaker (tiny)
|
|
9
|
+
* - Near-duplicate suppression + MMR diversity
|
|
9
10
|
*/
|
|
10
|
-
import { tokenize, parsePhrases } from "./tokenize.js";
|
|
11
|
+
import { tokenize, parsePhrases, normalize } from "./tokenize.js";
|
|
11
12
|
import { rankBM25L } from "./rank.js";
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
* contains strings, those phrases must appear verbatim in candidate blocks.
|
|
16
|
-
*/
|
|
13
|
+
import { minCoverSpan, proximityMultiplier } from "./quality/proximity.js";
|
|
14
|
+
import { diversifyAndDedupe } from "./quality/diversify.js";
|
|
15
|
+
import { knsSignature, knsDistance } from "./quality/signature.js";
|
|
17
16
|
/**
 * Execute a deterministic, embedding-free search against a mounted pack.
 *
 * Pipeline:
 *  1. tokenize the query; resolve free tokens to lexicon term ids
 *  2. scan the postings list for candidate blocks (phrase-first rescue if
 *     free tokens matched nothing but required phrases exist)
 *  3. enforce required phrases (quoted in `q` plus `opts.requirePhrases`)
 *  4. optional heading-overlap boost, BM25L rank with proximity bonus
 *  5. KNS numeric-signature tie-breaker, then near-dup suppression / MMR
 *
 * @param {Pack} pack - mounted .knolo pack
 * @param {string} q - query string; may contain quoted phrases
 * @param {{topK?: number, requirePhrases?: string[]}} [opts]
 * @returns {Array<{blockId: number, score: number, text: string, source?: string}>}
 */
export function query(pack, q, opts = {}) {
    const topK = opts.topK ?? 10;
    // --- Query parsing --------------------------------------------------
    const normTokens = tokenize(q).map((t) => t.term);
    // Quoted phrases from q, normalized term-by-term (splitting any
    // multi-word normalizations so each entry is a single token).
    const quotedRaw = parsePhrases(q);
    const quoted = quotedRaw.map((seq) => seq
        .map((t) => normalize(t))
        .flatMap((s) => s.split(/\s+/))
        .filter(Boolean));
    // requirePhrases go through the tokenizer so they normalize exactly
    // like the index did (case, diacritics, ...).
    const extraReq = (opts.requirePhrases ?? [])
        .map((s) => tokenize(s).map((t) => t.term))
        .filter((arr) => arr.length > 0);
    const requiredPhrases = [...quoted, ...extraReq];
    // --- Free-token term ids (unknown terms are dropped) ----------------
    const termIds = normTokens
        .map((t) => pack.lexicon.get(t))
        .filter((id) => id !== undefined);
    const termSet = new Set(termIds);
    // --- Candidate harvesting -------------------------------------------
    const candidates = new Map();
    // Walk the flat postings array (termId, then per-block position runs,
    // all 0-terminated) and record tf + positions for matching term ids.
    function scanForTermIds(idSet) {
        const p = pack.postings;
        let i = 0;
        while (i < p.length) {
            const tid = p[i++];
            if (tid === 0)
                continue;
            const relevant = idSet.has(tid);
            let bid = p[i++];
            while (bid !== 0) {
                let pos = p[i++];
                const positions = [];
                while (pos !== 0) {
                    positions.push(pos);
                    pos = p[i++];
                }
                if (relevant) {
                    let entry = candidates.get(bid);
                    if (!entry) {
                        entry = { tf: new Map(), pos: new Map() };
                        candidates.set(bid, entry);
                    }
                    entry.tf.set(tid, positions.length);
                    entry.pos.set(tid, positions);
                }
                bid = p[i++];
            }
        }
    }
    // 1) Scan with the free tokens of q, when there are any.
    if (termSet.size > 0) {
        scanForTermIds(termSet);
    }
    // 2) Phrase-first rescue: nothing matched the free tokens, but we do
    //    have required phrases — rebuild a term set from the phrase tokens
    //    and scan again so phrase-only queries still find candidates.
    if (candidates.size === 0 && requiredPhrases.length > 0) {
        const phraseTokenIds = new Set();
        for (const seq of requiredPhrases) {
            for (const t of seq) {
                const id = pack.lexicon.get(t);
                if (id !== undefined)
                    phraseTokenIds.add(id);
            }
        }
        if (phraseTokenIds.size > 0) {
            scanForTermIds(phraseTokenIds);
        }
    }
    // --- Phrase enforcement ---------------------------------------------
    if (requiredPhrases.length > 0) {
        // Iterate over a snapshot since we delete while walking.
        for (const [bid, data] of [...candidates]) {
            const text = pack.blocks[bid] || "";
            const ok = requiredPhrases.every((seq) => containsPhrase(text, seq));
            if (!ok)
                candidates.delete(bid);
            else
                data.hasPhrase = true;
        }
    }
    else if (quoted.length > 0) {
        // No hard requirement, but quoted phrases still earn a boost flag.
        for (const [bid, data] of candidates) {
            const text = pack.blocks[bid] || "";
            data.hasPhrase = quoted.some((seq) => containsPhrase(text, seq));
        }
    }
    if (candidates.size === 0)
        return [];
    // --- Heading overlap boost ------------------------------------------
    if (pack.headings?.length) {
        const qset = new Set(normTokens);
        const qUniqueCount = new Set(normTokens).size || 1;
        for (const [bid, data] of candidates) {
            const h = pack.headings[bid] ?? "";
            const hTerms = tokenize(h || "").map((t) => t.term);
            const overlap = new Set(hTerms.filter((t) => qset.has(t))).size;
            data.headingScore = overlap / qUniqueCount;
        }
    }
    // --- BM25L ranking with proximity bonus ------------------------------
    // Prefer the precomputed average block length; fall back to computing
    // it on the fly (1 when the pack has no blocks, to avoid div-by-zero).
    const avgLen = pack.meta?.stats?.avgBlockLen ??
        (pack.blocks.length
            ? pack.blocks.reduce((s, b) => s + tokenize(b).length, 0) / pack.blocks.length
            : 1);
    const prelim = rankBM25L(candidates, avgLen, {
        proximityBonus: (cand) => proximityMultiplier(minCoverSpan(cand.pos)),
    });
    if (prelim.length === 0)
        return [];
    // --- KNS tie-breaker (tiny multiplicative nudge) + de-dup/MMR --------
    const qSig = knsSignature(normalize(q));
    const pool = prelim.slice(0, topK * 5).map((r) => {
        const text = pack.blocks[r.blockId] || "";
        const boost = 1 + 0.02 * (1 - knsDistance(qSig, knsSignature(text)));
        return {
            blockId: r.blockId,
            score: r.score * boost,
            text,
            source: pack.docIds?.[r.blockId] ?? undefined,
        };
    });
    const finalHits = diversifyAndDedupe(pool, { k: topK });
    return finalHits;
}
|
|
81
|
-
/**
|
|
82
|
-
* text. The algorithm tokenizes the text and performs a sliding window
|
|
83
|
-
* comparison. This is case‑insensitive and uses the same normalization as
|
|
84
|
-
* other parts of the system.
|
|
85
|
-
*/
|
|
142
|
+
/** Ordered phrase check using the SAME tokenizer/normalizer path as the index. */
|
|
86
143
|
function containsPhrase(text, seq) {
|
|
87
144
|
if (seq.length === 0)
|
|
88
145
|
return false;
|
|
146
|
+
// normalize seq via tokenizer to be extra safe (handles diacritics/case)
|
|
147
|
+
const seqNorm = tokenize(seq.join(" ")).map(t => t.term);
|
|
89
148
|
const toks = tokenize(text).map((t) => t.term);
|
|
90
|
-
outer: for (let i = 0; i <= toks.length -
|
|
91
|
-
for (let j = 0; j <
|
|
92
|
-
if (toks[i + j] !==
|
|
149
|
+
outer: for (let i = 0; i <= toks.length - seqNorm.length; i++) {
|
|
150
|
+
for (let j = 0; j < seqNorm.length; j++) {
|
|
151
|
+
if (toks[i + j] !== seqNorm[j])
|
|
93
152
|
continue outer;
|
|
94
153
|
}
|
|
95
154
|
return true;
|
package/dist/rank.d.ts
CHANGED
|
@@ -3,15 +3,16 @@ export type RankOptions = {
|
|
|
3
3
|
b?: number;
|
|
4
4
|
headingBoost?: number;
|
|
5
5
|
phraseBoost?: number;
|
|
6
|
+
proximityBonus?: (cand: {
|
|
7
|
+
tf: Map<number, number>;
|
|
8
|
+
pos?: Map<number, number[]>;
|
|
9
|
+
hasPhrase?: boolean;
|
|
10
|
+
headingScore?: number;
|
|
11
|
+
}) => number;
|
|
6
12
|
};
|
|
7
|
-
/**
|
|
8
|
-
* Rank a set of candidate blocks using BM25L. Each candidate carries a term
|
|
9
|
-
* frequency (tf) map keyed by termId. Additional properties may include
|
|
10
|
-
* `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
|
|
11
|
-
* document length (avgLen) is required for BM25L normalization.
|
|
12
|
-
*/
|
|
13
13
|
export declare function rankBM25L(candidates: Map<number, {
|
|
14
14
|
tf: Map<number, number>;
|
|
15
|
+
pos?: Map<number, number[]>;
|
|
15
16
|
hasPhrase?: boolean;
|
|
16
17
|
headingScore?: number;
|
|
17
18
|
}>, avgLen: number, opts?: RankOptions): Array<{
|
package/dist/rank.js
CHANGED
|
@@ -1,18 +1,6 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* rank.ts
|
|
3
|
-
*
|
|
4
|
-
* Implements a simple BM25L ranker with optional boosts for headings and
|
|
5
|
-
* phrase matches. The inputs to the ranker are a map of block IDs to term
|
|
6
|
-
* frequency maps and additional boolean flags. The outputs are sorted by
|
|
7
|
-
* descending relevance score. This ranking algorithm can be replaced or
|
|
8
|
-
* augmented in the future without impacting the public API of the query
|
|
9
|
-
* function.
|
|
10
|
-
*/
|
|
11
|
-
/**
|
|
12
|
-
* Rank a set of candidate blocks using BM25L. Each candidate carries a term
|
|
13
|
-
* frequency (tf) map keyed by termId. Additional properties may include
|
|
14
|
-
* `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
|
|
15
|
-
* document length (avgLen) is required for BM25L normalization.
|
|
3
|
+
* BM25L ranker with optional heading/phrase boosts and a proximity bonus hook.
|
|
16
4
|
*/
|
|
17
5
|
export function rankBM25L(candidates, avgLen, opts = {}) {
|
|
18
6
|
const k1 = opts.k1 ?? 1.5;
|
|
@@ -24,18 +12,17 @@ export function rankBM25L(candidates, avgLen, opts = {}) {
|
|
|
24
12
|
const len = Array.from(data.tf.values()).reduce((sum, tf) => sum + tf, 0) || 1;
|
|
25
13
|
let score = 0;
|
|
26
14
|
for (const [, tf] of data.tf) {
|
|
27
|
-
const idf = 1; //
|
|
15
|
+
const idf = 1; // v0: no DF; can be extended later
|
|
28
16
|
const numer = tf * (k1 + 1);
|
|
29
17
|
const denom = tf + k1 * (1 - b + b * (len / avgLen));
|
|
30
18
|
score += idf * (numer / denom);
|
|
31
19
|
}
|
|
32
|
-
|
|
33
|
-
|
|
20
|
+
if (opts.proximityBonus)
|
|
21
|
+
score *= opts.proximityBonus(data) ?? 1;
|
|
22
|
+
if (data.hasPhrase)
|
|
34
23
|
score *= 1 + phraseBoost;
|
|
35
|
-
|
|
36
|
-
if (data.headingScore) {
|
|
24
|
+
if (data.headingScore)
|
|
37
25
|
score *= 1 + headingBoost * data.headingScore;
|
|
38
|
-
}
|
|
39
26
|
results.push({ blockId: bid, score });
|
|
40
27
|
}
|
|
41
28
|
results.sort((a, b2) => b2.score - a.score);
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export type TextDecoderLike = {
|
|
2
|
+
decode: (u8: Uint8Array) => string;
|
|
3
|
+
};
|
|
4
|
+
export type TextEncoderLike = {
|
|
5
|
+
encode: (s: string) => Uint8Array;
|
|
6
|
+
};
|
|
7
|
+
export declare function getTextDecoder(): TextDecoderLike;
|
|
8
|
+
export declare function getTextEncoder(): TextEncoderLike;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// src/utils/utf8.ts
|
|
2
|
+
// Small, dependency-free UTF-8 encoder/decoder that works in RN/Hermes.
|
|
3
|
+
/**
 * Return a UTF-8 decoder: the platform `TextDecoder` when available,
 * otherwise a minimal hand-rolled fallback (for RN/Hermes environments
 * where `TextDecoder` may be missing).
 *
 * @returns {{decode: (u8: Uint8Array) => string}}
 */
export function getTextDecoder() {
    try {
        return new TextDecoder();
    }
    catch {
        // Best-effort manual UTF-8 decoder. NOTE(review): continuation
        // bytes are not validated, so malformed input decodes leniently
        // rather than throwing — same contract as the original fallback.
        return {
            decode: (bytes) => {
                let result = '';
                let i = 0;
                while (i < bytes.length) {
                    const lead = bytes[i++];
                    if (lead < 0x80) {
                        // 1-byte sequence (ASCII).
                        result += String.fromCharCode(lead);
                    }
                    else if ((lead & 0xe0) === 0xc0) {
                        // 2-byte sequence.
                        const cp = ((lead & 0x1f) << 6) | (bytes[i++] & 0x3f);
                        result += String.fromCharCode(cp);
                    }
                    else if ((lead & 0xf0) === 0xe0) {
                        // 3-byte sequence (BMP).
                        const cp = ((lead & 0x0f) << 12) |
                            ((bytes[i++] & 0x3f) << 6) |
                            (bytes[i++] & 0x3f);
                        result += String.fromCharCode(cp);
                    }
                    else {
                        // 4-byte sequence → astral code point → surrogate pair.
                        const cp = ((lead & 0x07) << 18) |
                            ((bytes[i++] & 0x3f) << 12) |
                            ((bytes[i++] & 0x3f) << 6) |
                            (bytes[i++] & 0x3f);
                        const offset = cp - 0x10000;
                        result += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
                    }
                }
                return result;
            },
        };
    }
}
|
|
43
|
+
/**
 * Return a UTF-8 encoder: the platform `TextEncoder` when available,
 * otherwise a minimal hand-rolled fallback (for RN/Hermes environments
 * where `TextEncoder` may be missing).
 *
 * @returns {{encode: (s: string) => Uint8Array}}
 */
export function getTextEncoder() {
    try {
        return new TextEncoder();
    }
    catch {
        return {
            encode: (s) => {
                const out = [];
                for (let i = 0; i < s.length; i++) {
                    let cp = s.charCodeAt(i);
                    // Combine a surrogate pair into a single code point —
                    // but only when the next unit really is a low surrogate.
                    // (Fix: previously ANY following unit was consumed,
                    // which corrupted the character after an unpaired high
                    // surrogate.)
                    if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < s.length) {
                        const next = s.charCodeAt(i + 1);
                        if (next >= 0xdc00 && next <= 0xdfff) {
                            cp = 0x10000 + ((cp - 0xd800) << 10) + (next - 0xdc00);
                            i++;
                        }
                        // Unpaired high surrogate: fall through and encode
                        // it as a 3-byte sequence (WTF-8 style), matching
                        // the original's handling of a trailing surrogate.
                    }
                    if (cp < 0x80)
                        out.push(cp);
                    else if (cp < 0x800)
                        out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
                    else if (cp < 0x10000)
                        out.push(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
                    else
                        out.push(0xf0 | (cp >> 18), 0x80 | ((cp >> 12) & 0x3f), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
                }
                return new Uint8Array(out);
            },
        };
    }
}
|
package/package.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "knolo-core",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Local-first knowledge packs for small LLMs.",
|
|
6
6
|
"keywords": ["llm", "knowledge-base", "rag", "local", "expo"],
|
|
7
|
-
"author": "
|
|
7
|
+
"author": "Sam Paniagua",
|
|
8
8
|
"license": "MIT",
|
|
9
9
|
"main": "./dist/index.js",
|
|
10
10
|
"types": "./dist/index.d.ts",
|
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
},
|
|
19
19
|
"scripts": {
|
|
20
20
|
"build": "tsc -p tsconfig.json",
|
|
21
|
-
"prepublishOnly": "npm run build"
|
|
21
|
+
"prepublishOnly": "npm run build",
|
|
22
|
+
"smoke": "node scripts/smoke.mjs"
|
|
22
23
|
},
|
|
23
24
|
"devDependencies": {
|
|
24
25
|
"typescript": "^5.5.0",
|