knolo-core 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DOCS.md +355 -0
- package/LICENSE +20 -17
- package/README.md +1 -1
- package/package.json +20 -6
package/DOCS.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
|
|
2
|
+
# DOCS.md — KnoLo Core
|
|
3
|
+
|
|
4
|
+
> Deterministic, embedding-free retrieval and portable knowledge packs.
|
|
5
|
+
|
|
6
|
+
## Table of Contents
|
|
7
|
+
|
|
8
|
+
1. [What is KnoLo Core?](#what-is-knolo-core)
|
|
9
|
+
2. [Quickstart](#quickstart)
|
|
10
|
+
3. [Concepts](#concepts)
|
|
11
|
+
4. [Building Packs](#building-packs)
|
|
12
|
+
5. [Querying & Results](#querying--results)
|
|
13
|
+
6. [LLM Context Patches](#llm-context-patches)
|
|
14
|
+
7. [Advanced Retrieval Controls](#advanced-retrieval-controls)
|
|
15
|
+
8. [Pack Format (Spec)](#pack-format-spec)
|
|
16
|
+
9. [Performance & Tuning](#performance--tuning)
|
|
17
|
+
10. [React Native / Expo Notes](#react-native--expo-notes)
|
|
18
|
+
11. [Testing & QA](#testing--qa)
|
|
19
|
+
12. [Migration 0.1.x → 0.2.0](#migration-01x--020)
|
|
20
|
+
13. [Security & Privacy](#security--privacy)
|
|
21
|
+
14. [FAQ](#faq)
|
|
22
|
+
15. [Glossary](#glossary)
|
|
23
|
+
16. [Versioning & Releases](#versioning--releases)
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## What is KnoLo Core?
|
|
28
|
+
|
|
29
|
+
KnoLo Core packages your corpus into a single `.knolo` file and performs **deterministic lexical retrieval**—no embeddings or vector DBs. It’s designed for **local-first, offline** LLM use.
|
|
30
|
+
|
|
31
|
+
**Key properties**
|
|
32
|
+
|
|
33
|
+
* **Deterministic**: phrase enforcement, proximity scoring, heading boosts
|
|
34
|
+
* **Duplicate-free**: near-duplicate suppression + MMR diversity
|
|
35
|
+
* **Portable**: single-pack file; Node, browsers, Expo
|
|
36
|
+
* **LLM-ready**: outputs structured **Context Patches**
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Quickstart
|
|
41
|
+
|
|
42
|
+
### Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
npm install knolo-core
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Minimal example
|
|
49
|
+
|
|
50
|
+
```ts
|
|
51
|
+
import { buildPack, mountPack, query, makeContextPatch } from "knolo-core";
|
|
52
|
+
|
|
53
|
+
const docs = [
|
|
54
|
+
{ id: "guide", heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events..." },
|
|
55
|
+
{ id: "throttle", heading: "Throttling", text: "Throttling reduces frequency of events..." }
|
|
56
|
+
];
|
|
57
|
+
|
|
58
|
+
const bytes = await buildPack(docs);
|
|
59
|
+
const kb = await mountPack({ src: bytes });
|
|
60
|
+
const hits = query(kb, '"react native bridge" throttling', { topK: 5 });
|
|
61
|
+
const patch = makeContextPatch(hits, { budget: "small" });
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### CLI build
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# input docs.json -> output knowledge.knolo
|
|
68
|
+
npx knolo docs.json knowledge.knolo
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Concepts
|
|
74
|
+
|
|
75
|
+
* **Pack (.knolo)**: single-file container with metadata, lexicon, postings, and blocks.
|
|
76
|
+
* **Block**: chunk of text (\~512 tokens recommended) with optional `heading` and `id`.
|
|
77
|
+
* **Deterministic Retrieval**: lexical signals (terms, phrases, positions), not embeddings.
|
|
78
|
+
* **Proximity**: bonus for smaller minimal span covering all query terms.
|
|
79
|
+
* **MMR**: Maximum Marginal Relevance to promote diversity in the top-K.
|
|
80
|
+
* **KNS**: tiny lexical numeric signature for stable tie-breaking.
|
|
81
|
+
* **Context Patch**: structured snippets for LLM prompts (budgeted).
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Building Packs
|
|
86
|
+
|
|
87
|
+
### Input format
|
|
88
|
+
|
|
89
|
+
```ts
|
|
90
|
+
type BuildInputDoc = {
|
|
91
|
+
id?: string; // exposed later as hit.source
|
|
92
|
+
heading?: string; // boosts relevance when overlapping query terms
|
|
93
|
+
text: string; // raw markdown accepted (lightly stripped)
|
|
94
|
+
};
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### API
|
|
98
|
+
|
|
99
|
+
```ts
|
|
100
|
+
const bytes: Uint8Array = await buildPack(docs); // docs: BuildInputDoc[]
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Tips**
|
|
104
|
+
|
|
105
|
+
* Prefer multiple smaller blocks (\~512 tokens).
|
|
106
|
+
* Provide `heading` for stronger field boosts.
|
|
107
|
+
* Use stable `id` if you want `hit.source`.
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Querying & Results
|
|
112
|
+
|
|
113
|
+
### API
|
|
114
|
+
|
|
115
|
+
```ts
|
|
116
|
+
type QueryOptions = {
|
|
117
|
+
topK?: number; // default 10
|
|
118
|
+
requirePhrases?: string[]; // phrases that must appear verbatim
|
|
119
|
+
};
|
|
120
|
+
|
|
121
|
+
type Hit = {
|
|
122
|
+
blockId: number;
|
|
123
|
+
score: number;
|
|
124
|
+
text: string;
|
|
125
|
+
source?: string; // docId if provided
|
|
126
|
+
};
|
|
127
|
+
|
|
128
|
+
const hits: Hit[] = query(pack, '"react native bridge" throttling', {
|
|
129
|
+
topK: 5,
|
|
130
|
+
requirePhrases: ["maximum rate"] // hard constraint
|
|
131
|
+
});
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**What the ranker does**
|
|
135
|
+
|
|
136
|
+
1. Enforces quoted/required phrases (hard filter)
|
|
137
|
+
2. BM25L with precomputed avg block length
|
|
138
|
+
3. **Proximity bonus** (minimal span cover)
|
|
139
|
+
4. **Heading overlap** boost
|
|
140
|
+
5. **KNS** tie-breaker (small, deterministic)
|
|
141
|
+
6. **De-dupe + MMR** diversity for final top-K
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## LLM Context Patches
|
|
146
|
+
|
|
147
|
+
### API
|
|
148
|
+
|
|
149
|
+
```ts
|
|
150
|
+
type ContextPatch = {
|
|
151
|
+
background: string[];
|
|
152
|
+
snippets: Array<{ text: string; source?: string }>;
|
|
153
|
+
definitions: Array<{ term: string; def: string; evidence?: number[] }>;
|
|
154
|
+
facts: Array<{ s: string; p: string; o: string; evidence?: number[] }>;
|
|
155
|
+
};
|
|
156
|
+
|
|
157
|
+
const patch: ContextPatch = makeContextPatch(hits, { budget: "small" }); // budget: "mini" | "small" | "full"
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
**Budgets**
|
|
161
|
+
|
|
162
|
+
* `mini` ≈ 512 tokens
|
|
163
|
+
* `small` ≈ 1k tokens
|
|
164
|
+
* `full` ≈ 2k tokens
|
|
165
|
+
|
|
166
|
+
**Best practices**
|
|
167
|
+
|
|
168
|
+
* Prefer `background` as setup lines for the system prompt.
|
|
169
|
+
* Place `snippets` nearest to the user’s question in the prompt.
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Advanced Retrieval Controls
|
|
174
|
+
|
|
175
|
+
### Require phrases (hard constraints)
|
|
176
|
+
|
|
177
|
+
```ts
|
|
178
|
+
query(pack, "throttling", { requirePhrases: ["react native bridge"] });
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Tight vs. scattered matches
|
|
182
|
+
|
|
183
|
+
Proximity bonus favors blocks where all query terms co-occur in a small span.
|
|
184
|
+
|
|
185
|
+
### Diversity
|
|
186
|
+
|
|
187
|
+
Top-K results apply near-duplicate suppression (5-gram Jaccard) and MMR (λ≈0.8).
|
|
188
|
+
|
|
189
|
+
**Tuning (if you fork)**
|
|
190
|
+
|
|
191
|
+
* Jaccard threshold default \~0.92
|
|
192
|
+
* MMR λ default \~0.8
|
|
193
|
+
* Proximity multiplier default \~0.15
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Pack Format (Spec)
|
|
198
|
+
|
|
199
|
+
**Binary layout**
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
[metaLen:u32][meta JSON]
|
|
203
|
+
[lexLen:u32][lexicon JSON]
|
|
204
|
+
[postCount:u32][postings u32[]]
|
|
205
|
+
[blocksLen:u32][blocks JSON]
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
**Meta JSON**
|
|
209
|
+
|
|
210
|
+
```json
|
|
211
|
+
{
|
|
212
|
+
"version": 2,
|
|
213
|
+
"stats": {
|
|
214
|
+
"docs": <number>,
|
|
215
|
+
"blocks": <number>,
|
|
216
|
+
"terms": <number>,
|
|
217
|
+
"avgBlockLen": <number> // optional in older packs
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**Lexicon JSON**
|
|
223
|
+
|
|
224
|
+
* Array of `[term, termId]` pairs.
|
|
225
|
+
|
|
226
|
+
**Postings**
|
|
227
|
+
|
|
228
|
+
* Flattened `Uint32Array`:
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
termId, blockId, pos, pos, …, 0, blockId, …, 0, 0, termId, ...
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Each block section ends with a `0`, and each term section ends with an additional `0` after its last block section.
|
|
235
|
+
|
|
236
|
+
**Blocks JSON (v1 / v2)**
|
|
237
|
+
|
|
238
|
+
* **v1**: `string[]` (text only)
|
|
239
|
+
* **v2**: `{ text, heading?, docId? }[]`
|
|
240
|
+
|
|
241
|
+
Runtime auto-detects and exposes:
|
|
242
|
+
|
|
243
|
+
```ts
|
|
244
|
+
type Pack = {
|
|
245
|
+
meta, lexicon, postings, blocks: string[],
|
|
246
|
+
headings?: (string|null)[],
|
|
247
|
+
docIds?: (string|null)[]
|
|
248
|
+
}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## Performance & Tuning
|
|
254
|
+
|
|
255
|
+
**Targets (typical)**
|
|
256
|
+
|
|
257
|
+
* Query < 50 ms on mid-range laptops (packs ≤ 200 MB)
|
|
258
|
+
* Memory < 10 MB for \~50k blocks
|
|
259
|
+
* Pack size ≈ 6–12% of raw text
|
|
260
|
+
|
|
261
|
+
**Tuning checklist**
|
|
262
|
+
|
|
263
|
+
* Split large documents into \~512-token blocks
|
|
264
|
+
* Provide informative `heading`s
|
|
265
|
+
* Shard packs by domain if you exceed 200–500 MB
|
|
266
|
+
* Cache mounted packs in memory if the app runs repeated queries
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## React Native / Expo Notes
|
|
271
|
+
|
|
272
|
+
* Built-in ponyfills for `TextEncoder`/`TextDecoder`; no extra deps needed.
|
|
273
|
+
* To load `.knolo` assets, read as Base64 then convert to `Uint8Array` before `mountPack({ src })`.
|
|
274
|
+
* Hermes compatible.
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## Testing & QA
|
|
279
|
+
|
|
280
|
+
### Smoke test
|
|
281
|
+
|
|
282
|
+
We ship a no-deps smoke test that exercises phrase enforcement, proximity, de-dupe, and heading boosts.
|
|
283
|
+
|
|
284
|
+
Run:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
npm run build
|
|
288
|
+
npm run smoke
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### What it validates
|
|
292
|
+
|
|
293
|
+
* Basic query returns results
|
|
294
|
+
* Quoted phrases enforced
|
|
295
|
+
* `requirePhrases` enforced (normalized)
|
|
296
|
+
* Tight spans outrank scattered
|
|
297
|
+
* No near-duplicates in top-K
|
|
298
|
+
* `source` type is `string | undefined`
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## Migration 0.1.x → 0.2.0
|
|
303
|
+
|
|
304
|
+
* **No API breaks**: `buildPack`, `mountPack`, `query`, `makeContextPatch` unchanged.
|
|
305
|
+
* **Compatibility**: v0.2.0 mounts both v1 packs (string blocks) and v2 packs (object blocks).
|
|
306
|
+
* **New**: phrase enforcement, proximity bonus, heading boosts, KNS tie-breaker, de-dupe + MMR, avgBlockLen in meta, RN/Expo ponyfills.
|
|
307
|
+
* Provide `id` and `heading` at build time to enable `hit.source` and field boosts.
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## Security & Privacy
|
|
312
|
+
|
|
313
|
+
* All retrieval is **local**; no network dependency for search.
|
|
314
|
+
* Packs are plain JSON sections + typed arrays—auditable and diff-able.
|
|
315
|
+
* If packs contain sensitive data, treat `.knolo` files as confidential artifacts.
|
|
316
|
+
|
|
317
|
+
---
|
|
318
|
+
|
|
319
|
+
## FAQ
|
|
320
|
+
|
|
321
|
+
**Q: Does this use embeddings or a vector DB?**
|
|
322
|
+
A: No—pure lexical retrieval with positions and structural cues.
|
|
323
|
+
|
|
324
|
+
**Q: Why am I still seeing similar results?**
|
|
325
|
+
A: De-dup suppresses near-duplicates but allows related passages. To suppress more aggressively, lower the Jaccard threshold or lower λ to favor diversity (if forking).
|
|
326
|
+
|
|
327
|
+
**Q: How do I improve recall for synonyms?**
|
|
328
|
+
A: Add domain alias tables or expand queries; we intentionally avoid opaque embeddings.
|
|
329
|
+
|
|
330
|
+
**Q: Does it work offline?**
|
|
331
|
+
A: Yes, end-to-end.
|
|
332
|
+
|
|
333
|
+
---
|
|
334
|
+
|
|
335
|
+
## Glossary
|
|
336
|
+
|
|
337
|
+
* **BM25L**: length-normalized lexical ranking.
|
|
338
|
+
* **Minimal span cover**: smallest token window containing all query terms.
|
|
339
|
+
* **MMR**: diversity-promoting re-ranker balancing relevance & novelty.
|
|
340
|
+
* **KNS**: deterministic lexical numeric signature (tie-breaker).
|
|
341
|
+
|
|
342
|
+
---
|
|
343
|
+
|
|
344
|
+
## Versioning & Releases
|
|
345
|
+
|
|
346
|
+
* **SemVer**: new features bump at least the minor version; bug fixes bump the patch version.
|
|
347
|
+
* Tag releases: `vX.Y.Z`.
|
|
348
|
+
* Recommended commit message (example):
|
|
349
|
+
|
|
350
|
+
```
|
|
351
|
+
feat(core): deterministic retrieval upgrades
|
|
352
|
+
- phrase enforcement, proximity, heading boosts
|
|
353
|
+
- de-dup + MMR, KNS tie-breaker
|
|
354
|
+
- avgBlockLen, RN/Expo ponyfills
|
|
355
|
+
```
|
package/LICENSE
CHANGED
|
@@ -1,21 +1,24 @@
|
|
|
1
|
-
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
2
4
|
|
|
3
|
-
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
4
6
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
1. Definitions.
|
|
8
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
9
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
10
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
11
|
+
the copyright owner that is granting the License.
|
|
12
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
13
|
+
other entities that control, are controlled by, or are under common
|
|
14
|
+
control with that entity. For the purposes of this definition,
|
|
15
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
16
|
+
direction or management of such entity, whether by contract or
|
|
17
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
18
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
19
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
20
|
+
exercising permissions granted by this License.
|
|
11
21
|
|
|
12
|
-
|
|
13
|
-
copies or substantial portions of the Software.
|
|
22
|
+
...
|
|
14
23
|
|
|
15
|
-
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
24
|
+
(rest of standard Apache 2.0 text continues unchanged)
|
package/README.md
CHANGED
package/package.json
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "knolo-core",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Local-first knowledge packs for small LLMs.",
|
|
6
|
-
"keywords": [
|
|
6
|
+
"keywords": [
|
|
7
|
+
"llm",
|
|
8
|
+
"knowledge-base",
|
|
9
|
+
"rag",
|
|
10
|
+
"local",
|
|
11
|
+
"expo"
|
|
12
|
+
],
|
|
7
13
|
"author": "Sam Paniagua",
|
|
8
|
-
"license": "MIT",
|
|
14
|
+
"license": "Apache-2.0",
|
|
9
15
|
"main": "./dist/index.js",
|
|
10
16
|
"types": "./dist/index.d.ts",
|
|
11
|
-
"files": [
|
|
12
|
-
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"bin",
|
|
20
|
+
"README.md",
|
|
21
|
+
"LICENSE",
|
|
22
|
+
"DOCS.md"
|
|
23
|
+
],
|
|
24
|
+
"bin": {
|
|
25
|
+
"knolo": "./bin/knolo.mjs"
|
|
26
|
+
},
|
|
13
27
|
"exports": {
|
|
14
28
|
".": {
|
|
15
29
|
"import": "./dist/index.js",
|
|
@@ -19,7 +33,7 @@
|
|
|
19
33
|
"scripts": {
|
|
20
34
|
"build": "tsc -p tsconfig.json",
|
|
21
35
|
"prepublishOnly": "npm run build",
|
|
22
|
-
|
|
36
|
+
"smoke": "node scripts/smoke.mjs"
|
|
23
37
|
},
|
|
24
38
|
"devDependencies": {
|
|
25
39
|
"typescript": "^5.5.0",
|