knolo-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 KnoLo Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,141 @@
1
+
2
+ # 🧠 KnoLo Core
3
+
4
+ KnoLo Core is a **local-first knowledge base system** for small, on-device language models.
5
+ It lets you package your own documents into a compact `.knolo` file and query them deterministically — **no embeddings, no vector DBs, no cloud**. Perfect for **on-device LLMs**.
6
+
7
+ ---
8
+
9
+ ## ✨ Features
10
+
11
+ * 📦 **Single-file packs** (`.knolo`) you can ship or load offline.
12
+ * 🔎 **Deterministic lexical retrieval** (BM25L ranking with phrase boosts; heading boosts planned).
13
+ * ⚡ **Tiny & fast** — runs in Node, browsers, and Expo.
14
+ * 📑 **Context Patches**: structured snippets for direct LLM input.
15
+ * 🔒 **Privacy-first**: all data stays local.
16
+
17
+ ---
18
+
19
+ ## 📦 Install
20
+
21
+ ```bash
22
+ # local build
23
+ npm install
24
+ npm run build
25
+
26
+ # or if published later
27
+ npm install knolo-core
28
+ ```
29
+
30
+ ---
31
+
32
+ ## 🚀 Usage Examples
33
+
34
+ ### 1. Node.js (in-memory build + query)
35
+
36
+ ```js
37
+ import { buildPack, mountPack, query, makeContextPatch } from "./dist/index.js";
38
+
39
+ const docs = [
40
+ { heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events to reduce jank." },
41
+ { heading: "Throttling", text: "Throttling reduces frequency of events to avoid flooding the bridge." },
42
+ { heading: "Debounce vs Throttle", text: "Debounce waits for silence, throttle guarantees a max rate." }
43
+ ];
44
+
45
+ // Build a pack in memory
46
+ const bytes = await buildPack(docs);
47
+
48
+ // Mount it
49
+ const kb = await mountPack({ src: bytes });
50
+
51
+ // Query
52
+ const hits = query(kb, "react native bridge throttling", { topK: 5 });
53
+ console.log("Top hits:", hits);
54
+
55
+ // Turn into an LLM-friendly context patch
56
+ const patch = makeContextPatch(hits, { budget: "small" });
57
+ console.log("Context Patch:", patch);
58
+ ```
59
+
60
+ ---
61
+
62
+ ### 2. CLI (build `.knolo` file)
63
+
64
+ **Prepare `docs.json`:**
65
+
66
+ ```json
67
+ [
68
+ { "heading": "Guide", "text": "Install deps.\n\n## Throttle\nLimit frequency of events." },
69
+ { "heading": "FAQ", "text": "What is throttling? It reduces event frequency." }
70
+ ]
71
+ ```
72
+
73
+ **Build the pack:**
74
+
75
+ ```bash
76
+ # writes mypack.knolo
77
+ node bin/knolo.mjs docs.json mypack.knolo
78
+ ```
79
+
80
+ **Query it in a script:**
81
+
82
+ ```js
83
+ import { readFileSync } from "node:fs";
84
+ import { mountPack, query } from "./dist/index.js";
85
+ const kb = await mountPack({ src: readFileSync("./mypack.knolo") }); // pass raw bytes; a string src is fetched as a URL
86
+ const hits = query(kb, "throttle events", { topK: 3 });
87
+ console.log(hits);
88
+ ```
89
+
90
+ ---
91
+
92
+ ### 3. React / Expo (load from asset)
93
+
94
+ ```ts
95
+ import { Asset } from "expo-asset";
96
+ import * as FileSystem from "expo-file-system";
97
+ import { mountPack, query, makeContextPatch } from "knolo-core";
98
+
99
+ async function loadKnowledge() {
100
+ const asset = Asset.fromModule(require("./assets/mypack.knolo"));
101
+ await asset.downloadAsync();
102
+
103
+ const base64 = await FileSystem.readAsStringAsync(asset.localUri!, { encoding: FileSystem.EncodingType.Base64 });
104
+ const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
105
+
106
+ const kb = await mountPack({ src: bytes.buffer });
107
+ const hits = query(kb, "bridge throttling", { topK: 5 });
108
+ return makeContextPatch(hits, { budget: "mini" });
109
+ }
110
+ ```
111
+
112
+ ---
113
+
114
+ ## 📑 API Quick Reference
115
+
116
+ ```ts
117
+ // Build a pack from docs
118
+ buildPack(docs: { heading?: string, text: string }[]) -> Promise<Uint8Array>
119
+
120
+ // Load a pack
121
+ mountPack({ src: string | Uint8Array | ArrayBuffer }) -> Promise<Pack>
122
+
123
+ // Query
124
+ query(pack, "your query", { topK?: number, requirePhrases?: string[] }) -> Hit[]
125
+
126
+ // Create LLM-friendly patch
127
+ makeContextPatch(hits, { budget?: "mini"|"small"|"full" }) -> ContextPatch
128
+ ```
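+
+ For example, quoted phrases in the query string and the `requirePhrases` option can be combined (illustrative sketch; assumes the `kb` pack mounted in the Node example above):
+
+ ```js
+ // Quoted phrases are parsed out of the query; requirePhrases adds more.
+ // In v0 both act as strong ranking boosts on blocks that contain them verbatim.
+ const phraseHits = query(kb, '"react native" bridge', { topK: 3, requirePhrases: ["throttle events"] });
+ const miniPatch = makeContextPatch(phraseHits, { budget: "mini" }); // smallest snippet budget
+ ```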
129
+
130
+ ---
131
+
132
+ ## 🔮 Roadmap
133
+
134
+ * Multi-resolution packs (summaries + facts).
135
+ * Overlay store for user notes.
136
+ * WASM core for very large packs.
137
+
138
+ ---
139
+
140
+ 👉 With KnoLo, you can **carry knowledge with your model** — no servers, no dependencies, just a tiny portable pack.
141
+
package/bin/knolo.mjs ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env node
2
+
3
+ // Simple CLI wrapper around the KnoLo pack builder. Reads an input JSON
4
+ // containing an array of documents with `heading` and `text` fields and
5
+ // writes a `.knolo` binary pack. Requires that the compiled `dist` files
6
+ // exist (run `npm run build` before using). This script uses ESM syntax.
7
+
8
+ import { readFileSync, writeFileSync } from 'node:fs';
9
+ import { buildPack } from '../dist/builder.js';
10
+
11
+ async function main() {
12
+ const [,, inputFile, outputFile = 'knowledge.knolo'] = process.argv;
13
+ if (!inputFile) {
14
+ console.log('Usage: knolo <input.json> [output.knolo]');
15
+ process.exit(1);
16
+ }
17
+ const json = JSON.parse(readFileSync(inputFile, 'utf8'));
18
+ if (!Array.isArray(json)) {
19
+ console.error('Input JSON must be an array of objects');
20
+ process.exit(1);
21
+ }
22
+ const bytes = await buildPack(json);
23
+ writeFileSync(outputFile, Buffer.from(bytes));
24
+ console.log(`wrote ${outputFile}`);
25
+ }
26
+
27
+ main().catch((err) => {
28
+ console.error(err);
29
+ process.exit(1);
30
+ });
package/dist/builder.d.ts ADDED
@@ -0,0 +1,11 @@
1
+ export type BuildInputDoc = {
2
+ id?: string;
3
+ heading?: string;
4
+ text: string;
5
+ };
6
+ /** Build a `.knolo` pack from an array of input documents. At present each
7
+ * document becomes a single block. Future versions may split documents into
8
+ * multiple blocks based on headings or token count to improve retrieval
9
+ * granularity.
10
+ */
11
+ export declare function buildPack(docs: BuildInputDoc[]): Promise<Uint8Array>;
package/dist/builder.js ADDED
@@ -0,0 +1,81 @@
1
+ /*
2
+ * builder.ts
3
+ *
4
+ * Provides a programmatic interface for building `.knolo` knowledge packs
5
+ * from arrays of input documents. The input format accepts objects with
6
+ * `id`, `heading` and `text` fields. The builder performs simple
7
+ * Markdown stripping and calls the indexer to generate the inverted index. The
8
+ * resulting pack binary can be persisted to disk or served directly to
9
+ * clients.
10
+ */
11
+ import { buildIndex } from './indexer.js';
12
+ /** Build a `.knolo` pack from an array of input documents. At present each
13
+ * document becomes a single block. Future versions may split documents into
14
+ * multiple blocks based on headings or token count to improve retrieval
15
+ * granularity.
16
+ */
17
+ export async function buildPack(docs) {
18
+ // Convert docs into blocks. For v0 each doc is a single block; heading is
19
+ // ignored except possibly for future heading boosting in ranking.
20
+ const blocks = docs.map((d, i) => ({ id: i, text: stripMd(d.text), heading: d.heading }));
21
+ const { lexicon, postings } = buildIndex(blocks);
22
+ const meta = {
23
+ version: 1,
24
+ stats: { docs: docs.length, blocks: blocks.length, terms: lexicon.length },
25
+ };
26
+ // Encode sections to bytes
27
+ const enc = new TextEncoder();
28
+ const metaBytes = enc.encode(JSON.stringify(meta));
29
+ const lexBytes = enc.encode(JSON.stringify(lexicon));
30
+ const blocksBytes = enc.encode(JSON.stringify(blocks.map((b) => b.text)));
31
+ // Compute lengths and allocate output
32
+ const totalLength = 4 + metaBytes.length +
33
+ 4 + lexBytes.length +
34
+ 4 + postings.length * 4 +
35
+ 4 + blocksBytes.length;
36
+ const out = new Uint8Array(totalLength);
37
+ const dv = new DataView(out.buffer);
38
+ let offset = 0;
39
+ // meta
40
+ dv.setUint32(offset, metaBytes.length, true);
41
+ offset += 4;
42
+ out.set(metaBytes, offset);
43
+ offset += metaBytes.length;
44
+ // lexicon
45
+ dv.setUint32(offset, lexBytes.length, true);
46
+ offset += 4;
47
+ out.set(lexBytes, offset);
48
+ offset += lexBytes.length;
49
+ // postings
50
+ dv.setUint32(offset, postings.length, true);
51
+ offset += 4;
52
+ postings.forEach((v, k) => dv.setUint32(offset + k * 4, v, true)); // write via DataView: offset need not be 4-byte aligned
53
+ offset += postings.length * 4;
54
+ // blocks
55
+ dv.setUint32(offset, blocksBytes.length, true);
56
+ offset += 4;
57
+ out.set(blocksBytes, offset);
58
+ return out;
59
+ }
60
+ /** Strip Markdown syntax with a few regular-expression passes: code fences,
61
+ * inline code, emphasis markers, headings and link syntax are removed or
62
+ * unwrapped and whitespace is collapsed. No Markdown parser is involved. This is
63
+ * simplistic but adequate for plain text extraction.
64
+ */
65
+ function stripMd(md) {
66
+ // Remove code fences
67
+ let text = md.replace(/```[\s\S]*?```/g, ' ');
68
+ // Remove inline code backticks
69
+ text = text.replace(/`[^`]*`/g, ' ');
70
+ // Remove emphasis markers (*, _, ~)
71
+ text = text.replace(/[\*_~]+/g, ' ');
72
+ // Remove headings (#)
73
+ text = text.replace(/^#+\s*/gm, '');
74
+ // Remove links [text](url) -> text
75
+ text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1');
76
+ // Remove any remaining brackets
77
+ text = text.replace(/[\[\]()]/g, ' ');
78
+ // Collapse whitespace
79
+ text = text.replace(/\s+/g, ' ').trim();
80
+ return text;
81
+ }
package/dist/index.d.ts ADDED
@@ -0,0 +1,7 @@
1
+ export type { MountOptions, PackMeta, Pack } from './pack';
2
+ export type { QueryOptions, Hit } from './query';
3
+ export type { ContextPatch } from './patch';
4
+ export { mountPack } from './pack.js';
5
+ export { query } from './query.js';
6
+ export { makeContextPatch } from './patch.js';
7
+ export { buildPack } from './builder.js';
package/dist/index.js ADDED
@@ -0,0 +1,5 @@
1
+ // Public API surface for KnoLo core.
2
+ export { mountPack } from './pack.js';
3
+ export { query } from './query.js';
4
+ export { makeContextPatch } from './patch.js';
5
+ export { buildPack } from './builder.js';
package/dist/indexer.d.ts ADDED
@@ -0,0 +1,22 @@
1
+ export type Block = {
2
+ id: number;
3
+ text: string;
4
+ heading?: string;
5
+ };
6
+ export type IndexBuildResult = {
7
+ lexicon: Array<[string, number]>;
8
+ postings: Uint32Array;
9
+ };
10
+ /**
11
+ * Build an inverted index from an array of blocks. The postings list format
12
+ * encodes, for each term, a header containing the termId, followed by
13
+ * sequences of blockId and positions for that term, with zeros as delimiters.
14
+ * The structure looks like:
15
+ *
16
+ * [termId, blockId, pos, pos, 0, blockId, pos, 0, 0, termId, ...]
17
+ *
18
+ * Each block section ends with a 0 and each term section ends with a 0; block ids and positions are stored with a +1 offset so that 0 never occurs as data. The
19
+ * entire array can be streamed sequentially without needing to know the sizes
20
+ * of individual lists ahead of time.
21
+ */
22
+ export declare function buildIndex(blocks: Block[]): IndexBuildResult;
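As a rough illustration of this layout (with the +1 offsets noted above), indexing a single block whose text tokenizes to three distinct terms would produce something like:

```js
import { buildIndex } from "./indexer.js"; // internal module; not re-exported from the package entry

// Illustrative sketch: one block, three terms ("throttle reduces events").
const { lexicon, postings } = buildIndex([{ id: 0, text: "throttle reduces events" }]);
// lexicon  -> [["throttle", 1], ["reduces", 2], ["events", 3]]
// postings -> Uint32Array [1, 1, 1, 0, 0,  2, 1, 2, 0, 0,  3, 1, 3, 0, 0]
//             each run is: termId, blockId + 1, position + 1, 0 (end of block), 0 (end of term)
```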
package/dist/indexer.js ADDED
@@ -0,0 +1,70 @@
1
+ /*
2
+ * indexer.ts
3
+ *
4
+ * Implements a basic inverted index builder. Given an array of blocks, it
5
+ * produces a lexicon mapping each unique term to a term identifier and a
6
+ * flattened postings array. This representation is intentionally naïve to
7
+ * prioritise clarity and portability over maximum compression. The pack
8
+ * builder can later swap this implementation for a more compact format.
9
+ */
10
+ import { tokenize } from "./tokenize.js";
11
+ /**
12
+ * Build an inverted index from an array of blocks. The postings list format
13
+ * encodes, for each term, a header containing the termId, followed by
14
+ * sequences of blockId and positions for that term, with zeros as delimiters.
15
+ * The structure looks like:
16
+ *
17
+ * [termId, blockId, pos, pos, 0, blockId, pos, 0, 0, termId, ...]
18
+ *
19
+ * Each block section ends with a 0 and each term section ends with a 0; block ids and positions are stored with a +1 offset so that 0 never occurs as data. The
20
+ * entire array can be streamed sequentially without needing to know the sizes
21
+ * of individual lists ahead of time.
22
+ */
23
+ export function buildIndex(blocks) {
24
+ // Map term to termId and interim map of termId -> blockId -> positions
25
+ const term2id = new Map();
26
+ const termBlockPositions = new Map();
27
+ const getTermId = (t) => {
28
+ let id = term2id.get(t);
29
+ if (id === undefined) {
30
+ id = term2id.size + 1; // term IDs start at 1
31
+ term2id.set(t, id);
32
+ }
33
+ return id;
34
+ };
35
+ // Build a local term frequency map per block, then populate the global map
36
+ for (const block of blocks) {
37
+ const toks = tokenize(block.text);
38
+ const perTermPositions = new Map();
39
+ for (const tk of toks) {
40
+ const id = getTermId(tk.term);
41
+ let positions = perTermPositions.get(id);
42
+ if (!positions) {
43
+ positions = [];
44
+ perTermPositions.set(id, positions);
45
+ }
46
+ positions.push(tk.pos + 1); // store 1-based positions so 0 stays free as a delimiter
47
+ }
48
+ // Merge into global structure
49
+ for (const [tid, positions] of perTermPositions) {
50
+ let blockMap = termBlockPositions.get(tid);
51
+ if (!blockMap) {
52
+ blockMap = new Map();
53
+ termBlockPositions.set(tid, blockMap);
54
+ }
55
+ blockMap.set(block.id, positions);
56
+ }
57
+ }
58
+ // Flatten postings into a single Uint32Array
59
+ const postings = [];
60
+ for (const [tid, blockMap] of termBlockPositions) {
61
+ postings.push(tid);
62
+ for (const [bid, positions] of blockMap) {
63
+ postings.push(bid + 1, ...positions, 0); // block ids are likewise stored + 1 so 0 only marks section ends
64
+ }
65
+ postings.push(0); // end of term
66
+ }
67
+ // Convert lexicon to array for serialization
68
+ const lexicon = Array.from(term2id.entries());
69
+ return { lexicon, postings: new Uint32Array(postings) };
70
+ }
package/dist/pack.d.ts ADDED
@@ -0,0 +1,47 @@
1
+ export type MountOptions = {
2
+ src: string | ArrayBufferLike | Uint8Array;
3
+ };
4
+ /** Metadata about the pack. Version numbers should increment with format
5
+ * changes, allowing the runtime to adapt accordingly. */
6
+ export type PackMeta = {
7
+ version: number;
8
+ stats: {
9
+ docs: number;
10
+ blocks: number;
11
+ terms: number;
12
+ };
13
+ };
14
+ /**
15
+ * A mounted pack exposing the inverted index, block text and optional field
16
+ * metadata. The core runtime reads from these structures directly at query
17
+ * time.
18
+ */
19
+ export type Pack = {
20
+ meta: PackMeta;
21
+ /** Map of token to term identifier used in the postings list. */
22
+ lexicon: Map<string, number>;
23
+ /** Flattened postings list where each term section starts with the termId
24
+ * followed by (blockId, positions..., 0) tuples, ending with a 0. Block ids and positions carry a +1 offset so 0 only appears as a delimiter.
25
+ */
26
+ postings: Uint32Array;
27
+ /** Array of block texts. Each block corresponds to a chunk of the original
28
+ * documents. The blockId used in the postings list indexes into this array.
29
+ */
30
+ blocks: string[];
31
+ };
32
+ /**
33
+ * Load a `.knolo` pack from a variety of sources. The pack binary layout is
34
+ * currently:
35
+ *
36
+ * [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
37
+ *
38
+ * All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
39
+ * the byte lengths of the subsequent JSON sections. `postCount` is the number
40
+ * of 32‑bit integers in the postings array. This simple layout is sufficient
41
+ * for v0 and avoids any additional dependencies beyond standard typed arrays.
42
+ *
43
+ * @param opts Options specifying how to load the pack. Accepts a URL string,
44
+ * ArrayBuffer, or Uint8Array.
45
+ * @returns A Promise resolving to a mounted pack with the index and blocks.
46
+ */
47
+ export declare function mountPack(opts: MountOptions): Promise<Pack>;
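Since the header fields are plain little-endian u32 lengths, the metadata section can be inspected without mounting the whole pack. A minimal Node sketch (the file name is hypothetical):

```js
import { readFileSync } from "node:fs";

// Read just [metaLen:u32 LE][meta JSON] from the front of a pack file.
const bytes = readFileSync("./mypack.knolo");
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
const metaLen = view.getUint32(0, true);
const meta = JSON.parse(new TextDecoder().decode(bytes.subarray(4, 4 + metaLen)));
console.log(meta.version, meta.stats); // e.g. 1 { docs: 3, blocks: 3, terms: 42 }
```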
package/dist/pack.js ADDED
@@ -0,0 +1,77 @@
1
+ /*
2
+ * pack.ts
3
+ *
4
+ * Defines the pack structure and implements the mountPack function that loads
5
+ * a `.knolo` pack from a variety of sources. A pack consists of a header
6
+ * containing metadata, followed by JSON‑encoded lexicon and blocks and a
7
+ * flattened postings list encoded as a 32‑bit integer array. The pack is
8
+ * portable across Node.js and browser environments.
9
+ */
10
+ /**
11
+ * Load a `.knolo` pack from a variety of sources. The pack binary layout is
12
+ * currently:
13
+ *
14
+ * [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
15
+ *
16
+ * All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
17
+ * the byte lengths of the subsequent JSON sections. `postCount` is the number
18
+ * of 32‑bit integers in the postings array. This simple layout is sufficient
19
+ * for v0 and avoids any additional dependencies beyond standard typed arrays.
20
+ *
21
+ * @param opts Options specifying how to load the pack. Accepts a URL string,
22
+ * ArrayBuffer, or Uint8Array.
23
+ * @returns A Promise resolving to a mounted pack with the index and blocks.
24
+ */
25
+ export async function mountPack(opts) {
26
+ const buf = await resolveToBuffer(opts.src);
27
+ const dv = new DataView(buf);
28
+ let offset = 0;
29
+ // Read meta section
30
+ const metaLen = dv.getUint32(offset, true);
31
+ offset += 4;
32
+ const metaJson = new TextDecoder().decode(new Uint8Array(buf, offset, metaLen));
33
+ offset += metaLen;
34
+ const meta = JSON.parse(metaJson);
35
+ // Read lexicon
36
+ const lexLen = dv.getUint32(offset, true);
37
+ offset += 4;
38
+ const lexJson = new TextDecoder().decode(new Uint8Array(buf, offset, lexLen));
39
+ offset += lexLen;
40
+ const lexEntries = JSON.parse(lexJson);
41
+ const lexicon = new Map(lexEntries);
42
+ // Read postings
43
+ const postCount = dv.getUint32(offset, true);
44
+ offset += 4;
45
+ const postings = Uint32Array.from({ length: postCount }, (_, k) => dv.getUint32(offset + k * 4, true)); // copy via DataView: offset need not be 4-byte aligned
46
+ offset += postCount * 4;
47
+ // Read blocks
48
+ const blocksLen = dv.getUint32(offset, true);
49
+ offset += 4;
50
+ const blocksJson = new TextDecoder().decode(new Uint8Array(buf, offset, blocksLen));
51
+ const blocks = JSON.parse(blocksJson);
52
+ return { meta, lexicon, postings, blocks };
53
+ }
54
+ /** Resolve the `src` field of MountOptions into an ArrayBuffer. Supports:
55
+ * - strings interpreted as URLs (via fetch)
56
+ * - Uint8Array and ArrayBuffer inputs
57
+ */
58
+ async function resolveToBuffer(src) {
59
+ if (typeof src === 'string') {
60
+ // Use fetch for browser and Node environments. For Node this requires the
61
+ // global fetch API (available since Node 18). Error handling is delegated
62
+ // to the caller.
63
+ const res = await fetch(src);
64
+ const ab = await res.arrayBuffer();
65
+ return ab;
66
+ }
67
+ if (src instanceof Uint8Array) {
68
+ // If the view covers the whole buffer, return it directly (cast to ArrayBuffer).
69
+ if (src.byteOffset === 0 && src.byteLength === src.buffer.byteLength) {
70
+ return src.buffer;
71
+ }
72
+ // Otherwise, copy to a new buffer so we return exactly the bytes for this view.
73
+ const copy = src.slice(); // makes a copy of the bytes for this range
74
+ return copy.buffer; // typed as ArrayBuffer
75
+ }
76
+ return src;
77
+ }
package/dist/patch.d.ts ADDED
@@ -0,0 +1,29 @@
1
+ import type { Hit } from "./query.js";
2
+ export type ContextPatch = {
3
+ background: string[];
4
+ snippets: Array<{
5
+ text: string;
6
+ source?: string;
7
+ }>;
8
+ definitions: Array<{
9
+ term: string;
10
+ def: string;
11
+ evidence?: number[];
12
+ }>;
13
+ facts: Array<{
14
+ s: string;
15
+ p: string;
16
+ o: string;
17
+ evidence?: number[];
18
+ }>;
19
+ };
20
+ /** Assemble a context patch from an array of hits. The `budget` determines
21
+ * how many snippets and how much text to include in each snippet. Currently
22
+ * three budgets are supported:
23
+ * - `mini`: ~512 token contexts
24
+ * - `small`: ~1k token contexts
25
+ * - `full`: ~2k token contexts
26
+ */
27
+ export declare function makeContextPatch(hits: Hit[], opts?: {
28
+ budget?: 'mini' | 'small' | 'full';
29
+ }): ContextPatch;
package/dist/patch.js ADDED
@@ -0,0 +1,50 @@
1
+ /*
2
+ * patch.ts
3
+ *
4
+ * Provides the makeContextPatch function that assembles search results into
5
+ * structured context patches for consumption by language models. A context
6
+ * patch includes a background summary and selected snippets. Future versions
7
+ * may include definitions, facts and other structured artifacts.
8
+ */
9
+ /** Assemble a context patch from an array of hits. The `budget` determines
10
+ * how many snippets and how much text to include in each snippet. Currently
11
+ * three budgets are supported:
12
+ * - `mini`: ~512 token contexts
13
+ * - `small`: ~1k token contexts
14
+ * - `full`: ~2k token contexts
15
+ */
16
+ export function makeContextPatch(hits, opts = {}) {
17
+ const budget = opts.budget ?? 'small';
18
+ const limits = {
19
+ mini: { snippets: 3, chars: 240 },
20
+ small: { snippets: 6, chars: 420 },
21
+ full: { snippets: 10, chars: 900 },
22
+ };
23
+ const limit = limits[budget];
24
+ const snippets = hits.slice(0, limit.snippets).map((h) => ({
25
+ text: truncate(h.text, limit.chars),
26
+ }));
27
+ // Build background summary from first two snippets by extracting first sentence
28
+ const background = snippets.slice(0, 2).map((s) => firstSentence(s.text));
29
+ return {
30
+ background,
31
+ snippets,
32
+ definitions: [],
33
+ facts: [],
34
+ };
35
+ }
36
+ /** Extract the first sentence from a block of text. If no terminal punctuation
37
+ * is found, returns the first N characters up to a reasonable length.
38
+ */
39
+ function firstSentence(text) {
40
+ const m = text.match(/^(.{10,200}?[.!?])\s/);
41
+ if (m)
42
+ return m[1];
43
+ return text.slice(0, 160);
44
+ }
45
+ /** Truncate text to a maximum length and append an ellipsis if it was
46
+ * truncated.
47
+ */
48
+ function truncate(text, maxChars) {
49
+ return text.length > maxChars ? text.slice(0, maxChars) + '…' : text;
50
+ }
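A quick sketch of how the budgets play out in practice (the hit objects are invented; their shape follows `Hit` from dist/query.d.ts):

```js
import { makeContextPatch } from "./patch.js";

// Two fabricated hits, just to show the output shape.
const hits = [
  { blockId: 1, score: 2.1, text: "Throttling reduces the frequency of events to avoid flooding the bridge." },
  { blockId: 0, score: 1.4, text: "The bridge sends messages between JS and native. You can throttle events to reduce jank." },
];
const patch = makeContextPatch(hits, { budget: "mini" });
// patch.snippets   -> at most 3 snippets, each truncated to 240 characters for "mini"
// patch.background -> a short lead sentence taken from each of the first two snippets
// patch.definitions / patch.facts -> empty arrays in v0
```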
package/dist/query.d.ts ADDED
@@ -0,0 +1,18 @@
1
+ import type { Pack } from './pack';
2
+ export type QueryOptions = {
3
+ topK?: number;
4
+ /** Additional phrases (unquoted) to match; in v0 they strongly boost matching blocks rather than hard-filter. */
5
+ requirePhrases?: string[];
6
+ };
7
+ export type Hit = {
8
+ blockId: number;
9
+ score: number;
10
+ text: string;
11
+ source?: string;
12
+ };
13
+ /** Execute a search against a mounted pack. The query string can contain
14
+ * quoted phrases; unquoted terms are treated individually. The `topK`
15
+ * parameter controls how many results are returned. If `requirePhrases`
16
+ * contains strings, blocks containing those phrases verbatim are ranked higher (v0 boosts rather than hard-filters).
17
+ */
18
+ export declare function query(pack: Pack, q: string, opts?: QueryOptions): Hit[];
package/dist/query.js ADDED
@@ -0,0 +1,98 @@
1
+ /*
2
+ * query.ts
3
+ *
4
+ * Implements the main query function that ties together tokenization, lexical
5
+ * lookup in the pack's lexicon, scanning the postings list to generate
6
+ * candidates, and ranking them using the BM25L ranker. Phrase detection
7
+ * operates on the block text itself and is naive but effective for short
8
+ * queries. Future versions may incorporate more advanced mechanisms.
9
+ */
10
+ import { tokenize, parsePhrases } from "./tokenize.js";
11
+ import { rankBM25L } from "./rank.js";
12
+ /** Execute a search against a mounted pack. The query string can contain
13
+ * quoted phrases; unquoted terms are treated individually. The `topK`
14
+ * parameter controls how many results are returned. If `requirePhrases`
15
+ * contains strings, blocks containing those phrases verbatim are ranked higher (v0 boosts rather than hard-filters).
16
+ */
17
+ export function query(pack, q, opts = {}) {
18
+ const topK = opts.topK ?? 10;
19
+ const normTokens = tokenize(q).map((t) => t.term);
20
+ const phraseTerms = parsePhrases(q);
21
+ // Include explicitly provided phrases
22
+ if (opts.requirePhrases) {
23
+ for (const p of opts.requirePhrases) {
24
+ const terms = p.split(/\s+/).filter(Boolean);
25
+ if (terms.length > 0)
26
+ phraseTerms.push(terms);
27
+ }
28
+ }
29
+ // Translate tokens to term IDs. Terms not in the lexicon are skipped.
30
+ const termIds = normTokens
31
+ .map((t) => pack.lexicon.get(t))
32
+ .filter((id) => id !== undefined);
33
+ // Map from blockId to candidate data (term frequencies, flags)
34
+ const candidates = new Map();
35
+ // Scan postings list. The format is [termId, blockId, pos... 0, blockId, pos... 0, 0, termId, ...]
36
+ const p = pack.postings;
37
+ let i = 0;
38
+ while (i < p.length) {
39
+ const tid = p[i++];
40
+ if (tid === 0)
41
+ continue;
42
+ const relevant = termIds.includes(tid);
43
+ let bid = p[i++];
44
+ while (bid !== 0) {
45
+ let pos = p[i++];
46
+ const positions = [];
47
+ while (pos !== 0) {
48
+ positions.push(pos);
49
+ pos = p[i++];
50
+ }
51
+ if (relevant) {
52
+ let entry = candidates.get(bid - 1); // postings store blockId + 1; undo the offset here
53
+ if (!entry) {
54
+ entry = { tf: new Map() };
55
+ candidates.set(bid - 1, entry);
56
+ }
57
+ // accumulate tf per termId; positions array length is tf
58
+ entry.tf.set(tid, positions.length);
59
+ }
60
+ bid = p[i++];
61
+ }
62
+ // end of term section; skip trailing zero already consumed
63
+ }
64
+ // Check phrases on candidate texts. A phrase match sets a flag that the
65
+ // ranker applies as a multiplicative boost; candidates without it are kept.
66
+ for (const [bid, data] of candidates) {
67
+ const text = pack.blocks[bid] || '';
68
+ data.hasPhrase = phraseTerms.some((seq) => containsPhrase(text, seq));
69
+ }
70
+ // Compute average block length for ranking normalization
71
+ const avgLen = pack.blocks.length
72
+ ? pack.blocks.reduce((sum, b) => sum + tokenize(b).length, 0) / pack.blocks.length
73
+ : 1;
74
+ const ranked = rankBM25L(candidates, avgLen);
75
+ return ranked.slice(0, topK).map((res) => ({
76
+ blockId: res.blockId,
77
+ score: res.score,
78
+ text: pack.blocks[res.blockId] || '',
79
+ }));
80
+ }
81
+ /** Determine whether the given sequence of terms appears in order within the
82
+ * text. The algorithm tokenizes the text and performs a sliding window
83
+ * comparison. This is case‑insensitive and uses the same normalization as
84
+ * other parts of the system.
85
+ */
86
+ function containsPhrase(text, seq) {
87
+ if (seq.length === 0)
88
+ return false;
89
+ const toks = tokenize(text).map((t) => t.term);
90
+ outer: for (let i = 0; i <= toks.length - seq.length; i++) {
91
+ for (let j = 0; j < seq.length; j++) {
92
+ if (toks[i + j] !== seq[j])
93
+ continue outer;
94
+ }
95
+ return true;
96
+ }
97
+ return false;
98
+ }
package/dist/rank.d.ts ADDED
@@ -0,0 +1,20 @@
1
+ export type RankOptions = {
2
+ k1?: number;
3
+ b?: number;
4
+ headingBoost?: number;
5
+ phraseBoost?: number;
6
+ };
7
+ /**
8
+ * Rank a set of candidate blocks using BM25L. Each candidate carries a term
9
+ * frequency (tf) map keyed by termId. Additional properties may include
10
+ * `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
11
+ * document length (avgLen) is required for BM25L normalization.
12
+ */
13
+ export declare function rankBM25L(candidates: Map<number, {
14
+ tf: Map<number, number>;
15
+ hasPhrase?: boolean;
16
+ headingScore?: number;
17
+ }>, avgLen: number, opts?: RankOptions): Array<{
18
+ blockId: number;
19
+ score: number;
20
+ }>;
package/dist/rank.js ADDED
@@ -0,0 +1,43 @@
1
+ /*
2
+ * rank.ts
3
+ *
4
+ * Implements a simple BM25L ranker with optional boosts for headings and
5
+ * phrase matches. The inputs to the ranker are a map of block IDs to term
6
+ * frequency maps and additional boolean flags. The outputs are sorted by
7
+ * descending relevance score. This ranking algorithm can be replaced or
8
+ * augmented in the future without impacting the public API of the query
9
+ * function.
10
+ */
11
+ /**
12
+ * Rank a set of candidate blocks using BM25L. Each candidate carries a term
13
+ * frequency (tf) map keyed by termId. Additional properties may include
14
+ * `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
15
+ * document length (avgLen) is required for BM25L normalization.
16
+ */
17
+ export function rankBM25L(candidates, avgLen, opts = {}) {
18
+ const k1 = opts.k1 ?? 1.5;
19
+ const b = opts.b ?? 0.75;
20
+ const headingBoost = opts.headingBoost ?? 0.3;
21
+ const phraseBoost = opts.phraseBoost ?? 0.6;
22
+ const results = [];
23
+ for (const [bid, data] of candidates) {
24
+ const len = Array.from(data.tf.values()).reduce((sum, tf) => sum + tf, 0) || 1; // approximate length: sum of matched-term frequencies only
25
+ let score = 0;
26
+ for (const [, tf] of data.tf) {
27
+ const idf = 1; // placeholder; no document frequency available in v0
28
+ const numer = tf * (k1 + 1);
29
+ const denom = tf + k1 * (1 - b + b * (len / avgLen));
30
+ score += idf * (numer / denom);
31
+ }
32
+ // Apply boosts multiplicatively
33
+ if (data.hasPhrase) {
34
+ score *= 1 + phraseBoost;
35
+ }
36
+ if (data.headingScore) {
37
+ score *= 1 + headingBoost * data.headingScore;
38
+ }
39
+ results.push({ blockId: bid, score });
40
+ }
41
+ results.sort((a, b2) => b2.score - a.score);
42
+ return results;
43
+ }
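To make the arithmetic concrete, a small worked example with the default parameters (idf is fixed at 1 in v0):

```js
// One matched term with tf = 2 in a candidate whose length ratio len / avgLen is 1:
const k1 = 1.5, b = 0.75, tf = 2, lenRatio = 1;
const termScore = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * lenRatio)); // = 5 / 3.5 ≈ 1.43
const boosted = termScore * (1 + 0.6); // ≈ 2.29 if the block also carries a phrase match
```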
package/dist/tokenize.d.ts ADDED
@@ -0,0 +1,24 @@
1
+ export type Token = {
2
+ term: string;
3
+ pos: number;
4
+ };
5
+ /** Normalize a string by:
6
+ * - Applying NFKD Unicode normalization
7
+ * - Stripping combining diacritics
8
+ * - Converting to lowercase
9
+ * - Replacing non‑alphanumeric characters (except hyphen and space) with space
10
+ *
11
+ * This normalization ensures that accents and case do not affect matching.
12
+ */
13
+ export declare function normalize(s: string): string;
14
+ /** Split a piece of text into tokens with positional information. Each token
15
+ * contains the normalized term and its position in the sequence. Positions
16
+ * increment only on actual tokens, ignoring multiple whitespace separators.
17
+ */
18
+ export declare function tokenize(text: string): Token[];
19
+ /** Parse quoted phrases in a query string. A phrase is a sequence of words
20
+ * enclosed in double quotes. Returns an array of term arrays representing
21
+ * each phrase. Single words outside quotes are ignored here and handled
22
+ * separately by tokenize().
23
+ */
24
+ export declare function parsePhrases(q: string): string[][];
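A few illustrative calls (expected values worked out by hand from the rules above):

```js
import { normalize, tokenize, parsePhrases } from "./tokenize.js";

normalize("Café au lait, s'il vous plaît!");      // -> "cafe au lait  s il vous plait "
tokenize("well-known APIs").map((t) => t.term);   // -> ["well-known", "apis"]
parsePhrases('rate limits "per second"');         // -> [["per", "second"]]
```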
package/dist/tokenize.js ADDED
@@ -0,0 +1,53 @@
1
+ /*
2
+ * tokenize.ts
3
+ *
4
+ * Provides functions for normalizing strings and splitting them into tokens
5
+ * suitable for indexing and querying. This module deliberately avoids any
6
+ * language‑specific stemming or lemmatization to keep the core simple and
7
+ * deterministic across platforms. Basic Unicode normalization and diacritic
8
+ * stripping are applied to ensure consistent matches.
9
+ */
10
+ /** Normalize a string by:
11
+ * - Applying NFKD Unicode normalization
12
+ * - Stripping combining diacritics
13
+ * - Converting to lowercase
14
+ * - Replacing non‑alphanumeric characters (except hyphen and space) with space
15
+ *
16
+ * This normalization ensures that accents and case do not affect matching.
17
+ */
18
+ export function normalize(s) {
19
+ return s
20
+ .normalize('NFKD')
21
+ .replace(/\p{M}+/gu, '')
22
+ .toLowerCase()
23
+ .replace(/[^\p{L}\p{N}\s-]/gu, ' ');
24
+ }
25
+ /** Split a piece of text into tokens with positional information. Each token
26
+ * contains the normalized term and its position in the sequence. Positions
27
+ * increment only on actual tokens, ignoring multiple whitespace separators.
28
+ */
29
+ export function tokenize(text) {
30
+ const norm = normalize(text);
31
+ const out = [];
32
+ let pos = 0;
33
+ for (const w of norm.split(/\s+/).filter(Boolean)) {
34
+ out.push({ term: w, pos: pos++ });
35
+ }
36
+ return out;
37
+ }
38
+ /** Parse quoted phrases in a query string. A phrase is a sequence of words
39
+ * enclosed in double quotes. Returns an array of term arrays representing
40
+ * each phrase. Single words outside quotes are ignored here and handled
41
+ * separately by tokenize().
42
+ */
43
+ export function parsePhrases(q) {
44
+ const parts = [];
45
+ const regex = /"([^\"]+)"/g;
46
+ let match;
47
+ while ((match = regex.exec(q)) !== null) {
48
+ const phrase = match[1].trim().split(/\s+/);
49
+ if (phrase.length > 0)
50
+ parts.push(phrase);
51
+ }
52
+ return parts;
53
+ }
package/package.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "name": "knolo-core",
3
+ "version": "0.1.0",
4
+ "description": "Local-first knowledge packs for small LLMs.",
5
+ "keywords": ["llm", "knowledge-base", "rag", "local", "expo"],
6
+ "author": "Your Name",
7
+ "license": "MIT",
8
+ "main": "./dist/index.js",
9
+ "types": "./dist/index.d.ts",
10
+ "files": ["dist", "bin", "README.md", "LICENSE"],
11
+ "bin": { "knolo": "./bin/knolo.mjs" },
12
+ "scripts": {
13
+ "build": "tsc -p tsconfig.json",
14
+ "prepublishOnly": "npm run build"
15
+ },
16
+ "devDependencies": {
17
+ "typescript": "^5.5.0",
18
+ "@types/node": "^20.11.0"
19
+ }
20
+ }