knolo-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +141 -0
- package/bin/knolo.mjs +30 -0
- package/dist/builder.d.ts +11 -0
- package/dist/builder.js +81 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +5 -0
- package/dist/indexer.d.ts +22 -0
- package/dist/indexer.js +70 -0
- package/dist/pack.d.ts +47 -0
- package/dist/pack.js +77 -0
- package/dist/patch.d.ts +29 -0
- package/dist/patch.js +50 -0
- package/dist/query.d.ts +18 -0
- package/dist/query.js +98 -0
- package/dist/rank.d.ts +20 -0
- package/dist/rank.js +43 -0
- package/dist/tokenize.d.ts +24 -0
- package/dist/tokenize.js +53 -0
- package/package.json +20 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 KnoLo Contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,141 @@

# 🧠 KnoLo Core

KnoLo Core is a **local-first knowledge base system** for small language models (LLMs).
It lets you package your own documents into a compact `.knolo` file and query them deterministically — **no embeddings, no vector DBs, no cloud**. Perfect for **on-device LLMs**.

---

## ✨ Features

* 📦 **Single-file packs** (`.knolo`) you can ship or load offline.
* 🔎 **Deterministic lexical retrieval** (BM25L + phrase + heading boosts).
* ⚡ **Tiny & fast** — runs in Node, browsers, and Expo.
* 📑 **Context Patches**: structured snippets for direct LLM input.
* 🔒 **Privacy-first**: all data stays local.

---

## 📦 Install

```bash
# local build
npm install
npm run build

# or if published later
npm install @knolo/core
```

---

## 🚀 Usage Examples

### 1. Node.js (in-memory build + query)

```js
import { buildPack, mountPack, query, makeContextPatch } from "./dist/index.js";

const docs = [
  { heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events to reduce jank." },
  { heading: "Throttling", text: "Throttling reduces frequency of events to avoid flooding the bridge." },
  { heading: "Debounce vs Throttle", text: "Debounce waits for silence, throttle guarantees a max rate." }
];

// Build a pack in memory
const bytes = await buildPack(docs);

// Mount it
const kb = await mountPack({ src: bytes });

// Query
const hits = query(kb, "react native bridge throttling", { topK: 5 });
console.log("Top hits:", hits);

// Turn into an LLM-friendly context patch
const patch = makeContextPatch(hits, { budget: "small" });
console.log("Context Patch:", patch);
```

---

### 2. CLI (build `.knolo` file)

**Prepare `docs.json`:**

```json
[
  { "heading": "Guide", "text": "Install deps.\n\n## Throttle\nLimit frequency of events." },
  { "heading": "FAQ", "text": "What is throttling? It reduces event frequency." }
]
```

**Build the pack:**

```bash
# writes mypack.knolo
node bin/knolo.mjs docs.json mypack.knolo
```

**Query it in a script:**

```js
import { mountPack, query } from "./dist/index.js";

const kb = await mountPack({ src: "./mypack.knolo" });
const hits = query(kb, "throttle events", { topK: 3 });
console.log(hits);
```

---

### 3. React / Expo (load from asset)

```ts
import { Asset } from "expo-asset";
import * as FileSystem from "expo-file-system";
import { mountPack, query, makeContextPatch } from "@knolo/core";

async function loadKnowledge() {
  const asset = Asset.fromModule(require("./assets/mypack.knolo"));
  await asset.downloadAsync();

  const base64 = await FileSystem.readAsStringAsync(asset.localUri!, { encoding: FileSystem.EncodingType.Base64 });
  const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));

  const kb = await mountPack({ src: bytes.buffer });
  const hits = query(kb, "bridge throttling", { topK: 5 });
  return makeContextPatch(hits, { budget: "mini" });
}
```

---

## 📑 API Quick Reference

```ts
// Build a pack from docs
buildPack(docs: { heading?: string, text: string }[]) -> Promise<Uint8Array>

// Load a pack
mountPack({ src: string | Uint8Array | ArrayBuffer }) -> Promise<Pack>

// Query
query(pack, "your query", { topK?: number, requirePhrases?: string[] }) -> Hit[]

// Create LLM-friendly patch
makeContextPatch(hits, { budget?: "mini"|"small"|"full" }) -> ContextPatch
```

---

## 🔮 Roadmap

* Multi-resolution packs (summaries + facts).
* Overlay store for user notes.
* WASM core for very large packs.

---

👉 With KnoLo, you can **carry knowledge with your model** — no servers, no dependencies, just a tiny portable pack.
package/bin/knolo.mjs
ADDED
@@ -0,0 +1,30 @@
#!/usr/bin/env node

// Simple CLI wrapper around the KnoLo pack builder. Reads an input JSON
// containing an array of documents with `heading` and `text` fields and
// writes a `.knolo` binary pack. Requires that the compiled `dist` files
// exist (run `npm run build` before using). This script uses ESM syntax.

import { readFileSync, writeFileSync } from 'node:fs';
import { buildPack } from '../dist/builder.js';

async function main() {
  const [,, inputFile, outputFile = 'knowledge.knolo'] = process.argv;
  if (!inputFile) {
    console.log('Usage: knolo <input.json> [output.knolo]');
    process.exit(1);
  }
  const json = JSON.parse(readFileSync(inputFile, 'utf8'));
  if (!Array.isArray(json)) {
    console.error('Input JSON must be an array of objects');
    process.exit(1);
  }
  const bytes = await buildPack(json);
  writeFileSync(outputFile, Buffer.from(bytes));
  console.log(`wrote ${outputFile}`);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
package/dist/builder.d.ts
ADDED
@@ -0,0 +1,11 @@
export type BuildInputDoc = {
    id?: string;
    heading?: string;
    text: string;
};
/** Build a `.knolo` pack from an array of input documents. At present each
 * document becomes a single block. Future versions may split documents into
 * multiple blocks based on headings or token count to improve retrieval
 * granularity.
 */
export declare function buildPack(docs: BuildInputDoc[]): Promise<Uint8Array>;
package/dist/builder.js
ADDED
@@ -0,0 +1,81 @@
/*
 * builder.ts
 *
 * Provides a programmatic interface for building `.knolo` knowledge packs
 * from arrays of input documents. The input format accepts objects with
 * `id`, `heading` and `text` fields. The builder performs simple
 * Markdown stripping and calls the indexer to generate the inverted index. The
 * resulting pack binary can be persisted to disk or served directly to
 * clients.
 */
import { buildIndex } from './indexer.js';
/** Build a `.knolo` pack from an array of input documents. At present each
 * document becomes a single block. Future versions may split documents into
 * multiple blocks based on headings or token count to improve retrieval
 * granularity.
 */
export async function buildPack(docs) {
    // Convert docs into blocks. For v0 each doc is a single block; heading is
    // ignored except possibly for future heading boosting in ranking.
    const blocks = docs.map((d, i) => ({ id: i, text: stripMd(d.text), heading: d.heading }));
    const { lexicon, postings } = buildIndex(blocks);
    const meta = {
        version: 1,
        stats: { docs: docs.length, blocks: blocks.length, terms: lexicon.length },
    };
    // Encode sections to bytes
    const enc = new TextEncoder();
    const metaBytes = enc.encode(JSON.stringify(meta));
    const lexBytes = enc.encode(JSON.stringify(lexicon));
    const blocksBytes = enc.encode(JSON.stringify(blocks.map((b) => b.text)));
    // Compute lengths and allocate output
    const totalLength = 4 + metaBytes.length +
        4 + lexBytes.length +
        4 + postings.length * 4 +
        4 + blocksBytes.length;
    const out = new Uint8Array(totalLength);
    const dv = new DataView(out.buffer);
    let offset = 0;
    // meta
    dv.setUint32(offset, metaBytes.length, true);
    offset += 4;
    out.set(metaBytes, offset);
    offset += metaBytes.length;
    // lexicon
    dv.setUint32(offset, lexBytes.length, true);
    offset += 4;
    out.set(lexBytes, offset);
    offset += lexBytes.length;
    // postings
    dv.setUint32(offset, postings.length, true);
    offset += 4;
    new Uint32Array(out.buffer, offset, postings.length).set(postings);
    offset += postings.length * 4;
    // blocks
    dv.setUint32(offset, blocksBytes.length, true);
    offset += 4;
    out.set(blocksBytes, offset);
    return out;
}
/** Strip Markdown syntax by converting to HTML and then removing tags. The
 * `marked` library is used for parsing and rendering. A very naive HTML tag
 * stripper removes tags by dropping anything between `<` and `>`. This is
 * simplistic but adequate for plain text extraction.
 */
function stripMd(md) {
    // Remove code fences
    let text = md.replace(/```[^```]*```/g, ' ');
    // Remove inline code backticks
    text = text.replace(/`[^`]*`/g, ' ');
    // Remove emphasis markers (*, _, ~)
    text = text.replace(/[\*_~]+/g, ' ');
    // Remove headings (#)
    text = text.replace(/^#+\s*/gm, '');
    // Remove links [text](url) -> text
    text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1');
    // Remove any remaining brackets
    text = text.replace(/[\[\]()]/g, ' ');
    // Collapse whitespace
    text = text.replace(/\s+/g, ' ').trim();
    return text;
}
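To make the length-prefixed layout concrete, here is a small editorial sketch (not part of the package) that builds a one-document pack and decodes the leading meta section; it assumes the compiled `dist/` output is available locally.

```js
// Sketch: build a tiny pack in memory and peek at the length-prefixed layout
// ([metaLen:u32][meta JSON]...). Assumes the compiled dist/ files are present.
import { buildPack } from './dist/builder.js';

const bytes = await buildPack([{ heading: 'Demo', text: 'Throttling reduces event frequency.' }]);
const dv = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
const metaLen = dv.getUint32(0, true); // little-endian byte length of the meta JSON
const meta = JSON.parse(new TextDecoder().decode(bytes.subarray(4, 4 + metaLen)));
console.log(meta); // roughly { version: 1, stats: { docs: 1, blocks: 1, terms: ... } }
```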
package/dist/index.d.ts
ADDED
@@ -0,0 +1,7 @@
export type { MountOptions, PackMeta, Pack } from './pack';
export type { QueryOptions, Hit } from './query';
export type { ContextPatch } from './patch';
export { mountPack } from './pack.js';
export { query } from './query.js';
export { makeContextPatch } from './patch.js';
export { buildPack } from './builder.js';
package/dist/indexer.d.ts
ADDED
@@ -0,0 +1,22 @@
export type Block = {
    id: number;
    text: string;
    heading?: string;
};
export type IndexBuildResult = {
    lexicon: Array<[string, number]>;
    postings: Uint32Array;
};
/**
 * Build an inverted index from an array of blocks. The postings list format
 * encodes, for each term, a header containing the termId, followed by
 * sequences of blockId and positions for that term, with zeros as delimiters.
 * The structure looks like:
 *
 * [termId, blockId, pos, pos, 0, blockId, pos, 0, 0, termId, ...]
 *
 * Each block section ends with a 0, and each term section ends with a 0. The
 * entire array can be streamed sequentially without needing to know the sizes
 * of individual lists ahead of time.
 */
export declare function buildIndex(blocks: Block[]): IndexBuildResult;
package/dist/indexer.js
ADDED
@@ -0,0 +1,70 @@
/*
 * indexer.ts
 *
 * Implements a basic inverted index builder. Given an array of blocks, it
 * produces a lexicon mapping each unique term to a term identifier and a
 * flattened postings array. This representation is intentionally naïve to
 * prioritise clarity and portability over maximum compression. The pack
 * builder can later swap this implementation for a more compact format.
 */
import { tokenize } from "./tokenize.js";
/**
 * Build an inverted index from an array of blocks. The postings list format
 * encodes, for each term, a header containing the termId, followed by
 * sequences of blockId and positions for that term, with zeros as delimiters.
 * The structure looks like:
 *
 * [termId, blockId, pos, pos, 0, blockId, pos, 0, 0, termId, ...]
 *
 * Each block section ends with a 0, and each term section ends with a 0. The
 * entire array can be streamed sequentially without needing to know the sizes
 * of individual lists ahead of time.
 */
export function buildIndex(blocks) {
    // Map term to termId and interim map of termId -> blockId -> positions
    const term2id = new Map();
    const termBlockPositions = new Map();
    const getTermId = (t) => {
        let id = term2id.get(t);
        if (id === undefined) {
            id = term2id.size + 1; // term IDs start at 1
            term2id.set(t, id);
        }
        return id;
    };
    // Build a local term frequency map per block, then populate the global map
    for (const block of blocks) {
        const toks = tokenize(block.text);
        const perTermPositions = new Map();
        for (const tk of toks) {
            const id = getTermId(tk.term);
            let positions = perTermPositions.get(id);
            if (!positions) {
                positions = [];
                perTermPositions.set(id, positions);
            }
            positions.push(tk.pos);
        }
        // Merge into global structure
        for (const [tid, positions] of perTermPositions) {
            let blockMap = termBlockPositions.get(tid);
            if (!blockMap) {
                blockMap = new Map();
                termBlockPositions.set(tid, blockMap);
            }
            blockMap.set(block.id, positions);
        }
    }
    // Flatten postings into a single Uint32Array
    const postings = [];
    for (const [tid, blockMap] of termBlockPositions) {
        postings.push(tid);
        for (const [bid, positions] of blockMap) {
            postings.push(bid, ...positions, 0);
        }
        postings.push(0); // end of term
    }
    // Convert lexicon to array for serialization
    const lexicon = Array.from(term2id.entries());
    return { lexicon, postings: new Uint32Array(postings) };
}
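As a rough illustration of the postings layout described in the comments above (assuming a local build with the `dist/` files present), the following sketch indexes two tiny blocks and prints the lexicon and the flattened array; the exact numbers depend on tokenization.

```js
// Illustrative sketch only: inspect the flattened postings produced by buildIndex.
// Assumes the compiled dist/ files from this package are available locally.
import { buildIndex } from './dist/indexer.js';

const blocks = [
  { id: 1, text: 'throttle the bridge' },
  { id: 2, text: 'throttle events' },
];

const { lexicon, postings } = buildIndex(blocks);
console.log(lexicon);              // e.g. [['throttle', 1], ['the', 2], ['bridge', 3], ['events', 4]]
console.log(Array.from(postings)); // per the doc comment: [termId, blockId, positions..., 0, ..., 0, termId, ...]
```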
package/dist/pack.d.ts
ADDED
@@ -0,0 +1,47 @@
export type MountOptions = {
    src: string | ArrayBufferLike | Uint8Array;
};
/** Metadata about the pack. Version numbers should increment with format
 * changes, allowing the runtime to adapt accordingly. */
export type PackMeta = {
    version: number;
    stats: {
        docs: number;
        blocks: number;
        terms: number;
    };
};
/**
 * A mounted pack exposing the inverted index, block text and optional field
 * metadata. The core runtime reads from these structures directly at query
 * time.
 */
export type Pack = {
    meta: PackMeta;
    /** Map of token to term identifier used in the postings list. */
    lexicon: Map<string, number>;
    /** Flattened postings list where each term section starts with the termId
     * followed by (blockId, positions..., 0) tuples, ending with a 0.
     */
    postings: Uint32Array;
    /** Array of block texts. Each block corresponds to a chunk of the original
     * documents. The blockId used in the postings list indexes into this array.
     */
    blocks: string[];
};
/**
 * Load a `.knolo` pack from a variety of sources. The pack binary layout is
 * currently:
 *
 * [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
 *
 * All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
 * the byte lengths of the subsequent JSON sections. `postCount` is the number
 * of 32-bit integers in the postings array. This simple layout is sufficient
 * for v0 and avoids any additional dependencies beyond standard typed arrays.
 *
 * @param opts Options specifying how to load the pack. Accepts a URL string,
 * ArrayBuffer, or Uint8Array.
 * @returns A Promise resolving to a mounted pack with the index and blocks.
 */
export declare function mountPack(opts: MountOptions): Promise<Pack>;
package/dist/pack.js
ADDED
@@ -0,0 +1,77 @@
/*
 * pack.ts
 *
 * Defines the pack structure and implements the mountPack function that loads
 * a `.knolo` pack from a variety of sources. A pack consists of a header
 * containing metadata, followed by JSON-encoded lexicon and blocks and a
 * flattened postings list encoded as a 32-bit integer array. The pack is
 * portable across Node.js and browser environments.
 */
/**
 * Load a `.knolo` pack from a variety of sources. The pack binary layout is
 * currently:
 *
 * [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
 *
 * All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
 * the byte lengths of the subsequent JSON sections. `postCount` is the number
 * of 32-bit integers in the postings array. This simple layout is sufficient
 * for v0 and avoids any additional dependencies beyond standard typed arrays.
 *
 * @param opts Options specifying how to load the pack. Accepts a URL string,
 * ArrayBuffer, or Uint8Array.
 * @returns A Promise resolving to a mounted pack with the index and blocks.
 */
export async function mountPack(opts) {
    const buf = await resolveToBuffer(opts.src);
    const dv = new DataView(buf);
    let offset = 0;
    // Read meta section
    const metaLen = dv.getUint32(offset, true);
    offset += 4;
    const metaJson = new TextDecoder().decode(new Uint8Array(buf, offset, metaLen));
    offset += metaLen;
    const meta = JSON.parse(metaJson);
    // Read lexicon
    const lexLen = dv.getUint32(offset, true);
    offset += 4;
    const lexJson = new TextDecoder().decode(new Uint8Array(buf, offset, lexLen));
    offset += lexLen;
    const lexEntries = JSON.parse(lexJson);
    const lexicon = new Map(lexEntries);
    // Read postings
    const postCount = dv.getUint32(offset, true);
    offset += 4;
    const postings = new Uint32Array(buf, offset, postCount);
    offset += postCount * 4;
    // Read blocks
    const blocksLen = dv.getUint32(offset, true);
    offset += 4;
    const blocksJson = new TextDecoder().decode(new Uint8Array(buf, offset, blocksLen));
    const blocks = JSON.parse(blocksJson);
    return { meta, lexicon, postings, blocks };
}
/** Resolve the `src` field of MountOptions into an ArrayBuffer. Supports:
 * - strings interpreted as URLs (via fetch)
 * - Uint8Array and ArrayBuffer inputs
 */
async function resolveToBuffer(src) {
    if (typeof src === 'string') {
        // Use fetch for browser and Node environments. For Node this requires the
        // global fetch API (available since Node 18). Error handling is delegated
        // to the caller.
        const res = await fetch(src);
        const ab = await res.arrayBuffer();
        return ab;
    }
    if (src instanceof Uint8Array) {
        // If the view covers the whole buffer, return it directly (cast to ArrayBuffer).
        if (src.byteOffset === 0 && src.byteLength === src.buffer.byteLength) {
            return src.buffer;
        }
        // Otherwise, copy to a new buffer so we return exactly the bytes for this view.
        const copy = src.slice(); // makes a copy of the bytes for this range
        return copy.buffer; // typed as ArrayBuffer
    }
    return src;
}
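Because the string form of `src` is resolved via `fetch()`, a Node script working with a local file can read the bytes itself and pass them in. A minimal sketch, assuming Node 18+ and a pack built as in the README:

```js
// Sketch: mount a local .knolo file in Node by passing bytes directly,
// avoiding fetch() for filesystem paths. Assumes the pack was built beforehand.
import { readFileSync } from 'node:fs';
import { mountPack, query } from './dist/index.js';

const bytes = new Uint8Array(readFileSync('./mypack.knolo'));
const kb = await mountPack({ src: bytes });
console.log(kb.meta.stats); // { docs, blocks, terms }
console.log(query(kb, 'throttle events', { topK: 3 }));
```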
package/dist/patch.d.ts
ADDED
@@ -0,0 +1,29 @@
import type { Hit } from "./query.js";
export type ContextPatch = {
    background: string[];
    snippets: Array<{
        text: string;
        source?: string;
    }>;
    definitions: Array<{
        term: string;
        def: string;
        evidence?: number[];
    }>;
    facts: Array<{
        s: string;
        p: string;
        o: string;
        evidence?: number[];
    }>;
};
/** Assemble a context patch from an array of hits. The `budget` determines
 * how many snippets and how much text to include in each snippet. Currently
 * three budgets are supported:
 * - `mini`: ~512 token contexts
 * - `small`: ~1k token contexts
 * - `full`: ~2k token contexts
 */
export declare function makeContextPatch(hits: Hit[], opts?: {
    budget?: 'mini' | 'small' | 'full';
}): ContextPatch;
package/dist/patch.js
ADDED
@@ -0,0 +1,50 @@
/*
 * patch.ts
 *
 * Provides the makeContextPatch function that assembles search results into
 * structured context patches for consumption by language models. A context
 * patch includes a background summary and selected snippets. Future versions
 * may include definitions, facts and other structured artifacts.
 */
/** Assemble a context patch from an array of hits. The `budget` determines
 * how many snippets and how much text to include in each snippet. Currently
 * three budgets are supported:
 * - `mini`: ~512 token contexts
 * - `small`: ~1k token contexts
 * - `full`: ~2k token contexts
 */
export function makeContextPatch(hits, opts = {}) {
    const budget = opts.budget ?? 'small';
    const limits = {
        mini: { snippets: 3, chars: 240 },
        small: { snippets: 6, chars: 420 },
        full: { snippets: 10, chars: 900 },
    };
    const limit = limits[budget];
    const snippets = hits.slice(0, limit.snippets).map((h) => ({
        text: truncate(h.text, limit.chars),
    }));
    // Build background summary from first two snippets by extracting first sentence
    const background = snippets.slice(0, 2).map((s) => firstSentence(s.text));
    return {
        background,
        snippets,
        definitions: [],
        facts: [],
    };
}
/** Extract the first sentence from a block of text. If no terminal punctuation
 * is found, returns the first N characters up to a reasonable length.
 */
function firstSentence(text) {
    const m = text.match(/^(.{10,200}?[.!?])\s/);
    if (m)
        return m[1];
    return text.slice(0, 160);
}
/** Truncate text to a maximum length and append an ellipsis if it was
 * truncated.
 */
function truncate(text, maxChars) {
    return text.length > maxChars ? text.slice(0, maxChars) + '…' : text;
}
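To see how the budget limits above shape the output, a short sketch (assuming `kb` is a pack mounted as in the earlier examples):

```js
// Sketch: turn query hits into a compact context patch for an LLM prompt.
// Assumes `kb` is a mounted pack.
import { query, makeContextPatch } from './dist/index.js';

const hits = query(kb, 'bridge throttling', { topK: 10 });
const patch = makeContextPatch(hits, { budget: 'mini' });
// With the 'mini' budget this keeps at most 3 snippets of ~240 characters each,
// plus a background built from the first two snippets; definitions and facts
// are empty arrays in this version.
console.log(patch.background, patch.snippets.length);
```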
package/dist/query.d.ts
ADDED
@@ -0,0 +1,18 @@
import type { Pack } from './pack';
export type QueryOptions = {
    topK?: number;
    /** Additional phrases (unquoted) that must be present in results. */
    requirePhrases?: string[];
};
export type Hit = {
    blockId: number;
    score: number;
    text: string;
    source?: string;
};
/** Execute a search against a mounted pack. The query string can contain
 * quoted phrases; unquoted terms are treated individually. The `topK`
 * parameter controls how many results are returned. If `requirePhrases`
 * contains strings, those phrases must appear verbatim in candidate blocks.
 */
export declare function query(pack: Pack, q: string, opts?: QueryOptions): Hit[];
package/dist/query.js
ADDED
@@ -0,0 +1,98 @@
/*
 * query.ts
 *
 * Implements the main query function that ties together tokenization, lexical
 * lookup in the pack's lexicon, scanning the postings list to generate
 * candidates, and ranking them using the BM25L ranker. Phrase detection
 * operates on the block text itself and is naive but effective for short
 * queries. Future versions may incorporate more advanced mechanisms.
 */
import { tokenize, parsePhrases } from "./tokenize.js";
import { rankBM25L } from "./rank.js";
/** Execute a search against a mounted pack. The query string can contain
 * quoted phrases; unquoted terms are treated individually. The `topK`
 * parameter controls how many results are returned. If `requirePhrases`
 * contains strings, those phrases must appear verbatim in candidate blocks.
 */
export function query(pack, q, opts = {}) {
    const topK = opts.topK ?? 10;
    const normTokens = tokenize(q).map((t) => t.term);
    const phraseTerms = parsePhrases(q);
    // Include explicitly provided phrases
    if (opts.requirePhrases) {
        for (const p of opts.requirePhrases) {
            const terms = p.split(/\s+/).filter(Boolean);
            if (terms.length > 0)
                phraseTerms.push(terms);
        }
    }
    // Translate tokens to term IDs. Terms not in the lexicon are skipped.
    const termIds = normTokens
        .map((t) => pack.lexicon.get(t))
        .filter((id) => id !== undefined);
    // Map from blockId to candidate data (term frequencies, flags)
    const candidates = new Map();
    // Scan postings list. The format is [termId, blockId, pos... 0, blockId, pos... 0, 0, termId, ...]
    const p = pack.postings;
    let i = 0;
    while (i < p.length) {
        const tid = p[i++];
        if (tid === 0)
            continue;
        const relevant = termIds.includes(tid);
        let bid = p[i++];
        while (bid !== 0) {
            let pos = p[i++];
            const positions = [];
            while (pos !== 0) {
                positions.push(pos);
                pos = p[i++];
            }
            if (relevant) {
                let entry = candidates.get(bid);
                if (!entry) {
                    entry = { tf: new Map() };
                    candidates.set(bid, entry);
                }
                // accumulate tf per termId; positions array length is tf
                entry.tf.set(tid, positions.length);
            }
            bid = p[i++];
        }
        // end of term section; skip trailing zero already consumed
    }
    // Check phrases on candidate texts. If a candidate does not contain all
    // required phrases, it will be filtered out in ranking.
    for (const [bid, data] of candidates) {
        const text = pack.blocks[bid] || '';
        data.hasPhrase = phraseTerms.some((seq) => containsPhrase(text, seq));
    }
    // Compute average block length for ranking normalization
    const avgLen = pack.blocks.length
        ? pack.blocks.reduce((sum, b) => sum + tokenize(b).length, 0) / pack.blocks.length
        : 1;
    const ranked = rankBM25L(candidates, avgLen);
    return ranked.slice(0, topK).map((res) => ({
        blockId: res.blockId,
        score: res.score,
        text: pack.blocks[res.blockId] || '',
    }));
}
/** Determine whether the given sequence of terms appears in order within the
 * text. The algorithm tokenizes the text and performs a sliding window
 * comparison. This is case-insensitive and uses the same normalization as
 * other parts of the system.
 */
function containsPhrase(text, seq) {
    if (seq.length === 0)
        return false;
    const toks = tokenize(text).map((t) => t.term);
    outer: for (let i = 0; i <= toks.length - seq.length; i++) {
        for (let j = 0; j < seq.length; j++) {
            if (toks[i + j] !== seq[j])
                continue outer;
        }
        return true;
    }
    return false;
}
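A brief sketch of how quoted phrases and `requirePhrases` reach the phrase check above (again assuming a mounted pack `kb`):

```js
// Sketch: phrase handling in queries. Quoted phrases in the query string and
// entries in requirePhrases are both checked against candidate block text
// (hasPhrase) before ranking. Assumes `kb` is a mounted pack.
import { query } from './dist/index.js';

// Quoted phrase inside the query string:
const a = query(kb, '"react native" bridge', { topK: 5 });

// Equivalent phrase supplied explicitly:
const b = query(kb, 'react native bridge', { requirePhrases: ['react native'], topK: 5 });

console.log(a[0]?.text, b[0]?.text);
```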
package/dist/rank.d.ts
ADDED
@@ -0,0 +1,20 @@
export type RankOptions = {
    k1?: number;
    b?: number;
    headingBoost?: number;
    phraseBoost?: number;
};
/**
 * Rank a set of candidate blocks using BM25L. Each candidate carries a term
 * frequency (tf) map keyed by termId. Additional properties may include
 * `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
 * document length (avgLen) is required for BM25L normalization.
 */
export declare function rankBM25L(candidates: Map<number, {
    tf: Map<number, number>;
    hasPhrase?: boolean;
    headingScore?: number;
}>, avgLen: number, opts?: RankOptions): Array<{
    blockId: number;
    score: number;
}>;
package/dist/rank.js
ADDED
@@ -0,0 +1,43 @@
/*
 * rank.ts
 *
 * Implements a simple BM25L ranker with optional boosts for headings and
 * phrase matches. The inputs to the ranker are a map of block IDs to term
 * frequency maps and additional boolean flags. The outputs are sorted by
 * descending relevance score. This ranking algorithm can be replaced or
 * augmented in the future without impacting the public API of the query
 * function.
 */
/**
 * Rank a set of candidate blocks using BM25L. Each candidate carries a term
 * frequency (tf) map keyed by termId. Additional properties may include
 * `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
 * document length (avgLen) is required for BM25L normalization.
 */
export function rankBM25L(candidates, avgLen, opts = {}) {
    const k1 = opts.k1 ?? 1.5;
    const b = opts.b ?? 0.75;
    const headingBoost = opts.headingBoost ?? 0.3;
    const phraseBoost = opts.phraseBoost ?? 0.6;
    const results = [];
    for (const [bid, data] of candidates) {
        const len = Array.from(data.tf.values()).reduce((sum, tf) => sum + tf, 0) || 1;
        let score = 0;
        for (const [, tf] of data.tf) {
            const idf = 1; // placeholder; no document frequency available in v0
            const numer = tf * (k1 + 1);
            const denom = tf + k1 * (1 - b + b * (len / avgLen));
            score += idf * (numer / denom);
        }
        // Apply boosts multiplicatively
        if (data.hasPhrase) {
            score *= 1 + phraseBoost;
        }
        if (data.headingScore) {
            score *= 1 + headingBoost * data.headingScore;
        }
        results.push({ blockId: bid, score });
    }
    results.sort((a, b2) => b2.score - a.score);
    return results;
}
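For intuition, a rough worked example of the per-term score with the default parameters (`idf` is fixed at 1 in this version):

```js
// Rough worked example of the per-term BM25L contribution with defaults
// k1 = 1.5, b = 0.75, idf = 1, for a block whose length equals avgLen.
const k1 = 1.5, b = 0.75, idf = 1;
const tf = 2;                                   // the query term occurs twice in the block
const lenRatio = 1;                             // len / avgLen
const numer = tf * (k1 + 1);                    // 2 * 2.5 = 5
const denom = tf + k1 * (1 - b + b * lenRatio); // 2 + 1.5 = 3.5
const perTerm = idf * (numer / denom);          // ≈ 1.43
const withPhraseBoost = perTerm * (1 + 0.6);    // ≈ 2.29 if the block also matches a phrase
console.log(perTerm, withPhraseBoost);
```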
package/dist/tokenize.d.ts
ADDED
@@ -0,0 +1,24 @@
export type Token = {
    term: string;
    pos: number;
};
/** Normalize a string by:
 * - Applying NFKD Unicode normalization
 * - Stripping combining diacritics
 * - Converting to lowercase
 * - Replacing non-alphanumeric characters (except hyphen and space) with space
 *
 * This normalization ensures that accents and case do not affect matching.
 */
export declare function normalize(s: string): string;
/** Split a piece of text into tokens with positional information. Each token
 * contains the normalized term and its position in the sequence. Positions
 * increment only on actual tokens, ignoring multiple whitespace separators.
 */
export declare function tokenize(text: string): Token[];
/** Parse quoted phrases in a query string. A phrase is a sequence of words
 * enclosed in double quotes. Returns an array of term arrays representing
 * each phrase. Single words outside quotes are ignored here and handled
 * separately by tokenize().
 */
export declare function parsePhrases(q: string): string[][];
package/dist/tokenize.js
ADDED
@@ -0,0 +1,53 @@
/*
 * tokenize.ts
 *
 * Provides functions for normalizing strings and splitting them into tokens
 * suitable for indexing and querying. This module deliberately avoids any
 * language-specific stemming or lemmatization to keep the core simple and
 * deterministic across platforms. Basic Unicode normalization and diacritic
 * stripping are applied to ensure consistent matches.
 */
/** Normalize a string by:
 * - Applying NFKD Unicode normalization
 * - Stripping combining diacritics
 * - Converting to lowercase
 * - Replacing non-alphanumeric characters (except hyphen and space) with space
 *
 * This normalization ensures that accents and case do not affect matching.
 */
export function normalize(s) {
    return s
        .normalize('NFKD')
        .replace(/\p{M}+/gu, '')
        .toLowerCase()
        .replace(/[^\p{L}\p{N}\s-]/gu, ' ');
}
/** Split a piece of text into tokens with positional information. Each token
 * contains the normalized term and its position in the sequence. Positions
 * increment only on actual tokens, ignoring multiple whitespace separators.
 */
export function tokenize(text) {
    const norm = normalize(text);
    const out = [];
    let pos = 0;
    for (const w of norm.split(/\s+/).filter(Boolean)) {
        out.push({ term: w, pos: pos++ });
    }
    return out;
}
/** Parse quoted phrases in a query string. A phrase is a sequence of words
 * enclosed in double quotes. Returns an array of term arrays representing
 * each phrase. Single words outside quotes are ignored here and handled
 * separately by tokenize().
 */
export function parsePhrases(q) {
    const parts = [];
    const regex = /"([^\"]+)"/g;
    let match;
    while ((match = regex.exec(q)) !== null) {
        const phrase = match[1].trim().split(/\s+/);
        if (phrase.length > 0)
            parts.push(phrase);
    }
    return parts;
}
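A quick sketch of what these helpers return; the outputs in the comments are approximate and assume the implementation above:

```js
// Sketch: tokenizer behaviour on small inputs.
import { normalize, tokenize, parsePhrases } from './dist/tokenize.js';

console.log(normalize('Café-Étude!'));
// -> roughly 'cafe-etude ' (diacritics stripped, lowercased, '!' replaced by a space)

console.log(tokenize('Throttle the Bridge'));
// -> [{ term: 'throttle', pos: 0 }, { term: 'the', pos: 1 }, { term: 'bridge', pos: 2 }]

console.log(parsePhrases('"react native" bridge'));
// -> [['react', 'native']]
```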
package/package.json
ADDED
@@ -0,0 +1,20 @@
{
  "name": "knolo-core",
  "version": "0.1.0",
  "description": "Local-first knowledge packs for small LLMs.",
  "keywords": ["llm", "knowledge-base", "rag", "local", "expo"],
  "author": "Your Name",
  "license": "MIT",
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "files": ["dist", "bin", "README.md", "LICENSE"],
  "bin": { "knolo": "./bin/knolo.mjs" },
  "scripts": {
    "build": "tsc -p tsconfig.json",
    "prepublishOnly": "npm run build"
  },
  "devDependencies": {
    "typescript": "^5.5.0",
    "@types/node": "^20.11.0"
  }
}