knolo-core 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +196 -61
- package/dist/builder.d.ts +0 -5
- package/dist/builder.js +38 -37
- package/dist/pack.d.ts +3 -29
- package/dist/pack.js +50 -44
- package/dist/quality/diversify.d.ts +13 -0
- package/dist/quality/diversify.js +41 -0
- package/dist/quality/proximity.d.ts +2 -0
- package/dist/quality/proximity.js +31 -0
- package/dist/quality/signature.d.ts +3 -0
- package/dist/quality/signature.js +24 -0
- package/dist/quality/similarity.d.ts +3 -0
- package/dist/quality/similarity.js +27 -0
- package/dist/query.d.ts +1 -7
- package/dist/query.js +129 -70
- package/dist/rank.d.ts +7 -6
- package/dist/rank.js +6 -19
- package/dist/utils/utf8.d.ts +8 -0
- package/dist/utils/utf8.js +72 -0
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -1,23 +1,29 @@
|
|
|
1
1
|
|
|
2
|
-
|
|
3
2
|
# 🧠 KnoLo Core
|
|
4
3
|
|
|
5
4
|
[](https://www.npmjs.com/package/knolo-core)
|
|
6
5
|
[](https://www.npmjs.com/package/knolo-core)
|
|
7
6
|
[](./LICENSE)
|
|
8
7
|
|
|
9
|
-
**KnoLo Core** is a **local-first knowledge base
|
|
10
|
-
|
|
8
|
+
**KnoLo Core** is a **local-first knowledge base** for small LLMs.
|
|
9
|
+
Package documents into a compact `.knolo` file and query them deterministically —
|
|
10
|
+
**no embeddings, no vector DB, no cloud**. Ideal for **on‑device / offline** assistants.
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
|
-
## ✨
|
|
14
|
+
## ✨ Highlights (v0.2.0)
|
|
15
|
+
|
|
16
|
+
* 🔎 **Stronger relevance:**
|
|
15
17
|
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
18
|
+
* **Required phrase enforcement** (quoted & `requirePhrases`)
|
|
19
|
+
* **Proximity bonus** using minimal term-span cover
|
|
20
|
+
* **Optional heading boosts** when headings are present
|
|
21
|
+
* 🌀 **Duplicate-free results:** **near-duplicate suppression** + **MMR diversity**
|
|
22
|
+
* 🧮 **KNS tie‑breaker:** lightweight numeric signature to stabilize close ties
|
|
23
|
+
* ⚡ **Faster & leaner:** precomputed `avgBlockLen` in pack metadata
|
|
24
|
+
* 📱 **Works in Expo/React Native:** safe TextEncoder/TextDecoder ponyfills
|
|
25
|
+
* 📑 **Context Patches:** LLM‑friendly snippets for prompts
|
|
26
|
+
* 🔒 **Local & private:** everything runs on device
|
|
21
27
|
|
|
22
28
|
---
|
|
23
29
|
|
|
@@ -27,7 +33,7 @@ It lets you package your own documents into a compact `.knolo` file and query th
|
|
|
27
33
|
npm install knolo-core
|
|
28
34
|
```
|
|
29
35
|
|
|
30
|
-
|
|
36
|
+
Dev from source:
|
|
31
37
|
|
|
32
38
|
```bash
|
|
33
39
|
git clone https://github.com/yourname/knolo-core.git
|
|
@@ -38,113 +44,242 @@ npm run build
|
|
|
38
44
|
|
|
39
45
|
---
|
|
40
46
|
|
|
41
|
-
## 🚀 Usage
|
|
47
|
+
## 🚀 Usage
|
|
42
48
|
|
|
43
|
-
### 1
|
|
49
|
+
### 1) Node.js (build → mount → query → patch)
|
|
44
50
|
|
|
45
|
-
```
|
|
51
|
+
```ts
|
|
46
52
|
import { buildPack, mountPack, query, makeContextPatch } from "knolo-core";
|
|
47
53
|
|
|
48
54
|
const docs = [
|
|
49
|
-
{ heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events
|
|
50
|
-
{ heading: "Throttling",
|
|
51
|
-
{ heading: "Debounce vs Throttle", text: "Debounce waits for silence
|
|
55
|
+
{ id: "guide", heading: "React Native Bridge", text: "The bridge sends messages between JS and native. You can throttle events..." },
|
|
56
|
+
{ id: "throttle", heading: "Throttling", text: "Throttling reduces frequency of events to avoid flooding the bridge." },
|
|
57
|
+
{ id: "dvst", heading: "Debounce vs Throttle", text: "Debounce waits for silence; throttle guarantees a max rate." }
|
|
52
58
|
];
|
|
53
59
|
|
|
54
|
-
|
|
55
|
-
const
|
|
60
|
+
const bytes = await buildPack(docs); // build .knolo bytes
|
|
61
|
+
const kb = await mountPack({ src: bytes }); // mount in-memory
|
|
62
|
+
const hits = query(kb, '“react native” throttle', // quotes enforce phrase
|
|
63
|
+
{ topK: 5, requirePhrases: ["max rate"] });
|
|
56
64
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
65
|
+
console.log(hits);
|
|
66
|
+
/*
|
|
67
|
+
[
|
|
68
|
+
{ blockId: 2, score: 6.73, text: "...", source: "dvst" },
|
|
69
|
+
...
|
|
70
|
+
]
|
|
71
|
+
*/
|
|
63
72
|
|
|
64
|
-
// Turn into an LLM-friendly context patch
|
|
65
73
|
const patch = makeContextPatch(hits, { budget: "small" });
|
|
66
|
-
console.log(
|
|
74
|
+
console.log(patch);
|
|
67
75
|
```
|
|
68
76
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
### 2. CLI (build `.knolo` file)
|
|
77
|
+
### 2) CLI (build a `.knolo` file)
|
|
72
78
|
|
|
73
|
-
|
|
79
|
+
Create `docs.json`:
|
|
74
80
|
|
|
75
81
|
```json
|
|
76
82
|
[
|
|
77
|
-
{ "heading": "Guide", "text": "Install deps
|
|
78
|
-
{ "heading": "FAQ",
|
|
83
|
+
{ "id": "guide", "heading": "Guide", "text": "Install deps...\n\n## Throttle\nLimit frequency of events." },
|
|
84
|
+
{ "id": "faq", "heading": "FAQ", "text": "What is throttling? It reduces event frequency." }
|
|
79
85
|
]
|
|
80
86
|
```
|
|
81
87
|
|
|
82
|
-
|
|
88
|
+
Build:
|
|
83
89
|
|
|
84
90
|
```bash
|
|
85
|
-
# writes
|
|
86
|
-
npx knolo docs.json
|
|
91
|
+
# writes knowledge.knolo
|
|
92
|
+
npx knolo docs.json knowledge.knolo
|
|
87
93
|
```
|
|
88
94
|
|
|
89
|
-
|
|
95
|
+
Then load it in your app:
|
|
90
96
|
|
|
91
|
-
```
|
|
97
|
+
```ts
|
|
92
98
|
import { mountPack, query } from "knolo-core";
|
|
93
|
-
|
|
94
|
-
const kb = await mountPack({ src: "./mypack.knolo" });
|
|
99
|
+
const kb = await mountPack({ src: "./knowledge.knolo" });
|
|
95
100
|
const hits = query(kb, "throttle events", { topK: 3 });
|
|
96
|
-
console.log(hits);
|
|
97
101
|
```
|
|
98
102
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
### 3. React / Expo (load from asset)
|
|
103
|
+
### 3) React / Expo
|
|
102
104
|
|
|
103
105
|
```ts
|
|
104
106
|
import { Asset } from "expo-asset";
|
|
105
107
|
import * as FileSystem from "expo-file-system";
|
|
106
|
-
import { mountPack, query
|
|
108
|
+
import { mountPack, query } from "knolo-core";
|
|
107
109
|
|
|
108
|
-
async function
|
|
109
|
-
const asset = Asset.fromModule(require("./assets/
|
|
110
|
+
async function loadKB() {
|
|
111
|
+
const asset = Asset.fromModule(require("./assets/knowledge.knolo"));
|
|
110
112
|
await asset.downloadAsync();
|
|
111
113
|
|
|
112
114
|
const base64 = await FileSystem.readAsStringAsync(asset.localUri!, { encoding: FileSystem.EncodingType.Base64 });
|
|
113
115
|
const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
|
|
114
116
|
|
|
115
117
|
const kb = await mountPack({ src: bytes.buffer });
|
|
116
|
-
|
|
117
|
-
return makeContextPatch(hits, { budget: "mini" });
|
|
118
|
+
return query(kb, `“react native” throttling`, { topK: 5 });
|
|
118
119
|
}
|
|
119
120
|
```
|
|
120
121
|
|
|
121
122
|
---
|
|
122
123
|
|
|
123
|
-
## 📑 API
|
|
124
|
+
## 📑 API
|
|
125
|
+
|
|
126
|
+
### `buildPack(docs) -> Promise<Uint8Array>`
|
|
127
|
+
|
|
128
|
+
Builds a pack from an array of documents.
|
|
124
129
|
|
|
125
130
|
```ts
|
|
126
|
-
|
|
127
|
-
|
|
131
|
+
type BuildInputDoc = {
|
|
132
|
+
id?: string; // optional doc id (exposed as hit.source)
|
|
133
|
+
heading?: string; // optional heading (used for boosts)
|
|
134
|
+
text: string; // raw markdown accepted (lightly stripped)
|
|
135
|
+
};
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
* Stores optional `heading` and `id` alongside each block.
|
|
139
|
+
* Computes and persists `meta.stats.avgBlockLen` for faster queries.
|
|
140
|
+
|
|
141
|
+
### `mountPack({ src }) -> Promise<Pack>`
|
|
142
|
+
|
|
143
|
+
Loads a pack from a URL, `Uint8Array`, or `ArrayBuffer`.
|
|
144
|
+
|
|
145
|
+
```ts
|
|
146
|
+
type Pack = {
|
|
147
|
+
meta: { version: number; stats: { docs: number; blocks: number; terms: number; avgBlockLen?: number } };
|
|
148
|
+
lexicon: Map<string, number>;
|
|
149
|
+
postings: Uint32Array;
|
|
150
|
+
blocks: string[];
|
|
151
|
+
headings?: (string | null)[];
|
|
152
|
+
docIds?: (string | null)[];
|
|
153
|
+
};
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
> **Compatibility:** v0.2.0 reads both v1 packs (string-only blocks) and v2 packs (objects with `text/heading/docId`).
|
|
128
157
|
|
|
129
|
-
|
|
130
|
-
mountPack({ src: string | Uint8Array | ArrayBuffer }) -> Promise<Pack>
|
|
158
|
+
### `query(pack, q, opts) -> Hit[]`
|
|
131
159
|
|
|
132
|
-
|
|
133
|
-
query(pack, "your query", { topK?: number, requirePhrases?: string[] }) -> Hit[]
|
|
160
|
+
Deterministic lexical search with phrase enforcement, proximity, and de‑duplication.
|
|
134
161
|
|
|
135
|
-
|
|
136
|
-
|
|
162
|
+
```ts
|
|
163
|
+
type QueryOptions = {
|
|
164
|
+
topK?: number; // default 10
|
|
165
|
+
requirePhrases?: string[]; // additional phrases to require (unquoted)
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
type Hit = {
|
|
169
|
+
blockId: number;
|
|
170
|
+
score: number;
|
|
171
|
+
text: string;
|
|
172
|
+
source?: string; // docId if provided at build time
|
|
173
|
+
};
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**What happens under the hood (v0.2.0):**
|
|
177
|
+
|
|
178
|
+
* Tokenize + **enforce all phrases** (quoted in `q` and `requirePhrases`)
|
|
179
|
+
* Candidate generation via inverted index
|
|
180
|
+
* **Proximity bonus** using minimal window covering all query terms
|
|
181
|
+
* Optional **heading overlap boost** (when headings are present)
|
|
182
|
+
* Tiny **KNS** numeric-signature tie‑breaker (\~±2% influence)
|
|
183
|
+
* **Near-duplicate suppression** (5‑gram Jaccard) + **MMR** diversity for top‑K
|
|
184
|
+
|
|
185
|
+
### `makeContextPatch(hits, { budget }) -> ContextPatch`
|
|
186
|
+
|
|
187
|
+
Create structured snippets for LLM prompts.
|
|
188
|
+
|
|
189
|
+
```ts
|
|
190
|
+
type ContextPatch = {
|
|
191
|
+
background: string[];
|
|
192
|
+
snippets: Array<{ text: string; source?: string }>;
|
|
193
|
+
definitions: Array<{ term: string; def: string; evidence?: number[] }>;
|
|
194
|
+
facts: Array<{ s: string; p: string; o: string; evidence?: number[] }>;
|
|
195
|
+
};
|
|
137
196
|
```
|
|
138
197
|
|
|
198
|
+
Budgets: `"mini" | "small" | "full"`.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## 🧠 Relevance & De‑dupe Details
|
|
203
|
+
|
|
204
|
+
* **Phrases:**
|
|
205
|
+
Quoted phrases in the query (e.g., `“react native”`) and any `requirePhrases` **must appear** in results. Candidates failing this are dropped before ranking.
|
|
206
|
+
|
|
207
|
+
* **Proximity:**
|
|
208
|
+
We compute the **minimum span** that covers all query terms and apply a gentle multiplier:
|
|
209
|
+
`1 + 0.15 / (1 + span)` (bounded, stable).
|
|
210
|
+
|
|
211
|
+
* **Heading Boost:**
|
|
212
|
+
If you provide headings at build time, overlap with query terms boosts the score proportionally to the fraction of unique query terms present in the heading.
|
|
213
|
+
|
|
214
|
+
* **Duplicate Control:**
|
|
215
|
+
We use **5‑gram Jaccard** to filter near‑duplicates and **MMR** (λ≈0.8) to promote diversity within the top‑K.
|
|
216
|
+
|
|
217
|
+
* **KNS Signature (optional spice):**
|
|
218
|
+
A tiny numeric signature provides deterministic tie‑breaking without changing the overall retrieval behavior.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## 🛠 Input Format & Pack Layout
|
|
223
|
+
|
|
224
|
+
**Input docs:**
|
|
225
|
+
`{ id?: string, heading?: string, text: string }`
|
|
226
|
+
|
|
227
|
+
**Pack layout (binary):**
|
|
228
|
+
`[metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]`
|
|
229
|
+
|
|
230
|
+
* `meta.stats.avgBlockLen` is persisted (v2).
|
|
231
|
+
* `blocks JSON` may be:
|
|
232
|
+
|
|
233
|
+
* **v1:** `string[]` (text only)
|
|
234
|
+
* **v2:** `{ text, heading?, docId? }[]`
|
|
235
|
+
|
|
236
|
+
The runtime auto‑detects either format.
|
|
237
|
+
|
|
139
238
|
---
|
|
140
239
|
|
|
141
|
-
##
|
|
240
|
+
## 🔁 Migration (0.1.x → 0.2.0)
|
|
241
|
+
|
|
242
|
+
* **No API breaks.** `buildPack`, `mountPack`, `query`, `makeContextPatch` unchanged.
|
|
243
|
+
* Packs built with 0.1.x still load and query fine.
|
|
244
|
+
* If you want heading boosts and `hit.source`, pass `heading` and `id` to `buildPack`.
|
|
245
|
+
* React Native/Expo users no longer need polyfills—ponyfills are included.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## ⚡ Performance Tips
|
|
250
|
+
|
|
251
|
+
* Prefer multiple smaller blocks (≈512 tokens) over giant ones for better recall + proximity.
|
|
252
|
+
* Provide `heading` for each block: cheap, high‑signal boost.
|
|
253
|
+
* For large corpora, consider sharding packs by domain/topic to keep per‑pack size modest.
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## ❓ FAQ
|
|
258
|
+
|
|
259
|
+
**Q: Does this use embeddings?**
|
|
260
|
+
No. Pure lexical retrieval (index, positions, BM25L, proximity, phrases).
|
|
261
|
+
|
|
262
|
+
**Q: Can I run this offline?**
|
|
263
|
+
Yes. Everything is local.
|
|
264
|
+
|
|
265
|
+
**Q: How do I prevent duplicates?**
|
|
266
|
+
It’s built in (Jaccard + MMR). You can tune λ and similarity threshold in code if you fork.
|
|
267
|
+
|
|
268
|
+
**Q: Is RN/Expo supported?**
|
|
269
|
+
Yes—TextEncoder/TextDecoder ponyfills are included.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## 🗺️ Roadmap
|
|
142
274
|
|
|
143
275
|
* Multi-resolution packs (summaries + facts)
|
|
144
|
-
* Overlay
|
|
145
|
-
* WASM core for
|
|
276
|
+
* Overlay layers (user annotations)
|
|
277
|
+
* WASM core for big-browser indexing
|
|
278
|
+
* Delta updates / append-only patch packs
|
|
146
279
|
|
|
147
280
|
---
|
|
148
281
|
|
|
149
|
-
|
|
282
|
+
## 📄 License
|
|
283
|
+
|
|
284
|
+
MIT — see [LICENSE](./LICENSE).
|
|
150
285
|
|
package/dist/builder.d.ts
CHANGED
|
@@ -3,9 +3,4 @@ export type BuildInputDoc = {
|
|
|
3
3
|
heading?: string;
|
|
4
4
|
text: string;
|
|
5
5
|
};
|
|
6
|
-
/** Build a `.knolo` pack from an array of input documents. At present each
|
|
7
|
-
* document becomes a single block. Future versions may split documents into
|
|
8
|
-
* multiple blocks based on headings or token count to improve retrieval
|
|
9
|
-
* granularity.
|
|
10
|
-
*/
|
|
11
6
|
export declare function buildPack(docs: BuildInputDoc[]): Promise<Uint8Array>;
|
package/dist/builder.js
CHANGED
|
@@ -1,34 +1,44 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* builder.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* `id`, `heading` and `text` fields. The builder performs simple
|
|
7
|
-
* Markdown stripping and calls the indexer to generate the inverted index. The
|
|
8
|
-
* resulting pack binary can be persisted to disk or served directly to
|
|
9
|
-
* clients.
|
|
4
|
+
* Build `.knolo` packs from input docs. Now persists optional headings/docIds
|
|
5
|
+
* and stores avgBlockLen in meta for faster/easier normalization at query-time.
|
|
10
6
|
*/
|
|
11
7
|
import { buildIndex } from './indexer.js';
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
* multiple blocks based on headings or token count to improve retrieval
|
|
15
|
-
* granularity.
|
|
16
|
-
*/
|
|
8
|
+
import { tokenize } from './tokenize.js';
|
|
9
|
+
import { getTextEncoder } from './utils/utf8.js';
|
|
17
10
|
export async function buildPack(docs) {
|
|
18
|
-
//
|
|
19
|
-
|
|
20
|
-
|
|
11
|
+
// Prepare blocks (strip MD) and carry heading/docId for optional boosts.
|
|
12
|
+
const blocks = docs.map((d, i) => ({
|
|
13
|
+
id: i,
|
|
14
|
+
text: stripMd(d.text),
|
|
15
|
+
heading: d.heading,
|
|
16
|
+
}));
|
|
17
|
+
// Build index
|
|
21
18
|
const { lexicon, postings } = buildIndex(blocks);
|
|
19
|
+
// Compute avg token length once (store in meta)
|
|
20
|
+
const totalTokens = blocks.reduce((sum, b) => sum + tokenize(b.text).length, 0);
|
|
21
|
+
const avgBlockLen = blocks.length ? totalTokens / blocks.length : 1;
|
|
22
22
|
const meta = {
|
|
23
|
-
version:
|
|
24
|
-
stats: {
|
|
23
|
+
version: 2,
|
|
24
|
+
stats: {
|
|
25
|
+
docs: docs.length,
|
|
26
|
+
blocks: blocks.length,
|
|
27
|
+
terms: lexicon.length,
|
|
28
|
+
avgBlockLen,
|
|
29
|
+
},
|
|
25
30
|
};
|
|
26
|
-
//
|
|
27
|
-
const
|
|
31
|
+
// Persist blocks as objects to optionally carry heading/docId
|
|
32
|
+
const blocksPayload = blocks.map((b, i) => ({
|
|
33
|
+
text: b.text,
|
|
34
|
+
heading: b.heading ?? null,
|
|
35
|
+
docId: docs[i]?.id ?? null,
|
|
36
|
+
}));
|
|
37
|
+
// Encode sections
|
|
38
|
+
const enc = getTextEncoder();
|
|
28
39
|
const metaBytes = enc.encode(JSON.stringify(meta));
|
|
29
40
|
const lexBytes = enc.encode(JSON.stringify(lexicon));
|
|
30
|
-
const blocksBytes = enc.encode(JSON.stringify(
|
|
31
|
-
// Compute lengths and allocate output
|
|
41
|
+
const blocksBytes = enc.encode(JSON.stringify(blocksPayload));
|
|
32
42
|
const totalLength = 4 + metaBytes.length +
|
|
33
43
|
4 + lexBytes.length +
|
|
34
44
|
4 + postings.length * 4 +
|
|
@@ -46,36 +56,27 @@ export async function buildPack(docs) {
|
|
|
46
56
|
offset += 4;
|
|
47
57
|
out.set(lexBytes, offset);
|
|
48
58
|
offset += lexBytes.length;
|
|
49
|
-
// postings
|
|
59
|
+
// postings (alignment-safe via DataView)
|
|
50
60
|
dv.setUint32(offset, postings.length, true);
|
|
51
61
|
offset += 4;
|
|
52
|
-
|
|
53
|
-
|
|
62
|
+
for (let i = 0; i < postings.length; i++) {
|
|
63
|
+
dv.setUint32(offset, postings[i], true);
|
|
64
|
+
offset += 4;
|
|
65
|
+
}
|
|
54
66
|
// blocks
|
|
55
67
|
dv.setUint32(offset, blocksBytes.length, true);
|
|
56
68
|
offset += 4;
|
|
57
69
|
out.set(blocksBytes, offset);
|
|
58
70
|
return out;
|
|
59
71
|
}
|
|
60
|
-
/** Strip Markdown syntax
|
|
61
|
-
* `marked` library is used for parsing and rendering. A very naive HTML tag
|
|
62
|
-
* stripper removes tags by dropping anything between `<` and `>`. This is
|
|
63
|
-
* simplistic but adequate for plain text extraction.
|
|
64
|
-
*/
|
|
72
|
+
/** Strip Markdown syntax with lightweight regexes (no deps). */
|
|
65
73
|
function stripMd(md) {
|
|
66
|
-
|
|
67
|
-
let text = md.replace(/```[^```]*```/g, ' ');
|
|
68
|
-
// Remove inline code backticks
|
|
74
|
+
let text = md.replace(/```[\s\S]*?```/g, ' ');
|
|
69
75
|
text = text.replace(/`[^`]*`/g, ' ');
|
|
70
|
-
// Remove emphasis markers (*, _, ~)
|
|
71
76
|
text = text.replace(/[\*_~]+/g, ' ');
|
|
72
|
-
// Remove headings (#)
|
|
73
77
|
text = text.replace(/^#+\s*/gm, '');
|
|
74
|
-
|
|
75
|
-
text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, '$1');
|
|
76
|
-
// Remove any remaining brackets
|
|
78
|
+
text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
|
|
77
79
|
text = text.replace(/[\[\]()]/g, ' ');
|
|
78
|
-
// Collapse whitespace
|
|
79
80
|
text = text.replace(/\s+/g, ' ').trim();
|
|
80
81
|
return text;
|
|
81
82
|
}
|
package/dist/pack.d.ts
CHANGED
|
@@ -1,47 +1,21 @@
|
|
|
1
1
|
export type MountOptions = {
|
|
2
2
|
src: string | ArrayBufferLike | Uint8Array;
|
|
3
3
|
};
|
|
4
|
-
/** Metadata about the pack. Version numbers should increment with format
|
|
5
|
-
* changes, allowing the runtime to adapt accordingly. */
|
|
6
4
|
export type PackMeta = {
|
|
7
5
|
version: number;
|
|
8
6
|
stats: {
|
|
9
7
|
docs: number;
|
|
10
8
|
blocks: number;
|
|
11
9
|
terms: number;
|
|
10
|
+
avgBlockLen?: number;
|
|
12
11
|
};
|
|
13
12
|
};
|
|
14
|
-
/**
|
|
15
|
-
* A mounted pack exposing the inverted index, block text and optional field
|
|
16
|
-
* metadata. The core runtime reads from these structures directly at query
|
|
17
|
-
* time.
|
|
18
|
-
*/
|
|
19
13
|
export type Pack = {
|
|
20
14
|
meta: PackMeta;
|
|
21
|
-
/** Map of token to term identifier used in the postings list. */
|
|
22
15
|
lexicon: Map<string, number>;
|
|
23
|
-
/** Flattened postings list where each term section starts with the termId
|
|
24
|
-
* followed by (blockId, positions..., 0) tuples, ending with a 0.
|
|
25
|
-
*/
|
|
26
16
|
postings: Uint32Array;
|
|
27
|
-
/** Array of block texts. Each block corresponds to a chunk of the original
|
|
28
|
-
* documents. The blockId used in the postings list indexes into this array.
|
|
29
|
-
*/
|
|
30
17
|
blocks: string[];
|
|
18
|
+
headings?: (string | null)[];
|
|
19
|
+
docIds?: (string | null)[];
|
|
31
20
|
};
|
|
32
|
-
/**
|
|
33
|
-
* Load a `.knolo` pack from a variety of sources. The pack binary layout is
|
|
34
|
-
* currently:
|
|
35
|
-
*
|
|
36
|
-
* [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
|
|
37
|
-
*
|
|
38
|
-
* All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
|
|
39
|
-
* the byte lengths of the subsequent JSON sections. `postCount` is the number
|
|
40
|
-
* of 32‑bit integers in the postings array. This simple layout is sufficient
|
|
41
|
-
* for v0 and avoids any additional dependencies beyond standard typed arrays.
|
|
42
|
-
*
|
|
43
|
-
* @param opts Options specifying how to load the pack. Accepts a URL string,
|
|
44
|
-
* ArrayBuffer, or Uint8Array.
|
|
45
|
-
* @returns A Promise resolving to a mounted pack with the index and blocks.
|
|
46
|
-
*/
|
|
47
21
|
export declare function mountPack(opts: MountOptions): Promise<Pack>;
|
package/dist/pack.js
CHANGED
|
@@ -1,77 +1,83 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* pack.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* portable across Node.js and browser environments.
|
|
9
|
-
*/
|
|
10
|
-
/**
|
|
11
|
-
* Load a `.knolo` pack from a variety of sources. The pack binary layout is
|
|
12
|
-
* currently:
|
|
13
|
-
*
|
|
14
|
-
* [metaLen:u32][meta JSON][lexLen:u32][lexicon JSON][postCount:u32][postings][blocksLen:u32][blocks JSON]
|
|
15
|
-
*
|
|
16
|
-
* All integers are little endian. `metaLen`, `lexLen` and `blocksLen` denote
|
|
17
|
-
* the byte lengths of the subsequent JSON sections. `postCount` is the number
|
|
18
|
-
* of 32‑bit integers in the postings array. This simple layout is sufficient
|
|
19
|
-
* for v0 and avoids any additional dependencies beyond standard typed arrays.
|
|
20
|
-
*
|
|
21
|
-
* @param opts Options specifying how to load the pack. Accepts a URL string,
|
|
22
|
-
* ArrayBuffer, or Uint8Array.
|
|
23
|
-
* @returns A Promise resolving to a mounted pack with the index and blocks.
|
|
4
|
+
* Mount `.knolo` packs across Node, browsers, and RN/Expo. Now tolerant of:
|
|
5
|
+
* - blocks as string[] (v1) or object[] with { text, heading?, docId? } (v2)
|
|
6
|
+
* - meta.stats.avgBlockLen (optional)
|
|
7
|
+
* Includes RN/Expo-safe TextDecoder via ponyfill.
|
|
24
8
|
*/
|
|
9
|
+
import { getTextDecoder } from './utils/utf8.js';
|
|
25
10
|
export async function mountPack(opts) {
|
|
26
11
|
const buf = await resolveToBuffer(opts.src);
|
|
27
12
|
const dv = new DataView(buf);
|
|
13
|
+
const dec = getTextDecoder();
|
|
28
14
|
let offset = 0;
|
|
29
|
-
//
|
|
15
|
+
// meta
|
|
30
16
|
const metaLen = dv.getUint32(offset, true);
|
|
31
17
|
offset += 4;
|
|
32
|
-
const metaJson =
|
|
18
|
+
const metaJson = dec.decode(new Uint8Array(buf, offset, metaLen));
|
|
33
19
|
offset += metaLen;
|
|
34
20
|
const meta = JSON.parse(metaJson);
|
|
35
|
-
//
|
|
21
|
+
// lexicon
|
|
36
22
|
const lexLen = dv.getUint32(offset, true);
|
|
37
23
|
offset += 4;
|
|
38
|
-
const lexJson =
|
|
24
|
+
const lexJson = dec.decode(new Uint8Array(buf, offset, lexLen));
|
|
39
25
|
offset += lexLen;
|
|
40
26
|
const lexEntries = JSON.parse(lexJson);
|
|
41
27
|
const lexicon = new Map(lexEntries);
|
|
42
|
-
//
|
|
28
|
+
// postings
|
|
43
29
|
const postCount = dv.getUint32(offset, true);
|
|
44
30
|
offset += 4;
|
|
45
|
-
const postings = new Uint32Array(
|
|
46
|
-
|
|
47
|
-
|
|
31
|
+
const postings = new Uint32Array(postCount);
|
|
32
|
+
for (let i = 0; i < postCount; i++) {
|
|
33
|
+
postings[i] = dv.getUint32(offset, true);
|
|
34
|
+
offset += 4;
|
|
35
|
+
}
|
|
36
|
+
// blocks (v1: string[]; v2: {text, heading?, docId?}[])
|
|
48
37
|
const blocksLen = dv.getUint32(offset, true);
|
|
49
38
|
offset += 4;
|
|
50
|
-
const blocksJson =
|
|
51
|
-
const
|
|
52
|
-
|
|
39
|
+
const blocksJson = dec.decode(new Uint8Array(buf, offset, blocksLen));
|
|
40
|
+
const parsed = JSON.parse(blocksJson);
|
|
41
|
+
let blocks = [];
|
|
42
|
+
let headings;
|
|
43
|
+
let docIds;
|
|
44
|
+
if (Array.isArray(parsed) && parsed.length && typeof parsed[0] === 'string') {
|
|
45
|
+
// v1
|
|
46
|
+
blocks = parsed;
|
|
47
|
+
}
|
|
48
|
+
else if (Array.isArray(parsed)) {
|
|
49
|
+
blocks = [];
|
|
50
|
+
headings = [];
|
|
51
|
+
docIds = [];
|
|
52
|
+
for (const it of parsed) {
|
|
53
|
+
if (it && typeof it === 'object') {
|
|
54
|
+
blocks.push(String(it.text ?? ''));
|
|
55
|
+
headings.push(it.heading ?? null);
|
|
56
|
+
docIds.push(it.docId ?? null);
|
|
57
|
+
}
|
|
58
|
+
else {
|
|
59
|
+
blocks.push(String(it ?? ''));
|
|
60
|
+
headings.push(null);
|
|
61
|
+
docIds.push(null);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
else {
|
|
66
|
+
blocks = [];
|
|
67
|
+
}
|
|
68
|
+
return { meta, lexicon, postings, blocks, headings, docIds };
|
|
53
69
|
}
|
|
54
|
-
/** Resolve the `src` field of MountOptions into an ArrayBuffer. Supports:
|
|
55
|
-
* - strings interpreted as URLs (via fetch)
|
|
56
|
-
* - Uint8Array and ArrayBuffer inputs
|
|
57
|
-
*/
|
|
58
70
|
async function resolveToBuffer(src) {
|
|
59
71
|
if (typeof src === 'string') {
|
|
60
|
-
// Use fetch for browser and Node environments. For Node this requires the
|
|
61
|
-
// global fetch API (available since Node 18). Error handling is delegated
|
|
62
|
-
// to the caller.
|
|
63
72
|
const res = await fetch(src);
|
|
64
|
-
|
|
65
|
-
return ab;
|
|
73
|
+
return await res.arrayBuffer();
|
|
66
74
|
}
|
|
67
75
|
if (src instanceof Uint8Array) {
|
|
68
|
-
// If the view covers the whole buffer, return it directly (cast to ArrayBuffer).
|
|
69
76
|
if (src.byteOffset === 0 && src.byteLength === src.buffer.byteLength) {
|
|
70
77
|
return src.buffer;
|
|
71
78
|
}
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
return copy.buffer; // typed as ArrayBuffer
|
|
79
|
+
const copy = src.slice();
|
|
80
|
+
return copy.buffer;
|
|
75
81
|
}
|
|
76
82
|
return src;
|
|
77
83
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export type HitLike = {
|
|
2
|
+
blockId: number;
|
|
3
|
+
score: number;
|
|
4
|
+
text: string;
|
|
5
|
+
source?: string;
|
|
6
|
+
};
|
|
7
|
+
export type DiversifyOptions = {
|
|
8
|
+
k: number;
|
|
9
|
+
lambda?: number;
|
|
10
|
+
simThreshold?: number;
|
|
11
|
+
sim?: (a: HitLike, b: HitLike) => number;
|
|
12
|
+
};
|
|
13
|
+
export declare function diversifyAndDedupe(hits: HitLike[], opts: DiversifyOptions): HitLike[];
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// src/quality/diversify.ts
|
|
2
|
+
import { jaccard5 } from './similarity.js';
|
|
3
|
+
export function diversifyAndDedupe(hits, opts) {
|
|
4
|
+
const { k, lambda = 0.8, simThreshold = 0.92, sim = (a, b) => jaccard5(a.text, b.text) } = opts;
|
|
5
|
+
const pool = [...hits].sort((a, b) => b.score - a.score);
|
|
6
|
+
const kept = [];
|
|
7
|
+
while (pool.length && kept.length < k) {
|
|
8
|
+
// compute MMR for current pool against kept
|
|
9
|
+
let bestIdx = 0;
|
|
10
|
+
let bestMMR = -Infinity;
|
|
11
|
+
for (let i = 0; i < pool.length; i++) {
|
|
12
|
+
const h = pool[i];
|
|
13
|
+
let maxSim = 0;
|
|
14
|
+
for (const s of kept) {
|
|
15
|
+
const v = sim(h, s);
|
|
16
|
+
if (v > maxSim)
|
|
17
|
+
maxSim = v;
|
|
18
|
+
if (v >= simThreshold) {
|
|
19
|
+
maxSim = v;
|
|
20
|
+
break;
|
|
21
|
+
} // early out
|
|
22
|
+
}
|
|
23
|
+
// skip near-duplicates
|
|
24
|
+
if (maxSim >= simThreshold)
|
|
25
|
+
continue;
|
|
26
|
+
const mmr = lambda * h.score - (1 - lambda) * maxSim;
|
|
27
|
+
if (mmr > bestMMR) {
|
|
28
|
+
bestMMR = mmr;
|
|
29
|
+
bestIdx = i;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
// if everything was a near-duplicate, just take the next best by score
|
|
33
|
+
const pick = pool.splice(bestMMR === -Infinity ? 0 : bestIdx, 1)[0];
|
|
34
|
+
if (!pick)
|
|
35
|
+
break;
|
|
36
|
+
// final dedupe check before push
|
|
37
|
+
if (!kept.some((x) => sim(x, pick) >= simThreshold))
|
|
38
|
+
kept.push(pick);
|
|
39
|
+
}
|
|
40
|
+
return kept;
|
|
41
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
// src/quality/proximity.ts
|
|
2
|
+
// Map<termId, positions[]>
|
|
3
|
+
export function minCoverSpan(posMap) {
|
|
4
|
+
const lists = posMap ? [...posMap.values()].map(arr => arr.slice().sort((a, b) => a - b)) : [];
|
|
5
|
+
if (lists.length === 0)
|
|
6
|
+
return null;
|
|
7
|
+
const idx = new Array(lists.length).fill(0);
|
|
8
|
+
let best = null;
|
|
9
|
+
while (true) {
|
|
10
|
+
const cur = [];
|
|
11
|
+
for (let i = 0; i < lists.length; i++) {
|
|
12
|
+
const val = lists[i][idx[i]];
|
|
13
|
+
if (val === undefined)
|
|
14
|
+
return best;
|
|
15
|
+
cur.push(val);
|
|
16
|
+
}
|
|
17
|
+
const min = Math.min(...cur);
|
|
18
|
+
const max = Math.max(...cur);
|
|
19
|
+
const span = max - min;
|
|
20
|
+
if (best === null || span < best)
|
|
21
|
+
best = span;
|
|
22
|
+
// advance list with current min
|
|
23
|
+
const minList = cur.indexOf(min);
|
|
24
|
+
idx[minList]++;
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
export function proximityMultiplier(span, strength = 0.15) {
|
|
28
|
+
if (span === null)
|
|
29
|
+
return 1;
|
|
30
|
+
return 1 + strength / (1 + span); // gentle, bounded
|
|
31
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
// src/quality/signature.ts
|
|
2
|
+
// "KNS" — simple, deterministic lexical numeric signature for tie-breaking.
|
|
3
|
+
const PRIMES = [257, 263, 269];
|
|
4
|
+
export function knsSignature(s) {
|
|
5
|
+
let s1 = 0, s2 = 0, s3 = 0;
|
|
6
|
+
for (let i = 0; i < s.length; i++) {
|
|
7
|
+
const code = s.charCodeAt(i);
|
|
8
|
+
s1 = (s1 + code) % PRIMES[0];
|
|
9
|
+
s2 = (s2 + code * (i + 1)) % PRIMES[1];
|
|
10
|
+
s3 = (s3 + ((code << 1) ^ (i + 7))) % PRIMES[2];
|
|
11
|
+
}
|
|
12
|
+
return [s1, s2, s3];
|
|
13
|
+
}
|
|
14
|
+
export function knsDistance(a, b) {
|
|
15
|
+
// circular distance on a mod prime, averaged & normalized to 0..1
|
|
16
|
+
let acc = 0;
|
|
17
|
+
for (let i = 0; i < PRIMES.length; i++) {
|
|
18
|
+
const p = PRIMES[i];
|
|
19
|
+
const diff = Math.abs(a[i] - b[i]);
|
|
20
|
+
const circ = Math.min(diff, p - diff) / p;
|
|
21
|
+
acc += circ;
|
|
22
|
+
}
|
|
23
|
+
return acc / PRIMES.length;
|
|
24
|
+
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// src/quality/similarity.ts
|
|
2
|
+
import { normalize } from '../tokenize.js';
|
|
3
|
+
export function ngramSet(s, n = 5) {
|
|
4
|
+
const t = normalize(s);
|
|
5
|
+
const out = new Set();
|
|
6
|
+
if (t.length < n) {
|
|
7
|
+
if (t)
|
|
8
|
+
out.add(t);
|
|
9
|
+
return out;
|
|
10
|
+
}
|
|
11
|
+
for (let i = 0; i <= t.length - n; i++)
|
|
12
|
+
out.add(t.slice(i, i + n));
|
|
13
|
+
return out;
|
|
14
|
+
}
|
|
15
|
+
export function jaccardFromSets(a, b) {
|
|
16
|
+
if (a.size === 0 && b.size === 0)
|
|
17
|
+
return 1;
|
|
18
|
+
let inter = 0;
|
|
19
|
+
for (const x of a)
|
|
20
|
+
if (b.has(x))
|
|
21
|
+
inter++;
|
|
22
|
+
const uni = a.size + b.size - inter;
|
|
23
|
+
return uni ? inter / uni : 0;
|
|
24
|
+
}
|
|
25
|
+
export function jaccard5(s1, s2) {
|
|
26
|
+
return jaccardFromSets(ngramSet(s1, 5), ngramSet(s2, 5));
|
|
27
|
+
}
|
package/dist/query.d.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import type { Pack } from
|
|
1
|
+
import type { Pack } from "./pack.js";
|
|
2
2
|
export type QueryOptions = {
|
|
3
3
|
topK?: number;
|
|
4
|
-
/** Additional phrases (unquoted) that must be present in results. */
|
|
5
4
|
requirePhrases?: string[];
|
|
6
5
|
};
|
|
7
6
|
export type Hit = {
|
|
@@ -10,9 +9,4 @@ export type Hit = {
|
|
|
10
9
|
text: string;
|
|
11
10
|
source?: string;
|
|
12
11
|
};
|
|
13
|
-
/** Execute a search against a mounted pack. The query string can contain
|
|
14
|
-
* quoted phrases; unquoted terms are treated individually. The `topK`
|
|
15
|
-
* parameter controls how many results are returned. If `requirePhrases`
|
|
16
|
-
* contains strings, those phrases must appear verbatim in candidate blocks.
|
|
17
|
-
*/
|
|
18
12
|
export declare function query(pack: Pack, q: string, opts?: QueryOptions): Hit[];
|
package/dist/query.js
CHANGED
|
@@ -1,95 +1,154 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* query.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* Deterministic, embedding-free retrieval with:
|
|
5
|
+
* - REQUIRED phrase enforcement (quoted and requirePhrases)
|
|
6
|
+
* - Proximity bonus based on min cover span
|
|
7
|
+
* - Optional heading overlap boost
|
|
8
|
+
* - KNS numeric-signature tie-breaker (tiny)
|
|
9
|
+
* - Near-duplicate suppression + MMR diversity
|
|
9
10
|
*/
|
|
10
|
-
import { tokenize, parsePhrases } from "./tokenize.js";
|
|
11
|
+
import { tokenize, parsePhrases, normalize } from "./tokenize.js";
|
|
11
12
|
import { rankBM25L } from "./rank.js";
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
* contains strings, those phrases must appear verbatim in candidate blocks.
|
|
16
|
-
*/
|
|
13
|
+
import { minCoverSpan, proximityMultiplier } from "./quality/proximity.js";
|
|
14
|
+
import { diversifyAndDedupe } from "./quality/diversify.js";
|
|
15
|
+
import { knsSignature, knsDistance } from "./quality/signature.js";
|
|
17
16
|
/**
 * Execute a deterministic, embedding-free search against a mounted pack.
 *
 * Pipeline:
 *  1. tokenize the query; resolve free tokens to lexicon term ids
 *  2. scan the postings list for candidate blocks (phrase-first rescue if
 *     free tokens matched nothing but required phrases exist)
 *  3. enforce required phrases (quoted in `q` plus `opts.requirePhrases`)
 *  4. optional heading-overlap boost, BM25L rank with proximity bonus
 *  5. KNS numeric-signature tie-breaker, then near-dup suppression / MMR
 *
 * @param {Pack} pack - mounted .knolo pack
 * @param {string} q - query string; may contain quoted phrases
 * @param {{topK?: number, requirePhrases?: string[]}} [opts]
 * @returns {Array<{blockId: number, score: number, text: string, source?: string}>}
 */
export function query(pack, q, opts = {}) {
    const topK = opts.topK ?? 10;
    // --- Query parsing --------------------------------------------------
    const normTokens = tokenize(q).map((t) => t.term);
    // Quoted phrases from q, normalized term-by-term (splitting any
    // multi-word normalizations so each entry is a single token).
    const quotedRaw = parsePhrases(q);
    const quoted = quotedRaw.map((seq) => seq
        .map((t) => normalize(t))
        .flatMap((s) => s.split(/\s+/))
        .filter(Boolean));
    // requirePhrases go through the tokenizer so they normalize exactly
    // like the index did (case, diacritics, ...).
    const extraReq = (opts.requirePhrases ?? [])
        .map((s) => tokenize(s).map((t) => t.term))
        .filter((arr) => arr.length > 0);
    const requiredPhrases = [...quoted, ...extraReq];
    // --- Free-token term ids (unknown terms are dropped) ----------------
    const termIds = normTokens
        .map((t) => pack.lexicon.get(t))
        .filter((id) => id !== undefined);
    const termSet = new Set(termIds);
    // --- Candidate harvesting -------------------------------------------
    const candidates = new Map();
    // Walk the flat postings array (termId, then per-block position runs,
    // all 0-terminated) and record tf + positions for matching term ids.
    function scanForTermIds(idSet) {
        const p = pack.postings;
        let i = 0;
        while (i < p.length) {
            const tid = p[i++];
            if (tid === 0)
                continue;
            const relevant = idSet.has(tid);
            let bid = p[i++];
            while (bid !== 0) {
                let pos = p[i++];
                const positions = [];
                while (pos !== 0) {
                    positions.push(pos);
                    pos = p[i++];
                }
                if (relevant) {
                    let entry = candidates.get(bid);
                    if (!entry) {
                        entry = { tf: new Map(), pos: new Map() };
                        candidates.set(bid, entry);
                    }
                    entry.tf.set(tid, positions.length);
                    entry.pos.set(tid, positions);
                }
                bid = p[i++];
            }
        }
    }
    // 1) Scan with the free tokens of q, when there are any.
    if (termSet.size > 0) {
        scanForTermIds(termSet);
    }
    // 2) Phrase-first rescue: nothing matched the free tokens, but we do
    //    have required phrases — rebuild a term set from the phrase tokens
    //    and scan again so phrase-only queries still find candidates.
    if (candidates.size === 0 && requiredPhrases.length > 0) {
        const phraseTokenIds = new Set();
        for (const seq of requiredPhrases) {
            for (const t of seq) {
                const id = pack.lexicon.get(t);
                if (id !== undefined)
                    phraseTokenIds.add(id);
            }
        }
        if (phraseTokenIds.size > 0) {
            scanForTermIds(phraseTokenIds);
        }
    }
    // --- Phrase enforcement ---------------------------------------------
    if (requiredPhrases.length > 0) {
        // Iterate over a snapshot since we delete while walking.
        for (const [bid, data] of [...candidates]) {
            const text = pack.blocks[bid] || "";
            const ok = requiredPhrases.every((seq) => containsPhrase(text, seq));
            if (!ok)
                candidates.delete(bid);
            else
                data.hasPhrase = true;
        }
    }
    else if (quoted.length > 0) {
        // No hard requirement, but quoted phrases still earn a boost flag.
        for (const [bid, data] of candidates) {
            const text = pack.blocks[bid] || "";
            data.hasPhrase = quoted.some((seq) => containsPhrase(text, seq));
        }
    }
    if (candidates.size === 0)
        return [];
    // --- Heading overlap boost ------------------------------------------
    if (pack.headings?.length) {
        const qset = new Set(normTokens);
        const qUniqueCount = new Set(normTokens).size || 1;
        for (const [bid, data] of candidates) {
            const h = pack.headings[bid] ?? "";
            const hTerms = tokenize(h || "").map((t) => t.term);
            const overlap = new Set(hTerms.filter((t) => qset.has(t))).size;
            data.headingScore = overlap / qUniqueCount;
        }
    }
    // --- BM25L ranking with proximity bonus ------------------------------
    // Prefer the precomputed average block length; fall back to computing
    // it on the fly (1 when the pack has no blocks, to avoid div-by-zero).
    const avgLen = pack.meta?.stats?.avgBlockLen ??
        (pack.blocks.length
            ? pack.blocks.reduce((s, b) => s + tokenize(b).length, 0) / pack.blocks.length
            : 1);
    const prelim = rankBM25L(candidates, avgLen, {
        proximityBonus: (cand) => proximityMultiplier(minCoverSpan(cand.pos)),
    });
    if (prelim.length === 0)
        return [];
    // --- KNS tie-breaker (tiny multiplicative nudge) + de-dup/MMR --------
    const qSig = knsSignature(normalize(q));
    const pool = prelim.slice(0, topK * 5).map((r) => {
        const text = pack.blocks[r.blockId] || "";
        const boost = 1 + 0.02 * (1 - knsDistance(qSig, knsSignature(text)));
        return {
            blockId: r.blockId,
            score: r.score * boost,
            text,
            source: pack.docIds?.[r.blockId] ?? undefined,
        };
    });
    const finalHits = diversifyAndDedupe(pool, { k: topK });
    return finalHits;
}
|
|
81
|
-
/**
|
|
82
|
-
* text. The algorithm tokenizes the text and performs a sliding window
|
|
83
|
-
* comparison. This is case‑insensitive and uses the same normalization as
|
|
84
|
-
* other parts of the system.
|
|
85
|
-
*/
|
|
142
|
+
/** Ordered phrase check using the SAME tokenizer/normalizer path as the index. */
|
|
86
143
|
function containsPhrase(text, seq) {
|
|
87
144
|
if (seq.length === 0)
|
|
88
145
|
return false;
|
|
146
|
+
// normalize seq via tokenizer to be extra safe (handles diacritics/case)
|
|
147
|
+
const seqNorm = tokenize(seq.join(" ")).map(t => t.term);
|
|
89
148
|
const toks = tokenize(text).map((t) => t.term);
|
|
90
|
-
outer: for (let i = 0; i <= toks.length -
|
|
91
|
-
for (let j = 0; j <
|
|
92
|
-
if (toks[i + j] !==
|
|
149
|
+
outer: for (let i = 0; i <= toks.length - seqNorm.length; i++) {
|
|
150
|
+
for (let j = 0; j < seqNorm.length; j++) {
|
|
151
|
+
if (toks[i + j] !== seqNorm[j])
|
|
93
152
|
continue outer;
|
|
94
153
|
}
|
|
95
154
|
return true;
|
package/dist/rank.d.ts
CHANGED
|
@@ -3,15 +3,16 @@ export type RankOptions = {
|
|
|
3
3
|
b?: number;
|
|
4
4
|
headingBoost?: number;
|
|
5
5
|
phraseBoost?: number;
|
|
6
|
+
proximityBonus?: (cand: {
|
|
7
|
+
tf: Map<number, number>;
|
|
8
|
+
pos?: Map<number, number[]>;
|
|
9
|
+
hasPhrase?: boolean;
|
|
10
|
+
headingScore?: number;
|
|
11
|
+
}) => number;
|
|
6
12
|
};
|
|
7
|
-
/**
|
|
8
|
-
* Rank a set of candidate blocks using BM25L. Each candidate carries a term
|
|
9
|
-
* frequency (tf) map keyed by termId. Additional properties may include
|
|
10
|
-
* `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
|
|
11
|
-
* document length (avgLen) is required for BM25L normalization.
|
|
12
|
-
*/
|
|
13
13
|
export declare function rankBM25L(candidates: Map<number, {
|
|
14
14
|
tf: Map<number, number>;
|
|
15
|
+
pos?: Map<number, number[]>;
|
|
15
16
|
hasPhrase?: boolean;
|
|
16
17
|
headingScore?: number;
|
|
17
18
|
}>, avgLen: number, opts?: RankOptions): Array<{
|
package/dist/rank.js
CHANGED
|
@@ -1,18 +1,6 @@
|
|
|
1
1
|
/*
|
|
2
2
|
* rank.ts
|
|
3
|
-
*
|
|
4
|
-
* Implements a simple BM25L ranker with optional boosts for headings and
|
|
5
|
-
* phrase matches. The inputs to the ranker are a map of block IDs to term
|
|
6
|
-
* frequency maps and additional boolean flags. The outputs are sorted by
|
|
7
|
-
* descending relevance score. This ranking algorithm can be replaced or
|
|
8
|
-
* augmented in the future without impacting the public API of the query
|
|
9
|
-
* function.
|
|
10
|
-
*/
|
|
11
|
-
/**
|
|
12
|
-
* Rank a set of candidate blocks using BM25L. Each candidate carries a term
|
|
13
|
-
* frequency (tf) map keyed by termId. Additional properties may include
|
|
14
|
-
* `hasPhrase` and `headingScore` to apply multiplicative boosts. The average
|
|
15
|
-
* document length (avgLen) is required for BM25L normalization.
|
|
3
|
+
* BM25L ranker with optional heading/phrase boosts and a proximity bonus hook.
|
|
16
4
|
*/
|
|
17
5
|
export function rankBM25L(candidates, avgLen, opts = {}) {
|
|
18
6
|
const k1 = opts.k1 ?? 1.5;
|
|
@@ -24,18 +12,17 @@ export function rankBM25L(candidates, avgLen, opts = {}) {
|
|
|
24
12
|
const len = Array.from(data.tf.values()).reduce((sum, tf) => sum + tf, 0) || 1;
|
|
25
13
|
let score = 0;
|
|
26
14
|
for (const [, tf] of data.tf) {
|
|
27
|
-
const idf = 1; //
|
|
15
|
+
const idf = 1; // v0: no DF; can be extended later
|
|
28
16
|
const numer = tf * (k1 + 1);
|
|
29
17
|
const denom = tf + k1 * (1 - b + b * (len / avgLen));
|
|
30
18
|
score += idf * (numer / denom);
|
|
31
19
|
}
|
|
32
|
-
|
|
33
|
-
|
|
20
|
+
if (opts.proximityBonus)
|
|
21
|
+
score *= opts.proximityBonus(data) ?? 1;
|
|
22
|
+
if (data.hasPhrase)
|
|
34
23
|
score *= 1 + phraseBoost;
|
|
35
|
-
|
|
36
|
-
if (data.headingScore) {
|
|
24
|
+
if (data.headingScore)
|
|
37
25
|
score *= 1 + headingBoost * data.headingScore;
|
|
38
|
-
}
|
|
39
26
|
results.push({ blockId: bid, score });
|
|
40
27
|
}
|
|
41
28
|
results.sort((a, b2) => b2.score - a.score);
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export type TextDecoderLike = {
|
|
2
|
+
decode: (u8: Uint8Array) => string;
|
|
3
|
+
};
|
|
4
|
+
export type TextEncoderLike = {
|
|
5
|
+
encode: (s: string) => Uint8Array;
|
|
6
|
+
};
|
|
7
|
+
export declare function getTextDecoder(): TextDecoderLike;
|
|
8
|
+
export declare function getTextEncoder(): TextEncoderLike;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// src/utils/utf8.ts
|
|
2
|
+
// Small, dependency-free UTF-8 encoder/decoder that works in RN/Hermes.
|
|
3
|
+
/**
 * Return a UTF-8 decoder: the platform `TextDecoder` when available,
 * otherwise a minimal hand-rolled fallback (for RN/Hermes environments
 * where `TextDecoder` may be missing).
 *
 * @returns {{decode: (u8: Uint8Array) => string}}
 */
export function getTextDecoder() {
    try {
        return new TextDecoder();
    }
    catch {
        // Best-effort manual UTF-8 decoder. NOTE(review): continuation
        // bytes are not validated, so malformed input decodes leniently
        // rather than throwing — same contract as the original fallback.
        return {
            decode: (bytes) => {
                let result = '';
                let i = 0;
                while (i < bytes.length) {
                    const lead = bytes[i++];
                    if (lead < 0x80) {
                        // 1-byte sequence (ASCII).
                        result += String.fromCharCode(lead);
                    }
                    else if ((lead & 0xe0) === 0xc0) {
                        // 2-byte sequence.
                        const cp = ((lead & 0x1f) << 6) | (bytes[i++] & 0x3f);
                        result += String.fromCharCode(cp);
                    }
                    else if ((lead & 0xf0) === 0xe0) {
                        // 3-byte sequence (BMP).
                        const cp = ((lead & 0x0f) << 12) |
                            ((bytes[i++] & 0x3f) << 6) |
                            (bytes[i++] & 0x3f);
                        result += String.fromCharCode(cp);
                    }
                    else {
                        // 4-byte sequence → astral code point → surrogate pair.
                        const cp = ((lead & 0x07) << 18) |
                            ((bytes[i++] & 0x3f) << 12) |
                            ((bytes[i++] & 0x3f) << 6) |
                            (bytes[i++] & 0x3f);
                        const offset = cp - 0x10000;
                        result += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
                    }
                }
                return result;
            },
        };
    }
}
|
|
43
|
+
/**
 * Return a UTF-8 encoder: the platform `TextEncoder` when available,
 * otherwise a minimal hand-rolled fallback (for RN/Hermes environments
 * where `TextEncoder` may be missing).
 *
 * @returns {{encode: (s: string) => Uint8Array}}
 */
export function getTextEncoder() {
    try {
        return new TextEncoder();
    }
    catch {
        return {
            encode: (s) => {
                const out = [];
                for (let i = 0; i < s.length; i++) {
                    let cp = s.charCodeAt(i);
                    // Combine a surrogate pair into a single code point —
                    // but only when the next unit really is a low surrogate.
                    // (Fix: previously ANY following unit was consumed,
                    // which corrupted the character after an unpaired high
                    // surrogate.)
                    if (cp >= 0xd800 && cp <= 0xdbff && i + 1 < s.length) {
                        const next = s.charCodeAt(i + 1);
                        if (next >= 0xdc00 && next <= 0xdfff) {
                            cp = 0x10000 + ((cp - 0xd800) << 10) + (next - 0xdc00);
                            i++;
                        }
                        // Unpaired high surrogate: fall through and encode
                        // it as a 3-byte sequence (WTF-8 style), matching
                        // the original's handling of a trailing surrogate.
                    }
                    if (cp < 0x80)
                        out.push(cp);
                    else if (cp < 0x800)
                        out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
                    else if (cp < 0x10000)
                        out.push(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
                    else
                        out.push(0xf0 | (cp >> 18), 0x80 | ((cp >> 12) & 0x3f), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
                }
                return new Uint8Array(out);
            },
        };
    }
}
|
package/package.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "knolo-core",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Local-first knowledge packs for small LLMs.",
|
|
6
6
|
"keywords": ["llm", "knowledge-base", "rag", "local", "expo"],
|
|
7
|
-
"author": "
|
|
7
|
+
"author": "Sam Paniagua",
|
|
8
8
|
"license": "MIT",
|
|
9
9
|
"main": "./dist/index.js",
|
|
10
10
|
"types": "./dist/index.d.ts",
|
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
},
|
|
19
19
|
"scripts": {
|
|
20
20
|
"build": "tsc -p tsconfig.json",
|
|
21
|
-
"prepublishOnly": "npm run build"
|
|
21
|
+
"prepublishOnly": "npm run build",
|
|
22
|
+
"smoke": "node scripts/smoke.mjs"
|
|
22
23
|
},
|
|
23
24
|
"devDependencies": {
|
|
24
25
|
"typescript": "^5.5.0",
|