knowy 1.0.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +58 -76
  2. package/knowy.js +87 -35
  3. package/package.json +3 -3
package/README.md CHANGED
@@ -1,108 +1,90 @@
1
- # Knowy
1
+ # 🧠 KnowyEngine
2
2
 
3
- A local-first, zero-config knowledge base engine with semantic vector search. Built on LanceDB and Hugging Face Transformers.
3
+ **KnowyEngine** is a lightweight, high-performance RAG (Retrieval-Augmented Generation) engine built on **LanceDB** and **HuggingFace Transformers**. It simplifies the process of embedding, storing, and retrieving knowledge with support for scoped knowledge bases, custom metadata, and precise text splitting.
4
4
 
5
- ## Features
5
+ ## 🚀 Features
6
6
 
7
- - 🔍 **Semantic Search** - Find relevant content using natural language queries
8
- - 💾 **Local-First** - All data stored locally using LanceDB (no cloud required)
9
- - 🧠 **Built-in Embeddings** - Uses Qwen3-Embedding-0.6B-ONNX for high-quality vectors
10
- - 📚 **Knowledge Base Organization** - Group data into named knowledge bases
11
- - **Q&A Support** - Store question-answer pairs or free text documents
12
- - 🚀 **Zero Configuration** - Works out of the box with sensible defaults
7
+ * **Hybrid API**: Use it globally for quick actions or scoped for organized KB management.
8
+ * **Smart Splitting**: Recursive text splitter that preserves sentence integrity and prevents "semantic dilution."
9
+ * **Flexible Metadata**: Store any extra data (user IDs, versions, tags) and filter results using SQL-like queries.
10
+ * **Automatic Timestamps**: Every record is stamped with `__sys_created_at_` for easy time-based filtering.
11
+ * **Local-First**: Powered by ONNX-optimized embeddings and LanceDB for lightning-fast local vector search.
13
12
 
14
- ## Installation
13
+ ---
14
+
15
+ ## 📦 Installation
15
16
 
16
17
  ```bash
17
- npm install knowy
18
+ npm install @lancedb/lancedb @huggingface/transformers
18
19
  ```
19
20
 
20
- ## Quick Start
21
-
22
- ```javascript
23
- import { knowy } from 'knowy';
24
-
25
- // Initialize the engine
26
- const kbs = await knowy('./my_database');
27
-
28
- // Create or access a knowledge base
29
- const docs = kbs('documentation');
21
+ ## 🛠️ Usage
30
22
 
31
- // Add Q&A pairs
32
- await docs.addQA('getting-started', 'How do I install?', 'Run npm install knowy');
23
+ ### Initialization
33
24
 
34
- // Or add free text
35
- await docs.addText('api-reference', 'The API supports vector search with cosine similarity...');
25
+ ```javascript
26
+ import { knowy } from './knowy.js';
36
27
 
37
- // Search across your knowledge base
38
- const results = await docs.search('how to install this package?', { limit: 5 });
39
- console.log(results);
28
+ const kbs = await knowy("./my_knowledge_db");
40
29
  ```
41
30
 
42
- ## API Reference
43
-
44
- ### `knowy(dbPath)`
45
-
46
- Initialize the Knowy engine.
47
-
48
- - `dbPath` (string): Path to the LanceDB database directory (default: `\"./knowy_database\"`)
49
-
50
- Returns a `kbs` function with attached methods.
31
+ ### Adding Knowledge
51
32
 
52
- ### `kbs(kbName)`
33
+ You can use the **Global API** or the **Scoped API**:
53
34
 
54
- Get or create a knowledge base.
55
-
56
- Returns a knowledge base object with methods:
57
- - `addQA(topic, question, answer)` - Add a Q&A pair
58
- - `addText(topic, text)` - Add free text content
59
- - `search(query, options)` - Search within this KB
60
- - `delete()` - Delete this knowledge base
61
-
62
- ### `kbs.search(query, options)`
35
+ ```javascript
36
+ // Global Style
37
+ await kbs.addText("hr", "benefits", "Unlimited vacation policy.", "manual.pdf", { version: 1.2 });
38
+
39
+ // Scoped Style (Recommended for clean code)
40
+ const legal = kbs("legal");
41
+ await legal.ingest("privacy", longDocumentText, "privacy_policy.txt", {
42
+ chunkSize: 250,
43
+ overlap: 50,
44
+ metadata: { classification: "confidential" }
45
+ });
46
+ ```
63
47
 
64
- Search across all knowledge bases.
48
+ ### Advanced Searching & Filtering
65
49
 
66
- - `query` (string): Search query
67
- - `options.kbs` (string|string[]): Filter by specific knowledge base(s)
68
- - `options.limit` (number): Max results (default: 10)
50
+ Retrieve the most relevant context while filtering by your custom metadata:
69
51
 
70
- Returns array of results with `kb`, `topic`, `type`, `content`, and `score` (0-1).
71
52
 
72
- ### `kbs.list()`
73
53
 
74
- List all knowledge bases.
54
+ ```javascript
55
+ // Search a specific KB with a metadata filter
56
+ const results = await kbs("legal").search("What is the retention policy?", {
57
+ where: "classification = 'confidential' AND __sys_created_at_ > 1709400000000",
58
+ limit: 3
59
+ });
60
+
61
+ console.log(results[0].content.text);
62
+ console.log(results[0].metadata.classification); // "confidential"
63
+ ```
75
64
 
76
- ## Example: Multi-KB Setup
65
+ ### Management
77
66
 
78
67
  ```javascript
79
- const kbs = await knowy();
80
-
81
- // Separate KBs for different domains
82
- const codeKB = kbs('code-snippets');
83
- const docsKB = kbs('documentation');
84
- const supportKB = kbs('support-tickets');
68
+ // List all Knowledge Bases
69
+ const list = await kbs.list();
85
70
 
86
- // Add content to each...
87
- await codeKB.addText('auth', 'JWT authentication implementation...');
88
- await docsKB.addQA('setup', 'Node version?', 'Requires Node 18+');
89
- await supportKB.addQA('billing', 'How to cancel?', 'Go to settings > billing...');
71
+ // Delete a KB
72
+ await kbs.delete("temp_data");
73
+ ```
90
74
 
91
- // Search everything
92
- const allResults = await kbs.search('authentication', { limit: 10 });
75
+ ---
93
76
 
94
- // Search specific KBs only
95
- const codeOnly = await kbs.search('authentication', { kbs: 'code-snippets' });
96
- ```
77
+ ## ⚙️ Configuration
97
78
 
98
- ## Search Scoring
79
+ | Option | Default | Description |
80
+ | :--- | :--- | :--- |
81
+ | `chunkSize` | `250` | Maximum characters per chunk. Lower values increase precision. |
82
+ | `overlap` | `80` | Character overlap between chunks to prevent splitting keywords. |
83
+ | `where` | `undefined` | A SQL-string for metadata filtering (e.g., `"user_id = 5"`). |
99
84
 
100
- Results include a `score` from 0 to 1, where 1 is a perfect match. The score is calculated using cosine similarity between the query embedding and stored vectors.
85
+ ## 🏗️ Architecture
101
86
 
102
- ## Requirements
103
87
 
104
- - Node.js >= 18.0.0
105
88
 
106
- ## License
89
+ KnowyEngine uses a **Recursive Splitter** to ensure that data is chunked logically at paragraph and sentence boundaries. These chunks are converted into 1024-dimensional vectors using the `Qwen3-Embedding-0.6B-ONNX` model, providing a perfect balance between speed and semantic accuracy for local environments.
107
90
 
108
- MIT © littlejustnode
package/knowy.js CHANGED
@@ -22,69 +22,121 @@ class KnowyEngine {
22
22
  return this;
23
23
  }
24
24
 
25
+ _recursiveSplit(text, chunkSize, overlap) {
26
+ const separators = ["\n\n", "\n", ". ", " "];
27
+ const split = (str, sepIdx) => {
28
+ if (str.length <= chunkSize || sepIdx >= separators.length) return [str];
29
+ const sep = separators[sepIdx];
30
+ const parts = str.split(sep);
31
+ let chunks = [];
32
+ let current = "";
33
+ for (const part of parts) {
34
+ if ((current + part).length > chunkSize) {
35
+ if (current) chunks.push(current.trim());
36
+ current = current.slice(-overlap) + part + sep;
37
+ } else {
38
+ current += part + sep;
39
+ }
40
+ }
41
+ if (current) chunks.push(current.trim());
42
+ return chunks;
43
+ };
44
+ return split(text, 0);
45
+ }
46
+
25
47
  async _getEmbedding(text) {
26
- const output = await this.extractor(text, { pooling: 'last_token', normalize: true });
48
+ const cleaned = text.replace(/\s+/g, ' ').trim();
49
+ const output = await this.extractor(cleaned, { pooling: 'last_token', normalize: true });
27
50
  return Array.from(output.data);
28
51
  }
29
52
 
30
- async _upsert(record) {
53
+ async _upsert(records) {
31
54
  if (!this.table) {
32
- this.table = await this.db.createTable(this.tableName, [record]);
55
+ this.table = await this.db.createTable(this.tableName, records);
33
56
  } else {
34
- await this.table.add([record]);
57
+ await this.table.add(records);
35
58
  }
36
59
  }
37
60
 
38
61
  getKB(kbName) {
39
62
  return {
40
63
  name: kbName,
41
- addQA: (topic, q, a) => this.addQA(kbName, topic, q, a),
42
- addText: (topic, text) => this.addText(kbName, topic, text),
43
- search: (query, options = {}) => this.search(query, { ...options, kbs: kbName }),
64
+ addQA: (topic, q, a, src, meta) => this.addQA(kbName, topic, q, a, src, meta),
65
+ addText: (topic, txt, src, meta) => this.addText(kbName, topic, txt, src, meta),
66
+ ingest: (topic, txt, src, opts) => this.ingest(kbName, topic, txt, src, opts),
67
+ search: (query, opts = {}) => this.search(query, { ...opts, kbs: kbName }),
44
68
  delete: () => this.deleteKB(kbName)
45
69
  };
46
70
  }
47
71
 
48
- async addQA(kbName, topic, question, answer) {
72
+ async addQA(kbName, topic, question, answer, source = "manual", metadata = {}) {
49
73
  const vector = await this._getEmbedding(question);
50
- await this._upsert({
74
+ await this._upsert([{
51
75
  id: crypto.randomUUID(),
52
76
  vector, kb_name: kbName, topic, type: 'qa',
53
- question, answer, text: "", created_at: new Date().toISOString()
54
- });
77
+ question, answer, text: "", source,
78
+ __sys_created_at_: Date.now(),
79
+ ...metadata
80
+ }]);
55
81
  }
56
82
 
57
- async addText(kbName, topic, text) {
83
+ async addText(kbName, topic, text, source = "note", metadata = {}) {
58
84
  const vector = await this._getEmbedding(text);
59
- await this._upsert({
85
+ await this._upsert([{
60
86
  id: crypto.randomUUID(),
61
- vector, kb_name: kbName, topic, type: 'text',
62
- question: "", answer: "", text, created_at: new Date().toISOString()
63
- });
87
+ vector,
88
+ kb_name: kbName,
89
+ topic, type: 'text',
90
+ question: "", answer: "", text, source,
91
+ __sys_created_at_: Date.now(),
92
+ ...metadata
93
+ }]);
94
+ }
95
+
96
+ async ingest(kbName, topic, text, source = "file", options = {}) {
97
+ const { chunkSize = 250, overlap = 80, metadata = {} } = options;
98
+ const chunks = this._recursiveSplit(text, chunkSize, overlap);
99
+ const records = [];
100
+ const timestamp = Date.now();
101
+ for (const chunk of chunks) {
102
+ const vector = await this._getEmbedding(chunk);
103
+ records.push({
104
+ id: crypto.randomUUID(),
105
+ vector, kb_name: kbName, topic, type: 'text',
106
+ question: "", answer: "", text: chunk, source,
107
+ __sys_created_at_: timestamp,
108
+ ...metadata
109
+ });
110
+ }
111
+ await this._upsert(records);
112
+ return chunks.length;
64
113
  }
65
114
 
66
115
  async search(query, options = {}) {
67
116
  if (!this.table) return [];
68
- const { kbs, limit = 10 } = options;
117
+ const { kbs, limit = 5, where } = options;
69
118
  const queryVector = await this._getEmbedding(query);
70
119
  let request = this.table.vectorSearch(queryVector);
71
- request.metricType = "cosine";
72
-
120
+ request.metricType = "cosine";
121
+ let filters = [];
73
122
  if (kbs) {
74
- const filter = Array.isArray(kbs)
123
+ filters.push(Array.isArray(kbs)
75
124
  ? `kb_name IN (${kbs.map(n => `"${n}"`).join(",")})`
76
- : `kb_name = "${kbs}"`;
77
- request = request.where(filter);
125
+ : `kb_name = "${kbs}"`);
78
126
  }
79
-
127
+ if (where) filters.push(where);
128
+ if (filters.length > 0) request = request.where(filters.join(" AND "));
80
129
  const results = await request.limit(limit).toArray();
81
- return results.map(r => ({
82
- kb: r.kb_name,
83
- topic: r.topic,
84
- type: r.type,
85
- content: r.type === 'qa' ? { q: r.question, a: r.answer } : { text: r.text },
86
- score: 1-(r._distance/2)
87
- }));
130
+ return results.map(r => {
131
+ const { vector, _distance, kb_name, topic, type, source, question, answer, text, __sys_created_at_, ...extra } = r;
132
+ return {
133
+ kb: kb_name, topic, type, source,
134
+ content: type === 'qa' ? { q: question, a: answer } : { text },
135
+ score: 1 - (_distance / 2),
136
+ __sys_created_at_,
137
+ metadata: extra
138
+ };
139
+ });
88
140
  }
89
141
 
90
142
  async deleteKB(kbName) {
@@ -98,18 +150,18 @@ class KnowyEngine {
98
150
  }
99
151
  }
100
152
 
101
- /**
102
- * Main export: initializes engine and returns the kbs function.
103
- */
104
153
  export const knowy = async (dbPath = "./knowy_database") => {
105
154
  const engine = new KnowyEngine(dbPath);
106
155
  await engine.init();
107
156
 
108
- // The kbs function
109
157
  const kbs = (kbName) => engine.getKB(kbName);
110
158
 
111
- // Attached methods
159
+ // Global methods added back
160
+ kbs.addQA = (kb, topic, q, a, src, meta) => engine.addQA(kb, topic, q, a, src, meta);
161
+ kbs.addText = (kb, topic, txt, src, meta) => engine.addText(kb, topic, txt, src, meta);
162
+ kbs.ingest = (kb, topic, txt, src, opts) => engine.ingest(kb, topic, txt, src, opts);
112
163
  kbs.search = (query, options) => engine.search(query, options);
164
+ kbs.delete = (kb) => engine.deleteKB(kb);
113
165
  kbs.list = () => engine.listKBs();
114
166
 
115
167
  return kbs;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "knowy",
3
- "version": "1.0.0",
3
+ "version": "1.0.3",
4
4
  "description": "A local-first knowledge base engine with vector search using LanceDB and Hugging Face embeddings",
5
5
  "main": "knowy.js",
6
6
  "type": "module",
@@ -16,7 +16,7 @@
16
16
  "semantic-search",
17
17
  "qa",
18
18
  "ai",
19
- "knowy",
19
+ "knowy",
20
20
  "local-first"
21
21
  ],
22
22
  "author": "littlejustnode",
@@ -36,4 +36,4 @@
36
36
  "engines": {
37
37
  "node": ">=18.0.0"
38
38
  }
39
- }
39
+ }