knowy 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -76
- package/knowy.js +87 -35
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -1,108 +1,90 @@
|
|
|
1
|
-
#
|
|
1
|
+
# 🧠 KnowyEngine
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
**KnowyEngine** is a lightweight, high-performance RAG (Retrieval-Augmented Generation) engine built on **LanceDB** and **HuggingFace Transformers**. It simplifies the process of embedding, storing, and retrieving knowledge with support for scoped knowledge bases, custom metadata, and precise text splitting.
|
|
4
4
|
|
|
5
|
-
## Features
|
|
5
|
+
## 🚀 Features
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
-
|
|
12
|
-
- 🚀 **Zero Configuration** - Works out of the box with sensible defaults
|
|
7
|
+
* **Hybrid API**: Use it globally for quick actions or scoped for organized KB management.
|
|
8
|
+
* **Smart Splitting**: Recursive text splitter that preserves sentence integrity and prevents "semantic dilution."
|
|
9
|
+
* **Flexible Metadata**: Store any extra data (user IDs, versions, tags) and filter results using SQL-like queries.
|
|
10
|
+
* **Automatic Timestamps**: Every record is stamped with `__sys_created_at_` for easy time-based filtering.
|
|
11
|
+
* **Local-First**: Powered by ONNX-optimized embeddings and LanceDB for lightning-fast local vector search.
|
|
13
12
|
|
|
14
|
-
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## 📦 Installation
|
|
15
16
|
|
|
16
17
|
```bash
|
|
17
|
-
npm install
|
|
18
|
+
npm install @lancedb/lancedb @huggingface/transformers
|
|
18
19
|
```
|
|
19
20
|
|
|
20
|
-
##
|
|
21
|
-
|
|
22
|
-
```javascript
|
|
23
|
-
import { knowy } from 'knowy';
|
|
24
|
-
|
|
25
|
-
// Initialize the engine
|
|
26
|
-
const kbs = await knowy('./my_database');
|
|
27
|
-
|
|
28
|
-
// Create or access a knowledge base
|
|
29
|
-
const docs = kbs('documentation');
|
|
21
|
+
## 🛠️ Usage
|
|
30
22
|
|
|
31
|
-
|
|
32
|
-
await docs.addQA('getting-started', 'How do I install?', 'Run npm install knowy');
|
|
23
|
+
### Initialization
|
|
33
24
|
|
|
34
|
-
|
|
35
|
-
|
|
25
|
+
```javascript
|
|
26
|
+
import { knowy } from './knowy.js';
|
|
36
27
|
|
|
37
|
-
|
|
38
|
-
const results = await docs.search('how to install this package?', { limit: 5 });
|
|
39
|
-
console.log(results);
|
|
28
|
+
const kbs = await knowy("./my_knowledge_db");
|
|
40
29
|
```
|
|
41
30
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
### `knowy(dbPath)`
|
|
45
|
-
|
|
46
|
-
Initialize the Knowy engine.
|
|
47
|
-
|
|
48
|
-
- `dbPath` (string): Path to the LanceDB database directory (default: `\"./knowy_database\"`)
|
|
49
|
-
|
|
50
|
-
Returns a `kbs` function with attached methods.
|
|
31
|
+
### Adding Knowledge
|
|
51
32
|
|
|
52
|
-
|
|
33
|
+
You can use the **Global API** or the **Scoped API**:
|
|
53
34
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
35
|
+
```javascript
|
|
36
|
+
// Global Style
|
|
37
|
+
await kbs.addText("hr", "benefits", "Unlimited vacation policy.", "manual.pdf", { version: 1.2 });
|
|
38
|
+
|
|
39
|
+
// Scoped Style (Recommended for clean code)
|
|
40
|
+
const legal = kbs("legal");
|
|
41
|
+
await legal.ingest("privacy", longDocumentText, "privacy_policy.txt", {
|
|
42
|
+
chunkSize: 250,
|
|
43
|
+
overlap: 50,
|
|
44
|
+
metadata: { classification: "confidential" }
|
|
45
|
+
});
|
|
46
|
+
```
|
|
63
47
|
|
|
64
|
-
|
|
48
|
+
### Advanced Searching & Filtering
|
|
65
49
|
|
|
66
|
-
|
|
67
|
-
- `options.kbs` (string|string[]): Filter by specific knowledge base(s)
|
|
68
|
-
- `options.limit` (number): Max results (default: 10)
|
|
50
|
+
Retrieve the most relevant context while filtering by your custom metadata:
|
|
69
51
|
|
|
70
|
-
Returns array of results with `kb`, `topic`, `type`, `content`, and `score` (0-1).
|
|
71
52
|
|
|
72
|
-
### `kbs.list()`
|
|
73
53
|
|
|
74
|
-
|
|
54
|
+
```javascript
|
|
55
|
+
// Search a specific KB with a metadata filter
|
|
56
|
+
const results = await kbs("legal").search("What is the retention policy?", {
|
|
57
|
+
where: "classification = 'confidential' AND __sys_created_at_ > 1709400000000",
|
|
58
|
+
limit: 3
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
console.log(results[0].content.text);
|
|
62
|
+
console.log(results[0].metadata.classification); // "confidential"
|
|
63
|
+
```
|
|
75
64
|
|
|
76
|
-
|
|
65
|
+
### Management
|
|
77
66
|
|
|
78
67
|
```javascript
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
// Separate KBs for different domains
|
|
82
|
-
const codeKB = kbs('code-snippets');
|
|
83
|
-
const docsKB = kbs('documentation');
|
|
84
|
-
const supportKB = kbs('support-tickets');
|
|
68
|
+
// List all Knowledge Bases
|
|
69
|
+
const list = await kbs.list();
|
|
85
70
|
|
|
86
|
-
//
|
|
87
|
-
await
|
|
88
|
-
|
|
89
|
-
await supportKB.addQA('billing', 'How to cancel?', 'Go to settings > billing...');
|
|
71
|
+
// Delete a KB
|
|
72
|
+
await kbs.delete("temp_data");
|
|
73
|
+
```
|
|
90
74
|
|
|
91
|
-
|
|
92
|
-
const allResults = await kbs.search('authentication', { limit: 10 });
|
|
75
|
+
---
|
|
93
76
|
|
|
94
|
-
|
|
95
|
-
const codeOnly = await kbs.search('authentication', { kbs: 'code-snippets' });
|
|
96
|
-
```
|
|
77
|
+
## ⚙️ Configuration
|
|
97
78
|
|
|
98
|
-
|
|
79
|
+
| Option | Default | Description |
|
|
80
|
+
| :--- | :--- | :--- |
|
|
81
|
+
| `chunkSize` | `250` | Maximum characters per chunk. Lower values increase precision. |
|
|
82
|
+
| `overlap` | `80` | Character overlap between chunks to prevent splitting keywords. |
|
|
83
|
+
| `where` | `undefined` | An SQL filter string for metadata filtering (e.g., `"user_id = 5"`). |
|
|
99
84
|
|
|
100
|
-
|
|
85
|
+
## 🏗️ Architecture
|
|
101
86
|
|
|
102
|
-
## Requirements
|
|
103
87
|
|
|
104
|
-
- Node.js >= 18.0.0
|
|
105
88
|
|
|
106
|
-
|
|
89
|
+
KnowyEngine uses a **Recursive Splitter** to ensure that data is chunked logically at paragraph and sentence boundaries. These chunks are converted into 1024-dimensional vectors using the `Qwen3-Embedding-0.6B-ONNX` model, providing a perfect balance between speed and semantic accuracy for local environments.
|
|
107
90
|
|
|
108
|
-
MIT © littlejustnode
|
package/knowy.js
CHANGED
|
@@ -22,69 +22,121 @@ class KnowyEngine {
|
|
|
22
22
|
return this;
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
+
_recursiveSplit(text, chunkSize, overlap) {
|
|
26
|
+
const separators = ["\n\n", "\n", ". ", " "];
|
|
27
|
+
const split = (str, sepIdx) => {
|
|
28
|
+
if (str.length <= chunkSize || sepIdx >= separators.length) return [str];
|
|
29
|
+
const sep = separators[sepIdx];
|
|
30
|
+
const parts = str.split(sep);
|
|
31
|
+
let chunks = [];
|
|
32
|
+
let current = "";
|
|
33
|
+
for (const part of parts) {
|
|
34
|
+
if ((current + part).length > chunkSize) {
|
|
35
|
+
if (current) chunks.push(current.trim());
|
|
36
|
+
current = current.slice(-overlap) + part + sep;
|
|
37
|
+
} else {
|
|
38
|
+
current += part + sep;
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
if (current) chunks.push(current.trim());
|
|
42
|
+
return chunks;
|
|
43
|
+
};
|
|
44
|
+
return split(text, 0);
|
|
45
|
+
}
|
|
46
|
+
|
|
25
47
|
async _getEmbedding(text) {
|
|
26
|
-
const
|
|
48
|
+
const cleaned = text.replace(/\s+/g, ' ').trim();
|
|
49
|
+
const output = await this.extractor(cleaned, { pooling: 'last_token', normalize: true });
|
|
27
50
|
return Array.from(output.data);
|
|
28
51
|
}
|
|
29
52
|
|
|
30
|
-
async _upsert(
|
|
53
|
+
async _upsert(records) {
|
|
31
54
|
if (!this.table) {
|
|
32
|
-
this.table = await this.db.createTable(this.tableName,
|
|
55
|
+
this.table = await this.db.createTable(this.tableName, records);
|
|
33
56
|
} else {
|
|
34
|
-
await this.table.add(
|
|
57
|
+
await this.table.add(records);
|
|
35
58
|
}
|
|
36
59
|
}
|
|
37
60
|
|
|
38
61
|
getKB(kbName) {
|
|
39
62
|
return {
|
|
40
63
|
name: kbName,
|
|
41
|
-
addQA: (topic, q, a) => this.addQA(kbName, topic, q, a),
|
|
42
|
-
addText: (topic,
|
|
43
|
-
|
|
64
|
+
addQA: (topic, q, a, src, meta) => this.addQA(kbName, topic, q, a, src, meta),
|
|
65
|
+
addText: (topic, txt, src, meta) => this.addText(kbName, topic, txt, src, meta),
|
|
66
|
+
ingest: (topic, txt, src, opts) => this.ingest(kbName, topic, txt, src, opts),
|
|
67
|
+
search: (query, opts = {}) => this.search(query, { ...opts, kbs: kbName }),
|
|
44
68
|
delete: () => this.deleteKB(kbName)
|
|
45
69
|
};
|
|
46
70
|
}
|
|
47
71
|
|
|
48
|
-
async addQA(kbName, topic, question, answer) {
|
|
72
|
+
async addQA(kbName, topic, question, answer, source = "manual", metadata = {}) {
|
|
49
73
|
const vector = await this._getEmbedding(question);
|
|
50
|
-
await this._upsert({
|
|
74
|
+
await this._upsert([{
|
|
51
75
|
id: crypto.randomUUID(),
|
|
52
76
|
vector, kb_name: kbName, topic, type: 'qa',
|
|
53
|
-
question, answer, text: "",
|
|
54
|
-
|
|
77
|
+
question, answer, text: "", source,
|
|
78
|
+
__sys_created_at_: Date.now(),
|
|
79
|
+
...metadata
|
|
80
|
+
}]);
|
|
55
81
|
}
|
|
56
82
|
|
|
57
|
-
async addText(kbName, topic, text) {
|
|
83
|
+
async addText(kbName, topic, text, source = "note", metadata = {}) {
|
|
58
84
|
const vector = await this._getEmbedding(text);
|
|
59
|
-
await this._upsert({
|
|
85
|
+
await this._upsert([{
|
|
60
86
|
id: crypto.randomUUID(),
|
|
61
|
-
vector,
|
|
62
|
-
|
|
63
|
-
|
|
87
|
+
vector,
|
|
88
|
+
kb_name: kbName,
|
|
89
|
+
topic, type: 'text',
|
|
90
|
+
question: "", answer: "", text, source,
|
|
91
|
+
__sys_created_at_: Date.now(),
|
|
92
|
+
...metadata
|
|
93
|
+
}]);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
async ingest(kbName, topic, text, source = "file", options = {}) {
|
|
97
|
+
const { chunkSize = 250, overlap = 80, metadata = {} } = options;
|
|
98
|
+
const chunks = this._recursiveSplit(text, chunkSize, overlap);
|
|
99
|
+
const records = [];
|
|
100
|
+
const timestamp = Date.now();
|
|
101
|
+
for (const chunk of chunks) {
|
|
102
|
+
const vector = await this._getEmbedding(chunk);
|
|
103
|
+
records.push({
|
|
104
|
+
id: crypto.randomUUID(),
|
|
105
|
+
vector, kb_name: kbName, topic, type: 'text',
|
|
106
|
+
question: "", answer: "", text: chunk, source,
|
|
107
|
+
__sys_created_at_: timestamp,
|
|
108
|
+
...metadata
|
|
109
|
+
});
|
|
110
|
+
}
|
|
111
|
+
await this._upsert(records);
|
|
112
|
+
return chunks.length;
|
|
64
113
|
}
|
|
65
114
|
|
|
66
115
|
async search(query, options = {}) {
|
|
67
116
|
if (!this.table) return [];
|
|
68
|
-
const { kbs, limit =
|
|
117
|
+
const { kbs, limit = 5, where } = options;
|
|
69
118
|
const queryVector = await this._getEmbedding(query);
|
|
70
119
|
let request = this.table.vectorSearch(queryVector);
|
|
71
|
-
|
|
72
|
-
|
|
120
|
+
request.metricType = "cosine";
|
|
121
|
+
let filters = [];
|
|
73
122
|
if (kbs) {
|
|
74
|
-
|
|
123
|
+
filters.push(Array.isArray(kbs)
|
|
75
124
|
? `kb_name IN (${kbs.map(n => `"${n}"`).join(",")})`
|
|
76
|
-
: `kb_name = "${kbs}"
|
|
77
|
-
request = request.where(filter);
|
|
125
|
+
: `kb_name = "${kbs}"`);
|
|
78
126
|
}
|
|
79
|
-
|
|
127
|
+
if (where) filters.push(where);
|
|
128
|
+
if (filters.length > 0) request = request.where(filters.join(" AND "));
|
|
80
129
|
const results = await request.limit(limit).toArray();
|
|
81
|
-
return results.map(r =>
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
130
|
+
return results.map(r => {
|
|
131
|
+
const { vector, _distance, kb_name, topic, type, source, question, answer, text, __sys_created_at_, ...extra } = r;
|
|
132
|
+
return {
|
|
133
|
+
kb: kb_name, topic, type, source,
|
|
134
|
+
content: type === 'qa' ? { q: question, a: answer } : { text },
|
|
135
|
+
score: 1 - (_distance / 2),
|
|
136
|
+
__sys_created_at_,
|
|
137
|
+
metadata: extra
|
|
138
|
+
};
|
|
139
|
+
});
|
|
88
140
|
}
|
|
89
141
|
|
|
90
142
|
async deleteKB(kbName) {
|
|
@@ -98,18 +150,18 @@ class KnowyEngine {
|
|
|
98
150
|
}
|
|
99
151
|
}
|
|
100
152
|
|
|
101
|
-
/**
|
|
102
|
-
* Main export: initializes engine and returns the kbs function.
|
|
103
|
-
*/
|
|
104
153
|
export const knowy = async (dbPath = "./knowy_database") => {
|
|
105
154
|
const engine = new KnowyEngine(dbPath);
|
|
106
155
|
await engine.init();
|
|
107
156
|
|
|
108
|
-
// The kbs function
|
|
109
157
|
const kbs = (kbName) => engine.getKB(kbName);
|
|
110
158
|
|
|
111
|
-
//
|
|
159
|
+
// Global methods added back
|
|
160
|
+
kbs.addQA = (kb, topic, q, a, src, meta) => engine.addQA(kb, topic, q, a, src, meta);
|
|
161
|
+
kbs.addText = (kb, topic, txt, src, meta) => engine.addText(kb, topic, txt, src, meta);
|
|
162
|
+
kbs.ingest = (kb, topic, txt, src, opts) => engine.ingest(kb, topic, txt, src, opts);
|
|
112
163
|
kbs.search = (query, options) => engine.search(query, options);
|
|
164
|
+
kbs.delete = (kb) => engine.deleteKB(kb);
|
|
113
165
|
kbs.list = () => engine.listKBs();
|
|
114
166
|
|
|
115
167
|
return kbs;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "knowy",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "A local-first knowledge base engine with vector search using LanceDB and Hugging Face embeddings",
|
|
5
5
|
"main": "knowy.js",
|
|
6
6
|
"type": "module",
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"semantic-search",
|
|
17
17
|
"qa",
|
|
18
18
|
"ai",
|
|
19
|
-
|
|
19
|
+
"knowy",
|
|
20
20
|
"local-first"
|
|
21
21
|
],
|
|
22
22
|
"author": "littlejustnode",
|
|
@@ -36,4 +36,4 @@
|
|
|
36
36
|
"engines": {
|
|
37
37
|
"node": ">=18.0.0"
|
|
38
38
|
}
|
|
39
|
-
}
|
|
39
|
+
}
|