@sjovanovic/recall.js 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -26
- package/package.json +6 -6
- package/recall.js +51 -19
- package/utils/sanitize.js +34 -0
package/README.md
CHANGED
|
@@ -8,11 +8,11 @@ Recall.js is long term memory for AI apps!
|
|
|
8
8
|
|
|
9
9
|
It is a tool for building RAG (retrieval-augmented generation) in the form of a JavaScript library and command-line utility, focused on speed, ease of use and embeddability.
|
|
10
10
|
|
|
11
|
-
It is versatile and you don't have to use it exclusively for RAG,
|
|
11
|
+
It is versatile and you don't have to use it exclusively for RAG, it can also be used for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are many possibilities...
|
|
12
12
|
|
|
13
13
|
Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
|
|
14
14
|
|
|
15
|
-
Under the hood, recall.js uses
|
|
15
|
+
Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) feature extraction and a vector database to index and query your data. It is a light wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
|
|
16
16
|
|
|
17
17
|
## Install
|
|
18
18
|
|
|
@@ -26,7 +26,7 @@ Console:
|
|
|
26
26
|
recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
|
|
27
27
|
recall --query "Un animal saute par-dessus un autre animal" --limit 1
|
|
28
28
|
```
|
|
29
|
-
**Warning:** when this library is used for the first time, it will download a local language model MiniLM-L12-v2 which may take
|
|
29
|
+
**Warning:** when this library is used for the first time, it will download a local language model Multilingual-MiniLM-L12-v2 which may take a while depending on your Internet connectivity. Please be patient.
|
|
30
30
|
|
|
31
31
|
Below is the same example in JavaScript:
|
|
32
32
|
|
|
@@ -58,16 +58,18 @@ response:
|
|
|
58
58
|
"dist",
|
|
59
59
|
"result",
|
|
60
60
|
"id",
|
|
61
|
-
"data"
|
|
61
|
+
"data",
|
|
62
|
+
"category"
|
|
62
63
|
],
|
|
63
64
|
"rows": [
|
|
64
65
|
[
|
|
65
|
-
0.
|
|
66
|
+
0.6840495824813843, // vector similarity
|
|
66
67
|
"Fox and dog",
|
|
67
68
|
"08840189191373282",
|
|
68
69
|
{
|
|
69
70
|
"foo": "bar"
|
|
70
|
-
}
|
|
71
|
+
},
|
|
72
|
+
""
|
|
71
73
|
]
|
|
72
74
|
]
|
|
73
75
|
}
|
|
@@ -84,20 +86,21 @@ Easy way to view all the options is via command line:
|
|
|
84
86
|
recall --help
|
|
85
87
|
|
|
86
88
|
Usage:
|
|
87
|
-
recall --query "Foo Bar"
|
|
89
|
+
recall.js --query "Foo Bar"
|
|
88
90
|
|
|
89
91
|
Options:
|
|
90
|
-
--query "SEARCH_STRING"
|
|
91
|
-
--limit 2
|
|
92
|
-
--add 'input|result|{"foo":"bar"}'
|
|
93
|
-
--remove 'id'
|
|
94
|
-
--nuke
|
|
95
|
-
--mcp
|
|
96
|
-
--db "FILE_NAME"
|
|
97
|
-
--import "file.csv | file.tsv"
|
|
98
|
-
--input-header "foo"
|
|
99
|
-
--result-header "bar"
|
|
100
|
-
--json "FILE_NAME"
|
|
92
|
+
--query "SEARCH_STRING" - search
|
|
93
|
+
--limit 2 - limit number of results (used with --query)
|
|
94
|
+
--add 'input|result|{"foo":"bar"}|categ' - add data
|
|
95
|
+
--remove 'id' - remove data
|
|
96
|
+
--nuke - destroy database
|
|
97
|
+
--mcp - run as MCP server (experimental)
|
|
98
|
+
--db "FILE_NAME" - database file (SQLite)
|
|
99
|
+
--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
|
|
100
|
+
--input-header "foo" - when used with --import designates specific header column as input
|
|
101
|
+
--result-header "bar" - when used with --import designates specific header column as result
|
|
102
|
+
--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
|
|
103
|
+
--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)
|
|
101
104
|
```
|
|
102
105
|
|
|
103
106
|
**Note:** when adding data recall will generate unique id automatically. To set custom id add it as a string property named "id" in the data object (i.e. `{"id":"customID"}`).
|
|
@@ -111,11 +114,14 @@ Configuration object.
|
|
|
111
114
|
|
|
112
115
|
```javascript
|
|
113
116
|
export const config = {
|
|
114
|
-
VECTOR_SIZE: 384, // number of dimensions
|
|
115
|
-
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
|
|
117
|
+
VECTOR_SIZE: 384, // number of dimensions (must match the model's output)
|
|
118
|
+
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
|
|
116
119
|
SHOW_ERRORS: true, // Show errors
|
|
117
120
|
DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
|
|
118
|
-
PATH: PATH // directory of recall.js
|
|
121
|
+
PATH: PATH, // directory of recall.js
|
|
122
|
+
DEVICE: undefined, // Transformers.js device
|
|
123
|
+
DTYPE: undefined, // Transformers.js dtype
|
|
124
|
+
PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
|
|
119
125
|
}
|
|
120
126
|
```
|
|
121
127
|
|
|
@@ -127,7 +133,7 @@ Returns reference to the CozoDB instance.
|
|
|
127
133
|
|
|
128
134
|
Given text calculates the embeddings vector
|
|
129
135
|
|
|
130
|
-
### RECALL.add(input, result, data={}) -> Promise(Object)
|
|
136
|
+
### RECALL.add(input, result, data={}, category="") -> Promise(Object)
|
|
131
137
|
|
|
132
138
|
Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is an arbitrary object intended to hold related pieces of information and references. If the `data` object contains an `id` property, it will be used as the unique id of the record. `category` is an optional string label for the record (defaults to an empty string).
|
|
133
139
|
|
|
@@ -136,14 +142,14 @@ Add data. `input` is the sentence to get embeddings from. `result` is the string
|
|
|
136
142
|
Add data in batches (faster than calling `add` repeatedly).
|
|
137
143
|
`batch` is an Array that looks like this:
|
|
138
144
|
```
|
|
139
|
-
let batch = [{input:"", result:"", data:{}}]
|
|
145
|
+
let batch = [{input:"", result:"", data:{}, category:""}]
|
|
140
146
|
```
|
|
141
147
|
|
|
142
148
|
### RECALL.remove(id) -> Promise(Object)
|
|
143
149
|
|
|
144
150
|
Remove data by id. id is a string.
|
|
145
151
|
|
|
146
|
-
### RECALL.searchText(text, numResults = 5) -> Promise(Object)
|
|
152
|
+
### RECALL.searchText(text, category="", numResults = 5, includeInput=false) -> Promise(Object)
|
|
147
153
|
|
|
148
154
|
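Query the vector database. Accepts the query text, an optional category to filter by, the number of results to return, and a flag that adds the stored input text to each result row.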
|
|
149
155
|
|
|
@@ -155,8 +161,8 @@ Deletes the database.
|
|
|
155
161
|
|
|
156
162
|
Imports from readable stream or file which consists of JSON objects, one per line. e.g.
|
|
157
163
|
```
|
|
158
|
-
{input:"one", result:"one result", data:{"id":"123"}}
|
|
159
|
-
{input:"", result:"", data:{}}
|
|
164
|
+
{input:"one", result:"one result", data:{"id":"123"}, category:""}
|
|
165
|
+
{input:"", result:"", data:{}, category:""}
|
|
160
166
|
...
|
|
161
167
|
```
|
|
162
168
|
This is the most efficient way to import data.
|
package/package.json
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sjovanovic/recall.js",
|
|
3
|
-
"version": "1.0.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.0.4",
|
|
4
|
+
"description": "Easy RAG with semantic search and long term memory",
|
|
5
5
|
"main": "recall.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"recall": "recall.js"
|
|
8
8
|
},
|
|
9
9
|
"type": "module",
|
|
10
10
|
"scripts": {
|
|
11
|
+
"start": "node recall.js",
|
|
11
12
|
"test": "echo \"Error: no test specified\" && exit 1",
|
|
12
13
|
"query": "node recall.js --query "
|
|
13
14
|
},
|
|
14
15
|
"author": "Slobodan Jovanovic",
|
|
15
16
|
"license": "ISC",
|
|
16
17
|
"dependencies": {
|
|
17
|
-
"@
|
|
18
|
-
"@
|
|
19
|
-
"@xenova/transformers": "^2.17.2",
|
|
18
|
+
"@huggingface/transformers": "^4.2.0",
|
|
19
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
20
20
|
"cozo-node": "^0.7.6",
|
|
21
21
|
"csv-parser": "^3.2.0",
|
|
22
|
-
"zod": "^3.
|
|
22
|
+
"zod": "^4.3.6"
|
|
23
23
|
}
|
|
24
24
|
}
|
package/recall.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {CozoDb} from 'cozo-node'
|
|
3
|
-
import
|
|
3
|
+
import { pipeline } from "@huggingface/transformers";
|
|
4
4
|
import csv from 'csv-parser'
|
|
5
5
|
import fs from 'fs'
|
|
6
6
|
import { resolve, join, dirname, sep } from 'path'
|
|
@@ -10,17 +10,22 @@ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mc
|
|
|
10
10
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
11
11
|
import { z } from "zod";
|
|
12
12
|
|
|
13
|
+
// import {sanitizeValue} from './utils/sanitize.js'
|
|
14
|
+
|
|
13
15
|
const pathToThisFile = resolve(fileURLToPath(import.meta.url))
|
|
14
16
|
const pathPassedToNode = resolve(process.argv[1])
|
|
15
17
|
const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
|
|
16
18
|
const PATH = dirname(pathToThisFile)
|
|
17
19
|
|
|
18
20
|
/**
 * Library-wide settings. Callers may overwrite these properties before the
 * first call that needs them.
 */
export const config = {
    // Embedding dimensionality; has to agree with the length of the vectors the model emits.
    VECTOR_SIZE: 384,
    // Model identifier handed to Transformers.js.
    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2',
    // Show errors.
    SHOW_ERRORS: true,
    // Location of the database file (SQLite file used by CozoDB).
    DB_FILE: join(PATH, 'vector.db'),
    // Directory that contains recall.js.
    PATH,
    // Forwarded to Transformers.js as the `device` option.
    DEVICE: undefined,
    // Forwarded to Transformers.js as the `dtype` option.
    DTYPE: undefined,
    // When set, called from the Transformers.js progress_callback in place of the built-in progress output.
    PROGRESS_CALLBACK: undefined
}
|
|
25
30
|
|
|
26
31
|
var db = null, initDone = false
|
|
@@ -33,7 +38,6 @@ export const getDb = () => {
|
|
|
33
38
|
}
|
|
34
39
|
|
|
35
40
|
async function printQuery(query, params = {}) {
|
|
36
|
-
|
|
37
41
|
try{
|
|
38
42
|
if(!initDone) {
|
|
39
43
|
initDone = true
|
|
@@ -51,12 +55,22 @@ async function printQuery(query, params = {}) {
|
|
|
51
55
|
}
|
|
52
56
|
|
|
53
57
|
/**
 * Compute the embedding vector for a piece of text.
 *
 * The feature-extraction pipeline is created on first use from
 * `config.MODEL_NAME`, `config.DEVICE` and `config.DTYPE`, then cached on
 * `config._pipe` so later calls reuse it.
 *
 * While the model loads, progress events go to `config.PROGRESS_CALLBACK`
 * when it is a function; otherwise a one-line progress indicator is written
 * to stdout.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<Array>} The mean-pooled, normalized embedding copied into a plain array.
 */
export const getEmbeddings = async (text) => {
    let pipe = config._pipe
    if(!pipe) {
        config._pipe = await pipeline("feature-extraction", config.MODEL_NAME, {
            progress_callback: (progress) => {
                // Hand the progress event to the user callback; previously it was
                // invoked with no arguments, so it could not report anything.
                if(typeof config.PROGRESS_CALLBACK === 'function') return config.PROGRESS_CALLBACK(progress);
                if(progress.status === "progress_total"){
                    // "\r\x1b[K" returns to column 0 and clears the line so the counter updates in place.
                    process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
                }
            },
            device: config.DEVICE,
            dtype: config.DTYPE
        });
        pipe = config._pipe
    }
    const embedding = await pipe(text, { pooling: "mean", normalize: true });
    return Array.from(embedding.data)
}
|
|
61
75
|
|
|
62
76
|
export const createTable = async () => {
|
|
@@ -81,7 +95,6 @@ export const createTable = async () => {
|
|
|
81
95
|
|
|
82
96
|
export const add = async (input, result, data={}, category="") => {
|
|
83
97
|
if(!input || !result) return
|
|
84
|
-
|
|
85
98
|
input = sanitizeString(input)
|
|
86
99
|
result = sanitizeString(result)
|
|
87
100
|
const embedding = await getEmbeddings(input)
|
|
@@ -99,7 +112,7 @@ export const add = async (input, result, data={}, category="") => {
|
|
|
99
112
|
* @param {Array} batch
|
|
100
113
|
* @returns
|
|
101
114
|
*/
|
|
102
|
-
export const addBatch = async (batch) => {
|
|
115
|
+
export const addBatch = async (batch, opts={onProgress:null}) => {
|
|
103
116
|
if(!batch || !Array.isArray(batch)) return
|
|
104
117
|
let vectorBatch = []
|
|
105
118
|
for(let i=0;i<batch.length; i++){
|
|
@@ -125,27 +138,46 @@ export const addBatch = async (batch) => {
|
|
|
125
138
|
:put embeddings {id, category => v, input, result, data}`
|
|
126
139
|
}
|
|
127
140
|
vectorBatch.push(item)
|
|
141
|
+
|
|
142
|
+
if(opts.onProgress && typeof opts.onProgress == 'function') {
|
|
143
|
+
await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
|
|
144
|
+
}
|
|
128
145
|
}
|
|
129
146
|
return await printQuery(vectorBatch.join("\n"))
|
|
130
147
|
}
|
|
131
148
|
|
|
132
149
|
/**
 * Strip characters that are unsafe to splice into a query string and tidy
 * the whitespace.
 *
 * Each of / # $ % ^ & * { } = _ ` ~ ( ) " becomes a space, runs of two or
 * more whitespace characters collapse to one space, and the result is trimmed.
 *
 * @param {*} str - Value to clean. `null`/`undefined` yield an empty string;
 *   other non-strings are converted with `String()` instead of throwing.
 * @returns {string} The cleaned string.
 */
const sanitizeString = (str)=>{
    // An explicit null bypasses callers' default parameters, so guard here.
    const text = str == null ? "" : String(str)
    return text.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
}
|
|
135
152
|
|
|
136
153
|
/**
 * Remove a single record identified by its id and category.
 *
 * @param {string} id - Record id. Anything other than a-z, A-Z and 0-9 is
 *   stripped before use; a missing, non-string or fully-stripped id makes
 *   the call a no-op.
 * @param {string} [category=""] - Category of the record. The empty string
 *   is the default category and is a valid value here.
 * @returns {Promise<*>} Whatever `printQuery` resolves to, or `undefined`
 *   when no usable id was supplied.
 */
export const remove = async (id, category="") => {
    if(!id || typeof id != 'string') return
    id = id.replace(/[^a-zA-Z0-9]/g, '')
    category = sanitizeString(category)
    // Only the id is mandatory. Rejecting an empty category made it impossible
    // to remove records stored under the default category "".
    if(!id) return
    // The category is double-quoted because sanitizeString strips `"` (but not `'`),
    // matching how removeAllByCategory embeds it.
    // `:rm` is the row-removal query option, as used by removeAllByCategory.
    let results = await printQuery(
        `?[id, category] <- [['${id}', "${category}"]]
        :rm embeddings {id, category}`)
    return results
}
|
|
163
|
+
|
|
164
|
+
/**
 * Delete every stored row that belongs to the given category.
 *
 * The category is passed through `sanitizeString` first; if nothing is left
 * afterwards the call does nothing. Errors thrown while running the query
 * are logged with `console.error` and not rethrown.
 *
 * @param {string} [category=""] - Category whose rows should be removed.
 * @returns {Promise<*>} The `printQuery` result, or `undefined` when the
 *   category is empty or the query threw.
 */
export const removeAllByCategory = async (category="") => {
    const cleanCategory = sanitizeString(category)
    if(cleanCategory === "") return

    // Select all (id, category) keys in the category, then remove those rows.
    const script = [
        `?[id, category] := *embeddings{id, category}, category = "${cleanCategory}"`,
        ":rm embeddings {id, category}"
    ].join("\n")

    try {
        return await printQuery(script)
    } catch(err) {
        console.error(err)
        return undefined
    }
}
|
|
145
177
|
|
|
146
|
-
export const searchText = async (text, category="", numResults = 5) => {
|
|
178
|
+
export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
|
|
147
179
|
const embedding = await getEmbeddings(text)
|
|
148
|
-
let results = await printQuery(`?[dist, result, id, data, category] := ~embeddings:index_name { id, v, input, result, data, category |
|
|
180
|
+
let results = await printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
|
|
149
181
|
query: q,
|
|
150
182
|
k: ${numResults}, # number of results
|
|
151
183
|
ef: 50, # number of neighbours to consider
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Validate and normalize a user-supplied string value.
 *
 * The value is NFC-normalized and trimmed, then rejected when it is empty,
 * longer than `maxChars` UTF-16 code units, or contains control/format
 * characters, private-use or U+FFF0–U+FFFF code points, or unpaired surrogates.
 * All other characters are accepted, including emoji and other
 * characters outside the Basic Multilingual Plane.
 *
 * @param {string} stringValue - Value to validate.
 * @param {number} [maxChars=1000] - Maximum allowed length after trimming.
 * @returns {string} The normalized, trimmed value.
 * @throws {Error} If the value is not a string or fails any check above.
 */
export function sanitizeValue(stringValue, maxChars=1000) {
    if (typeof stringValue !== 'string') {
        throw new Error('stringValue must be a string');
    }

    let sanitized = stringValue.normalize('NFC').trim();

    // Basic validation
    if (sanitized.length === 0) {
        throw new Error('stringValue name cannot be empty');
    }

    if (sanitized.length > maxChars) {
        throw new Error(`stringValue name too long (max ${maxChars} characters)`);
    }

    // Block control characters (primary security concern)
    // This allows all other Unicode characters including emojis, Chinese, Arabic, etc.
    if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
        throw new Error('stringValue contains disallowed control characters');
    }

    // Block private use areas and the U+FFF0–U+FFFF specials block
    if (/[\uE000-\uF8FF\uFFF0-\uFFFF]/.test(sanitized)) {
        throw new Error('stringValue contains disallowed Unicode characters');
    }

    // Block unpaired surrogates only. The `u` flag makes the regex match by
    // code point, so a valid surrogate pair (e.g. an emoji) is seen as one
    // astral code point and passes; without it every emoji was rejected.
    if (/[\uD800-\uDFFF]/u.test(sanitized)) {
        throw new Error('stringValue contains invalid Unicode characters');
    }

    return sanitized;
}
|