@sjovanovic/recall.js 1.0.1 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -46
- package/package.json +6 -6
- package/recall.js +106 -77
- package/utils/sanitize.js +34 -0
package/README.md
CHANGED
|
@@ -6,13 +6,13 @@
|
|
|
6
6
|
|
|
7
7
|
Recall.js is long term memory for AI apps!
|
|
8
8
|
|
|
9
|
-
It is a
|
|
9
|
+
It is a tool for building RAG (retrieval-augmented generation) in the form of a JavaScript library and command-line utility focused on speed, ease of use and embeddability.
|
|
10
10
|
|
|
11
|
-
It is versatile
|
|
11
|
+
It is versatile, and you don't have to use it exclusively for RAG: it can also be used for generic semantic search, as expert memory for your AI app, or as a recommendation system. There are many possibilities.
|
|
12
12
|
|
|
13
13
|
Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
|
|
14
14
|
|
|
15
|
-
Under the hood, recall.js uses
|
|
15
|
+
Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) feature extraction and a vector database to index and query your data. It is a light wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) and the [CozoDB](https://www.cozodb.org/) vector database.
|
|
16
16
|
|
|
17
17
|
## Install
|
|
18
18
|
|
|
@@ -20,7 +20,15 @@ Under the hood, recall.js uses sentence vector embeddings and a vector database
|
|
|
20
20
|
|
|
21
21
|
## Usage
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
Console:
|
|
24
|
+
|
|
25
|
+
```console
|
|
26
|
+
recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
|
|
27
|
+
recall --query "Un animal saute par-dessus un autre animal" --limit 1
|
|
28
|
+
```
|
|
29
|
+
**Warning:** when this library is used for the first time, it will download the local language model Multilingual-MiniLM-L12-v2, which may take a while depending on your Internet connection. Please be patient.
|
|
30
|
+
|
|
31
|
+
Below is the same example in JavaScript:
|
|
24
32
|
|
|
25
33
|
```javascript
|
|
26
34
|
|
|
@@ -50,16 +58,18 @@ response:
|
|
|
50
58
|
"dist",
|
|
51
59
|
"result",
|
|
52
60
|
"id",
|
|
53
|
-
"data"
|
|
61
|
+
"data",
|
|
62
|
+
"category"
|
|
54
63
|
],
|
|
55
64
|
"rows": [
|
|
56
65
|
[
|
|
57
|
-
0.
|
|
66
|
+
0.6840495824813843, // vector similarity
|
|
58
67
|
"Fox and dog",
|
|
59
68
|
"08840189191373282",
|
|
60
69
|
{
|
|
61
70
|
"foo": "bar"
|
|
62
|
-
}
|
|
71
|
+
},
|
|
72
|
+
""
|
|
63
73
|
]
|
|
64
74
|
]
|
|
65
75
|
}
|
|
@@ -68,105 +78,102 @@ response:
|
|
|
68
78
|
|
|
69
79
|
```
|
|
70
80
|
|
|
71
|
-
Here's how the above example looks like in CLI:
|
|
72
|
-
|
|
73
|
-
```log
|
|
74
|
-
recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
|
|
75
|
-
recall --query "Un animal saute par-dessus un autre animal" --limit 1
|
|
76
|
-
```
|
|
77
|
-
|
|
78
81
|
## Options
|
|
79
82
|
|
|
80
|
-
|
|
83
|
+
The easiest way to view all the options is via the command line:
|
|
81
84
|
|
|
82
|
-
```
|
|
85
|
+
```console
|
|
83
86
|
recall --help
|
|
84
87
|
|
|
85
88
|
Usage:
|
|
86
|
-
recall --query "Foo Bar"
|
|
89
|
+
recall.js --query "Foo Bar"
|
|
87
90
|
|
|
88
91
|
Options:
|
|
89
|
-
--query "SEARCH_STRING"
|
|
90
|
-
--limit 2
|
|
91
|
-
--add 'input|result|{"foo":"bar"}'
|
|
92
|
-
--remove 'id'
|
|
93
|
-
--nuke
|
|
94
|
-
--mcp
|
|
95
|
-
--db "FILE_NAME"
|
|
96
|
-
--import "file.csv | file.tsv"
|
|
97
|
-
--input-header "foo"
|
|
98
|
-
--result-header "bar"
|
|
99
|
-
--json "FILE_NAME"
|
|
92
|
+
--query "SEARCH_STRING" - search
|
|
93
|
+
--limit 2 - limit number of results (used with --query)
|
|
94
|
+
--add 'input|result|{"foo":"bar"}|categ' - add data
|
|
95
|
+
--remove 'id' - remove data
|
|
96
|
+
--nuke - destroy database
|
|
97
|
+
--mcp - run as MCP server (experimental)
|
|
98
|
+
--db "FILE_NAME" - database file (SQLite)
|
|
99
|
+
--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
|
|
100
|
+
--input-header "foo" - when used with --import designates specific header column as input
|
|
101
|
+
--result-header "bar" - when used with --import designates specific header column as result
|
|
102
|
+
--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
|
|
103
|
+
--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)
|
|
100
104
|
```
|
|
101
105
|
|
|
102
|
-
Note when adding data recall will generate unique id automatically. To set custom id add it as a string property named "id" in the data object (i.e. `{"id":"customID"}`).
|
|
106
|
+
**Note:** when adding data, recall generates a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
|
|
103
107
|
|
|
104
108
|
|
|
105
109
|
## JavaScript API Reference
|
|
106
110
|
|
|
107
|
-
|
|
111
|
+
### RECALL.config
|
|
108
112
|
|
|
109
113
|
Configuration object.
|
|
110
114
|
|
|
111
115
|
```javascript
|
|
112
116
|
export const config = {
|
|
113
|
-
VECTOR_SIZE: 384, // number of dimensions
|
|
114
|
-
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
|
|
117
|
+
VECTOR_SIZE: 384, // number of dimensions (must match the models output)
|
|
118
|
+
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
|
|
115
119
|
SHOW_ERRORS: true, // Show errors
|
|
116
120
|
DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
|
|
117
|
-
PATH: PATH // directory of recall.js
|
|
121
|
+
PATH: PATH, // directory of recall.js
|
|
122
|
+
DEVICE: undefined, // Transformers.js device
|
|
123
|
+
DTYPE: undefined, // Transformers.js dtype
|
|
124
|
+
PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
|
|
118
125
|
}
|
|
119
126
|
```
|
|
120
127
|
|
|
121
|
-
|
|
128
|
+
### RECALL.getDb()
|
|
122
129
|
|
|
123
130
|
Returns reference to the CozoDB instance.
|
|
124
131
|
|
|
125
|
-
|
|
132
|
+
### RECALL.getEmbeddings(text) -> Promise(Array)
|
|
126
133
|
|
|
127
134
|
Given text calculates the embeddings vector
|
|
128
135
|
|
|
129
|
-
|
|
136
|
+
### RECALL.add(input, result, data={}, category="") -> Promise(Object)
|
|
130
137
|
|
|
131
138
|
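Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is an arbitrary object intended to hold related pieces of information and references. If the `data` object contains an `id` property, it will be used as the unique id of the record. `category` is an optional string stored with the record and used to filter results when querying (defaults to an empty string).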
|
|
132
139
|
|
|
133
|
-
|
|
140
|
+
### RECALL.addBatch(batch) -> Promise(Object)
|
|
134
141
|
|
|
135
142
|
Add data in batches (faster than calling `add` repeatedly).
|
|
136
143
|
`batch` is an Array that looks like this:
|
|
137
144
|
```
|
|
138
|
-
let batch = [{input:"", result:"", data:{}}]
|
|
145
|
+
let batch = [{input:"", result:"", data:{}, category:""}]
|
|
139
146
|
```
|
|
140
147
|
|
|
141
|
-
|
|
148
|
+
### RECALL.remove(id) -> Promise(Object)
|
|
142
149
|
|
|
143
150
|
Remove data by id. id is a string.
|
|
144
151
|
|
|
145
|
-
|
|
152
|
+
### RECALL.searchText(text, category="", numResults = 5, includeInput=false) -> Promise(Object)
|
|
146
153
|
|
|
147
154
|
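Query the vector database. Accepts the query text, an optional category to filter by, the number of results to return, and a flag that includes each record's `input` in the results.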
|
|
148
155
|
|
|
149
|
-
|
|
156
|
+
### RECALL.nuke()
|
|
150
157
|
|
|
151
158
|
Deletes the database.
|
|
152
159
|
|
|
153
|
-
|
|
160
|
+
### RECALL.importFromJSONStream(fileName) -> Promise(object)
|
|
154
161
|
|
|
155
162
|
Imports from readable stream or file which consists of JSON objects, one per line. e.g.
|
|
156
163
|
```
|
|
157
|
-
{input:"one", result:"one result", data:{"id":"123"}}
|
|
158
|
-
{input:"", result:"", data:{}}
|
|
164
|
+
{input:"one", result:"one result", data:{"id":"123"}, category:""}
|
|
165
|
+
{input:"", result:"", data:{}, category:""}
|
|
159
166
|
...
|
|
160
167
|
```
|
|
161
168
|
This is the most efficient way to import data.
|
|
162
169
|
|
|
163
|
-
|
|
170
|
+
### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
|
|
164
171
|
|
|
165
172
|
Imports from a CSV or TSV file. By default, the first column is used as input, the second as result, and the remaining columns are put in the data object.
|
|
166
173
|
If `inputHeader` is specified, function will try to find the column by that name and use it as input.
|
|
167
174
|
If `resultHeader` is specified, function will try to find the column by that name and use it as result.
|
|
168
175
|
|
|
169
|
-
|
|
176
|
+
### RECALL.mcp() -> Promise()
|
|
170
177
|
|
|
171
178
|
(Experimental)
|
|
172
179
|
Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
|
package/package.json
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sjovanovic/recall.js",
|
|
3
|
-
"version": "1.0.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "1.0.4",
|
|
4
|
+
"description": "Easy RAG with semantic search and long term memory",
|
|
5
5
|
"main": "recall.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"recall": "recall.js"
|
|
8
8
|
},
|
|
9
9
|
"type": "module",
|
|
10
10
|
"scripts": {
|
|
11
|
+
"start": "node recall.js",
|
|
11
12
|
"test": "echo \"Error: no test specified\" && exit 1",
|
|
12
13
|
"query": "node recall.js --query "
|
|
13
14
|
},
|
|
14
15
|
"author": "Slobodan Jovanovic",
|
|
15
16
|
"license": "ISC",
|
|
16
17
|
"dependencies": {
|
|
17
|
-
"@
|
|
18
|
-
"@
|
|
19
|
-
"@xenova/transformers": "^2.17.2",
|
|
18
|
+
"@huggingface/transformers": "^4.2.0",
|
|
19
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
20
20
|
"cozo-node": "^0.7.6",
|
|
21
21
|
"csv-parser": "^3.2.0",
|
|
22
|
-
"zod": "^3.
|
|
22
|
+
"zod": "^4.3.6"
|
|
23
23
|
}
|
|
24
24
|
}
|
package/recall.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {CozoDb} from 'cozo-node'
|
|
3
|
-
import
|
|
3
|
+
import { pipeline } from "@huggingface/transformers";
|
|
4
4
|
import csv from 'csv-parser'
|
|
5
5
|
import fs from 'fs'
|
|
6
6
|
import { resolve, join, dirname, sep } from 'path'
|
|
@@ -10,35 +10,43 @@ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mc
|
|
|
10
10
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
11
11
|
import { z } from "zod";
|
|
12
12
|
|
|
13
|
+
// import {sanitizeValue} from './utils/sanitize.js'
|
|
14
|
+
|
|
13
15
|
const pathToThisFile = resolve(fileURLToPath(import.meta.url))
|
|
14
16
|
const pathPassedToNode = resolve(process.argv[1])
|
|
15
17
|
const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
|
|
16
18
|
const PATH = dirname(pathToThisFile)
|
|
17
19
|
|
|
18
20
|
export const config = {
|
|
19
|
-
VECTOR_SIZE: 384, // number of dimensions
|
|
20
|
-
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
|
|
21
|
+
VECTOR_SIZE: 384, // number of dimensions (must match the models output)
|
|
22
|
+
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
|
|
21
23
|
SHOW_ERRORS: true, // Show errors
|
|
22
24
|
DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
|
|
23
|
-
PATH: PATH // directory of recall.js
|
|
25
|
+
PATH: PATH, // directory of recall.js
|
|
26
|
+
DEVICE: undefined, // Transformers.js device
|
|
27
|
+
DTYPE: undefined, // Transformers.js dtype
|
|
28
|
+
PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
|
|
24
29
|
}
|
|
25
30
|
|
|
26
|
-
var db = null
|
|
31
|
+
var db = null, initDone = false
|
|
27
32
|
|
|
28
33
|
export const getDb = () => {
|
|
29
|
-
if(!db)
|
|
34
|
+
if(!db) {
|
|
35
|
+
db = new CozoDb('sqlite', config.DB_FILE)
|
|
36
|
+
}
|
|
30
37
|
return db
|
|
31
38
|
}
|
|
32
39
|
|
|
33
40
|
async function printQuery(query, params = {}) {
|
|
34
|
-
try
|
|
35
|
-
if(!
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
let isCreated = await createTable()
|
|
39
|
-
if(isCreated) console.log('Created embeddings table.')
|
|
40
|
-
}catch(err) {}
|
|
41
|
+
try{
|
|
42
|
+
if(!initDone) {
|
|
43
|
+
initDone = true
|
|
44
|
+
await createTable()
|
|
41
45
|
}
|
|
46
|
+
}catch(err) {
|
|
47
|
+
//console.log('CREATE TABLE ERROR', err)
|
|
48
|
+
}
|
|
49
|
+
try {
|
|
42
50
|
let data = getDb().run(query, params)
|
|
43
51
|
return data
|
|
44
52
|
}catch(err){
|
|
@@ -47,17 +55,27 @@ async function printQuery(query, params = {}) {
|
|
|
47
55
|
}
|
|
48
56
|
|
|
49
57
|
export const getEmbeddings = async (text) => {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
58
|
+
let pipe = config._pipe
|
|
59
|
+
if(!pipe) {
|
|
60
|
+
config._pipe = await pipeline("feature-extraction", config.MODEL_NAME, {
|
|
61
|
+
progress_callback:(progress) => {
|
|
62
|
+
if(config.PROGRESS_CALLBACK) return config.PROGRESS_CALLBACK();
|
|
63
|
+
if(progress.status === "progress_total"){
|
|
64
|
+
process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
device: config.DEVICE,
|
|
68
|
+
dtype: config.DTYPE
|
|
69
|
+
});
|
|
70
|
+
pipe = config._pipe
|
|
71
|
+
}
|
|
72
|
+
const embedding = await pipe(text, { pooling: "mean", normalize: true });
|
|
73
|
+
return Array.from(embedding.data)
|
|
56
74
|
}
|
|
57
75
|
|
|
58
76
|
export const createTable = async () => {
|
|
59
77
|
// create table (id, v, input, result, data)
|
|
60
|
-
let tableCreated = await printQuery(`:create embeddings {id: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
|
|
78
|
+
let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
|
|
61
79
|
if(tableCreated){
|
|
62
80
|
// create index
|
|
63
81
|
let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
|
|
@@ -67,7 +85,6 @@ export const createTable = async () => {
|
|
|
67
85
|
fields: [v],
|
|
68
86
|
distance: L2, # Cosine, IP
|
|
69
87
|
ef_construction:50, # number of nearest neighbors
|
|
70
|
-
#filter: k != 'foo', # only those rows for which the expression evaluates to true are indexed
|
|
71
88
|
extend_candidates: false, # include nearest neighbors of the nearest neighbors
|
|
72
89
|
keep_pruned_connections: false,
|
|
73
90
|
}`)
|
|
@@ -76,21 +93,14 @@ export const createTable = async () => {
|
|
|
76
93
|
return false
|
|
77
94
|
}
|
|
78
95
|
|
|
79
|
-
export const add = async (input, result, data={}) => {
|
|
96
|
+
export const add = async (input, result, data={}, category="") => {
|
|
80
97
|
if(!input || !result) return
|
|
81
|
-
|
|
82
98
|
input = sanitizeString(input)
|
|
83
99
|
result = sanitizeString(result)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
100
|
const embedding = await getEmbeddings(input)
|
|
88
|
-
|
|
89
|
-
console.log('Adding', input, '->', result)
|
|
90
|
-
|
|
91
101
|
let id = data.id || Math.random().toString().substring(2)
|
|
92
|
-
return await printQuery(`?[id, v, input, result, data] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)} ]]
|
|
93
|
-
:put embeddings {id => v, input, result, data}
|
|
102
|
+
return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
|
|
103
|
+
:put embeddings {id, category => v, input, result, data}
|
|
94
104
|
`)
|
|
95
105
|
}
|
|
96
106
|
|
|
@@ -102,64 +112,91 @@ export const add = async (input, result, data={}) => {
|
|
|
102
112
|
* @param {Array} batch
|
|
103
113
|
* @returns
|
|
104
114
|
*/
|
|
105
|
-
export const addBatch = async (batch) => {
|
|
115
|
+
export const addBatch = async (batch, opts={onProgress:null}) => {
|
|
106
116
|
if(!batch || !Array.isArray(batch)) return
|
|
107
117
|
let vectorBatch = []
|
|
108
118
|
for(let i=0;i<batch.length; i++){
|
|
109
|
-
let {input, result, data} = batch[i]
|
|
119
|
+
let {input, result, data, category} = batch[i]
|
|
110
120
|
|
|
111
121
|
if(!input || !result) continue
|
|
112
122
|
if(!data) data = {}
|
|
123
|
+
if(!category) category = ''
|
|
113
124
|
const embedding = await getEmbeddings(input)
|
|
114
125
|
batch[i].embedding = embedding
|
|
115
126
|
let item = ''
|
|
116
127
|
if(i == 0) {
|
|
117
|
-
item += `?[id, v, input, result, data] <- [`
|
|
128
|
+
item += `?[id, v, input, result, data, category] <- [`
|
|
118
129
|
}
|
|
119
130
|
|
|
120
131
|
input = sanitizeString(input)
|
|
121
132
|
result = sanitizeString(result)
|
|
122
133
|
|
|
123
134
|
let id = data?.id ? data.id : Math.random().toString().substring(2)
|
|
124
|
-
item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)} ],`
|
|
135
|
+
item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
|
|
125
136
|
if(i == batch.length-1) {
|
|
126
137
|
item += `]
|
|
127
|
-
:put embeddings {id => v, input, result, data}`
|
|
138
|
+
:put embeddings {id, category => v, input, result, data}`
|
|
128
139
|
}
|
|
129
140
|
vectorBatch.push(item)
|
|
141
|
+
|
|
142
|
+
if(opts.onProgress && typeof opts.onProgress == 'function') {
|
|
143
|
+
await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
|
|
144
|
+
}
|
|
130
145
|
}
|
|
131
146
|
return await printQuery(vectorBatch.join("\n"))
|
|
132
147
|
}
|
|
133
148
|
|
|
134
149
|
const sanitizeString = (str)=>{
|
|
135
|
-
return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ")
|
|
150
|
+
return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
|
|
136
151
|
}
|
|
137
152
|
|
|
138
|
-
export const remove = async (id) => {
|
|
153
|
+
export const remove = async (id, category="") => {
|
|
139
154
|
if(!id || typeof id != 'string') return
|
|
140
|
-
id.replace(/[^a-zA-Z0-9]/g, '')
|
|
141
|
-
|
|
155
|
+
id = id.replace(/[^a-zA-Z0-9]/g, '')
|
|
156
|
+
category = sanitizeString(category)
|
|
157
|
+
if(!id || !category) return
|
|
142
158
|
let results = await printQuery(
|
|
143
|
-
`?[id] <- [['${id}']]
|
|
144
|
-
::
|
|
159
|
+
`?[id, category] <- [['${id}', '${category}']]
|
|
160
|
+
::rm embeddings {id, category}`)
|
|
161
|
+
return results
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
export const removeAllByCategory = async (category="") => {
|
|
165
|
+
category = sanitizeString(category)
|
|
166
|
+
if(!category) return
|
|
167
|
+
let results
|
|
168
|
+
try {
|
|
169
|
+
results = await printQuery(
|
|
170
|
+
`?[id, category] := *embeddings{id, category}, category = "${category}"
|
|
171
|
+
:rm embeddings {id, category}`)
|
|
172
|
+
}catch(err){
|
|
173
|
+
console.error(err)
|
|
174
|
+
}
|
|
145
175
|
return results
|
|
146
176
|
}
|
|
147
177
|
|
|
148
|
-
export const searchText = async (text, numResults = 5) => {
|
|
178
|
+
export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
|
|
149
179
|
const embedding = await getEmbeddings(text)
|
|
150
|
-
let results = await printQuery(`?[dist, result, id, data] := ~embeddings:index_name{ id, v, input, result, data |
|
|
180
|
+
let results = await printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
|
|
151
181
|
query: q,
|
|
152
182
|
k: ${numResults}, # number of results
|
|
153
|
-
ef:
|
|
183
|
+
ef: 50, # number of neighbours to consider
|
|
154
184
|
bind_distance: dist,
|
|
185
|
+
filter: category==${JSON.stringify(category)},
|
|
155
186
|
radius: 10.0
|
|
156
187
|
}, q = vec(${JSON.stringify(embedding)})
|
|
157
|
-
:sort dist`)
|
|
188
|
+
:sort -dist`)
|
|
158
189
|
return results
|
|
159
190
|
}
|
|
160
191
|
|
|
161
|
-
export const vectorSearch = async (query, numResults=5) => {
|
|
162
|
-
|
|
192
|
+
export const vectorSearch = async (query, category='', numResults=5) => {
|
|
193
|
+
let result = undefined
|
|
194
|
+
try{
|
|
195
|
+
result = await searchText(query, category, numResults)
|
|
196
|
+
}catch(err){
|
|
197
|
+
if(config.SHOW_ERRORS) console.error(err.display || err.message)
|
|
198
|
+
}
|
|
199
|
+
return result
|
|
163
200
|
}
|
|
164
201
|
|
|
165
202
|
const cmdArgs = (list = []) => {
|
|
@@ -274,19 +311,6 @@ export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) =>
|
|
|
274
311
|
|
|
275
312
|
let results = await fetchFromFile(fileName)
|
|
276
313
|
|
|
277
|
-
// // split results to sentences
|
|
278
|
-
// let results_raw = await fetchFromFile(fileName)
|
|
279
|
-
// let results = []
|
|
280
|
-
// for(let i=0;i<results_raw.length; i++){
|
|
281
|
-
// let sentences = splitSentences(results_raw[i].input)
|
|
282
|
-
// for(let j=0; j<sentences.length; j++){
|
|
283
|
-
// results.push({
|
|
284
|
-
// ...results_raw[i],
|
|
285
|
-
// ...{ input: sentences[j] }
|
|
286
|
-
// })
|
|
287
|
-
// }
|
|
288
|
-
// }
|
|
289
|
-
|
|
290
314
|
let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
|
|
291
315
|
for(let i=0; i<results.length; i++){
|
|
292
316
|
if(i % batchSize === 0){
|
|
@@ -431,18 +455,22 @@ const splitSentences = (text) => {
|
|
|
431
455
|
}
|
|
432
456
|
|
|
433
457
|
const runCLI = async () => {
|
|
434
|
-
let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit'])
|
|
458
|
+
let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
|
|
435
459
|
let query = args['--query'] || args['-q']
|
|
436
460
|
if(args['--db']){
|
|
437
461
|
config.DB_FILE = args['--db']
|
|
438
462
|
}
|
|
463
|
+
let category = ''
|
|
464
|
+
if(args['--category']) {
|
|
465
|
+
category = args['--category']
|
|
466
|
+
}
|
|
439
467
|
if(query){
|
|
440
468
|
let numResults = 5
|
|
441
469
|
if(args['--limit'] && parseInt(args['--limit'])) {
|
|
442
470
|
numResults = parseInt(args['--limit'])
|
|
443
471
|
}
|
|
444
472
|
console.time('Search time')
|
|
445
|
-
let result = await vectorSearch(query, numResults)
|
|
473
|
+
let result = await vectorSearch(query, category, numResults)
|
|
446
474
|
console.timeEnd('Search time')
|
|
447
475
|
console.log('Results:')
|
|
448
476
|
console.log(JSON.stringify(result, null, 2))
|
|
@@ -451,17 +479,17 @@ const runCLI = async () => {
|
|
|
451
479
|
if(!input || !result) {
|
|
452
480
|
console.log('Usage:')
|
|
453
481
|
return console.log(args._cmd + `--add 'input|result|{"foo":"bar"}'`)
|
|
454
|
-
}
|
|
482
|
+
}
|
|
455
483
|
let data = {}
|
|
456
484
|
if(dataString) {
|
|
457
485
|
try {data = JSON.parse(dataString)}catch(err) {}
|
|
458
486
|
}
|
|
459
|
-
let resp = await add(input, result, data)
|
|
487
|
+
let resp = await add(input, result, data, category)
|
|
460
488
|
console.log(JSON.stringify(resp, null, 2))
|
|
461
489
|
}else if(args['--remove']){
|
|
462
490
|
let id = args['--remove']
|
|
463
491
|
if(!id) return console.log('Please specify ID to remove')
|
|
464
|
-
let resp = await remove(id)
|
|
492
|
+
let resp = await remove(id, category)
|
|
465
493
|
console.log(JSON.stringify(resp, null, 2))
|
|
466
494
|
}else if(args['--nuke'] != undefined){
|
|
467
495
|
nuke()
|
|
@@ -481,17 +509,18 @@ const runCLI = async () => {
|
|
|
481
509
|
console.log('Usage:')
|
|
482
510
|
console.log(args._cmd + ' --query "Foo Bar"')
|
|
483
511
|
console.log("\n" + 'Options:')
|
|
484
|
-
console.log('--query "SEARCH_STRING"
|
|
485
|
-
console.log('--limit 2
|
|
486
|
-
console.log(`--add 'input|result|{"foo":"bar"}'
|
|
487
|
-
console.log(`--remove 'id'
|
|
488
|
-
console.log(`--nuke
|
|
489
|
-
console.log(`--mcp
|
|
490
|
-
console.log(`--db "FILE_NAME"
|
|
491
|
-
console.log(`--import "file.csv | file.tsv"
|
|
492
|
-
console.log('--input-header "foo"
|
|
493
|
-
console.log('--result-header "bar"
|
|
494
|
-
console.log(`--json "FILE_NAME"
|
|
512
|
+
console.log('--query "SEARCH_STRING" - search')
|
|
513
|
+
console.log('--limit 2 - limit number of results (used with --query)')
|
|
514
|
+
console.log(`--add 'input|result|{"foo":"bar"}|categ' - add data`)
|
|
515
|
+
console.log(`--remove 'id' - remove data`)
|
|
516
|
+
console.log(`--nuke - destroy database`)
|
|
517
|
+
console.log(`--mcp - run as MCP server (experimental)`)
|
|
518
|
+
console.log(`--db "FILE_NAME" - database file (SQLite)`)
|
|
519
|
+
console.log(`--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
|
|
520
|
+
console.log('--input-header "foo" - when used with --import designates specific header column as input')
|
|
521
|
+
console.log('--result-header "bar" - when used with --import designates specific header column as result')
|
|
522
|
+
console.log(`--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`)
|
|
523
|
+
console.log(`--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)`)
|
|
495
524
|
}
|
|
496
525
|
}
|
|
497
526
|
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
export function sanitizeValue(stringValue, maxChars=1000) {
|
|
2
|
+
if (typeof stringValue !== 'string') {
|
|
3
|
+
throw new Error('stringValue must be a string');
|
|
4
|
+
}
|
|
5
|
+
|
|
6
|
+
let sanitized = stringValue.normalize('NFC').trim();
|
|
7
|
+
|
|
8
|
+
// Basic validation
|
|
9
|
+
if (sanitized.length === 0) {
|
|
10
|
+
throw new Error('stringValue name cannot be empty');
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
if (sanitized.length > maxChars) {
|
|
14
|
+
throw new Error(`stringValue name too long (max ${maxChars} characters)`);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Block control characters (primary security concern)
|
|
18
|
+
// This allows all other Unicode characters including emojis, Chinese, Arabic, etc.
|
|
19
|
+
if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
|
|
20
|
+
throw new Error('stringValue contains disallowed control characters');
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
// Block private use areas
|
|
24
|
+
if (/[\uE000-\uF8FF\uFFF0-\uFFFF]/.test(sanitized)) {
|
|
25
|
+
throw new Error('stringValue contains disallowed Unicode characters');
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Block surrogate pairs (invalid alone)
|
|
29
|
+
if (/[\uD800-\uDFFF]/.test(sanitized)) {
|
|
30
|
+
throw new Error('stringValue contains invalid Unicode characters');
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
return sanitized;
|
|
34
|
+
}
|