@sjovanovic/recall.js 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +171 -0
- package/cache/README.md +1 -0
- package/logo.svg +14 -0
- package/package.json +24 -0
- package/recall.js +505 -0
package/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# recall.js - Embedded RAG system
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img alt="Recall.js is long term memory for AI apps!" src="logo.svg" />
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
Recall.js is long term memory for AI apps!
|
|
8
|
+
|
|
9
|
+
It is a generic RAG (Retrieval-augmented generation) JavaScript library and command line interface focused on speed, ease of use and embeddability.
|
|
10
|
+
|
|
11
|
+
It is versatile: use it for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are so many possibilities.
|
|
12
|
+
|
|
13
|
+
Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
|
|
14
|
+
|
|
15
|
+
Under the hood, recall.js uses sentence vector embeddings and a vector database to index and query your data. It is a light wrapper around local language models such as [MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) (optionally LLMs can be used) and [CozoDB](https://www.cozodb.org/) vector database.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
`npm install @sjovanovic/recall.js`
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Warning: the first time this library is used, it downloads a local language model (MiniLM-L12-v2), which may take a long time depending on your Internet connection. Please be patient.
|
|
24
|
+
|
|
25
|
+
```javascript
|
|
26
|
+
|
|
27
|
+
import * as RECALL from '@sjovanovic/recall.js'
|
|
28
|
+
|
|
29
|
+
const testRecall = async () => {
|
|
30
|
+
await RECALL.addBatch([
|
|
31
|
+
{
|
|
32
|
+
input: "The quick brown fox jumps over the lazy dog",
|
|
33
|
+
result: "Fox",
|
|
34
|
+
data: { foo: "bar" }
|
|
35
|
+
}
|
|
36
|
+
])
|
|
37
|
+
|
|
38
|
+
// Semantic search query in different language (French) "Animal jumps over another animal"
|
|
39
|
+
let response = await RECALL.searchText("Un animal saute par-dessus un autre animal", 1)
|
|
40
|
+
}
|
|
41
|
+
testRecall()
|
|
42
|
+
|
|
43
|
+
/*
|
|
44
|
+
|
|
45
|
+
response:
|
|
46
|
+
|
|
47
|
+
{
|
|
48
|
+
"headers": [
|
|
49
|
+
"dist",
|
|
50
|
+
"result",
|
|
51
|
+
"id",
|
|
52
|
+
"data"
|
|
53
|
+
],
|
|
54
|
+
"rows": [
|
|
55
|
+
[
|
|
56
|
+
0.5840495824813843, // vector distance (lower means a closer match)
|
|
57
|
+
"Fox",
|
|
58
|
+
"08840189191373282",
|
|
59
|
+
{
|
|
60
|
+
"foo": "bar"
|
|
61
|
+
}
|
|
62
|
+
]
|
|
63
|
+
]
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
*/
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Here's what the above example looks like in the CLI:
|
|
71
|
+
|
|
72
|
+
```log
|
|
73
|
+
recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
|
|
74
|
+
recall --query "Un animal saute par-dessus un autre animal" --limit 1
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Options
|
|
78
|
+
|
|
79
|
+
Easiest way to get all the options is via command line:
|
|
80
|
+
|
|
81
|
+
```log
|
|
82
|
+
recall --help
|
|
83
|
+
|
|
84
|
+
Usage:
|
|
85
|
+
recall --query "Foo Bar"
|
|
86
|
+
|
|
87
|
+
Options:
|
|
88
|
+
--query "SEARCH_STRING" - search
|
|
89
|
+
--limit 2 - limit number of results (used with --query)
|
|
90
|
+
--add 'input|result|{"foo":"bar"}' - add data
|
|
91
|
+
--remove 'id' - remove data
|
|
92
|
+
--nuke - destroy database
|
|
93
|
+
--mcp - run as MCP server
|
|
94
|
+
--db "FILE_NAME" - database file (SQLite)
|
|
95
|
+
--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
|
|
96
|
+
--input-header "foo" - when used with --import designates specific header column as input
|
|
97
|
+
--result-header "bar" - when used with --import designates specific header column as result
|
|
98
|
+
--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Note: when adding data, recall generates a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
## JavaScript API Reference
|
|
105
|
+
|
|
106
|
+
**RECALL.config**
|
|
107
|
+
|
|
108
|
+
Configuration object.
|
|
109
|
+
|
|
110
|
+
```javascript
|
|
111
|
+
export const config = {
|
|
112
|
+
VECTOR_SIZE: 384, // number of dimensions
|
|
113
|
+
MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
|
|
114
|
+
SHOW_ERRORS: true, // Show errors
|
|
115
|
+
DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
|
|
116
|
+
PATH: PATH // directory of recall.js
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**RECALL.getDb()**
|
|
121
|
+
|
|
122
|
+
Returns reference to the CozoDB instance.
|
|
123
|
+
|
|
124
|
+
**RECALL.getEmbeddings(text) -> Promise(Array)**
|
|
125
|
+
|
|
126
|
+
Given text calculates the embeddings vector
|
|
127
|
+
|
|
128
|
+
**RECALL.add(input, result, data={}) -> Promise(Object)**
|
|
129
|
+
|
|
130
|
+
Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
|
|
131
|
+
|
|
132
|
+
**RECALL.addBatch(batch) -> Promise(Object)**
|
|
133
|
+
|
|
134
|
+
Add data in batches (faster than calling `add` repeatedly).
|
|
135
|
+
`batch` is an Array that looks like this:
|
|
136
|
+
```
|
|
137
|
+
let batch = [{input:"", result:"", data:{}}]
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
**RECALL.remove(id) -> Promise(Object)**
|
|
141
|
+
|
|
142
|
+
Remove data by id. id is a string.
|
|
143
|
+
|
|
144
|
+
**RECALL.searchText(text, numResults = 5) -> Promise(Object)**
|
|
145
|
+
|
|
146
|
+
Query the vector database. Accepts query text and number of results to return.
|
|
147
|
+
|
|
148
|
+
**RECALL.nuke()**
|
|
149
|
+
|
|
150
|
+
Deletes the database.
|
|
151
|
+
|
|
152
|
+
**RECALL.importFromJSONStream(fileName) -> Promise(object)**
|
|
153
|
+
|
|
154
|
+
Imports from readable stream or file which consists of JSON objects, one per line. e.g.
|
|
155
|
+
```
|
|
156
|
+
{"input":"one", "result":"one result", "data":{"id":"123"}}
|
|
157
|
+
{"input":"", "result":"", "data":{}}
|
|
158
|
+
...
|
|
159
|
+
```
|
|
160
|
+
This is the most efficient way to import data.
|
|
161
|
+
|
|
162
|
+
**RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()**
|
|
163
|
+
|
|
164
|
+
Imports from a CSV or TSV file. By default the first column is used as input, the second as result, and the remaining columns are put in the data object.
|
|
165
|
+
If `inputHeader` is specified, function will try to find the column by that name and use it as input.
|
|
166
|
+
If `resultHeader` is specified, function will try to find the column by that name and use it as result.
|
|
167
|
+
|
|
168
|
+
**RECALL.mcp() -> Promise()**
|
|
169
|
+
|
|
170
|
+
(Experimental)
|
|
171
|
+
Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
|
package/cache/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
## Cache folder for the embeddings
|
package/logo.svg
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
<svg width="80" height="60" viewBox="0 0 80 60" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<g clip-path="url(#clip0_2514_579)">
|
|
3
|
+
<path d="M30 60C46.5685 60 60 46.5685 60 30C60 13.4315 46.5685 0 30 0C13.4315 0 0 13.4315 0 30C0 46.5685 13.4315 60 30 60Z" fill="#8E44AD" fill-opacity="0.93"/>
|
|
4
|
+
<path d="M50 60C66.5685 60 80 46.5685 80 30C80 13.4315 66.5685 0 50 0C33.4315 0 20 13.4315 20 30C20 46.5685 33.4315 60 50 60Z" fill="#9B59B6" fill-opacity="0.68"/>
|
|
5
|
+
<path fill-rule="evenodd" clip-rule="evenodd" d="M37.3295 17.0388L36.4584 15.75L36.1245 15.0913C37.2328 15.031 38.3591 15 39.5 15C56.2987 15 69.9167 21.7157 69.9167 30C69.9167 38.2843 56.2987 45 39.5 45C22.7014 45 9.08337 38.2843 9.08337 30C9.08337 26.139 12.0414 22.6188 16.9008 19.9603C19.1405 19.6658 20.2861 19.8819 20.8539 20.2991C20.1651 20.5807 19.5054 20.8772 18.8768 21.1871C13.5746 23.8019 11.0834 27.0381 11.0834 30C11.0834 32.962 13.5746 36.1981 18.8768 38.8129C24.0513 41.3647 31.3401 43 39.5 43C47.66 43 54.9488 41.3647 60.1233 38.8129C65.4254 36.1981 67.9167 32.962 67.9167 30C67.9167 27.0381 65.4254 23.8019 60.1233 21.1871C54.9488 18.6353 47.66 17 39.5 17C38.7693 17 38.0455 17.0131 37.3295 17.0388Z" fill="white"/>
|
|
6
|
+
<path d="M32 16.5L44 12.6029V20.3971L32 16.5Z" fill="white"/>
|
|
7
|
+
<path d="M46 30C46 33.3137 43.3137 36 40 36C36.6863 36 34 33.3137 34 30C34 26.6863 36.6863 24 40 24C43.3137 24 46 26.6863 46 30Z" fill="white"/>
|
|
8
|
+
</g>
|
|
9
|
+
<defs>
|
|
10
|
+
<clipPath id="clip0_2514_579">
|
|
11
|
+
<rect width="80" height="60" fill="white"/>
|
|
12
|
+
</clipPath>
|
|
13
|
+
</defs>
|
|
14
|
+
</svg>
|
package/package.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@sjovanovic/recall.js",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Semantic search as long term memory for LLMs",
|
|
5
|
+
"main": "recall.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"recall": "recall.js"
|
|
8
|
+
},
|
|
9
|
+
"type": "module",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"test": "echo \"Error: no test specified\" && exit 1",
|
|
12
|
+
"query": "node recall.js --query "
|
|
13
|
+
},
|
|
14
|
+
"author": "Slobodan Jovanovic",
|
|
15
|
+
"license": "ISC",
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"@modelcontextprotocol/sdk": "^1.8.0",
|
|
18
|
+
"@themaximalist/embeddings.js": "^0.1.3",
|
|
19
|
+
"@xenova/transformers": "^2.17.2",
|
|
20
|
+
"cozo-node": "^0.7.6",
|
|
21
|
+
"csv-parser": "^3.2.0",
|
|
22
|
+
"zod": "^3.24.2"
|
|
23
|
+
}
|
|
24
|
+
}
|
package/recall.js
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {CozoDb} from 'cozo-node'
|
|
3
|
+
import embeddings from "@themaximalist/embeddings.js";
|
|
4
|
+
import csv from 'csv-parser'
|
|
5
|
+
import fs from 'fs'
|
|
6
|
+
import { resolve, join, dirname, sep } from 'path'
|
|
7
|
+
import { fileURLToPath } from 'url'
|
|
8
|
+
|
|
9
|
+
import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
10
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
11
|
+
import { z } from "zod";
|
|
12
|
+
|
|
13
|
+
// Absolute path of this module.
const pathToThisFile = resolve(fileURLToPath(import.meta.url))

// Absolute path of the script Node was asked to run. process.argv[1] is
// undefined in the REPL and under `node -e`, and resolve() throws on a
// non-string argument, so fall back to an empty string in that case.
const pathPassedToNode = process.argv[1] ? resolve(process.argv[1]) : ''

/**
 * Tells whether `candidate` points at this very file once symlinks are followed.
 * This covers launches through a symlinked bin entry such as node_modules/.bin/recall.
 *
 * @param {string} candidate - path to compare against this module
 * @returns {boolean} true when both paths resolve to the same real file
 */
const resolvesToThisFile = (candidate) => {
    try {
        return fs.realpathSync(candidate) === fs.realpathSync(pathToThisFile)
    } catch (err) {
        // The candidate does not exist (or is unreadable): it cannot be this file.
        return false
    }
}

// True when this module is the program entry point rather than an imported library.
// The empty-string guard matters because every string "includes" the empty string.
const isThisFileBeingRunViaCLI = pathPassedToNode !== '' && (
    pathToThisFile.includes(pathPassedToNode) ||
    pathPassedToNode.includes('.npm-global') ||
    resolvesToThisFile(pathPassedToNode)
)

// Directory that contains recall.js.
const PATH = dirname(pathToThisFile)

/**
 * Module configuration. Mutable: the CLI overrides DB_FILE for `--db`.
 */
export const config = {
    VECTOR_SIZE: 384, // number of dimensions
    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
    SHOW_ERRORS: true, // Show errors
    DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
    PATH: PATH // directory of recall.js
}
|
|
25
|
+
|
|
26
|
+
// Lazily created CozoDB handle shared by the whole module.
var db = null

/**
 * Returns the shared CozoDB instance, opening the SQLite-backed database at
 * config.DB_FILE on the first call.
 *
 * @returns {CozoDb} the module-wide database handle
 */
export function getDb() {
    if (db) {
        return db
    }
    db = new CozoDb('sqlite', config.DB_FILE)
    return db
}
|
|
32
|
+
|
|
33
|
+
/**
 * Makes sure the `embeddings` relation exists before the first real query.
 * Existing relations are listed first so that an already initialised database
 * does not go through a failing `:create` on every start.
 */
async function ensureEmbeddingsTable() {
    let exists = false
    try {
        const relations = await getDb().run('::relations')
        // Locate the name column by header; fall back to the first column.
        const nameColumn = Math.max((relations.headers || []).indexOf('name'), 0)
        exists = relations.rows.some((row) => row[nameColumn] === 'embeddings')
    } catch (err) {
        // Listing failed: fall through and attempt creation anyway.
        exists = false
    }
    if (exists) return
    try {
        const isCreated = await createTable()
        if (isCreated) console.log('Created embeddings table.')
    } catch (err) {
        // Best effort: the query that follows reports any real database problem.
    }
}

/**
 * Runs a CozoScript query against the shared database.
 * On the first call the database is opened and the embeddings table is created if missing.
 *
 * @param {string} query - CozoScript source
 * @param {Object} [params={}] - named parameters referenced as `$name` in the script
 * @returns {Promise<Object|undefined>} the query result, or undefined when the
 *          query failed (the error is printed when config.SHOW_ERRORS is set)
 */
async function printQuery(query, params = {}) {
    try {
        if (!db) {
            getDb()
            await ensureEmbeddingsTable()
        }
        // `await` is required here: returning the bare promise would let a
        // rejection escape this try/catch and skip the error reporting below.
        return await getDb().run(query, params)
    } catch (err) {
        if (config.SHOW_ERRORS) console.error(err.display || err.message)
    }
}
|
|
48
|
+
|
|
49
|
+
/**
 * Computes the embedding vector for a piece of text with the configured
 * transformers model. The cache file passed to the embeddings library lives
 * in the `cache` folder next to recall.js.
 *
 * @param {string} text - text to embed
 * @returns {Promise<Array>} whatever the embeddings library resolves with
 */
export const getEmbeddings = async (text) => {
    const cacheFile = join(config.PATH, "cache", ".embeddings.cache.json")
    const options = {
        service: 'transformers',
        model: config.MODEL_NAME,
        cache_file: cacheFile
    }
    return await embeddings(text, options)
}
|
|
57
|
+
|
|
58
|
+
/**
 * Creates the `embeddings` stored relation (id, v, input, result, data) and
 * its HNSW vector index.
 *
 * @returns {Promise<Object|false|undefined>} the result of the index creation
 *          query when the relation was created, otherwise false
 */
export const createTable = async () => {
    const dimensions = config.VECTOR_SIZE

    const relationResult = await printQuery(`:create embeddings {id: String => v: <F32; ${dimensions}>, input: String, result: String, data: Json}`)
    if (!relationResult) {
        // The relation was not created, so there is nothing to index.
        return false
    }

    const indexResult = await printQuery(`::hnsw create embeddings:index_name {
dim: ${dimensions},
m: 50,
dtype: F32,
fields: [v],
distance: L2, # Cosine, IP
ef_construction:50, # number of nearest neighbors
#filter: k != 'foo', # only those rows for which the expression evaluates to true are indexed
extend_candidates: false, # include nearest neighbors of the nearest neighbors
keep_pruned_connections: false,
}`)
    return indexResult
}
|
|
78
|
+
|
|
79
|
+
/**
 * Adds one record to the vector database.
 *
 * @param {string} input - sentence the embedding is computed from
 * @param {string} result - string shown in search results
 * @param {Object} [data={}] - arbitrary related data; a truthy `data.id` is
 *        used as the record id, otherwise a random numeric string is generated
 * @returns {Promise<Object|undefined>} the query result, or undefined when
 *          input or result is missing
 */
export const add = async (input, result, data={}) => {
    if(!input || !result) return

    // The default parameter only covers `undefined`; tolerate an explicit null too.
    const payload = data || {}
    const cleanInput = sanitizeString(input)
    const cleanResult = sanitizeString(result)

    const embedding = await getEmbeddings(cleanInput)

    console.log('Adding', cleanInput, '->', cleanResult)

    const id = payload.id || Math.random().toString().substring(2)
    // Every value is serialised with JSON.stringify so quotes or backslashes in
    // a custom id cannot break out of the string literal inside the query.
    const row = [
        JSON.stringify(String(id)),
        JSON.stringify(embedding),
        JSON.stringify(cleanInput),
        JSON.stringify(cleanResult),
        JSON.stringify(payload)
    ].join(', ')

    return await printQuery(`?[id, v, input, result, data] <- [[${row}]]
:put embeddings {id => v, input, result, data}
`)
}
|
|
96
|
+
|
|
97
|
+
/**
 * Adds several records with a single query (faster than calling add repeatedly).
 *
 * Batch array:
 * [{input:"", result:"", data:{}}]
 *
 * Items without input or result are skipped. Each processed item receives an
 * `embedding` property holding its vector, as the earlier implementation did.
 *
 * @param {Array} batch - records to add
 * @returns {Promise<Object|undefined>} the query result, or undefined when
 *          `batch` is not an array or contains no usable item
 */
export const addBatch = async (batch) => {
    if(!batch || !Array.isArray(batch)) return

    const rows = []
    for (const item of batch) {
        if (!item) continue
        const { input, result } = item
        if (!input || !result) continue
        const data = item.data || {}

        const embedding = await getEmbeddings(input)
        item.embedding = embedding

        const id = data.id ? data.id : Math.random().toString().substring(2)
        // JSON.stringify every value so a custom id cannot break the query text.
        rows.push(`[${JSON.stringify(String(id))}, ${JSON.stringify(embedding)}, ${JSON.stringify(sanitizeString(input))}, ${JSON.stringify(sanitizeString(result))}, ${JSON.stringify(data)}]`)
    }

    // Nothing usable: do not send an empty query.
    if (rows.length === 0) return

    // The query is assembled once all rows are known, so skipped items at the
    // start or end of the batch can no longer drop the header or the :put clause.
    const query = [
        '?[id, v, input, result, data] <- [',
        rows.join(',\n'),
        ']',
        ':put embeddings {id => v, input, result, data}'
    ].join('\n')
    return await printQuery(query)
}
|
|
133
|
+
|
|
134
|
+
/**
 * Replaces a fixed set of special characters with spaces, then collapses
 * every run of two or more whitespace characters into a single space.
 *
 * @param {string} str - text to clean
 * @returns {string} the cleaned text
 */
const sanitizeString = (str) => {
    const withoutSpecials = str.replace(/[\/#$%\^&\*{}=_`~()\"]/g, " ")
    const collapsed = withoutSpecials.replace(/\s{2,}/g, " ")
    return collapsed
}
|
|
137
|
+
|
|
138
|
+
/**
 * Removes the record with the given id.
 *
 * @param {string} id - id of the record to delete
 * @returns {Promise<Object|undefined>} the query result, or undefined when
 *          `id` is empty or not a string
 */
export const remove = async (id) => {
    if(!id || typeof id != 'string') return
    // The id is passed as a named parameter, so it needs no escaping and custom
    // ids containing punctuation still match the stored key exactly.
    // `:rm` deletes rows; `::remove` is the system op that drops whole relations.
    return await printQuery(
        `?[id] <- [[$id]]
:rm embeddings {id}`, { id })
}
|
|
147
|
+
|
|
148
|
+
/**
 * Queries the vector index for the records closest to `text`.
 *
 * @param {string} text - query text
 * @param {number} [numResults=5] - maximum number of rows to return; values
 *        that are not a positive number fall back to 5, fractions are floored
 * @returns {Promise<Object|undefined>} result with columns dist, result, id,
 *          data, sorted by ascending distance
 */
export const searchText = async (text, numResults = 5) => {
    // `k` is interpolated into the query text, so it must be a plain positive integer.
    const requested = Math.floor(Number(numResults))
    const k = Number.isFinite(requested) && requested > 0 ? requested : 5

    const embedding = await getEmbeddings(text)
    return await printQuery(`?[dist, result, id, data] := ~embeddings:index_name{ id, v, input, result, data |
query: q,
k: ${k}, # number of results
ef: 90, # number of neighbours to consider
bind_distance: dist,
radius: 10.0
}, q = vec(${JSON.stringify(embedding)})
:sort dist`)
}
|
|
160
|
+
|
|
161
|
+
/**
 * Alias of searchText, used by the command line interface.
 *
 * @param {string} query - query text
 * @param {number} [numResults=5] - number of results to return
 * @returns {Promise<Object>} whatever searchText resolves with
 */
export async function vectorSearch(query, numResults = 5) {
    const matches = await searchText(query, numResults)
    return matches
}
|
|
164
|
+
|
|
165
|
+
/**
 * Minimal command line parser over process.argv.
 * Each flag named in `list` becomes a key whose value is the next token that
 * is not itself a flag, or '' when no such token follows. `_cmd` holds the
 * last path segment of process.argv[1].
 *
 * @param {string[]} [list=[]] - recognised flags
 * @returns {Object} map of flag to value, plus `_cmd`
 */
const cmdArgs = (list = []) => {
    const parsed = {}
    let pendingFlag = null

    for (const token of process.argv) {
        if (list.includes(token)) {
            // A flag starts out empty and waits for its value.
            pendingFlag = token
            parsed[token] = ''
        } else if (pendingFlag) {
            parsed[pendingFlag] = token
            pendingFlag = null
        }
    }

    const scriptPathParts = process.argv[1].split(sep)
    parsed._cmd = scriptPathParts.pop()
    return parsed
}
|
|
181
|
+
|
|
182
|
+
/**
 * Deletes the database file at config.DB_FILE.
 * Does nothing when the file does not exist. The cached database handle is
 * dropped so the next query reopens (and re-initialises) a fresh database.
 *
 * @returns {undefined}
 */
export const nuke = () => {
    if (db) {
        try {
            // NOTE(review): assumes the cozo-node handle exposes close() - confirm.
            db.close?.()
        } catch (err) {
            // Best effort: the file is removed regardless of whether close worked.
        }
        db = null
    }
    // force: true makes a missing file a no-op instead of an ENOENT crash.
    return fs.rmSync(config.DB_FILE, { force: true })
}
|
|
185
|
+
|
|
186
|
+
/**
 * Imports records from a file or readable stream holding one JSON object per
 * line: {"input":"", "result":"", "data":{}}.
 * Lines that are blank, are not valid JSON, or lack input/result are skipped.
 * Records are written in batches of 40 through addBatch.
 *
 * @param {string|Readable} fileName - path of the file, or a readable stream
 * @returns {Promise<undefined>} resolves once every batch has been added
 */
export const importFromJSONStream = async (fileName) => {
    const batchSize = 40
    let batch = []
    let batchNumber = 0

    // Writes the pending batch, if any, and starts a new one.
    const flush = async () => {
        if (!batch.length) return
        batchNumber += 1
        console.log(`Adding batch ${batchNumber} (${batch.length} items)`)
        await addBatch(batch)
        batch = []
    }

    // Parses one line and queues it when it is a usable record.
    const handleLine = async (line) => {
        const trimmed = line.trim()
        if (!trimmed) return
        let json
        try {
            json = JSON.parse(trimmed)
        } catch (err) {
            if (config.SHOW_ERRORS) console.error(`Skipping invalid JSON line: ${err.message}`)
            return
        }
        if (!json || !json.input || !json.result) return
        if (!json.data) json.data = {}
        batch.push(json)
        if (batch.length >= batchSize) await flush()
    }

    const stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
    stream.setEncoding('utf8')

    // Chunks do not end on line boundaries: keep the unfinished tail of each
    // chunk and prepend it to the next one.
    let pending = ''
    for await (const chunk of stream) {
        const lines = (pending + chunk).split('\n')
        pending = lines.pop()
        for (const line of lines) {
            await handleLine(line)
        }
    }
    // The last line may not be newline-terminated.
    await handleLine(pending)
    await flush()
}
|
|
231
|
+
|
|
232
|
+
/**
 * Imports records from a CSV or TSV file (chosen by file extension).
 * By default the first column is the input, the second the result, and the
 * remaining columns go into the data object under normalised header names.
 *
 * @param {string} fileName - path ending in .csv or .tsv
 * @param {string} [inputHeader] - header of the column to use as input
 * @param {string} [resultHeader] - header of the column to use as result
 * @returns {Promise<undefined>} resolves once every batch has been added;
 *          rejects when the file cannot be read or parsed
 */
export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
    if (typeof fileName !== 'string' || !fileName.includes('.')) return
    const ext = fileName.split('.').pop().toLowerCase()
    if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')

    const parseOpts = {
        separator: ext == 'tsv' ? '\t' : ',',
        mapHeaders: ({ header, index }) => {
            // An explicit header name wins; otherwise fall back to the column position.
            if (inputHeader ? inputHeader == header : index === 0) return 'input'
            if (resultHeader ? resultHeader == header : index === 1) return 'result'
            return header.replaceAll(/\W/gi, '_').toLowerCase()
        }
    }

    // Reads and parses the whole file. Errors from the file stream and from the
    // parser both reject, so a missing or unreadable file can no longer crash
    // the process or leave the promise pending forever.
    const readRows = (file) => new Promise((fulfil, reject) => {
        const rows = []
        const source = fs.createReadStream(file)
        source.on('error', reject)
        source
            .pipe(csv(parseOpts))
            .on('data', (row) => {
                rows.push(row)
            })
            .on('end', () => {
                console.log(`${file} loaded.`);
                fulfil(rows)
            })
            .on('error', reject)
    })

    const results = await readRows(fileName)
    // Empty file: nothing to import.
    if (!results.length) return

    const batchSize = 40
    const totalBatches = Math.ceil(results.length / batchSize)
    // Extra columns are taken from the keys of the last parsed row.
    const dataHeaders = Object.keys(results[results.length - 1]).filter(k => k != 'input' && k != 'result')

    for (let start = 0; start < results.length; start += batchSize) {
        const batch = results.slice(start, start + batchSize).map((row) => {
            const data = {}
            for (const key of dataHeaders) {
                if (key && row[key]) data[key] = row[key]
            }
            return { input: row.input, result: row.result, data }
        })
        console.log(`Adding batch ${start / batchSize + 1} of ${totalBatches} (${batch.length} items)`)
        await addBatch(batch)
    }
}
|
|
313
|
+
|
|
314
|
+
/**
 * (Experimental) Runs an MCP server over STDIO that exposes one `search` tool
 * backed by searchText.
 *
 * Tool arguments:
 *   text            - query text
 *   numberOfResults - optional positive integer, capped at 50; when omitted
 *                     searchText's own default applies
 *
 * @returns {Promise<undefined>} resolves once the server is connected to the transport
 */
export const mcp = async () => {
    const MAX_RESULTS = 50

    const server = new McpServer({
        name: "Recall",
        description: "Recall provides semantic search on the local vector database.",
        version: "1.0.0"
    });

    server.tool(
        "search",
        "Recall search: semantic search on the local vector database.",
        {
            text: z.string(),
            // Declared in the schema so callers can actually supply it; the
            // handler already read this argument, but it could never be set.
            numberOfResults: z.number().int().positive().optional()
        },
        async ({ text, numberOfResults }) => {
            if(numberOfResults && numberOfResults > MAX_RESULTS) numberOfResults = MAX_RESULTS

            const startTime = performance.now()
            const results = await searchText(text, numberOfResults)
            const timeDiff = ((performance.now() - startTime) / 1000).toFixed(2)

            if (!results || !results.rows || !results.rows.length) {
                return {
                    content: [{
                        type: "text",
                        text: `Sorry. Recall search didn't find anything.`
                    }]
                }
            }

            // One heading entry followed by one text entry per row; row[1] is
            // the `result` column of the search output.
            const content = [{
                type: "text",
                text: `Recall search found the following results in ${timeDiff}s:`
            }]
            for (const row of results.rows) {
                content.push({
                    type: "text",
                    text: row[1]
                })
            }
            return { content }
        }
    );

    // Start receiving messages on stdin and sending messages on stdout
    const transport = new StdioServerTransport();
    await server.connect(transport);
}
|
|
428
|
+
|
|
429
|
+
/**
 * Splits text into sentences: a break is placed after each '.', '?' or '!'
 * that is followed (after optional whitespace) by an uppercase A-Z letter.
 * Any '|' already present in the text also acts as a separator.
 *
 * @param {string} text - text to split
 * @returns {string[]} the sentences
 */
const splitSentences = (text) => {
    const sentenceEnd = /([.?!])\s*(?=[A-Z])/g
    const marked = text.replace(sentenceEnd, "$1|")
    return marked.split("|")
}
|
|
432
|
+
|
|
433
|
+
/**
 * Command line entry point: parses process.argv and dispatches to the matching
 * operation, or prints the usage text when no known option is given.
 *
 * @returns {Promise<undefined>}
 */
const runCLI = async () => {
    // '--remove' must be listed here, otherwise cmdArgs never records it and
    // the remove branch below cannot be reached.
    const args = cmdArgs(['--query', '-q', '--add', '--remove', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--limit'])
    const query = args['--query'] || args['-q']

    if (args['--db']) {
        config.DB_FILE = args['--db']
    }

    if (query) {
        const limit = Number.parseInt(args['--limit'], 10)
        const numResults = limit ? limit : 5
        console.time('Search time')
        const result = await vectorSearch(query, numResults)
        console.timeEnd('Search time')
        console.log('Results:')
        console.log(JSON.stringify(result, null, 2))
    } else if (args['--add']) {
        const [input, result, dataString] = args['--add'].split('|')
        if (!input || !result) {
            console.log('Usage:')
            return console.log(args._cmd + ` --add 'input|result|{"foo":"bar"}'`)
        }
        let data = {}
        if (dataString) {
            try {
                data = JSON.parse(dataString)
            } catch (err) {
                // Malformed data is ignored; the record is added without extra data.
            }
        }
        const resp = await add(input, result, data)
        console.log(JSON.stringify(resp, null, 2))
    } else if (args['--remove']) {
        const resp = await remove(args['--remove'])
        console.log(JSON.stringify(resp, null, 2))
    } else if (args['--nuke'] != undefined) {
        nuke()
        console.log('Nuked.')
    } else if (args['--import']) {
        await importFromCSVorTSV(args['--import'], args['--input-header'], args['--result-header'])
        console.log('Imported.')
    } else if (args['--json']) {
        await importFromJSONStream(args['--json'])
        console.log('Imported.')
    } else if (args['--mcp'] != undefined) {
        await mcp()
        // stdout carries the MCP protocol messages, so status goes to stderr.
        console.error('MCP server running.')
    } else {
        const usage = [
            'Usage:',
            args._cmd + ' --query "Foo Bar"',
            "\n" + 'Options:',
            '--query "SEARCH_STRING" - search',
            '--limit 2 - limit number of results (used with --query)',
            `--add 'input|result|{"foo":"bar"}' - add data`,
            `--remove 'id' - remove data`,
            `--nuke - destroy database`,
            `--mcp - run as MCP server (experimental)`,
            `--db "FILE_NAME" - database file (SQLite)`,
            `--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`,
            '--input-header "foo" - when used with --import designates specific header column as input',
            '--result-header "bar" - when used with --import designates specific header column as result',
            `--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`
        ]
        for (const line of usage) {
            console.log(line)
        }
    }
}
|
|
497
|
+
|
|
498
|
+
// Run the command line interface only when this file is the program entry point.
if (isThisFileBeingRunViaCLI) {
    // Report failures instead of leaving the promise floating, and signal them
    // to the shell through a non-zero exit code.
    runCLI().catch((err) => {
        console.error(err?.display || err?.message || err)
        process.exitCode = 1
    })
}
|
|
501
|
+
|
|
502
|
+
// recall --nuke
|
|
503
|
+
// recall --import "test.tsv"
|
|
504
|
+
// recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
|
|
505
|
+
// recall --query "Un animal saute par-dessus un autre animal"
|