@sjovanovic/recall.js 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,15 +4,11 @@
4
4
  <img alt="Recall.js is long term memory for AI apps!" src="logo.svg" />
5
5
  </p>
6
6
 
7
- Recall.js is long term memory for AI apps!
7
+ Recall.js provides long-term memory for AI applications. It is a JavaScript library and command‑line tool for building Retrieval‑Augmented Generation (RAG) systems, with a focus on speed, ease of use, and embeddability.
8
8
 
9
- It is a tool for building RAG (Retrieval-augmented generation) in a form of JavaScript library and command line utility focused on speed, ease of use and embeddability.
9
+ Beyond RAG, recall.js can be used for generic semantic search, as expert memory for your AI app, or as a recommendation system. It supports multilingual embeddings out of the box, allowing you to add data in one language and query it in another.
10
10
 
11
- It is versatile and you don't have to use it exclusively for RAG, use it for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are so many possibilities...
12
-
13
- Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
14
-
15
- Under the hood, recall.js uses sentence vector embeddings and a vector database to index and query your data. It is a light wrapper around local language models such as [MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
11
+ Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) for feature extraction and a vector database (powered by [CozoDB](https://www.cozodb.org/)) for indexing and querying. It is a lightweight wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2).
16
12
 
17
13
  ## Install
18
14
 
@@ -20,60 +16,57 @@ Under the hood, recall.js uses sentence vector embeddings and a vector database
20
16
 
21
17
  ## Usage
22
18
 
23
- Console:
19
+ ### Command Line
24
20
 
25
21
  ```console
26
22
  recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
27
23
  recall --query "Un animal saute par-dessus un autre animal" --limit 1
28
24
  ```
29
- **Warning:** when this library is used for the first time, it will download a local language model MiniLM-L12-v2 which may take long time depending on your Internet connectivity. Please be patient.
25
+ > **Note:** When the library is used for the first time, it will download a local language model (Multilingual-MiniLM-L12-v2). This may take a while depending on your internet connection. Please be patient.
30
26
 
31
- Below is the same example in JavaScript:
27
+ ### JavaScript
32
28
 
33
29
  ```javascript
34
30
 
35
- import * as RECALL from '@sjovanovic/recall.js'
31
+ import Recall from '@sjovanovic/recall.js'
36
32
 
37
33
  const testRecall = async () => {
38
- await RECALL.addBatch([
39
- {
40
- input: "The quick brown fox jumps over the lazy dog",
41
- result: "Fox and dog",
42
- data: { foo: "bar" }
43
- }
44
- ])
45
-
46
- // Semantic search query in different language (French) "Animal jumps over another animal"
47
- let response = await RECALL.searchText("Un animal saute par-dessus un autre animal", 1)
48
- console.log(response)
49
- }
50
- testRecall()
51
34
 
52
- /*
35
+ let config = {
36
+ SHOW_PROGRESS: true
37
+ }
38
+
39
+ let recall = new Recall(config)
53
40
 
54
- response:
41
+ await recall.addBatch([
42
+ {
43
+ input: "The quick brown fox jumps over the lazy dog",
44
+ result: "Fox and dog",
45
+ data: { foo: "bar" }
46
+ }
47
+ ])
55
48
 
49
+ // Semantic search query in different language (French) "Animal jumps over another animal"
50
+ let response = await recall.searchText("Un animal saute par-dessus un autre animal", 1)
51
+ console.log(response)
52
+ }
53
+ testRecall()
54
+ ```
55
+
56
+ **Example response:**
57
+ ```json
56
58
  {
57
- "headers": [
58
- "dist",
59
- "result",
60
- "id",
61
- "data"
62
- ],
59
+ "headers": ["dist", "result", "id", "data", "category"],
63
60
  "rows": [
64
61
  [
65
- 0.5840495824813843, // vector similarity
62
+ 0.6840495824813843,
66
63
  "Fox and dog",
67
64
  "08840189191373282",
68
- {
69
- "foo": "bar"
70
- }
65
+ { "foo": "bar" },
66
+ ""
71
67
  ]
72
68
  ]
73
69
  }
74
-
75
- */
76
-
77
70
  ```
78
71
 
79
72
  ## Options
@@ -87,17 +80,17 @@ Usage:
87
80
  recall --query "Foo Bar"
88
81
 
89
82
  Options:
90
- --query "SEARCH_STRING" - search
91
- --limit 2 - limit number of results (used with --query)
92
- --add 'input|result|{"foo":"bar"}' - add data
93
- --remove 'id' - remove data
94
- --nuke - destroy database
95
- --mcp - run as MCP server
96
- --db "FILE_NAME" - database file (SQLite)
97
- --import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
98
- --input-header "foo" - when used with --import designates specific header column as input
99
- --result-header "bar" - when used with --import designates specific header column as result
100
- --json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
83
+ --query "SEARCH_STRING" - Search the database
84
+ --limit N - Limit number of results (used with --query).
85
+ --add 'input|result|{"foo":"bar"}|categ' - Add a data entry.
86
+ --remove 'id' - Remove data by ID.
87
+ --nuke - Destroy the database.
88
+ --db "FILE_NAME" - Specify database file (SQLite).
89
+ --import "file.csv | file.tsv" - Import from CSV or TSV with columns: input, result, additional data.
90
+ --input-header "foo" - When used with --import, designate a specific header column as input.
91
+ --result-header "bar" - When used with --import, designate a specific header column as result.
92
+ --json "FILE_NAME" - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.
93
+ --category "CATEGORY" - Specify category when adding data and filter by it when querying (defaults to empty string).
101
94
  ```
102
95
 
103
96
  **Note:** When adding data, recall generates a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
@@ -105,69 +98,79 @@ Options:
105
98
 
106
99
  ## JavaScript API Reference
107
100
 
108
- ### RECALL.config
101
+ ### Configuration
109
102
 
110
- Configuration object.
103
+ The default configuration object is exported as config:
111
104
 
112
105
  ```javascript
113
106
  export const config = {
114
- VECTOR_SIZE: 384, // number of dimensions
115
- MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
116
- SHOW_ERRORS: true, // Show errors
117
- DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
118
- PATH: PATH // directory of recall.js
107
+ VECTOR_SIZE: 384, // Number of dimensions (must match the model's output)
108
+ MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // Model name for Transformers.js
109
+ SHOW_ERRORS: true, // Show error messages
110
+ DB_FILE: join(PATH, 'vector.db'), // Path to the SQLite database file (used by CozoDB)
111
+ PATH: PATH, // Directory of recall.js
112
+ DEVICE: undefined, // Transformers.js device
113
+ DTYPE: undefined, // Transformers.js dtype
114
+ PROGRESS_CALLBACK: undefined // Transformers.js progress callback
119
115
  }
120
116
  ```
121
117
 
122
- ### RECALL.getDb()
118
+ ### Methods
119
+
120
+ **getDb()**
123
121
 
124
- Returns reference to the CozoDB instance.
122
+ Returns reference to the underlying CozoDB instance.
125
123
 
126
- ### RECALL.getEmbeddings(text) -> Promise(Array)
124
+ **getEmbeddings(text) -> Promise&lt;Array&gt;**
127
125
 
128
126
  Calculates the embeddings vector for the given text.
129
127
 
130
- ### RECALL.add(input, result, data={}) -> Promise(Object)
128
+ **add(input, result, data={}, category="") -> Promise&lt;Object&gt;**
131
129
 
132
- Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
130
+ Adds a data entry.
133
131
 
134
- ### RECALL.addBatch(batch) -> Promise(Object)
132
+ - input – The sentence to generate embeddings from.
133
+ - result – The string to display in search results.
134
+ - data – Arbitrary object for additional information and references. If it contains an id property, that value will be used as the record’s unique ID.
135
+ - category – Optional category string.
135
136
 
136
- Add data in batches (faster than using add repeteadely).
137
- `batch` is an Array that looks like this:
138
- ```
139
- let batch = [{input:"", result:"", data:{}}]
140
- ```
137
+ **addBatch(batch) -> Promise&lt;Object&gt;**
141
138
 
142
- ### RECALL.remove(id) -> Promise(Object)
139
+ Adds multiple entries in a batch (more efficient than repeated add calls).
140
+ batch is an array of objects with the same structure as add:
141
+ ```javascript
142
+ let batch = [
143
+ { input: "", result: "", data: {}, category: "" }
144
+ ]
145
+ ```
143
146
 
144
- Remove data by id. id is a string.
147
+ **remove(id) -> Promise&lt;Object&gt;**
145
148
 
146
- ### RECALL.searchText(text, numResults = 5) -> Promise(Object)
149
+ Removes the record with the specified ID (string).
147
150
 
148
- Query the vector database. Accepts query text and number of results to return.
151
+ **searchText(text, category="", numResults = 5, includeInput=false) -> Promise&lt;Object&gt;**
149
152
 
150
- ### RECALL.nuke()
153
+ Queries the vector database.
151
154
 
152
- Deletes the database.
155
+ - text – The query text.
156
+ - category – Optional category filter.
157
+ - numResults – Number of results to return.
158
+ - includeInput – If true, the original input text is included in the response.
153
159
 
154
- ### RECALL.importFromJSONStream(fileName) -> Promise(object)
160
+ **nuke()**
155
161
 
156
- Imports from readable stream or file which consists of JSON objects, one per line. e.g.
157
- ```
158
- {input:"one", result:"one result", data:{"id":"123"}}
159
- {input:"", result:"", data:{}}
160
- ...
161
- ```
162
- This is the most efficient way to import data.
162
+ Deletes the entire database.
163
163
 
164
- ### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
164
+ **importFromJSONStream(fileName) -> Promise&lt;Object&gt;**
165
165
 
166
- Imports from CSV or TSV file. By default fist column is used as input, second as result and remaining columns are put in the data object.
167
- If `inputHeader` is specified, function will try to find the column by that name and use it as input.
168
- If `resultHeader` is specified, function will try to find the column by that name and use it as result.
166
+ Imports data from a readable stream or file containing one JSON object per line (JSONL). Example line format:
167
+ ```json
168
+ {"input":"one", "result":"one result", "data":{"id":"123"}, "category":""}
169
+ ```
170
+ This is the most efficient import method.
169
171
 
170
- ### RECALL.mcp() -> Promise()
172
+ **importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise&lt;Object&gt;**
171
173
 
172
- (Experimental)
173
- Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
174
+ Imports data from a CSV or TSV file. By default, the first column is used as input, the second as result, and the remaining columns are merged into the data object.
175
+ If `inputHeader` is specified, the function looks for a column with that name and uses it as input.
176
+ If `resultHeader` is specified, it looks for a column with that name and uses it as result.
package/package.json CHANGED
@@ -1,24 +1,24 @@
1
1
  {
2
2
  "name": "@sjovanovic/recall.js",
3
- "version": "1.0.3",
4
- "description": "Semantic search as long term memory for LLMs",
3
+ "version": "1.0.5",
4
+ "description": "Easy RAG with semantic search and long term memory",
5
5
  "main": "recall.js",
6
6
  "bin": {
7
7
  "recall": "recall.js"
8
8
  },
9
9
  "type": "module",
10
10
  "scripts": {
11
- "test": "echo \"Error: no test specified\" && exit 1",
11
+ "start": "node recall.js",
12
+ "test": "node recall.js --test",
12
13
  "query": "node recall.js --query "
13
14
  },
14
15
  "author": "Slobodan Jovanovic",
15
16
  "license": "ISC",
16
17
  "dependencies": {
17
- "@modelcontextprotocol/sdk": "^1.8.0",
18
- "@themaximalist/embeddings.js": "^0.1.3",
19
- "@xenova/transformers": "^2.17.2",
18
+ "@huggingface/transformers": "^4.2.0",
19
+ "@modelcontextprotocol/sdk": "^1.29.0",
20
20
  "cozo-node": "^0.7.6",
21
21
  "csv-parser": "^3.2.0",
22
- "zod": "^3.24.2"
22
+ "zod": "^4.3.6"
23
23
  }
24
24
  }
package/recall.js CHANGED
@@ -1,429 +1,409 @@
1
1
  #!/usr/bin/env node
2
2
  import {CozoDb} from 'cozo-node'
3
- import embeddings from "@themaximalist/embeddings.js";
3
+ import { pipeline } from "@huggingface/transformers";
4
4
  import csv from 'csv-parser'
5
5
  import fs from 'fs'
6
6
  import { resolve, join, dirname, sep } from 'path'
7
7
  import { fileURLToPath } from 'url'
8
8
 
9
- import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
10
- import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
11
- import { z } from "zod";
12
-
13
9
  const pathToThisFile = resolve(fileURLToPath(import.meta.url))
14
10
  const pathPassedToNode = resolve(process.argv[1])
15
11
  const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
16
12
  const PATH = dirname(pathToThisFile)
17
13
 
18
14
  export const config = {
19
- VECTOR_SIZE: 384, // number of dimensions
20
- MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
15
+ VECTOR_SIZE: 384, // number of dimensions (must match the models output)
16
+ MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
21
17
  SHOW_ERRORS: true, // Show errors
18
+ SHOW_PROGRESS: false, // Show model loading progress in the console
22
19
  DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
23
- PATH: PATH // directory of recall.js
20
+ PATH: PATH, // directory of recall.js
21
+ DEVICE: undefined, // Transformers.js device
22
+ DTYPE: undefined, // Transformers.js dtype
23
+ PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
24
24
  }
25
+ var recal_instance = null
25
26
 
26
- var db = null, initDone = false
27
-
28
- export const getDb = () => {
29
- if(!db) {
30
- db = new CozoDb('sqlite', config.DB_FILE)
27
+ export class Recall {
28
+ constructor(opts = {}){
29
+ this.opts = {
30
+ ...config,
31
+ ...opts
32
+ }
33
+ this.initDone = false
34
+ this.db = new CozoDb('sqlite', this.opts.DB_FILE)
35
+ }
36
+ async printQuery(query, params = {}) {
37
+ try{
38
+ if(!this.initDone) {
39
+ this.initDone = true
40
+ await this.createTable()
41
+ }
42
+ }catch(err) {}
43
+ try {
44
+ let data = this.db.run(query, params)
45
+ return data
46
+ }catch(err){
47
+ if(this.opts.SHOW_ERRORS) console.error(err.display || err.message)
48
+ }
31
49
  }
32
- return db
33
- }
34
-
35
- async function printQuery(query, params = {}) {
36
50
 
37
- try{
38
- if(!initDone) {
39
- initDone = true
40
- await createTable()
51
+ async getEmbeddings(text){
52
+ let pipe = this.opts._pipe
53
+ if(!pipe) {
54
+ this.opts._pipe = await pipeline("feature-extraction", this.opts.MODEL_NAME, {
55
+ progress_callback:(progress) => {
56
+ if(this.opts.PROGRESS_CALLBACK) return this.opts.PROGRESS_CALLBACK(progress);
57
+ if(this.opts.SHOW_PROGRESS && progress.status === "progress_total"){
58
+ process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
59
+ }
60
+ },
61
+ device: this.opts.DEVICE,
62
+ dtype: this.opts.DTYPE
63
+ });
64
+ pipe = this.opts._pipe
41
65
  }
42
- }catch(err) {
43
- //console.log('CREATE TABLE ERROR', err)
66
+ const embedding = await pipe(text, { pooling: "mean", normalize: true });
67
+ return Array.from(embedding.data)
44
68
  }
45
- try {
46
- let data = getDb().run(query, params)
47
- return data
48
- }catch(err){
49
- if(config.SHOW_ERRORS) console.error(err.display || err.message)
69
+
70
+ async createTable() {
71
+ // create table
72
+ let tableCreated = await this.printQuery(`:create embeddings {id: String, category: String => v: <F32; ${this.opts.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
73
+ if(tableCreated){
74
+ // create vector index
75
+ let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
76
+ dim: ${this.opts.VECTOR_SIZE},
77
+ m: 50,
78
+ dtype: F32,
79
+ fields: [v],
80
+ distance: L2, # Cosine, IP
81
+ ef_construction:50, # number of nearest neighbors
82
+ extend_candidates: false, # include nearest neighbors of the nearest neighbors
83
+ keep_pruned_connections: false,
84
+ }`)
85
+ return tableCreated && indexCreated
86
+ }
87
+ return false
50
88
  }
51
- }
52
89
 
53
- export const getEmbeddings = async (text) => {
54
- const embedding = await embeddings(text, {
55
- service:'transformers',
56
- model: config.MODEL_NAME,
57
- cache_file: join(config.PATH, "cache", ".embeddings.cache.json")
58
- });
59
- return embedding
60
- }
90
+ async add(input, result, data={}, category="") {
91
+ if(!input || !result) return
92
+ input = this.sanitizeString(input)
93
+ result = this.sanitizeString(result)
94
+ const embedding = await this.getEmbeddings(input)
95
+ let id = data.id || Math.random().toString().substring(2)
96
+ return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
97
+ :put embeddings {id, category => v, input, result, data}
98
+ `)
99
+ }
61
100
 
62
- export const createTable = async () => {
63
- // create table (id, v, input, result, data)
64
- let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
65
- if(tableCreated){
66
- // create index
67
- let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
68
- dim: ${config.VECTOR_SIZE},
69
- m: 50,
70
- dtype: F32,
71
- fields: [v],
72
- distance: L2, # Cosine, IP
73
- ef_construction:50, # number of nearest neighbors
74
- extend_candidates: false, # include nearest neighbors of the nearest neighbors
75
- keep_pruned_connections: false,
76
- }`)
77
- return tableCreated && indexCreated
101
+ sanitizeString(str){
102
+ return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
78
103
  }
79
- return false
80
- }
81
104
 
82
- export const add = async (input, result, data={}, category="") => {
83
- if(!input || !result) return
84
-
85
- input = sanitizeString(input)
86
- result = sanitizeString(result)
87
- const embedding = await getEmbeddings(input)
88
- let id = data.id || Math.random().toString().substring(2)
89
- return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
90
- :put embeddings {id, category => v, input, result, data}
91
- `)
92
- }
105
+ /**
106
+ *
107
+ * Batch array:
108
+ * [{input:"", result:"", data:{}}]
109
+ *
110
+ * @param {Array} batch
111
+ * @returns
112
+ */
113
+ async addBatch(batch, opts={onProgress:null}) {
114
+ if(!batch || !Array.isArray(batch)) return
115
+ let vectorBatch = []
116
+ for(let i=0;i<batch.length; i++){
117
+ let {input, result, data, category} = batch[i]
118
+
119
+ if(!input || !result) continue
120
+ if(!data) data = {}
121
+ if(!category) category = ''
122
+ const embedding = await this.getEmbeddings(input)
123
+ batch[i].embedding = embedding
124
+ let item = ''
125
+ if(i == 0) {
126
+ item += `?[id, v, input, result, data, category] <- [`
127
+ }
93
128
 
94
- /**
95
- *
96
- * Batch array:
97
- * [{input:"", result:"", data:{}}]
98
- *
99
- * @param {Array} batch
100
- * @returns
101
- */
102
- export const addBatch = async (batch) => {
103
- if(!batch || !Array.isArray(batch)) return
104
- let vectorBatch = []
105
- for(let i=0;i<batch.length; i++){
106
- let {input, result, data, category} = batch[i]
107
-
108
- if(!input || !result) continue
109
- if(!data) data = {}
110
- if(!category) category = ''
111
- const embedding = await getEmbeddings(input)
112
- batch[i].embedding = embedding
113
- let item = ''
114
- if(i == 0) {
115
- item += `?[id, v, input, result, data, category] <- [`
116
- }
129
+ input = this.sanitizeString(input)
130
+ result = this.sanitizeString(result)
117
131
 
118
- input = sanitizeString(input)
119
- result = sanitizeString(result)
132
+ let id = data?.id ? data.id : Math.random().toString().substring(2)
133
+ item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
134
+ if(i == batch.length-1) {
135
+ item += `]
136
+ :put embeddings {id, category => v, input, result, data}`
137
+ }
138
+ vectorBatch.push(item)
120
139
 
121
- let id = data?.id ? data.id : Math.random().toString().substring(2)
122
- item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
123
- if(i == batch.length-1) {
124
- item += `]
125
- :put embeddings {id, category => v, input, result, data}`
140
+ if(opts.onProgress && typeof opts.onProgress == 'function') {
141
+ await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
142
+ }
126
143
  }
127
- vectorBatch.push(item)
144
+ return await this.printQuery(vectorBatch.join("\n"))
128
145
  }
129
- return await printQuery(vectorBatch.join("\n"))
130
- }
131
146
 
132
- const sanitizeString = (str)=>{
133
- return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ")
134
- }
135
-
136
- export const remove = async (id, category="") => {
137
- if(!id || typeof id != 'string') return
138
- id.replace(/[^a-zA-Z0-9]/g, '')
139
- if(!id) return
140
- let results = await printQuery(
141
- `?[id, category] <- [['${id}', '${category}']]
142
- ::remove embeddings {id}`)
143
- return results
144
- }
145
-
146
- export const searchText = async (text, category="", numResults = 5) => {
147
- const embedding = await getEmbeddings(text)
148
- let results = await printQuery(`?[dist, result, id, data, category] := ~embeddings:index_name { id, v, input, result, data, category |
149
- query: q,
150
- k: ${numResults}, # number of results
151
- ef: 50, # number of neighbours to consider
152
- bind_distance: dist,
153
- filter: category==${JSON.stringify(category)},
154
- radius: 10.0
155
- }, q = vec(${JSON.stringify(embedding)})
156
- :sort -dist`)
157
- return results
158
- }
159
-
160
- export const vectorSearch = async (query, category='', numResults=5) => {
161
- let result = undefined
162
- try{
163
- result = await searchText(query, category, numResults)
164
- }catch(err){
165
- if(config.SHOW_ERRORS) console.error(err.display || err.message)
147
+ async remove(id, category="") {
148
+ if(!id || typeof id != 'string') return
149
+ id = id.replace(/[^a-zA-Z0-9]/g, '')
150
+ category = this.sanitizeString(category)
151
+ if(!id || !category) return
152
+ let results = await this.printQuery(
153
+ `?[id, category] <- [['${id}', '${category}']]
154
+ ::rm embeddings {id, category}`)
155
+ return results
156
+ }
157
+
158
+ async removeAllByCategory(category=""){
159
+ category = this.sanitizeString(category)
160
+ if(!category) return
161
+ let results
162
+ try {
163
+ results = await this.printQuery(
164
+ `?[id, category] := *embeddings{id, category}, category = "${category}"
165
+ :rm embeddings {id, category}`)
166
+ }catch(err){
167
+ console.error(err)
168
+ }
169
+ return results
166
170
  }
167
- return result
168
- }
169
171
 
170
- const cmdArgs = (list = []) => {
171
- let args = {}, current = null
172
- for(let i=0; i<process.argv.length; i++){
173
- let val = process.argv[i]
174
- if(current && !list.includes(val)){
175
- args[current] = val
176
- current = null
177
- }
178
- if(list.includes(val)) {
179
- current = val
180
- args[current] = ''
172
+ async searchText(text, category="", numResults = 5, includeInput=false) {
173
+ const embedding = await this.getEmbeddings(text)
174
+ let results = await this.printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
175
+ query: q,
176
+ k: ${numResults}, # number of results
177
+ ef: 50, # number of neighbours to consider
178
+ bind_distance: dist,
179
+ filter: category==${JSON.stringify(category)},
180
+ radius: 10.0
181
+ }, q = vec(${JSON.stringify(embedding)})
182
+ :sort dist`)
183
+ return results
184
+ }
185
+
186
+ async vectorSearch(query, category='', numResults=5) {
187
+ let result = undefined
188
+ try{
189
+ result = await this.searchText(query, category, numResults)
190
+ }catch(err){
191
+ if(config.SHOW_ERRORS) console.error(err.display || err.message)
181
192
  }
193
+ return result
182
194
  }
183
- args._cmd = process.argv[1].split(sep).pop()
184
- return args
185
- }
186
195
 
187
- export const nuke = () => {
188
- return fs.unlinkSync(config.DB_FILE)
189
- }
196
+ nuke() {
197
+ return fs.unlinkSync(this.opts.DB_FILE)
198
+ }
190
199
 
191
- export const importFromJSONStream = async (fileName) => {
192
- async function jsonStream(readable, callback = async function(){}) {
193
- readable.setEncoding('utf8');
194
- let data = '';
195
- for await (const chunk of readable) {
196
- if(chunk.indexOf("\n")) {
197
- pts = chunk.split("\n")
198
- for(let i=0;i<pts.length; i++){
199
- data += pts[i]
200
- try {
201
- let json = JSON.parse(data)
202
- await callback(json)
203
- json = null
204
- data = ''
205
- }catch(err) {
206
- //console.error(err)
200
+ async importFromJSONStream(fileName) {
201
+ async function jsonStream(readable, callback = async function(){}) {
202
+ readable.setEncoding('utf8');
203
+ let data = '';
204
+ for await (const chunk of readable) {
205
+ if(chunk.indexOf("\n")) {
206
+ pts = chunk.split("\n")
207
+ for(let i=0;i<pts.length; i++){
208
+ data += pts[i]
209
+ try {
210
+ let json = JSON.parse(data)
211
+ await callback(json)
212
+ json = null
213
+ data = ''
214
+ }catch(err) {}
207
215
  }
216
+ }else{
217
+ data += chunk;
208
218
  }
209
- }else{
210
- data += chunk;
211
219
  }
212
220
  }
213
- }
214
- let batchSize = 40, batch = [], i=0, currentBatch = 0
215
- let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
216
- await jsonStream(stream, async (json) => {
217
- if(json.input && json.result){
218
- if(!json.data) json.data = {}
219
- if(i % batchSize === 0){
220
- if(batch.length) {
221
- currentBatch = currentBatch + 1
222
- console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
223
- await addBatch(batch)
224
- batch = []
221
+ let batchSize = 40, batch = [], i=0, currentBatch = 0
222
+ let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
223
+ await jsonStream(stream, async (json) => {
224
+ if(json.input && json.result){
225
+ if(!json.data) json.data = {}
226
+ if(i % batchSize === 0){
227
+ if(batch.length) {
228
+ currentBatch = currentBatch + 1
229
+ console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
230
+ await this.addBatch(batch)
231
+ batch = []
232
+ }
225
233
  }
234
+ batch.push(json)
235
+ i=i+1
226
236
  }
227
- batch.push(json)
228
- i=i+1
237
+ })
238
+ if(batch.length) {
239
+ console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
240
+ await this.addBatch(batch)
229
241
  }
230
- })
231
- if(batch.length) {
232
- console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
233
- await addBatch(batch)
234
242
  }
235
- }
236
-
237
- export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
238
- if(!fileName || !fileName.includes('.')) return
239
- let ext = fileName.split('.').pop()
240
- ext = ext.toLowerCase()
241
- if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
242
- let parseOpts = {
243
- separator: ext == 'tsv' ? '\t' : ',',
244
- mapHeaders: ({ header, index }) => {
245
- if(inputHeader) {
246
- if(inputHeader == header){
243
+
244
+ async importFromCSVorTSV(fileName, inputHeader, resultHeader) {
245
+ if(!fileName || !fileName.includes('.')) return
246
+ let ext = fileName.split('.').pop()
247
+ ext = ext.toLowerCase()
248
+ if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
249
+ let parseOpts = {
250
+ separator: ext == 'tsv' ? '\t' : ',',
251
+ mapHeaders: ({ header, index }) => {
252
+ if(inputHeader) {
253
+ if(inputHeader == header){
254
+ return 'input'
255
+ }
256
+ }else if(index === 0){
247
257
  return 'input'
248
258
  }
249
- }else if(index === 0){
250
- return 'input'
251
- }
252
- if(resultHeader){
253
- if(resultHeader == header){
259
+ if(resultHeader){
260
+ if(resultHeader == header){
261
+ return 'result'
262
+ }
263
+ }else if(index === 1){
254
264
  return 'result'
255
265
  }
256
- }else if(index === 1){
257
- return 'result'
266
+ return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
258
267
  }
259
- return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
260
268
  }
261
- }
262
- let fetchFromFile = async (fileName) => {
263
- return new Promise(async (resolve, reject)=>{
264
- let results = []
265
- fs.createReadStream(fileName)
266
- .pipe(csv(parseOpts))
267
- .on('data', async (data) => {
268
- results.push(data)
269
+ let fetchFromFile = async (fileName) => {
270
+ return new Promise(async (resolve, reject)=>{
271
+ let results = []
272
+ fs.createReadStream(fileName)
273
+ .pipe(csv(parseOpts))
274
+ .on('data', async (data) => {
275
+ results.push(data)
276
+ })
277
+ .on('end', () => {
278
+ console.log(`${fileName} loaded.`);
279
+ resolve(results)
280
+ }).on('error', (err) => {
281
+ console.error(err);
282
+ })
269
283
  })
270
- .on('end', () => {
271
- console.log(`${fileName} loaded.`);
272
- resolve(results)
273
- }).on('error', (err) => {
274
- console.error(err);
275
- })
276
- })
277
- }
278
-
284
+ }
279
285
 
280
- let results = await fetchFromFile(fileName)
281
-
282
- let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
283
- for(let i=0; i<results.length; i++){
284
- if(i % batchSize === 0){
285
- if(batch.length) {
286
- currentBatch = currentBatch + 1
287
- console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
288
- await addBatch(batch)
289
- batch = []
286
+ let results = await fetchFromFile(fileName)
287
+
288
+ let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
289
+ for(let i=0; i<results.length; i++){
290
+ if(i % batchSize === 0){
291
+ if(batch.length) {
292
+ currentBatch = currentBatch + 1
293
+ console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
294
+ await this.addBatch(batch)
295
+ batch = []
296
+ }
290
297
  }
298
+ data = {}
299
+ dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
300
+ batch.push({
301
+ input: results[i].input,
302
+ result: results[i].result,
303
+ data
304
+ })
305
+ }
306
+ if(batch.length) {
307
+ console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
308
+ await this.addBatch(batch)
291
309
  }
292
- data = {}
293
- dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
294
- batch.push({
295
- input: results[i].input,
296
- result: results[i].result,
297
- data
298
- })
299
- }
300
- if(batch.length) {
301
- console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
302
- await addBatch(batch)
303
310
  }
311
+
304
312
  }
305
313
 
306
- const mcp = async () => {
314
/**
 * Return the `db` property of the shared Recall instance.
 *
 * The module-level instance is created on first use if it does not exist yet.
 *
 * @returns {*} Whatever the shared instance exposes as `db`.
 */
export const getDb = () => {
  recal_instance ||= new Recall();
  return recal_instance.db;
};
307
318
 
308
- // Create an MCP server
309
- // const server = new McpServer({
310
- // name: "Demo",
311
- // version: "1.0.0"
312
- // });
313
-
314
- // // Add an addition tool
315
- // server.tool("add",
316
- // { a: z.number(), b: z.number() },
317
- // async ({ a, b }) => ({
318
- // content: [{ type: "text", text: String(a + b) }]
319
- // })
320
- // );
321
-
322
- // // Add a dynamic greeting resource
323
- // server.resource(
324
- // "greeting",
325
- // new ResourceTemplate("greeting://{name}", { list: undefined }),
326
- // async (uri, { name }) => ({
327
- // contents: [{
328
- // uri: uri.href,
329
- // text: `Hello, ${name}!`
330
- // }]
331
- // })
332
- // );
333
-
334
-
335
- const server = new McpServer({
336
- name: "Recall",
337
- description: "Recall provides semantic search on the local vector database.",
338
- version: "1.0.0"
339
- });
340
-
341
- // server.resource(
342
- // "echo",
343
- // new ResourceTemplate("echo://{message}", { list: undefined }),
344
- // async (uri, { message }) => ({
345
- // contents: [{
346
- // uri: uri.href,
347
- // text: `Resource echo: ${message}`
348
- // }]
349
- // })
350
- // );
351
-
352
- server.tool(
353
- "search",
354
- {
355
- text: z.string(),
356
- //numberOfResults: z.number()
357
- },
358
- async ({ text, numberOfResults }) => {
359
- if(numberOfResults && numberOfResults > 50) numberOfResults = 50
360
-
361
- let startTime = performance.now()
362
- let results = await searchText(text, numberOfResults)
363
- var timeDiff = ((performance.now() - startTime) / 1000).toFixed(2)
364
- let content = [
365
- {
366
- type: "text",
367
- text: `Sorry. Recal search didn't find anything.`
368
- }
369
- ]
370
- if(results && results.rows && results.rows.length) {
371
- // content = results.rows.map(r => {
372
- // return {
373
- // type: "text",
374
- // text: r[1]
375
- // }
376
- // })
377
- content = [{
378
- type: "text",
379
- text: `Recal search found the following results in ${timeDiff}s:`
380
- }]
381
- for(let i=0; i<results.rows.length; i++){
382
- let row = results.rows[i]
383
- content.push({
384
- type: "text",
385
- text: row[1]
386
- })
387
- // if(results.rows[2] && Object.keys(results.rows[2])){
388
- // content.push({
389
- // type: "json",
390
- // text: row[2]
391
- // })
392
- // }
393
- }
394
- }
319
/**
 * Delegate to `printQuery` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @param {*} query - Forwarded unchanged to the instance method.
 * @param {Object} [params={}] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
async function printQuery(query, params = {}) {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.printQuery(query, params);
  return outcome;
}
395
323
 
396
- return {
397
- content
398
- }
399
- }
400
- );
401
-
402
- // server.prompt(
403
- // "echo",
404
- // { message: z.string() },
405
- // ({ message }) => ({
406
- // messages: [{
407
- // role: "user",
408
- // content: {
409
- // type: "text",
410
- // text: `Please process this message: ${message}`
411
- // }
412
- // }]
413
- // })
414
- // );
415
-
416
- // Start receiving messages on stdin and sending messages on stdout
417
- const transport = new StdioServerTransport();
418
- await server.connect(transport);
324
/**
 * Delegate to `getEmbeddings` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @param {*} text - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const getEmbeddings = async (text) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.getEmbeddings(text);
  return outcome;
};
328
+
329
/**
 * Delegate to `createTable` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const createTable = async () => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.createTable();
  return outcome;
};
333
+
334
/**
 * Delegate to `add` on the shared Recall instance, creating the instance
 * first if it does not exist yet.
 *
 * @param {*} input - Forwarded unchanged to the instance method.
 * @param {*} result - Forwarded unchanged to the instance method.
 * @param {Object} [data={}] - Forwarded unchanged to the instance method.
 * @param {string} [category=""] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const add = async (input, result, data = {}, category = '') => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.add(input, result, data, category);
  return outcome;
};
338
+
339
/**
 * Delegate to `addBatch` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @param {*} batch - Forwarded unchanged to the instance method.
 * @param {Object} [opts={onProgress:null}] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const addBatch = async (batch, opts = { onProgress: null }) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.addBatch(batch, opts);
  return outcome;
};
343
+
344
/**
 * Delegate to `remove` on the shared Recall instance, creating the instance
 * first if it does not exist yet.
 *
 * @param {*} id - Forwarded unchanged to the instance method.
 * @param {string} [category=""] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const remove = async (id, category = '') => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.remove(id, category);
  return outcome;
};
348
+
349
/**
 * Delegate to `removeAllByCategory` on the shared Recall instance, creating
 * the instance first if it does not exist yet.
 *
 * @param {string} [category=""] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const removeAllByCategory = async (category = '') => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.removeAllByCategory(category);
  return outcome;
};
353
+
354
/**
 * Delegate to `searchText` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @param {*} text - Forwarded unchanged to the instance method.
 * @param {string} [category=""] - Forwarded unchanged to the instance method.
 * @param {number} [numResults=5] - Forwarded unchanged to the instance method.
 * @param {boolean} [includeInput=false] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const searchText = async (text, category = '', numResults = 5, includeInput = false) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.searchText(text, category, numResults, includeInput);
  return outcome;
};
420
358
 
421
- const splitSentences = (text) => {
422
- return text.replace(/([.?!])\s*(?=[A-Z])/g, "$1|").split("|")
359
/**
 * Delegate to `vectorSearch` on the shared Recall instance, creating the
 * instance first if it does not exist yet.
 *
 * @param {*} query - Forwarded unchanged to the instance method.
 * @param {string} [category=''] - Forwarded unchanged to the instance method.
 * @param {number} [numResults=5] - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const vectorSearch = async (query, category = '', numResults = 5) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.vectorSearch(query, category, numResults);
  return outcome;
};
363
+
364
/**
 * Delegate to `nuke` on the shared Recall instance, creating the instance
 * first if it does not exist yet.
 *
 * This wrapper is synchronous: the instance method's return value is passed
 * through as-is, without awaiting it.
 *
 * @returns {*} Whatever the instance method returns.
 */
export const nuke = () => {
  recal_instance ||= new Recall();
  return recal_instance.nuke();
};
368
+
369
/**
 * Delegate to `importFromJSONStream` on the shared Recall instance, creating
 * the instance first if it does not exist yet.
 *
 * @param {*} fileName - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const importFromJSONStream = async (fileName) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.importFromJSONStream(fileName);
  return outcome;
};
373
+
374
/**
 * Delegate to `importFromCSVorTSV` on the shared Recall instance, creating
 * the instance first if it does not exist yet.
 *
 * @param {*} fileName - Forwarded unchanged to the instance method.
 * @param {*} inputHeader - Forwarded unchanged to the instance method.
 * @param {*} resultHeader - Forwarded unchanged to the instance method.
 * @returns {Promise<*>} Resolves with whatever the instance method resolves with.
 */
export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
  recal_instance ||= new Recall();
  const outcome = await recal_instance.importFromCSVorTSV(fileName, inputHeader, resultHeader);
  return outcome;
};
378
+
379
/**
 * Smoke test run against a fresh, private Recall instance (not the shared one).
 *
 * Calls `nuke()` on the instance (not awaited), adds two entries one after
 * the other, then runs a vector search with a French query.
 *
 * @returns {Promise<string>} The search response serialized with JSON.stringify.
 */
async function test() {
  const instance = new Recall();
  instance.nuke();

  const fixtures = [
    ['The quick brown fox jumps over the lazy dog', 'Fox jumps over dog', { foo: 'bar' }],
    ['History of Serbia бегинс with emperor Heraclius', 'Serbia and Roman empire', { foo: 'baz' }],
  ];
  // Entries are added sequentially, in this order.
  for (const [input, result, data] of fixtures) {
    await instance.add(input, result, data);
  }

  const response = await instance.vectorSearch('Un animal saute par-dessus un autre animal');
  return JSON.stringify(response);
}
387
+
388
/**
 * Parse `process.argv` into a flag → value map.
 *
 * Every token of `process.argv` is scanned in order. A token found in `list`
 * is recorded as a flag with the value `''`. The first following token that
 * is not itself in `list` replaces that empty value. Tokens that do not
 * follow a pending flag are ignored.
 *
 * @param {string[]} [list=[]] - Recognized flag names.
 * @returns {Object} Map of seen flags to their values, plus `_cmd`: the last
 *   path segment of `process.argv[1]` (split on the platform separator).
 */
const cmdArgs = (list = []) => {
  const parsed = {};
  let pendingFlag = null;

  for (const token of process.argv) {
    if (list.includes(token)) {
      // A recognized flag: register it with an empty value until one shows up.
      pendingFlag = token;
      parsed[pendingFlag] = '';
    } else if (pendingFlag) {
      // First non-flag token after a flag becomes that flag's value.
      parsed[pendingFlag] = token;
      pendingFlag = null;
    }
  }

  parsed._cmd = process.argv[1].split(sep).pop();
  return parsed;
};
424
404
 
425
405
  const runCLI = async () => {
426
- let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
406
+ let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
427
407
  let query = args['--query'] || args['-q']
428
408
  if(args['--db']){
429
409
  config.DB_FILE = args['--db']
@@ -468,27 +448,23 @@ const runCLI = async () => {
468
448
  }else if(args['--json']){
469
449
  await importFromJSONStream(args['--json'])
470
450
  console.log('Imported.')
471
- }else if(args['--mcp'] != undefined){
472
- await mcp()
473
- console.log('MCP server running.')
474
451
  }else if(args['--test'] != undefined){
475
452
  console.log('Test: ', await test())
476
453
  }else{
477
454
  console.log('Usage:')
478
455
  console.log(args._cmd + ' --query "Foo Bar"')
479
456
  console.log("\n" + 'Options:')
480
- console.log('--query "SEARCH_STRING" - search')
481
- console.log('--limit 2 - limit number of results (used with --query)')
482
- console.log(`--add 'input|result|{"foo":"bar"}|categ' - add data`)
483
- console.log(`--remove 'id' - remove data`)
484
- console.log(`--nuke - destroy database`)
485
- console.log(`--mcp - run as MCP server (experimental)`)
486
- console.log(`--db "FILE_NAME" - database file (SQLite)`)
487
- console.log(`--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
488
- console.log('--input-header "foo" - when used with --import designates specific header column as input')
489
- console.log('--result-header "bar" - when used with --import designates specific header column as result')
490
- console.log(`--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`)
491
- console.log(`--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)`)
457
+ console.log('--query "SEARCH_STRING" - Search the database')
458
+ console.log('--limit N - Limit number of results (used with --query).')
459
+ console.log(`--add 'input|result|{"foo":"bar"}|categ' - Add a data entry.`)
460
+ console.log(`--remove 'id' - Remove data by ID.`)
461
+ console.log(`--nuke - Destroy the database.`)
462
+ console.log(`--db "FILE_NAME" - Specify database file (SQLite).`)
463
+ console.log(`--import "file.csv | file.tsv" - Import from CSV or TSV with columns: input, result, additional data.`)
464
+ console.log('--input-header "foo" - When used with --import, designate a specific header column as input.')
465
+ console.log('--result-header "bar" - When used with --import, designate a specific header column as result.')
466
+ console.log(`--json "FILE_NAME" - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.`)
467
+ console.log(`--category "CATEGORY" - Specify category when adding data and filter by it when querying (defaults to empty string).`)
492
468
  }
493
469
  }
494
470
 
@@ -0,0 +1,34 @@
1
/**
 * Validate and normalize a free-form string value.
 *
 * The value is NFC-normalized and trimmed, then rejected if it is empty, too
 * long, or contains characters from the blocked ranges below. All other
 * Unicode text, including characters outside the Basic Multilingual Plane
 * such as emoji, is accepted.
 *
 * @param {string} stringValue - Raw value to sanitize.
 * @param {number} [maxChars=1000] - Maximum length of the normalized, trimmed
 *   value, measured with `String.prototype.length` (UTF-16 code units).
 * @returns {string} The NFC-normalized, trimmed value.
 * @throws {Error} If the value is not a string, is empty after trimming,
 *   exceeds `maxChars`, or contains a blocked character.
 */
export function sanitizeValue(stringValue, maxChars = 1000) {
  if (typeof stringValue !== 'string') {
    throw new Error('stringValue must be a string');
  }

  const sanitized = stringValue.normalize('NFC').trim();

  // Basic validation
  if (sanitized.length === 0) {
    throw new Error('stringValue name cannot be empty');
  }

  if (sanitized.length > maxChars) {
    throw new Error(`stringValue name too long (max ${maxChars} characters)`);
  }

  // Block control and invisible formatting characters: C0/C1 controls and DEL,
  // zero-width space, LRM/RLM, bidi embeddings/overrides/isolates, word joiner
  // and friends (U+2060-U+2069), and the BOM. Note this also rejects tabs and
  // line breaks inside the value.
  if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
    throw new Error('stringValue contains disallowed control characters');
  }

  // Block the private use areas (BMP U+E000-U+F8FF and supplementary planes
  // 15/16) and the Specials block (U+FFF0-U+FFFF). The `u` flag makes the
  // regex match whole code points so the supplementary ranges can be expressed.
  if (/[\uE000-\uF8FF\uFFF0-\uFFFF\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/u.test(sanitized)) {
    throw new Error('stringValue contains disallowed Unicode characters');
  }

  // Block lone surrogates only. With the `u` flag a well-formed surrogate pair
  // is treated as a single astral code point and does not match, so valid
  // emoji and other supplementary-plane characters pass; an unpaired high or
  // low surrogate still matches and is rejected.
  if (/[\uD800-\uDFFF]/u.test(sanitized)) {
    throw new Error('stringValue contains invalid Unicode characters');
  }

  return sanitized;
}