@sjovanovic/recall.js 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +91 -94
  2. package/package.json +2 -2
  3. package/recall.js +341 -397
package/README.md CHANGED
@@ -4,15 +4,11 @@
4
4
  <img alt="Recall.js is long term memory for AI apps!" src="logo.svg" />
5
5
  </p>
6
6
 
7
- Recall.js is long term memory for AI apps!
7
+ Recall.js provides long-term memory for AI applications. It is a JavaScript library and command‑line tool for building Retrieval‑Augmented Generation (RAG) systems, with a focus on speed, ease of use, and embeddability.
8
8
 
9
- It is a tool for building RAG (Retrieval-augmented generation) in a form of JavaScript library and command line utility focused on speed, ease of use and embeddability.
9
+ Beyond RAG, recall.js can be used for generic semantic search, as expert memory for your AI app, or as a recommendation system. It supports multilingual embeddings out of the box, allowing you to add data in one language and query it in another.
10
10
 
11
- It is versatile and you don't have to use it exclusively for RAG, it can also be used for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are many possibilities...
12
-
13
- Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
14
-
15
- Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) feature extraction and a vector database to index and query your data. It is a light wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
11
+ Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) for feature extraction and a vector database (powered by [CozoDB](https://www.cozodb.org/)) for indexing and querying. It is a lightweight wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2).
16
12
 
17
13
  ## Install
18
14
 
@@ -20,62 +16,57 @@ Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/tra
20
16
 
21
17
  ## Usage
22
18
 
23
- Console:
19
+ ### Command Line
24
20
 
25
21
  ```console
26
22
  recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
27
23
  recall --query "Un animal saute par-dessus un autre animal" --limit 1
28
24
  ```
29
- **Warning:** when this library is used for the first time, it will download a local language model Multilingual-MiniLM-L12-v2 which may take a while depending on your Internet connectivity. Please be patient.
25
+ > **Note:** When the library is used for the first time, it will download a local language model (Multilingual-MiniLM-L12-v2). This may take a while depending on your internet connection. Please be patient.
30
26
 
31
- Below is the same example in JavaScript:
27
+ ### JavaScript
32
28
 
33
29
  ```javascript
34
30
 
35
- import * as RECALL from '@sjovanovic/recall.js'
31
+ import Recall from '@sjovanovic/recall.js'
36
32
 
37
33
  const testRecall = async () => {
38
- await RECALL.addBatch([
39
- {
40
- input: "The quick brown fox jumps over the lazy dog",
41
- result: "Fox and dog",
42
- data: { foo: "bar" }
43
- }
44
- ])
45
-
46
- // Semantic search query in different language (French) "Animal jumps over another animal"
47
- let response = await RECALL.searchText("Un animal saute par-dessus un autre animal", 1)
48
- console.log(response)
49
- }
50
- testRecall()
51
34
 
52
- /*
35
+ let config = {
36
+ SHOW_PROGRESS: true
37
+ }
38
+
39
+ let recall = new Recall(config)
53
40
 
54
- response:
41
+ await recall.addBatch([
42
+ {
43
+ input: "The quick brown fox jumps over the lazy dog",
44
+ result: "Fox and dog",
45
+ data: { foo: "bar" }
46
+ }
47
+ ])
55
48
 
49
+ // Semantic search query in different language (French) "Animal jumps over another animal"
50
+ let response = await recall.searchText("Un animal saute par-dessus un autre animal", "", 1)
51
+ console.log(response)
52
+ }
53
+ testRecall()
54
+ ```
55
+
56
+ **Example response:**
57
+ ```json
56
58
  {
57
- "headers": [
58
- "dist",
59
- "result",
60
- "id",
61
- "data",
62
- "category"
63
- ],
59
+ "headers": ["dist", "result", "id", "data", "category"],
64
60
  "rows": [
65
61
  [
66
- 0.6840495824813843, // vector similarity
62
+ 0.6840495824813843,
67
63
  "Fox and dog",
68
64
  "08840189191373282",
69
- {
70
- "foo": "bar"
71
- },
65
+ { "foo": "bar" },
72
66
  ""
73
67
  ]
74
68
  ]
75
69
  }
76
-
77
- */
78
-
79
70
  ```
80
71
 
81
72
  ## Options
@@ -86,21 +77,20 @@ Easy way to view all the options is via command line:
86
77
  recall --help
87
78
 
88
79
  Usage:
89
- recall.js --query "Foo Bar"
80
+ recall --query "Foo Bar"
90
81
 
91
82
  Options:
92
- --query "SEARCH_STRING" - search
93
- --limit 2 - limit number of results (used with --query)
94
- --add 'input|result|{"foo":"bar"}|categ' - add data
95
- --remove 'id' - remove data
96
- --nuke - destroy database
97
- --mcp - run as MCP server (experimental)
98
- --db "FILE_NAME" - database file (SQLite)
99
- --import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
100
- --input-header "foo" - when used with --import designates specific header column as input
101
- --result-header "bar" - when used with --import designates specific header column as result
102
- --json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
103
- --category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)
83
+ --query "SEARCH_STRING" - Search the database
84
+ --limit N - Limit number of results (used with --query).
85
+ --add 'input|result|{"foo":"bar"}|categ' - Add a data entry.
86
+ --remove 'id' - Remove data by ID.
87
+ --nuke - Destroy the database.
88
+ --db "FILE_NAME" - Specify database file (SQLite).
89
+ --import "file.csv | file.tsv" - Import from CSV or TSV with columns: input, result, additional data.
90
+ --input-header "foo" - When used with --import, designate a specific header column as input.
91
+ --result-header "bar" - When used with --import, designate a specific header column as result.
92
+ --json "FILE_NAME" - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.
93
+ --category "CATEGORY" - Specify category when adding data and filter by it when querying (defaults to empty string).
104
94
  ```
105
95
 
106
96
  **Note:** When adding data, recall generates a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
@@ -108,72 +98,79 @@ Options:
108
98
 
109
99
  ## JavaScript API Reference
110
100
 
111
- ### RECALL.config
101
+ ### Configuration
112
102
 
113
- Configuration object.
103
+ The default configuration object is exported as `config`:
114
104
 
115
105
  ```javascript
116
106
  export const config = {
117
- VECTOR_SIZE: 384, // number of dimensions (must match the models output)
118
- MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
119
- SHOW_ERRORS: true, // Show errors
120
- DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
121
- PATH: PATH, // directory of recall.js
122
- DEVICE: undefined, // Transformers.js device
123
- DTYPE: undefined, // Transformers.js dtype
124
- PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
107
+ VECTOR_SIZE: 384, // Number of dimensions (must match the model's output)
108
+ MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // Model name for Transformers.js
109
+ SHOW_ERRORS: true, // Show error messages
110
+ DB_FILE: join(PATH, 'vector.db'), // Path to the SQLite database file (used by CozoDB)
111
+ PATH: PATH, // Directory of recall.js
112
+ DEVICE: undefined, // Transformers.js device
113
+ DTYPE: undefined, // Transformers.js dtype
114
+ PROGRESS_CALLBACK: undefined // Transformers.js progress callback
125
115
  }
126
116
  ```
127
117
 
128
- ### RECALL.getDb()
118
+ ### Methods
119
+
120
+ **getDb()**
129
121
 
130
- Returns reference to the CozoDB instance.
122
+ Returns reference to the underlying CozoDB instance.
131
123
 
132
- ### RECALL.getEmbeddings(text) -> Promise(Array)
124
+ **getEmbeddings(text) -> Promise&lt;Array&gt;**
133
125
 
134
126
  Given a text, calculates its embeddings vector.
135
127
 
136
- ### RECALL.add(input, result, data={}, category="") -> Promise(Object)
128
+ **add(input, result, data={}, category="") -> Promise&lt;Object&gt;**
137
129
 
138
- Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
130
+ Adds a data entry.
139
131
 
140
- ### RECALL.addBatch(batch) -> Promise(Object)
132
+ - input – The sentence to generate embeddings from.
133
+ - result – The string to display in search results.
134
+ - data – Arbitrary object for additional information and references. If it contains an id property, that value will be used as the record’s unique ID.
135
+ - category – Optional category string.
141
136
 
142
- Add data in batches (faster than using add repeteadely).
143
- `batch` is an Array that looks like this:
144
- ```
145
- let batch = [{input:"", result:"", data:{}, category:""}]
137
+ **addBatch(batch) -> Promise&lt;Object&gt;**
138
+
139
+ Adds multiple entries in a batch (more efficient than repeated add calls).
140
+ `batch` is an array of objects whose fields match the arguments of `add`:
141
+ ```javascript
142
+ let batch = [
143
+ { input: "", result: "", data: {}, category: "" }
144
+ ]
146
145
  ```
147
146
 
148
- ### RECALL.remove(id) -> Promise(Object)
147
+ **remove(id) -> Promise&lt;Object&gt;**
149
148
 
150
- Remove data by id. id is a string.
149
+ Removes the record with the specified ID (string).
151
150
 
152
- ### RECALL.searchText(text, category="", numResults = 5, includeInput=false) -> Promise(Object)
151
+ **searchText(text, category="", numResults = 5, includeInput=false) -> Promise&lt;Object&gt;**
153
152
 
154
- Query the vector database. Accepts query text and number of results to return.
153
+ Queries the vector database.
155
154
 
156
- ### RECALL.nuke()
155
+ - text – The query text.
156
+ - category – Optional category filter.
157
+ - numResults – Number of results to return.
158
+ - includeInput – If true, the original input text is included in the response.
157
159
 
158
- Deletes the database.
160
+ **nuke()**
159
161
 
160
- ### RECALL.importFromJSONStream(fileName) -> Promise(object)
162
+ Deletes the entire database.
161
163
 
162
- Imports from readable stream or file which consists of JSON objects, one per line. e.g.
163
- ```
164
+ **importFromJSONStream(fileName) -> Promise&lt;Object&gt;**
165
+
166
+ Imports data from a readable stream or file containing one JSON object per line (JSONL). Example line format:
167
+ ```json
164
168
  {"input":"one", "result":"one result", "data":{"id":"123"}, "category":""}
165
- {input:"", result:"", data:{}, category:""}
166
- ...
167
169
  ```
168
- This is the most efficient way to import data.
169
-
170
- ### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
171
-
172
- Imports from CSV or TSV file. By default fist column is used as input, second as result and remaining columns are put in the data object.
173
- If `inputHeader` is specified, function will try to find the column by that name and use it as input.
174
- If `resultHeader` is specified, function will try to find the column by that name and use it as result.
170
+ This is the most efficient import method.
175
171
 
176
- ### RECALL.mcp() -> Promise()
172
+ **importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise&lt;Object&gt;**
177
173
 
178
- (Experimental)
179
- Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
174
+ Imports data from a CSV or TSV file. By default, the first column is used as input, the second as result, and the remaining columns are merged into the data object.
175
+ If `inputHeader` is specified, the function looks for a column with that name and uses it as input.
176
+ If `resultHeader` is specified, it looks for a column with that name and uses it as result.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sjovanovic/recall.js",
3
- "version": "1.0.4",
3
+ "version": "1.0.5",
4
4
  "description": "Easy RAG with semantic search and long term memory",
5
5
  "main": "recall.js",
6
6
  "bin": {
@@ -9,7 +9,7 @@
9
9
  "type": "module",
10
10
  "scripts": {
11
11
  "start": "node recall.js",
12
- "test": "echo \"Error: no test specified\" && exit 1",
12
+ "test": "node recall.js --test",
13
13
  "query": "node recall.js --query "
14
14
  },
15
15
  "author": "Slobodan Jovanovic",
package/recall.js CHANGED
@@ -6,12 +6,6 @@ import fs from 'fs'
6
6
  import { resolve, join, dirname, sep } from 'path'
7
7
  import { fileURLToPath } from 'url'
8
8
 
9
- import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
10
- import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
11
- import { z } from "zod";
12
-
13
- // import {sanitizeValue} from './utils/sanitize.js'
14
-
15
9
  const pathToThisFile = resolve(fileURLToPath(import.meta.url))
16
10
  const pathPassedToNode = resolve(process.argv[1])
17
11
  const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
@@ -21,182 +15,374 @@ export const config = {
21
15
  VECTOR_SIZE: 384, // number of dimensions (must match the models output)
22
16
  MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
23
17
  SHOW_ERRORS: true, // Show errors
18
+ SHOW_PROGRESS: false, // Show model loading progress in the console
24
19
  DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
25
20
  PATH: PATH, // directory of recall.js
26
21
  DEVICE: undefined, // Transformers.js device
27
22
  DTYPE: undefined, // Transformers.js dtype
28
23
  PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
29
24
  }
25
+ var recal_instance = null
30
26
 
31
- var db = null, initDone = false
32
-
33
- export const getDb = () => {
34
- if(!db) {
35
- db = new CozoDb('sqlite', config.DB_FILE)
27
+ export class Recall {
28
+ constructor(opts = {}){
29
+ this.opts = {
30
+ ...config,
31
+ ...opts
32
+ }
33
+ this.initDone = false
34
+ this.db = new CozoDb('sqlite', this.opts.DB_FILE)
35
+ }
36
+ async printQuery(query, params = {}) {
37
+ try{
38
+ if(!this.initDone) {
39
+ this.initDone = true
40
+ await this.createTable()
41
+ }
42
+ }catch(err) {}
43
+ try {
44
+ let data = this.db.run(query, params)
45
+ return data
46
+ }catch(err){
47
+ if(this.opts.SHOW_ERRORS) console.error(err.display || err.message)
48
+ }
36
49
  }
37
- return db
38
- }
39
50
 
40
- async function printQuery(query, params = {}) {
41
- try{
42
- if(!initDone) {
43
- initDone = true
44
- await createTable()
51
+ async getEmbeddings(text){
52
+ let pipe = this.opts._pipe
53
+ if(!pipe) {
54
+ this.opts._pipe = await pipeline("feature-extraction", this.opts.MODEL_NAME, {
55
+ progress_callback:(progress) => {
56
+ if(this.opts.PROGRESS_CALLBACK) return this.opts.PROGRESS_CALLBACK(progress);
57
+ if(this.opts.SHOW_PROGRESS && progress.status === "progress_total"){
58
+ process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
59
+ }
60
+ },
61
+ device: this.opts.DEVICE,
62
+ dtype: this.opts.DTYPE
63
+ });
64
+ pipe = this.opts._pipe
45
65
  }
46
- }catch(err) {
47
- //console.log('CREATE TABLE ERROR', err)
66
+ const embedding = await pipe(text, { pooling: "mean", normalize: true });
67
+ return Array.from(embedding.data)
48
68
  }
49
- try {
50
- let data = getDb().run(query, params)
51
- return data
52
- }catch(err){
53
- if(config.SHOW_ERRORS) console.error(err.display || err.message)
69
+
70
+ async createTable() {
71
+ // create table
72
+ let tableCreated = await this.printQuery(`:create embeddings {id: String, category: String => v: <F32; ${this.opts.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
73
+ if(tableCreated){
74
+ // create vector index
75
+ let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
76
+ dim: ${this.opts.VECTOR_SIZE},
77
+ m: 50,
78
+ dtype: F32,
79
+ fields: [v],
80
+ distance: L2, # Cosine, IP
81
+ ef_construction:50, # number of nearest neighbors
82
+ extend_candidates: false, # include nearest neighbors of the nearest neighbors
83
+ keep_pruned_connections: false,
84
+ }`)
85
+ return tableCreated && indexCreated
86
+ }
87
+ return false
54
88
  }
55
- }
56
89
 
57
- export const getEmbeddings = async (text) => {
58
- let pipe = config._pipe
59
- if(!pipe) {
60
- config._pipe = await pipeline("feature-extraction", config.MODEL_NAME, {
61
- progress_callback:(progress) => {
62
- if(config.PROGRESS_CALLBACK) return config.PROGRESS_CALLBACK();
63
- if(progress.status === "progress_total"){
64
- process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
65
- }
66
- },
67
- device: config.DEVICE,
68
- dtype: config.DTYPE
69
- });
70
- pipe = config._pipe
90
+ async add(input, result, data={}, category="") {
91
+ if(!input || !result) return
92
+ input = this.sanitizeString(input)
93
+ result = this.sanitizeString(result)
94
+ const embedding = await this.getEmbeddings(input)
95
+ let id = data.id || Math.random().toString().substring(2)
96
+ return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
97
+ :put embeddings {id, category => v, input, result, data}
98
+ `)
71
99
  }
72
- const embedding = await pipe(text, { pooling: "mean", normalize: true });
73
- return Array.from(embedding.data)
74
- }
75
100
 
76
- export const createTable = async () => {
77
- // create table (id, v, input, result, data)
78
- let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
79
- if(tableCreated){
80
- // create index
81
- let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
82
- dim: ${config.VECTOR_SIZE},
83
- m: 50,
84
- dtype: F32,
85
- fields: [v],
86
- distance: L2, # Cosine, IP
87
- ef_construction:50, # number of nearest neighbors
88
- extend_candidates: false, # include nearest neighbors of the nearest neighbors
89
- keep_pruned_connections: false,
90
- }`)
91
- return tableCreated && indexCreated
101
+ sanitizeString(str){
102
+ return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
92
103
  }
93
- return false
94
- }
95
104
 
96
- export const add = async (input, result, data={}, category="") => {
97
- if(!input || !result) return
98
- input = sanitizeString(input)
99
- result = sanitizeString(result)
100
- const embedding = await getEmbeddings(input)
101
- let id = data.id || Math.random().toString().substring(2)
102
- return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
103
- :put embeddings {id, category => v, input, result, data}
104
- `)
105
- }
105
+ /**
106
+ *
107
+ * Batch array:
108
+ * [{input:"", result:"", data:{}}]
109
+ *
110
+ * @param {Array} batch
111
+ * @returns
112
+ */
113
+ async addBatch(batch, opts={onProgress:null}) {
114
+ if(!batch || !Array.isArray(batch)) return
115
+ let vectorBatch = []
116
+ for(let i=0;i<batch.length; i++){
117
+ let {input, result, data, category} = batch[i]
118
+
119
+ if(!input || !result) continue
120
+ if(!data) data = {}
121
+ if(!category) category = ''
122
+ const embedding = await this.getEmbeddings(input)
123
+ batch[i].embedding = embedding
124
+ let item = ''
125
+ if(i == 0) {
126
+ item += `?[id, v, input, result, data, category] <- [`
127
+ }
106
128
 
107
- /**
108
- *
109
- * Batch array:
110
- * [{input:"", result:"", data:{}}]
111
- *
112
- * @param {Array} batch
113
- * @returns
114
- */
115
- export const addBatch = async (batch, opts={onProgress:null}) => {
116
- if(!batch || !Array.isArray(batch)) return
117
- let vectorBatch = []
118
- for(let i=0;i<batch.length; i++){
119
- let {input, result, data, category} = batch[i]
120
-
121
- if(!input || !result) continue
122
- if(!data) data = {}
123
- if(!category) category = ''
124
- const embedding = await getEmbeddings(input)
125
- batch[i].embedding = embedding
126
- let item = ''
127
- if(i == 0) {
128
- item += `?[id, v, input, result, data, category] <- [`
129
+ input = this.sanitizeString(input)
130
+ result = this.sanitizeString(result)
131
+
132
+ let id = data?.id ? data.id : Math.random().toString().substring(2)
133
+ item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
134
+ if(i == batch.length-1) {
135
+ item += `]
136
+ :put embeddings {id, category => v, input, result, data}`
137
+ }
138
+ vectorBatch.push(item)
139
+
140
+ if(opts.onProgress && typeof opts.onProgress == 'function') {
141
+ await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
142
+ }
129
143
  }
144
+ return await this.printQuery(vectorBatch.join("\n"))
145
+ }
130
146
 
131
- input = sanitizeString(input)
132
- result = sanitizeString(result)
147
+ async remove(id, category="") {
148
+ if(!id || typeof id != 'string') return
149
+ id = id.replace(/[^a-zA-Z0-9]/g, '')
150
+ category = this.sanitizeString(category)
151
+ if(!id || !category) return
152
+ let results = await this.printQuery(
153
+ `?[id, category] <- [['${id}', '${category}']]
154
+ ::rm embeddings {id, category}`)
155
+ return results
156
+ }
157
+
158
+ async removeAllByCategory(category=""){
159
+ category = this.sanitizeString(category)
160
+ if(!category) return
161
+ let results
162
+ try {
163
+ results = await this.printQuery(
164
+ `?[id, category] := *embeddings{id, category}, category = "${category}"
165
+ :rm embeddings {id, category}`)
166
+ }catch(err){
167
+ console.error(err)
168
+ }
169
+ return results
170
+ }
133
171
 
134
- let id = data?.id ? data.id : Math.random().toString().substring(2)
135
- item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
136
- if(i == batch.length-1) {
137
- item += `]
138
- :put embeddings {id, category => v, input, result, data}`
172
+ async searchText(text, category="", numResults = 5, includeInput=false) {
173
+ const embedding = await this.getEmbeddings(text)
174
+ let results = await this.printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
175
+ query: q,
176
+ k: ${numResults}, # number of results
177
+ ef: 50, # number of neighbours to consider
178
+ bind_distance: dist,
179
+ filter: category==${JSON.stringify(category)},
180
+ radius: 10.0
181
+ }, q = vec(${JSON.stringify(embedding)})
182
+ :sort dist`)
183
+ return results
184
+ }
185
+
186
+ async vectorSearch(query, category='', numResults=5) {
187
+ let result = undefined
188
+ try{
189
+ result = await this.searchText(query, category, numResults)
190
+ }catch(err){
191
+ if(config.SHOW_ERRORS) console.error(err.display || err.message)
139
192
  }
140
- vectorBatch.push(item)
193
+ return result
194
+ }
141
195
 
142
- if(opts.onProgress && typeof opts.onProgress == 'function') {
143
- await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
196
+ nuke() {
197
+ return fs.unlinkSync(this.opts.DB_FILE)
198
+ }
199
+
200
+ async importFromJSONStream(fileName) {
201
+ async function jsonStream(readable, callback = async function(){}) {
202
+ readable.setEncoding('utf8');
203
+ let data = '';
204
+ for await (const chunk of readable) {
205
+ if(chunk.indexOf("\n")) {
206
+ pts = chunk.split("\n")
207
+ for(let i=0;i<pts.length; i++){
208
+ data += pts[i]
209
+ try {
210
+ let json = JSON.parse(data)
211
+ await callback(json)
212
+ json = null
213
+ data = ''
214
+ }catch(err) {}
215
+ }
216
+ }else{
217
+ data += chunk;
218
+ }
219
+ }
220
+ }
221
+ let batchSize = 40, batch = [], i=0, currentBatch = 0
222
+ let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
223
+ await jsonStream(stream, async (json) => {
224
+ if(json.input && json.result){
225
+ if(!json.data) json.data = {}
226
+ if(i % batchSize === 0){
227
+ if(batch.length) {
228
+ currentBatch = currentBatch + 1
229
+ console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
230
+ await this.addBatch(batch)
231
+ batch = []
232
+ }
233
+ }
234
+ batch.push(json)
235
+ i=i+1
236
+ }
237
+ })
238
+ if(batch.length) {
239
+ console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
240
+ await this.addBatch(batch)
144
241
  }
145
242
  }
146
- return await printQuery(vectorBatch.join("\n"))
243
+
244
+ async importFromCSVorTSV(fileName, inputHeader, resultHeader) {
245
+ if(!fileName || !fileName.includes('.')) return
246
+ let ext = fileName.split('.').pop()
247
+ ext = ext.toLowerCase()
248
+ if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
249
+ let parseOpts = {
250
+ separator: ext == 'tsv' ? '\t' : ',',
251
+ mapHeaders: ({ header, index }) => {
252
+ if(inputHeader) {
253
+ if(inputHeader == header){
254
+ return 'input'
255
+ }
256
+ }else if(index === 0){
257
+ return 'input'
258
+ }
259
+ if(resultHeader){
260
+ if(resultHeader == header){
261
+ return 'result'
262
+ }
263
+ }else if(index === 1){
264
+ return 'result'
265
+ }
266
+ return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
267
+ }
268
+ }
269
+ let fetchFromFile = async (fileName) => {
270
+ return new Promise(async (resolve, reject)=>{
271
+ let results = []
272
+ fs.createReadStream(fileName)
273
+ .pipe(csv(parseOpts))
274
+ .on('data', async (data) => {
275
+ results.push(data)
276
+ })
277
+ .on('end', () => {
278
+ console.log(`${fileName} loaded.`);
279
+ resolve(results)
280
+ }).on('error', (err) => {
281
+ console.error(err);
282
+ })
283
+ })
284
+ }
285
+
286
+ let results = await fetchFromFile(fileName)
287
+
288
+ let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
289
+ for(let i=0; i<results.length; i++){
290
+ if(i % batchSize === 0){
291
+ if(batch.length) {
292
+ currentBatch = currentBatch + 1
293
+ console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
294
+ await this.addBatch(batch)
295
+ batch = []
296
+ }
297
+ }
298
+ data = {}
299
+ dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
300
+ batch.push({
301
+ input: results[i].input,
302
+ result: results[i].result,
303
+ data
304
+ })
305
+ }
306
+ if(batch.length) {
307
+ console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
308
+ await this.addBatch(batch)
309
+ }
310
+ }
311
+
312
+ }
313
+
314
+ export const getDb = () => {
315
+ if(!recal_instance) recal_instance = new Recall()
316
+ return recal_instance.db
317
+ }
318
+
319
+ async function printQuery(query, params = {}) {
320
+ if(!recal_instance) recal_instance = new Recall()
321
+ return await recal_instance.printQuery(query, params)
322
+ }
323
+
324
+ export const getEmbeddings = async (text) => {
325
+ if(!recal_instance) recal_instance = new Recall()
326
+ return await recal_instance.getEmbeddings(text)
147
327
  }
148
328
 
149
- const sanitizeString = (str)=>{
150
- return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
329
+ export const createTable = async () => {
330
+ if(!recal_instance) recal_instance = new Recall()
331
+ return await recal_instance.createTable()
332
+ }
333
+
334
+ export const add = async (input, result, data={}, category="") => {
335
+ if(!recal_instance) recal_instance = new Recall()
336
+ return await recal_instance.add(input, result, data, category)
337
+ }
338
+
339
+ export const addBatch = async (batch, opts={onProgress:null}) => {
340
+ if(!recal_instance) recal_instance = new Recall()
341
+ return await recal_instance.addBatch(batch, opts)
151
342
  }
152
343
 
153
344
  export const remove = async (id, category="") => {
154
- if(!id || typeof id != 'string') return
155
- id = id.replace(/[^a-zA-Z0-9]/g, '')
156
- category = sanitizeString(category)
157
- if(!id || !category) return
158
- let results = await printQuery(
159
- `?[id, category] <- [['${id}', '${category}']]
160
- ::rm embeddings {id, category}`)
161
- return results
345
+ if(!recal_instance) recal_instance = new Recall()
346
+ return await recal_instance.remove(id, category)
162
347
  }
163
348
 
164
349
  export const removeAllByCategory = async (category="") => {
165
- category = sanitizeString(category)
166
- if(!category) return
167
- let results
168
- try {
169
- results = await printQuery(
170
- `?[id, category] := *embeddings{id, category}, category = "${category}"
171
- :rm embeddings {id, category}`)
172
- }catch(err){
173
- console.error(err)
174
- }
175
- return results
350
+ if(!recal_instance) recal_instance = new Recall()
351
+ return await recal_instance.removeAllByCategory(category)
176
352
  }
177
353
 
178
354
  export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
179
- const embedding = await getEmbeddings(text)
180
- let results = await printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
181
- query: q,
182
- k: ${numResults}, # number of results
183
- ef: 50, # number of neighbours to consider
184
- bind_distance: dist,
185
- filter: category==${JSON.stringify(category)},
186
- radius: 10.0
187
- }, q = vec(${JSON.stringify(embedding)})
188
- :sort -dist`)
189
- return results
355
+ if(!recal_instance) recal_instance = new Recall()
356
+ return await recal_instance.searchText(text, category, numResults, includeInput)
190
357
  }
191
358
 
192
359
  export const vectorSearch = async (query, category='', numResults=5) => {
193
- let result = undefined
194
- try{
195
- result = await searchText(query, category, numResults)
196
- }catch(err){
197
- if(config.SHOW_ERRORS) console.error(err.display || err.message)
198
- }
199
- return result
360
+ if(!recal_instance) recal_instance = new Recall()
361
+ return await recal_instance.vectorSearch(query, category, numResults)
362
+ }
363
+
364
+ export const nuke = () => {
365
+ if(!recal_instance) recal_instance = new Recall()
366
+ return recal_instance.nuke()
367
+ }
368
+
369
+ export const importFromJSONStream = async (fileName) => {
370
+ if(!recal_instance) recal_instance = new Recall()
371
+ return await recal_instance.importFromJSONStream(fileName)
372
+ }
373
+
374
+ export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
375
+ if(!recal_instance) recal_instance = new Recall()
376
+ return await recal_instance.importFromCSVorTSV(fileName, inputHeader, resultHeader)
377
+ }
378
+
379
+ async function test(){
380
+ let recall = new Recall()
381
+ recall.nuke()
382
+ await recall.add('The quick brown fox jumps over the lazy dog', 'Fox jumps over dog', {foo:"bar"})
383
+ await recall.add('History of Serbia бегинс with emperor Heraclius', 'Serbia and Roman empire', {foo:"baz"})
384
+ let resp = await recall.vectorSearch('Un animal saute par-dessus un autre animal')
385
+ return JSON.stringify(resp)
200
386
  }
201
387
 
202
388
  const cmdArgs = (list = []) => {
@@ -216,246 +402,8 @@ const cmdArgs = (list = []) => {
216
402
  return args
217
403
  }
218
404
 
219
- export const nuke = () => {
220
- return fs.unlinkSync(config.DB_FILE)
221
- }
222
-
223
- export const importFromJSONStream = async (fileName) => {
224
- async function jsonStream(readable, callback = async function(){}) {
225
- readable.setEncoding('utf8');
226
- let data = '';
227
- for await (const chunk of readable) {
228
- if(chunk.indexOf("\n")) {
229
- pts = chunk.split("\n")
230
- for(let i=0;i<pts.length; i++){
231
- data += pts[i]
232
- try {
233
- let json = JSON.parse(data)
234
- await callback(json)
235
- json = null
236
- data = ''
237
- }catch(err) {
238
- //console.error(err)
239
- }
240
- }
241
- }else{
242
- data += chunk;
243
- }
244
- }
245
- }
246
- let batchSize = 40, batch = [], i=0, currentBatch = 0
247
- let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
248
- await jsonStream(stream, async (json) => {
249
- if(json.input && json.result){
250
- if(!json.data) json.data = {}
251
- if(i % batchSize === 0){
252
- if(batch.length) {
253
- currentBatch = currentBatch + 1
254
- console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
255
- await addBatch(batch)
256
- batch = []
257
- }
258
- }
259
- batch.push(json)
260
- i=i+1
261
- }
262
- })
263
- if(batch.length) {
264
- console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
265
- await addBatch(batch)
266
- }
267
- }
268
-
269
- export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
270
- if(!fileName || !fileName.includes('.')) return
271
- let ext = fileName.split('.').pop()
272
- ext = ext.toLowerCase()
273
- if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
274
- let parseOpts = {
275
- separator: ext == 'tsv' ? '\t' : ',',
276
- mapHeaders: ({ header, index }) => {
277
- if(inputHeader) {
278
- if(inputHeader == header){
279
- return 'input'
280
- }
281
- }else if(index === 0){
282
- return 'input'
283
- }
284
- if(resultHeader){
285
- if(resultHeader == header){
286
- return 'result'
287
- }
288
- }else if(index === 1){
289
- return 'result'
290
- }
291
- return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
292
- }
293
- }
294
- let fetchFromFile = async (fileName) => {
295
- return new Promise(async (resolve, reject)=>{
296
- let results = []
297
- fs.createReadStream(fileName)
298
- .pipe(csv(parseOpts))
299
- .on('data', async (data) => {
300
- results.push(data)
301
- })
302
- .on('end', () => {
303
- console.log(`${fileName} loaded.`);
304
- resolve(results)
305
- }).on('error', (err) => {
306
- console.error(err);
307
- })
308
- })
309
- }
310
-
311
-
312
- let results = await fetchFromFile(fileName)
313
-
314
- let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
315
- for(let i=0; i<results.length; i++){
316
- if(i % batchSize === 0){
317
- if(batch.length) {
318
- currentBatch = currentBatch + 1
319
- console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
320
- await addBatch(batch)
321
- batch = []
322
- }
323
- }
324
- data = {}
325
- dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
326
- batch.push({
327
- input: results[i].input,
328
- result: results[i].result,
329
- data
330
- })
331
- }
332
- if(batch.length) {
333
- console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
334
- await addBatch(batch)
335
- }
336
- }
337
-
338
- const mcp = async () => {
339
-
340
- // Create an MCP server
341
- // const server = new McpServer({
342
- // name: "Demo",
343
- // version: "1.0.0"
344
- // });
345
-
346
- // // Add an addition tool
347
- // server.tool("add",
348
- // { a: z.number(), b: z.number() },
349
- // async ({ a, b }) => ({
350
- // content: [{ type: "text", text: String(a + b) }]
351
- // })
352
- // );
353
-
354
- // // Add a dynamic greeting resource
355
- // server.resource(
356
- // "greeting",
357
- // new ResourceTemplate("greeting://{name}", { list: undefined }),
358
- // async (uri, { name }) => ({
359
- // contents: [{
360
- // uri: uri.href,
361
- // text: `Hello, ${name}!`
362
- // }]
363
- // })
364
- // );
365
-
366
-
367
- const server = new McpServer({
368
- name: "Recall",
369
- description: "Recall provides semantic search on the local vector database.",
370
- version: "1.0.0"
371
- });
372
-
373
- // server.resource(
374
- // "echo",
375
- // new ResourceTemplate("echo://{message}", { list: undefined }),
376
- // async (uri, { message }) => ({
377
- // contents: [{
378
- // uri: uri.href,
379
- // text: `Resource echo: ${message}`
380
- // }]
381
- // })
382
- // );
383
-
384
- server.tool(
385
- "search",
386
- {
387
- text: z.string(),
388
- //numberOfResults: z.number()
389
- },
390
- async ({ text, numberOfResults }) => {
391
- if(numberOfResults && numberOfResults > 50) numberOfResults = 50
392
-
393
- let startTime = performance.now()
394
- let results = await searchText(text, numberOfResults)
395
- var timeDiff = ((performance.now() - startTime) / 1000).toFixed(2)
396
- let content = [
397
- {
398
- type: "text",
399
- text: `Sorry. Recal search didn't find anything.`
400
- }
401
- ]
402
- if(results && results.rows && results.rows.length) {
403
- // content = results.rows.map(r => {
404
- // return {
405
- // type: "text",
406
- // text: r[1]
407
- // }
408
- // })
409
- content = [{
410
- type: "text",
411
- text: `Recal search found the following results in ${timeDiff}s:`
412
- }]
413
- for(let i=0; i<results.rows.length; i++){
414
- let row = results.rows[i]
415
- content.push({
416
- type: "text",
417
- text: row[1]
418
- })
419
- // if(results.rows[2] && Object.keys(results.rows[2])){
420
- // content.push({
421
- // type: "json",
422
- // text: row[2]
423
- // })
424
- // }
425
- }
426
- }
427
-
428
- return {
429
- content
430
- }
431
- }
432
- );
433
-
434
- // server.prompt(
435
- // "echo",
436
- // { message: z.string() },
437
- // ({ message }) => ({
438
- // messages: [{
439
- // role: "user",
440
- // content: {
441
- // type: "text",
442
- // text: `Please process this message: ${message}`
443
- // }
444
- // }]
445
- // })
446
- // );
447
-
448
- // Start receiving messages on stdin and sending messages on stdout
449
- const transport = new StdioServerTransport();
450
- await server.connect(transport);
451
- }
452
-
453
- const splitSentences = (text) => {
454
- return text.replace(/([.?!])\s*(?=[A-Z])/g, "$1|").split("|")
455
- }
456
-
457
405
  const runCLI = async () => {
458
- let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
406
+ let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
459
407
  let query = args['--query'] || args['-q']
460
408
  if(args['--db']){
461
409
  config.DB_FILE = args['--db']
@@ -500,27 +448,23 @@ const runCLI = async () => {
500
448
  }else if(args['--json']){
501
449
  await importFromJSONStream(args['--json'])
502
450
  console.log('Imported.')
503
- }else if(args['--mcp'] != undefined){
504
- await mcp()
505
- console.log('MCP server running.')
506
451
  }else if(args['--test'] != undefined){
507
452
  console.log('Test: ', await test())
508
453
  }else{
509
454
  console.log('Usage:')
510
455
  console.log(args._cmd + ' --query "Foo Bar"')
511
456
  console.log("\n" + 'Options:')
512
- console.log('--query "SEARCH_STRING" - search')
513
- console.log('--limit 2 - limit number of results (used with --query)')
514
- console.log(`--add 'input|result|{"foo":"bar"}|categ' - add data`)
515
- console.log(`--remove 'id' - remove data`)
516
- console.log(`--nuke - destroy database`)
517
- console.log(`--mcp - run as MCP server (experimental)`)
518
- console.log(`--db "FILE_NAME" - database file (SQLite)`)
519
- console.log(`--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
520
- console.log('--input-header "foo" - when used with --import designates specific header column as input')
521
- console.log('--result-header "bar" - when used with --import designates specific header column as result')
522
- console.log(`--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`)
523
- console.log(`--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)`)
457
+ console.log('--query "SEARCH_STRING" - Search the database')
458
+ console.log('--limit N - Limit number of results (used with --query).')
459
+ console.log(`--add 'input|result|{"foo":"bar"}|categ' - Add a data entry.`)
460
+ console.log(`--remove 'id' - Remove data by ID.`)
461
+ console.log(`--nuke - Destroy the database.`)
462
+ console.log(`--db "FILE_NAME" - Specify database file (SQLite).`)
463
+ console.log(`--import "file.csv | file.tsv" - Import from CSV or TSV with columns: input, result, additional data.`)
464
+ console.log('--input-header "foo" - When used with --import, designate a specific header column as input.')
465
+ console.log('--result-header "bar" - When used with --import, designate a specific header column as result.')
466
+ console.log(`--json "FILE_NAME" - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.`)
467
+ console.log(`--category "CATEGORY" - Specify category when adding data and filter by it when querying (defaults to empty string).`)
524
468
  }
525
469
  }
526
470