@sjovanovic/recall.js 1.0.1 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -6,13 +6,13 @@
6
6
 
7
7
  Recall.js is long term memory for AI apps!
8
8
 
9
- It is a generic RAG (Retrieval-augmented generation) JavaScript library and command line utility focused on speed, ease of use and embeddability.
9
+ It is a tool for building RAG (Retrieval-augmented generation) in the form of a JavaScript library and command line utility focused on speed, ease of use and embeddability.
10
10
 
11
- It is versatile: use it for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are so many possibilities.
11
+ It is versatile and you don't have to use it exclusively for RAG: it can also be used for generic Semantic Search, as expert memory for your AI app, or as a recommendation system. There are many possibilities...
12
12
 
13
13
  Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
14
14
 
15
- Under the hood, recall.js uses sentence vector embeddings and a vector database to index and query your data. It is a light wrapper around local language models such as [MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) (optionally LLMs can be used) and [CozoDB](https://www.cozodb.org/) vector database.
15
+ Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) feature extraction and a vector database to index and query your data. It is a light wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
16
16
 
17
17
  ## Install
18
18
 
@@ -20,7 +20,15 @@ Under the hood, recall.js uses sentence vector embeddings and a vector database
20
20
 
21
21
  ## Usage
22
22
 
23
- Warning: when this library is used for the first time, it will download a local language model MiniLM-L12-v2 which may take long time depending on your Internet connectivity. Please be patient.
23
+ Console:
24
+
25
+ ```console
26
+ recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
27
+ recall --query "Un animal saute par-dessus un autre animal" --limit 1
28
+ ```
29
+ **Warning:** when this library is used for the first time, it will download the local language model Multilingual-MiniLM-L12-v2, which may take a while depending on your Internet connectivity. Please be patient.
30
+
31
+ Below is the same example in JavaScript:
24
32
 
25
33
  ```javascript
26
34
 
@@ -50,16 +58,18 @@ response:
50
58
  "dist",
51
59
  "result",
52
60
  "id",
53
- "data"
61
+ "data",
62
+ "category"
54
63
  ],
55
64
  "rows": [
56
65
  [
57
- 0.5840495824813843, // vector similarity
66
+ 0.6840495824813843, // vector similarity
58
67
  "Fox and dog",
59
68
  "08840189191373282",
60
69
  {
61
70
  "foo": "bar"
62
- }
71
+ },
72
+ ""
63
73
  ]
64
74
  ]
65
75
  }
@@ -68,105 +78,102 @@ response:
68
78
 
69
79
  ```
70
80
 
71
- Here's how the above example looks like in CLI:
72
-
73
- ```log
74
- recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
75
- recall --query "Un animal saute par-dessus un autre animal" --limit 1
76
- ```
77
-
78
81
  ## Options
79
82
 
80
- Easiest way to get all the options is via command line:
83
+ An easy way to view all the options is via the command line:
81
84
 
82
- ```log
85
+ ```console
83
86
  recall --help
84
87
 
85
88
  Usage:
86
- recall --query "Foo Bar"
89
+ recall.js --query "Foo Bar"
87
90
 
88
91
  Options:
89
- --query "SEARCH_STRING" - search
90
- --limit 2 - limit number of results (used with --query)
91
- --add 'input|result|{"foo":"bar"}' - add data
92
- --remove 'id' - remove data
93
- --nuke - destroy database
94
- --mcp - run as MCP server
95
- --db "FILE_NAME" - database file (SQLite)
96
- --import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
97
- --input-header "foo" - when used with --import designates specific header column as input
98
- --result-header "bar" - when used with --import designates specific header column as result
99
- --json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
92
+ --query "SEARCH_STRING" - search
93
+ --limit 2 - limit number of results (used with --query)
94
+ --add 'input|result|{"foo":"bar"}|categ' - add data
95
+ --remove 'id' - remove data
96
+ --nuke - destroy database
97
+ --mcp - run as MCP server (experimental)
98
+ --db "FILE_NAME" - database file (SQLite)
99
+ --import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
100
+ --input-header "foo" - when used with --import designates specific header column as input
101
+ --result-header "bar" - when used with --import designates specific header column as result
102
+ --json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}
103
+ --category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)
100
104
  ```
101
105
 
102
- Note when adding data recall will generate unique id automatically. To set custom id add it as a string property named "id" in the data object (i.e. `{"id":"customID"}`).
106
+ **Note:** when adding data, recall will generate a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
103
107
 
104
108
 
105
109
  ## JavaScript API Reference
106
110
 
107
- **RECALL.config**
111
+ ### RECALL.config
108
112
 
109
113
  Configuration object.
110
114
 
111
115
  ```javascript
112
116
  export const config = {
113
- VECTOR_SIZE: 384, // number of dimensions
114
- MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
117
+ VECTOR_SIZE: 384, // number of dimensions (must match the model's output)
118
+ MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
115
119
  SHOW_ERRORS: true, // Show errors
116
120
  DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
117
- PATH: PATH // directory of recall.js
121
+ PATH: PATH, // directory of recall.js
122
+ DEVICE: undefined, // Transformers.js device
123
+ DTYPE: undefined, // Transformers.js dtype
124
+ PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
118
125
  }
119
126
  ```
120
127
 
121
- **RECALL.getDb()**
128
+ ### RECALL.getDb()
122
129
 
123
130
  Returns reference to the CozoDB instance.
124
131
 
125
- **RECALL.getEmbeddings(text) -> Promise(Array)**
132
+ ### RECALL.getEmbeddings(text) -> Promise(Array)
126
133
 
127
134
  Given text, calculates the embeddings vector.
128
135
 
129
- **RECALL.add(input, result, data={}) -> Promise(Object)**
136
+ ### RECALL.add(input, result, data={}, category="") -> Promise(Object)
130
137
 
131
138
  Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
132
139
 
133
- **RECALL.addBatch(batch) -> Promise(Object)**
140
+ ### RECALL.addBatch(batch) -> Promise(Object)
134
141
 
135
142
  Add data in batches (faster than using add repeatedly).
136
143
  `batch` is an Array that looks like this:
137
144
  ```
138
- let batch = [{input:"", result:"", data:{}}]
145
+ let batch = [{input:"", result:"", data:{}, category:""}]
139
146
  ```
140
147
 
141
- **RECALL.remove(id) -> Promise(Object)**
148
+ ### RECALL.remove(id) -> Promise(Object)
142
149
 
143
150
  Remove data by id. id is a string.
144
151
 
145
- **RECALL.searchText(text, numResults = 5) -> Promise(Object)**
152
+ ### RECALL.searchText(text, category="", numResults = 5, includeInput=false) -> Promise(Object)
146
153
 
147
154
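  Query the vector database. Accepts the query text, a category to filter by (defaults to empty string), the number of results to return, and `includeInput` to also return the stored input text with each row.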
148
155
 
149
- **RECALL.nuke()**
156
+ ### RECALL.nuke()
150
157
 
151
158
  Deletes the database.
152
159
 
153
- **RECALL.importFromJSONStream(fileName) -> Promise(object)**
160
+ ### RECALL.importFromJSONStream(fileName) -> Promise(object)
154
161
 
155
162
  Imports from readable stream or file which consists of JSON objects, one per line. e.g.
156
163
  ```
157
- {input:"one", result:"one result", data:{"id":"123"}}
158
- {input:"", result:"", data:{}}
164
+ {input:"one", result:"one result", data:{"id":"123"}, category:""}
165
+ {input:"", result:"", data:{}, category:""}
159
166
  ...
160
167
  ```
161
168
  This is the most efficient way to import data.
162
169
 
163
- **RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()**
170
+ ### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
164
171
 
165
172
  Imports from a CSV or TSV file. By default the first column is used as input, the second as result, and the remaining columns are put in the data object.
166
173
  If `inputHeader` is specified, function will try to find the column by that name and use it as input.
167
174
  If `resultHeader` is specified, function will try to find the column by that name and use it as result.
168
175
 
169
- **RECALL.mcp() -> Promise()**
176
+ ### RECALL.mcp() -> Promise()
170
177
 
171
178
  (Experimental)
172
179
  Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
package/package.json CHANGED
@@ -1,24 +1,24 @@
1
1
  {
2
2
  "name": "@sjovanovic/recall.js",
3
- "version": "1.0.1",
4
- "description": "Semantic search as long term memory for LLMs",
3
+ "version": "1.0.4",
4
+ "description": "Easy RAG with semantic search and long term memory",
5
5
  "main": "recall.js",
6
6
  "bin": {
7
7
  "recall": "recall.js"
8
8
  },
9
9
  "type": "module",
10
10
  "scripts": {
11
+ "start": "node recall.js",
11
12
  "test": "echo \"Error: no test specified\" && exit 1",
12
13
  "query": "node recall.js --query "
13
14
  },
14
15
  "author": "Slobodan Jovanovic",
15
16
  "license": "ISC",
16
17
  "dependencies": {
17
- "@modelcontextprotocol/sdk": "^1.8.0",
18
- "@themaximalist/embeddings.js": "^0.1.3",
19
- "@xenova/transformers": "^2.17.2",
18
+ "@huggingface/transformers": "^4.2.0",
19
+ "@modelcontextprotocol/sdk": "^1.29.0",
20
20
  "cozo-node": "^0.7.6",
21
21
  "csv-parser": "^3.2.0",
22
- "zod": "^3.24.2"
22
+ "zod": "^4.3.6"
23
23
  }
24
24
  }
package/recall.js CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
  import {CozoDb} from 'cozo-node'
3
- import embeddings from "@themaximalist/embeddings.js";
3
+ import { pipeline } from "@huggingface/transformers";
4
4
  import csv from 'csv-parser'
5
5
  import fs from 'fs'
6
6
  import { resolve, join, dirname, sep } from 'path'
@@ -10,35 +10,43 @@ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mc
10
10
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
11
11
  import { z } from "zod";
12
12
 
13
+ // import {sanitizeValue} from './utils/sanitize.js'
14
+
13
15
  const pathToThisFile = resolve(fileURLToPath(import.meta.url))
14
16
  const pathPassedToNode = resolve(process.argv[1])
15
17
  const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
16
18
  const PATH = dirname(pathToThisFile)
17
19
 
18
20
  export const config = {
19
- VECTOR_SIZE: 384, // number of dimensions
20
- MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
21
+ VECTOR_SIZE: 384, // number of dimensions (must match the model's output)
22
+ MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
21
23
  SHOW_ERRORS: true, // Show errors
22
24
  DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
23
- PATH: PATH // directory of recall.js
25
+ PATH: PATH, // directory of recall.js
26
+ DEVICE: undefined, // Transformers.js device
27
+ DTYPE: undefined, // Transformers.js dtype
28
+ PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
24
29
  }
25
30
 
26
- var db = null
31
+ var db = null, initDone = false
27
32
 
28
33
  export const getDb = () => {
29
- if(!db) db = new CozoDb('sqlite', config.DB_FILE)
34
+ if(!db) {
35
+ db = new CozoDb('sqlite', config.DB_FILE)
36
+ }
30
37
  return db
31
38
  }
32
39
 
33
40
  async function printQuery(query, params = {}) {
34
- try {
35
- if(!db) {
36
- getDb()
37
- try {
38
- let isCreated = await createTable()
39
- if(isCreated) console.log('Created embeddings table.')
40
- }catch(err) {}
41
+ try{
42
+ if(!initDone) {
43
+ initDone = true
44
+ await createTable()
41
45
  }
46
+ }catch(err) {
47
+ //console.log('CREATE TABLE ERROR', err)
48
+ }
49
+ try {
42
50
  let data = getDb().run(query, params)
43
51
  return data
44
52
  }catch(err){
@@ -47,17 +55,27 @@ async function printQuery(query, params = {}) {
47
55
  }
48
56
 
49
57
  export const getEmbeddings = async (text) => {
50
- const embedding = await embeddings(text, {
51
- service:'transformers',
52
- model: config.MODEL_NAME,
53
- cache_file: join(config.PATH, "cache", ".embeddings.cache.json")
54
- });
55
- return embedding
58
+ let pipe = config._pipe
59
+ if(!pipe) {
60
+ config._pipe = await pipeline("feature-extraction", config.MODEL_NAME, {
61
+ progress_callback:(progress) => {
62
+ if(config.PROGRESS_CALLBACK) return config.PROGRESS_CALLBACK();
63
+ if(progress.status === "progress_total"){
64
+ process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
65
+ }
66
+ },
67
+ device: config.DEVICE,
68
+ dtype: config.DTYPE
69
+ });
70
+ pipe = config._pipe
71
+ }
72
+ const embedding = await pipe(text, { pooling: "mean", normalize: true });
73
+ return Array.from(embedding.data)
56
74
  }
57
75
 
58
76
  export const createTable = async () => {
59
77
  // create table (id, category, v, input, result, data)
60
- let tableCreated = await printQuery(`:create embeddings {id: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
78
+ let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
61
79
  if(tableCreated){
62
80
  // create index
63
81
  let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
@@ -67,7 +85,6 @@ export const createTable = async () => {
67
85
  fields: [v],
68
86
  distance: L2, # Cosine, IP
69
87
  ef_construction:50, # number of nearest neighbors
70
- #filter: k != 'foo', # only those rows for which the expression evaluates to true are indexed
71
88
  extend_candidates: false, # include nearest neighbors of the nearest neighbors
72
89
  keep_pruned_connections: false,
73
90
  }`)
@@ -76,21 +93,14 @@ export const createTable = async () => {
76
93
  return false
77
94
  }
78
95
 
79
- export const add = async (input, result, data={}) => {
96
+ export const add = async (input, result, data={}, category="") => {
80
97
  if(!input || !result) return
81
-
82
98
  input = sanitizeString(input)
83
99
  result = sanitizeString(result)
84
-
85
-
86
-
87
100
  const embedding = await getEmbeddings(input)
88
-
89
- console.log('Adding', input, '->', result)
90
-
91
101
  let id = data.id || Math.random().toString().substring(2)
92
- return await printQuery(`?[id, v, input, result, data] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)} ]]
93
- :put embeddings {id => v, input, result, data}
102
+ return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
103
+ :put embeddings {id, category => v, input, result, data}
94
104
  `)
95
105
  }
96
106
 
@@ -102,64 +112,91 @@ export const add = async (input, result, data={}) => {
102
112
  * @param {Array} batch
103
113
  * @returns
104
114
  */
105
- export const addBatch = async (batch) => {
115
+ export const addBatch = async (batch, opts={onProgress:null}) => {
106
116
  if(!batch || !Array.isArray(batch)) return
107
117
  let vectorBatch = []
108
118
  for(let i=0;i<batch.length; i++){
109
- let {input, result, data} = batch[i]
119
+ let {input, result, data, category} = batch[i]
110
120
 
111
121
  if(!input || !result) continue
112
122
  if(!data) data = {}
123
+ if(!category) category = ''
113
124
  const embedding = await getEmbeddings(input)
114
125
  batch[i].embedding = embedding
115
126
  let item = ''
116
127
  if(i == 0) {
117
- item += `?[id, v, input, result, data] <- [`
128
+ item += `?[id, v, input, result, data, category] <- [`
118
129
  }
119
130
 
120
131
  input = sanitizeString(input)
121
132
  result = sanitizeString(result)
122
133
 
123
134
  let id = data?.id ? data.id : Math.random().toString().substring(2)
124
- item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)} ],`
135
+ item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
125
136
  if(i == batch.length-1) {
126
137
  item += `]
127
- :put embeddings {id => v, input, result, data}`
138
+ :put embeddings {id, category => v, input, result, data}`
128
139
  }
129
140
  vectorBatch.push(item)
141
+
142
+ if(opts.onProgress && typeof opts.onProgress == 'function') {
143
+ await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
144
+ }
130
145
  }
131
146
  return await printQuery(vectorBatch.join("\n"))
132
147
  }
133
148
 
134
149
  const sanitizeString = (str)=>{
135
- return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ")
150
+ return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
136
151
  }
137
152
 
138
- export const remove = async (id) => {
153
+ export const remove = async (id, category="") => {
139
154
  if(!id || typeof id != 'string') return
140
- id.replace(/[^a-zA-Z0-9]/g, '')
141
- if(!id) return
155
+ id = id.replace(/[^a-zA-Z0-9]/g, '')
156
+ category = sanitizeString(category)
157
+ if(!id || !category) return
142
158
  let results = await printQuery(
143
- `?[id] <- [['${id}']]
144
- ::remove embeddings {id}`)
159
+ `?[id, category] <- [['${id}', '${category}']]
160
+ ::rm embeddings {id, category}`)
161
+ return results
162
+ }
163
+
164
+ export const removeAllByCategory = async (category="") => {
165
+ category = sanitizeString(category)
166
+ if(!category) return
167
+ let results
168
+ try {
169
+ results = await printQuery(
170
+ `?[id, category] := *embeddings{id, category}, category = "${category}"
171
+ :rm embeddings {id, category}`)
172
+ }catch(err){
173
+ console.error(err)
174
+ }
145
175
  return results
146
176
  }
147
177
 
148
- export const searchText = async (text, numResults = 5) => {
178
+ export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
149
179
  const embedding = await getEmbeddings(text)
150
- let results = await printQuery(`?[dist, result, id, data] := ~embeddings:index_name{ id, v, input, result, data |
180
+ let results = await printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
151
181
  query: q,
152
182
  k: ${numResults}, # number of results
153
- ef: 90, # number of neighbours to consider
183
+ ef: 50, # number of neighbours to consider
154
184
  bind_distance: dist,
185
+ filter: category==${JSON.stringify(category)},
155
186
  radius: 10.0
156
187
  }, q = vec(${JSON.stringify(embedding)})
157
- :sort dist`)
188
+ :sort -dist`)
158
189
  return results
159
190
  }
160
191
 
161
- export const vectorSearch = async (query, numResults=5) => {
162
- return await searchText(query, numResults)
192
+ export const vectorSearch = async (query, category='', numResults=5) => {
193
+ let result = undefined
194
+ try{
195
+ result = await searchText(query, category, numResults)
196
+ }catch(err){
197
+ if(config.SHOW_ERRORS) console.error(err.display || err.message)
198
+ }
199
+ return result
163
200
  }
164
201
 
165
202
  const cmdArgs = (list = []) => {
@@ -274,19 +311,6 @@ export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) =>
274
311
 
275
312
  let results = await fetchFromFile(fileName)
276
313
 
277
- // // split results to sentences
278
- // let results_raw = await fetchFromFile(fileName)
279
- // let results = []
280
- // for(let i=0;i<results_raw.length; i++){
281
- // let sentences = splitSentences(results_raw[i].input)
282
- // for(let j=0; j<sentences.length; j++){
283
- // results.push({
284
- // ...results_raw[i],
285
- // ...{ input: sentences[j] }
286
- // })
287
- // }
288
- // }
289
-
290
314
  let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
291
315
  for(let i=0; i<results.length; i++){
292
316
  if(i % batchSize === 0){
@@ -431,18 +455,22 @@ const splitSentences = (text) => {
431
455
  }
432
456
 
433
457
  const runCLI = async () => {
434
- let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit'])
458
+ let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
435
459
  let query = args['--query'] || args['-q']
436
460
  if(args['--db']){
437
461
  config.DB_FILE = args['--db']
438
462
  }
463
+ let category = ''
464
+ if(args['--category']) {
465
+ category = args['--category']
466
+ }
439
467
  if(query){
440
468
  let numResults = 5
441
469
  if(args['--limit'] && parseInt(args['--limit'])) {
442
470
  numResults = parseInt(args['--limit'])
443
471
  }
444
472
  console.time('Search time')
445
- let result = await vectorSearch(query, numResults)
473
+ let result = await vectorSearch(query, category, numResults)
446
474
  console.timeEnd('Search time')
447
475
  console.log('Results:')
448
476
  console.log(JSON.stringify(result, null, 2))
@@ -451,17 +479,17 @@ const runCLI = async () => {
451
479
  if(!input || !result) {
452
480
  console.log('Usage:')
453
481
  return console.log(args._cmd + `--add 'input|result|{"foo":"bar"}'`)
454
- }
482
+ }
455
483
  let data = {}
456
484
  if(dataString) {
457
485
  try {data = JSON.parse(dataString)}catch(err) {}
458
486
  }
459
- let resp = await add(input, result, data)
487
+ let resp = await add(input, result, data, category)
460
488
  console.log(JSON.stringify(resp, null, 2))
461
489
  }else if(args['--remove']){
462
490
  let id = args['--remove']
463
491
  if(!id) return console.log('Please specify ID to remove')
464
- let resp = await remove(id)
492
+ let resp = await remove(id, category)
465
493
  console.log(JSON.stringify(resp, null, 2))
466
494
  }else if(args['--nuke'] != undefined){
467
495
  nuke()
@@ -481,17 +509,18 @@ const runCLI = async () => {
481
509
  console.log('Usage:')
482
510
  console.log(args._cmd + ' --query "Foo Bar"')
483
511
  console.log("\n" + 'Options:')
484
- console.log('--query "SEARCH_STRING" - search')
485
- console.log('--limit 2 - limit number of results (used with --query)')
486
- console.log(`--add 'input|result|{"foo":"bar"}' - add data`)
487
- console.log(`--remove 'id' - remove data`)
488
- console.log(`--nuke - destroy database`)
489
- console.log(`--mcp - run as MCP server (experimental)`)
490
- console.log(`--db "FILE_NAME" - database file (SQLite)`)
491
- console.log(`--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
492
- console.log('--input-header "foo" - when used with --import designates specific header column as input')
493
- console.log('--result-header "bar" - when used with --import designates specific header column as result')
494
- console.log(`--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`)
512
+ console.log('--query "SEARCH_STRING" - search')
513
+ console.log('--limit 2 - limit number of results (used with --query)')
514
+ console.log(`--add 'input|result|{"foo":"bar"}|categ' - add data`)
515
+ console.log(`--remove 'id' - remove data`)
516
+ console.log(`--nuke - destroy database`)
517
+ console.log(`--mcp - run as MCP server (experimental)`)
518
+ console.log(`--db "FILE_NAME" - database file (SQLite)`)
519
+ console.log(`--import "file.csv | file.tsv" - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
520
+ console.log('--input-header "foo" - when used with --import designates specific header column as input')
521
+ console.log('--result-header "bar" - when used with --import designates specific header column as result')
522
+ console.log(`--json "FILE_NAME" - import from file which has one json object per line: {input:"", result:"", data:{}}`)
523
+ console.log(`--category "CATEGORY" - specify category when adding data and to filter by when querying (defaults to empty string)`)
495
524
  }
496
525
  }
497
526
 
@@ -0,0 +1,34 @@
1
+ export function sanitizeValue(stringValue, maxChars=1000) {
2
+ if (typeof stringValue !== 'string') {
3
+ throw new Error('stringValue must be a string');
4
+ }
5
+
6
+ let sanitized = stringValue.normalize('NFC').trim();
7
+
8
+ // Basic validation
9
+ if (sanitized.length === 0) {
10
+ throw new Error('stringValue name cannot be empty');
11
+ }
12
+
13
+ if (sanitized.length > maxChars) {
14
+ throw new Error(`stringValue name too long (max ${maxChars} characters)`);
15
+ }
16
+
17
+ // Block control characters (primary security concern)
18
+ // This allows all other Unicode characters including emojis, Chinese, Arabic, etc.
19
+ if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
20
+ throw new Error('stringValue contains disallowed control characters');
21
+ }
22
+
23
+ // Block private use areas
24
+ if (/[\uE000-\uF8FF\uFFF0-\uFFFF]/.test(sanitized)) {
25
+ throw new Error('stringValue contains disallowed Unicode characters');
26
+ }
27
+
28
+ // Block surrogate pairs (invalid alone)
29
+ if (/[\uD800-\uDFFF]/.test(sanitized)) {
30
+ throw new Error('stringValue contains invalid Unicode characters');
31
+ }
32
+
33
+ return sanitized;
34
+ }