npm - @sjovanovic/recall.js - Versions diffs - 1.0.3 → 1.0.5 - Mend

@sjovanovic/recall.js 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -4,15 +4,11 @@
   <img alt="Recall.js is long term memory for AI apps!" src="logo.svg" />
 </p>
-Recall.js is long term memory for AI apps!
+Recall.js provides long‑term memory for AI applications. It is a JavaScript library and command‑line tool for building Retrieval‑Augmented Generation (RAG) systems, with a focus on speed, ease of use, and embeddability.
-It is a tool for building RAG (Retrieval-augmented generation) in a form of JavaScript library and command line utility focused on speed, ease of use and embeddability.
+Beyond RAG, recall.js can be used for generic semantic search, as expert memory for your AI app, or as a recommendation system. It supports multilingual embeddings out of the box, allowing you to add data in one language and query it in another.
-It is versatile and you don't have to use it exclusively for RAG, use it for generic Semantic Search, as expert memory for your AI app, as a  recommendation system, there are so many possibilities...
-Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
-Under the hood, recall.js uses sentence vector embeddings and a vector database to index and query your data. It is a light wrapper around local language models such as [MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
+Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) for feature extraction and a vector database (powered by [CozoDB](https://www.cozodb.org/)) for indexing and querying. It is a lightweight wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2).
 ## Install
@@ -20,60 +16,57 @@ Under the hood, recall.js uses sentence vector embeddings and a vector database
 ## Usage
-Console:
+### Command Line
 ```console
 recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
 recall --query "Un animal saute par-dessus un autre animal" --limit 1
 ```
-**Warning:** when this library is used for the first time, it will download a local language model MiniLM-L12-v2 which may take long time depending on your Internet connectivity. Please be patient.
+> **Note:**  When the library is used for the first time, it will download a local language model (Multilingual-MiniLM-L12-v2). This may take a while depending on your internet connection. Please be patient.
-Below is the same example in JavaScript:
+### JavaScript
 ```javascript
-import * as RECALL from '@sjovanovic/recall.js'
+import Recall from '@sjovanovic/recall.js'
 const testRecall = async () => {
-    await RECALL.addBatch([
-        {
-            input: "The quick brown fox jumps over the lazy dog",
-            result: "Fox and dog",
-            data: { foo: "bar" }
-        }
-    ])
-    // Semantic search query in different language (French) "Animal jumps over another animal"
-    let response = await RECALL.searchText("Un animal saute par-dessus un autre animal", 1)
-    console.log(response)
-}
-testRecall()
-/*
+  let config = {
+    SHOW_PROGRESS: true
+  }
+  let recall = new Recall(config)
-response:
+  await recall.addBatch([
+    {
+        input: "The quick brown fox jumps over the lazy dog",
+        result: "Fox and dog",
+        data: { foo: "bar" }
+    }
+  ])
+  // Semantic search query in different language (French) "Animal jumps over another animal"
+  let response = await recall.searchText("Un animal saute par-dessus un autre animal", 1)
+  console.log(response)
+}
+testRecall()
+```
+**Example response:**
+```json
 {
-  "headers": [
-    "dist",
-    "result",
-    "id",
-    "data"
-  ],
+  "headers": ["dist", "result", "id", "data", "category"],
   "rows": [
     [
-      0.5840495824813843, // vector similarity
+      0.6840495824813843,
       "Fox and dog",
       "08840189191373282",
-      {
-        "foo": "bar"
-      }
+      { "foo": "bar" },
+      ""
     ]
   ]
 }
-*/
 ```
 ## Options
@@ -87,17 +80,17 @@ Usage:
 recall --query "Foo Bar"
 Options:
---query "SEARCH_STRING"                - search
---limit 2                              - limit number of results (used with --query)
---add 'input|result|{"foo":"bar"}'     - add data
---remove 'id'                          - remove data
---nuke                                 - destroy database
---mcp                                  - run as MCP server
---db "FILE_NAME"                       - database file (SQLite)
---import "file.csv | file.tsv"         - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
---input-header "foo"                   - when used with --import designates specific header column as input
---result-header "bar"                  - when used with --import designates specific header column as result
---json "FILE_NAME"                     - import from file which has one json object per line: {input:"", result:"", data:{}}
+--query "SEARCH_STRING"                    - Search the database.
+--limit N                                  - Limit number of results (used with --query).
+--add 'input|result|{"foo":"bar"}|categ'   - Add a data entry.
+--remove 'id'                              - Remove data by ID.
+--nuke                                     - Destroy the database.
+--db "FILE_NAME"                           - Specify database file (SQLite).
+--import "file.csv | file.tsv"             - Import from CSV or TSV with columns: input, result, additional data.
+--input-header "foo"                       - When used with --import, designate a specific header column as input.
+--result-header "bar"                      - When used with --import, designate a specific header column as result.
+--json "FILE_NAME"                         - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.
+--category "CATEGORY"                      - Specify category when adding data and filter by it when querying (defaults to empty string).
 ```
 **Note:** When adding data, recall generates a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
@@ -105,69 +98,79 @@ Options:
 ## JavaScript API Reference
-### RECALL.config
+### Configuration
-Configuration object.
+The default configuration object is exported as `config`:
 ```javascript
 export const config = {
-    VECTOR_SIZE: 384, // number of dimensions
-    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
-    SHOW_ERRORS: true, // Show errors
-    DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
-    PATH: PATH // directory of recall.js
+  VECTOR_SIZE: 384,                                // Number of dimensions (must match the model's output)
+  MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // Model name for Transformers.js
+  SHOW_ERRORS: true,                               // Show error messages
+  DB_FILE: join(PATH, 'vector.db'),                // Path to the SQLite database file (used by CozoDB)
+  PATH: PATH,                                      // Directory of recall.js
+  DEVICE: undefined,                               // Transformers.js device
+  DTYPE: undefined,                                // Transformers.js dtype
+  PROGRESS_CALLBACK: undefined                     // Transformers.js progress callback
 }
 ```
-### RECALL.getDb()
+### Methods
+**getDb()**
-Returns reference to the CozoDB instance.
+Returns reference to the underlying CozoDB instance.
-### RECALL.getEmbeddings(text) -> Promise(Array)
+**getEmbeddings(text) -> Promise&lt;Array&gt;**
 Calculates the embeddings vector for the given text.
-### RECALL.add(input, result, data={}) -> Promise(Object)
+**add(input, result, data={}, category="") -> Promise&lt;Object&gt;**
-Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
+Adds a data entry.
-### RECALL.addBatch(batch) -> Promise(Object)
+- input – The sentence to generate embeddings from.
+- result – The string to display in search results.
+- data – Arbitrary object for additional information and references. If it contains an id property, that value will be used as the record’s unique ID.
+- category – Optional category string.
-Add data in batches (faster than using add repeteadely).
-`batch` is an Array that looks like this:
-```
-let batch = [{input:"", result:"", data:{}}]
-```
+**addBatch(batch) -> Promise&lt;Object&gt;**
-### RECALL.remove(id) -> Promise(Object)
+Adds multiple entries in a batch (more efficient than repeated add calls).
+`batch` is an array of objects whose fields mirror the parameters of `add`:
+```javascript
+let batch = [
+  { input: "", result: "", data: {}, category: "" }
+]
+```
-Remove data by id. id is a string.
+**remove(id) -> Promise&lt;Object&gt;**
-### RECALL.searchText(text, numResults = 5) ->  Promise(Object)
+Removes the record with the specified ID (string).
-Query the vector database. Accepts query text and number of results to return.
+**searchText(text, category="", numResults = 5, includeInput=false) -> Promise&lt;Object&gt;**
-### RECALL.nuke()
+Queries the vector database.
-Deletes the database.
+- text – The query text.
+- category – Optional category filter.
+- numResults – Number of results to return.
+- includeInput – If true, the original input text is included in the response.
-### RECALL.importFromJSONStream(fileName) -> Promise(object)
+**nuke()**
-Imports from readable stream or file which consists of JSON objects, one per line. e.g.
-```
-{input:"one", result:"one result", data:{"id":"123"}}
-{input:"", result:"", data:{}}
-...
-```
-This is the most efficient way to import data.
+Deletes the entire database.
-### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
+**importFromJSONStream(fileName) -> Promise&lt;Object&gt;**
-Imports from CSV or TSV file. By default fist column is used as input, second as result and remaining columns are put in the data object.
-If `inputHeader` is specified, function will try to find the column by that name and use it as input.
-If `resultHeader` is specified, function will try to find the column by that name and use it as result.
+Imports data from a readable stream or file containing one JSON object per line (JSONL). Example line format:
+```json
+{"input":"one", "result":"one result", "data":{"id":"123"}, "category":""}
+```
+This is the most efficient import method.
-### RECALL.mcp() -> Promise()
+**importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise&lt;Object&gt;**
-(Experimental)
-Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.
+Imports data from a CSV or TSV file. By default, the first column is used as input, the second as result, and the remaining columns are merged into the data object.
+If `inputHeader` is specified, the function looks for a column with that name and uses it as input.
+If `resultHeader` is specified, it looks for a column with that name and uses it as result.

package/package.json CHANGED Viewed

@@ -1,24 +1,24 @@
 {
   "name": "@sjovanovic/recall.js",
-  "version": "1.0.3",
-  "description": "Semantic search as long term memory for LLMs",
+  "version": "1.0.5",
+  "description": "Easy RAG with semantic search and long term memory",
   "main": "recall.js",
   "bin": {
     "recall": "recall.js"
   },
   "type": "module",
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1",
+    "start": "node recall.js",
+    "test": "node recall.js --test",
     "query": "node recall.js --query "
   },
   "author": "Slobodan Jovanovic",
   "license": "ISC",
   "dependencies": {
-    "@modelcontextprotocol/sdk": "^1.8.0",
-    "@themaximalist/embeddings.js": "^0.1.3",
-    "@xenova/transformers": "^2.17.2",
+    "@huggingface/transformers": "^4.2.0",
+    "@modelcontextprotocol/sdk": "^1.29.0",
     "cozo-node": "^0.7.6",
     "csv-parser": "^3.2.0",
-    "zod": "^3.24.2"
+    "zod": "^4.3.6"
   }
 }

package/recall.js CHANGED Viewed

@@ -1,429 +1,409 @@
 #!/usr/bin/env node
 import {CozoDb} from 'cozo-node'
-import embeddings from "@themaximalist/embeddings.js";
+import { pipeline } from "@huggingface/transformers";
 import csv from 'csv-parser'
 import fs from 'fs'
 import { resolve, join, dirname, sep } from 'path'
 import { fileURLToPath } from 'url'
-import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
-import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
-import { z } from "zod";
 const pathToThisFile = resolve(fileURLToPath(import.meta.url))
 const pathPassedToNode = resolve(process.argv[1])
 const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
 const PATH = dirname(pathToThisFile)
 export const config = {
-    VECTOR_SIZE: 384, // number of dimensions
-    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
+    VECTOR_SIZE: 384, // number of dimensions (must match the models output)
+    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
     SHOW_ERRORS: true, // Show errors
+    SHOW_PROGRESS: false, // Show model loading progress in the console
     DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
-    PATH: PATH // directory of recall.js
+    PATH: PATH, // directory of recall.js
+    DEVICE: undefined, // Transformers.js device
+    DTYPE: undefined, // Transformers.js dtype
+    PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
 }
+var recal_instance = null
-var db = null, initDone = false
-export const getDb = () => {
-    if(!db) {
-        db = new CozoDb('sqlite', config.DB_FILE)
+export class Recall {
+    constructor(opts = {}){
+        this.opts = {
+            ...config,
+            ...opts
+        }
+        this.initDone = false
+        this.db = new CozoDb('sqlite', this.opts.DB_FILE)
+    }
+    async printQuery(query, params = {}) {
+        try{
+            if(!this.initDone) {
+                this.initDone = true
+                await this.createTable()
+            }
+        }catch(err) {}
+        try {
+            let data = this.db.run(query, params)
+            return data
+        }catch(err){
+            if(this.opts.SHOW_ERRORS) console.error(err.display || err.message)
+        }
     }
-    return db
-}
-async function printQuery(query, params = {}) {
-    try{
-        if(!initDone) {
-            initDone = true
-            await createTable()
+    async getEmbeddings(text){
+        let pipe = this.opts._pipe
+        if(!pipe) {
+            this.opts._pipe = await pipeline("feature-extraction", this.opts.MODEL_NAME, {
+                progress_callback:(progress) => {
+                    if(this.opts.PROGRESS_CALLBACK) return this.opts.PROGRESS_CALLBACK(progress);
+                    if(this.opts.SHOW_PROGRESS && progress.status === "progress_total"){
+                        process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
+                    }
+                },
+                device: this.opts.DEVICE,
+                dtype: this.opts.DTYPE
+            });
+            pipe = this.opts._pipe
         }
-    }catch(err) {
-        //console.log('CREATE TABLE ERROR', err)
+        const embedding = await pipe(text, { pooling: "mean", normalize: true });
+        return Array.from(embedding.data)
     }
-    try {
-        let data = getDb().run(query, params)
-        return data
-    }catch(err){
-        if(config.SHOW_ERRORS) console.error(err.display || err.message)
+    async createTable() {
+        // create table
+        let tableCreated = await this.printQuery(`:create embeddings {id: String, category: String => v: <F32; ${this.opts.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
+        if(tableCreated){
+            // create vector index
+            let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
+                dim: ${this.opts.VECTOR_SIZE},
+                m: 50,
+                dtype: F32,
+                fields: [v],
+                distance: L2, # Cosine, IP
+                ef_construction:50, # number of nearest neighbors
+                extend_candidates: false, # include nearest neighbors of the nearest neighbors
+                keep_pruned_connections: false,
+            }`)
+            return tableCreated && indexCreated
+        }
+        return false
     }
-}
-export const getEmbeddings = async (text) => {
-    const embedding = await embeddings(text,  {
-        service:'transformers',
-        model: config.MODEL_NAME,
-        cache_file: join(config.PATH, "cache", ".embeddings.cache.json")
-    });
-    return embedding
-}
+    async add(input, result, data={}, category="") {
+        if(!input || !result) return
+        input = this.sanitizeString(input)
+        result = this.sanitizeString(result)
+        const embedding = await this.getEmbeddings(input)
+        let id = data.id || Math.random().toString().substring(2)
+        return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
+            :put embeddings {id, category => v, input, result, data}
+        `)
+    }
-export const createTable = async () => {
-    // create table (id, v, input, result, data)
-    let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
-    if(tableCreated){
-        // create index
-        let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
-            dim: ${config.VECTOR_SIZE},
-            m: 50,
-            dtype: F32,
-            fields: [v],
-            distance: L2, # Cosine, IP
-            ef_construction:50, # number of nearest neighbors
-            extend_candidates: false, # include nearest neighbors of the nearest neighbors
-            keep_pruned_connections: false,
-        }`)
-        return tableCreated && indexCreated
+    sanitizeString(str){
+        return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
     }
-    return false
-}
-export const add = async (input, result, data={}, category="") => {
-    if(!input || !result) return
-    input = sanitizeString(input)
-    result = sanitizeString(result)
-    const embedding = await getEmbeddings(input)
-    let id = data.id || Math.random().toString().substring(2)
-    return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
-        :put embeddings {id, category => v, input, result, data}
-    `)
-}
+    /**
+     *
+     * Batch array:
+     * [{input:"", result:"", data:{}}]
+     *
+     * @param {Array} batch
+     * @returns
+     */
+    async addBatch(batch, opts={onProgress:null}) {
+        if(!batch || !Array.isArray(batch)) return
+        let vectorBatch = []
+        for(let i=0;i<batch.length; i++){
+            let {input, result, data, category} = batch[i]
+            if(!input || !result) continue
+            if(!data) data = {}
+            if(!category) category = ''
+            const embedding = await this.getEmbeddings(input)
+            batch[i].embedding = embedding
+            let item = ''
+            if(i == 0) {
+                item += `?[id, v, input, result, data, category] <- [`
+            }
-/**
- *
- * Batch array:
- * [{input:"", result:"", data:{}}]
- *
- * @param {Array} batch
- * @returns
- */
-export const addBatch = async (batch) => {
-    if(!batch || !Array.isArray(batch)) return
-    let vectorBatch = []
-    for(let i=0;i<batch.length; i++){
-        let {input, result, data, category} = batch[i]
-        if(!input || !result) continue
-        if(!data) data = {}
-        if(!category) category = ''
-        const embedding = await getEmbeddings(input)
-        batch[i].embedding = embedding
-        let item = ''
-        if(i == 0) {
-            item += `?[id, v, input, result, data, category] <- [`
-        }
+            input = this.sanitizeString(input)
+            result = this.sanitizeString(result)
-        input = sanitizeString(input)
-        result = sanitizeString(result)
+            let id = data?.id ? data.id : Math.random().toString().substring(2)
+            item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
+            if(i == batch.length-1) {
+                item += `]
+                :put embeddings {id, category => v, input, result, data}`
+            }
+            vectorBatch.push(item)
-        let id = data?.id ? data.id : Math.random().toString().substring(2)
-        item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
-        if(i == batch.length-1) {
-            item += `]
-            :put embeddings {id, category => v, input, result, data}`
+            if(opts.onProgress && typeof opts.onProgress == 'function') {
+                await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
+            }
         }
-        vectorBatch.push(item)
+        return await this.printQuery(vectorBatch.join("\n"))
     }
-    return await printQuery(vectorBatch.join("\n"))
-}
-const sanitizeString = (str)=>{
-    return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ")
-}
-export const remove = async (id, category="") => {
-    if(!id || typeof id != 'string') return
-    id.replace(/[^a-zA-Z0-9]/g, '')
-    if(!id) return
-    let results = await printQuery(
-        `?[id, category] <- [['${id}', '${category}']]
-        ::remove embeddings {id}`)
-    return results
-}
-export const searchText = async (text, category="", numResults = 5) => {
-    const embedding = await getEmbeddings(text)
-    let results = await printQuery(`?[dist, result, id, data, category] := ~embeddings:index_name { id, v, input, result, data, category |
-        query: q,
-        k: ${numResults}, # number of results
-        ef: 50, # number of neighbours to consider
-        bind_distance: dist,
-        filter: category==${JSON.stringify(category)},
-        radius: 10.0
-    }, q = vec(${JSON.stringify(embedding)})
-    :sort -dist`)
-    return results
-}
-export const vectorSearch = async (query, category='', numResults=5) => {
-    let result = undefined
-    try{
-        result = await searchText(query, category, numResults)
-    }catch(err){
-        if(config.SHOW_ERRORS) console.error(err.display || err.message)
+    async remove(id, category="") {
+        if(!id || typeof id != 'string') return
+        id = id.replace(/[^a-zA-Z0-9]/g, '')
+        category = this.sanitizeString(category)
+        if(!id || !category) return
+        let results = await this.printQuery(
+            `?[id, category] <- [['${id}', '${category}']]
+            ::rm embeddings {id, category}`)
+        return results
+    }
+    async removeAllByCategory(category=""){
+        category = this.sanitizeString(category)
+        if(!category) return
+        let results
+        try {
+            results = await this.printQuery(
+                `?[id, category] := *embeddings{id, category}, category = "${category}"
+                :rm embeddings {id, category}`)
+        }catch(err){
+            console.error(err)
+        }
+        return results
     }
-    return result
-}
-const cmdArgs = (list = []) => {
-    let args = {}, current = null
-    for(let i=0; i<process.argv.length; i++){
-        let val = process.argv[i]
-        if(current && !list.includes(val)){
-            args[current] = val
-            current = null
-        }
-        if(list.includes(val)) {
-            current = val
-            args[current] = ''
+    async searchText(text, category="", numResults = 5, includeInput=false) {
+        const embedding = await this.getEmbeddings(text)
+        let results = await this.printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
+            query: q,
+            k: ${numResults}, # number of results
+            ef: 50, # number of neighbours to consider
+            bind_distance: dist,
+            filter: category==${JSON.stringify(category)},
+            radius: 10.0
+        }, q = vec(${JSON.stringify(embedding)})
+        :sort dist`)
+        return results
+    }
+    async vectorSearch(query, category='', numResults=5) {
+        let result = undefined
+        try{
+            result = await this.searchText(query, category, numResults)
+        }catch(err){
+            if(config.SHOW_ERRORS) console.error(err.display || err.message)
         }
+        return result
     }
-    args._cmd = process.argv[1].split(sep).pop()
-    return args
-}
-export const nuke = () => {
-    return fs.unlinkSync(config.DB_FILE)
-}
+    nuke() {
+        return fs.unlinkSync(this.opts.DB_FILE)
+    }
-export const importFromJSONStream = async (fileName) => {
-    async function jsonStream(readable, callback = async function(){}) {
-        readable.setEncoding('utf8');
-        let data = '';
-        for await (const chunk of readable) {
-            if(chunk.indexOf("\n")) {
-                pts = chunk.split("\n")
-                for(let i=0;i<pts.length; i++){
-                    data += pts[i]
-                    try {
-                        let json = JSON.parse(data)
-                        await callback(json)
-                        json = null
-                        data = ''
-                    }catch(err) {
-                        //console.error(err)
+    async importFromJSONStream(fileName) {
+        async function jsonStream(readable, callback = async function(){}) {
+            readable.setEncoding('utf8');
+            let data = '';
+            for await (const chunk of readable) {
+                if(chunk.indexOf("\n")) {
+                    pts = chunk.split("\n")
+                    for(let i=0;i<pts.length; i++){
+                        data += pts[i]
+                        try {
+                            let json = JSON.parse(data)
+                            await callback(json)
+                            json = null
+                            data = ''
+                        }catch(err) {}
                     }
+                }else{
+                    data += chunk;
                 }
-            }else{
-                data += chunk;
             }
         }
-    }
-    let batchSize = 40, batch = [], i=0, currentBatch = 0
-    let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
-    await jsonStream(stream, async (json) => {
-        if(json.input && json.result){
-            if(!json.data) json.data = {}
-            if(i % batchSize === 0){
-                if(batch.length) {
-                    currentBatch = currentBatch + 1
-                    console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
-                    await addBatch(batch)
-                    batch = []
+        let batchSize = 40, batch = [], i=0, currentBatch = 0
+        let stream = typeof fileName == 'string' ? fs.createReadStream(fileName) : fileName
+        await jsonStream(stream, async (json) => {
+            if(json.input && json.result){
+                if(!json.data) json.data = {}
+                if(i % batchSize === 0){
+                    if(batch.length) {
+                        currentBatch = currentBatch + 1
+                        console.log(`Adding batch ${currentBatch} (${batch.length} items)`)
+                        await this.addBatch(batch)
+                        batch = []
+                    }
                 }
+                batch.push(json)
+                i=i+1
             }
-            batch.push(json)
-            i=i+1
+        })
+        if(batch.length) {
+            console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
+            await this.addBatch(batch)
         }
-    })
-    if(batch.length) {
-        console.log(`Adding batch ${currentBatch + 1} (${batch.length} items)`)
-        await addBatch(batch)
     }
-}
-export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
-    if(!fileName || !fileName.includes('.')) return
-    let ext = fileName.split('.').pop()
-    ext = ext.toLowerCase()
-    if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
-    let parseOpts = {
-        separator: ext == 'tsv' ? '\t' : ',',
-        mapHeaders: ({ header, index }) => {
-            if(inputHeader) {
-                if(inputHeader == header){
+    async importFromCSVorTSV(fileName, inputHeader, resultHeader) {
+        if(!fileName || !fileName.includes('.')) return
+        let ext = fileName.split('.').pop()
+        ext = ext.toLowerCase()
+        if(ext != 'csv' && ext != 'tsv') return console.log('File must have csv or tsv extension')
+        let parseOpts = {
+            separator: ext == 'tsv' ? '\t' : ',',
+            mapHeaders: ({ header, index }) => {
+                if(inputHeader) {
+                    if(inputHeader == header){
+                        return 'input'
+                    }
+                }else if(index === 0){
                     return 'input'
                 }
-            }else if(index === 0){
-                return 'input'
-            }
-            if(resultHeader){
-                if(resultHeader == header){
+                if(resultHeader){
+                    if(resultHeader == header){
+                        return 'result'
+                    }
+                }else if(index === 1){
                     return 'result'
                 }
-            }else if(index === 1){
-                return 'result'
+                return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
             }
-            return header.replaceAll(/\W/gi, '_').replaceAll(/[^a-zA-Z0-9\_]/g, '').toLowerCase()
         }
-    }
-    let fetchFromFile = async (fileName) => {
-        return new Promise(async (resolve, reject)=>{
-            let results = []
-            fs.createReadStream(fileName)
-            .pipe(csv(parseOpts))
-            .on('data', async (data) => {
-                results.push(data)
+        let fetchFromFile = async (fileName) => {
+            return new Promise(async (resolve, reject)=>{
+                let results = []
+                fs.createReadStream(fileName)
+                .pipe(csv(parseOpts))
+                .on('data', async (data) => {
+                    results.push(data)
+                })
+                .on('end', () => {
+                    console.log(`${fileName} loaded.`);
+                    resolve(results)
+                }).on('error', (err) => {
+                    console.error(err);
+                })
             })
-            .on('end', () => {
-                console.log(`${fileName} loaded.`);
-                resolve(results)
-            }).on('error', (err) => {
-                console.error(err);
-            })
-        })
-    }
+        }
-    let results = await fetchFromFile(fileName)
-    let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
-    for(let i=0; i<results.length; i++){
-        if(i % batchSize === 0){
-            if(batch.length) {
-                currentBatch = currentBatch + 1
-                console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
-                await addBatch(batch)
-                batch = []
+        let results = await fetchFromFile(fileName)
+        let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
+        for(let i=0; i<results.length; i++){
+            if(i % batchSize === 0){
+                if(batch.length) {
+                    currentBatch = currentBatch + 1
+                    console.log(`Adding batch ${currentBatch} of ${totalBatches} (${batch.length} items)`)
+                    await this.addBatch(batch)
+                    batch = []
+                }
             }
+            data = {}
+            dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
+            batch.push({
+                input: results[i].input,
+                result: results[i].result,
+                data
+            })
+        }
+        if(batch.length) {
+            console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
+            await this.addBatch(batch)
         }
-        data = {}
-        dataHeaders.forEach(k => k && results[i][k] ? data[k] = results[i][k] : null)
-        batch.push({
-            input: results[i].input,
-            result: results[i].result,
-            data
-        })
-    }
-    if(batch.length) {
-        console.log(`Adding batch ${currentBatch + 1} of ${totalBatches} (${batch.length} items)`)
-        await addBatch(batch)
     }
 }
-const mcp = async () => {
+/**
+ * Returns the `db` property of the shared module-level Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+export const getDb = () => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const { db } = recal_instance
+    return db
+}
-    // Create an MCP server
-    // const server = new McpServer({
-    //     name: "Demo",
-    //     version: "1.0.0"
-    // });
-    // // Add an addition tool
-    // server.tool("add",
-    //     { a: z.number(), b: z.number() },
-    //     async ({ a, b }) => ({
-    //         content: [{ type: "text", text: String(a + b) }]
-    //     })
-    // );
-    // // Add a dynamic greeting resource
-    // server.resource(
-    //     "greeting",
-    //     new ResourceTemplate("greeting://{name}", { list: undefined }),
-    //     async (uri, { name }) => ({
-    //     contents: [{
-    //         uri: uri.href,
-    //         text: `Hello, ${name}!`
-    //     }]
-    //     })
-    // );
-    const server = new McpServer({
-        name: "Recall",
-        description: "Recall provides semantic search on the local vector database.",
-        version: "1.0.0"
-    });
-    // server.resource(
-    //     "echo",
-    //     new ResourceTemplate("echo://{message}", { list: undefined }),
-    //     async (uri, { message }) => ({
-    //     contents: [{
-    //         uri: uri.href,
-    //         text: `Resource echo: ${message}`
-    //     }]
-    //     })
-    // );
-    server.tool(
-        "search",
-        {
-            text: z.string(),
-            //numberOfResults: z.number()
-        },
-        async ({ text, numberOfResults }) => {
-            if(numberOfResults && numberOfResults > 50) numberOfResults = 50
-            let startTime = performance.now()
-            let results = await searchText(text, numberOfResults)
-            var timeDiff = ((performance.now() - startTime) / 1000).toFixed(2)
-            let content = [
-                {
-                    type: "text",
-                    text: `Sorry. Recal search didn't find anything.`
-                }
-            ]
-            if(results && results.rows && results.rows.length) {
-                // content = results.rows.map(r => {
-                //     return {
-                //         type: "text",
-                //         text: r[1]
-                //     }
-                // })
-                content = [{
-                    type: "text",
-                    text: `Recal search found the following results in ${timeDiff}s:`
-                }]
-                for(let i=0; i<results.rows.length; i++){
-                    let row = results.rows[i]
-                    content.push({
-                        type: "text",
-                        text: row[1]
-                    })
-                    // if(results.rows[2] && Object.keys(results.rows[2])){
-                    //     content.push({
-                    //         type: "json",
-                    //         text: row[2]
-                    //     })
-                    // }
-                }
-            }
+/**
+ * Forwards `query` and `params` to `printQuery` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+async function printQuery(query, params = {}) {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.printQuery(query, params)
+    return outcome
+}
-            return {
-                content
-            }
-        }
-    );
-    // server.prompt(
-    //     "echo",
-    //     { message: z.string() },
-    //     ({ message }) => ({
-    //     messages: [{
-    //         role: "user",
-    //         content: {
-    //         type: "text",
-    //         text: `Please process this message: ${message}`
-    //         }
-    //     }]
-    //     })
-    // );
-    // Start receiving messages on stdin and sending messages on stdout
-    const transport = new StdioServerTransport();
-    await server.connect(transport);
+/**
+ * Forwards `text` to `getEmbeddings` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+export const getEmbeddings = async (text) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const embeddings = await recal_instance.getEmbeddings(text)
+    return embeddings
+}
+/**
+ * Calls `createTable` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+export const createTable = async () => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.createTable()
+    return outcome
+}
+/**
+ * Forwards an entry (`input`, `result`, optional `data` and `category`) to
+ * `add` on the shared Recall instance, constructing that instance first if
+ * none exists yet.
+ */
+export const add = async (input, result, data={}, category="") => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.add(input, result, data, category)
+    return outcome
+}
+/**
+ * Forwards `batch` and `opts` to `addBatch` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ * `opts` defaults to `{onProgress:null}`.
+ */
+export const addBatch = async (batch, opts={onProgress:null}) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.addBatch(batch, opts)
+    return outcome
+}
+/**
+ * Forwards `id` and `category` to `remove` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+export const remove = async (id, category="") => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.remove(id, category)
+    return outcome
+}
+/**
+ * Forwards `category` to `removeAllByCategory` on the shared Recall instance,
+ * constructing that instance first if none exists yet.
+ */
+export const removeAllByCategory = async (category="") => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.removeAllByCategory(category)
+    return outcome
+}
+/**
+ * Forwards a text search (`text`, `category`, `numResults`, `includeInput`)
+ * to `searchText` on the shared Recall instance, constructing that instance
+ * first if none exists yet.
+ */
+export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const matches = await recal_instance.searchText(text, category, numResults, includeInput)
+    return matches
 }
-const splitSentences = (text) => {
-    return text.replace(/([.?!])\s*(?=[A-Z])/g, "$1|").split("|")
+/**
+ * Forwards `query`, `category` and `numResults` to `vectorSearch` on the
+ * shared Recall instance, constructing that instance first if none exists yet.
+ */
+export const vectorSearch = async (query, category='', numResults=5) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const matches = await recal_instance.vectorSearch(query, category, numResults)
+    return matches
+}
+/**
+ * Calls `nuke` on the shared Recall instance (constructing that instance
+ * first if none exists yet) and returns whatever that call returns.
+ */
+export const nuke = () => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = recal_instance.nuke()
+    return outcome
+}
+/**
+ * Forwards `fileName` to `importFromJSONStream` on the shared Recall
+ * instance, constructing that instance first if none exists yet.
+ */
+export const importFromJSONStream = async (fileName) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.importFromJSONStream(fileName)
+    return outcome
+}
+/**
+ * Forwards `fileName`, `inputHeader` and `resultHeader` to
+ * `importFromCSVorTSV` on the shared Recall instance, constructing that
+ * instance first if none exists yet.
+ */
+export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) => {
+    if (!recal_instance) {
+        recal_instance = new Recall()
+    }
+    const outcome = await recal_instance.importFromCSVorTSV(fileName, inputHeader, resultHeader)
+    return outcome
+}
+/**
+ * Smoke test on a fresh Recall instance: calls `nuke()`, adds two sample
+ * entries one after the other, runs a vector search with a French query,
+ * and returns the search response serialized as a JSON string.
+ */
+async function test(){
+    const instance = new Recall()
+    instance.nuke()
+    const samples = [
+        ['The quick brown fox jumps over the lazy dog', 'Fox jumps over dog', {foo:"bar"}],
+        ['History of Serbia бегинс with emperor Heraclius', 'Serbia and Roman empire', {foo:"baz"}],
+    ]
+    // Entries are added sequentially, in the order listed.
+    for (const [input, result, data] of samples) {
+        await instance.add(input, result, data)
+    }
+    const response = await instance.vectorSearch('Un animal saute par-dessus un autre animal')
+    return JSON.stringify(response)
+}
+/**
+ * Collects command-line flags from `process.argv`.
+ *
+ * Every token found in `list` becomes a key whose value starts as '' and is
+ * replaced by the next token that is not itself in `list`. The script name
+ * (last path segment of `process.argv[1]`) is stored under `_cmd`.
+ */
+const cmdArgs = (list = []) => {
+    const parsed = {}
+    let pendingFlag = null
+    for (const token of process.argv) {
+        const isKnownFlag = list.includes(token)
+        // A non-flag token is the value of the flag seen just before it.
+        if (pendingFlag && !isKnownFlag) {
+            parsed[pendingFlag] = token
+            pendingFlag = null
+        }
+        if (isKnownFlag) {
+            pendingFlag = token
+            parsed[pendingFlag] = ''
+        }
+    }
+    parsed._cmd = process.argv[1].split(sep).pop()
+    return parsed
 }
 const runCLI = async () => {
-    let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
+    let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
     let query = args['--query'] || args['-q']
     if(args['--db']){
         config.DB_FILE = args['--db']
@@ -468,27 +448,23 @@ const runCLI = async () => {
     }else if(args['--json']){
         await importFromJSONStream(args['--json'])
         console.log('Imported.')
-    }else if(args['--mcp'] != undefined){
-        await mcp()
-        console.log('MCP server running.')
     }else if(args['--test'] != undefined){
         console.log('Test: ', await test())
     }else{
         console.log('Usage:')
         console.log(args._cmd + ' --query "Foo Bar"')
         console.log("\n" + 'Options:')
-        console.log('--query "SEARCH_STRING"                    - search')
-        console.log('--limit 2                                  - limit number of results (used with --query)')
-        console.log(`--add 'input|result|{"foo":"bar"}|categ'   - add data`)
-        console.log(`--remove 'id'                              - remove data`)
-        console.log(`--nuke                                     - destroy database`)
-        console.log(`--mcp                                      - run as MCP server (experimental)`)
-        console.log(`--db "FILE_NAME"                           - database file (SQLite)`)
-        console.log(`--import "file.csv | file.tsv"             - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
-        console.log('--input-header "foo"                       - when used with --import designates specific header column as input')
-        console.log('--result-header "bar"                      - when used with --import designates specific header column as result')
-        console.log(`--json "FILE_NAME"                         - import from file which has one json object per line: {input:"", result:"", data:{}}`)
-        console.log(`--category "CATEGORY"                      - specify category when adding data and to filter by when querying (defaults to empty string)`)
+        console.log('--query "SEARCH_STRING"                    - Search the database')
+        console.log('--limit N                                  - Limit number of results (used with --query).')
+        console.log(`--add 'input|result|{"foo":"bar"}|categ'   - Add a data entry.`)
+        console.log(`--remove 'id'                              - Remove data by ID.`)
+        console.log(`--nuke                                     - Destroy the database.`)
+        console.log(`--db "FILE_NAME"                           - Specify database file (SQLite).`)
+        console.log(`--import "file.csv | file.tsv"             - Import from CSV or TSV with columns: input, result, additional data.`)
+        console.log('--input-header "foo"                       - When used with --import, designate a specific header column as input.')
+        console.log('--result-header "bar"                      - When used with --import, designate a specific header column as result.')
+        console.log(`--json "FILE_NAME"                         - Import from a file with one JSON object per line: {input:"", result:"", data:{}}.`)
+        console.log(`--category "CATEGORY"                      - Specify category when adding data and filter by it when querying (defaults to empty string).`)
     }
 }

package/utils/sanitize.js ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Normalizes and validates a string value.
+ *
+ * The input is NFC-normalized and trimmed, then rejected when it is empty,
+ * longer than `maxChars` (measured with `String.prototype.length`), or
+ * contains control/invisible formatting characters, private-use or
+ * U+FFF0–U+FFFF code points, or unpaired surrogates.
+ *
+ * @param {string} stringValue - Value to sanitize.
+ * @param {number} [maxChars=1000] - Maximum allowed length after trimming.
+ * @returns {string} The normalized, trimmed value.
+ * @throws {Error} If the value is not a string or fails any check above.
+ */
+export function sanitizeValue(stringValue, maxChars=1000) {
+    if (typeof stringValue !== 'string') {
+        throw new Error('stringValue must be a string');
+    }
+    let sanitized = stringValue.normalize('NFC').trim();
+    // Basic validation
+    if (sanitized.length === 0) {
+        throw new Error('stringValue name cannot be empty');
+    }
+    if (sanitized.length > maxChars) {
+        throw new Error(`stringValue name too long (max ${maxChars} characters)`);
+    }
+    // Block control characters (primary security concern)
+    // This allows all other Unicode characters including emojis, Chinese, Arabic, etc.
+    if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
+        throw new Error('stringValue contains disallowed control characters');
+    }
+    // Block private use areas: the BMP range, U+FFF0–U+FFFF, and the
+    // supplementary private-use planes (U+F0000–U+10FFFF). The `u` flag is
+    // required so the astral range is matched by code point.
+    if (/[\uE000-\uF8FF\uFFF0-\uFFFF\u{F0000}-\u{10FFFF}]/u.test(sanitized)) {
+        throw new Error('stringValue contains disallowed Unicode characters');
+    }
+    // Block unpaired surrogates only. With the `u` flag the regex walks the
+    // string by code point, so well-formed pairs (emoji and other astral
+    // characters) no longer match; without it every emoji was rejected.
+    if (/[\uD800-\uDFFF]/u.test(sanitized)) {
+        throw new Error('stringValue contains invalid Unicode characters');
+    }
+    return sanitized;
+}