npm - @sjovanovic/recall.js - Versions diffs - 1.0.1 → 1.0.4 - Mend

@sjovanovic/recall.js 1.0.1 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -6,13 +6,13 @@
 Recall.js is long term memory for AI apps!
-It is a generic RAG (Retrieval-augmented generation) JavaScript library and command line utility focused on speed, ease of use and embeddability.
+It is a tool for building RAG (Retrieval-augmented generation) in the form of a JavaScript library and command line utility focused on speed, ease of use and embeddability.
-It is versatile: use it for generic Semantic Search, as expert memory for your AI app, as a recommendation system, there are so many possibilities.
+It is versatile and you don't have to use it exclusively for RAG: it can also be used for generic Semantic Search, as expert memory for your AI app, or as a recommendation system. There are many possibilities...
 Recall.js supports multilingual embeddings out of the box so you can add data in one language and then query it in another.
-Under the hood, recall.js uses sentence vector embeddings and a vector database to index and query your data. It is a light wrapper around local language models such as [MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) (optionally LLMs can be used) and [CozoDB](https://www.cozodb.org/) vector database.
+Under the hood, recall.js uses [Transformers.js](https://huggingface.co/docs/transformers.js/index) feature extraction and a vector database to index and query your data. It is a light wrapper around local language models such as [Multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) and [CozoDB](https://www.cozodb.org/) vector database.
 ## Install
@@ -20,7 +20,15 @@ Under the hood, recall.js uses sentence vector embeddings and a vector database
 ## Usage
-Warning: when this library is used for the first time, it will download a local language model MiniLM-L12-v2 which may take long time depending on your Internet connectivity. Please be patient.
+Console:
+```console
+recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
+recall --query "Un animal saute par-dessus un autre animal" --limit 1
+```
+**Warning:** when this library is used for the first time, it will download a local language model Multilingual-MiniLM-L12-v2 which may take a while depending on your Internet connectivity. Please be patient.
+Below is the same example in JavaScript:
 ```javascript
@@ -50,16 +58,18 @@ response:
     "dist",
     "result",
     "id",
-    "data"
+    "data",
+    "category"
   ],
   "rows": [
     [
-      0.5840495824813843, // vector similarity
+      0.6840495824813843, // vector similarity
       "Fox and dog",
       "08840189191373282",
       {
         "foo": "bar"
-      }
+      },
+      ""
     ]
   ]
 }
@@ -68,105 +78,102 @@ response:
 ```
-Here's how the above example looks like in CLI:
-```log
-recall --add 'The quick brown fox jumps over the lazy dog|Fox|{"foo":"bar"}'
-recall --query "Un animal saute par-dessus un autre animal" --limit 1
-```
 ## Options
-Easiest way to get all the options is via command line:
+An easy way to view all the options is via the command line:
-```log
+```console
 recall --help
 Usage:
-recall --query "Foo Bar"
+recall.js --query "Foo Bar"
 Options:
---query "SEARCH_STRING"                - search
---limit 2                              - limit number of results (used with --query)
---add 'input|result|{"foo":"bar"}'     - add data
---remove 'id'                          - remove data
---nuke                                 - destroy database
---mcp                                  - run as MCP server
---db "FILE_NAME"                       - database file (SQLite)
---import "file.csv | file.tsv"         - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
---input-header "foo"                   - when used with --import designates specific header column as input
---result-header "bar"                  - when used with --import designates specific header column as result
---json "FILE_NAME"                     - import from file which has one json object per line: {input:"", result:"", data:{}}
+--query "SEARCH_STRING"                    - search
+--limit 2                                  - limit number of results (used with --query)
+--add 'input|result|{"foo":"bar"}|categ'   - add data
+--remove 'id'                              - remove data
+--nuke                                     - destroy database
+--mcp                                      - run as MCP server (experimental)
+--db "FILE_NAME"                           - database file (SQLite)
+--import "file.csv | file.tsv"             - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data
+--input-header "foo"                       - when used with --import designates specific header column as input
+--result-header "bar"                      - when used with --import designates specific header column as result
+--json "FILE_NAME"                         - import from file which has one json object per line: {input:"", result:"", data:{}}
+--category "CATEGORY"                      - specify category when adding data and to filter by when querying (defaults to empty string)
 ```
-Note when adding data recall will generate unique id automatically. To set custom id add it as a string property named "id" in the data object (i.e. `{"id":"customID"}`).
+**Note:** when adding data, recall will generate a unique id automatically. To set a custom id, add it as a string property named "id" in the data object (e.g. `{"id":"customID"}`).
 ## JavaScript API Reference
-**RECALL.config**
+### RECALL.config
 Configuration object.
 ```javascript
 export const config = {
-    VECTOR_SIZE: 384, // number of dimensions
-    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
+    VECTOR_SIZE: 384, // number of dimensions (must match the model's output)
+    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
     SHOW_ERRORS: true, // Show errors
     DB_FILE: join(PATH, 'vector.db'), // Path to the database file (SQLite file used by CozoDB)
-    PATH: PATH // directory of recall.js
+    PATH: PATH, // directory of recall.js
+    DEVICE: undefined, // Transformers.js device
+    DTYPE: undefined, // Transformers.js dtype
+    PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
 }
 ```
-**RECALL.getDb()**
+### RECALL.getDb()
 Returns reference to the CozoDB instance.
-**RECALL.getEmbeddings(text) -> Promise(Array)**
+### RECALL.getEmbeddings(text) -> Promise(Array)
 Given text calculates the embeddings vector
-**RECALL.add(input, result, data={}) -> Promise(Object)**
+### RECALL.add(input, result, data={}, category="") -> Promise(Object)
 Add data. `input` is the sentence to get embeddings from. `result` is the string to show in the results. `data` is arbitrary object intended to hold related pieces of information and references. If `data` object contains `id` property it will be used as unique id of the record.
-**RECALL.addBatch(batch) -> Promise(Object)**
+### RECALL.addBatch(batch) -> Promise(Object)
 Add data in batches (faster than using add repeatedly).
 `batch` is an Array that looks like this:
 ```
-let batch = [{input:"", result:"", data:{}}]
+let batch = [{input:"", result:"", data:{}, category:""}]
 ```
-**RECALL.remove(id) -> Promise(Object)**
+### RECALL.remove(id) -> Promise(Object)
 Remove data by id. id is a string.
-**RECALL.searchText(text, numResults = 5) ->  Promise(Object)**
+### RECALL.searchText(text, category="", numResults = 5, includeInput=false) ->  Promise(Object)
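 Query the vector database. Accepts the query text, an optional category to filter results by, the number of results to return, and a flag that adds the stored input text to each result row.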
-**RECALL.nuke()**
+### RECALL.nuke()
 Deletes the database.
-**RECALL.importFromJSONStream(fileName) -> Promise(object)**
+### RECALL.importFromJSONStream(fileName) -> Promise(object)
 Imports from readable stream or file which consists of JSON objects, one per line. e.g.
 ```
-{input:"one", result:"one result", data:{"id":"123"}}
-{input:"", result:"", data:{}}
+{input:"one", result:"one result", data:{"id":"123"}, category:""}
+{input:"", result:"", data:{}, category:""}
 ...
 ```
 This is the most efficient way to import data.
-**RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()**
+### RECALL.importFromCSVorTSV(fileName, inputHeader=null, resultHeader=null) -> Promise()
 Imports from CSV or TSV file. By default the first column is used as input, the second as result, and the remaining columns are put in the data object.
 If `inputHeader` is specified, function will try to find the column by that name and use it as input.
 If `resultHeader` is specified, function will try to find the column by that name and use it as result.
-**RECALL.mcp() -> Promise()**
+### RECALL.mcp() -> Promise()
 (Experimental)
 Runs MCP server and makes the results available when mentioning `Recall search` in the prompt. Currently only supports STDIO.

package/package.json CHANGED Viewed

@@ -1,24 +1,24 @@
 {
   "name": "@sjovanovic/recall.js",
-  "version": "1.0.1",
-  "description": "Semantic search as long term memory for LLMs",
+  "version": "1.0.4",
+  "description": "Easy RAG with semantic search and long term memory",
   "main": "recall.js",
   "bin": {
     "recall": "recall.js"
   },
   "type": "module",
   "scripts": {
+    "start": "node recall.js",
     "test": "echo \"Error: no test specified\" && exit 1",
     "query": "node recall.js --query "
   },
   "author": "Slobodan Jovanovic",
   "license": "ISC",
   "dependencies": {
-    "@modelcontextprotocol/sdk": "^1.8.0",
-    "@themaximalist/embeddings.js": "^0.1.3",
-    "@xenova/transformers": "^2.17.2",
+    "@huggingface/transformers": "^4.2.0",
+    "@modelcontextprotocol/sdk": "^1.29.0",
     "cozo-node": "^0.7.6",
     "csv-parser": "^3.2.0",
-    "zod": "^3.24.2"
+    "zod": "^4.3.6"
   }
 }

package/recall.js CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 import {CozoDb} from 'cozo-node'
-import embeddings from "@themaximalist/embeddings.js";
+import { pipeline } from "@huggingface/transformers";
 import csv from 'csv-parser'
 import fs from 'fs'
 import { resolve, join, dirname, sep } from 'path'
@@ -10,35 +10,43 @@ import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mc
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
 import { z } from "zod";
+// import {sanitizeValue} from './utils/sanitize.js'
 const pathToThisFile = resolve(fileURLToPath(import.meta.url))
 const pathPassedToNode = resolve(process.argv[1])
 const isThisFileBeingRunViaCLI = pathToThisFile.includes(pathPassedToNode) || pathPassedToNode.includes('.npm-global')
 const PATH = dirname(pathToThisFile)
 export const config = {
-    VECTOR_SIZE: 384, // number of dimensions
-    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use
+    VECTOR_SIZE: 384, // number of dimensions (must match the models output)
+    MODEL_NAME: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2', // model to use (passed to Transformers.js)
     SHOW_ERRORS: true, // Show errors
     DB_FILE: join(PATH, 'vector.db'), // Path to the datbase file (SQLite file used by CozoDB)
-    PATH: PATH // directory of recall.js
+    PATH: PATH, // directory of recall.js
+    DEVICE: undefined, // Transformers.js device
+    DTYPE: undefined, // Transformers.js dtype
+    PROGRESS_CALLBACK: undefined // Transformers.js progress_callback
 }
-var db = null
+var db = null, initDone = false
 export const getDb = () => {
-    if(!db) db = new CozoDb('sqlite', config.DB_FILE)
+    if(!db) {
+        db = new CozoDb('sqlite', config.DB_FILE)
+    }
     return db
 }
 async function printQuery(query, params = {}) {
-    try {
-        if(!db) {
-            getDb()
-            try {
-                let isCreated = await createTable()
-                if(isCreated) console.log('Created embeddings table.')
-            }catch(err) {}
+    try{
+        if(!initDone) {
+            initDone = true
+            await createTable()
         }
+    }catch(err) {
+        //console.log('CREATE TABLE ERROR', err)
+    }
+    try {
         let data = getDb().run(query, params)
         return data
     }catch(err){
@@ -47,17 +55,27 @@ async function printQuery(query, params = {}) {
 }
 export const getEmbeddings = async (text) => {
-    const embedding = await embeddings(text,  {
-        service:'transformers',
-        model: config.MODEL_NAME,
-        cache_file: join(config.PATH, "cache", ".embeddings.cache.json")
-    });
-    return embedding
+    let pipe = config._pipe
+    if(!pipe) {
+        config._pipe = await pipeline("feature-extraction", config.MODEL_NAME, {
+            progress_callback:(progress) => {
+                if(config.PROGRESS_CALLBACK) return config.PROGRESS_CALLBACK();
+                if(progress.status === "progress_total"){
+                    process.stdout.write(`\r\x1b[K✅ Loaded ${ Math.round(progress.progress)}% ${progress.name || "model"}`)
+                }
+            },
+            device: config.DEVICE,
+            dtype: config.DTYPE
+        });
+        pipe = config._pipe
+    }
+    const embedding = await pipe(text, { pooling: "mean", normalize: true });
+    return Array.from(embedding.data)
 }
 export const createTable = async () => {
     // create table (id, v, input, result, data)
-    let tableCreated = await printQuery(`:create embeddings {id: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
+    let tableCreated = await printQuery(`:create embeddings {id: String, category: String => v: <F32; ${config.VECTOR_SIZE}>, input: String, result: String, data: Json}`)
     if(tableCreated){
         // create index
         let indexCreated = await printQuery(`::hnsw create embeddings:index_name {
@@ -67,7 +85,6 @@ export const createTable = async () => {
             fields: [v],
             distance: L2, # Cosine, IP
             ef_construction:50, # number of nearest neighbors
-            #filter: k != 'foo', # only those rows for which the expression evaluates to true are indexed
             extend_candidates: false, # include nearest neighbors of the nearest neighbors
             keep_pruned_connections: false,
         }`)
@@ -76,21 +93,14 @@ export const createTable = async () => {
     return false
 }
-export const add = async (input, result, data={}) => {
+export const add = async (input, result, data={}, category="") => {
     if(!input || !result) return
     input = sanitizeString(input)
     result = sanitizeString(result)
     const embedding = await getEmbeddings(input)
-    console.log('Adding', input, '->', result)
     let id = data.id || Math.random().toString().substring(2)
-    return await printQuery(`?[id, v, input, result, data] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)} ]]
-        :put embeddings {id => v, input, result, data}
+    return await printQuery(`?[id, v, input, result, data, category] <- [["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input.replaceAll('"', "'"))}, ${JSON.stringify(result.replaceAll('"', "'"))}, ${JSON.stringify(data)}, ${JSON.stringify(category.replaceAll('"', "'"))} ]]
+        :put embeddings {id, category => v, input, result, data}
     `)
 }
@@ -102,64 +112,91 @@ export const add = async (input, result, data={}) => {
  * @param {Array} batch
  * @returns
  */
-export const addBatch = async (batch) => {
+export const addBatch = async (batch, opts={onProgress:null}) => {
     if(!batch || !Array.isArray(batch)) return
     let vectorBatch = []
     for(let i=0;i<batch.length; i++){
-        let {input, result, data} = batch[i]
+        let {input, result, data, category} = batch[i]
         if(!input || !result) continue
         if(!data) data = {}
+        if(!category) category = ''
         const embedding = await getEmbeddings(input)
         batch[i].embedding = embedding
         let item = ''
         if(i == 0) {
-            item += `?[id, v, input, result, data] <- [`
+            item += `?[id, v, input, result, data, category] <- [`
         }
         input = sanitizeString(input)
         result = sanitizeString(result)
         let id = data?.id ? data.id : Math.random().toString().substring(2)
-        item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)} ],`
+        item += `["${id}", ${JSON.stringify(embedding)}, ${JSON.stringify(input)}, ${JSON.stringify(result)}, ${JSON.stringify(data)}, ${JSON.stringify(category)} ],`
         if(i == batch.length-1) {
             item += `]
-            :put embeddings {id => v, input, result, data}`
+            :put embeddings {id, category => v, input, result, data}`
         }
         vectorBatch.push(item)
+        if(opts.onProgress && typeof opts.onProgress == 'function') {
+            await opts.onProgress({index: i+1, total:batch.length, item: batch[i], embedding, percent: Math.round((i+1) / batch.length * 100)})
+        }
     }
     return await printQuery(vectorBatch.join("\n"))
 }
 const sanitizeString = (str)=>{
-    return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ")
+    return str.replace(/[\/#$%\^&\*{}=_`~()\"]/g," ").replace(/\s{2,}/g, " ").trim()
 }
-export const remove = async (id) => {
+export const remove = async (id, category="") => {
     if(!id || typeof id != 'string') return
-    id.replace(/[^a-zA-Z0-9]/g, '')
-    if(!id) return
+    id = id.replace(/[^a-zA-Z0-9]/g, '')
+    category = sanitizeString(category)
+    if(!id || !category) return
     let results = await printQuery(
-        `?[id] <- [['${id}']]
-        ::remove embeddings {id}`)
+        `?[id, category] <- [['${id}', '${category}']]
+        ::rm embeddings {id, category}`)
+    return results
+}
+export const removeAllByCategory = async (category="") => {
+    category = sanitizeString(category)
+    if(!category) return
+    let results
+    try {
+        results = await printQuery(
+            `?[id, category] := *embeddings{id, category}, category = "${category}"
+            :rm embeddings {id, category}`)
+    }catch(err){
+        console.error(err)
+    }
     return results
 }
-export const searchText = async (text, numResults = 5) => {
+export const searchText = async (text, category="", numResults = 5, includeInput=false) => {
     const embedding = await getEmbeddings(text)
-    let results = await printQuery(`?[dist, result, id, data] := ~embeddings:index_name{ id, v, input, result, data |
+    let results = await printQuery(`?[dist, result, id, data, category${includeInput? ', input' : ''}] := ~embeddings:index_name { id, v, input, result, data, category${includeInput? ', input' : ''} |
         query: q,
         k: ${numResults}, # number of results
-        ef: 90, # number of neighbours to consider
+        ef: 50, # number of neighbours to consider
         bind_distance: dist,
+        filter: category==${JSON.stringify(category)},
         radius: 10.0
     }, q = vec(${JSON.stringify(embedding)})
-    :sort dist`)
+    :sort -dist`)
     return results
 }
-export const vectorSearch = async (query, numResults=5) => {
-    return await searchText(query, numResults)
+export const vectorSearch = async (query, category='', numResults=5) => {
+    let result = undefined
+    try{
+        result = await searchText(query, category, numResults)
+    }catch(err){
+        if(config.SHOW_ERRORS) console.error(err.display || err.message)
+    }
+    return result
 }
 const cmdArgs = (list = []) => {
@@ -274,19 +311,6 @@ export const importFromCSVorTSV = async (fileName, inputHeader, resultHeader) =>
     let results = await fetchFromFile(fileName)
-    // // split results to sentences
-    // let results_raw = await fetchFromFile(fileName)
-    // let results = []
-    // for(let i=0;i<results_raw.length; i++){
-    //     let sentences = splitSentences(results_raw[i].input)
-    //     for(let j=0; j<sentences.length; j++){
-    //         results.push({
-    //             ...results_raw[i],
-    //             ...{ input: sentences[j] }
-    //         })
-    //     }
-    // }
     let batchSize = 40, batch = [], currentBatch = 0, totalBatches = Math.ceil(results.length / batchSize), dataHeaders = Object.keys(results[results.length-1]).filter(k => k != 'input' && k != 'result'), data
     for(let i=0; i<results.length; i++){
         if(i % batchSize === 0){
@@ -431,18 +455,22 @@ const splitSentences = (text) => {
 }
 const runCLI = async () => {
-    let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit'])
+    let args = cmdArgs(['--query', '-q', '--add', '--db', '--import', '--json', '--mcp', '--nuke', '--input-header', '--result-header', '--test', '--limit', '--category'])
     let query = args['--query'] || args['-q']
     if(args['--db']){
         config.DB_FILE = args['--db']
     }
+    let category = ''
+    if(args['--category']) {
+        category = args['--category']
+    }
     if(query){
         let numResults = 5
         if(args['--limit'] && parseInt(args['--limit'])) {
             numResults = parseInt(args['--limit'])
         }
         console.time('Search time')
-        let result = await vectorSearch(query, numResults)
+        let result = await vectorSearch(query, category, numResults)
         console.timeEnd('Search time')
         console.log('Results:')
         console.log(JSON.stringify(result, null, 2))
@@ -451,17 +479,17 @@ const runCLI = async () => {
         if(!input || !result) {
             console.log('Usage:')
             return console.log(args._cmd + `--add 'input|result|{"foo":"bar"}'`)
-        }
+        }
         let data = {}
         if(dataString) {
             try {data = JSON.parse(dataString)}catch(err) {}
         }
-        let resp = await add(input, result, data)
+        let resp = await add(input, result, data, category)
         console.log(JSON.stringify(resp, null, 2))
     }else if(args['--remove']){
         let id = args['--remove']
         if(!id) return console.log('Please specify ID to remove')
-        let resp = await remove(id)
+        let resp = await remove(id, category)
         console.log(JSON.stringify(resp, null, 2))
     }else if(args['--nuke'] != undefined){
         nuke()
@@ -481,17 +509,18 @@ const runCLI = async () => {
         console.log('Usage:')
         console.log(args._cmd + ' --query "Foo Bar"')
         console.log("\n" + 'Options:')
-        console.log('--query "SEARCH_STRING"                - search')
-        console.log('--limit 2                              - limit number of results (used with --query)')
-        console.log(`--add 'input|result|{"foo":"bar"}'     - add data`)
-        console.log(`--remove 'id'                          - remove data`)
-        console.log(`--nuke                                 - destroy database`)
-        console.log(`--mcp                                  - run as MCP server (experimental)`)
-        console.log(`--db "FILE_NAME"                       - database file (SQLite)`)
-        console.log(`--import "file.csv | file.tsv"         - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
-        console.log('--input-header "foo"                   - when used with --import designates specific header column as input')
-        console.log('--result-header "bar"                  - when used with --import designates specific header column as result')
-        console.log(`--json "FILE_NAME"                     - import from file which has one json object per line: {input:"", result:"", data:{}}`)
+        console.log('--query "SEARCH_STRING"                    - search')
+        console.log('--limit 2                                  - limit number of results (used with --query)')
+        console.log(`--add 'input|result|{"foo":"bar"}|categ'   - add data`)
+        console.log(`--remove 'id'                              - remove data`)
+        console.log(`--nuke                                     - destroy database`)
+        console.log(`--mcp                                      - run as MCP server (experimental)`)
+        console.log(`--db "FILE_NAME"                           - database file (SQLite)`)
+        console.log(`--import "file.csv | file.tsv"             - import from CSV or TSV w/ columns: 1. input 2. result 3. and remaining columns are additional data`)
+        console.log('--input-header "foo"                       - when used with --import designates specific header column as input')
+        console.log('--result-header "bar"                      - when used with --import designates specific header column as result')
+        console.log(`--json "FILE_NAME"                         - import from file which has one json object per line: {input:"", result:"", data:{}}`)
+        console.log(`--category "CATEGORY"                      - specify category when adding data and to filter by when querying (defaults to empty string)`)
     }
 }

package/utils/sanitize.js ADDED Viewed

@@ -0,0 +1,34 @@
/**
 * Validate and normalize a string value.
 *
 * The value is NFC-normalized and trimmed, then rejected (by throwing) when it
 * is empty, longer than `maxChars`, or contains control/invisible formatting
 * characters, private-use or "Specials" code points, or unpaired surrogates.
 * Everything else (including emoji and other astral characters) is accepted.
 *
 * @param {string} stringValue - Raw value to sanitize.
 * @param {number} [maxChars=1000] - Maximum allowed length, measured in UTF-16
 *   code units of the normalized, trimmed value.
 * @returns {string} The normalized, trimmed value.
 * @throws {Error} If `stringValue` is not a string or fails any of the checks.
 */
export function sanitizeValue(stringValue, maxChars=1000) {
    if (typeof stringValue !== 'string') {
        throw new Error('stringValue must be a string');
    }
    let sanitized = stringValue.normalize('NFC').trim();
    // Basic validation
    if (sanitized.length === 0) {
        throw new Error('stringValue name cannot be empty');
    }
    if (sanitized.length > maxChars) {
        throw new Error(`stringValue name too long (max ${maxChars} characters)`);
    }
    // Block C0/C1 control characters plus zero-width and bidirectional
    // formatting characters (primary security concern).
    // This allows all other Unicode characters including emojis, Chinese, Arabic, etc.
    if (/[\x00-\x1F\x7F-\x9F\u200B\u200E\u200F\u202A-\u202E\u2060-\u2069\uFEFF]/.test(sanitized)) {
        throw new Error('stringValue contains disallowed control characters');
    }
    // Block the BMP private use area (U+E000-U+F8FF) and the Specials block (U+FFF0-U+FFFF)
    if (/[\uE000-\uF8FF\uFFF0-\uFFFF]/.test(sanitized)) {
        throw new Error('stringValue contains disallowed Unicode characters');
    }
    // Block unpaired surrogates only. The `u` flag makes the regex iterate by
    // code point, so a valid surrogate pair (e.g. an emoji) is seen as one
    // astral code point and does not match; without it every emoji was rejected.
    if (/[\uD800-\uDFFF]/u.test(sanitized)) {
        throw new Error('stringValue contains invalid Unicode characters');
    }
    return sanitized;
}