@strav/rag 0.3.30 → 0.3.33
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +5 -5
- package/src/helpers.ts +27 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@strav/rag",
|
|
3
|
-
"version": "0.3.30",
|
|
3
|
+
"version": "0.3.33",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Vector retrieval framework for RAG in the Strav framework",
|
|
6
6
|
"license": "MIT",
|
|
@@ -18,10 +18,10 @@
|
|
|
18
18
|
"tsconfig.json"
|
|
19
19
|
],
|
|
20
20
|
"peerDependencies": {
|
|
21
|
-
"@strav/kernel": "0.3.30",
|
|
22
|
-
"@strav/brain": "0.3.30",
|
|
23
|
-
"@strav/database": "0.3.30",
|
|
24
|
-
"@strav/cli": "0.3.30"
|
|
21
|
+
"@strav/kernel": "0.3.33",
|
|
22
|
+
"@strav/brain": "0.3.33",
|
|
23
|
+
"@strav/database": "0.3.33",
|
|
24
|
+
"@strav/cli": "0.3.33"
|
|
25
25
|
},
|
|
26
26
|
"scripts": {
|
|
27
27
|
"test": "bun test tests/",
|
package/src/helpers.ts
CHANGED
|
@@ -17,6 +17,19 @@ export interface IngestOptions {
|
|
|
17
17
|
chunkSize?: number
|
|
18
18
|
overlap?: number
|
|
19
19
|
strategy?: string
|
|
20
|
+
/**
|
|
21
|
+
* Optional per-chunk sanitizer applied AFTER chunking, BEFORE
|
|
22
|
+
* embedding. Use to scrub PII, secrets, or prompt-injection markers
|
|
23
|
+
* out of untrusted source content before it lands in the vector
|
|
24
|
+
* store. Return `null` to drop a chunk; otherwise return the
|
|
25
|
+
* (possibly modified) text.
|
|
26
|
+
*
|
|
27
|
+
* The hook is the caller's escape valve — RAG cannot judge what's
|
|
28
|
+
* sensitive in your domain. See `docs/rag/rag.md` "Content trust
|
|
29
|
+
* model" for the threat surface (prompt injection at retrieval
|
|
30
|
+
* time, indexed PII, accidental secret indexing).
|
|
31
|
+
*/
|
|
32
|
+
sanitize?: (chunk: { content: string; index: number }) => string | null | Promise<string | null>
|
|
20
33
|
}
|
|
21
34
|
|
|
22
35
|
export const rag = {
|
|
@@ -43,10 +56,23 @@ export const rag = {
|
|
|
43
56
|
separators: config.chunking.separators,
|
|
44
57
|
}
|
|
45
58
|
const chunker = createChunker(chunkerConfig)
|
|
46
|
-
const chunks = chunker.chunk(content)
|
|
59
|
+
let chunks = chunker.chunk(content)
|
|
47
60
|
|
|
48
61
|
if (chunks.length === 0) return []
|
|
49
62
|
|
|
63
|
+
// Apply the optional sanitize hook before embedding. Drops chunks
|
|
64
|
+
// where the hook returns null (e.g., a chunk that's all PII).
|
|
65
|
+
if (options.sanitize) {
|
|
66
|
+
const sanitized: typeof chunks = []
|
|
67
|
+
for (const chunk of chunks) {
|
|
68
|
+
const result = await options.sanitize({ content: chunk.content, index: chunk.index })
|
|
69
|
+
if (result === null) continue
|
|
70
|
+
sanitized.push({ ...chunk, content: result })
|
|
71
|
+
}
|
|
72
|
+
chunks = sanitized
|
|
73
|
+
if (chunks.length === 0) return []
|
|
74
|
+
}
|
|
75
|
+
|
|
50
76
|
const chunkTexts = chunks.map(c => c.content)
|
|
51
77
|
let embeddings: number[][]
|
|
52
78
|
try {
|