@strav/rag 0.3.32 → 0.3.33

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +5 -5
  2. package/src/helpers.ts +27 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@strav/rag",
3
- "version": "0.3.32",
3
+ "version": "0.3.33",
4
4
  "type": "module",
5
5
  "description": "Vector retrieval framework for RAG in the Strav framework",
6
6
  "license": "MIT",
@@ -18,10 +18,10 @@
18
18
  "tsconfig.json"
19
19
  ],
20
20
  "peerDependencies": {
21
- "@strav/kernel": "0.3.32",
22
- "@strav/brain": "0.3.32",
23
- "@strav/database": "0.3.32",
24
- "@strav/cli": "0.3.32"
21
+ "@strav/kernel": "0.3.33",
22
+ "@strav/brain": "0.3.33",
23
+ "@strav/database": "0.3.33",
24
+ "@strav/cli": "0.3.33"
25
25
  },
26
26
  "scripts": {
27
27
  "test": "bun test tests/",
package/src/helpers.ts CHANGED
@@ -17,6 +17,19 @@ export interface IngestOptions {
17
17
  chunkSize?: number
18
18
  overlap?: number
19
19
  strategy?: string
20
+ /**
21
+ * Optional per-chunk sanitizer applied AFTER chunking, BEFORE
22
+ * embedding. Use to scrub PII, secrets, or prompt-injection markers
23
+ * out of untrusted source content before it lands in the vector
24
+ * store. Return `null` to drop a chunk; otherwise return the
25
+ * (possibly modified) text.
26
+ *
27
+ * The hook is the caller's escape valve — RAG cannot judge what's
28
+ * sensitive in your domain. See `docs/rag/rag.md` "Content trust
29
+ * model" for the threat surface (prompt injection at retrieval
30
+ * time, indexed PII, accidental secret indexing).
31
+ */
32
+ sanitize?: (chunk: { content: string; index: number }) => string | null | Promise<string | null>
20
33
  }
21
34
 
22
35
  export const rag = {
@@ -43,10 +56,23 @@ export const rag = {
43
56
  separators: config.chunking.separators,
44
57
  }
45
58
  const chunker = createChunker(chunkerConfig)
46
- const chunks = chunker.chunk(content)
59
+ let chunks = chunker.chunk(content)
47
60
 
48
61
  if (chunks.length === 0) return []
49
62
 
63
+ // Apply the optional sanitize hook before embedding. Drops chunks
64
+ // where the hook returns null (e.g., a chunk that's all PII).
65
+ if (options.sanitize) {
66
+ const sanitized: typeof chunks = []
67
+ for (const chunk of chunks) {
68
+ const result = await options.sanitize({ content: chunk.content, index: chunk.index })
69
+ if (result === null) continue
70
+ sanitized.push({ ...chunk, content: result })
71
+ }
72
+ chunks = sanitized
73
+ if (chunks.length === 0) return []
74
+ }
75
+
50
76
  const chunkTexts = chunks.map(c => c.content)
51
77
  let embeddings: number[][]
52
78
  try {