@oculum/scanner 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. package/dist/formatters/cli-terminal.d.ts +27 -0
  2. package/dist/formatters/cli-terminal.d.ts.map +1 -0
  3. package/dist/formatters/cli-terminal.js +412 -0
  4. package/dist/formatters/cli-terminal.js.map +1 -0
  5. package/dist/formatters/github-comment.d.ts +41 -0
  6. package/dist/formatters/github-comment.d.ts.map +1 -0
  7. package/dist/formatters/github-comment.js +306 -0
  8. package/dist/formatters/github-comment.js.map +1 -0
  9. package/dist/formatters/grouping.d.ts +52 -0
  10. package/dist/formatters/grouping.d.ts.map +1 -0
  11. package/dist/formatters/grouping.js +152 -0
  12. package/dist/formatters/grouping.js.map +1 -0
  13. package/dist/formatters/index.d.ts +9 -0
  14. package/dist/formatters/index.d.ts.map +1 -0
  15. package/dist/formatters/index.js +35 -0
  16. package/dist/formatters/index.js.map +1 -0
  17. package/dist/formatters/vscode-diagnostic.d.ts +103 -0
  18. package/dist/formatters/vscode-diagnostic.d.ts.map +1 -0
  19. package/dist/formatters/vscode-diagnostic.js +151 -0
  20. package/dist/formatters/vscode-diagnostic.js.map +1 -0
  21. package/dist/index.d.ts +52 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +648 -0
  24. package/dist/index.js.map +1 -0
  25. package/dist/layer1/comments.d.ts +8 -0
  26. package/dist/layer1/comments.d.ts.map +1 -0
  27. package/dist/layer1/comments.js +203 -0
  28. package/dist/layer1/comments.js.map +1 -0
  29. package/dist/layer1/config-audit.d.ts +8 -0
  30. package/dist/layer1/config-audit.d.ts.map +1 -0
  31. package/dist/layer1/config-audit.js +252 -0
  32. package/dist/layer1/config-audit.js.map +1 -0
  33. package/dist/layer1/entropy.d.ts +8 -0
  34. package/dist/layer1/entropy.d.ts.map +1 -0
  35. package/dist/layer1/entropy.js +500 -0
  36. package/dist/layer1/entropy.js.map +1 -0
  37. package/dist/layer1/file-flags.d.ts +7 -0
  38. package/dist/layer1/file-flags.d.ts.map +1 -0
  39. package/dist/layer1/file-flags.js +112 -0
  40. package/dist/layer1/file-flags.js.map +1 -0
  41. package/dist/layer1/index.d.ts +36 -0
  42. package/dist/layer1/index.d.ts.map +1 -0
  43. package/dist/layer1/index.js +132 -0
  44. package/dist/layer1/index.js.map +1 -0
  45. package/dist/layer1/patterns.d.ts +8 -0
  46. package/dist/layer1/patterns.d.ts.map +1 -0
  47. package/dist/layer1/patterns.js +482 -0
  48. package/dist/layer1/patterns.js.map +1 -0
  49. package/dist/layer1/urls.d.ts +8 -0
  50. package/dist/layer1/urls.d.ts.map +1 -0
  51. package/dist/layer1/urls.js +296 -0
  52. package/dist/layer1/urls.js.map +1 -0
  53. package/dist/layer1/weak-crypto.d.ts +7 -0
  54. package/dist/layer1/weak-crypto.d.ts.map +1 -0
  55. package/dist/layer1/weak-crypto.js +291 -0
  56. package/dist/layer1/weak-crypto.js.map +1 -0
  57. package/dist/layer2/ai-agent-tools.d.ts +19 -0
  58. package/dist/layer2/ai-agent-tools.d.ts.map +1 -0
  59. package/dist/layer2/ai-agent-tools.js +528 -0
  60. package/dist/layer2/ai-agent-tools.js.map +1 -0
  61. package/dist/layer2/ai-endpoint-protection.d.ts +36 -0
  62. package/dist/layer2/ai-endpoint-protection.d.ts.map +1 -0
  63. package/dist/layer2/ai-endpoint-protection.js +332 -0
  64. package/dist/layer2/ai-endpoint-protection.js.map +1 -0
  65. package/dist/layer2/ai-execution-sinks.d.ts +18 -0
  66. package/dist/layer2/ai-execution-sinks.d.ts.map +1 -0
  67. package/dist/layer2/ai-execution-sinks.js +496 -0
  68. package/dist/layer2/ai-execution-sinks.js.map +1 -0
  69. package/dist/layer2/ai-fingerprinting.d.ts +7 -0
  70. package/dist/layer2/ai-fingerprinting.d.ts.map +1 -0
  71. package/dist/layer2/ai-fingerprinting.js +654 -0
  72. package/dist/layer2/ai-fingerprinting.js.map +1 -0
  73. package/dist/layer2/ai-prompt-hygiene.d.ts +19 -0
  74. package/dist/layer2/ai-prompt-hygiene.d.ts.map +1 -0
  75. package/dist/layer2/ai-prompt-hygiene.js +356 -0
  76. package/dist/layer2/ai-prompt-hygiene.js.map +1 -0
  77. package/dist/layer2/ai-rag-safety.d.ts +21 -0
  78. package/dist/layer2/ai-rag-safety.d.ts.map +1 -0
  79. package/dist/layer2/ai-rag-safety.js +459 -0
  80. package/dist/layer2/ai-rag-safety.js.map +1 -0
  81. package/dist/layer2/ai-schema-validation.d.ts +25 -0
  82. package/dist/layer2/ai-schema-validation.d.ts.map +1 -0
  83. package/dist/layer2/ai-schema-validation.js +375 -0
  84. package/dist/layer2/ai-schema-validation.js.map +1 -0
  85. package/dist/layer2/auth-antipatterns.d.ts +20 -0
  86. package/dist/layer2/auth-antipatterns.d.ts.map +1 -0
  87. package/dist/layer2/auth-antipatterns.js +333 -0
  88. package/dist/layer2/auth-antipatterns.js.map +1 -0
  89. package/dist/layer2/byok-patterns.d.ts +12 -0
  90. package/dist/layer2/byok-patterns.d.ts.map +1 -0
  91. package/dist/layer2/byok-patterns.js +299 -0
  92. package/dist/layer2/byok-patterns.js.map +1 -0
  93. package/dist/layer2/dangerous-functions.d.ts +7 -0
  94. package/dist/layer2/dangerous-functions.d.ts.map +1 -0
  95. package/dist/layer2/dangerous-functions.js +1375 -0
  96. package/dist/layer2/dangerous-functions.js.map +1 -0
  97. package/dist/layer2/data-exposure.d.ts +16 -0
  98. package/dist/layer2/data-exposure.d.ts.map +1 -0
  99. package/dist/layer2/data-exposure.js +279 -0
  100. package/dist/layer2/data-exposure.js.map +1 -0
  101. package/dist/layer2/framework-checks.d.ts +7 -0
  102. package/dist/layer2/framework-checks.d.ts.map +1 -0
  103. package/dist/layer2/framework-checks.js +388 -0
  104. package/dist/layer2/framework-checks.js.map +1 -0
  105. package/dist/layer2/index.d.ts +58 -0
  106. package/dist/layer2/index.d.ts.map +1 -0
  107. package/dist/layer2/index.js +380 -0
  108. package/dist/layer2/index.js.map +1 -0
  109. package/dist/layer2/logic-gates.d.ts +7 -0
  110. package/dist/layer2/logic-gates.d.ts.map +1 -0
  111. package/dist/layer2/logic-gates.js +182 -0
  112. package/dist/layer2/logic-gates.js.map +1 -0
  113. package/dist/layer2/risky-imports.d.ts +7 -0
  114. package/dist/layer2/risky-imports.d.ts.map +1 -0
  115. package/dist/layer2/risky-imports.js +161 -0
  116. package/dist/layer2/risky-imports.js.map +1 -0
  117. package/dist/layer2/variables.d.ts +8 -0
  118. package/dist/layer2/variables.d.ts.map +1 -0
  119. package/dist/layer2/variables.js +152 -0
  120. package/dist/layer2/variables.js.map +1 -0
  121. package/dist/layer3/anthropic.d.ts +83 -0
  122. package/dist/layer3/anthropic.d.ts.map +1 -0
  123. package/dist/layer3/anthropic.js +1745 -0
  124. package/dist/layer3/anthropic.js.map +1 -0
  125. package/dist/layer3/index.d.ts +24 -0
  126. package/dist/layer3/index.d.ts.map +1 -0
  127. package/dist/layer3/index.js +119 -0
  128. package/dist/layer3/index.js.map +1 -0
  129. package/dist/layer3/openai.d.ts +25 -0
  130. package/dist/layer3/openai.d.ts.map +1 -0
  131. package/dist/layer3/openai.js +238 -0
  132. package/dist/layer3/openai.js.map +1 -0
  133. package/dist/layer3/package-check.d.ts +63 -0
  134. package/dist/layer3/package-check.d.ts.map +1 -0
  135. package/dist/layer3/package-check.js +508 -0
  136. package/dist/layer3/package-check.js.map +1 -0
  137. package/dist/modes/incremental.d.ts +66 -0
  138. package/dist/modes/incremental.d.ts.map +1 -0
  139. package/dist/modes/incremental.js +200 -0
  140. package/dist/modes/incremental.js.map +1 -0
  141. package/dist/tiers.d.ts +125 -0
  142. package/dist/tiers.d.ts.map +1 -0
  143. package/dist/tiers.js +234 -0
  144. package/dist/tiers.js.map +1 -0
  145. package/dist/types.d.ts +175 -0
  146. package/dist/types.d.ts.map +1 -0
  147. package/dist/types.js +50 -0
  148. package/dist/types.js.map +1 -0
  149. package/dist/utils/auth-helper-detector.d.ts +56 -0
  150. package/dist/utils/auth-helper-detector.d.ts.map +1 -0
  151. package/dist/utils/auth-helper-detector.js +360 -0
  152. package/dist/utils/auth-helper-detector.js.map +1 -0
  153. package/dist/utils/context-helpers.d.ts +96 -0
  154. package/dist/utils/context-helpers.d.ts.map +1 -0
  155. package/dist/utils/context-helpers.js +493 -0
  156. package/dist/utils/context-helpers.js.map +1 -0
  157. package/dist/utils/diff-detector.d.ts +53 -0
  158. package/dist/utils/diff-detector.d.ts.map +1 -0
  159. package/dist/utils/diff-detector.js +104 -0
  160. package/dist/utils/diff-detector.js.map +1 -0
  161. package/dist/utils/diff-parser.d.ts +80 -0
  162. package/dist/utils/diff-parser.d.ts.map +1 -0
  163. package/dist/utils/diff-parser.js +202 -0
  164. package/dist/utils/diff-parser.js.map +1 -0
  165. package/dist/utils/imported-auth-detector.d.ts +37 -0
  166. package/dist/utils/imported-auth-detector.d.ts.map +1 -0
  167. package/dist/utils/imported-auth-detector.js +251 -0
  168. package/dist/utils/imported-auth-detector.js.map +1 -0
  169. package/dist/utils/middleware-detector.d.ts +55 -0
  170. package/dist/utils/middleware-detector.d.ts.map +1 -0
  171. package/dist/utils/middleware-detector.js +260 -0
  172. package/dist/utils/middleware-detector.js.map +1 -0
  173. package/dist/utils/oauth-flow-detector.d.ts +41 -0
  174. package/dist/utils/oauth-flow-detector.d.ts.map +1 -0
  175. package/dist/utils/oauth-flow-detector.js +202 -0
  176. package/dist/utils/oauth-flow-detector.js.map +1 -0
  177. package/dist/utils/path-exclusions.d.ts +55 -0
  178. package/dist/utils/path-exclusions.d.ts.map +1 -0
  179. package/dist/utils/path-exclusions.js +222 -0
  180. package/dist/utils/path-exclusions.js.map +1 -0
  181. package/dist/utils/project-context-builder.d.ts +119 -0
  182. package/dist/utils/project-context-builder.d.ts.map +1 -0
  183. package/dist/utils/project-context-builder.js +534 -0
  184. package/dist/utils/project-context-builder.js.map +1 -0
  185. package/dist/utils/registry-clients.d.ts +93 -0
  186. package/dist/utils/registry-clients.d.ts.map +1 -0
  187. package/dist/utils/registry-clients.js +273 -0
  188. package/dist/utils/registry-clients.js.map +1 -0
  189. package/dist/utils/trpc-analyzer.d.ts +78 -0
  190. package/dist/utils/trpc-analyzer.d.ts.map +1 -0
  191. package/dist/utils/trpc-analyzer.js +297 -0
  192. package/dist/utils/trpc-analyzer.js.map +1 -0
  193. package/package.json +45 -0
  194. package/src/__tests__/benchmark/fixtures/false-positives.ts +227 -0
  195. package/src/__tests__/benchmark/fixtures/index.ts +68 -0
  196. package/src/__tests__/benchmark/fixtures/layer1/config-audit.ts +364 -0
  197. package/src/__tests__/benchmark/fixtures/layer1/hardcoded-secrets.ts +173 -0
  198. package/src/__tests__/benchmark/fixtures/layer1/high-entropy.ts +234 -0
  199. package/src/__tests__/benchmark/fixtures/layer1/index.ts +31 -0
  200. package/src/__tests__/benchmark/fixtures/layer1/sensitive-urls.ts +90 -0
  201. package/src/__tests__/benchmark/fixtures/layer1/weak-crypto.ts +197 -0
  202. package/src/__tests__/benchmark/fixtures/layer2/ai-agent-tools.ts +170 -0
  203. package/src/__tests__/benchmark/fixtures/layer2/ai-endpoint-protection.ts +418 -0
  204. package/src/__tests__/benchmark/fixtures/layer2/ai-execution-sinks.ts +189 -0
  205. package/src/__tests__/benchmark/fixtures/layer2/ai-fingerprinting.ts +316 -0
  206. package/src/__tests__/benchmark/fixtures/layer2/ai-prompt-hygiene.ts +178 -0
  207. package/src/__tests__/benchmark/fixtures/layer2/ai-rag-safety.ts +184 -0
  208. package/src/__tests__/benchmark/fixtures/layer2/ai-schema-validation.ts +434 -0
  209. package/src/__tests__/benchmark/fixtures/layer2/auth-antipatterns.ts +159 -0
  210. package/src/__tests__/benchmark/fixtures/layer2/byok-patterns.ts +112 -0
  211. package/src/__tests__/benchmark/fixtures/layer2/dangerous-functions.ts +246 -0
  212. package/src/__tests__/benchmark/fixtures/layer2/data-exposure.ts +168 -0
  213. package/src/__tests__/benchmark/fixtures/layer2/framework-checks.ts +346 -0
  214. package/src/__tests__/benchmark/fixtures/layer2/index.ts +67 -0
  215. package/src/__tests__/benchmark/fixtures/layer2/injection-vulnerabilities.ts +239 -0
  216. package/src/__tests__/benchmark/fixtures/layer2/logic-gates.ts +246 -0
  217. package/src/__tests__/benchmark/fixtures/layer2/risky-imports.ts +231 -0
  218. package/src/__tests__/benchmark/fixtures/layer2/variables.ts +167 -0
  219. package/src/__tests__/benchmark/index.ts +29 -0
  220. package/src/__tests__/benchmark/run-benchmark.ts +144 -0
  221. package/src/__tests__/benchmark/run-depth-validation.ts +206 -0
  222. package/src/__tests__/benchmark/run-real-world-test.ts +243 -0
  223. package/src/__tests__/benchmark/security-benchmark-script.ts +1737 -0
  224. package/src/__tests__/benchmark/tier-integration-script.ts +177 -0
  225. package/src/__tests__/benchmark/types.ts +144 -0
  226. package/src/__tests__/benchmark/utils/test-runner.ts +475 -0
  227. package/src/__tests__/regression/known-false-positives.test.ts +467 -0
  228. package/src/__tests__/snapshots/__snapshots__/scan-depth.test.ts.snap +178 -0
  229. package/src/__tests__/snapshots/scan-depth.test.ts +258 -0
  230. package/src/__tests__/validation/analyze-results.ts +542 -0
  231. package/src/__tests__/validation/extract-for-triage.ts +146 -0
  232. package/src/__tests__/validation/fp-deep-analysis.ts +327 -0
  233. package/src/__tests__/validation/run-validation.ts +364 -0
  234. package/src/__tests__/validation/triage-template.md +132 -0
  235. package/src/formatters/cli-terminal.ts +446 -0
  236. package/src/formatters/github-comment.ts +382 -0
  237. package/src/formatters/grouping.ts +190 -0
  238. package/src/formatters/index.ts +47 -0
  239. package/src/formatters/vscode-diagnostic.ts +243 -0
  240. package/src/index.ts +823 -0
  241. package/src/layer1/comments.ts +218 -0
  242. package/src/layer1/config-audit.ts +289 -0
  243. package/src/layer1/entropy.ts +583 -0
  244. package/src/layer1/file-flags.ts +127 -0
  245. package/src/layer1/index.ts +181 -0
  246. package/src/layer1/patterns.ts +516 -0
  247. package/src/layer1/urls.ts +334 -0
  248. package/src/layer1/weak-crypto.ts +328 -0
  249. package/src/layer2/ai-agent-tools.ts +601 -0
  250. package/src/layer2/ai-endpoint-protection.ts +387 -0
  251. package/src/layer2/ai-execution-sinks.ts +580 -0
  252. package/src/layer2/ai-fingerprinting.ts +758 -0
  253. package/src/layer2/ai-prompt-hygiene.ts +411 -0
  254. package/src/layer2/ai-rag-safety.ts +511 -0
  255. package/src/layer2/ai-schema-validation.ts +421 -0
  256. package/src/layer2/auth-antipatterns.ts +394 -0
  257. package/src/layer2/byok-patterns.ts +336 -0
  258. package/src/layer2/dangerous-functions.ts +1563 -0
  259. package/src/layer2/data-exposure.ts +315 -0
  260. package/src/layer2/framework-checks.ts +433 -0
  261. package/src/layer2/index.ts +473 -0
  262. package/src/layer2/logic-gates.ts +206 -0
  263. package/src/layer2/risky-imports.ts +186 -0
  264. package/src/layer2/variables.ts +166 -0
  265. package/src/layer3/anthropic.ts +2030 -0
  266. package/src/layer3/index.ts +130 -0
  267. package/src/layer3/package-check.ts +604 -0
  268. package/src/modes/incremental.ts +293 -0
  269. package/src/tiers.ts +318 -0
  270. package/src/types.ts +284 -0
  271. package/src/utils/auth-helper-detector.ts +443 -0
  272. package/src/utils/context-helpers.ts +535 -0
  273. package/src/utils/diff-detector.ts +135 -0
  274. package/src/utils/diff-parser.ts +272 -0
  275. package/src/utils/imported-auth-detector.ts +320 -0
  276. package/src/utils/middleware-detector.ts +333 -0
  277. package/src/utils/oauth-flow-detector.ts +246 -0
  278. package/src/utils/path-exclusions.ts +266 -0
  279. package/src/utils/project-context-builder.ts +707 -0
  280. package/src/utils/registry-clients.ts +351 -0
  281. package/src/utils/trpc-analyzer.ts +382 -0
@@ -0,0 +1,511 @@
1
+ /**
2
+ * Layer 2: RAG Data Safety Detection
3
+ * Detects data exfiltration risks in Retrieval Augmented Generation systems
4
+ *
5
+ * Covers:
6
+ * - M5.1: RAG data exfiltration (cross-tenant retrieval, raw context exposure)
7
+ * - Unscoped vector store queries
8
+ * - Raw retrieved context in responses
9
+ * - Context logging risks
10
+ */
11
+
12
+ import type { Vulnerability, VulnerabilitySeverity } from '../types'
13
+ import {
14
+ isComment,
15
+ isTestOrMockFile,
16
+ isDocumentationFile,
17
+ isScannerOrFixtureFile,
18
+ isExampleDirectory,
19
+ isLibraryCode,
20
+ } from '../utils/context-helpers'
21
+
22
+ // ============================================================================
23
+ // Context Detection
24
+ // ============================================================================
25
+
/**
 * Detects files that rely on an in-process fuzzy/text search library rather
 * than a vector store.
 *
 * Recognised markers (all case-insensitive): a single-line `import ... from`
 * for fuse.js, flexsearch, lunr, minisearch, fuzzysort or match-sorter, plus
 * `require('fuse.js')` and `new Fuse(` for Fuse.js.
 *
 * @param content - Full text of the file being scanned
 * @returns true when at least one marker occurs in `content`
 */
function isClientSideFuzzySearch(content: string): boolean {
  const libraryMarkers: RegExp[] = [
    // Fuse.js: ES import, CommonJS require, or direct construction
    /import.*from\s+['"]fuse\.js['"]/i,
    /require\s*\(\s*['"]fuse\.js['"]\s*\)/i,
    /new\s+Fuse\s*\(/i,
    // Remaining libraries are recognised by ES import only
    /import.*from\s+['"]flexsearch['"]/i,
    /import.*from\s+['"]lunr['"]/i,
    /import.*from\s+['"]minisearch['"]/i,
    /import.*from\s+['"]fuzzysort['"]/i,
    /import.*from\s+['"]match-sorter['"]/i,
  ]

  for (const marker of libraryMarkers) {
    if (marker.test(content)) {
      return true
    }
  }
  return false
}
45
+
/**
 * Decides whether a source line is an everyday "query" idiom (HTTP query
 * params, SQL strings, GraphQL/React Query hooks, ORM calls) rather than a
 * vector store lookup, so the caller can skip it.
 *
 * @param lineContent - The single line of code under inspection
 * @returns true when the line matches any of the known non-vector idioms
 */
function isGenericQueryPattern(lineContent: string): boolean {
  // Web framework request objects: req.query(...), ctx.query(...), etc.
  const httpQueryParams: RegExp[] = [
    /req\.query\s*\(/i,
    /c\.req\.query\s*\(/i,
    /ctx\.query\s*\(/i,
    /request\.query\s*\(/i,
  ]

  // URL search params and query-string parsers
  const urlParsing: RegExp[] = [
    /searchParams\.get\s*\(/i,
    /url\.searchParams/i,
    /URLSearchParams/i,
    /querystring\.parse/i,
    /qs\.parse/i,
  ]

  // .query('<SQL statement>') calls on relational database clients
  const sqlStatements: RegExp[] = [
    /\.query\s*\(\s*['"`]SELECT/i,
    /\.query\s*\(\s*['"`]INSERT/i,
    /\.query\s*\(\s*['"`]UPDATE/i,
    /\.query\s*\(\s*['"`]DELETE/i,
  ]

  // GraphQL, tRPC and React Query data-fetching hooks
  const dataFetchingHooks: RegExp[] = [
    /graphql.*query/i,
    /useQuery\s*\(/i,
    /useLazyQuery\s*\(/i,
    /trpc\.\w+\.\w+\.query/i,
    /\.useQuery\s*\(/i,
  ]

  // ORM accessors, plus .query() invoked with no arguments at all
  const ormAndEmptyCalls: RegExp[] = [
    /prisma\.\w+\.findMany/i,
    /db\.query\./i,
    /\.query\s*\(\s*\)/i,
  ]

  const nonVectorIdioms: RegExp[] = [
    ...httpQueryParams,
    ...urlParsing,
    ...sqlStatements,
    ...dataFetchingHooks,
    ...ormAndEmptyCalls,
  ]

  return nonVectorIdioms.some(idiom => idiom.test(lineContent))
}
84
+
/**
 * Looks for evidence that a file talks to a vector store.
 *
 * Most markers require a `from '<package>` import specifier, but the last
 * three (`pgvector`, `VectorStore`, `Embeddings`) match anywhere in the text,
 * not only in import statements. All matching is case-insensitive.
 *
 * @param content - Full text of the file being scanned
 * @returns true when any marker is present
 */
function hasVectorStoreImport(content: string): boolean {
  const appearsInFile = (marker: RegExp): boolean => marker.test(content)

  // Import specifiers of known vector store SDKs
  const importSpecifierMarkers: RegExp[] = [
    /from\s+['"]pinecone/i,
    /from\s+['"]@pinecone-database/i,
    /from\s+['"]weaviate/i,
    /from\s+['"]chromadb/i,
    /from\s+['"]@qdrant/i,
    /from\s+['"]qdrant/i,
    /from\s+['"]@langchain\/vectorstores/i,
    /from\s+['"]langchain\/vectorstores/i,
    /from\s+['"]faiss/i,
    /from\s+['"]milvus/i,
    /from\s+['"]@supabase.*vector/i,
  ]

  // Bare keywords accepted anywhere in the file
  const keywordMarkers: RegExp[] = [
    /pgvector/i,
    /VectorStore/i,
    /Embeddings/i,
  ]

  return importSpecifierMarkers.some(appearsInFile) || keywordMarkers.some(appearsInFile)
}
107
+
/**
 * Decides whether a file should be treated as RAG/retrieval code.
 *
 * A file qualifies only if it is not a client-side fuzzy search file and it
 * shows vector store usage. Past those two gates, either a RAG-looking file
 * path or RAG-looking content is sufficient.
 *
 * @param filePath - Path of the file being scanned
 * @param content - Full text of the file being scanned
 * @returns true when the file looks like RAG code
 */
function isRAGContextFile(filePath: string, content: string): boolean {
  // Gate: fuzzy search files are never RAG; everything else needs vector store evidence
  const usesVectorStore = !isClientSideFuzzySearch(content) && hasVectorStoreImport(content)
  if (!usesVectorStore) {
    return false
  }

  // Directory or file names that suggest RAG code
  const pathHints: RegExp[] = [
    /\/(rag|retrieval|retriever|embedding|vector|knowledge)\//i,
    /\/(search|index|indexer|embeddings?)\//i,
    /(rag|retriever|embedding|vector|knowledge).*\.(ts|js|tsx|jsx|py)$/i,
    /(search|retrieval|indexer).*\.(ts|js|tsx|jsx|py)$/i,
  ]

  // Identifiers, SDK imports and call shapes that suggest RAG code
  const contentHints: RegExp[] = [
    // Vector store vocabulary
    /VectorStore|Embeddings?|Retriever/i,
    /similaritySearch|query_engine|retriever/i,
    /vectorStore|embeddingModel|documentLoader/i,
    // Vector store / RAG framework SDKs
    /from\s+['"](?:langchain|llama[-_]?index|@pinecone|@qdrant|chromadb|weaviate)/i,
    /import.*(?:Pinecone|Chroma|Weaviate|Qdrant|Milvus|PGVector)/i,
    // Vercel AI SDK RAG helpers
    /VercelKVVectorStore|SupabaseVectorStore|createEmbedding/i,
    // Retrieval calls and result names (generic .search() is deliberately absent)
    /\.retrieve\(|\.query\(/i,
    /sourceDocuments|retrievedDocs|retrievedChunks/i,
    // Supabase / pgvector search
    /\.rpc\s*\(\s*['"`]match_documents/i,
    /pgvector|embedding.*vector/i,
  ]

  const anyMatch = (hints: RegExp[], text: string): boolean =>
    hints.some(hint => hint.test(text))

  return anyMatch(pathHints, filePath) || anyMatch(contentHints, content)
}
155
+
/**
 * Heuristically checks whether a snippet of code scopes data access to a
 * user, tenant or organisation.
 *
 * Any single match counts: a user/tenant identifier, a filter/where/metadata
 * assignment mentioning user, tenant or org, a `namespace` assignment, or a
 * call to a recognisably named access-check helper.
 *
 * @param context - The code text to inspect (a line or surrounding lines)
 * @returns true when any scoping indicator is found
 */
function hasAccessControlScoping(context: string): boolean {
  const scopingIndicators: RegExp[] = [
    // Identifiers naming the current user, tenant, org or workspace
    /userId|user_id|user\.id|currentUser/i,
    /tenantId|tenant_id|tenant\.id|orgId|org_id|workspaceId/i,
    // Query options that restrict results
    /filter\s*[:=]\s*\{[^}]*(?:user|tenant|org)/i,
    /where\s*[:=].*(?:user|tenant|org)/i,
    /metadata\s*[:=].*(?:user|tenant|org)/i,
    /namespace\s*[:=]/i,
    // Helper functions whose names imply a permission check
    /checkAccess|verifyPermission|canRead|canAccess|hasAccess/i,
    /getAuthorized|filterByUser|filterByTenant/i,
  ]

  for (const indicator of scopingIndicators) {
    if (indicator.test(context)) {
      return true
    }
  }
  return false
}
175
+
/**
 * Heuristically checks whether retrieved data is reduced or cleaned before
 * it is returned.
 *
 * @param context - The code text to inspect
 * @returns true when the text shows a field-picking `.map`, any `.filter(`
 *          call, a sanitize/redact/mask/strip keyword, or an object literal
 *          return limited to id/title/summary-style fields
 */
function hasResponseFiltering(context: string): boolean {
  // .map(... .title) / .name / .id / .metadata: projects documents to one field
  const projectsSingleField = /\.map\s*\([^)]*\.(title|name|id|metadata)\)/i
  // Any .filter( call
  const callsFilter = /\.filter\s*\(/i
  // Naming that implies scrubbing of content
  const scrubsContent = /sanitize|redact|mask|strip/i
  // return { ...id/title/summary... } not followed by "content" before the next "}"
  const returnsSafeFieldsOnly = /return\s*\{[^}]*(?:id|title|summary)[^}]*\}(?![^}]*content)/i

  return (
    projectsSingleField.test(context) ||
    callsFilter.test(context) ||
    scrubsContent.test(context) ||
    returnsSafeFieldsOnly.test(context)
  )
}
190
+
/**
 * Heuristically checks whether code shows signs of authentication.
 *
 * This is a textual match only: the mere mention of a session getter, an
 * auth guard, a request-bound user, a Bearer Authorization header or a
 * user-id identifier is treated as evidence of authentication.
 *
 * @param content - The code text to inspect
 * @returns true when any authentication indicator is found
 */
function hasAuthenticationInContext(content: string): boolean {
  const authIndicators: RegExp[] = [
    // Session / current-user lookups
    /getSession|getCurrentUser|getServerSession/i,
    // Guards and token verification
    /auth\(\)|requireAuth|verifyToken/i,
    // User object attached to the request or context
    /req\.user|request\.user|context\.user/i,
    /isAuthenticated|checkAuth|withAuth/i,
    // Bearer token header
    /Authorization.*Bearer/i,
    // User id identifiers
    /userId|user\.id|currentUserId/i,
  ]

  const appearsInContent = (indicator: RegExp): boolean => indicator.test(content)
  return authIndicators.some(appearsInContent)
}
205
+
/**
 * Returns the lines around `lineIndex`, joined back into a single string.
 *
 * The window covers up to `windowSize` lines before the target line, the
 * target line itself, and up to `windowSize` lines after it, clamped to the
 * bounds of the file.
 *
 * @param content - Full file text; it is split on '\n'
 * @param lineIndex - Zero-based index of the line of interest
 * @param windowSize - Number of lines to include on each side (default 25)
 * @returns The selected lines joined with '\n'
 */
function getSurroundingContext(content: string, lineIndex: number, windowSize: number = 25): string {
  const lines = content.split('\n')
  const start = Math.max(0, lineIndex - windowSize)
  // slice() excludes its end index, so the +1 is required to include the
  // target line plus `windowSize` lines after it (without it the window was
  // one line short on the trailing side and empty for windowSize = 0).
  // Clamping to `start` keeps a negative end from being read as an offset
  // from the tail of the array.
  const end = Math.min(lines.length, Math.max(start, lineIndex + windowSize + 1))
  return lines.slice(start, end).join('\n')
}
215
+
216
+ // ============================================================================
217
+ // Pattern Definitions
218
+ // ============================================================================
219
+
/**
 * One regex-based RAG safety rule.
 */
interface RAGSafetyPattern {
  /** Short human-readable title of the rule */
  name: string
  /** Expression run against the file content to find candidate findings */
  pattern: RegExp
  /** Category of risk this rule belongs to */
  riskType: 'unscoped_retrieval' | 'context_exposure' | 'context_logging'
  /** Starting severity for a match, before the detector adjusts it from context */
  baseSeverity: VulnerabilitySeverity
  /** Explanation of why a match is risky */
  description: string
  /** Remediation advice for a match */
  suggestedFix: string
}
228
+
/**
 * Rules for retrieval calls that carry no visible user/tenant restriction.
 *
 * Every rule has riskType 'unscoped_retrieval' and a global, case-insensitive
 * expression. Generic and framework-wrapper rules start at 'high'; the
 * store-specific rules start at 'medium'.
 */
const UNSCOPED_RETRIEVAL_PATTERNS: RAGSafetyPattern[] = [
  // .query/.search/.similaritySearch/.retrieve called with exactly one
  // argument (a string literal or a bare identifier), i.e. no options object
  {
    name: 'Unscoped vector store query',
    pattern: /\.(?:query|search|similaritySearch|retrieve)\s*\(\s*(?:["'`][^"'`]+["'`]|[a-zA-Z_]\w*)\s*\)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'high',
    description: 'Vector store query without user/tenant scoping. Retrieved documents may belong to other users, enabling cross-tenant data access.',
    suggestedFix: 'Add filter/metadata parameter to scope queries: .query(query, { filter: { userId: currentUser.id } })',
  },

  // retriever.invoke / retriever.getRelevantDocuments with a single argument
  {
    name: 'LangChain retriever without filter',
    pattern: /retriever\.(?:invoke|getRelevantDocuments)\s*\(\s*(?:["'`][^"'`]+["'`]|[a-zA-Z_]\w*)\s*\)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'high',
    description: 'LangChain retriever invocation without metadata filter. Documents from all users may be retrieved.',
    suggestedFix: 'Use a filtered retriever or add metadata filter: retriever.invoke(query, { filter: { userId } })',
  },

  // query_engine.query called with a single string literal
  {
    name: 'LlamaIndex query engine without filter',
    pattern: /query_engine\.query\s*\(\s*["'`][^"'`]+["'`]\s*\)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'high',
    description: 'LlamaIndex query without node postprocessors or filters. All indexed documents are searchable.',
    suggestedFix: 'Add node_postprocessors to filter by user/tenant metadata before retrieval.',
  },

  // .query({ ... vector | topK ... }) with a single flat options object
  {
    name: 'Pinecone query without metadata filter',
    pattern: /\.query\s*\(\s*\{[^}]*(?:vector|topK)[^}]*\}\s*\)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'medium',
    description: 'Pinecone query may lack metadata filtering. Verify namespace or filter is set.',
    suggestedFix: 'Add filter parameter: .query({ vector, topK, filter: { userId: { $eq: currentUserId } } })',
  },

  // collection.query({ ... query_texts ... }) with a single flat options object
  {
    name: 'Chroma collection query',
    pattern: /collection\.query\s*\(\s*\{[^}]*query_texts[^}]*\}\s*\)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'medium',
    description: 'ChromaDB query without where filter. All documents in collection are searchable.',
    suggestedFix: 'Add where parameter: collection.query({ query_texts, where: { userId: currentUserId } })',
  },

  // .nearText(...) chained directly into .do or .withLimit
  {
    name: 'Weaviate search without filter',
    pattern: /\.nearText\s*\([^)]+\)\.(?:do|withLimit)/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'medium',
    description: 'Weaviate nearText search without where filter. Results may include other users\' data.',
    suggestedFix: 'Add .withWhere() to filter by user: .nearText({...}).withWhere({ path: ["userId"], operator: "Equal", valueString: userId })',
  },

  // Any .rpc('match_documents' ...) call
  {
    name: 'Supabase vector search without RLS',
    pattern: /\.rpc\s*\(\s*['"`]match_documents['"`]/gi,
    riskType: 'unscoped_retrieval',
    baseSeverity: 'medium',
    description: 'Supabase vector search function called. Ensure RLS policies filter by user.',
    suggestedFix: 'Verify Row Level Security (RLS) is enabled and filters documents by authenticated user.',
  },
]
298
+
/**
 * Rules for retrieved documents or context being handed straight back to the
 * caller (HTTP response, return value, socket or stream).
 *
 * Every rule has riskType 'context_exposure' and a global, case-insensitive
 * expression.
 */
const CONTEXT_EXPOSURE_PATTERNS: RAGSafetyPattern[] = [
  // res.json( / NextResponse.json( / return ( whose arguments mention
  // sourceDocuments, retrievedDocs, documents or chunks
  {
    name: 'Source documents in API response',
    pattern: /(?:res\.json|NextResponse\.json|return)\s*\([^)]*(?:sourceDocuments|retrievedDocs|documents|chunks)/gi,
    riskType: 'context_exposure',
    baseSeverity: 'medium',
    description: 'Raw retrieved documents returned in API response. Source content may leak sensitive information from the knowledge base.',
    suggestedFix: 'Return only synthesized response or document IDs/titles. If source attribution needed, filter to metadata only.',
  },

  // Object literal response containing ...docs / ...documents / ...chunks /
  // ...sourceDocuments / ...context
  {
    name: 'Retrieved context spread in response',
    pattern: /(?:res\.json|return)\s*\(\s*\{[^}]*\.\.\.(?:docs|documents|chunks|sourceDocuments|context)/gi,
    riskType: 'context_exposure',
    baseSeverity: 'medium',
    description: 'Retrieved document objects spread into response. Full document content may be exposed.',
    suggestedFix: 'Extract and return only safe fields: { sources: docs.map(d => ({ id: d.id, title: d.title })) }',
  },

  // return { ... context: / retrievedContext: / ragContext: ... }
  {
    name: 'Raw retrieval context in response',
    pattern: /return\s*\{[^}]*(?:context|retrievedContext|ragContext)\s*:/gi,
    riskType: 'context_exposure',
    baseSeverity: 'low',
    description: 'Retrieved context included in response object. Review what data is actually exposed.',
    suggestedFix: 'Ensure context field contains only safe, summarized content - not raw document text.',
  },

  // socket/ws/stream .send/.emit/.write whose arguments mention
  // sourceDocuments, context or chunks
  {
    name: 'Context in streaming response',
    pattern: /(?:socket|ws|stream)\.(?:send|emit|write)\s*\([^)]*(?:sourceDocuments|context|chunks)/gi,
    riskType: 'context_exposure',
    baseSeverity: 'medium',
    description: 'Retrieved context sent via streaming/WebSocket. Clients receive raw source data.',
    suggestedFix: 'Stream only AI-generated text. Send source attribution separately with filtered metadata.',
  },
]
341
+
/**
 * Rules for retrieved documents, or prompts that embed them, being written to
 * logs or persisted.
 *
 * Every rule has riskType 'context_logging' and a global, case-insensitive
 * expression.
 */
const CONTEXT_LOGGING_PATTERNS: RAGSafetyPattern[] = [
  // console.* / logger.* call whose arguments mention retrievedDocs,
  // sourceDocuments, documents or chunks
  {
    name: 'Retrieved documents logged',
    pattern: /(?:console|logger)\.\w+\s*\([^)]*(?:retrievedDocs|sourceDocuments|documents|chunks)/gi,
    riskType: 'context_logging',
    baseSeverity: 'info',
    description: 'Retrieved documents logged. If logs are accessible, sensitive document content may be exposed.',
    suggestedFix: 'Log document IDs/titles only: console.log("Retrieved:", docs.map(d => d.id))',
  },

  // console.* / logger.* call whose arguments mention fullPrompt,
  // promptWithContext or augmentedPrompt
  {
    name: 'Full prompt with context logged',
    pattern: /(?:console|logger)\.\w+\s*\([^)]*(?:fullPrompt|promptWithContext|augmentedPrompt)/gi,
    riskType: 'context_logging',
    baseSeverity: 'low',
    description: 'Full prompt (including retrieved context) logged. May expose sensitive document content in logs.',
    suggestedFix: 'Log prompt length/metadata only. Avoid logging full prompt content in production.',
  },

  // console.debug / console.log / logger.debug call whose arguments mention
  // context, ragContext or retrievalContext
  {
    name: 'RAG context debug logging',
    pattern: /(?:console\.(?:debug|log)|logger\.debug)\s*\([^)]*(?:context|ragContext|retrievalContext)/gi,
    riskType: 'context_logging',
    baseSeverity: 'info',
    description: 'RAG context logged for debugging. Ensure debug logging is disabled in production.',
    suggestedFix: 'Use conditional logging: if (process.env.NODE_ENV !== "production") console.debug(...)',
  },

  // .create / .insert / .save call whose arguments mention fullPrompt,
  // promptWithContext or augmentedPrompt
  {
    name: 'Prompt with context persisted',
    pattern: /(?:\.create|\.insert|\.save)\s*\([^)]*(?:fullPrompt|promptWithContext|augmentedPrompt)/gi,
    riskType: 'context_logging',
    baseSeverity: 'medium',
    description: 'Full prompt with retrieved context being persisted. May store sensitive document content.',
    suggestedFix: 'Store user query and response separately. Do not persist raw retrieved context.',
  },
]
384
+
385
+ // ============================================================================
386
+ // Main Detection Function
387
+ // ============================================================================
388
+
389
/**
 * Main detection function for RAG data safety issues.
 *
 * Runs every pattern from the unscoped-retrieval, context-exposure and
 * context-logging tables over `content` and reports each hit as a
 * Vulnerability. Severity starts at the pattern's base severity and is then
 * adjusted using the surrounding code (access-control scoping, response
 * filtering, presence of authentication) and the kind of file (test,
 * example/demo, library), with the reasons appended to the description.
 *
 * @param content  Full text of the file being scanned.
 * @param filePath Path of the file; drives the file-type checks and is
 *                 embedded in each finding id.
 * @returns One Vulnerability per pattern match whose line is neither a
 *          comment nor a generic query pattern. Empty when the file is a
 *          scanner/fixture file, a documentation file, or not RAG-related.
 */
export function detectRAGSafetyIssues(
  content: string,
  filePath: string
): Vulnerability[] {
  const vulnerabilities: Vulnerability[] = []

  // Skip non-applicable files
  if (isScannerOrFixtureFile(filePath)) return vulnerabilities
  if (isDocumentationFile(filePath)) return vulnerabilities

  // Only scan files in RAG context
  if (!isRAGContextFile(filePath, content)) {
    return vulnerabilities
  }

  const lines = content.split('\n')
  const isTestFile = isTestOrMockFile(filePath)
  const isExample = isExampleDirectory(filePath)
  const isLibrary = isLibraryCode(filePath)
  const hasAuth = hasAuthenticationInContext(content)

  // Process all pattern categories
  const allPatterns: RAGSafetyPattern[] = [
    ...UNSCOPED_RETRIEVAL_PATTERNS,
    ...CONTEXT_EXPOSURE_PATTERNS,
    ...CONTEXT_LOGGING_PATTERNS,
  ]

  for (const pattern of allPatterns) {
    // Build a fresh RegExp per scan so lastIndex never leaks between calls.
    // Force the global flag: without it exec() keeps returning the first
    // match and the loop below would never terminate.
    const sourceFlags = pattern.pattern.flags
    const flags = sourceFlags.includes('g') ? sourceFlags : `${sourceFlags}g`
    const regex = new RegExp(pattern.pattern.source, flags)

    // Matches arrive in ascending index order, so newlines are counted
    // incrementally instead of re-splitting the file prefix for every match.
    let countedUpTo = 0
    let lineNumber = 1
    let match: RegExpExecArray | null

    while ((match = regex.exec(content)) !== null) {
      // An empty match does not advance lastIndex; step past it manually
      // so the scan always makes progress.
      if (match[0].length === 0) {
        regex.lastIndex += 1
      }

      for (let i = countedUpTo; i < match.index; i++) {
        if (content.charCodeAt(i) === 10) {
          lineNumber += 1
        }
      }
      countedUpTo = match.index

      const lineContent = lines[lineNumber - 1]?.trim() ?? ''

      // Skip comments
      if (isComment(lineContent)) continue

      // Skip generic query patterns (req.query, searchParams, etc.)
      if (isGenericQueryPattern(lineContent)) continue

      // Get surrounding context for analysis
      const context = getSurroundingContext(content, lineNumber - 1, 25)

      // Calculate severity based on context
      let severity = pattern.baseSeverity
      let description = pattern.description
      const notes: string[] = []

      // Apply context-aware severity adjustments
      if (pattern.riskType === 'unscoped_retrieval') {
        // Check for access control in surrounding context
        if (hasAccessControlScoping(context)) {
          severity = 'info'
          notes.push('Access control scoping detected nearby')
        } else if (!hasAuth) {
          // No auth at all - higher risk
          if (severity === 'medium') severity = 'high'
          notes.push('No authentication detected in this file')
        }
      }

      if (pattern.riskType === 'context_exposure') {
        // Check if response is filtered
        if (hasResponseFiltering(context)) {
          severity = 'info'
          notes.push('Response filtering detected')
        } else if (!hasAuth) {
          // Unauthenticated endpoint exposing context - higher risk
          if (severity === 'medium') severity = 'high'
          notes.push('Endpoint may be unauthenticated')
        }
      }

      // Downgrade test files (the note is recorded even if already 'info')
      if (isTestFile) {
        severity = 'info'
        notes.push('in test file')
      }

      // Downgrade example/demo directories
      if (isExample && severity !== 'info') {
        severity = 'info'
        notes.push('in example/demo directory')
      }

      // Downgrade library code - base classes are intentionally generic
      if (isLibrary && severity !== 'info') {
        severity = 'info'
        notes.push('library code - consumers add access controls')
      }

      // Build final description
      if (notes.length > 0) {
        description += ` (${notes.join('; ')})`
      }

      vulnerabilities.push({
        id: `ai-rag-${filePath}-${lineNumber}-${pattern.name.replace(/\s+/g, '-')}`,
        filePath,
        lineNumber,
        lineContent,
        severity,
        category: 'ai_rag_exfiltration',
        title: pattern.name,
        description,
        suggestedFix: pattern.suggestedFix,
        // 'info' findings are heuristically downgraded, so confidence is low
        confidence: severity === 'info' ? 'low' : 'medium',
        layer: 2,
        // Logging findings and 'info' findings are never sent for AI validation
        requiresAIValidation: severity !== 'info' && pattern.riskType !== 'context_logging',
      })
    }
  }

  return vulnerabilities
}
509
+
510
+ // Export helper for use in other modules
511
+ export { isRAGContextFile }