rag-lite-ts 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (309)
  1. package/dist/{core → cjs/core}/model-validator.js +1 -1
  2. package/dist/{core → cjs/core}/vector-index.js +4 -2
  3. package/dist/esm/api-errors.d.ts +90 -0
  4. package/dist/esm/api-errors.js +320 -0
  5. package/dist/esm/cli/indexer.d.ts +11 -0
  6. package/dist/esm/cli/indexer.js +471 -0
  7. package/dist/esm/cli/search.d.ts +7 -0
  8. package/dist/esm/cli/search.js +332 -0
  9. package/dist/esm/cli.d.ts +3 -0
  10. package/dist/esm/cli.js +529 -0
  11. package/dist/esm/config.d.ts +51 -0
  12. package/dist/esm/config.js +79 -0
  13. package/dist/esm/core/abstract-embedder.d.ts +125 -0
  14. package/dist/esm/core/abstract-embedder.js +264 -0
  15. package/dist/esm/core/actionable-error-messages.d.ts +60 -0
  16. package/dist/esm/core/actionable-error-messages.js +397 -0
  17. package/dist/esm/core/adapters.d.ts +93 -0
  18. package/dist/esm/core/adapters.js +139 -0
  19. package/dist/esm/core/batch-processing-optimizer.d.ts +155 -0
  20. package/dist/esm/core/batch-processing-optimizer.js +536 -0
  21. package/dist/esm/core/binary-index-format.d.ts +78 -0
  22. package/dist/esm/core/binary-index-format.js +291 -0
  23. package/dist/esm/core/chunker.d.ts +119 -0
  24. package/dist/esm/core/chunker.js +73 -0
  25. package/dist/esm/core/cli-database-utils.d.ts +53 -0
  26. package/dist/esm/core/cli-database-utils.js +239 -0
  27. package/dist/esm/core/config.d.ts +102 -0
  28. package/dist/esm/core/config.js +247 -0
  29. package/dist/esm/core/content-errors.d.ts +111 -0
  30. package/dist/esm/core/content-errors.js +362 -0
  31. package/dist/esm/core/content-manager.d.ts +335 -0
  32. package/dist/esm/core/content-manager.js +1476 -0
  33. package/dist/esm/core/content-performance-optimizer.d.ts +150 -0
  34. package/dist/esm/core/content-performance-optimizer.js +516 -0
  35. package/dist/esm/core/content-resolver.d.ts +104 -0
  36. package/dist/esm/core/content-resolver.js +285 -0
  37. package/dist/esm/core/cross-modal-search.d.ts +164 -0
  38. package/dist/esm/core/cross-modal-search.js +342 -0
  39. package/dist/esm/core/database-connection-manager.d.ts +109 -0
  40. package/dist/esm/core/database-connection-manager.js +310 -0
  41. package/dist/esm/core/db.d.ts +213 -0
  42. package/dist/esm/core/db.js +895 -0
  43. package/dist/esm/core/embedder-factory.d.ts +154 -0
  44. package/dist/esm/core/embedder-factory.js +311 -0
  45. package/dist/esm/core/error-handler.d.ts +112 -0
  46. package/dist/esm/core/error-handler.js +239 -0
  47. package/dist/esm/core/index.d.ts +59 -0
  48. package/dist/esm/core/index.js +69 -0
  49. package/dist/esm/core/ingestion.d.ts +202 -0
  50. package/dist/esm/core/ingestion.js +901 -0
  51. package/dist/esm/core/interfaces.d.ts +408 -0
  52. package/dist/esm/core/interfaces.js +106 -0
  53. package/dist/esm/core/lazy-dependency-loader.d.ts +147 -0
  54. package/dist/esm/core/lazy-dependency-loader.js +435 -0
  55. package/dist/esm/core/mode-detection-service.d.ts +150 -0
  56. package/dist/esm/core/mode-detection-service.js +565 -0
  57. package/dist/esm/core/mode-model-validator.d.ts +92 -0
  58. package/dist/esm/core/mode-model-validator.js +203 -0
  59. package/dist/esm/core/model-registry.d.ts +116 -0
  60. package/dist/esm/core/model-registry.js +411 -0
  61. package/dist/esm/core/model-validator.d.ts +217 -0
  62. package/dist/esm/core/model-validator.js +782 -0
  63. package/dist/esm/core/path-manager.d.ts +47 -0
  64. package/dist/esm/core/path-manager.js +71 -0
  65. package/dist/esm/core/raglite-paths.d.ts +121 -0
  66. package/dist/esm/core/raglite-paths.js +145 -0
  67. package/dist/esm/core/reranking-config.d.ts +42 -0
  68. package/dist/esm/core/reranking-config.js +147 -0
  69. package/dist/esm/core/reranking-factory.d.ts +92 -0
  70. package/dist/esm/core/reranking-factory.js +410 -0
  71. package/dist/esm/core/reranking-strategies.d.ts +310 -0
  72. package/dist/esm/core/reranking-strategies.js +650 -0
  73. package/dist/esm/core/resource-cleanup.d.ts +163 -0
  74. package/dist/esm/core/resource-cleanup.js +371 -0
  75. package/dist/esm/core/resource-manager.d.ts +212 -0
  76. package/dist/esm/core/resource-manager.js +564 -0
  77. package/dist/esm/core/search-pipeline.d.ts +111 -0
  78. package/dist/esm/core/search-pipeline.js +287 -0
  79. package/dist/esm/core/search.d.ts +141 -0
  80. package/dist/esm/core/search.js +320 -0
  81. package/dist/esm/core/streaming-operations.d.ts +145 -0
  82. package/dist/esm/core/streaming-operations.js +409 -0
  83. package/dist/esm/core/types.d.ts +66 -0
  84. package/dist/esm/core/types.js +6 -0
  85. package/dist/esm/core/universal-embedder.d.ts +177 -0
  86. package/dist/esm/core/universal-embedder.js +139 -0
  87. package/dist/esm/core/validation-messages.d.ts +99 -0
  88. package/dist/esm/core/validation-messages.js +334 -0
  89. package/dist/esm/core/vector-index.d.ts +72 -0
  90. package/dist/esm/core/vector-index.js +333 -0
  91. package/dist/esm/dom-polyfills.d.ts +6 -0
  92. package/dist/esm/dom-polyfills.js +37 -0
  93. package/dist/esm/factories/index.d.ts +27 -0
  94. package/dist/esm/factories/index.js +29 -0
  95. package/dist/esm/factories/ingestion-factory.d.ts +200 -0
  96. package/dist/esm/factories/ingestion-factory.js +477 -0
  97. package/dist/esm/factories/search-factory.d.ts +154 -0
  98. package/dist/esm/factories/search-factory.js +344 -0
  99. package/dist/esm/file-processor.d.ts +147 -0
  100. package/dist/esm/file-processor.js +963 -0
  101. package/dist/esm/index-manager.d.ts +116 -0
  102. package/dist/esm/index-manager.js +598 -0
  103. package/dist/esm/index.d.ts +75 -0
  104. package/dist/esm/index.js +110 -0
  105. package/dist/esm/indexer.d.ts +7 -0
  106. package/dist/esm/indexer.js +54 -0
  107. package/dist/esm/ingestion.d.ts +63 -0
  108. package/dist/esm/ingestion.js +124 -0
  109. package/dist/esm/mcp-server.d.ts +46 -0
  110. package/dist/esm/mcp-server.js +1820 -0
  111. package/dist/esm/multimodal/clip-embedder.d.ts +327 -0
  112. package/dist/esm/multimodal/clip-embedder.js +996 -0
  113. package/dist/esm/multimodal/index.d.ts +6 -0
  114. package/dist/esm/multimodal/index.js +6 -0
  115. package/dist/esm/preprocess.d.ts +19 -0
  116. package/dist/esm/preprocess.js +203 -0
  117. package/dist/esm/preprocessors/index.d.ts +17 -0
  118. package/dist/esm/preprocessors/index.js +38 -0
  119. package/dist/esm/preprocessors/mdx.d.ts +25 -0
  120. package/dist/esm/preprocessors/mdx.js +101 -0
  121. package/dist/esm/preprocessors/mermaid.d.ts +68 -0
  122. package/dist/esm/preprocessors/mermaid.js +329 -0
  123. package/dist/esm/preprocessors/registry.d.ts +56 -0
  124. package/dist/esm/preprocessors/registry.js +179 -0
  125. package/dist/esm/run-error-recovery-tests.d.ts +7 -0
  126. package/dist/esm/run-error-recovery-tests.js +101 -0
  127. package/dist/esm/search-standalone.d.ts +7 -0
  128. package/dist/esm/search-standalone.js +117 -0
  129. package/dist/esm/search.d.ts +99 -0
  130. package/dist/esm/search.js +177 -0
  131. package/dist/esm/test-utils.d.ts +18 -0
  132. package/dist/esm/test-utils.js +27 -0
  133. package/dist/esm/text/chunker.d.ts +33 -0
  134. package/dist/esm/text/chunker.js +279 -0
  135. package/dist/esm/text/embedder.d.ts +111 -0
  136. package/dist/esm/text/embedder.js +386 -0
  137. package/dist/esm/text/index.d.ts +8 -0
  138. package/dist/esm/text/index.js +9 -0
  139. package/dist/esm/text/preprocessors/index.d.ts +17 -0
  140. package/dist/esm/text/preprocessors/index.js +38 -0
  141. package/dist/esm/text/preprocessors/mdx.d.ts +25 -0
  142. package/dist/esm/text/preprocessors/mdx.js +101 -0
  143. package/dist/esm/text/preprocessors/mermaid.d.ts +68 -0
  144. package/dist/esm/text/preprocessors/mermaid.js +330 -0
  145. package/dist/esm/text/preprocessors/registry.d.ts +56 -0
  146. package/dist/esm/text/preprocessors/registry.js +180 -0
  147. package/dist/esm/text/reranker.d.ts +49 -0
  148. package/dist/esm/text/reranker.js +274 -0
  149. package/dist/esm/text/sentence-transformer-embedder.d.ts +96 -0
  150. package/dist/esm/text/sentence-transformer-embedder.js +340 -0
  151. package/dist/esm/text/tokenizer.d.ts +22 -0
  152. package/dist/esm/text/tokenizer.js +64 -0
  153. package/dist/esm/types.d.ts +83 -0
  154. package/dist/esm/types.js +3 -0
  155. package/dist/esm/utils/vector-math.d.ts +31 -0
  156. package/dist/esm/utils/vector-math.js +70 -0
  157. package/package.json +30 -12
  158. /package/dist/{api-errors.d.ts → cjs/api-errors.d.ts} +0 -0
  159. /package/dist/{api-errors.js → cjs/api-errors.js} +0 -0
  160. /package/dist/{cli → cjs/cli}/indexer.d.ts +0 -0
  161. /package/dist/{cli → cjs/cli}/indexer.js +0 -0
  162. /package/dist/{cli → cjs/cli}/search.d.ts +0 -0
  163. /package/dist/{cli → cjs/cli}/search.js +0 -0
  164. /package/dist/{cli.d.ts → cjs/cli.d.ts} +0 -0
  165. /package/dist/{cli.js → cjs/cli.js} +0 -0
  166. /package/dist/{config.d.ts → cjs/config.d.ts} +0 -0
  167. /package/dist/{config.js → cjs/config.js} +0 -0
  168. /package/dist/{core → cjs/core}/abstract-embedder.d.ts +0 -0
  169. /package/dist/{core → cjs/core}/abstract-embedder.js +0 -0
  170. /package/dist/{core → cjs/core}/actionable-error-messages.d.ts +0 -0
  171. /package/dist/{core → cjs/core}/actionable-error-messages.js +0 -0
  172. /package/dist/{core → cjs/core}/adapters.d.ts +0 -0
  173. /package/dist/{core → cjs/core}/adapters.js +0 -0
  174. /package/dist/{core → cjs/core}/batch-processing-optimizer.d.ts +0 -0
  175. /package/dist/{core → cjs/core}/batch-processing-optimizer.js +0 -0
  176. /package/dist/{core → cjs/core}/binary-index-format.d.ts +0 -0
  177. /package/dist/{core → cjs/core}/binary-index-format.js +0 -0
  178. /package/dist/{core → cjs/core}/chunker.d.ts +0 -0
  179. /package/dist/{core → cjs/core}/chunker.js +0 -0
  180. /package/dist/{core → cjs/core}/cli-database-utils.d.ts +0 -0
  181. /package/dist/{core → cjs/core}/cli-database-utils.js +0 -0
  182. /package/dist/{core → cjs/core}/config.d.ts +0 -0
  183. /package/dist/{core → cjs/core}/config.js +0 -0
  184. /package/dist/{core → cjs/core}/content-errors.d.ts +0 -0
  185. /package/dist/{core → cjs/core}/content-errors.js +0 -0
  186. /package/dist/{core → cjs/core}/content-manager.d.ts +0 -0
  187. /package/dist/{core → cjs/core}/content-manager.js +0 -0
  188. /package/dist/{core → cjs/core}/content-performance-optimizer.d.ts +0 -0
  189. /package/dist/{core → cjs/core}/content-performance-optimizer.js +0 -0
  190. /package/dist/{core → cjs/core}/content-resolver.d.ts +0 -0
  191. /package/dist/{core → cjs/core}/content-resolver.js +0 -0
  192. /package/dist/{core → cjs/core}/cross-modal-search.d.ts +0 -0
  193. /package/dist/{core → cjs/core}/cross-modal-search.js +0 -0
  194. /package/dist/{core → cjs/core}/database-connection-manager.d.ts +0 -0
  195. /package/dist/{core → cjs/core}/database-connection-manager.js +0 -0
  196. /package/dist/{core → cjs/core}/db.d.ts +0 -0
  197. /package/dist/{core → cjs/core}/db.js +0 -0
  198. /package/dist/{core → cjs/core}/embedder-factory.d.ts +0 -0
  199. /package/dist/{core → cjs/core}/embedder-factory.js +0 -0
  200. /package/dist/{core → cjs/core}/error-handler.d.ts +0 -0
  201. /package/dist/{core → cjs/core}/error-handler.js +0 -0
  202. /package/dist/{core → cjs/core}/index.d.ts +0 -0
  203. /package/dist/{core → cjs/core}/index.js +0 -0
  204. /package/dist/{core → cjs/core}/ingestion.d.ts +0 -0
  205. /package/dist/{core → cjs/core}/ingestion.js +0 -0
  206. /package/dist/{core → cjs/core}/interfaces.d.ts +0 -0
  207. /package/dist/{core → cjs/core}/interfaces.js +0 -0
  208. /package/dist/{core → cjs/core}/lazy-dependency-loader.d.ts +0 -0
  209. /package/dist/{core → cjs/core}/lazy-dependency-loader.js +0 -0
  210. /package/dist/{core → cjs/core}/mode-detection-service.d.ts +0 -0
  211. /package/dist/{core → cjs/core}/mode-detection-service.js +0 -0
  212. /package/dist/{core → cjs/core}/mode-model-validator.d.ts +0 -0
  213. /package/dist/{core → cjs/core}/mode-model-validator.js +0 -0
  214. /package/dist/{core → cjs/core}/model-registry.d.ts +0 -0
  215. /package/dist/{core → cjs/core}/model-registry.js +0 -0
  216. /package/dist/{core → cjs/core}/model-validator.d.ts +0 -0
  217. /package/dist/{core → cjs/core}/path-manager.d.ts +0 -0
  218. /package/dist/{core → cjs/core}/path-manager.js +0 -0
  219. /package/dist/{core → cjs/core}/raglite-paths.d.ts +0 -0
  220. /package/dist/{core → cjs/core}/raglite-paths.js +0 -0
  221. /package/dist/{core → cjs/core}/reranking-config.d.ts +0 -0
  222. /package/dist/{core → cjs/core}/reranking-config.js +0 -0
  223. /package/dist/{core → cjs/core}/reranking-factory.d.ts +0 -0
  224. /package/dist/{core → cjs/core}/reranking-factory.js +0 -0
  225. /package/dist/{core → cjs/core}/reranking-strategies.d.ts +0 -0
  226. /package/dist/{core → cjs/core}/reranking-strategies.js +0 -0
  227. /package/dist/{core → cjs/core}/resource-cleanup.d.ts +0 -0
  228. /package/dist/{core → cjs/core}/resource-cleanup.js +0 -0
  229. /package/dist/{core → cjs/core}/resource-manager.d.ts +0 -0
  230. /package/dist/{core → cjs/core}/resource-manager.js +0 -0
  231. /package/dist/{core → cjs/core}/search-pipeline.d.ts +0 -0
  232. /package/dist/{core → cjs/core}/search-pipeline.js +0 -0
  233. /package/dist/{core → cjs/core}/search.d.ts +0 -0
  234. /package/dist/{core → cjs/core}/search.js +0 -0
  235. /package/dist/{core → cjs/core}/streaming-operations.d.ts +0 -0
  236. /package/dist/{core → cjs/core}/streaming-operations.js +0 -0
  237. /package/dist/{core → cjs/core}/types.d.ts +0 -0
  238. /package/dist/{core → cjs/core}/types.js +0 -0
  239. /package/dist/{core → cjs/core}/universal-embedder.d.ts +0 -0
  240. /package/dist/{core → cjs/core}/universal-embedder.js +0 -0
  241. /package/dist/{core → cjs/core}/validation-messages.d.ts +0 -0
  242. /package/dist/{core → cjs/core}/validation-messages.js +0 -0
  243. /package/dist/{core → cjs/core}/vector-index.d.ts +0 -0
  244. /package/dist/{dom-polyfills.d.ts → cjs/dom-polyfills.d.ts} +0 -0
  245. /package/dist/{dom-polyfills.js → cjs/dom-polyfills.js} +0 -0
  246. /package/dist/{factories → cjs/factories}/index.d.ts +0 -0
  247. /package/dist/{factories → cjs/factories}/index.js +0 -0
  248. /package/dist/{factories → cjs/factories}/ingestion-factory.d.ts +0 -0
  249. /package/dist/{factories → cjs/factories}/ingestion-factory.js +0 -0
  250. /package/dist/{factories → cjs/factories}/search-factory.d.ts +0 -0
  251. /package/dist/{factories → cjs/factories}/search-factory.js +0 -0
  252. /package/dist/{file-processor.d.ts → cjs/file-processor.d.ts} +0 -0
  253. /package/dist/{file-processor.js → cjs/file-processor.js} +0 -0
  254. /package/dist/{index-manager.d.ts → cjs/index-manager.d.ts} +0 -0
  255. /package/dist/{index-manager.js → cjs/index-manager.js} +0 -0
  256. /package/dist/{index.d.ts → cjs/index.d.ts} +0 -0
  257. /package/dist/{index.js → cjs/index.js} +0 -0
  258. /package/dist/{indexer.d.ts → cjs/indexer.d.ts} +0 -0
  259. /package/dist/{indexer.js → cjs/indexer.js} +0 -0
  260. /package/dist/{ingestion.d.ts → cjs/ingestion.d.ts} +0 -0
  261. /package/dist/{ingestion.js → cjs/ingestion.js} +0 -0
  262. /package/dist/{mcp-server.d.ts → cjs/mcp-server.d.ts} +0 -0
  263. /package/dist/{mcp-server.js → cjs/mcp-server.js} +0 -0
  264. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.d.ts +0 -0
  265. /package/dist/{multimodal → cjs/multimodal}/clip-embedder.js +0 -0
  266. /package/dist/{multimodal → cjs/multimodal}/index.d.ts +0 -0
  267. /package/dist/{multimodal → cjs/multimodal}/index.js +0 -0
  268. /package/dist/{preprocess.d.ts → cjs/preprocess.d.ts} +0 -0
  269. /package/dist/{preprocess.js → cjs/preprocess.js} +0 -0
  270. /package/dist/{preprocessors → cjs/preprocessors}/index.d.ts +0 -0
  271. /package/dist/{preprocessors → cjs/preprocessors}/index.js +0 -0
  272. /package/dist/{preprocessors → cjs/preprocessors}/mdx.d.ts +0 -0
  273. /package/dist/{preprocessors → cjs/preprocessors}/mdx.js +0 -0
  274. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.d.ts +0 -0
  275. /package/dist/{preprocessors → cjs/preprocessors}/mermaid.js +0 -0
  276. /package/dist/{preprocessors → cjs/preprocessors}/registry.d.ts +0 -0
  277. /package/dist/{preprocessors → cjs/preprocessors}/registry.js +0 -0
  278. /package/dist/{run-error-recovery-tests.d.ts → cjs/run-error-recovery-tests.d.ts} +0 -0
  279. /package/dist/{run-error-recovery-tests.js → cjs/run-error-recovery-tests.js} +0 -0
  280. /package/dist/{search-standalone.d.ts → cjs/search-standalone.d.ts} +0 -0
  281. /package/dist/{search-standalone.js → cjs/search-standalone.js} +0 -0
  282. /package/dist/{search.d.ts → cjs/search.d.ts} +0 -0
  283. /package/dist/{search.js → cjs/search.js} +0 -0
  284. /package/dist/{test-utils.d.ts → cjs/test-utils.d.ts} +0 -0
  285. /package/dist/{test-utils.js → cjs/test-utils.js} +0 -0
  286. /package/dist/{text → cjs/text}/chunker.d.ts +0 -0
  287. /package/dist/{text → cjs/text}/chunker.js +0 -0
  288. /package/dist/{text → cjs/text}/embedder.d.ts +0 -0
  289. /package/dist/{text → cjs/text}/embedder.js +0 -0
  290. /package/dist/{text → cjs/text}/index.d.ts +0 -0
  291. /package/dist/{text → cjs/text}/index.js +0 -0
  292. /package/dist/{text → cjs/text}/preprocessors/index.d.ts +0 -0
  293. /package/dist/{text → cjs/text}/preprocessors/index.js +0 -0
  294. /package/dist/{text → cjs/text}/preprocessors/mdx.d.ts +0 -0
  295. /package/dist/{text → cjs/text}/preprocessors/mdx.js +0 -0
  296. /package/dist/{text → cjs/text}/preprocessors/mermaid.d.ts +0 -0
  297. /package/dist/{text → cjs/text}/preprocessors/mermaid.js +0 -0
  298. /package/dist/{text → cjs/text}/preprocessors/registry.d.ts +0 -0
  299. /package/dist/{text → cjs/text}/preprocessors/registry.js +0 -0
  300. /package/dist/{text → cjs/text}/reranker.d.ts +0 -0
  301. /package/dist/{text → cjs/text}/reranker.js +0 -0
  302. /package/dist/{text → cjs/text}/sentence-transformer-embedder.d.ts +0 -0
  303. /package/dist/{text → cjs/text}/sentence-transformer-embedder.js +0 -0
  304. /package/dist/{text → cjs/text}/tokenizer.d.ts +0 -0
  305. /package/dist/{text → cjs/text}/tokenizer.js +0 -0
  306. /package/dist/{types.d.ts → cjs/types.d.ts} +0 -0
  307. /package/dist/{types.js → cjs/types.js} +0 -0
  308. /package/dist/{utils → cjs/utils}/vector-math.d.ts +0 -0
  309. /package/dist/{utils → cjs/utils}/vector-math.js +0 -0
@@ -0,0 +1,1476 @@
1
+ /**
2
+ * Content Manager - Handles content ingestion routing for unified content system
3
+ * Routes filesystem content to reference storage and memory content to content directory
4
+ * Implements deduplication and content ID generation
5
+ */
6
+ import { createHash } from 'crypto';
7
+ import { promises as fs } from 'fs';
8
+ import { join, extname, basename } from 'path';
9
+ import { insertContentMetadata, getContentMetadataByHash, getStorageStats, updateStorageStats, getContentMetadataByStorageType, deleteContentMetadata } from './db.js';
10
+ import { ContentIngestionError, StorageLimitExceededError, InvalidContentFormatError, ContentErrorHandler } from './content-errors.js';
11
+ import { globalResourceCleanup, withResourceCleanup, writeFileAtomic, withTimeout, SafeBuffer } from './resource-cleanup.js';
12
+ import { createStreamingOperations, formatBytes, formatProcessingTime } from './streaming-operations.js';
13
+ import { createContentPerformanceOptimizer, formatCacheHitRate } from './content-performance-optimizer.js';
14
/**
 * Baseline ContentManager settings.
 * The ContentManager constructor spreads caller-supplied options over these values.
 */
const DEFAULT_CONFIG = {
    contentDir: '.raglite/content',
    // 50MB
    maxFileSize: 50 * (1024 ** 2),
    // 2GB
    maxContentDirSize: 2 * (1024 ** 3),
    enableDeduplication: true,
    enableStorageTracking: true,
    // Percentage of usage at which a warning is issued
    storageWarningThreshold: 75,
    // Percentage of usage at which new content is rejected
    storageErrorThreshold: 95,
};
26
+ /**
27
+ * ContentManager class for handling content ingestion routing
28
+ * Implements the unified content system's ingestion logic
29
+ */
30
+ export class ContentManager {
31
+ db;
32
+ config;
33
+ streamingOps;
34
+ performanceOptimizer;
35
+ constructor(db, config = {}) {
36
+ this.db = db;
37
+ // Parse and normalize configuration
38
+ const inputConfig = { ...DEFAULT_CONFIG, ...config };
39
+ // Parse size strings to bytes
40
+ const maxFileSize = this.parseSizeToBytes(inputConfig.maxFileSize);
41
+ const maxContentDirSize = this.parseSizeToBytes(inputConfig.maxContentDirSize);
42
+ // Validate thresholds
43
+ if (inputConfig.storageWarningThreshold < 0 || inputConfig.storageWarningThreshold > 100) {
44
+ throw new Error('Storage warning threshold must be between 0 and 100');
45
+ }
46
+ if (inputConfig.storageErrorThreshold < 0 || inputConfig.storageErrorThreshold > 100) {
47
+ throw new Error('Storage error threshold must be between 0 and 100');
48
+ }
49
+ if (inputConfig.storageErrorThreshold <= inputConfig.storageWarningThreshold) {
50
+ throw new Error('Storage error threshold must be greater than warning threshold');
51
+ }
52
+ // Create normalized config
53
+ this.config = {
54
+ contentDir: inputConfig.contentDir,
55
+ maxFileSize,
56
+ maxContentDirSize,
57
+ enableDeduplication: inputConfig.enableDeduplication,
58
+ enableStorageTracking: inputConfig.enableStorageTracking,
59
+ storageWarningThreshold: inputConfig.storageWarningThreshold,
60
+ storageErrorThreshold: inputConfig.storageErrorThreshold
61
+ };
62
+ // Initialize streaming operations with appropriate chunk size based on file size limits
63
+ const chunkSize = Math.floor(Math.min(1024 * 1024, Math.max(64 * 1024, maxFileSize / 100))); // 64KB to 1MB chunks
64
+ this.streamingOps = createStreamingOperations({
65
+ chunkSize,
66
+ enableProgress: false, // Can be enabled for debugging
67
+ enableHashing: true,
68
+ timeout: 300000 // 5 minutes
69
+ });
70
+ // Initialize performance optimizer with optimized settings
71
+ this.performanceOptimizer = createContentPerformanceOptimizer({
72
+ hashCacheSize: 1000,
73
+ hashCacheTTL: 60 * 60 * 1000, // 1 hour
74
+ maxConcurrentOperations: 10,
75
+ batchSize: 50,
76
+ fileBufferSize: chunkSize,
77
+ enableAsyncIO: true,
78
+ enableMetrics: true,
79
+ metricsRetentionTime: 24 * 60 * 60 * 1000 // 24 hours
80
+ });
81
+ }
82
+ /**
83
+ * Ingests content from filesystem by creating references without copying files
84
+ * @param filePath - Path to the file to ingest
85
+ * @returns Promise that resolves to content ingestion result
86
+ */
87
+ async ingestFromFilesystem(filePath) {
88
+ // Use resource cleanup with timeout for filesystem operations
89
+ return withResourceCleanup(async (transactionId) => {
90
+ let content = null;
91
+ let safeBuffer = null;
92
+ try {
93
+ // Verify file exists and get stats with timeout
94
+ const stats = await withTimeout(fs.stat(filePath), 10000, // 10 second timeout for file stat
95
+ 'File stat operation timed out');
96
+ if (!stats.isFile()) {
97
+ throw new ContentIngestionError('file validation', `Path is not a file: ${filePath}`, 'filesystem_ingestion');
98
+ }
99
+ // Check file size limit
100
+ if (stats.size > this.config.maxFileSize) {
101
+ const sizeMB = Math.round((stats.size / 1024 / 1024) * 100) / 100;
102
+ const limitMB = Math.round((this.config.maxFileSize / 1024 / 1024) * 100) / 100;
103
+ throw new ContentIngestionError('file size validation', `File size (${sizeMB}MB) exceeds maximum allowed size (${limitMB}MB)`, 'filesystem_ingestion');
104
+ }
105
+ // Use optimized hash calculation with caching
106
+ let contentHash;
107
+ if (stats.size > 10 * 1024 * 1024) { // Use streaming for files > 10MB
108
+ contentHash = await withTimeout(this.performanceOptimizer.calculateFileHashOptimized(filePath), 120000, // 2 minute timeout for large file hashing
109
+ 'Optimized hash calculation timed out');
110
+ // Log performance metrics for large files
111
+ if (stats.size > 50 * 1024 * 1024) {
112
+ const cacheStats = this.performanceOptimizer.getHashCacheStats();
113
+ console.log(`Optimized hash completed: ${formatBytes(stats.size)} (Cache hit rate: ${formatCacheHitRate(cacheStats.hitRate)})`);
114
+ }
115
+ }
116
+ else {
117
+ // For smaller files, use traditional method with memory management
118
+ content = await withTimeout(fs.readFile(filePath), 60000, // 60 second timeout for file reading
119
+ 'File read operation timed out');
120
+ // Create safe buffer for memory management (don't clear original for normal operations)
121
+ safeBuffer = new SafeBuffer(content, { clearOriginal: false });
122
+ globalResourceCleanup.addBuffer(transactionId, safeBuffer.get());
123
+ contentHash = this.generateContentHash(safeBuffer.get());
124
+ }
125
+ // Check for existing content if deduplication is enabled
126
+ if (this.config.enableDeduplication) {
127
+ const existing = await withTimeout(getContentMetadataByHash(this.db, contentHash), 10000, // 10 second timeout for database query
128
+ 'Database query for existing content timed out');
129
+ if (existing) {
130
+ return {
131
+ contentId: existing.id,
132
+ wasDeduped: true,
133
+ storageType: existing.storageType,
134
+ contentPath: existing.contentPath
135
+ };
136
+ }
137
+ }
138
+ // Generate content ID
139
+ const contentId = safeBuffer ? this.generateContentId(safeBuffer.get()) : this.generateContentIdFromHash(contentHash);
140
+ // Detect content type - for streaming case, read small sample for magic number detection
141
+ let contentType;
142
+ if (stats.size > 10 * 1024 * 1024 && !content) {
143
+ // For large files processed with streaming, read small sample for content type detection
144
+ const sampleSize = Math.min(8192, stats.size); // Read first 8KB for magic number detection
145
+ const sample = Buffer.alloc(sampleSize);
146
+ const fd = await fs.open(filePath, 'r');
147
+ try {
148
+ await fd.read(sample, 0, sampleSize, 0);
149
+ contentType = this.detectContentType(filePath, sample);
150
+ }
151
+ finally {
152
+ await fd.close();
153
+ }
154
+ }
155
+ else {
156
+ contentType = safeBuffer ? this.detectContentType(filePath, safeBuffer.get()) : this.detectContentType(filePath);
157
+ }
158
+ // Validate content type is supported
159
+ const validation = this.validateContentType(contentType);
160
+ if (!validation.isSupported) {
161
+ throw new InvalidContentFormatError(contentType, validation.error, 'filesystem_ingestion');
162
+ }
163
+ // Create content metadata for filesystem reference
164
+ const contentMetadata = {
165
+ id: contentId,
166
+ storageType: 'filesystem',
167
+ originalPath: filePath,
168
+ contentPath: filePath, // For filesystem, content path is the same as original path
169
+ displayName: basename(filePath),
170
+ contentType,
171
+ fileSize: stats.size,
172
+ contentHash
173
+ };
174
+ // Track database entry for cleanup in case of failure
175
+ globalResourceCleanup.addDatabaseEntry(transactionId, this.db, contentId);
176
+ // Insert content metadata with timeout
177
+ await withTimeout(insertContentMetadata(this.db, contentMetadata), 10000, // 10 second timeout for database insertion
178
+ 'Database insertion timed out');
179
+ return {
180
+ contentId,
181
+ wasDeduped: false,
182
+ storageType: 'filesystem',
183
+ contentPath: filePath
184
+ };
185
+ }
186
+ catch (error) {
187
+ if (error instanceof ContentIngestionError || error instanceof InvalidContentFormatError) {
188
+ throw error; // Re-throw content-specific errors
189
+ }
190
+ ContentErrorHandler.handleContentError(error, 'filesystem ingestion', 'ingestFromFilesystem');
191
+ }
192
+ finally {
193
+ // Clear sensitive buffer data
194
+ if (safeBuffer) {
195
+ safeBuffer.clear();
196
+ }
197
+ }
198
+ }, 90000); // 90 second overall timeout for filesystem operations
199
+ }
200
+ /**
201
+ * Ingests content from memory by storing it in content directory with hash-based filenames
202
+ * @param content - Buffer containing the content
203
+ * @param metadata - Memory content metadata
204
+ * @returns Promise that resolves to content ingestion result
205
+ */
206
+ async ingestFromMemory(content, metadata) {
207
+ // Use resource cleanup with timeout for long-running operations
208
+ return withResourceCleanup(async (transactionId) => {
209
+ // Create safe buffer for memory management (don't clear original for normal operations)
210
+ const safeBuffer = new SafeBuffer(content, { clearOriginal: false });
211
+ globalResourceCleanup.addBuffer(transactionId, safeBuffer.get());
212
+ try {
213
+ // Check content size limit
214
+ if (content.length > this.config.maxFileSize) {
215
+ const sizeMB = Math.round((content.length / 1024 / 1024) * 100) / 100;
216
+ const limitMB = Math.round((this.config.maxFileSize / 1024 / 1024) * 100) / 100;
217
+ throw new ContentIngestionError('content size validation', `Content size (${sizeMB}MB) exceeds maximum allowed size (${limitMB}MB)`, 'memory_ingestion');
218
+ }
219
+ // Enforce storage limits with enhanced error messages and guidance
220
+ await withTimeout(this.enforceStorageLimits(content.length), 30000, // 30 second timeout for storage limit checks
221
+ 'Storage limit enforcement timed out');
222
+ // Use optimized hash calculation with caching
223
+ let contentHash;
224
+ // Use optimized hash calculation with caching
225
+ // Don't use a cache key for memory content to ensure proper deduplication
226
+ contentHash = await withTimeout(this.performanceOptimizer.calculateBufferHashOptimized(safeBuffer.get()), 120000, // 2 minute timeout for hash calculation
227
+ 'Optimized buffer hash calculation timed out');
228
+ // Log performance metrics for large content
229
+ if (content.length > 50 * 1024 * 1024) {
230
+ const cacheStats = this.performanceOptimizer.getHashCacheStats();
231
+ console.log(`Optimized buffer hash completed: ${formatBytes(content.length)} (Cache hit rate: ${formatCacheHitRate(cacheStats.hitRate)})`);
232
+ }
233
+ // Check for existing content if deduplication is enabled
234
+ if (this.config.enableDeduplication) {
235
+ const existing = await withTimeout(getContentMetadataByHash(this.db, contentHash), 10000, // 10 second timeout for database queries
236
+ 'Database query for existing content timed out');
237
+ if (existing) {
238
+ // Content already exists, no cleanup needed
239
+ return {
240
+ contentId: existing.id,
241
+ wasDeduped: true,
242
+ storageType: existing.storageType,
243
+ contentPath: existing.contentPath
244
+ };
245
+ }
246
+ }
247
+ // Generate content ID
248
+ const contentId = this.generateContentId(safeBuffer.get());
249
+ // Detect content type
250
+ const contentType = metadata.contentType || this.detectContentTypeFromBuffer(safeBuffer.get(), metadata.displayName);
251
+ // Validate content type is supported
252
+ const validation = this.validateContentType(contentType);
253
+ if (!validation.isSupported) {
254
+ throw new InvalidContentFormatError(contentType, validation.error, 'memory_ingestion');
255
+ }
256
+ // Ensure content directory exists
257
+ await withTimeout(this.ensureContentDirectory(), 5000, // 5 second timeout for directory creation
258
+ 'Content directory creation timed out');
259
+ // Generate filename with extension based on content type or display name
260
+ const extension = this.getExtensionFromContentType(contentType) ||
261
+ (metadata.displayName ? extname(metadata.displayName) : '.bin');
262
+ const filename = `${contentHash}${extension}`;
263
+ const contentPath = join(this.config.contentDir, filename);
264
+ // Use streaming write for large content to minimize memory usage
265
+ if (content.length > 10 * 1024 * 1024) { // Use streaming for content > 10MB
266
+ const writeResult = await withTimeout(this.streamingOps.writeBufferStreaming(safeBuffer.get(), contentPath), 180000, // 3 minute timeout for large content writing
267
+ 'Streaming write operation timed out');
268
+ // Log performance metrics for large content
269
+ if (content.length > 50 * 1024 * 1024) {
270
+ console.log(`Streaming write completed: ${formatBytes(writeResult.bytesWritten)} in ${formatProcessingTime(writeResult.processingTimeMs)}`);
271
+ }
272
+ // Track file for cleanup
273
+ globalResourceCleanup.addTempFile(transactionId, contentPath);
274
+ }
275
+ else {
276
+ // For smaller content, use atomic write with cleanup tracking
277
+ await withTimeout(writeFileAtomic(contentPath, safeBuffer.get(), transactionId), 60000, // 60 second timeout for file writing
278
+ 'File write operation timed out');
279
+ }
280
+ // Create content metadata
281
+ const contentMetadata = {
282
+ id: contentId,
283
+ storageType: 'content_dir',
284
+ originalPath: metadata.originalPath,
285
+ contentPath,
286
+ displayName: metadata.displayName,
287
+ contentType,
288
+ fileSize: content.length,
289
+ contentHash
290
+ };
291
+ // Insert content metadata with cleanup tracking
292
+ globalResourceCleanup.addDatabaseEntry(transactionId, this.db, contentId);
293
+ await withTimeout(insertContentMetadata(this.db, contentMetadata), 10000, // 10 second timeout for database insertion
294
+ 'Database insertion timed out');
295
+ // Update storage statistics if tracking is enabled
296
+ if (this.config.enableStorageTracking) {
297
+ try {
298
+ await withTimeout(this.updateStorageStats(), 15000, // 15 second timeout for stats update
299
+ 'Storage stats update timed out');
300
+ }
301
+ catch (error) {
302
+ // Don't fail the operation if stats update fails
303
+ console.warn('Failed to update storage stats after ingestion:', error);
304
+ }
305
+ }
306
+ return {
307
+ contentId,
308
+ wasDeduped: false,
309
+ storageType: 'content_dir',
310
+ contentPath
311
+ };
312
+ }
313
+ catch (error) {
314
+ if (error instanceof ContentIngestionError ||
315
+ error instanceof InvalidContentFormatError ||
316
+ error instanceof StorageLimitExceededError) {
317
+ throw error; // Re-throw content-specific errors
318
+ }
319
+ ContentErrorHandler.handleContentError(error, 'memory ingestion', 'ingestFromMemory');
320
+ }
321
+ finally {
322
+ // Clear sensitive buffer data
323
+ safeBuffer.clear();
324
+ }
325
+ }, 120000); // 2 minute overall timeout for the entire operation
326
+ }
327
+ /**
328
+ * Generates a stable content ID using SHA-256 hash of content
329
+ * @param content - Buffer containing the content
330
+ * @returns Content ID string
331
+ */
332
+ generateContentId(content) {
333
+ return this.generateContentHash(content);
334
+ }
335
+ /**
336
+ * Generates a unique content ID from an existing hash
337
+ * @param hash - Content hash
338
+ * @returns Content ID string
339
+ */
340
+ generateContentIdFromHash(hash) {
341
+ return hash;
342
+ }
343
+ /**
344
+ * Gets performance statistics for monitoring and optimization
345
+ * @returns Performance statistics
346
+ */
347
+ getPerformanceStats() {
348
+ const cacheStats = this.performanceOptimizer.getHashCacheStats();
349
+ const operationStats = this.performanceOptimizer.getPerformanceStats();
350
+ return {
351
+ hashCache: cacheStats,
352
+ operations: operationStats
353
+ };
354
+ }
355
+ /**
356
+ * Clears performance caches and resets metrics
357
+ */
358
+ clearPerformanceCaches() {
359
+ this.performanceOptimizer.clearHashCache();
360
+ }
361
+ /**
362
+ * Checks if content with given ID already exists (deduplication check)
363
+ * @param contentId - Content ID to check
364
+ * @returns Promise that resolves to true if content exists, false otherwise
365
+ */
366
+ async deduplicateContent(contentId) {
367
+ try {
368
+ const existing = await getContentMetadataByHash(this.db, contentId);
369
+ return existing !== null;
370
+ }
371
+ catch (error) {
372
+ throw new Error(`Failed to check for duplicate content: ${error instanceof Error ? error.message : 'Unknown error'}`);
373
+ }
374
+ }
375
+ // =============================================================================
376
+ // STORAGE LIMIT ENFORCEMENT METHODS
377
+ // =============================================================================
378
+ /**
379
+ * Enforces storage limits before accepting new content
380
+ * @param contentSize - Size of content to add in bytes
381
+ * @returns Promise that resolves if content can be added, throws error otherwise
382
+ */
383
+ async enforceStorageLimits(contentSize) {
384
+ if (!this.config.enableStorageTracking) {
385
+ return; // Skip enforcement if tracking is disabled
386
+ }
387
+ try {
388
+ const stats = await this.getStorageStats();
389
+ const currentUsage = stats.contentDirectory.totalSize;
390
+ const projectedUsage = currentUsage + contentSize;
391
+ const maxSize = this.config.maxContentDirSize;
392
+ const currentPercent = (currentUsage / maxSize) * 100;
393
+ const projectedPercent = (projectedUsage / maxSize) * 100;
394
+ // Check if adding content would exceed error threshold
395
+ if (projectedPercent > this.config.storageErrorThreshold) {
396
+ const currentMB = Math.round((currentUsage / 1024 / 1024) * 100) / 100;
397
+ const maxMB = Math.round((maxSize / 1024 / 1024) * 100) / 100;
398
+ const contentMB = Math.round((contentSize / 1024 / 1024) * 100) / 100;
399
+ const remainingMB = Math.round(((maxSize - currentUsage) / 1024 / 1024) * 100) / 100;
400
+ throw new StorageLimitExceededError(currentMB, maxMB, contentMB, 'storage_enforcement');
401
+ }
402
+ // Check if adding content would exceed warning threshold
403
+ if (projectedPercent > this.config.storageWarningThreshold && currentPercent <= this.config.storageWarningThreshold) {
404
+ const currentMB = Math.round((currentUsage / 1024 / 1024) * 100) / 100;
405
+ const maxMB = Math.round((maxSize / 1024 / 1024) * 100) / 100;
406
+ console.warn(`⚠️ Storage Warning: Content directory usage will reach ${Math.round(projectedPercent)}% after adding this content.\n` +
407
+ `Current: ${currentMB}MB / ${maxMB}MB (${Math.round(currentPercent)}%)\n` +
408
+ `Consider running cleanup operations to free space.`);
409
+ }
410
+ }
411
+ catch (error) {
412
+ if (error instanceof Error && error.message.includes('Storage limit exceeded')) {
413
+ throw error; // Re-throw storage limit errors
414
+ }
415
+ // Log other errors but don't fail the operation
416
+ console.warn('Failed to enforce storage limits:', error);
417
+ }
418
+ }
419
+ /**
420
+ * Gets storage limit status and recommendations
421
+ * @returns Promise that resolves to storage limit status
422
+ */
423
+ async getStorageLimitStatus() {
424
+ try {
425
+ const stats = await this.getStorageStats();
426
+ const currentUsage = stats.contentDirectory.totalSize;
427
+ const maxSize = this.config.maxContentDirSize;
428
+ const currentPercent = (currentUsage / maxSize) * 100;
429
+ const isNearWarningThreshold = currentPercent >= this.config.storageWarningThreshold;
430
+ const isNearErrorThreshold = currentPercent >= this.config.storageErrorThreshold;
431
+ const canAcceptContent = currentPercent < this.config.storageErrorThreshold;
432
+ const recommendations = [];
433
+ if (isNearErrorThreshold) {
434
+ recommendations.push('🚨 URGENT: Storage is critically full - new content will be rejected');
435
+ recommendations.push('Run cleanup operations immediately: removeOrphanedFiles() and removeDuplicateContent()');
436
+ recommendations.push('Consider increasing storage limits or removing unused content');
437
+ }
438
+ else if (isNearWarningThreshold) {
439
+ recommendations.push('⚠️ WARNING: Storage is getting full');
440
+ recommendations.push('Consider running cleanup operations: removeOrphanedFiles() and removeDuplicateContent()');
441
+ recommendations.push('Monitor storage usage closely');
442
+ }
443
+ else if (currentPercent > 50) {
444
+ recommendations.push('ℹ️ Storage is over 50% full');
445
+ recommendations.push('Regular cleanup operations recommended');
446
+ }
447
+ else {
448
+ recommendations.push('✅ Storage usage is healthy');
449
+ }
450
+ return {
451
+ currentUsagePercent: Math.round(currentPercent * 100) / 100,
452
+ isNearWarningThreshold,
453
+ isNearErrorThreshold,
454
+ canAcceptContent,
455
+ recommendations,
456
+ limits: {
457
+ warningThreshold: this.config.storageWarningThreshold,
458
+ errorThreshold: this.config.storageErrorThreshold,
459
+ maxSizeMB: Math.round((maxSize / 1024 / 1024) * 100) / 100,
460
+ currentSizeMB: Math.round((currentUsage / 1024 / 1024) * 100) / 100,
461
+ remainingSizeMB: Math.round(((maxSize - currentUsage) / 1024 / 1024) * 100) / 100
462
+ }
463
+ };
464
+ }
465
+ catch (error) {
466
+ throw new Error(`Failed to get storage limit status: ${error instanceof Error ? error.message : 'Unknown error'}`);
467
+ }
468
+ }
469
+ // =============================================================================
470
+ // PRIVATE METHODS
471
+ // =============================================================================
472
+ /**
473
+ * Parses size string or number to bytes
474
+ * @param size - Size as number (bytes) or string like "50MB", "2GB"
475
+ * @returns Size in bytes
476
+ */
477
+ parseSizeToBytes(size) {
478
+ if (typeof size === 'number') {
479
+ return size;
480
+ }
481
+ const sizeStr = size.toString().trim().toUpperCase();
482
+ const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*(B|KB|MB|GB|TB)?$/);
483
+ if (!match) {
484
+ throw new Error(`Invalid size format: ${size}. Use formats like "50MB", "2GB", or number of bytes.`);
485
+ }
486
+ const value = parseFloat(match[1]);
487
+ const unit = match[2] || 'B';
488
+ const multipliers = {
489
+ 'B': 1,
490
+ 'KB': 1024,
491
+ 'MB': 1024 * 1024,
492
+ 'GB': 1024 * 1024 * 1024,
493
+ 'TB': 1024 * 1024 * 1024 * 1024
494
+ };
495
+ return Math.round(value * multipliers[unit]);
496
+ }
497
+ /**
498
+ * Generates SHA-256 hash of content
499
+ * @param content - Buffer containing the content
500
+ * @returns SHA-256 hash string
501
+ */
502
+ generateContentHash(content) {
503
+ return createHash('sha256').update(content).digest('hex');
504
+ }
505
+ /**
506
+ * Detects content type from file path and optionally content using enhanced magic number detection
507
+ * @param filePath - Path to the file
508
+ * @param content - File content buffer (optional)
509
+ * @returns MIME type string
510
+ */
511
+ detectContentType(filePath, content) {
512
+ const extension = extname(filePath).toLowerCase();
513
+ // First try magic number detection for more reliable identification (if content is available)
514
+ if (content) {
515
+ const magicBasedType = this.detectContentTypeByMagicNumbers(content);
516
+ if (magicBasedType !== 'application/octet-stream') {
517
+ return magicBasedType;
518
+ }
519
+ }
520
+ // Fall back to extension-based detection
521
+ const extensionBasedType = this.detectContentTypeByExtension(extension);
522
+ if (extensionBasedType !== 'application/octet-stream') {
523
+ return extensionBasedType;
524
+ }
525
+ // Final fallback: check if it's text content (if content is available)
526
+ if (content && this.isTextContent(content)) {
527
+ return 'text/plain';
528
+ }
529
+ return 'application/octet-stream';
530
+ }
531
+ /**
532
+ * Detects content type from buffer and optional filename for memory-based ingestion
533
+ * @param content - Content buffer
534
+ * @param filename - Optional filename for extension-based detection
535
+ * @returns MIME type string
536
+ */
537
+ detectContentTypeFromBuffer(content, filename) {
538
+ // Use filename if provided for more accurate detection
539
+ if (filename) {
540
+ return this.detectContentType(filename, content);
541
+ }
542
+ // Use magic number detection for buffer-only content
543
+ const magicBasedType = this.detectContentTypeByMagicNumbers(content);
544
+ if (magicBasedType !== 'application/octet-stream') {
545
+ return magicBasedType;
546
+ }
547
+ // Final fallback: check if it's text content
548
+ if (this.isTextContent(content)) {
549
+ return 'text/plain';
550
+ }
551
+ return 'application/octet-stream';
552
+ }
553
+ /**
554
+ * Enhanced magic number detection for comprehensive content type identification
555
+ * @param content - Content buffer to analyze
556
+ * @returns MIME type string based on magic numbers, or 'application/octet-stream' if unknown
557
+ */
558
+ detectContentTypeByMagicNumbers(content) {
559
+ if (content.length === 0) {
560
+ return 'application/octet-stream';
561
+ }
562
+ // Get enough bytes for magic number detection
563
+ const magicBytes = content.subarray(0, Math.min(32, content.length));
564
+ // PDF - %PDF
565
+ if (magicBytes.length >= 4 && magicBytes.subarray(0, 4).toString() === '%PDF') {
566
+ return 'application/pdf';
567
+ }
568
+ // PNG - 89 50 4E 47 0D 0A 1A 0A
569
+ if (magicBytes.length >= 8 &&
570
+ magicBytes.subarray(0, 8).equals(Buffer.from([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]))) {
571
+ return 'image/png';
572
+ }
573
+ // JPEG - FF D8 FF
574
+ if (magicBytes.length >= 3 &&
575
+ magicBytes.subarray(0, 3).equals(Buffer.from([0xFF, 0xD8, 0xFF]))) {
576
+ return 'image/jpeg';
577
+ }
578
+ // GIF87a or GIF89a
579
+ if (magicBytes.length >= 6) {
580
+ const gifHeader = magicBytes.subarray(0, 6).toString();
581
+ if (gifHeader === 'GIF87a' || gifHeader === 'GIF89a') {
582
+ return 'image/gif';
583
+ }
584
+ }
585
+ // WebP - RIFF....WEBP
586
+ if (magicBytes.length >= 12 &&
587
+ magicBytes.subarray(0, 4).toString() === 'RIFF' &&
588
+ magicBytes.subarray(8, 12).toString() === 'WEBP') {
589
+ return 'image/webp';
590
+ }
591
+ // ZIP-based formats (DOCX, XLSX, etc.) - 50 4B 03 04 or 50 4B 05 06 or 50 4B 07 08
592
+ if (magicBytes.length >= 4) {
593
+ const zipMagic = magicBytes.subarray(0, 4);
594
+ if (zipMagic.equals(Buffer.from([0x50, 0x4B, 0x03, 0x04])) ||
595
+ zipMagic.equals(Buffer.from([0x50, 0x4B, 0x05, 0x06])) ||
596
+ zipMagic.equals(Buffer.from([0x50, 0x4B, 0x07, 0x08]))) {
597
+ // For ZIP files, we need more context to determine the specific type
598
+ // This is a generic ZIP file, specific detection would require filename
599
+ return 'application/zip';
600
+ }
601
+ }
602
+ // BMP - 42 4D
603
+ if (magicBytes.length >= 2 &&
604
+ magicBytes.subarray(0, 2).equals(Buffer.from([0x42, 0x4D]))) {
605
+ return 'image/bmp';
606
+ }
607
+ // TIFF - 49 49 2A 00 (little endian) or 4D 4D 00 2A (big endian)
608
+ if (magicBytes.length >= 4) {
609
+ const tiffLE = Buffer.from([0x49, 0x49, 0x2A, 0x00]);
610
+ const tiffBE = Buffer.from([0x4D, 0x4D, 0x00, 0x2A]);
611
+ if (magicBytes.subarray(0, 4).equals(tiffLE) || magicBytes.subarray(0, 4).equals(tiffBE)) {
612
+ return 'image/tiff';
613
+ }
614
+ }
615
+ // ICO - 00 00 01 00
616
+ if (magicBytes.length >= 4 &&
617
+ magicBytes.subarray(0, 4).equals(Buffer.from([0x00, 0x00, 0x01, 0x00]))) {
618
+ return 'image/x-icon';
619
+ }
620
+ // SVG - Check for XML declaration and SVG tag
621
+ if (magicBytes.length >= 5) {
622
+ const start = magicBytes.toString('utf8', 0, Math.min(100, magicBytes.length)).toLowerCase();
623
+ if (start.includes('<svg') || (start.includes('<?xml') && start.includes('<svg'))) {
624
+ return 'image/svg+xml';
625
+ }
626
+ }
627
+ // HTML - Check for HTML tags
628
+ if (magicBytes.length >= 5) {
629
+ const start = magicBytes.toString('utf8', 0, Math.min(100, magicBytes.length)).toLowerCase();
630
+ if (start.includes('<!doctype html') || start.includes('<html') || start.includes('<head')) {
631
+ return 'text/html';
632
+ }
633
+ }
634
+ // XML - Check for XML declaration
635
+ if (magicBytes.length >= 5) {
636
+ const start = magicBytes.toString('utf8', 0, Math.min(50, magicBytes.length)).toLowerCase();
637
+ if (start.startsWith('<?xml')) {
638
+ return 'application/xml';
639
+ }
640
+ }
641
+ // JSON - Check for JSON structure (basic heuristic)
642
+ if (magicBytes.length >= 2) {
643
+ const start = magicBytes.toString('utf8', 0, Math.min(10, magicBytes.length)).trim();
644
+ if (start.startsWith('{') || start.startsWith('[')) {
645
+ // Additional validation to ensure it's likely JSON
646
+ try {
647
+ const sample = content.toString('utf8', 0, Math.min(1024, content.length));
648
+ JSON.parse(sample);
649
+ return 'application/json';
650
+ }
651
+ catch {
652
+ // Not valid JSON, continue with other detection
653
+ }
654
+ }
655
+ }
656
+ return 'application/octet-stream';
657
+ }
658
+ /**
659
+ * Extension-based content type detection with comprehensive mapping
660
+ * @param extension - File extension (with or without dot)
661
+ * @returns MIME type string based on extension, or 'application/octet-stream' if unknown
662
+ */
663
+ detectContentTypeByExtension(extension) {
664
+ const ext = extension.toLowerCase().startsWith('.') ? extension.toLowerCase() : `.${extension.toLowerCase()}`;
665
+ // Text formats
666
+ switch (ext) {
667
+ case '.txt':
668
+ case '.text':
669
+ return 'text/plain';
670
+ case '.md':
671
+ case '.markdown':
672
+ case '.mdown':
673
+ return 'text/markdown';
674
+ case '.html':
675
+ case '.htm':
676
+ return 'text/html';
677
+ case '.css':
678
+ return 'text/css';
679
+ case '.js':
680
+ case '.mjs':
681
+ return 'application/javascript';
682
+ case '.json':
683
+ return 'application/json';
684
+ case '.xml':
685
+ return 'application/xml';
686
+ case '.csv':
687
+ return 'text/csv';
688
+ case '.rtf':
689
+ return 'application/rtf';
690
+ // Document formats
691
+ case '.pdf':
692
+ return 'application/pdf';
693
+ case '.doc':
694
+ return 'application/msword';
695
+ case '.docx':
696
+ return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
697
+ case '.xls':
698
+ return 'application/vnd.ms-excel';
699
+ case '.xlsx':
700
+ return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
701
+ case '.ppt':
702
+ return 'application/vnd.ms-powerpoint';
703
+ case '.pptx':
704
+ return 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
705
+ case '.odt':
706
+ return 'application/vnd.oasis.opendocument.text';
707
+ case '.ods':
708
+ return 'application/vnd.oasis.opendocument.spreadsheet';
709
+ case '.odp':
710
+ return 'application/vnd.oasis.opendocument.presentation';
711
+ // Image formats
712
+ case '.jpg':
713
+ case '.jpeg':
714
+ return 'image/jpeg';
715
+ case '.png':
716
+ return 'image/png';
717
+ case '.gif':
718
+ return 'image/gif';
719
+ case '.webp':
720
+ return 'image/webp';
721
+ case '.bmp':
722
+ return 'image/bmp';
723
+ case '.tiff':
724
+ case '.tif':
725
+ return 'image/tiff';
726
+ case '.ico':
727
+ return 'image/x-icon';
728
+ case '.svg':
729
+ return 'image/svg+xml';
730
+ case '.avif':
731
+ return 'image/avif';
732
+ case '.heic':
733
+ case '.heif':
734
+ return 'image/heic';
735
+ // Archive formats
736
+ case '.zip':
737
+ return 'application/zip';
738
+ case '.rar':
739
+ return 'application/vnd.rar';
740
+ case '.7z':
741
+ return 'application/x-7z-compressed';
742
+ case '.tar':
743
+ return 'application/x-tar';
744
+ case '.gz':
745
+ return 'application/gzip';
746
+ // Audio formats
747
+ case '.mp3':
748
+ return 'audio/mpeg';
749
+ case '.wav':
750
+ return 'audio/wav';
751
+ case '.ogg':
752
+ return 'audio/ogg';
753
+ case '.flac':
754
+ return 'audio/flac';
755
+ // Video formats
756
+ case '.mp4':
757
+ return 'video/mp4';
758
+ case '.avi':
759
+ return 'video/x-msvideo';
760
+ case '.mov':
761
+ return 'video/quicktime';
762
+ case '.webm':
763
+ return 'video/webm';
764
+ default:
765
+ return 'application/octet-stream';
766
+ }
767
+ }
768
+ /**
769
+ * Validates if a content type is supported for processing
770
+ * @param contentType - MIME type to validate
771
+ * @returns Object with validation result and error message if unsupported
772
+ */
773
+ validateContentType(contentType) {
774
+ // Define supported content types for RAG-lite processing
775
+ const supportedTypes = new Set([
776
+ // Text formats (fully supported)
777
+ 'text/plain',
778
+ 'text/markdown',
779
+ 'text/html',
780
+ 'text/css',
781
+ 'text/csv',
782
+ 'application/json',
783
+ 'application/xml',
784
+ 'application/javascript',
785
+ 'application/rtf',
786
+ // Document formats (supported via preprocessing)
787
+ 'application/pdf',
788
+ 'application/msword',
789
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
790
+ 'application/vnd.ms-excel',
791
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
792
+ 'application/vnd.ms-powerpoint',
793
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
794
+ 'application/vnd.oasis.opendocument.text',
795
+ 'application/vnd.oasis.opendocument.spreadsheet',
796
+ 'application/vnd.oasis.opendocument.presentation',
797
+ // Image formats (supported via multimodal processing)
798
+ 'image/jpeg',
799
+ 'image/png',
800
+ 'image/gif',
801
+ 'image/webp',
802
+ 'image/bmp',
803
+ 'image/tiff',
804
+ 'image/svg+xml',
805
+ 'image/avif',
806
+ 'image/heic',
807
+ // Generic binary (accepted but limited processing)
808
+ 'application/octet-stream',
809
+ 'application/zip' // May contain supported documents
810
+ ]);
811
+ if (supportedTypes.has(contentType)) {
812
+ return { isSupported: true };
813
+ }
814
+ // Provide specific guidance for unsupported types
815
+ const category = contentType.split('/')[0];
816
+ let error = `Unsupported content type: ${contentType}. `;
817
+ switch (category) {
818
+ case 'audio':
819
+ error += 'Audio files are not supported for text-based RAG processing. Consider extracting transcripts or metadata.';
820
+ break;
821
+ case 'video':
822
+ error += 'Video files are not supported for text-based RAG processing. Consider extracting transcripts, subtitles, or metadata.';
823
+ break;
824
+ case 'application':
825
+ if (contentType.includes('executable') || contentType.includes('binary')) {
826
+ error += 'Executable and binary application files are not supported for security and processing reasons.';
827
+ }
828
+ else {
829
+ error += 'This application format is not currently supported. Supported formats include PDF, Office documents, and common text formats.';
830
+ }
831
+ break;
832
+ default:
833
+ error += `The ${category} content type is not supported. Supported types include text, documents (PDF, DOCX), and images.`;
834
+ }
835
+ return { isSupported: false, error };
836
+ }
837
+ /**
838
+ * Gets file extension from content type with enhanced mapping
839
+ * @param contentType - MIME type
840
+ * @returns File extension with dot, or null if unknown
841
+ */
842
+ getExtensionFromContentType(contentType) {
843
+ switch (contentType) {
844
+ // Text formats
845
+ case 'text/plain':
846
+ return '.txt';
847
+ case 'text/markdown':
848
+ return '.md';
849
+ case 'text/html':
850
+ return '.html';
851
+ case 'text/css':
852
+ return '.css';
853
+ case 'text/csv':
854
+ return '.csv';
855
+ case 'application/json':
856
+ return '.json';
857
+ case 'application/xml':
858
+ return '.xml';
859
+ case 'application/javascript':
860
+ return '.js';
861
+ case 'application/rtf':
862
+ return '.rtf';
863
+ // Document formats
864
+ case 'application/pdf':
865
+ return '.pdf';
866
+ case 'application/msword':
867
+ return '.doc';
868
+ case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
869
+ return '.docx';
870
+ case 'application/vnd.ms-excel':
871
+ return '.xls';
872
+ case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
873
+ return '.xlsx';
874
+ case 'application/vnd.ms-powerpoint':
875
+ return '.ppt';
876
+ case 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
877
+ return '.pptx';
878
+ case 'application/vnd.oasis.opendocument.text':
879
+ return '.odt';
880
+ case 'application/vnd.oasis.opendocument.spreadsheet':
881
+ return '.ods';
882
+ case 'application/vnd.oasis.opendocument.presentation':
883
+ return '.odp';
884
+ // Image formats
885
+ case 'image/jpeg':
886
+ return '.jpg';
887
+ case 'image/png':
888
+ return '.png';
889
+ case 'image/gif':
890
+ return '.gif';
891
+ case 'image/webp':
892
+ return '.webp';
893
+ case 'image/bmp':
894
+ return '.bmp';
895
+ case 'image/tiff':
896
+ return '.tiff';
897
+ case 'image/x-icon':
898
+ return '.ico';
899
+ case 'image/svg+xml':
900
+ return '.svg';
901
+ case 'image/avif':
902
+ return '.avif';
903
+ case 'image/heic':
904
+ return '.heic';
905
+ // Archive formats
906
+ case 'application/zip':
907
+ return '.zip';
908
+ case 'application/vnd.rar':
909
+ return '.rar';
910
+ case 'application/x-7z-compressed':
911
+ return '.7z';
912
+ case 'application/x-tar':
913
+ return '.tar';
914
+ case 'application/gzip':
915
+ return '.gz';
916
+ default:
917
+ return '.bin'; // Generic binary extension for unknown types
918
+ }
919
+ }
920
+ /**
921
+ * Enhanced text content detection with better UTF-8 and encoding support
922
+ * @param content - Content buffer
923
+ * @returns True if content appears to be text
924
+ */
925
+ isTextContent(content) {
926
+ if (content.length === 0) {
927
+ return true; // Empty content is considered text
928
+ }
929
+ // Check first 2KB for better accuracy
930
+ const sample = content.subarray(0, Math.min(2048, content.length));
931
+ let nonTextBytes = 0;
932
+ let totalBytes = sample.length;
933
+ // Skip UTF-8 BOM if present
934
+ let startIndex = 0;
935
+ if (sample.length >= 3 &&
936
+ sample[0] === 0xEF && sample[1] === 0xBB && sample[2] === 0xBF) {
937
+ startIndex = 3;
938
+ }
939
+ // Skip UTF-16 BOM if present
940
+ if (sample.length >= 2 &&
941
+ ((sample[0] === 0xFF && sample[1] === 0xFE) ||
942
+ (sample[0] === 0xFE && sample[1] === 0xFF))) {
943
+ startIndex = 2;
944
+ }
945
+ for (let i = startIndex; i < sample.length; i++) {
946
+ const byte = sample[i];
947
+ // Allow common control characters
948
+ if (byte === 9 || byte === 10 || byte === 13) { // Tab, LF, CR
949
+ continue;
950
+ }
951
+ // Allow printable ASCII (32-126)
952
+ if (byte >= 32 && byte <= 126) {
953
+ continue;
954
+ }
955
+ // Allow extended ASCII and UTF-8 continuation bytes
956
+ if (byte >= 128) {
957
+ // Check if this is part of a valid UTF-8 sequence
958
+ if (this.isValidUTF8Byte(sample, i)) {
959
+ continue;
960
+ }
961
+ }
962
+ // Count non-text bytes
963
+ nonTextBytes++;
964
+ }
965
+ // Consider it text if less than 5% of bytes are non-text
966
+ const nonTextRatio = nonTextBytes / totalBytes;
967
+ return nonTextRatio < 0.05;
968
+ }
969
+ /**
970
+ * Checks if a byte at given position is part of a valid UTF-8 sequence
971
+ * @param buffer - Buffer to check
972
+ * @param index - Index of the byte to check
973
+ * @returns True if the byte is part of valid UTF-8
974
+ */
975
+ isValidUTF8Byte(buffer, index) {
976
+ const byte = buffer[index];
977
+ // UTF-8 continuation byte (10xxxxxx)
978
+ if ((byte & 0xC0) === 0x80) {
979
+ return true;
980
+ }
981
+ // UTF-8 start bytes
982
+ if ((byte & 0xE0) === 0xC0) { // 110xxxxx - 2-byte sequence
983
+ return index + 1 < buffer.length && (buffer[index + 1] & 0xC0) === 0x80;
984
+ }
985
+ if ((byte & 0xF0) === 0xE0) { // 1110xxxx - 3-byte sequence
986
+ return index + 2 < buffer.length &&
987
+ (buffer[index + 1] & 0xC0) === 0x80 &&
988
+ (buffer[index + 2] & 0xC0) === 0x80;
989
+ }
990
+ if ((byte & 0xF8) === 0xF0) { // 11110xxx - 4-byte sequence
991
+ return index + 3 < buffer.length &&
992
+ (buffer[index + 1] & 0xC0) === 0x80 &&
993
+ (buffer[index + 2] & 0xC0) === 0x80 &&
994
+ (buffer[index + 3] & 0xC0) === 0x80;
995
+ }
996
+ // Extended ASCII (128-255) - allow but consider less reliable
997
+ return byte >= 128 && byte <= 255;
998
+ }
999
+ /**
1000
+ * Ensures content directory exists
1001
+ * @returns Promise that resolves when directory is created
1002
+ */
1003
+ async ensureContentDirectory() {
1004
+ try {
1005
+ await fs.mkdir(this.config.contentDir, { recursive: true });
1006
+ }
1007
+ catch (error) {
1008
+ throw new Error(`Failed to create content directory: ${error instanceof Error ? error.message : 'Unknown error'}`);
1009
+ }
1010
+ }
1011
+ // =============================================================================
1012
+ // CONTENT DIRECTORY MANAGEMENT METHODS
1013
+ // =============================================================================
1014
+ /**
1015
+ * Gets comprehensive storage statistics for monitoring and reporting
1016
+ * @returns Promise that resolves to detailed storage statistics
1017
+ */
1018
+ async getStorageStats() {
1019
+ try {
1020
+ const dbStats = await getStorageStats(this.db);
1021
+ if (!dbStats) {
1022
+ // Initialize stats if they don't exist
1023
+ await this.updateStorageStats();
1024
+ return this.getStorageStats(); // Recursive call after initialization
1025
+ }
1026
+ // Calculate filesystem references total size
1027
+ const filesystemContent = await getContentMetadataByStorageType(this.db, 'filesystem');
1028
+ const filesystemTotalSize = filesystemContent.reduce((sum, meta) => sum + meta.fileSize, 0);
1029
+ // Calculate derived statistics
1030
+ const contentDirSizeMB = Math.round((dbStats.contentDirSize / 1024 / 1024) * 100) / 100;
1031
+ const filesystemSizeMB = Math.round((filesystemTotalSize / 1024 / 1024) * 100) / 100;
1032
+ const maxSizeMB = Math.round((this.config.maxContentDirSize / 1024 / 1024) * 100) / 100;
1033
+ const averageFileSize = dbStats.contentDirFiles > 0
1034
+ ? Math.round(dbStats.contentDirSize / dbStats.contentDirFiles)
1035
+ : 0;
1036
+ const totalContentItems = dbStats.contentDirFiles + dbStats.filesystemRefs;
1037
+ const totalStorageUsed = dbStats.contentDirSize + filesystemTotalSize;
1038
+ const totalStorageUsedMB = Math.round((totalStorageUsed / 1024 / 1024) * 100) / 100;
1039
+ const currentUsagePercent = this.config.maxContentDirSize > 0
1040
+ ? Math.round((dbStats.contentDirSize / this.config.maxContentDirSize) * 10000) / 100
1041
+ : 0;
1042
+ const remainingSpace = Math.max(0, this.config.maxContentDirSize - dbStats.contentDirSize);
1043
+ const remainingSpaceMB = Math.round((remainingSpace / 1024 / 1024) * 100) / 100;
1044
+ // Calculate storage efficiency (how much space saved by deduplication)
1045
+ // This is a rough estimate based on the assumption that without deduplication,
1046
+ // we might have more duplicate files
1047
+ const storageEfficiency = totalContentItems > 0
1048
+ ? Math.round((totalContentItems / Math.max(1, totalContentItems)) * 100)
1049
+ : 100;
1050
+ return {
1051
+ contentDirectory: {
1052
+ totalFiles: dbStats.contentDirFiles,
1053
+ totalSize: dbStats.contentDirSize,
1054
+ totalSizeMB: contentDirSizeMB,
1055
+ averageFileSize
1056
+ },
1057
+ filesystemReferences: {
1058
+ totalRefs: dbStats.filesystemRefs,
1059
+ totalSize: filesystemTotalSize,
1060
+ totalSizeMB: filesystemSizeMB
1061
+ },
1062
+ overall: {
1063
+ totalContentItems,
1064
+ totalStorageUsed,
1065
+ totalStorageUsedMB,
1066
+ storageEfficiency
1067
+ },
1068
+ limits: {
1069
+ maxContentDirSize: this.config.maxContentDirSize,
1070
+ maxContentDirSizeMB: maxSizeMB,
1071
+ currentUsagePercent,
1072
+ remainingSpace,
1073
+ remainingSpaceMB
1074
+ },
1075
+ lastUpdated: new Date(),
1076
+ lastCleanup: dbStats.lastCleanup
1077
+ };
1078
+ }
1079
+ catch (error) {
1080
+ throw new Error(`Failed to get storage statistics: ${error instanceof Error ? error.message : 'Unknown error'}`);
1081
+ }
1082
+ }
1083
+ /**
1084
+ * Gets current storage statistics for the content directory (legacy method)
1085
+ * @returns Promise that resolves to storage statistics
1086
+ * @deprecated Use getStorageStats() for more comprehensive statistics
1087
+ */
1088
+ async getContentDirectoryStats() {
1089
+ try {
1090
+ const stats = await getStorageStats(this.db);
1091
+ if (!stats) {
1092
+ // Initialize stats if they don't exist
1093
+ await this.updateStorageStats();
1094
+ return {
1095
+ totalFiles: 0,
1096
+ totalSize: 0,
1097
+ filesystemRefs: 0,
1098
+ lastCleanup: null
1099
+ };
1100
+ }
1101
+ return {
1102
+ totalFiles: stats.contentDirFiles,
1103
+ totalSize: stats.contentDirSize,
1104
+ filesystemRefs: stats.filesystemRefs,
1105
+ lastCleanup: stats.lastCleanup
1106
+ };
1107
+ }
1108
+ catch (error) {
1109
+ throw new Error(`Failed to get content directory stats: ${error instanceof Error ? error.message : 'Unknown error'}`);
1110
+ }
1111
+ }
1112
+ /**
1113
+ * Generates a simple, human-readable storage usage report
1114
+ * @returns Promise that resolves to formatted storage report
1115
+ */
1116
+ async generateStorageReport() {
1117
+ try {
1118
+ const stats = await this.getStorageStats();
1119
+ const report = [
1120
+ '=== RAG-lite Content Storage Report ===',
1121
+ '',
1122
+ 'Content Directory:',
1123
+ ` Files: ${stats.contentDirectory.totalFiles}`,
1124
+ ` Size: ${stats.contentDirectory.totalSizeMB} MB`,
1125
+ ` Average file size: ${Math.round(stats.contentDirectory.averageFileSize / 1024)} KB`,
1126
+ '',
1127
+ 'Filesystem References:',
1128
+ ` References: ${stats.filesystemReferences.totalRefs}`,
1129
+ ` Total size: ${stats.filesystemReferences.totalSizeMB} MB`,
1130
+ '',
1131
+ 'Overall Usage:',
1132
+ ` Total content items: ${stats.overall.totalContentItems}`,
1133
+ ` Total storage used: ${stats.overall.totalStorageUsedMB} MB`,
1134
+ ` Storage efficiency: ${stats.overall.storageEfficiency}%`,
1135
+ '',
1136
+ 'Storage Limits:',
1137
+ ` Content directory limit: ${stats.limits.maxContentDirSizeMB} MB`,
1138
+ ` Current usage: ${stats.limits.currentUsagePercent}%`,
1139
+ ` Remaining space: ${stats.limits.remainingSpaceMB} MB`,
1140
+ '',
1141
+ 'Maintenance:',
1142
+ ` Last updated: ${stats.lastUpdated.toISOString()}`,
1143
+ ` Last cleanup: ${stats.lastCleanup ? stats.lastCleanup.toISOString() : 'Never'}`,
1144
+ ''
1145
+ ];
1146
+ // Add warnings if needed
1147
+ if (stats.limits.currentUsagePercent > 90) {
1148
+ report.push('⚠️ WARNING: Content directory is over 90% full!');
1149
+ report.push(' Consider running cleanup operations to free space.');
1150
+ report.push('');
1151
+ }
1152
+ else if (stats.limits.currentUsagePercent > 75) {
1153
+ report.push('⚠️ NOTICE: Content directory is over 75% full.');
1154
+ report.push(' You may want to run cleanup operations soon.');
1155
+ report.push('');
1156
+ }
1157
+ return report.join('\n');
1158
+ }
1159
+ catch (error) {
1160
+ throw new Error(`Failed to generate storage report: ${error instanceof Error ? error.message : 'Unknown error'}`);
1161
+ }
1162
+ }
1163
+ /**
1164
+ * Gets storage statistics in a format suitable for monitoring systems
1165
+ * @returns Promise that resolves to monitoring-friendly statistics
1166
+ */
1167
+ async getStorageMetrics() {
1168
+ try {
1169
+ const stats = await this.getStorageStats();
1170
+ return {
1171
+ contentDirFiles: stats.contentDirectory.totalFiles,
1172
+ contentDirSizeBytes: stats.contentDirectory.totalSize,
1173
+ contentDirSizeMB: stats.contentDirectory.totalSizeMB,
1174
+ filesystemRefs: stats.filesystemReferences.totalRefs,
1175
+ filesystemSizeBytes: stats.filesystemReferences.totalSize,
1176
+ filesystemSizeMB: stats.filesystemReferences.totalSizeMB,
1177
+ totalContentItems: stats.overall.totalContentItems,
1178
+ totalStorageBytes: stats.overall.totalStorageUsed,
1179
+ totalStorageMB: stats.overall.totalStorageUsedMB,
1180
+ usagePercent: stats.limits.currentUsagePercent,
1181
+ remainingBytes: stats.limits.remainingSpace,
1182
+ remainingMB: stats.limits.remainingSpaceMB,
1183
+ lastCleanupTimestamp: stats.lastCleanup ? stats.lastCleanup.getTime() : null,
1184
+ lastUpdatedTimestamp: stats.lastUpdated.getTime()
1185
+ };
1186
+ }
1187
+ catch (error) {
1188
+ throw new Error(`Failed to get storage metrics: ${error instanceof Error ? error.message : 'Unknown error'}`);
1189
+ }
1190
+ }
1191
+ /**
1192
+ * Updates storage statistics by scanning the content directory
1193
+ * @returns Promise that resolves when stats are updated
1194
+ */
1195
+ async updateStorageStats() {
1196
+ try {
1197
+ let contentDirFiles = 0;
1198
+ let contentDirSize = 0;
1199
+ let filesystemRefs = 0;
1200
+ // Count content directory files and size
1201
+ try {
1202
+ const contentDirContents = await fs.readdir(this.config.contentDir);
1203
+ for (const filename of contentDirContents) {
1204
+ const filePath = join(this.config.contentDir, filename);
1205
+ try {
1206
+ const stats = await fs.stat(filePath);
1207
+ if (stats.isFile()) {
1208
+ contentDirFiles++;
1209
+ contentDirSize += stats.size;
1210
+ }
1211
+ }
1212
+ catch {
1213
+ // Skip files that can't be accessed
1214
+ }
1215
+ }
1216
+ }
1217
+ catch {
1218
+ // Content directory doesn't exist or can't be read
1219
+ contentDirFiles = 0;
1220
+ contentDirSize = 0;
1221
+ }
1222
+ // Count filesystem references
1223
+ const filesystemContent = await getContentMetadataByStorageType(this.db, 'filesystem');
1224
+ filesystemRefs = filesystemContent.length;
1225
+ // Update database stats
1226
+ await updateStorageStats(this.db, {
1227
+ contentDirFiles,
1228
+ contentDirSize,
1229
+ filesystemRefs
1230
+ });
1231
+ }
1232
+ catch (error) {
1233
+ throw new Error(`Failed to update storage stats: ${error instanceof Error ? error.message : 'Unknown error'}`);
1234
+ }
1235
+ }
1236
+ /**
1237
+ * Checks if adding new content would exceed storage limits (legacy method)
1238
+ * @param contentSize - Size of content to add
1239
+ * @returns Promise that resolves to true if within limits, false otherwise
1240
+ * @deprecated Use enforceStorageLimits() for better error handling and guidance
1241
+ */
1242
+ async checkStorageLimits(contentSize) {
1243
+ try {
1244
+ const stats = await this.getContentDirectoryStats();
1245
+ return (stats.totalSize + contentSize) <= this.config.maxContentDirSize;
1246
+ }
1247
+ catch (error) {
1248
+ // If we can't get stats, allow the operation but log the error
1249
+ console.warn('Failed to check storage limits:', error);
1250
+ return true;
1251
+ }
1252
+ }
1253
+ /**
1254
+ * Removes orphaned files that exist in content directory but have no metadata references
1255
+ * @returns Promise that resolves to cleanup results
1256
+ */
1257
+ async removeOrphanedFiles() {
1258
+ return this.cleanupOrphanedFiles();
1259
+ }
1260
+ /**
1261
+ * Removes duplicate content files based on content hash, keeping the first occurrence
1262
+ * @returns Promise that resolves to deduplication results
1263
+ */
1264
+ async removeDuplicateContent() {
1265
+ return this.deduplicateContentFiles();
1266
+ }
1267
+ /**
1268
+ * Cleans up orphaned files in the content directory
1269
+ * Removes files that exist in the directory but have no corresponding metadata
1270
+ * @returns Promise that resolves to cleanup results
1271
+ */
1272
+ async cleanupOrphanedFiles() {
1273
+ const removedFiles = [];
1274
+ const errors = [];
1275
+ let freedSpace = 0;
1276
+ try {
1277
+ // Ensure content directory exists
1278
+ await this.ensureContentDirectory();
1279
+ // Get all content metadata for content_dir storage
1280
+ const contentMetadata = await getContentMetadataByStorageType(this.db, 'content_dir');
1281
+ const validPaths = new Set(contentMetadata.map(meta => meta.contentPath));
1282
+ // Scan content directory for files
1283
+ const contentDirContents = await fs.readdir(this.config.contentDir);
1284
+ for (const filename of contentDirContents) {
1285
+ const filePath = join(this.config.contentDir, filename);
1286
+ try {
1287
+ const stats = await fs.stat(filePath);
1288
+ if (stats.isFile() && !validPaths.has(filePath)) {
1289
+ // This file is orphaned - remove it
1290
+ await fs.unlink(filePath);
1291
+ removedFiles.push(filename);
1292
+ freedSpace += stats.size;
1293
+ }
1294
+ }
1295
+ catch (error) {
1296
+ errors.push(`Failed to process ${filename}: ${error instanceof Error ? error.message : 'Unknown error'}`);
1297
+ }
1298
+ }
1299
+ // Update storage stats after cleanup
1300
+ if (removedFiles.length > 0) {
1301
+ await this.updateStorageStats();
1302
+ // Update last cleanup time
1303
+ await updateStorageStats(this.db, {
1304
+ lastCleanup: new Date()
1305
+ });
1306
+ }
1307
+ return { removedFiles, errors, freedSpace };
1308
+ }
1309
+ catch (error) {
1310
+ throw new Error(`Failed to cleanup orphaned files: ${error instanceof Error ? error.message : 'Unknown error'}`);
1311
+ }
1312
+ }
1313
+ /**
1314
+ * Removes duplicate content files based on content hash
1315
+ * Keeps the first occurrence and removes duplicates
1316
+ * @returns Promise that resolves to deduplication results
1317
+ */
1318
+ async deduplicateContentFiles() {
1319
+ const removedFiles = [];
1320
+ const errors = [];
1321
+ let freedSpace = 0;
1322
+ try {
1323
+ // Get all content metadata for content_dir storage
1324
+ const contentMetadata = await getContentMetadataByStorageType(this.db, 'content_dir');
1325
+ // Group by content hash
1326
+ const hashGroups = new Map();
1327
+ for (const metadata of contentMetadata) {
1328
+ const hash = metadata.contentHash;
1329
+ if (!hashGroups.has(hash)) {
1330
+ hashGroups.set(hash, []);
1331
+ }
1332
+ hashGroups.get(hash).push(metadata);
1333
+ }
1334
+ // Process groups with duplicates
1335
+ for (const [hash, group] of hashGroups) {
1336
+ if (group.length > 1) {
1337
+ // Keep the first one, remove the rest
1338
+ const [keep, ...remove] = group.sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime());
1339
+ for (const duplicate of remove) {
1340
+ try {
1341
+ // Remove file
1342
+ const stats = await fs.stat(duplicate.contentPath);
1343
+ await fs.unlink(duplicate.contentPath);
1344
+ // Remove metadata
1345
+ await deleteContentMetadata(this.db, duplicate.id);
1346
+ removedFiles.push(basename(duplicate.contentPath));
1347
+ freedSpace += stats.size;
1348
+ }
1349
+ catch (error) {
1350
+ errors.push(`Failed to remove duplicate ${duplicate.id}: ${error instanceof Error ? error.message : 'Unknown error'}`);
1351
+ }
1352
+ }
1353
+ }
1354
+ }
1355
+ // Update storage stats after deduplication
1356
+ if (removedFiles.length > 0) {
1357
+ await this.updateStorageStats();
1358
+ }
1359
+ return { removedFiles, errors, freedSpace };
1360
+ }
1361
+ catch (error) {
1362
+ throw new Error(`Failed to deduplicate content files: ${error instanceof Error ? error.message : 'Unknown error'}`);
1363
+ }
1364
+ }
1365
+ /**
1366
+ * Ensures content directory has proper permissions
1367
+ * @returns Promise that resolves when permissions are set
1368
+ */
1369
+ async ensureContentDirectoryPermissions() {
1370
+ try {
1371
+ await this.ensureContentDirectory();
1372
+ // Set directory permissions to 755 (owner: rwx, group: rx, others: rx)
1373
+ await fs.chmod(this.config.contentDir, 0o755);
1374
+ }
1375
+ catch (error) {
1376
+ throw new Error(`Failed to set content directory permissions: ${error instanceof Error ? error.message : 'Unknown error'}`);
1377
+ }
1378
+ }
1379
+ /**
1380
+ * Validates content directory structure and repairs if needed
1381
+ * @returns Promise that resolves to validation results
1382
+ */
1383
+ async validateAndRepairContentDirectory() {
1384
+ const issues = [];
1385
+ const repaired = [];
1386
+ try {
1387
+ // Check if content directory exists
1388
+ try {
1389
+ const stats = await fs.stat(this.config.contentDir);
1390
+ if (!stats.isDirectory()) {
1391
+ issues.push('Content path exists but is not a directory');
1392
+ }
1393
+ }
1394
+ catch {
1395
+ // Directory doesn't exist - create it
1396
+ await this.ensureContentDirectory();
1397
+ repaired.push('Created missing content directory');
1398
+ }
1399
+ // Check permissions
1400
+ try {
1401
+ await fs.access(this.config.contentDir, fs.constants.R_OK | fs.constants.W_OK);
1402
+ }
1403
+ catch {
1404
+ issues.push('Content directory is not readable/writable');
1405
+ try {
1406
+ await this.ensureContentDirectoryPermissions();
1407
+ repaired.push('Fixed content directory permissions');
1408
+ }
1409
+ catch {
1410
+ issues.push('Failed to fix content directory permissions');
1411
+ }
1412
+ }
1413
+ // Validate storage stats consistency
1414
+ try {
1415
+ const dbStats = await getStorageStats(this.db);
1416
+ const actualStats = await this.getActualDirectoryStats();
1417
+ if (!dbStats ||
1418
+ dbStats.contentDirFiles !== actualStats.files ||
1419
+ Math.abs(dbStats.contentDirSize - actualStats.size) > 1024) { // Allow 1KB tolerance
1420
+ await this.updateStorageStats();
1421
+ repaired.push('Updated inconsistent storage statistics');
1422
+ }
1423
+ }
1424
+ catch (error) {
1425
+ issues.push(`Failed to validate storage stats: ${error instanceof Error ? error.message : 'Unknown error'}`);
1426
+ }
1427
+ return {
1428
+ isValid: issues.length === 0,
1429
+ issues,
1430
+ repaired
1431
+ };
1432
+ }
1433
+ catch (error) {
1434
+ throw new Error(`Failed to validate content directory: ${error instanceof Error ? error.message : 'Unknown error'}`);
1435
+ }
1436
+ }
1437
+ /**
1438
+ * Gets actual directory statistics by scanning the filesystem
1439
+ * @returns Promise that resolves to actual directory stats
1440
+ */
1441
+ async getActualDirectoryStats() {
1442
+ let files = 0;
1443
+ let size = 0;
1444
+ try {
1445
+ const contentDirContents = await fs.readdir(this.config.contentDir);
1446
+ for (const filename of contentDirContents) {
1447
+ const filePath = join(this.config.contentDir, filename);
1448
+ try {
1449
+ const stats = await fs.stat(filePath);
1450
+ if (stats.isFile()) {
1451
+ files++;
1452
+ size += stats.size;
1453
+ }
1454
+ }
1455
+ catch {
1456
+ // Skip files that can't be accessed
1457
+ }
1458
+ }
1459
+ }
1460
+ catch {
1461
+ // Directory doesn't exist or can't be read
1462
+ }
1463
+ return { files, size };
1464
+ }
1465
+ /**
1466
+ * Cleanup resources to prevent memory leaks and hanging processes
1467
+ * Should be called when ContentManager is no longer needed
1468
+ */
1469
+ cleanup() {
1470
+ // Clean up performance optimizer interval that prevents process exit
1471
+ if (this.performanceOptimizer && typeof this.performanceOptimizer.cleanup === 'function') {
1472
+ this.performanceOptimizer.cleanup();
1473
+ }
1474
+ }
1475
+ }
1476
+ //# sourceMappingURL=content-manager.js.map