amd-gaia 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (800)
  1. amd_gaia-0.14.1.dist-info/METADATA +768 -0
  2. amd_gaia-0.14.1.dist-info/RECORD +800 -0
  3. amd_gaia-0.14.1.dist-info/WHEEL +5 -0
  4. amd_gaia-0.14.1.dist-info/entry_points.txt +5 -0
  5. amd_gaia-0.14.1.dist-info/licenses/LICENSE.md +21 -0
  6. amd_gaia-0.14.1.dist-info/top_level.txt +1 -0
  7. gaia/__init__.py +2 -0
  8. gaia/agents/__init__.py +19 -0
  9. gaia/agents/base/__init__.py +9 -0
  10. gaia/agents/base/agent.py +2072 -0
  11. gaia/agents/base/api_agent.py +120 -0
  12. gaia/agents/base/console.py +1457 -0
  13. gaia/agents/base/mcp_agent.py +86 -0
  14. gaia/agents/base/tools.py +83 -0
  15. gaia/agents/blender/agent.py +556 -0
  16. gaia/agents/blender/agent_simple.py +135 -0
  17. gaia/agents/blender/app.py +211 -0
  18. gaia/agents/blender/app_simple.py +41 -0
  19. gaia/agents/blender/core/__init__.py +16 -0
  20. gaia/agents/blender/core/materials.py +506 -0
  21. gaia/agents/blender/core/objects.py +316 -0
  22. gaia/agents/blender/core/rendering.py +225 -0
  23. gaia/agents/blender/core/scene.py +220 -0
  24. gaia/agents/blender/core/view.py +146 -0
  25. gaia/agents/chat/__init__.py +9 -0
  26. gaia/agents/chat/agent.py +975 -0
  27. gaia/agents/chat/app.py +1058 -0
  28. gaia/agents/chat/session.py +508 -0
  29. gaia/agents/chat/tools/__init__.py +15 -0
  30. gaia/agents/chat/tools/file_tools.py +96 -0
  31. gaia/agents/chat/tools/rag_tools.py +1729 -0
  32. gaia/agents/chat/tools/shell_tools.py +436 -0
  33. gaia/agents/code/__init__.py +7 -0
  34. gaia/agents/code/agent.py +547 -0
  35. gaia/agents/code/app.py +266 -0
  36. gaia/agents/code/models.py +135 -0
  37. gaia/agents/code/orchestration/__init__.py +24 -0
  38. gaia/agents/code/orchestration/checklist_executor.py +1739 -0
  39. gaia/agents/code/orchestration/checklist_generator.py +709 -0
  40. gaia/agents/code/orchestration/factories/__init__.py +9 -0
  41. gaia/agents/code/orchestration/factories/base.py +63 -0
  42. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -0
  43. gaia/agents/code/orchestration/factories/python_factory.py +106 -0
  44. gaia/agents/code/orchestration/orchestrator.py +610 -0
  45. gaia/agents/code/orchestration/project_analyzer.py +391 -0
  46. gaia/agents/code/orchestration/steps/__init__.py +67 -0
  47. gaia/agents/code/orchestration/steps/base.py +188 -0
  48. gaia/agents/code/orchestration/steps/error_handler.py +314 -0
  49. gaia/agents/code/orchestration/steps/nextjs.py +828 -0
  50. gaia/agents/code/orchestration/steps/python.py +307 -0
  51. gaia/agents/code/orchestration/template_catalog.py +463 -0
  52. gaia/agents/code/orchestration/workflows/__init__.py +14 -0
  53. gaia/agents/code/orchestration/workflows/base.py +80 -0
  54. gaia/agents/code/orchestration/workflows/nextjs.py +186 -0
  55. gaia/agents/code/orchestration/workflows/python.py +94 -0
  56. gaia/agents/code/prompts/__init__.py +11 -0
  57. gaia/agents/code/prompts/base_prompt.py +77 -0
  58. gaia/agents/code/prompts/code_patterns.py +1925 -0
  59. gaia/agents/code/prompts/nextjs_prompt.py +40 -0
  60. gaia/agents/code/prompts/python_prompt.py +109 -0
  61. gaia/agents/code/schema_inference.py +365 -0
  62. gaia/agents/code/system_prompt.py +41 -0
  63. gaia/agents/code/tools/__init__.py +42 -0
  64. gaia/agents/code/tools/cli_tools.py +1138 -0
  65. gaia/agents/code/tools/code_formatting.py +319 -0
  66. gaia/agents/code/tools/code_tools.py +769 -0
  67. gaia/agents/code/tools/error_fixing.py +1347 -0
  68. gaia/agents/code/tools/external_tools.py +180 -0
  69. gaia/agents/code/tools/file_io.py +845 -0
  70. gaia/agents/code/tools/prisma_tools.py +190 -0
  71. gaia/agents/code/tools/project_management.py +1016 -0
  72. gaia/agents/code/tools/testing.py +321 -0
  73. gaia/agents/code/tools/typescript_tools.py +122 -0
  74. gaia/agents/code/tools/validation_parsing.py +461 -0
  75. gaia/agents/code/tools/validation_tools.py +803 -0
  76. gaia/agents/code/tools/web_dev_tools.py +1744 -0
  77. gaia/agents/code/validators/__init__.py +16 -0
  78. gaia/agents/code/validators/antipattern_checker.py +241 -0
  79. gaia/agents/code/validators/ast_analyzer.py +197 -0
  80. gaia/agents/code/validators/requirements_validator.py +145 -0
  81. gaia/agents/code/validators/syntax_validator.py +171 -0
  82. gaia/agents/docker/__init__.py +7 -0
  83. gaia/agents/docker/agent.py +642 -0
  84. gaia/agents/jira/__init__.py +11 -0
  85. gaia/agents/jira/agent.py +894 -0
  86. gaia/agents/jira/jql_templates.py +299 -0
  87. gaia/agents/routing/__init__.py +7 -0
  88. gaia/agents/routing/agent.py +512 -0
  89. gaia/agents/routing/system_prompt.py +75 -0
  90. gaia/api/__init__.py +23 -0
  91. gaia/api/agent_registry.py +238 -0
  92. gaia/api/app.py +305 -0
  93. gaia/api/openai_server.py +575 -0
  94. gaia/api/schemas.py +186 -0
  95. gaia/api/sse_handler.py +370 -0
  96. gaia/apps/__init__.py +4 -0
  97. gaia/apps/llm/__init__.py +6 -0
  98. gaia/apps/llm/app.py +169 -0
  99. gaia/apps/summarize/app.py +633 -0
  100. gaia/apps/summarize/html_viewer.py +133 -0
  101. gaia/apps/summarize/pdf_formatter.py +284 -0
  102. gaia/audio/__init__.py +2 -0
  103. gaia/audio/audio_client.py +439 -0
  104. gaia/audio/audio_recorder.py +269 -0
  105. gaia/audio/kokoro_tts.py +599 -0
  106. gaia/audio/whisper_asr.py +432 -0
  107. gaia/chat/__init__.py +16 -0
  108. gaia/chat/app.py +430 -0
  109. gaia/chat/prompts.py +522 -0
  110. gaia/chat/sdk.py +1200 -0
  111. gaia/cli.py +5621 -0
  112. gaia/eval/batch_experiment.py +2332 -0
  113. gaia/eval/claude.py +542 -0
  114. gaia/eval/config.py +37 -0
  115. gaia/eval/email_generator.py +512 -0
  116. gaia/eval/eval.py +3179 -0
  117. gaia/eval/groundtruth.py +1130 -0
  118. gaia/eval/transcript_generator.py +582 -0
  119. gaia/eval/webapp/README.md +168 -0
  120. gaia/eval/webapp/node_modules/.bin/mime +16 -0
  121. gaia/eval/webapp/node_modules/.bin/mime.cmd +17 -0
  122. gaia/eval/webapp/node_modules/.bin/mime.ps1 +28 -0
  123. gaia/eval/webapp/node_modules/.package-lock.json +865 -0
  124. gaia/eval/webapp/node_modules/accepts/HISTORY.md +243 -0
  125. gaia/eval/webapp/node_modules/accepts/LICENSE +23 -0
  126. gaia/eval/webapp/node_modules/accepts/README.md +140 -0
  127. gaia/eval/webapp/node_modules/accepts/index.js +238 -0
  128. gaia/eval/webapp/node_modules/accepts/package.json +47 -0
  129. gaia/eval/webapp/node_modules/array-flatten/LICENSE +21 -0
  130. gaia/eval/webapp/node_modules/array-flatten/README.md +43 -0
  131. gaia/eval/webapp/node_modules/array-flatten/array-flatten.js +64 -0
  132. gaia/eval/webapp/node_modules/array-flatten/package.json +39 -0
  133. gaia/eval/webapp/node_modules/body-parser/HISTORY.md +672 -0
  134. gaia/eval/webapp/node_modules/body-parser/LICENSE +23 -0
  135. gaia/eval/webapp/node_modules/body-parser/README.md +476 -0
  136. gaia/eval/webapp/node_modules/body-parser/SECURITY.md +25 -0
  137. gaia/eval/webapp/node_modules/body-parser/index.js +156 -0
  138. gaia/eval/webapp/node_modules/body-parser/lib/read.js +205 -0
  139. gaia/eval/webapp/node_modules/body-parser/lib/types/json.js +247 -0
  140. gaia/eval/webapp/node_modules/body-parser/lib/types/raw.js +101 -0
  141. gaia/eval/webapp/node_modules/body-parser/lib/types/text.js +121 -0
  142. gaia/eval/webapp/node_modules/body-parser/lib/types/urlencoded.js +307 -0
  143. gaia/eval/webapp/node_modules/body-parser/package.json +56 -0
  144. gaia/eval/webapp/node_modules/bytes/History.md +97 -0
  145. gaia/eval/webapp/node_modules/bytes/LICENSE +23 -0
  146. gaia/eval/webapp/node_modules/bytes/Readme.md +152 -0
  147. gaia/eval/webapp/node_modules/bytes/index.js +170 -0
  148. gaia/eval/webapp/node_modules/bytes/package.json +42 -0
  149. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.eslintrc +17 -0
  150. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.github/FUNDING.yml +12 -0
  151. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.nycrc +9 -0
  152. gaia/eval/webapp/node_modules/call-bind-apply-helpers/CHANGELOG.md +30 -0
  153. gaia/eval/webapp/node_modules/call-bind-apply-helpers/LICENSE +21 -0
  154. gaia/eval/webapp/node_modules/call-bind-apply-helpers/README.md +62 -0
  155. gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.d.ts +1 -0
  156. gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.js +10 -0
  157. gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.d.ts +19 -0
  158. gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.js +10 -0
  159. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.d.ts +1 -0
  160. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.js +4 -0
  161. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.d.ts +1 -0
  162. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.js +4 -0
  163. gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.d.ts +64 -0
  164. gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.js +15 -0
  165. gaia/eval/webapp/node_modules/call-bind-apply-helpers/package.json +85 -0
  166. gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.d.ts +3 -0
  167. gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.js +4 -0
  168. gaia/eval/webapp/node_modules/call-bind-apply-helpers/test/index.js +63 -0
  169. gaia/eval/webapp/node_modules/call-bind-apply-helpers/tsconfig.json +9 -0
  170. gaia/eval/webapp/node_modules/call-bound/.eslintrc +13 -0
  171. gaia/eval/webapp/node_modules/call-bound/.github/FUNDING.yml +12 -0
  172. gaia/eval/webapp/node_modules/call-bound/.nycrc +9 -0
  173. gaia/eval/webapp/node_modules/call-bound/CHANGELOG.md +42 -0
  174. gaia/eval/webapp/node_modules/call-bound/LICENSE +21 -0
  175. gaia/eval/webapp/node_modules/call-bound/README.md +53 -0
  176. gaia/eval/webapp/node_modules/call-bound/index.d.ts +94 -0
  177. gaia/eval/webapp/node_modules/call-bound/index.js +19 -0
  178. gaia/eval/webapp/node_modules/call-bound/package.json +99 -0
  179. gaia/eval/webapp/node_modules/call-bound/test/index.js +61 -0
  180. gaia/eval/webapp/node_modules/call-bound/tsconfig.json +10 -0
  181. gaia/eval/webapp/node_modules/content-disposition/HISTORY.md +60 -0
  182. gaia/eval/webapp/node_modules/content-disposition/LICENSE +22 -0
  183. gaia/eval/webapp/node_modules/content-disposition/README.md +142 -0
  184. gaia/eval/webapp/node_modules/content-disposition/index.js +458 -0
  185. gaia/eval/webapp/node_modules/content-disposition/package.json +44 -0
  186. gaia/eval/webapp/node_modules/content-type/HISTORY.md +29 -0
  187. gaia/eval/webapp/node_modules/content-type/LICENSE +22 -0
  188. gaia/eval/webapp/node_modules/content-type/README.md +94 -0
  189. gaia/eval/webapp/node_modules/content-type/index.js +225 -0
  190. gaia/eval/webapp/node_modules/content-type/package.json +42 -0
  191. gaia/eval/webapp/node_modules/cookie/LICENSE +24 -0
  192. gaia/eval/webapp/node_modules/cookie/README.md +317 -0
  193. gaia/eval/webapp/node_modules/cookie/SECURITY.md +25 -0
  194. gaia/eval/webapp/node_modules/cookie/index.js +334 -0
  195. gaia/eval/webapp/node_modules/cookie/package.json +44 -0
  196. gaia/eval/webapp/node_modules/cookie-signature/.npmignore +4 -0
  197. gaia/eval/webapp/node_modules/cookie-signature/History.md +38 -0
  198. gaia/eval/webapp/node_modules/cookie-signature/Readme.md +42 -0
  199. gaia/eval/webapp/node_modules/cookie-signature/index.js +51 -0
  200. gaia/eval/webapp/node_modules/cookie-signature/package.json +18 -0
  201. gaia/eval/webapp/node_modules/debug/.coveralls.yml +1 -0
  202. gaia/eval/webapp/node_modules/debug/.eslintrc +11 -0
  203. gaia/eval/webapp/node_modules/debug/.npmignore +9 -0
  204. gaia/eval/webapp/node_modules/debug/.travis.yml +14 -0
  205. gaia/eval/webapp/node_modules/debug/CHANGELOG.md +362 -0
  206. gaia/eval/webapp/node_modules/debug/LICENSE +19 -0
  207. gaia/eval/webapp/node_modules/debug/Makefile +50 -0
  208. gaia/eval/webapp/node_modules/debug/README.md +312 -0
  209. gaia/eval/webapp/node_modules/debug/component.json +19 -0
  210. gaia/eval/webapp/node_modules/debug/karma.conf.js +70 -0
  211. gaia/eval/webapp/node_modules/debug/node.js +1 -0
  212. gaia/eval/webapp/node_modules/debug/package.json +49 -0
  213. gaia/eval/webapp/node_modules/debug/src/browser.js +185 -0
  214. gaia/eval/webapp/node_modules/debug/src/debug.js +202 -0
  215. gaia/eval/webapp/node_modules/debug/src/index.js +10 -0
  216. gaia/eval/webapp/node_modules/debug/src/inspector-log.js +15 -0
  217. gaia/eval/webapp/node_modules/debug/src/node.js +248 -0
  218. gaia/eval/webapp/node_modules/depd/History.md +103 -0
  219. gaia/eval/webapp/node_modules/depd/LICENSE +22 -0
  220. gaia/eval/webapp/node_modules/depd/Readme.md +280 -0
  221. gaia/eval/webapp/node_modules/depd/index.js +538 -0
  222. gaia/eval/webapp/node_modules/depd/lib/browser/index.js +77 -0
  223. gaia/eval/webapp/node_modules/depd/package.json +45 -0
  224. gaia/eval/webapp/node_modules/destroy/LICENSE +23 -0
  225. gaia/eval/webapp/node_modules/destroy/README.md +63 -0
  226. gaia/eval/webapp/node_modules/destroy/index.js +209 -0
  227. gaia/eval/webapp/node_modules/destroy/package.json +48 -0
  228. gaia/eval/webapp/node_modules/dunder-proto/.eslintrc +5 -0
  229. gaia/eval/webapp/node_modules/dunder-proto/.github/FUNDING.yml +12 -0
  230. gaia/eval/webapp/node_modules/dunder-proto/.nycrc +13 -0
  231. gaia/eval/webapp/node_modules/dunder-proto/CHANGELOG.md +24 -0
  232. gaia/eval/webapp/node_modules/dunder-proto/LICENSE +21 -0
  233. gaia/eval/webapp/node_modules/dunder-proto/README.md +54 -0
  234. gaia/eval/webapp/node_modules/dunder-proto/get.d.ts +5 -0
  235. gaia/eval/webapp/node_modules/dunder-proto/get.js +30 -0
  236. gaia/eval/webapp/node_modules/dunder-proto/package.json +76 -0
  237. gaia/eval/webapp/node_modules/dunder-proto/set.d.ts +5 -0
  238. gaia/eval/webapp/node_modules/dunder-proto/set.js +35 -0
  239. gaia/eval/webapp/node_modules/dunder-proto/test/get.js +34 -0
  240. gaia/eval/webapp/node_modules/dunder-proto/test/index.js +4 -0
  241. gaia/eval/webapp/node_modules/dunder-proto/test/set.js +50 -0
  242. gaia/eval/webapp/node_modules/dunder-proto/tsconfig.json +9 -0
  243. gaia/eval/webapp/node_modules/ee-first/LICENSE +22 -0
  244. gaia/eval/webapp/node_modules/ee-first/README.md +80 -0
  245. gaia/eval/webapp/node_modules/ee-first/index.js +95 -0
  246. gaia/eval/webapp/node_modules/ee-first/package.json +29 -0
  247. gaia/eval/webapp/node_modules/encodeurl/LICENSE +22 -0
  248. gaia/eval/webapp/node_modules/encodeurl/README.md +109 -0
  249. gaia/eval/webapp/node_modules/encodeurl/index.js +60 -0
  250. gaia/eval/webapp/node_modules/encodeurl/package.json +40 -0
  251. gaia/eval/webapp/node_modules/es-define-property/.eslintrc +13 -0
  252. gaia/eval/webapp/node_modules/es-define-property/.github/FUNDING.yml +12 -0
  253. gaia/eval/webapp/node_modules/es-define-property/.nycrc +9 -0
  254. gaia/eval/webapp/node_modules/es-define-property/CHANGELOG.md +29 -0
  255. gaia/eval/webapp/node_modules/es-define-property/LICENSE +21 -0
  256. gaia/eval/webapp/node_modules/es-define-property/README.md +49 -0
  257. gaia/eval/webapp/node_modules/es-define-property/index.d.ts +3 -0
  258. gaia/eval/webapp/node_modules/es-define-property/index.js +14 -0
  259. gaia/eval/webapp/node_modules/es-define-property/package.json +81 -0
  260. gaia/eval/webapp/node_modules/es-define-property/test/index.js +56 -0
  261. gaia/eval/webapp/node_modules/es-define-property/tsconfig.json +10 -0
  262. gaia/eval/webapp/node_modules/es-errors/.eslintrc +5 -0
  263. gaia/eval/webapp/node_modules/es-errors/.github/FUNDING.yml +12 -0
  264. gaia/eval/webapp/node_modules/es-errors/CHANGELOG.md +40 -0
  265. gaia/eval/webapp/node_modules/es-errors/LICENSE +21 -0
  266. gaia/eval/webapp/node_modules/es-errors/README.md +55 -0
  267. gaia/eval/webapp/node_modules/es-errors/eval.d.ts +3 -0
  268. gaia/eval/webapp/node_modules/es-errors/eval.js +4 -0
  269. gaia/eval/webapp/node_modules/es-errors/index.d.ts +3 -0
  270. gaia/eval/webapp/node_modules/es-errors/index.js +4 -0
  271. gaia/eval/webapp/node_modules/es-errors/package.json +80 -0
  272. gaia/eval/webapp/node_modules/es-errors/range.d.ts +3 -0
  273. gaia/eval/webapp/node_modules/es-errors/range.js +4 -0
  274. gaia/eval/webapp/node_modules/es-errors/ref.d.ts +3 -0
  275. gaia/eval/webapp/node_modules/es-errors/ref.js +4 -0
  276. gaia/eval/webapp/node_modules/es-errors/syntax.d.ts +3 -0
  277. gaia/eval/webapp/node_modules/es-errors/syntax.js +4 -0
  278. gaia/eval/webapp/node_modules/es-errors/test/index.js +19 -0
  279. gaia/eval/webapp/node_modules/es-errors/tsconfig.json +49 -0
  280. gaia/eval/webapp/node_modules/es-errors/type.d.ts +3 -0
  281. gaia/eval/webapp/node_modules/es-errors/type.js +4 -0
  282. gaia/eval/webapp/node_modules/es-errors/uri.d.ts +3 -0
  283. gaia/eval/webapp/node_modules/es-errors/uri.js +4 -0
  284. gaia/eval/webapp/node_modules/es-object-atoms/.eslintrc +16 -0
  285. gaia/eval/webapp/node_modules/es-object-atoms/.github/FUNDING.yml +12 -0
  286. gaia/eval/webapp/node_modules/es-object-atoms/CHANGELOG.md +37 -0
  287. gaia/eval/webapp/node_modules/es-object-atoms/LICENSE +21 -0
  288. gaia/eval/webapp/node_modules/es-object-atoms/README.md +63 -0
  289. gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.d.ts +3 -0
  290. gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.js +11 -0
  291. gaia/eval/webapp/node_modules/es-object-atoms/ToObject.d.ts +7 -0
  292. gaia/eval/webapp/node_modules/es-object-atoms/ToObject.js +10 -0
  293. gaia/eval/webapp/node_modules/es-object-atoms/index.d.ts +3 -0
  294. gaia/eval/webapp/node_modules/es-object-atoms/index.js +4 -0
  295. gaia/eval/webapp/node_modules/es-object-atoms/isObject.d.ts +3 -0
  296. gaia/eval/webapp/node_modules/es-object-atoms/isObject.js +6 -0
  297. gaia/eval/webapp/node_modules/es-object-atoms/package.json +80 -0
  298. gaia/eval/webapp/node_modules/es-object-atoms/test/index.js +38 -0
  299. gaia/eval/webapp/node_modules/es-object-atoms/tsconfig.json +6 -0
  300. gaia/eval/webapp/node_modules/escape-html/LICENSE +24 -0
  301. gaia/eval/webapp/node_modules/escape-html/Readme.md +43 -0
  302. gaia/eval/webapp/node_modules/escape-html/index.js +78 -0
  303. gaia/eval/webapp/node_modules/escape-html/package.json +24 -0
  304. gaia/eval/webapp/node_modules/etag/HISTORY.md +83 -0
  305. gaia/eval/webapp/node_modules/etag/LICENSE +22 -0
  306. gaia/eval/webapp/node_modules/etag/README.md +159 -0
  307. gaia/eval/webapp/node_modules/etag/index.js +131 -0
  308. gaia/eval/webapp/node_modules/etag/package.json +47 -0
  309. gaia/eval/webapp/node_modules/express/History.md +3656 -0
  310. gaia/eval/webapp/node_modules/express/LICENSE +24 -0
  311. gaia/eval/webapp/node_modules/express/Readme.md +260 -0
  312. gaia/eval/webapp/node_modules/express/index.js +11 -0
  313. gaia/eval/webapp/node_modules/express/lib/application.js +661 -0
  314. gaia/eval/webapp/node_modules/express/lib/express.js +116 -0
  315. gaia/eval/webapp/node_modules/express/lib/middleware/init.js +43 -0
  316. gaia/eval/webapp/node_modules/express/lib/middleware/query.js +47 -0
  317. gaia/eval/webapp/node_modules/express/lib/request.js +525 -0
  318. gaia/eval/webapp/node_modules/express/lib/response.js +1179 -0
  319. gaia/eval/webapp/node_modules/express/lib/router/index.js +673 -0
  320. gaia/eval/webapp/node_modules/express/lib/router/layer.js +181 -0
  321. gaia/eval/webapp/node_modules/express/lib/router/route.js +230 -0
  322. gaia/eval/webapp/node_modules/express/lib/utils.js +303 -0
  323. gaia/eval/webapp/node_modules/express/lib/view.js +182 -0
  324. gaia/eval/webapp/node_modules/express/package.json +102 -0
  325. gaia/eval/webapp/node_modules/finalhandler/HISTORY.md +210 -0
  326. gaia/eval/webapp/node_modules/finalhandler/LICENSE +22 -0
  327. gaia/eval/webapp/node_modules/finalhandler/README.md +147 -0
  328. gaia/eval/webapp/node_modules/finalhandler/SECURITY.md +25 -0
  329. gaia/eval/webapp/node_modules/finalhandler/index.js +341 -0
  330. gaia/eval/webapp/node_modules/finalhandler/package.json +47 -0
  331. gaia/eval/webapp/node_modules/forwarded/HISTORY.md +21 -0
  332. gaia/eval/webapp/node_modules/forwarded/LICENSE +22 -0
  333. gaia/eval/webapp/node_modules/forwarded/README.md +57 -0
  334. gaia/eval/webapp/node_modules/forwarded/index.js +90 -0
  335. gaia/eval/webapp/node_modules/forwarded/package.json +45 -0
  336. gaia/eval/webapp/node_modules/fresh/HISTORY.md +70 -0
  337. gaia/eval/webapp/node_modules/fresh/LICENSE +23 -0
  338. gaia/eval/webapp/node_modules/fresh/README.md +119 -0
  339. gaia/eval/webapp/node_modules/fresh/index.js +137 -0
  340. gaia/eval/webapp/node_modules/fresh/package.json +46 -0
  341. gaia/eval/webapp/node_modules/fs/README.md +9 -0
  342. gaia/eval/webapp/node_modules/fs/package.json +20 -0
  343. gaia/eval/webapp/node_modules/function-bind/.eslintrc +21 -0
  344. gaia/eval/webapp/node_modules/function-bind/.github/FUNDING.yml +12 -0
  345. gaia/eval/webapp/node_modules/function-bind/.github/SECURITY.md +3 -0
  346. gaia/eval/webapp/node_modules/function-bind/.nycrc +13 -0
  347. gaia/eval/webapp/node_modules/function-bind/CHANGELOG.md +136 -0
  348. gaia/eval/webapp/node_modules/function-bind/LICENSE +20 -0
  349. gaia/eval/webapp/node_modules/function-bind/README.md +46 -0
  350. gaia/eval/webapp/node_modules/function-bind/implementation.js +84 -0
  351. gaia/eval/webapp/node_modules/function-bind/index.js +5 -0
  352. gaia/eval/webapp/node_modules/function-bind/package.json +87 -0
  353. gaia/eval/webapp/node_modules/function-bind/test/.eslintrc +9 -0
  354. gaia/eval/webapp/node_modules/function-bind/test/index.js +252 -0
  355. gaia/eval/webapp/node_modules/get-intrinsic/.eslintrc +42 -0
  356. gaia/eval/webapp/node_modules/get-intrinsic/.github/FUNDING.yml +12 -0
  357. gaia/eval/webapp/node_modules/get-intrinsic/.nycrc +9 -0
  358. gaia/eval/webapp/node_modules/get-intrinsic/CHANGELOG.md +186 -0
  359. gaia/eval/webapp/node_modules/get-intrinsic/LICENSE +21 -0
  360. gaia/eval/webapp/node_modules/get-intrinsic/README.md +71 -0
  361. gaia/eval/webapp/node_modules/get-intrinsic/index.js +378 -0
  362. gaia/eval/webapp/node_modules/get-intrinsic/package.json +97 -0
  363. gaia/eval/webapp/node_modules/get-intrinsic/test/GetIntrinsic.js +274 -0
  364. gaia/eval/webapp/node_modules/get-proto/.eslintrc +10 -0
  365. gaia/eval/webapp/node_modules/get-proto/.github/FUNDING.yml +12 -0
  366. gaia/eval/webapp/node_modules/get-proto/.nycrc +9 -0
  367. gaia/eval/webapp/node_modules/get-proto/CHANGELOG.md +21 -0
  368. gaia/eval/webapp/node_modules/get-proto/LICENSE +21 -0
  369. gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.d.ts +5 -0
  370. gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.js +6 -0
  371. gaia/eval/webapp/node_modules/get-proto/README.md +50 -0
  372. gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.d.ts +3 -0
  373. gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.js +4 -0
  374. gaia/eval/webapp/node_modules/get-proto/index.d.ts +5 -0
  375. gaia/eval/webapp/node_modules/get-proto/index.js +27 -0
  376. gaia/eval/webapp/node_modules/get-proto/package.json +81 -0
  377. gaia/eval/webapp/node_modules/get-proto/test/index.js +68 -0
  378. gaia/eval/webapp/node_modules/get-proto/tsconfig.json +9 -0
  379. gaia/eval/webapp/node_modules/gopd/.eslintrc +16 -0
  380. gaia/eval/webapp/node_modules/gopd/.github/FUNDING.yml +12 -0
  381. gaia/eval/webapp/node_modules/gopd/CHANGELOG.md +45 -0
  382. gaia/eval/webapp/node_modules/gopd/LICENSE +21 -0
  383. gaia/eval/webapp/node_modules/gopd/README.md +40 -0
  384. gaia/eval/webapp/node_modules/gopd/gOPD.d.ts +1 -0
  385. gaia/eval/webapp/node_modules/gopd/gOPD.js +4 -0
  386. gaia/eval/webapp/node_modules/gopd/index.d.ts +5 -0
  387. gaia/eval/webapp/node_modules/gopd/index.js +15 -0
  388. gaia/eval/webapp/node_modules/gopd/package.json +77 -0
  389. gaia/eval/webapp/node_modules/gopd/test/index.js +36 -0
  390. gaia/eval/webapp/node_modules/gopd/tsconfig.json +9 -0
  391. gaia/eval/webapp/node_modules/has-symbols/.eslintrc +11 -0
  392. gaia/eval/webapp/node_modules/has-symbols/.github/FUNDING.yml +12 -0
  393. gaia/eval/webapp/node_modules/has-symbols/.nycrc +9 -0
  394. gaia/eval/webapp/node_modules/has-symbols/CHANGELOG.md +91 -0
  395. gaia/eval/webapp/node_modules/has-symbols/LICENSE +21 -0
  396. gaia/eval/webapp/node_modules/has-symbols/README.md +46 -0
  397. gaia/eval/webapp/node_modules/has-symbols/index.d.ts +3 -0
  398. gaia/eval/webapp/node_modules/has-symbols/index.js +14 -0
  399. gaia/eval/webapp/node_modules/has-symbols/package.json +111 -0
  400. gaia/eval/webapp/node_modules/has-symbols/shams.d.ts +3 -0
  401. gaia/eval/webapp/node_modules/has-symbols/shams.js +45 -0
  402. gaia/eval/webapp/node_modules/has-symbols/test/index.js +22 -0
  403. gaia/eval/webapp/node_modules/has-symbols/test/shams/core-js.js +29 -0
  404. gaia/eval/webapp/node_modules/has-symbols/test/shams/get-own-property-symbols.js +29 -0
  405. gaia/eval/webapp/node_modules/has-symbols/test/tests.js +58 -0
  406. gaia/eval/webapp/node_modules/has-symbols/tsconfig.json +10 -0
  407. gaia/eval/webapp/node_modules/hasown/.eslintrc +5 -0
  408. gaia/eval/webapp/node_modules/hasown/.github/FUNDING.yml +12 -0
  409. gaia/eval/webapp/node_modules/hasown/.nycrc +13 -0
  410. gaia/eval/webapp/node_modules/hasown/CHANGELOG.md +40 -0
  411. gaia/eval/webapp/node_modules/hasown/LICENSE +21 -0
  412. gaia/eval/webapp/node_modules/hasown/README.md +40 -0
  413. gaia/eval/webapp/node_modules/hasown/index.d.ts +3 -0
  414. gaia/eval/webapp/node_modules/hasown/index.js +8 -0
  415. gaia/eval/webapp/node_modules/hasown/package.json +92 -0
  416. gaia/eval/webapp/node_modules/hasown/tsconfig.json +6 -0
  417. gaia/eval/webapp/node_modules/http-errors/HISTORY.md +180 -0
  418. gaia/eval/webapp/node_modules/http-errors/LICENSE +23 -0
  419. gaia/eval/webapp/node_modules/http-errors/README.md +169 -0
  420. gaia/eval/webapp/node_modules/http-errors/index.js +289 -0
  421. gaia/eval/webapp/node_modules/http-errors/package.json +50 -0
  422. gaia/eval/webapp/node_modules/iconv-lite/Changelog.md +162 -0
  423. gaia/eval/webapp/node_modules/iconv-lite/LICENSE +21 -0
  424. gaia/eval/webapp/node_modules/iconv-lite/README.md +156 -0
  425. gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-codec.js +555 -0
  426. gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-data.js +176 -0
  427. gaia/eval/webapp/node_modules/iconv-lite/encodings/index.js +22 -0
  428. gaia/eval/webapp/node_modules/iconv-lite/encodings/internal.js +188 -0
  429. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-codec.js +72 -0
  430. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data-generated.js +451 -0
  431. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data.js +174 -0
  432. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/big5-added.json +122 -0
  433. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp936.json +264 -0
  434. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp949.json +273 -0
  435. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp950.json +177 -0
  436. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/eucjp.json +182 -0
  437. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gb18030-ranges.json +1 -0
  438. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gbk-added.json +55 -0
  439. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/shiftjis.json +125 -0
  440. gaia/eval/webapp/node_modules/iconv-lite/encodings/utf16.js +177 -0
  441. gaia/eval/webapp/node_modules/iconv-lite/encodings/utf7.js +290 -0
  442. gaia/eval/webapp/node_modules/iconv-lite/lib/bom-handling.js +52 -0
  443. gaia/eval/webapp/node_modules/iconv-lite/lib/extend-node.js +217 -0
  444. gaia/eval/webapp/node_modules/iconv-lite/lib/index.d.ts +24 -0
  445. gaia/eval/webapp/node_modules/iconv-lite/lib/index.js +153 -0
  446. gaia/eval/webapp/node_modules/iconv-lite/lib/streams.js +121 -0
  447. gaia/eval/webapp/node_modules/iconv-lite/package.json +46 -0
  448. gaia/eval/webapp/node_modules/inherits/LICENSE +16 -0
  449. gaia/eval/webapp/node_modules/inherits/README.md +42 -0
  450. gaia/eval/webapp/node_modules/inherits/inherits.js +9 -0
  451. gaia/eval/webapp/node_modules/inherits/inherits_browser.js +27 -0
  452. gaia/eval/webapp/node_modules/inherits/package.json +29 -0
  453. gaia/eval/webapp/node_modules/ipaddr.js/LICENSE +19 -0
  454. gaia/eval/webapp/node_modules/ipaddr.js/README.md +233 -0
  455. gaia/eval/webapp/node_modules/ipaddr.js/ipaddr.min.js +1 -0
  456. gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js +673 -0
  457. gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js.d.ts +68 -0
  458. gaia/eval/webapp/node_modules/ipaddr.js/package.json +35 -0
  459. gaia/eval/webapp/node_modules/math-intrinsics/.eslintrc +16 -0
  460. gaia/eval/webapp/node_modules/math-intrinsics/.github/FUNDING.yml +12 -0
  461. gaia/eval/webapp/node_modules/math-intrinsics/CHANGELOG.md +24 -0
  462. gaia/eval/webapp/node_modules/math-intrinsics/LICENSE +21 -0
  463. gaia/eval/webapp/node_modules/math-intrinsics/README.md +50 -0
  464. gaia/eval/webapp/node_modules/math-intrinsics/abs.d.ts +1 -0
  465. gaia/eval/webapp/node_modules/math-intrinsics/abs.js +4 -0
  466. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.d.ts +3 -0
  467. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.js +4 -0
  468. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.d.ts +3 -0
  469. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.js +5 -0
  470. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.d.ts +3 -0
  471. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.js +5 -0
  472. gaia/eval/webapp/node_modules/math-intrinsics/floor.d.ts +1 -0
  473. gaia/eval/webapp/node_modules/math-intrinsics/floor.js +4 -0
  474. gaia/eval/webapp/node_modules/math-intrinsics/isFinite.d.ts +3 -0
  475. gaia/eval/webapp/node_modules/math-intrinsics/isFinite.js +12 -0
  476. gaia/eval/webapp/node_modules/math-intrinsics/isInteger.d.ts +3 -0
  477. gaia/eval/webapp/node_modules/math-intrinsics/isInteger.js +16 -0
  478. gaia/eval/webapp/node_modules/math-intrinsics/isNaN.d.ts +1 -0
  479. gaia/eval/webapp/node_modules/math-intrinsics/isNaN.js +6 -0
  480. gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.d.ts +3 -0
  481. gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.js +6 -0
  482. gaia/eval/webapp/node_modules/math-intrinsics/max.d.ts +1 -0
  483. gaia/eval/webapp/node_modules/math-intrinsics/max.js +4 -0
  484. gaia/eval/webapp/node_modules/math-intrinsics/min.d.ts +1 -0
  485. gaia/eval/webapp/node_modules/math-intrinsics/min.js +4 -0
  486. gaia/eval/webapp/node_modules/math-intrinsics/mod.d.ts +3 -0
  487. gaia/eval/webapp/node_modules/math-intrinsics/mod.js +9 -0
  488. gaia/eval/webapp/node_modules/math-intrinsics/package.json +86 -0
  489. gaia/eval/webapp/node_modules/math-intrinsics/pow.d.ts +1 -0
  490. gaia/eval/webapp/node_modules/math-intrinsics/pow.js +4 -0
  491. gaia/eval/webapp/node_modules/math-intrinsics/round.d.ts +1 -0
  492. gaia/eval/webapp/node_modules/math-intrinsics/round.js +4 -0
  493. gaia/eval/webapp/node_modules/math-intrinsics/sign.d.ts +3 -0
  494. gaia/eval/webapp/node_modules/math-intrinsics/sign.js +11 -0
  495. gaia/eval/webapp/node_modules/math-intrinsics/test/index.js +192 -0
  496. gaia/eval/webapp/node_modules/math-intrinsics/tsconfig.json +3 -0
  497. gaia/eval/webapp/node_modules/media-typer/HISTORY.md +22 -0
  498. gaia/eval/webapp/node_modules/media-typer/LICENSE +22 -0
  499. gaia/eval/webapp/node_modules/media-typer/README.md +81 -0
  500. gaia/eval/webapp/node_modules/media-typer/index.js +270 -0
  501. gaia/eval/webapp/node_modules/media-typer/package.json +26 -0
  502. gaia/eval/webapp/node_modules/merge-descriptors/HISTORY.md +21 -0
  503. gaia/eval/webapp/node_modules/merge-descriptors/LICENSE +23 -0
  504. gaia/eval/webapp/node_modules/merge-descriptors/README.md +49 -0
  505. gaia/eval/webapp/node_modules/merge-descriptors/index.js +60 -0
  506. gaia/eval/webapp/node_modules/merge-descriptors/package.json +39 -0
  507. gaia/eval/webapp/node_modules/methods/HISTORY.md +29 -0
  508. gaia/eval/webapp/node_modules/methods/LICENSE +24 -0
  509. gaia/eval/webapp/node_modules/methods/README.md +51 -0
  510. gaia/eval/webapp/node_modules/methods/index.js +69 -0
  511. gaia/eval/webapp/node_modules/methods/package.json +36 -0
  512. gaia/eval/webapp/node_modules/mime/.npmignore +0 -0
  513. gaia/eval/webapp/node_modules/mime/CHANGELOG.md +164 -0
  514. gaia/eval/webapp/node_modules/mime/LICENSE +21 -0
  515. gaia/eval/webapp/node_modules/mime/README.md +90 -0
  516. gaia/eval/webapp/node_modules/mime/cli.js +8 -0
  517. gaia/eval/webapp/node_modules/mime/mime.js +108 -0
  518. gaia/eval/webapp/node_modules/mime/package.json +44 -0
  519. gaia/eval/webapp/node_modules/mime/src/build.js +53 -0
  520. gaia/eval/webapp/node_modules/mime/src/test.js +60 -0
  521. gaia/eval/webapp/node_modules/mime/types.json +1 -0
  522. gaia/eval/webapp/node_modules/mime-db/HISTORY.md +507 -0
  523. gaia/eval/webapp/node_modules/mime-db/LICENSE +23 -0
  524. gaia/eval/webapp/node_modules/mime-db/README.md +100 -0
  525. gaia/eval/webapp/node_modules/mime-db/db.json +8519 -0
  526. gaia/eval/webapp/node_modules/mime-db/index.js +12 -0
  527. gaia/eval/webapp/node_modules/mime-db/package.json +60 -0
  528. gaia/eval/webapp/node_modules/mime-types/HISTORY.md +397 -0
  529. gaia/eval/webapp/node_modules/mime-types/LICENSE +23 -0
  530. gaia/eval/webapp/node_modules/mime-types/README.md +113 -0
  531. gaia/eval/webapp/node_modules/mime-types/index.js +188 -0
  532. gaia/eval/webapp/node_modules/mime-types/package.json +44 -0
  533. gaia/eval/webapp/node_modules/ms/index.js +152 -0
  534. gaia/eval/webapp/node_modules/ms/license.md +21 -0
  535. gaia/eval/webapp/node_modules/ms/package.json +37 -0
  536. gaia/eval/webapp/node_modules/ms/readme.md +51 -0
  537. gaia/eval/webapp/node_modules/negotiator/HISTORY.md +108 -0
  538. gaia/eval/webapp/node_modules/negotiator/LICENSE +24 -0
  539. gaia/eval/webapp/node_modules/negotiator/README.md +203 -0
  540. gaia/eval/webapp/node_modules/negotiator/index.js +82 -0
  541. gaia/eval/webapp/node_modules/negotiator/lib/charset.js +169 -0
  542. gaia/eval/webapp/node_modules/negotiator/lib/encoding.js +184 -0
  543. gaia/eval/webapp/node_modules/negotiator/lib/language.js +179 -0
  544. gaia/eval/webapp/node_modules/negotiator/lib/mediaType.js +294 -0
  545. gaia/eval/webapp/node_modules/negotiator/package.json +42 -0
  546. gaia/eval/webapp/node_modules/object-inspect/.eslintrc +53 -0
  547. gaia/eval/webapp/node_modules/object-inspect/.github/FUNDING.yml +12 -0
  548. gaia/eval/webapp/node_modules/object-inspect/.nycrc +13 -0
  549. gaia/eval/webapp/node_modules/object-inspect/CHANGELOG.md +424 -0
  550. gaia/eval/webapp/node_modules/object-inspect/LICENSE +21 -0
  551. gaia/eval/webapp/node_modules/object-inspect/example/all.js +23 -0
  552. gaia/eval/webapp/node_modules/object-inspect/example/circular.js +6 -0
  553. gaia/eval/webapp/node_modules/object-inspect/example/fn.js +5 -0
  554. gaia/eval/webapp/node_modules/object-inspect/example/inspect.js +10 -0
  555. gaia/eval/webapp/node_modules/object-inspect/index.js +544 -0
  556. gaia/eval/webapp/node_modules/object-inspect/package-support.json +20 -0
  557. gaia/eval/webapp/node_modules/object-inspect/package.json +105 -0
  558. gaia/eval/webapp/node_modules/object-inspect/readme.markdown +84 -0
  559. gaia/eval/webapp/node_modules/object-inspect/test/bigint.js +58 -0
  560. gaia/eval/webapp/node_modules/object-inspect/test/browser/dom.js +15 -0
  561. gaia/eval/webapp/node_modules/object-inspect/test/circular.js +16 -0
  562. gaia/eval/webapp/node_modules/object-inspect/test/deep.js +12 -0
  563. gaia/eval/webapp/node_modules/object-inspect/test/element.js +53 -0
  564. gaia/eval/webapp/node_modules/object-inspect/test/err.js +48 -0
  565. gaia/eval/webapp/node_modules/object-inspect/test/fakes.js +29 -0
  566. gaia/eval/webapp/node_modules/object-inspect/test/fn.js +76 -0
  567. gaia/eval/webapp/node_modules/object-inspect/test/global.js +17 -0
  568. gaia/eval/webapp/node_modules/object-inspect/test/has.js +15 -0
  569. gaia/eval/webapp/node_modules/object-inspect/test/holes.js +15 -0
  570. gaia/eval/webapp/node_modules/object-inspect/test/indent-option.js +271 -0
  571. gaia/eval/webapp/node_modules/object-inspect/test/inspect.js +139 -0
  572. gaia/eval/webapp/node_modules/object-inspect/test/lowbyte.js +12 -0
  573. gaia/eval/webapp/node_modules/object-inspect/test/number.js +58 -0
  574. gaia/eval/webapp/node_modules/object-inspect/test/quoteStyle.js +26 -0
  575. gaia/eval/webapp/node_modules/object-inspect/test/toStringTag.js +40 -0
  576. gaia/eval/webapp/node_modules/object-inspect/test/undef.js +12 -0
  577. gaia/eval/webapp/node_modules/object-inspect/test/values.js +261 -0
  578. gaia/eval/webapp/node_modules/object-inspect/test-core-js.js +26 -0
  579. gaia/eval/webapp/node_modules/object-inspect/util.inspect.js +1 -0
  580. gaia/eval/webapp/node_modules/on-finished/HISTORY.md +98 -0
  581. gaia/eval/webapp/node_modules/on-finished/LICENSE +23 -0
  582. gaia/eval/webapp/node_modules/on-finished/README.md +162 -0
  583. gaia/eval/webapp/node_modules/on-finished/index.js +234 -0
  584. gaia/eval/webapp/node_modules/on-finished/package.json +39 -0
  585. gaia/eval/webapp/node_modules/parseurl/HISTORY.md +58 -0
  586. gaia/eval/webapp/node_modules/parseurl/LICENSE +24 -0
  587. gaia/eval/webapp/node_modules/parseurl/README.md +133 -0
  588. gaia/eval/webapp/node_modules/parseurl/index.js +158 -0
  589. gaia/eval/webapp/node_modules/parseurl/package.json +40 -0
  590. gaia/eval/webapp/node_modules/path/.npmignore +1 -0
  591. gaia/eval/webapp/node_modules/path/LICENSE +18 -0
  592. gaia/eval/webapp/node_modules/path/README.md +15 -0
  593. gaia/eval/webapp/node_modules/path/package.json +24 -0
  594. gaia/eval/webapp/node_modules/path/path.js +628 -0
  595. gaia/eval/webapp/node_modules/path-to-regexp/LICENSE +21 -0
  596. gaia/eval/webapp/node_modules/path-to-regexp/Readme.md +35 -0
  597. gaia/eval/webapp/node_modules/path-to-regexp/index.js +156 -0
  598. gaia/eval/webapp/node_modules/path-to-regexp/package.json +30 -0
  599. gaia/eval/webapp/node_modules/process/.eslintrc +21 -0
  600. gaia/eval/webapp/node_modules/process/LICENSE +22 -0
  601. gaia/eval/webapp/node_modules/process/README.md +26 -0
  602. gaia/eval/webapp/node_modules/process/browser.js +184 -0
  603. gaia/eval/webapp/node_modules/process/index.js +2 -0
  604. gaia/eval/webapp/node_modules/process/package.json +27 -0
  605. gaia/eval/webapp/node_modules/process/test.js +199 -0
  606. gaia/eval/webapp/node_modules/proxy-addr/HISTORY.md +161 -0
  607. gaia/eval/webapp/node_modules/proxy-addr/LICENSE +22 -0
  608. gaia/eval/webapp/node_modules/proxy-addr/README.md +139 -0
  609. gaia/eval/webapp/node_modules/proxy-addr/index.js +327 -0
  610. gaia/eval/webapp/node_modules/proxy-addr/package.json +47 -0
  611. gaia/eval/webapp/node_modules/qs/.editorconfig +46 -0
  612. gaia/eval/webapp/node_modules/qs/.eslintrc +38 -0
  613. gaia/eval/webapp/node_modules/qs/.github/FUNDING.yml +12 -0
  614. gaia/eval/webapp/node_modules/qs/.nycrc +13 -0
  615. gaia/eval/webapp/node_modules/qs/CHANGELOG.md +600 -0
  616. gaia/eval/webapp/node_modules/qs/LICENSE.md +29 -0
  617. gaia/eval/webapp/node_modules/qs/README.md +709 -0
  618. gaia/eval/webapp/node_modules/qs/dist/qs.js +90 -0
  619. gaia/eval/webapp/node_modules/qs/lib/formats.js +23 -0
  620. gaia/eval/webapp/node_modules/qs/lib/index.js +11 -0
  621. gaia/eval/webapp/node_modules/qs/lib/parse.js +296 -0
  622. gaia/eval/webapp/node_modules/qs/lib/stringify.js +351 -0
  623. gaia/eval/webapp/node_modules/qs/lib/utils.js +265 -0
  624. gaia/eval/webapp/node_modules/qs/package.json +91 -0
  625. gaia/eval/webapp/node_modules/qs/test/empty-keys-cases.js +267 -0
  626. gaia/eval/webapp/node_modules/qs/test/parse.js +1170 -0
  627. gaia/eval/webapp/node_modules/qs/test/stringify.js +1298 -0
  628. gaia/eval/webapp/node_modules/qs/test/utils.js +136 -0
  629. gaia/eval/webapp/node_modules/range-parser/HISTORY.md +56 -0
  630. gaia/eval/webapp/node_modules/range-parser/LICENSE +23 -0
  631. gaia/eval/webapp/node_modules/range-parser/README.md +84 -0
  632. gaia/eval/webapp/node_modules/range-parser/index.js +162 -0
  633. gaia/eval/webapp/node_modules/range-parser/package.json +44 -0
  634. gaia/eval/webapp/node_modules/raw-body/HISTORY.md +308 -0
  635. gaia/eval/webapp/node_modules/raw-body/LICENSE +22 -0
  636. gaia/eval/webapp/node_modules/raw-body/README.md +223 -0
  637. gaia/eval/webapp/node_modules/raw-body/SECURITY.md +24 -0
  638. gaia/eval/webapp/node_modules/raw-body/index.d.ts +87 -0
  639. gaia/eval/webapp/node_modules/raw-body/index.js +336 -0
  640. gaia/eval/webapp/node_modules/raw-body/package.json +49 -0
  641. gaia/eval/webapp/node_modules/safe-buffer/LICENSE +21 -0
  642. gaia/eval/webapp/node_modules/safe-buffer/README.md +584 -0
  643. gaia/eval/webapp/node_modules/safe-buffer/index.d.ts +187 -0
  644. gaia/eval/webapp/node_modules/safe-buffer/index.js +65 -0
  645. gaia/eval/webapp/node_modules/safe-buffer/package.json +51 -0
  646. gaia/eval/webapp/node_modules/safer-buffer/LICENSE +21 -0
  647. gaia/eval/webapp/node_modules/safer-buffer/Porting-Buffer.md +268 -0
  648. gaia/eval/webapp/node_modules/safer-buffer/Readme.md +156 -0
  649. gaia/eval/webapp/node_modules/safer-buffer/dangerous.js +58 -0
  650. gaia/eval/webapp/node_modules/safer-buffer/package.json +34 -0
  651. gaia/eval/webapp/node_modules/safer-buffer/safer.js +77 -0
  652. gaia/eval/webapp/node_modules/safer-buffer/tests.js +406 -0
  653. gaia/eval/webapp/node_modules/send/HISTORY.md +526 -0
  654. gaia/eval/webapp/node_modules/send/LICENSE +23 -0
  655. gaia/eval/webapp/node_modules/send/README.md +327 -0
  656. gaia/eval/webapp/node_modules/send/SECURITY.md +24 -0
  657. gaia/eval/webapp/node_modules/send/index.js +1142 -0
  658. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/HISTORY.md +14 -0
  659. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/LICENSE +22 -0
  660. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/README.md +128 -0
  661. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/index.js +60 -0
  662. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/package.json +40 -0
  663. gaia/eval/webapp/node_modules/send/node_modules/ms/index.js +162 -0
  664. gaia/eval/webapp/node_modules/send/node_modules/ms/license.md +21 -0
  665. gaia/eval/webapp/node_modules/send/node_modules/ms/package.json +38 -0
  666. gaia/eval/webapp/node_modules/send/node_modules/ms/readme.md +59 -0
  667. gaia/eval/webapp/node_modules/send/package.json +62 -0
  668. gaia/eval/webapp/node_modules/serve-static/HISTORY.md +487 -0
  669. gaia/eval/webapp/node_modules/serve-static/LICENSE +25 -0
  670. gaia/eval/webapp/node_modules/serve-static/README.md +257 -0
  671. gaia/eval/webapp/node_modules/serve-static/index.js +209 -0
  672. gaia/eval/webapp/node_modules/serve-static/package.json +42 -0
  673. gaia/eval/webapp/node_modules/setprototypeof/LICENSE +13 -0
  674. gaia/eval/webapp/node_modules/setprototypeof/README.md +31 -0
  675. gaia/eval/webapp/node_modules/setprototypeof/index.d.ts +2 -0
  676. gaia/eval/webapp/node_modules/setprototypeof/index.js +17 -0
  677. gaia/eval/webapp/node_modules/setprototypeof/package.json +38 -0
  678. gaia/eval/webapp/node_modules/setprototypeof/test/index.js +24 -0
  679. gaia/eval/webapp/node_modules/side-channel/.editorconfig +9 -0
  680. gaia/eval/webapp/node_modules/side-channel/.eslintrc +12 -0
  681. gaia/eval/webapp/node_modules/side-channel/.github/FUNDING.yml +12 -0
  682. gaia/eval/webapp/node_modules/side-channel/.nycrc +13 -0
  683. gaia/eval/webapp/node_modules/side-channel/CHANGELOG.md +110 -0
  684. gaia/eval/webapp/node_modules/side-channel/LICENSE +21 -0
  685. gaia/eval/webapp/node_modules/side-channel/README.md +61 -0
  686. gaia/eval/webapp/node_modules/side-channel/index.d.ts +14 -0
  687. gaia/eval/webapp/node_modules/side-channel/index.js +43 -0
  688. gaia/eval/webapp/node_modules/side-channel/package.json +85 -0
  689. gaia/eval/webapp/node_modules/side-channel/test/index.js +104 -0
  690. gaia/eval/webapp/node_modules/side-channel/tsconfig.json +9 -0
  691. gaia/eval/webapp/node_modules/side-channel-list/.editorconfig +9 -0
  692. gaia/eval/webapp/node_modules/side-channel-list/.eslintrc +11 -0
  693. gaia/eval/webapp/node_modules/side-channel-list/.github/FUNDING.yml +12 -0
  694. gaia/eval/webapp/node_modules/side-channel-list/.nycrc +13 -0
  695. gaia/eval/webapp/node_modules/side-channel-list/CHANGELOG.md +15 -0
  696. gaia/eval/webapp/node_modules/side-channel-list/LICENSE +21 -0
  697. gaia/eval/webapp/node_modules/side-channel-list/README.md +62 -0
  698. gaia/eval/webapp/node_modules/side-channel-list/index.d.ts +13 -0
  699. gaia/eval/webapp/node_modules/side-channel-list/index.js +113 -0
  700. gaia/eval/webapp/node_modules/side-channel-list/list.d.ts +14 -0
  701. gaia/eval/webapp/node_modules/side-channel-list/package.json +77 -0
  702. gaia/eval/webapp/node_modules/side-channel-list/test/index.js +104 -0
  703. gaia/eval/webapp/node_modules/side-channel-list/tsconfig.json +9 -0
  704. gaia/eval/webapp/node_modules/side-channel-map/.editorconfig +9 -0
  705. gaia/eval/webapp/node_modules/side-channel-map/.eslintrc +11 -0
  706. gaia/eval/webapp/node_modules/side-channel-map/.github/FUNDING.yml +12 -0
  707. gaia/eval/webapp/node_modules/side-channel-map/.nycrc +13 -0
  708. gaia/eval/webapp/node_modules/side-channel-map/CHANGELOG.md +22 -0
  709. gaia/eval/webapp/node_modules/side-channel-map/LICENSE +21 -0
  710. gaia/eval/webapp/node_modules/side-channel-map/README.md +62 -0
  711. gaia/eval/webapp/node_modules/side-channel-map/index.d.ts +15 -0
  712. gaia/eval/webapp/node_modules/side-channel-map/index.js +68 -0
  713. gaia/eval/webapp/node_modules/side-channel-map/package.json +80 -0
  714. gaia/eval/webapp/node_modules/side-channel-map/test/index.js +114 -0
  715. gaia/eval/webapp/node_modules/side-channel-map/tsconfig.json +9 -0
  716. gaia/eval/webapp/node_modules/side-channel-weakmap/.editorconfig +9 -0
  717. gaia/eval/webapp/node_modules/side-channel-weakmap/.eslintrc +12 -0
  718. gaia/eval/webapp/node_modules/side-channel-weakmap/.github/FUNDING.yml +12 -0
  719. gaia/eval/webapp/node_modules/side-channel-weakmap/.nycrc +13 -0
  720. gaia/eval/webapp/node_modules/side-channel-weakmap/CHANGELOG.md +28 -0
  721. gaia/eval/webapp/node_modules/side-channel-weakmap/LICENSE +21 -0
  722. gaia/eval/webapp/node_modules/side-channel-weakmap/README.md +62 -0
  723. gaia/eval/webapp/node_modules/side-channel-weakmap/index.d.ts +15 -0
  724. gaia/eval/webapp/node_modules/side-channel-weakmap/index.js +84 -0
  725. gaia/eval/webapp/node_modules/side-channel-weakmap/package.json +87 -0
  726. gaia/eval/webapp/node_modules/side-channel-weakmap/test/index.js +114 -0
  727. gaia/eval/webapp/node_modules/side-channel-weakmap/tsconfig.json +9 -0
  728. gaia/eval/webapp/node_modules/statuses/HISTORY.md +82 -0
  729. gaia/eval/webapp/node_modules/statuses/LICENSE +23 -0
  730. gaia/eval/webapp/node_modules/statuses/README.md +136 -0
  731. gaia/eval/webapp/node_modules/statuses/codes.json +65 -0
  732. gaia/eval/webapp/node_modules/statuses/index.js +146 -0
  733. gaia/eval/webapp/node_modules/statuses/package.json +49 -0
  734. gaia/eval/webapp/node_modules/toidentifier/HISTORY.md +9 -0
  735. gaia/eval/webapp/node_modules/toidentifier/LICENSE +21 -0
  736. gaia/eval/webapp/node_modules/toidentifier/README.md +61 -0
  737. gaia/eval/webapp/node_modules/toidentifier/index.js +32 -0
  738. gaia/eval/webapp/node_modules/toidentifier/package.json +38 -0
  739. gaia/eval/webapp/node_modules/type-is/HISTORY.md +259 -0
  740. gaia/eval/webapp/node_modules/type-is/LICENSE +23 -0
  741. gaia/eval/webapp/node_modules/type-is/README.md +170 -0
  742. gaia/eval/webapp/node_modules/type-is/index.js +266 -0
  743. gaia/eval/webapp/node_modules/type-is/package.json +45 -0
  744. gaia/eval/webapp/node_modules/unpipe/HISTORY.md +4 -0
  745. gaia/eval/webapp/node_modules/unpipe/LICENSE +22 -0
  746. gaia/eval/webapp/node_modules/unpipe/README.md +43 -0
  747. gaia/eval/webapp/node_modules/unpipe/index.js +69 -0
  748. gaia/eval/webapp/node_modules/unpipe/package.json +27 -0
  749. gaia/eval/webapp/node_modules/util/LICENSE +18 -0
  750. gaia/eval/webapp/node_modules/util/README.md +15 -0
  751. gaia/eval/webapp/node_modules/util/node_modules/inherits/LICENSE +16 -0
  752. gaia/eval/webapp/node_modules/util/node_modules/inherits/README.md +42 -0
  753. gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits.js +7 -0
  754. gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits_browser.js +23 -0
  755. gaia/eval/webapp/node_modules/util/node_modules/inherits/package.json +29 -0
  756. gaia/eval/webapp/node_modules/util/package.json +35 -0
  757. gaia/eval/webapp/node_modules/util/support/isBuffer.js +3 -0
  758. gaia/eval/webapp/node_modules/util/support/isBufferBrowser.js +6 -0
  759. gaia/eval/webapp/node_modules/util/util.js +586 -0
  760. gaia/eval/webapp/node_modules/utils-merge/.npmignore +9 -0
  761. gaia/eval/webapp/node_modules/utils-merge/LICENSE +20 -0
  762. gaia/eval/webapp/node_modules/utils-merge/README.md +34 -0
  763. gaia/eval/webapp/node_modules/utils-merge/index.js +23 -0
  764. gaia/eval/webapp/node_modules/utils-merge/package.json +40 -0
  765. gaia/eval/webapp/node_modules/vary/HISTORY.md +39 -0
  766. gaia/eval/webapp/node_modules/vary/LICENSE +22 -0
  767. gaia/eval/webapp/node_modules/vary/README.md +101 -0
  768. gaia/eval/webapp/node_modules/vary/index.js +149 -0
  769. gaia/eval/webapp/node_modules/vary/package.json +43 -0
  770. gaia/eval/webapp/package-lock.json +875 -0
  771. gaia/eval/webapp/package.json +21 -0
  772. gaia/eval/webapp/public/app.js +3403 -0
  773. gaia/eval/webapp/public/index.html +88 -0
  774. gaia/eval/webapp/public/styles.css +3661 -0
  775. gaia/eval/webapp/server.js +416 -0
  776. gaia/eval/webapp/test-setup.js +73 -0
  777. gaia/llm/__init__.py +2 -0
  778. gaia/llm/lemonade_client.py +3083 -0
  779. gaia/llm/lemonade_manager.py +269 -0
  780. gaia/llm/llm_client.py +729 -0
  781. gaia/llm/vlm_client.py +307 -0
  782. gaia/logger.py +189 -0
  783. gaia/mcp/agent_mcp_server.py +245 -0
  784. gaia/mcp/blender_mcp_client.py +138 -0
  785. gaia/mcp/blender_mcp_server.py +648 -0
  786. gaia/mcp/context7_cache.py +332 -0
  787. gaia/mcp/external_services.py +518 -0
  788. gaia/mcp/mcp_bridge.py +550 -0
  789. gaia/mcp/servers/__init__.py +6 -0
  790. gaia/mcp/servers/docker_mcp.py +83 -0
  791. gaia/rag/__init__.py +10 -0
  792. gaia/rag/app.py +293 -0
  793. gaia/rag/demo.py +304 -0
  794. gaia/rag/pdf_utils.py +235 -0
  795. gaia/rag/sdk.py +2194 -0
  796. gaia/security.py +163 -0
  797. gaia/talk/app.py +289 -0
  798. gaia/talk/sdk.py +538 -0
  799. gaia/util.py +46 -0
  800. gaia/version.py +100 -0
gaia/rag/sdk.py ADDED
@@ -0,0 +1,2194 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ """
6
+ GAIA RAG SDK - Simple PDF document retrieval and Q&A
7
+ """
8
+
9
+ import hashlib
10
+ import os
11
+ import pickle
12
+ import re
13
+ import time
14
+ from dataclasses import dataclass
15
+ from pathlib import Path
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ import numpy as np
19
+
20
+ try:
21
+ from pypdf import PdfReader
22
+ except ImportError:
23
+ try:
24
+ from PyPDF2 import PdfReader
25
+ except ImportError:
26
+ PdfReader = None
27
+
28
+ try:
29
+ from sentence_transformers import SentenceTransformer
30
+ except ImportError:
31
+ SentenceTransformer = None
32
+
33
+ try:
34
+ import faiss
35
+ except ImportError:
36
+ faiss = None
37
+
38
+ from gaia.chat.sdk import ChatConfig, ChatSDK
39
+ from gaia.logger import get_logger
40
+ from gaia.security import PathValidator
41
+
42
+
43
@dataclass
class RAGConfig:
    """Configuration for RAG SDK.

    Groups the settings consumed by :class:`RAGSDK`: answer-generation LLM,
    chunking, embedding backend, caching, memory management, file-size
    limits, VLM image handling, and filesystem-path security.
    """

    # LLM used to generate answers (served via the local Lemonade server).
    model: str = "Qwen3-Coder-30B-A3B-Instruct-GGUF"
    # Maximum tokens the LLM may generate per answer.
    max_tokens: int = 1024
    # Target chunk size when splitting documents (units not shown here;
    # presumably characters or tokens -- confirm against the chunker).
    chunk_size: int = 500
    chunk_overlap: int = 100  # 20% overlap for better context preservation
    max_chunks: int = 5  # Number of chunks retrieved per query
    embedding_model: str = (
        "nomic-embed-text-v2-moe-GGUF"  # Lemonade GGUF embedding model
    )
    # Directory where per-document caches are written.
    cache_dir: str = ".gaia"
    # When True, log timing/progress statistics during indexing and queries.
    show_stats: bool = False
    use_local_llm: bool = True
    base_url: str = "http://localhost:8000/api/v1"  # Lemonade server API URL
    # Memory management settings
    max_indexed_files: int = 100  # Maximum number of files to keep indexed
    max_total_chunks: int = 10000  # Maximum total chunks across all files
    enable_lru_eviction: bool = (
        True  # Enable automatic eviction of least recently used documents
    )
    # File size limits (prevent OOM)
    max_file_size_mb: int = 100  # Maximum file size in MB (default: 100MB)
    warn_file_size_mb: int = 50  # Warn if file exceeds this size (default: 50MB)
    # LLM-based chunking
    use_llm_chunking: bool = (
        False  # Enable LLM-based intelligent chunking (requires LLM client)
    )
    # VLM settings (enabled if available, errors out if model can't be loaded)
    vlm_model: str = "Qwen2.5-VL-7B-Instruct-GGUF"
    # Security settings: directories that file access is restricted to
    # (None means the PathValidator default applies -- see gaia.security).
    allowed_paths: Optional[List[str]] = None
76
+
77
+
78
@dataclass
class RAGResponse:
    """Response from RAG operations with enhanced metadata.

    Only ``text`` is always populated; the remaining fields are optional
    and filled in depending on the operation and configuration.
    """

    # Generated answer text.
    text: str
    # Retrieved context chunks that backed the answer.
    chunks: Optional[List[str]] = None
    # One relevance score per entry in ``chunks``.
    chunk_scores: Optional[List[float]] = None
    # Timing/usage statistics (shape not defined here -- set by the caller).
    stats: Optional[Dict[str, Any]] = None
    # Enhanced metadata
    source_files: Optional[List[str]] = None  # List of source files for each chunk
    chunk_metadata: Optional[List[Dict[str, Any]]] = None  # Detailed metadata per chunk
    query_metadata: Optional[Dict[str, Any]] = None  # Query-level metadata
90
+
91
+
92
+ class RAGSDK:
93
+ """
94
+ Simple RAG SDK for PDF document Q&A following GAIA patterns.
95
+
96
+ Example usage:
97
+ ```python
98
+ from gaia.rag.sdk import RAGSDK, RAGConfig
99
+
100
+ # Initialize
101
+ config = RAGConfig(show_stats=True)
102
+ rag = RAGSDK(config)
103
+
104
+ # Index document
105
+ rag.index_document("document.pdf")
106
+
107
+ # Query
108
+ response = rag.query("What are the key features?")
109
+ print(response.text)
110
+ ```
111
+ """
112
+
113
+ def __init__(self, config: Optional[RAGConfig] = None):
114
+ """Initialize RAG SDK."""
115
+ self.config = config or RAGConfig()
116
+ self.log = get_logger(__name__)
117
+
118
+ # Check dependencies
119
+ self._check_dependencies()
120
+
121
+ # Initialize components
122
+ self.embedder = None
123
+ self.llm_client = None
124
+ self.use_lemonade_embeddings = False
125
+ self.index = None
126
+ self.chunks = []
127
+ self.indexed_files = set()
128
+
129
+ # Per-file indexing: maps file paths to their chunk indices
130
+ # This enables efficient per-file searches
131
+ self.file_to_chunk_indices = {} # {file_path: [chunk_idx1, chunk_idx2, ...]}
132
+ self.chunk_to_file = {} # {chunk_idx: file_path} for reverse lookup
133
+
134
+ # Per-file FAISS indices and embeddings (CACHED for performance)
135
+ self.file_indices = {} # {file_path: faiss.Index}
136
+ self.file_embeddings = {} # {file_path: numpy.array}
137
+
138
+ # Per-file metadata (for /dump command and stats)
139
+ self.file_metadata = (
140
+ {}
141
+ ) # {file_path: {'full_text': str, 'num_pages': int, 'vlm_pages': int, ...}}
142
+
143
+ # LRU tracking for memory management
144
+ self.file_access_times = {} # {file_path: last_access_time}
145
+ self.file_index_times = {} # {file_path: index_time}
146
+
147
+ # Create cache directory
148
+ os.makedirs(self.config.cache_dir, exist_ok=True)
149
+
150
+ # Initialize chat SDK for LLM responses
151
+ chat_config = ChatConfig(
152
+ model=self.config.model,
153
+ max_tokens=self.config.max_tokens,
154
+ show_stats=self.config.show_stats,
155
+ use_local_llm=self.config.use_local_llm,
156
+ )
157
+ self.chat = ChatSDK(chat_config)
158
+
159
+ # Initialize path validator
160
+ self.path_validator = PathValidator(self.config.allowed_paths)
161
+
162
+ self.log.debug("RAG SDK initialized")
163
+
164
+ def _check_dependencies(self):
165
+ """Check if required dependencies are available."""
166
+ missing = []
167
+ if PdfReader is None:
168
+ missing.append("pypdf (or PyPDF2)")
169
+ if SentenceTransformer is None:
170
+ missing.append("sentence-transformers")
171
+ if faiss is None:
172
+ missing.append("faiss-cpu")
173
+
174
+ if missing:
175
+ error_msg = (
176
+ f"\n❌ Error: Missing required RAG dependencies: {', '.join(missing)}\n\n"
177
+ f"Please install the RAG dependencies:\n"
178
+ f" pip install -e .[rag]\n\n"
179
+ f"Or install packages directly:\n"
180
+ f" pip install {' '.join(missing)}\n"
181
+ )
182
+ raise ImportError(error_msg)
183
+
184
+ def _safe_open(self, file_path: str, mode="rb"):
185
+ """
186
+ Safely open file with path validation and O_NOFOLLOW to prevent symlink attacks.
187
+
188
+ Args:
189
+ file_path: Path to file
190
+ mode: Open mode ('rb', 'r', 'w', 'wb', etc.)
191
+
192
+ Returns:
193
+ File handle
194
+
195
+ Raises:
196
+ PermissionError: If file is outside allowed paths or is a symlink
197
+ IOError: If file cannot be opened
198
+ """
199
+ # Security check: Validate path against allowed directories
200
+ if not self.path_validator.is_path_allowed(file_path):
201
+ raise PermissionError(f"Access denied: {file_path} is not in allowed paths")
202
+
203
+ import stat
204
+
205
+ # Determine flags based on mode
206
+ if "r" in mode and "+" not in mode:
207
+ flags = os.O_RDONLY
208
+ elif "w" in mode:
209
+ flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
210
+ elif "a" in mode:
211
+ flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
212
+ else:
213
+ flags = os.O_RDONLY
214
+
215
+ # CRITICAL: Add O_NOFOLLOW to reject symlinks
216
+ # This prevents TOCTOU attacks where symlinks are swapped
217
+ if hasattr(os, "O_NOFOLLOW"):
218
+ flags |= os.O_NOFOLLOW
219
+
220
+ try:
221
+ # Open file descriptor with O_NOFOLLOW
222
+ fd = os.open(str(file_path), flags)
223
+ except OSError as e:
224
+ if e.errno == 40: # ELOOP - too many symbolic links
225
+ raise PermissionError(f"Symlinks not allowed: {file_path}")
226
+ raise IOError(f"Cannot open file {file_path}: {e}")
227
+
228
+ # Verify it's a regular file (not directory or special file)
229
+ try:
230
+ file_stat = os.fstat(fd)
231
+ if not stat.S_ISREG(file_stat.st_mode):
232
+ os.close(fd)
233
+ raise PermissionError(f"Not a regular file: {file_path}")
234
+
235
+ # Convert to file object with appropriate mode
236
+ mode_str = "rb" if "b" in mode else "r"
237
+ if "w" in mode:
238
+ mode_str = "wb" if "b" in mode else "w"
239
+ elif "a" in mode:
240
+ mode_str = "ab" if "b" in mode else "a"
241
+
242
+ return os.fdopen(fd, mode_str)
243
+
244
+ except Exception as _e:
245
+ os.close(fd)
246
+ raise
247
+
248
+ def _get_cache_path(self, file_path: str) -> str:
249
+ """
250
+ Get cache file path for a document using content-based hashing.
251
+
252
+ Uses SHA-256 hash of actual file content for cache key.
253
+ This ensures proper cache invalidation even for:
254
+ - Same-size file edits
255
+ - Files modified within same second (low mtime resolution)
256
+ - Content changes that preserve size
257
+
258
+ Args:
259
+ file_path: Path to the document
260
+
261
+ Returns:
262
+ Path to cache file
263
+ """
264
+ path = Path(file_path).absolute()
265
+
266
+ try:
267
+ # Hash the actual file CONTENT for reliable cache invalidation
268
+ # This is more reliable than mtime + size
269
+ hasher = hashlib.sha256()
270
+
271
+ # Read file in chunks to handle large files efficiently
272
+ # Use _safe_open to prevent symlink attacks
273
+ with self._safe_open(path, "rb") as f:
274
+ while chunk := f.read(8192):
275
+ hasher.update(chunk)
276
+
277
+ content_hash = hasher.hexdigest()
278
+
279
+ # Include path in hash to avoid collisions between identical files
280
+ path_hash = hashlib.sha256(str(path).encode()).hexdigest()[:16]
281
+ cache_key = f"{path_hash}_{content_hash[:32]}"
282
+
283
+ return os.path.join(self.config.cache_dir, f"{cache_key}.pkl")
284
+
285
+ except (OSError, IOError) as e:
286
+ # If file doesn't exist or can't be read, use path-based key
287
+ # This will fail later during indexing anyway
288
+ self.log.warning(f"Cannot read file for cache key: {e}")
289
+ file_hash = hashlib.sha256(str(path).encode()).hexdigest()
290
+ return os.path.join(self.config.cache_dir, f"{file_hash}_notfound.pkl")
291
+
292
+ def _load_embedder(self):
293
+ """Load embedding model via Lemonade server for hardware acceleration.
294
+
295
+ Forces a fresh load with --ubatch-size 2048 to prevent llama.cpp issues
296
+ after VLM processing. Must unload first since Lemonade skips reload
297
+ if model already loaded.
298
+ """
299
+ if self.embedder is None:
300
+ self.log.info(
301
+ f"Loading embedding model via Lemonade: {self.config.embedding_model}"
302
+ )
303
+
304
+ from gaia.llm.lemonade_client import LemonadeClient
305
+
306
+ if not hasattr(self, "llm_client") or self.llm_client is None:
307
+ self.llm_client = LemonadeClient()
308
+
309
+ # Force fresh load - must unload first
310
+ try:
311
+ self.llm_client.unload_model()
312
+ except Exception:
313
+ pass # Ignore if nothing to unload
314
+
315
+ try:
316
+ self.llm_client.load_model(
317
+ self.config.embedding_model,
318
+ llamacpp_args="--ubatch-size 2048",
319
+ )
320
+ self.log.info("Loaded embedding model with ubatch-size=2048")
321
+ except Exception as e:
322
+ self.log.warning(f"Could not pre-load embedding model: {e}")
323
+
324
+ self.embedder = self.llm_client
325
+ self.use_lemonade_embeddings = True
326
+
327
+ self.log.info("Using Lemonade server for hardware-accelerated embeddings")
328
+
329
+ def _encode_texts(
330
+ self, texts: List[str], show_progress: bool = False
331
+ ) -> "np.ndarray":
332
+ """
333
+ Encode texts to embeddings using Lemonade server with batching and timing.
334
+
335
+ Args:
336
+ texts: List of text strings to encode
337
+ show_progress: Whether to show progress
338
+
339
+ Returns:
340
+ numpy array of embeddings with shape (num_texts, embedding_dim)
341
+ """
342
+
343
+ # Batch embedding requests to avoid timeouts
344
+ BATCH_SIZE = 25 # Smaller batches for reliability (25 chunks ~= 12KB text)
345
+ all_embeddings = []
346
+
347
+ total_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE
348
+ total_start = time.time()
349
+
350
+ for batch_idx in range(0, len(texts), BATCH_SIZE):
351
+ batch_texts = texts[batch_idx : batch_idx + BATCH_SIZE]
352
+ batch_num = (batch_idx // BATCH_SIZE) + 1
353
+
354
+ batch_start = time.time()
355
+
356
+ if show_progress or self.config.show_stats:
357
+ self.log.info(
358
+ f" 📦 Embedding batch {batch_num}/{total_batches} ({len(batch_texts)} chunks)..."
359
+ )
360
+
361
+ # Call Lemonade embeddings API for this batch with retry
362
+ max_retries = 2
363
+ for attempt in range(max_retries + 1):
364
+ try:
365
+ # Use longer timeout for embedding batches (180s = 3 minutes per batch)
366
+ response = self.embedder.embeddings(
367
+ batch_texts, model=self.config.embedding_model, timeout=180
368
+ )
369
+ break # Success, exit retry loop
370
+ except Exception as e:
371
+ if attempt < max_retries:
372
+ self.log.warning(
373
+ f" ⚠️ Batch {batch_num} attempt {attempt + 1} failed, retrying: {e}"
374
+ )
375
+ time.sleep(2) # Wait before retry
376
+ else:
377
+ self.log.error(
378
+ f" ❌ Batch {batch_num} failed after {max_retries + 1} attempts"
379
+ )
380
+ raise
381
+
382
+ batch_duration = time.time() - batch_start
383
+
384
+ if show_progress or self.config.show_stats:
385
+ chunks_per_sec = (
386
+ len(batch_texts) / batch_duration if batch_duration > 0 else 0
387
+ )
388
+ self.log.info(
389
+ f" ✅ Batch {batch_num} complete in {batch_duration:.2f}s ({chunks_per_sec:.1f} chunks/sec)"
390
+ )
391
+
392
+ # Extract embeddings from response
393
+ # Expected format: {"data": [{"embedding": [...]}, ...]}
394
+ for item in response.get("data", []):
395
+ embedding = item.get("embedding", [])
396
+ all_embeddings.append(embedding)
397
+
398
+ total_duration = time.time() - total_start
399
+ if len(texts) > BATCH_SIZE:
400
+ overall_rate = len(texts) / total_duration if total_duration > 0 else 0
401
+ self.log.info(
402
+ f" 🎯 Total embedding time: {total_duration:.2f}s ({overall_rate:.1f} chunks/sec, {total_batches} batches)"
403
+ )
404
+
405
+ # Convert to numpy array
406
+ return np.array(all_embeddings, dtype=np.float32)
407
+
408
+ def _get_file_type(self, file_path: str) -> str:
409
+ """Detect file type from extension."""
410
+ ext = Path(file_path).suffix.lower()
411
+ return ext if ext else ".unknown"
412
+
413
    def _extract_text_from_pdf(self, pdf_path: str) -> tuple:
        """
        Extract text from PDF file with VLM for images (always enabled if available).

        Per page: pypdf extracts the plain text, then (if the VLM client is
        reachable) any embedded images are run through the VLM and the results
        are merged via _merge_page_texts. VLM failures are non-fatal — the PDF
        is still indexed text-only.

        Returns:
            (text, num_pages, metadata) tuple where metadata contains:
            - num_pages: int
            - vlm_pages: int (number of pages enhanced with VLM)
            - total_images: int (total images processed)
        """
        import time as time_module  # pylint: disable=reimported

        try:
            extract_start = time_module.time()
            reader = PdfReader(pdf_path)
            total_pages = len(reader.pages)
            self.log.info(f"📄 Extracting text from {total_pages} pages...")

            # Initialize VLM client (auto-enabled if available)
            vlm = None
            vlm_available = False
            try:
                from gaia.llm.vlm_client import VLMClient
                from gaia.rag.pdf_utils import (
                    count_images_in_page,
                    extract_images_from_page_pymupdf,
                )

                vlm = VLMClient(
                    vlm_model=self.config.vlm_model, base_url=self.config.base_url
                )
                # NOTE(review): presumably probes the Lemonade server for the
                # VLM model — confirm against VLMClient.check_availability.
                vlm_available = vlm.check_availability()

                if vlm_available and self.config.show_stats:
                    print(" 🔍 VLM enabled: Will extract text from images")
                elif not vlm_available and self.config.show_stats:
                    print(" ⚠️ VLM not available - images will not be processed")
                    print(" 📥 To enable VLM image extraction:")
                    print(" 1. Open Lemonade Model Manager (http://localhost:8000)")
                    print(f" 2. Download model: {self.config.vlm_model}")

            except Exception as vlm_error:
                # VLM is strictly optional: any init failure degrades to
                # text-only extraction instead of aborting the whole PDF.
                if self.config.show_stats:
                    print(f" ⚠️ VLM initialization failed: {vlm_error}")
                self.log.warning(f"VLM initialization failed: {vlm_error}")
                vlm_available = False

            if self.config.show_stats:
                print(f"\n{'='*60}")
                print(" 📄 COMPUTE INTENSIVE: PDF Text Extraction")
                print(f" 📊 Total pages: {total_pages}")
                # 0.2 s/page is a rough heuristic for the ETA display only.
                print(f" ⏱️ Estimated time: {total_pages * 0.2:.1f} seconds")
                if vlm_available:
                    print(" 🖼️ VLM: Enabled for image text extraction")
                else:
                    print(" 🖼️ VLM: Disabled (text-only extraction)")
                print(f"{'='*60}")

            pages_data = []
            vlm_pages_count = 0
            total_images_processed = 0

            # Pages are numbered from 1 (matches the "[Page N]" markers used
            # downstream by _merge_page_texts and _split_text_into_chunks).
            for i, page in enumerate(reader.pages, 1):
                page_start = time_module.time()

                # Step 1: Extract text with pypdf
                pypdf_text = page.extract_text()

                # Step 2: Check for images
                has_imgs = False
                num_imgs = 0
                if vlm_available:
                    try:
                        has_imgs, num_imgs = count_images_in_page(page)
                    except Exception:  # pylint: disable=broad-except
                        # Best-effort: a malformed page simply counts as image-free.
                        pass

                # Step 3: Extract from images if present
                image_texts = []
                if has_imgs and vlm_available:
                    try:
                        images = extract_images_from_page_pymupdf(pdf_path, page_num=i)
                        if images:
                            image_texts = vlm.extract_from_page_images(
                                images, page_num=i
                            )
                            if image_texts:
                                vlm_pages_count += 1
                                total_images_processed += len(image_texts)
                    except Exception as img_error:
                        # Image extraction failure degrades this page to text-only.
                        self.log.warning(
                            f"Image extraction failed on page {i}: {img_error}"
                        )

                # Step 4: Merge
                merged_text = self._merge_page_texts(
                    pypdf_text, image_texts, page_num=i
                )

                pages_data.append(
                    {
                        "page": i,
                        "text": merged_text,
                        "has_images": has_imgs,
                        "num_images": num_imgs,
                        "vlm_used": len(image_texts) > 0,
                    }
                )

                page_duration = time_module.time() - page_start

                if self.config.show_stats:
                    # Update progress with timing info
                    progress_pct = (i / total_pages) * 100
                    avg_time_per_page = (time_module.time() - extract_start) / i
                    eta = avg_time_per_page * (total_pages - i)
                    vlm_indicator = " 🖼️" if len(image_texts) > 0 else ""
                    # end="\r" keeps the progress on a single console line;
                    # trailing spaces erase leftovers from longer prior lines.
                    print(
                        f" 📄 Page {i}/{total_pages} ({progress_pct:.0f}%){vlm_indicator} | "
                        f"⏱️ {page_duration:.2f}s | ETA: {eta:.1f}s" + " " * 10,
                        end="\r",
                        flush=True,
                    )

            # Cleanup VLM
            if vlm_available and vlm:
                try:
                    vlm.cleanup()
                except Exception:  # pylint: disable=broad-except
                    pass

            extract_duration = time_module.time() - extract_start

            # Build full text
            full_text = "\n\n".join(
                [f"[Page {p['page']}]\n{p['text']}" for p in pages_data]
            )

            if self.config.show_stats:
                print(
                    f"\n ✅ Extracted {len(full_text):,} characters from {total_pages} pages"
                )
                print(
                    f" ⏱️ Total extraction time: {extract_duration:.2f}s ({total_pages/extract_duration:.1f} pages/sec)"
                )
                print(f" 💾 Text size: {len(full_text) / 1024:.1f} KB")
                if vlm_pages_count > 0:
                    print(
                        f" 🖼️ VLM enhanced: {vlm_pages_count} pages, {total_images_processed} images"
                    )
                print(f"{'='*60}\n")

            self.log.info(
                f"📝 Extracted {len(full_text):,} characters in {extract_duration:.2f}s (VLM: {vlm_pages_count} pages)"
            )

            # Build metadata
            metadata = {
                "num_pages": total_pages,
                "vlm_pages": vlm_pages_count,
                "total_images": total_images_processed,
                "vlm_checked": True,  # Indicates this cache was created with VLM capability check
                "vlm_available": vlm_available,  # Whether VLM was actually available
            }

            return full_text, total_pages, metadata
        except Exception as e:
            self.log.error(f"Error reading PDF {pdf_path}: {e}")
            raise
582
+
583
+ def _merge_page_texts(
584
+ self, pypdf_text: str, image_texts: list, page_num: int
585
+ ) -> str:
586
+ """
587
+ Merge pypdf text + VLM image texts.
588
+
589
+ Args:
590
+ pypdf_text: Text extracted by pypdf
591
+ image_texts: List of dicts from VLM extraction (each has 'image_num' and 'text')
592
+ page_num: Page number for logging
593
+
594
+ Returns:
595
+ Merged text with image content clearly marked
596
+ """
597
+ parts = []
598
+
599
+ # Add pypdf text first (if any)
600
+ if pypdf_text.strip():
601
+ parts.append(pypdf_text.strip())
602
+
603
+ # Add VLM-extracted image content (if any)
604
+ if image_texts:
605
+ parts.append("\n\n---\n")
606
+ parts.append(f"[Page {page_num}]\n**Content Extracted from Images:**\n")
607
+
608
+ for img_data in image_texts:
609
+ parts.append(
610
+ f"\n[Page {page_num}] ### 🖼️ IMAGE {img_data['image_num']}\n\n"
611
+ )
612
+
613
+ # Clean up the VLM text for better structure
614
+ image_text = img_data["text"].strip()
615
+
616
+ # Ensure proper line breaks for list items (general pattern)
617
+ # Look for patterns like "- text" or "* text" or "1. text"
618
+ image_text = re.sub(r"(?<!\n)([•\-\*]|\d+\.)\s+", r"\n\1 ", image_text)
619
+
620
+ # Add double newline after what looks like a heading
621
+ # (line ending with colon or short line followed by longer text)
622
+ lines = image_text.split("\n")
623
+ formatted_lines = []
624
+ for i, line in enumerate(lines):
625
+ formatted_lines.append(line)
626
+ # Add extra newline after lines that look like headers
627
+ if line.strip().endswith(":") and i < len(lines) - 1:
628
+ formatted_lines.append("")
629
+
630
+ image_text = "\n".join(formatted_lines)
631
+
632
+ parts.append(image_text)
633
+ parts.append("\n\n")
634
+
635
+ return "\n".join(parts)
636
+
637
+ def _llm_based_chunking(
638
+ self, text: str, chunk_size: int, overlap: int
639
+ ) -> List[str]:
640
+ """
641
+ Use LLM to intelligently identify chunk boundaries.
642
+
643
+ The LLM analyzes the text structure and suggests optimal split points
644
+ that preserve semantic meaning and context.
645
+ """
646
+ self.log.info("🤖 Using LLM for intelligent text chunking...")
647
+
648
+ chunks = []
649
+
650
+ # Process text in segments (to handle long documents)
651
+ # Approximate: 1 token ≈ 4 characters
652
+ segment_size = chunk_size * 4 * 3 # Process 3 chunks worth at a time
653
+ text_length = len(text)
654
+ position = 0
655
+
656
+ while position < text_length:
657
+ # Get a segment to process
658
+ segment_end = min(position + segment_size, text_length)
659
+ segment = text[position:segment_end]
660
+
661
+ # Ask LLM to identify good chunk boundaries
662
+ prompt = """You are a document chunking expert. Your task is to identify optimal points to split the following text into chunks.
663
+
664
+ The text should be split into chunks of approximately {chunk_size} tokens (roughly {chunk_size * 4} characters each).
665
+
666
+ IMPORTANT RULES:
667
+ 1. Keep semantic units together (complete thoughts, paragraphs, sections)
668
+ 2. Never split in the middle of sentences
669
+ 3. Preserve context - each chunk should be understandable on its own
670
+ 4. Keep related information together (e.g., a heading with its content)
671
+ 5. For lists, try to keep the list introduction with at least some items
672
+
673
+ Text to chunk:
674
+ ---
675
+ {segment[:2000]} # Limit prompt size
676
+ {"..." if len(segment) > 2000 else ""}
677
+ ---
678
+
679
+ Please identify the CHARACTER POSITIONS where the text should be split.
680
+ Return ONLY a JSON array of split positions, like: [245, 502, 847]
681
+ These positions indicate where to split the text."""
682
+
683
+ try:
684
+ # Get LLM response
685
+ response_data = self.llm_client.completions(
686
+ model=self.config.model,
687
+ prompt=prompt,
688
+ temperature=0.0, # Low temperature for deterministic chunking
689
+ max_tokens=500,
690
+ )
691
+ response = response_data["choices"][0]["text"]
692
+
693
+ # Parse the split positions
694
+ import json
695
+
696
+ split_positions = json.loads(response)
697
+
698
+ # Create chunks based on LLM-suggested positions
699
+ last_pos = 0
700
+ for split_pos in split_positions:
701
+ if split_pos > last_pos and split_pos < len(segment):
702
+ chunk = segment[last_pos:split_pos].strip()
703
+ if chunk:
704
+ chunks.append(chunk)
705
+ last_pos = split_pos
706
+
707
+ # Add remaining text
708
+ if last_pos < len(segment):
709
+ chunk = segment[last_pos:].strip()
710
+ if chunk:
711
+ chunks.append(chunk)
712
+
713
+ except Exception as e:
714
+ self.log.warning(f"LLM chunking failed for segment: {e}")
715
+ # Fall back to simple splitting for this segment
716
+ segment_chunks = self._fallback_chunk_segment(segment, chunk_size)
717
+ chunks.extend(segment_chunks)
718
+
719
+ # Move to next segment with overlap
720
+ position = segment_end - (overlap * 4) # Convert overlap tokens to chars
721
+
722
+ return chunks
723
+
724
+ def _fallback_chunk_segment(self, text: str, chunk_size: int) -> List[str]:
725
+ """Simple fallback chunking for a text segment."""
726
+ chunks = []
727
+ words = text.split()
728
+ current_chunk = []
729
+ current_size = 0
730
+
731
+ for word in words:
732
+ word_size = len(word) // 4 # Rough token estimate
733
+ if current_size + word_size > chunk_size and current_chunk:
734
+ chunks.append(" ".join(current_chunk))
735
+ current_chunk = [word]
736
+ current_size = word_size
737
+ else:
738
+ current_chunk.append(word)
739
+ current_size += word_size
740
+
741
+ if current_chunk:
742
+ chunks.append(" ".join(current_chunk))
743
+
744
+ return chunks
745
+
746
+ def _extract_text_from_text_file(self, file_path: str) -> str:
747
+ """Extract text from text-based file (txt, md, etc.)."""
748
+ try:
749
+ encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
750
+ text = None
751
+
752
+ for encoding in encodings:
753
+ try:
754
+ # Use _safe_open with binary mode, then decode
755
+ with self._safe_open(file_path, "rb") as f:
756
+ text = f.read().decode(encoding)
757
+ break
758
+ except (UnicodeDecodeError, AttributeError):
759
+ continue
760
+
761
+ if text is None:
762
+ raise ValueError(
763
+ f"Failed to decode file: {file_path}\n"
764
+ f"Tried encodings: {', '.join(encodings)}\n"
765
+ "Suggestions:\n"
766
+ " 1. Convert the file to UTF-8 encoding\n"
767
+ " 2. Check if the file is corrupted\n"
768
+ " 3. Ensure the file is a text file (not binary)"
769
+ )
770
+
771
+ if self.config.show_stats:
772
+ print(f" ✅ Loaded text file ({len(text):,} characters)")
773
+
774
+ self.log.info(f"📝 Extracted {len(text):,} characters from text file")
775
+ return text.strip()
776
+ except Exception as e:
777
+ self.log.error(f"Error reading text file {file_path}: {e}")
778
+ raise
779
+
780
+ def _extract_text_from_csv(self, csv_path: str) -> str:
781
+ """Extract text from CSV file."""
782
+ try:
783
+ import csv
784
+
785
+ text_parts = []
786
+ encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
787
+
788
+ for encoding in encodings:
789
+ try:
790
+ # Use _safe_open with binary mode, then decode for csv.reader
791
+ from io import StringIO
792
+
793
+ with self._safe_open(csv_path, "rb") as f:
794
+ text = f.read().decode(encoding)
795
+ reader = csv.reader(StringIO(text))
796
+ rows = list(reader)
797
+
798
+ if not rows:
799
+ raise ValueError("CSV file is empty")
800
+
801
+ # Include header as context
802
+ if rows:
803
+ header = rows[0]
804
+ text_parts.append(f"Columns: {', '.join(header)}\n")
805
+
806
+ # Convert rows to readable text
807
+ for row in rows[1:]:
808
+ # Create a readable row format
809
+ row_text = []
810
+ for i, cell in enumerate(row):
811
+ if i < len(header):
812
+ row_text.append(f"{header[i]}: {cell}")
813
+ else:
814
+ row_text.append(cell)
815
+ text_parts.append(" | ".join(row_text))
816
+
817
+ text = "\n".join(text_parts)
818
+
819
+ if self.config.show_stats:
820
+ print(
821
+ f" ✅ Loaded CSV file ({len(rows)} rows, {len(header)} columns)"
822
+ )
823
+
824
+ self.log.info(f"📊 Extracted {len(rows)} rows from CSV")
825
+ return text
826
+ except UnicodeDecodeError:
827
+ continue
828
+
829
+ raise ValueError(
830
+ f"Failed to decode CSV file: {csv_path}\n"
831
+ f"Tried encodings: {', '.join(encodings)}\n"
832
+ "Suggestions:\n"
833
+ " 1. Save the CSV file with UTF-8 encoding in Excel/LibreOffice\n"
834
+ " 2. Check if the file is a valid CSV (not corrupted)\n"
835
+ " 3. Try opening and re-saving in a text editor"
836
+ )
837
+ except Exception as e:
838
+ self.log.error(f"Error reading CSV {csv_path}: {e}")
839
+ raise
840
+
841
+ def _extract_text_from_json(self, json_path: str) -> str:
842
+ """Extract text from JSON file."""
843
+ try:
844
+ import json
845
+
846
+ # Use _safe_open to prevent symlink attacks
847
+ with self._safe_open(json_path, "rb") as f:
848
+ data = json.load(f)
849
+
850
+ # Convert JSON to readable text format
851
+ def json_to_text(obj, indent=0):
852
+ """Recursively convert JSON to readable text."""
853
+ lines = []
854
+ prefix = " " * indent
855
+
856
+ if isinstance(obj, dict):
857
+ for key, value in obj.items():
858
+ if isinstance(value, (dict, list)):
859
+ lines.append(f"{prefix}{key}:")
860
+ lines.extend(json_to_text(value, indent + 1))
861
+ else:
862
+ lines.append(f"{prefix}{key}: {value}")
863
+ elif isinstance(obj, list):
864
+ for i, item in enumerate(obj):
865
+ if isinstance(item, (dict, list)):
866
+ lines.append(f"{prefix}Item {i + 1}:")
867
+ lines.extend(json_to_text(item, indent + 1))
868
+ else:
869
+ lines.append(f"{prefix}- {item}")
870
+ else:
871
+ lines.append(f"{prefix}{obj}")
872
+
873
+ return lines
874
+
875
+ text = "\n".join(json_to_text(data))
876
+
877
+ if self.config.show_stats:
878
+ print(f" ✅ Loaded JSON file ({len(text):,} characters)")
879
+
880
+ self.log.info(f"📝 Extracted {len(text):,} characters from JSON")
881
+ return text
882
+ except Exception as e:
883
+ self.log.error(f"Error reading JSON {json_path}: {e}")
884
+ raise
885
+
886
+ def _extract_text_from_file(self, file_path: str) -> tuple:
887
+ """
888
+ Extract text from file based on type.
889
+
890
+ Returns:
891
+ (text, metadata_dict) tuple where metadata_dict contains:
892
+ - num_pages: int (for PDFs) or None
893
+ - vlm_pages: int (for PDFs with VLM) or None
894
+ - total_images: int (for PDFs with VLM) or None
895
+ """
896
+ file_type = self._get_file_type(file_path)
897
+ metadata = {"num_pages": None, "vlm_pages": None, "total_images": None}
898
+
899
+ # PDF files
900
+ if file_type == ".pdf":
901
+ text, num_pages, pdf_metadata = self._extract_text_from_pdf(file_path)
902
+ metadata["num_pages"] = num_pages
903
+ metadata["vlm_pages"] = pdf_metadata.get("vlm_pages", 0)
904
+ metadata["total_images"] = pdf_metadata.get("total_images", 0)
905
+ return text, metadata
906
+
907
+ # Text-based files
908
+ elif file_type in [".txt", ".md", ".markdown", ".rst", ".log"]:
909
+ return self._extract_text_from_text_file(file_path), metadata
910
+
911
+ # CSV files
912
+ elif file_type == ".csv":
913
+ return self._extract_text_from_csv(file_path), metadata
914
+
915
+ # JSON files
916
+ elif file_type == ".json":
917
+ return self._extract_text_from_json(file_path), metadata
918
+
919
+ # Code files (treat as text for Q&A purposes)
920
+ elif file_type in [
921
+ # Backend languages
922
+ ".py",
923
+ ".pyw", # Python
924
+ ".java", # Java
925
+ ".cpp",
926
+ ".cc",
927
+ ".cxx",
928
+ ".hpp",
929
+ ".h", # C++
930
+ ".c", # C
931
+ ".cs", # C#
932
+ ".go", # Go
933
+ ".rs", # Rust
934
+ ".rb", # Ruby
935
+ ".php", # PHP
936
+ ".swift", # Swift
937
+ ".kt",
938
+ ".kts", # Kotlin
939
+ ".scala", # Scala
940
+ # Web - JavaScript/TypeScript
941
+ ".js",
942
+ ".jsx", # JavaScript
943
+ ".ts",
944
+ ".tsx", # TypeScript
945
+ ".mjs",
946
+ ".cjs", # JavaScript modules
947
+ # Web - Frameworks
948
+ ".vue", # Vue.js
949
+ ".svelte", # Svelte
950
+ ".astro", # Astro
951
+ # Web - Styling
952
+ ".css", # CSS
953
+ ".scss",
954
+ ".sass", # Sass
955
+ ".less", # Less
956
+ ".styl",
957
+ ".stylus", # Stylus
958
+ # Web - Markup
959
+ ".html",
960
+ ".htm", # HTML
961
+ ".svg", # SVG
962
+ ".jsx",
963
+ ".tsx", # JSX/TSX (already listed but emphasizing)
964
+ # Scripting
965
+ ".sh",
966
+ ".bash", # Shell
967
+ ".ps1", # PowerShell
968
+ ".r",
969
+ ".R", # R
970
+ # Database
971
+ ".sql", # SQL
972
+ # Configuration
973
+ ".yaml",
974
+ ".yml", # YAML
975
+ ".xml", # XML
976
+ ".toml", # TOML
977
+ ".ini",
978
+ ".cfg",
979
+ ".conf", # Config files
980
+ ".env", # Environment files
981
+ ".properties", # Properties files
982
+ # Build & Package
983
+ ".gradle", # Gradle
984
+ ".cmake", # CMake
985
+ ".mk",
986
+ ".make", # Makefiles
987
+ # Documentation
988
+ ".rst", # ReStructuredText
989
+ ]:
990
+ self.log.info(f"Indexing code/web file: {file_type}")
991
+ return self._extract_text_from_text_file(file_path), metadata
992
+
993
+ # Unknown file type - try as text
994
+ else:
995
+ self.log.warning(
996
+ f"Unknown file type {file_type}, attempting to read as text"
997
+ )
998
+ return self._extract_text_from_text_file(file_path), metadata
999
+
1000
+ def _split_text_into_chunks(self, text: str) -> List[str]:
1001
+ """
1002
+ Split text into semantic chunks using LLM intelligence when available.
1003
+
1004
+ Uses intelligent splitting that:
1005
+ - Leverages LLM to identify natural semantic boundaries (if available)
1006
+ - Falls back to structural heuristics if LLM is not available
1007
+ - Respects natural document boundaries (paragraphs, sections)
1008
+ - Keeps semantic units together
1009
+ - Maintains context with overlap
1010
+
1011
+ This dramatically improves Q&A quality over naive word splitting.
1012
+ """
1013
+ self.log.info("📝 Splitting text into semantic chunks...")
1014
+
1015
+ chunks = []
1016
+ chunk_size_tokens = self.config.chunk_size
1017
+ overlap_tokens = self.config.chunk_overlap
1018
+
1019
+ # Try to use LLM for intelligent chunking if available
1020
+ if self.config.use_llm_chunking:
1021
+ # Ensure LLM client is initialized for chunking
1022
+ if self.llm_client is None:
1023
+ try:
1024
+ from gaia.llm.lemonade_client import LemonadeClient
1025
+
1026
+ self.llm_client = LemonadeClient()
1027
+ self.log.info("✅ Initialized LLM client for intelligent chunking")
1028
+ except Exception as e:
1029
+ self.log.warning(
1030
+ f"Failed to initialize LLM client for chunking: {e}"
1031
+ )
1032
+
1033
+ if self.llm_client is not None:
1034
+ try:
1035
+ return self._llm_based_chunking(
1036
+ text, chunk_size_tokens, overlap_tokens
1037
+ )
1038
+ except Exception as e:
1039
+ self.log.warning(
1040
+ f"LLM chunking failed, falling back to heuristic: {e}"
1041
+ )
1042
+
1043
+ # Fall back to heuristic-based chunking
1044
+
1045
+ # STEP 1: Identify and protect VLM content blocks as atomic units
1046
+ # VLM content starts with "[Page X] ### 🖼️ IMAGE" and continues until next image or end
1047
+ # We'll mark these sections to prevent splitting during paragraph processing
1048
+ vlm_pattern = r"\[Page \d+\] ### 🖼️ IMAGE \d+.*?(?=\[Page \d+\] ### 🖼️ IMAGE|\[Page \d+\]\n(?!### 🖼️)|\Z)"
1049
+
1050
+ # Find all VLM image blocks and replace them with placeholders temporarily
1051
+ vlm_blocks = []
1052
+ protected_text = text
1053
+ for i, match in enumerate(re.finditer(vlm_pattern, text, re.DOTALL)):
1054
+ placeholder = f"<<<VLM_BLOCK_{i}>>>"
1055
+ vlm_blocks.append(
1056
+ {"placeholder": placeholder, "content": match.group(0).strip()}
1057
+ )
1058
+ protected_text = protected_text.replace(match.group(0), placeholder, 1)
1059
+
1060
+ # STEP 2: Identify natural document boundaries
1061
+ # Look for markdown headers, section breaks, or significant whitespace
1062
+ # Use protected_text which has VLM blocks replaced with placeholders
1063
+ lines = protected_text.split("\n")
1064
+ sections = []
1065
+ current_section = []
1066
+
1067
+ for i, line in enumerate(lines):
1068
+ # Detect section boundaries:
1069
+ # 1. Markdown headers (# Header, ## Header, ### Header)
1070
+ # 2. Lines that look like titles (short, possibly capitalized)
1071
+ # 3. Horizontal rules (---, ===, ___)
1072
+ # 4. Significant whitespace gaps
1073
+
1074
+ is_boundary = False
1075
+
1076
+ # Check for markdown headers
1077
+ if line.strip().startswith("#"):
1078
+ is_boundary = True
1079
+ # Check for horizontal rules
1080
+ elif re.match(r"^[\-=_]{3,}$", line.strip()):
1081
+ is_boundary = True
1082
+ # Check for lines that look like section titles (short, might be all caps)
1083
+ elif line.strip() and len(line.strip()) < 100 and i > 0:
1084
+ # If previous line was empty and next line exists and is not empty
1085
+ prev_empty = i > 0 and not lines[i - 1].strip()
1086
+ next_exists = i < len(lines) - 1
1087
+ next_not_empty = next_exists and lines[i + 1].strip()
1088
+
1089
+ # Heuristic: likely a section header if surrounded by whitespace
1090
+ if prev_empty and next_not_empty:
1091
+ # Additional check: does it look like a title?
1092
+ # (starts with capital, no ending punctuation, relatively short)
1093
+ if line.strip()[0].isupper() and not line.strip()[-1] in ".!?,;":
1094
+ is_boundary = True
1095
+
1096
+ if is_boundary and current_section:
1097
+ # Save the current section
1098
+ sections.append("\n".join(current_section))
1099
+ current_section = [line]
1100
+ else:
1101
+ current_section.append(line)
1102
+
1103
+ # Don't forget the last section
1104
+ if current_section:
1105
+ sections.append("\n".join(current_section))
1106
+
1107
+ # If we didn't find many sections, try paragraph-based splitting
1108
+ if len(sections) <= 3:
1109
+ # Split by double newlines (paragraphs)
1110
+ paragraphs = re.split(r"\n\s*\n", text)
1111
+ # Filter out empty paragraphs
1112
+ paragraphs = [p.strip() for p in paragraphs if p.strip()]
1113
+ else:
1114
+ paragraphs = sections
1115
+
1116
+ # STEP 3: Mark paragraphs that are VLM content (should not be split)
1117
+ vlm_paragraphs = set()
1118
+ for idx, para in enumerate(paragraphs):
1119
+ # Check if this paragraph contains VLM markers
1120
+ if "### 🖼️ IMAGE" in para or "**Content Extracted from Images:**" in para:
1121
+ vlm_paragraphs.add(idx)
1122
+ self.log.debug(
1123
+ f"Paragraph {idx} marked as VLM content (will keep atomic)"
1124
+ )
1125
+
1126
+ current_chunk = []
1127
+ current_size = 0
1128
+
1129
+ for idx, para in enumerate(paragraphs):
1130
+ para = para.strip()
1131
+ if not para:
1132
+ continue
1133
+
1134
+ # Estimate tokens (rough: 1 token ≈ 4 characters)
1135
+ para_tokens = len(para) // 4
1136
+
1137
+ # Check if this is VLM content - if so, keep it atomic
1138
+ is_vlm_content = idx in vlm_paragraphs
1139
+
1140
+ # If single paragraph exceeds chunk size AND it's not VLM content, split by sentences
1141
+ if para_tokens > chunk_size_tokens and not is_vlm_content:
1142
+ # Split into sentences
1143
+ sentences = self._split_into_sentences(para)
1144
+
1145
+ for sentence in sentences:
1146
+ sentence_tokens = len(sentence) // 4
1147
+
1148
+ # If adding this sentence exceeds chunk size, save current chunk
1149
+ if (
1150
+ current_size + sentence_tokens > chunk_size_tokens
1151
+ and current_chunk
1152
+ ):
1153
+ chunks.append(" ".join(current_chunk))
1154
+
1155
+ # Keep overlap (last few sentences)
1156
+ overlap_text = " ".join(current_chunk)
1157
+ overlap_actual = len(overlap_text) // 4
1158
+ if overlap_actual > overlap_tokens:
1159
+ # Trim to overlap size
1160
+ current_chunk = self._get_last_n_tokens(
1161
+ overlap_text, overlap_tokens
1162
+ ).split()
1163
+ current_size = overlap_tokens
1164
+ else:
1165
+ current_chunk = []
1166
+ current_size = 0
1167
+
1168
+ current_chunk.append(sentence)
1169
+ current_size += sentence_tokens
1170
+ else:
1171
+ # Small paragraph - try to keep intact
1172
+ # SPECIAL CASE: If this is VLM content, keep it atomic even if it exceeds chunk size
1173
+ if is_vlm_content:
1174
+ if current_chunk:
1175
+ # Save current chunk before adding VLM content
1176
+ chunks.append(" ".join(current_chunk))
1177
+ current_chunk = []
1178
+ current_size = 0
1179
+
1180
+ # Add VLM content as its own chunk (atomic, not split)
1181
+ chunks.append(para)
1182
+ self.log.debug(
1183
+ f"Added VLM content as atomic chunk ({para_tokens} tokens)"
1184
+ )
1185
+
1186
+ elif current_size + para_tokens > chunk_size_tokens and current_chunk:
1187
+ # Save current chunk
1188
+ chunks.append(" ".join(current_chunk))
1189
+
1190
+ # Keep overlap
1191
+ overlap_text = " ".join(current_chunk)
1192
+ current_chunk = self._get_last_n_tokens(
1193
+ overlap_text, overlap_tokens
1194
+ ).split()
1195
+ current_size = len(" ".join(current_chunk)) // 4
1196
+
1197
+ current_chunk.append(para)
1198
+ current_size += para_tokens
1199
+ else:
1200
+ current_chunk.append(para)
1201
+ current_size += para_tokens
1202
+
1203
+ # Add final chunk
1204
+ if current_chunk:
1205
+ chunks.append(" ".join(current_chunk))
1206
+
1207
+ # STEP 4: Restore VLM blocks from placeholders
1208
+ if vlm_blocks:
1209
+ restored_chunks = []
1210
+ for chunk in chunks:
1211
+ restored_chunk = chunk
1212
+ # Replace placeholders with actual VLM content
1213
+ for vlm_block in vlm_blocks:
1214
+ if vlm_block["placeholder"] in restored_chunk:
1215
+ restored_chunk = restored_chunk.replace(
1216
+ vlm_block["placeholder"], vlm_block["content"]
1217
+ )
1218
+ restored_chunks.append(restored_chunk)
1219
+ chunks = restored_chunks
1220
+
1221
+ if self.config.show_stats:
1222
+ avg_size = sum(len(c) for c in chunks) // len(chunks) if chunks else 0
1223
+ print(f" ✅ Created {len(chunks)} semantic chunks (avg {avg_size} chars)")
1224
+
1225
+ self.log.info(f"📦 Created {len(chunks)} semantic chunks")
1226
+ return chunks
1227
+
1228
+ def _split_into_sentences(self, text: str) -> List[str]:
1229
+ """
1230
+ Split text into sentences using simple heuristics.
1231
+
1232
+ Better than word splitting, doesn't require NLTK dependency.
1233
+ """
1234
+ # Split on sentence endings followed by space and capital letter
1235
+
1236
+ # Handle common abbreviations that shouldn't split
1237
+ text = text.replace("Dr.", "Dr<DOT>")
1238
+ text = text.replace("Mr.", "Mr<DOT>")
1239
+ text = text.replace("Mrs.", "Mrs<DOT>")
1240
+ text = text.replace("Ms.", "Ms<DOT>")
1241
+ text = text.replace("Prof.", "Prof<DOT>")
1242
+ text = text.replace("Sr.", "Sr<DOT>")
1243
+ text = text.replace("Jr.", "Jr<DOT>")
1244
+ text = text.replace("vs.", "vs<DOT>")
1245
+ text = text.replace("e.g.", "e<DOT>g<DOT>")
1246
+ text = text.replace("i.e.", "i<DOT>e<DOT>")
1247
+ text = text.replace("etc.", "etc<DOT>")
1248
+
1249
+ # Split on sentence boundaries
1250
+ sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
1251
+
1252
+ # Restore abbreviations
1253
+ sentences = [s.replace("<DOT>", ".") for s in sentences]
1254
+
1255
+ return [s.strip() for s in sentences if s.strip()]
1256
+
1257
+ def _get_last_n_tokens(self, text: str, n_tokens: int) -> str:
1258
+ """Get approximately the last n tokens from text."""
1259
+ # Rough estimate: 1 token ≈ 4 characters
1260
+ target_chars = n_tokens * 4
1261
+ if len(text) <= target_chars:
1262
+ return text
1263
+
1264
+ # Try to break on word boundary
1265
+ trimmed = text[-target_chars:]
1266
+ first_space = trimmed.find(" ")
1267
+ if first_space > 0:
1268
+ return trimmed[first_space + 1 :]
1269
+ return trimmed
1270
+
1271
    def _create_vector_index(self, chunks: List[str]) -> tuple:
        """Create FAISS vector index from chunks with progress reporting.

        Embeds every chunk with the configured sentence embedder, then builds
        an exact (flat, L2-distance) FAISS index over the embeddings.

        Args:
            chunks: Text chunks to embed and index.

        Returns:
            Tuple of (faiss index over the embeddings, the same ``chunks``
            list that was passed in).
        """
        # Aliased import so any module-level `time` import is not shadowed.
        import time as time_module  # pylint: disable=reimported

        # Lazily initialize the embedding model (no-op if already loaded).
        self._load_embedder()

        # Generate embeddings with detailed progress
        self.log.info(f"🔍 Generating embeddings for {len(chunks)} chunks...")

        if self.config.show_stats:
            print(f"\n{'='*60}")
            print(" 🧠 COMPUTE INTENSIVE: Generating vector embeddings")
            print(f" 📊 Processing {len(chunks)} chunks")
            # 0.05 s/chunk is a rough throughput estimate for user feedback only.
            print(f" ⏱️ Estimated time: {len(chunks) * 0.05:.1f} seconds")
            print(f"{'='*60}")

        embed_start = time_module.time()
        embeddings = self._encode_texts(chunks, show_progress=self.config.show_stats)
        embed_duration = time_module.time() - embed_start

        if self.config.show_stats:
            print(
                f"\n ✅ Generated {embeddings.shape[0]} embeddings ({embeddings.shape[1]} dimensions)"
            )
            print(
                f" ⏱️ Embedding time: {embed_duration:.2f}s ({len(chunks)/embed_duration:.1f} chunks/sec)"
            )

        # Create FAISS index
        self.log.info("🏗️ Building FAISS search index...")

        if self.config.show_stats:
            print("\n 🏗️ Building FAISS search index...")

        index_start = time_module.time()
        dimension = embeddings.shape[1]
        # IndexFlatL2 does exact nearest-neighbor search; no training required.
        index = faiss.IndexFlatL2(dimension)
        # pylint: disable=no-value-for-parameter
        # FAISS requires float32 input vectors.
        index.add(embeddings.astype("float32"))
        index_duration = time_module.time() - index_start

        if self.config.show_stats:
            print(
                f" ✅ Built search index for {index.ntotal} vectors in {index_duration:.2f}s"
            )
            print(
                f" 💾 Memory: ~{(embeddings.nbytes / (1024**2)):.1f}MB for embeddings"
            )
            print(f"{'='*60}\n")

        self.log.info(
            f"📚 Index ready with {index.ntotal} vectors "
            f"(embed: {embed_duration:.2f}s, index: {index_duration:.2f}s)"
        )
        return index, chunks
1326
+
1327
    def remove_document(self, file_path: str) -> bool:
        """
        Remove a document from the index.

        Rebuilds the chunk list and both chunk/file mappings in a single
        pass, drops all per-file caches for the document, then recreates
        the global FAISS index from the surviving chunks.

        Args:
            file_path: Path to document to remove

        Returns:
            True if removal succeeded, False otherwise
        """
        file_path = str(Path(file_path).absolute())

        if file_path not in self.indexed_files:
            self.log.warning(f"Document not indexed: {file_path}")
            return False

        try:
            # Get chunk indices for this file
            if file_path in self.file_to_chunk_indices:
                chunk_indices_set = set(self.file_to_chunk_indices[file_path])

                # OPTIMIZED: Rebuild all structures in one O(N) pass
                # This is much faster than deleting in a loop (which is O(N²))
                new_chunks = []
                new_chunk_to_file = {}
                new_file_to_chunk_indices = {}

                # Single pass through all chunks - O(N)
                for old_idx, chunk in enumerate(self.chunks):
                    # Skip chunks from file being removed
                    if old_idx in chunk_indices_set:
                        continue

                    # Surviving chunks are renumbered compactly from 0.
                    new_idx = len(new_chunks)
                    new_chunks.append(chunk)

                    # Update chunk_to_file mapping
                    if old_idx in self.chunk_to_file:
                        file = self.chunk_to_file[old_idx]
                        new_chunk_to_file[new_idx] = file

                        # Update file_to_chunk_indices for this file
                        if file not in new_file_to_chunk_indices:
                            new_file_to_chunk_indices[file] = []
                        new_file_to_chunk_indices[file].append(new_idx)

                # Atomic replacement - all or nothing
                self.chunks = new_chunks
                self.chunk_to_file = new_chunk_to_file
                self.file_to_chunk_indices = new_file_to_chunk_indices

            # Remove from indexed files
            self.indexed_files.discard(file_path)

            # Clean up LRU tracking
            if file_path in self.file_access_times:
                del self.file_access_times[file_path]
            if file_path in self.file_index_times:
                del self.file_index_times[file_path]

            # Clean up cached per-file indices and embeddings
            if file_path in self.file_indices:
                del self.file_indices[file_path]
            if file_path in self.file_embeddings:
                del self.file_embeddings[file_path]

            # Clean up cached metadata
            if file_path in self.file_metadata:
                del self.file_metadata[file_path]

            # Rebuild index if chunks remain
            if self.chunks:
                # NOTE(review): a full re-embed of all remaining chunks; this
                # can be slow for large corpora but keeps the index consistent.
                self.index, self.chunks = self._create_vector_index(self.chunks)
                if self.config.show_stats:
                    print(f"✅ Removed {Path(file_path).name} from index")
                    print(
                        f"📊 Remaining: {len(self.indexed_files)} documents, {len(self.chunks)} chunks"
                    )
            else:
                self.index = None
                if self.config.show_stats:
                    print("✅ Removed last document from index")

            self.log.info(f"Successfully removed document: {file_path}")
            return True

        except Exception as e:
            self.log.error(f"Failed to remove document {file_path}: {e}")
            return False
1416
+
1417
+ def reindex_document(self, file_path: str) -> Dict[str, Any]:
1418
+ """
1419
+ Reindex a document (remove old chunks and add new ones).
1420
+
1421
+ Args:
1422
+ file_path: Path to document to reindex
1423
+
1424
+ Returns:
1425
+ Dict with indexing results and statistics (same as index_document)
1426
+ """
1427
+ file_path = str(Path(file_path).absolute())
1428
+
1429
+ # Remove old version if it exists
1430
+ if file_path in self.indexed_files:
1431
+ self.log.info(f"Removing old version of {file_path}")
1432
+ if not self.remove_document(file_path):
1433
+ return {
1434
+ "success": False,
1435
+ "error": "Failed to remove old version",
1436
+ "file_name": Path(file_path).name,
1437
+ }
1438
+
1439
+ # Index the new version
1440
+ self.log.info(f"Indexing new version of {file_path}")
1441
+ result = self.index_document(file_path)
1442
+ if result.get("success"):
1443
+ result["reindexed"] = True
1444
+ return result
1445
+
1446
+ def _evict_lru_document(self) -> bool:
1447
+ """
1448
+ Evict the least recently used document to free memory.
1449
+
1450
+ Returns:
1451
+ True if a document was evicted, False otherwise
1452
+ """
1453
+ if not self.config.enable_lru_eviction or not self.file_access_times:
1454
+ return False
1455
+
1456
+ # Find LRU file (oldest access time)
1457
+ lru_file = min(self.file_access_times, key=self.file_access_times.get)
1458
+
1459
+ if self.config.show_stats:
1460
+ print(
1461
+ f"📦 Memory limit reached, evicting LRU document: {Path(lru_file).name}"
1462
+ )
1463
+
1464
+ # Remove the LRU document
1465
+ return self.remove_document(lru_file)
1466
+
1467
+ def _check_memory_limits(self) -> None:
1468
+ """
1469
+ Check memory limits and evict documents if necessary.
1470
+ """
1471
+ # Check total chunks limit
1472
+ while (
1473
+ self.config.max_total_chunks > 0
1474
+ and len(self.chunks) > self.config.max_total_chunks
1475
+ and len(self.indexed_files) > 1
1476
+ ): # Keep at least one file
1477
+ if not self._evict_lru_document():
1478
+ break
1479
+
1480
+ # Check indexed files limit
1481
+ while (
1482
+ self.config.max_indexed_files > 0
1483
+ and len(self.indexed_files) > self.config.max_indexed_files
1484
+ ):
1485
+ if not self._evict_lru_document():
1486
+ break
1487
+
1488
    def index_document(self, file_path: str) -> Dict[str, Any]:
        """
        Index a document for retrieval.

        Supports:
        - Documents: PDF, TXT, MD, CSV, JSON
        - Backend Code: Python, Java, C/C++, Go, Rust, Ruby, PHP, Swift, Kotlin, Scala
        - Web Code: JavaScript/TypeScript, HTML, CSS/SCSS/SASS/LESS, Vue, Svelte, Astro
        - Config: YAML, XML, TOML, INI, ENV, Properties
        - Build: Gradle, CMake, Makefiles
        - Database: SQL

        Args:
            file_path: Path to document or code file

        Returns:
            Dict with indexing results and statistics:
            {
                "success": bool,
                "file_name": str,
                "file_type": str,
                "file_size_mb": float,
                "num_pages": int (for PDFs),
                "num_chunks": int,
                "total_indexed_files": int,
                "total_chunks": int,
                "error": str (if failed)
            }
            May also carry "already_indexed" or "from_cache" flags when the
            document was skipped or loaded from the pickle cache.

        Raises:
            ValueError: If file_path is empty or file doesn't exist
        """
        # Validate input
        if not file_path or not file_path.strip():
            raise ValueError("File path cannot be empty")

        # Initialize stats dict
        stats = {
            "success": False,
            "file_name": Path(file_path).name if file_path else "",
            "file_type": "",
            "file_size_mb": 0.0,
            "num_pages": None,
            "vlm_pages": None,
            "total_images": None,
            "num_chunks": 0,
            "total_indexed_files": len(self.indexed_files),
            "total_chunks": len(self.chunks),
        }

        # Check if file exists before processing
        if not os.path.exists(file_path):
            self.log.error(f"File not found: {file_path}")
            if self.config.show_stats:
                print(f"❌ File not found: {file_path}")
                print(" Please check the file path and try again")
            stats["error"] = f"File not found: {file_path}"
            return stats

        # Check if file is empty (early validation to save time)
        file_size = os.path.getsize(file_path)
        file_size_mb = file_size / (1024 * 1024)
        stats["file_size_mb"] = round(file_size_mb, 2)

        if file_size == 0:
            self.log.error(f"File is empty: {file_path}")
            if self.config.show_stats:
                print(f"❌ File is empty: {file_path}")
                print(" The file has no content to index")
            stats["error"] = "File is empty"
            return stats

        # Enforce maximum file size limit (prevent OOM)
        if file_size_mb > self.config.max_file_size_mb:
            error_msg = (
                f"File too large: {Path(file_path).name} ({file_size_mb:.1f}MB)\n"
                f"Maximum allowed: {self.config.max_file_size_mb}MB\n"
                "Suggestions:\n"
                " 1. Split the file into smaller documents\n"
                " 2. Increase max_file_size_mb in RAGConfig\n"
                " 3. Use a more powerful system with more RAM"
            )
            self.log.error(error_msg)
            if self.config.show_stats:
                print(f"❌ {error_msg}")
            stats["error"] = (
                f"File too large ({file_size_mb:.1f}MB > {self.config.max_file_size_mb}MB)"
            )
            return stats

        # Warn if file is large
        if file_size_mb > self.config.warn_file_size_mb:
            if self.config.show_stats:
                print(f"⚠️ Large file detected ({file_size_mb:.1f}MB)")
                print(" This may take 30-60 seconds to process...")
            self.log.warning(f"Processing large file: {file_size_mb:.1f}MB")

        # Convert to absolute path only after validation
        file_path = str(Path(file_path).absolute())

        # Get file type for logging
        file_type = self._get_file_type(file_path)
        stats["file_type"] = file_type
        stats["file_name"] = Path(file_path).name

        # Check if already indexed
        if file_path in self.indexed_files:
            if self.config.show_stats:
                print(f"📋 Document already indexed: {Path(file_path).name}")
            self.log.info(f"Document already indexed: {file_path}")
            stats["success"] = True
            stats["already_indexed"] = True
            stats["total_indexed_files"] = len(self.indexed_files)
            stats["total_chunks"] = len(self.chunks)
            return stats

        # Check cache - the cache key is based on file content hash
        cache_path = self._get_cache_path(file_path)

        # Also check for cached Markdown file with hash-based name
        # Extract the cache key from the pickle cache path to find matching MD file
        cache_filename = Path(cache_path).stem  # Remove .pkl extension
        md_cache_path = os.path.join(
            self.config.cache_dir, f"{cache_filename}_extracted.md"
        )

        if os.path.exists(cache_path):
            if self.config.show_stats:
                print(f"💾 Loading from cache: {Path(file_path).name}")
            self.log.info(f"📦 Loading cached index for: {file_path}")
            try:
                # NOTE(review): pickle.load on a local cache file this code
                # wrote itself — not untrusted input, but keep it that way.
                with open(cache_path, "rb") as f:
                    cached_data = pickle.load(f)
                cached_chunks = cached_data["chunks"]
                cached_full_text = cached_data.get(
                    "full_text", ""
                )  # May not exist in old caches
                cached_metadata = cached_data.get(
                    "metadata", {}
                )  # May not exist in old caches

                # Check if cache might be missing VLM content
                # If metadata doesn't have VLM info, it's an old cache
                if not cached_metadata.get("vlm_checked", False):
                    if self.config.show_stats:
                        print(
                            " ⚠️ Cache might be missing image text (pre-VLM cache)"
                        )
                        print(
                            " 💡 Use /clear-cache to force re-extraction with VLM"
                        )

                # Verify Markdown cache exists alongside pickle cache
                if os.path.exists(md_cache_path):
                    self.log.info(
                        f" ✅ Markdown cache also available: {md_cache_path}"
                    )

                if self.config.show_stats:
                    vlm_info = ""
                    if cached_metadata.get("vlm_pages", 0) > 0:
                        vlm_info = f" (VLM: {cached_metadata['vlm_pages']} pages)"
                    print(
                        f" ✅ Loaded {len(cached_chunks)} cached chunks{vlm_info}"
                    )

                # Track chunk indices for this file
                start_idx = len(self.chunks)
                file_chunk_indices = []

                if self.index is None:
                    # First document - use cached index directly
                    self.chunks = cached_chunks
                    # Track indices for all chunks (0 to len-1)
                    for i in range(len(cached_chunks)):
                        file_chunk_indices.append(i)
                        self.chunk_to_file[i] = file_path
                    self.index, self.chunks = self._create_vector_index(self.chunks)
                else:
                    # Merge with existing chunks and recreate index
                    old_count = len(self.chunks)
                    self.chunks.extend(cached_chunks)
                    # Track indices for new chunks (start_idx to start_idx+len-1)
                    for i in range(len(cached_chunks)):
                        chunk_idx = start_idx + i
                        file_chunk_indices.append(chunk_idx)
                        self.chunk_to_file[chunk_idx] = file_path
                    if self.config.show_stats:
                        print(
                            f" 🔄 Rebuilding index ({old_count} + {len(cached_chunks)} = {len(self.chunks)} chunks)"
                        )
                    self.index, self.chunks = self._create_vector_index(self.chunks)

                # Store file-to-chunk mapping
                self.file_to_chunk_indices[file_path] = file_chunk_indices

                # Restore metadata in memory
                if cached_full_text or cached_metadata:
                    self.file_metadata[file_path] = {
                        "full_text": cached_full_text,
                        **cached_metadata,
                    }

                self.indexed_files.add(file_path)
                if self.config.show_stats:
                    print(" ✅ Successfully loaded from cache")

                # Update stats for cache load
                stats["success"] = True
                stats["num_chunks"] = len(cached_chunks)
                stats["num_pages"] = cached_metadata.get("num_pages")
                stats["vlm_pages"] = cached_metadata.get("vlm_pages")
                stats["total_images"] = cached_metadata.get("total_images")
                stats["total_indexed_files"] = len(self.indexed_files)
                stats["total_chunks"] = len(self.chunks)
                stats["from_cache"] = True
                return stats
            except Exception as e:
                # Corrupt/stale cache is non-fatal: fall through to re-extract.
                self.log.warning(f"Cache load failed: {e}, reindexing")
                if self.config.show_stats:
                    print(" ⚠️ Cache loading failed, will reindex from scratch")

        # Extract and process document
        if self.config.show_stats:
            print(f"🚀 Starting to index: {Path(file_path).name} ({file_type})")
        self.log.info(f"📄 Indexing document: {file_path} ({file_type})")

        try:
            # Extract text based on file type
            text, file_metadata = self._extract_text_from_file(file_path)

            # Store metadata in stats if available (for PDFs)
            if file_metadata.get("num_pages"):
                stats["num_pages"] = file_metadata["num_pages"]
            if file_metadata.get("vlm_pages"):
                stats["vlm_pages"] = file_metadata["vlm_pages"]
            if file_metadata.get("total_images"):
                stats["total_images"] = file_metadata["total_images"]

            if not text.strip():
                error_msg = (
                    f"No text content found in {file_type} file: {Path(file_path).name}\n"
                    "Possible reasons:\n"
                    " 1. The file contains only images or non-text content\n"
                    " 2. The file is password-protected (PDFs)\n"
                    " 3. The file uses an unsupported format\n"
                    " 4. The text extraction failed\n"
                    "Try opening the file manually to verify it contains readable text"
                )
                stats["error"] = "No text content found"
                raise ValueError(error_msg)

            # Split into chunks
            new_chunks = self._split_text_into_chunks(text)

            # Track which chunks belong to this file
            file_chunk_indices = []
            start_idx = len(self.chunks)

            # Add to existing chunks or create new
            if self.chunks:
                old_count = len(self.chunks)
                self.chunks.extend(new_chunks)

                # Track the indices of chunks for this file
                for i in range(start_idx, start_idx + len(new_chunks)):
                    file_chunk_indices.append(i)
                    self.chunk_to_file[i] = file_path

                if self.config.show_stats:
                    print(
                        f"🔄 Rebuilding search index ({old_count} + {len(new_chunks)} = {len(self.chunks)} total chunks)"
                    )
                self.index, self.chunks = self._create_vector_index(self.chunks)
            else:
                # First file being indexed
                for i in range(len(new_chunks)):
                    file_chunk_indices.append(i)
                    self.chunk_to_file[i] = file_path

                if self.config.show_stats:
                    print("🏗️ Building initial search index...")
                self.index, self.chunks = self._create_vector_index(new_chunks)

            # Store the file-to-chunks mapping
            self.file_to_chunk_indices[file_path] = file_chunk_indices

            # Build and cache per-file FAISS index for fast file-specific searches
            if self.config.show_stats:
                print("🔍 Building per-file search index...")

            self._load_embedder()
            # Generate embeddings for this file's chunks only
            file_embeddings = self._encode_texts(new_chunks, show_progress=False)

            # Create FAISS index for this file
            dimension = file_embeddings.shape[1]
            file_index = faiss.IndexFlatL2(dimension)
            # pylint: disable=no-value-for-parameter
            file_index.add(file_embeddings.astype("float32"))

            # Cache the index and embeddings for this file
            self.file_indices[file_path] = file_index
            self.file_embeddings[file_path] = file_embeddings

            if self.config.show_stats:
                print(f"✅ Cached per-file index with {len(new_chunks)} chunks")

            # Cache the results for this specific document
            if self.config.show_stats:
                print("💾 Caching processed chunks...")
            cache_data = {
                "chunks": new_chunks,  # Cache only new chunks for this document
                "full_text": text,  # Cache full extracted text (for /dump)
                "metadata": file_metadata,  # Cache metadata (num_pages, vlm_pages, etc.)
            }
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)

            # Store metadata in memory for fast access
            self.file_metadata[file_path] = {
                "full_text": text,
                **file_metadata,  # num_pages, vlm_pages, total_images
            }

            # Auto-save markdown version to cache directory for easy access
            self._save_extracted_markdown(file_path, text, file_metadata)

            self.indexed_files.add(file_path)

            # Track index time for LRU
            current_time = time.time()
            self.file_index_times[file_path] = current_time
            self.file_access_times[file_path] = current_time

            # Check memory limits and evict if necessary
            self._check_memory_limits()

            if self.config.show_stats:
                print(f"✅ Successfully indexed {Path(file_path).name}")
                print(
                    f"📊 Total: {len(self.indexed_files)} documents, {len(self.chunks)} chunks"
                )
                if self.config.enable_lru_eviction:
                    print(
                        f"📈 Memory usage: {len(self.chunks)}/{self.config.max_total_chunks} chunks, "
                        f"{len(self.indexed_files)}/{self.config.max_indexed_files} files"
                    )

            self.log.info(f"✅ Successfully indexed {file_path}")

            # Update final stats
            stats["success"] = True
            stats["num_chunks"] = len(new_chunks)
            stats["total_indexed_files"] = len(self.indexed_files)
            stats["total_chunks"] = len(self.chunks)
            return stats

        except Exception as e:
            if self.config.show_stats:
                print(f"❌ Failed to index {Path(file_path).name}: {e}")
            self.log.error(f"Failed to index {file_path}: {e}")
            stats["error"] = str(e)
            return stats
1852
+
1853
    def _retrieve_chunks_from_file(self, query: str, file_path: str) -> tuple:
        """
        Retrieve relevant chunks from a specific file using cached per-file index.

        This is much faster than the global search because:
        1. Uses pre-computed embeddings (no re-encoding)
        2. Searches smaller, file-specific FAISS index
        3. No need to rebuild index on each query

        Args:
            query: Query text to embed and search with.
            file_path: Path of an already-indexed file to restrict the search to.

        Returns:
            (chunks, scores) tuple of matching chunk texts and similarity scores.

        Raises:
            ValueError: If no documents are indexed, or this file is not indexed.
        """
        if self.index is None or not self.chunks:
            raise ValueError("No documents indexed. Call index_document() first.")

        if file_path not in self.file_to_chunk_indices:
            raise ValueError(f"File not indexed: {file_path}")

        # Update access time for LRU tracking
        self.file_access_times[file_path] = time.time()

        # Get chunk indices for this file
        file_chunk_indices = self.file_to_chunk_indices[file_path]
        if not file_chunk_indices:
            return [], []

        # Get chunks for this file
        file_chunks = [self.chunks[i] for i in file_chunk_indices]

        # Use CACHED per-file index (this is the performance fix!)
        if file_path not in self.file_indices:
            # Index not cached - build it now (shouldn't happen normally)
            self.log.warning(f"Per-file index not cached for {file_path}, rebuilding")
            self._load_embedder()
            file_embeddings = self._encode_texts(file_chunks, show_progress=False)
            dimension = file_embeddings.shape[1]
            file_index = faiss.IndexFlatL2(dimension)
            # pylint: disable=no-value-for-parameter
            file_index.add(file_embeddings.astype("float32"))
            self.file_indices[file_path] = file_index
            self.file_embeddings[file_path] = file_embeddings
        else:
            # Use cached index - FAST!
            file_index = self.file_indices[file_path]

        # Encode query only once
        self._load_embedder()
        query_embedding = self._encode_texts([query], show_progress=False)

        # Search in cached file-specific index
        k = min(self.config.max_chunks, len(file_chunks))
        # pylint: disable=no-value-for-parameter
        distances, indices = file_index.search(query_embedding.astype("float32"), k)

        # Get matching chunks and scores
        retrieved_chunks = []
        scores = []
        for idx, dist in zip(indices[0], distances[0]):
            if idx < len(file_chunks):  # Safety check
                retrieved_chunks.append(file_chunks[idx])
                # Convert distance to similarity score
                score = 1.0 / (1.0 + float(dist))
                scores.append(score)

        if self.config.show_stats:
            print(
                f" ✅ Found {len(retrieved_chunks)} relevant chunks from {Path(file_path).name} (using cached index)"
            )

        return retrieved_chunks, scores
1920
+
1921
+ def _retrieve_chunks_with_metadata(self, query: str) -> tuple:
1922
+ """
1923
+ Retrieve chunks with metadata about their source files.
1924
+
1925
+ Returns:
1926
+ (chunks, scores, metadata) tuple
1927
+ """
1928
+ chunks, scores = self._retrieve_chunks(query)
1929
+
1930
+ # Build metadata for each chunk
1931
+ metadata = []
1932
+ for i, (chunk, score) in enumerate(zip(chunks, scores)):
1933
+ # Find which file this chunk came from
1934
+ chunk_idx = self.chunks.index(chunk) if chunk in self.chunks else -1
1935
+ source_file = self.chunk_to_file.get(chunk_idx, "unknown")
1936
+
1937
+ metadata.append(
1938
+ {
1939
+ "chunk_index": i + 1,
1940
+ "source_file": (
1941
+ Path(source_file).name
1942
+ if source_file != "unknown"
1943
+ else "unknown"
1944
+ ),
1945
+ "source_path": source_file,
1946
+ "relevance_score": float(score),
1947
+ "chunk_length": len(chunk),
1948
+ "estimated_tokens": len(chunk) // 4, # Rough token estimate
1949
+ }
1950
+ )
1951
+
1952
+ return chunks, scores, metadata
1953
+
1954
+ def _retrieve_chunks(self, query: str) -> tuple:
1955
+ """Retrieve relevant chunks for query."""
1956
+ if self.index is None or not self.chunks:
1957
+ raise ValueError("No documents indexed. Call index_document() first.")
1958
+
1959
+ self._load_embedder()
1960
+
1961
+ # Generate query embedding
1962
+ if self.config.show_stats:
1963
+ print(f"🔍 Searching through {len(self.chunks)} chunks...")
1964
+ self.log.debug(f"Encoding query: {query[:50]}...")
1965
+ query_embedding = self._encode_texts([query], show_progress=False)
1966
+
1967
+ # Search for similar chunks
1968
+ k = min(self.config.max_chunks, len(self.chunks))
1969
+ if self.config.show_stats:
1970
+ print(f" 🎯 Finding {k} most relevant chunks...")
1971
+ # pylint: disable=no-value-for-parameter
1972
+ distances, indices = self.index.search(query_embedding.astype("float32"), k)
1973
+
1974
+ # Get chunks and scores
1975
+ retrieved_chunks = [self.chunks[i] for i in indices[0]]
1976
+ # Convert distances to similarity scores (lower distance = higher similarity)
1977
+ scores = [1.0 / (1.0 + dist) for dist in distances[0]]
1978
+
1979
+ if self.config.show_stats:
1980
+ print(
1981
+ f" ✅ Retrieved {len(retrieved_chunks)} chunks (avg relevance: {sum(scores)/len(scores):.3f})"
1982
+ )
1983
+
1984
+ self.log.debug(
1985
+ f"Retrieved {len(retrieved_chunks)} chunks with scores: {[f'{s:.3f}' for s in scores]}"
1986
+ )
1987
+ return retrieved_chunks, scores
1988
+
1989
    def query(self, question: str, include_metadata: bool = True) -> RAGResponse:
        """
        Query the indexed documents with enhanced metadata tracking.

        Retrieves the most relevant chunks, builds a context-stuffed prompt,
        and asks the LLM to answer from that context only.

        Args:
            question: Question to ask about the documents
            include_metadata: Whether to include detailed metadata in response

        Returns:
            RAGResponse with answer, retrieved chunks, and metadata

        Raises:
            ValueError: If no documents have been indexed yet.
        """
        if self.index is None:
            raise ValueError("No documents indexed. Call index_document() first.")

        # Retrieve relevant chunks with metadata
        if include_metadata:
            chunks, scores, chunk_metadata = self._retrieve_chunks_with_metadata(
                question
            )
        else:
            chunks, scores = self._retrieve_chunks(question)
            chunk_metadata = None

        # Build context from retrieved chunks
        context = "\n\n".join(
            [f"Context {i+1}:\n{chunk}" for i, chunk in enumerate(chunks)]
        )

        # Create prompt
        prompt = f"""Based on the following context, please answer the question. If the answer is not in the context, say so.

Context:
{context}

Question: {question}

Answer:"""

        # Get LLM response
        response = self.chat.send(prompt)

        # Build query metadata
        query_metadata = None
        if include_metadata and chunk_metadata:
            # Get unique source files (deduplicated; excludes unresolved chunks)
            source_files = list(
                set(
                    m["source_file"]
                    for m in chunk_metadata
                    if m["source_file"] != "unknown"
                )
            )

            query_metadata = {
                "question": question,
                "num_chunks_retrieved": len(chunks),
                "source_files": source_files,
                "total_indexed_files": len(self.indexed_files),
                "total_indexed_chunks": len(self.chunks),
                "average_relevance_score": float(np.mean(scores)) if scores else 0.0,
                "max_relevance_score": float(max(scores)) if scores else 0.0,
                "min_relevance_score": float(min(scores)) if scores else 0.0,
            }

            # Collect source files list (one entry per chunk, duplicates kept)
            source_files_list = [m["source_file"] for m in chunk_metadata]
        else:
            source_files_list = None

        return RAGResponse(
            text=response.text,
            chunks=chunks,
            chunk_scores=scores,
            stats=response.stats,
            source_files=source_files_list,
            chunk_metadata=chunk_metadata,
            query_metadata=query_metadata,
        )
2067
+
2068
    def _save_extracted_markdown(
        self, file_path: str, text: str, metadata: Dict[str, Any]
    ):
        """
        Save extracted text as markdown file in cache directory.

        This creates a human-readable markdown version of the extracted text
        that can be used for /dump commands and debugging without re-extraction.
        Uses hash-based naming to match the pickle cache for consistency.
        Failures are logged but never propagate (best-effort side artifact).

        Args:
            file_path: Path to original document
            text: Extracted text content
            metadata: File metadata (num_pages, vlm_pages, etc.)
        """
        try:
            from datetime import datetime

            # Calculate file hash for consistency with pickle cache
            path = Path(file_path).absolute()
            hasher = hashlib.sha256()
            with self._safe_open(path, "rb") as f:
                # Stream the file in 8KB blocks so large files are not
                # loaded into memory all at once.
                while chunk := f.read(8192):
                    hasher.update(chunk)
            content_hash = hasher.hexdigest()

            # Use hash-based filename similar to pickle cache:
            # <16 hex of path hash>_<32 hex of content hash>_extracted.md
            path_hash = hashlib.sha256(str(path).encode()).hexdigest()[:16]
            cache_key = f"{path_hash}_{content_hash[:32]}"
            md_filename = f"{cache_key}_extracted.md"
            md_path = os.path.join(self.config.cache_dir, md_filename)

            # Create markdown content with metadata header
            vlm_status = (
                "✅ Enabled"
                if metadata.get("vlm_available", False)
                else "❌ Not Available"
            )
            markdown_content = f"""# Extracted Text from {Path(file_path).name}

## Metadata
**Source File:** {file_path}
**File Hash (SHA-256):** {content_hash[:32]}
**Extraction Date:** {datetime.now().isoformat()}
**Pages:** {metadata.get('num_pages', 'N/A')}
**VLM Status:** {vlm_status}
**VLM Pages (with images):** {metadata.get('vlm_pages', 0)}
**Total Images Processed:** {metadata.get('total_images', 0)}

---

## Extracted Content
{text}
"""

            # Write markdown file
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(markdown_content)

            self.log.debug(f"Saved extracted markdown to {md_path}")

        except Exception as e:
            # Don't fail indexing if markdown save fails
            self.log.warning(
                f"Failed to save markdown cache for {Path(file_path).name}: {e}"
            )
2134
+
2135
+ def clear_cache(self):
2136
+ """Clear the RAG cache."""
2137
+ import shutil
2138
+
2139
+ if os.path.exists(self.config.cache_dir):
2140
+ shutil.rmtree(self.config.cache_dir)
2141
+ os.makedirs(self.config.cache_dir, exist_ok=True)
2142
+ self.log.info("Cache cleared")
2143
+
2144
+ def get_status(self) -> Dict[str, Any]:
2145
+ """Get RAG system status."""
2146
+ return {
2147
+ "indexed_files": len(self.indexed_files),
2148
+ "total_chunks": len(self.chunks) if self.chunks else 0,
2149
+ "cache_dir": self.config.cache_dir,
2150
+ "embedding_model": self.config.embedding_model,
2151
+ "config": {
2152
+ "chunk_size": self.config.chunk_size,
2153
+ "chunk_overlap": self.config.chunk_overlap,
2154
+ "max_chunks": self.config.max_chunks,
2155
+ },
2156
+ }
2157
+
2158
+
2159
def quick_rag(pdf_path: str, question: str, **kwargs) -> str:
    """
    Convenience function for quick RAG query.

    Args:
        pdf_path: Path to PDF file
        question: Question to ask
        **kwargs: Additional config parameters

    Returns:
        Answer text

    Raises:
        ValueError: If pdf_path is empty, question is empty, or file doesn't exist
    """
    # Reject blank arguments up front, before touching the filesystem.
    if not pdf_path or not pdf_path.strip():
        raise ValueError("PDF path cannot be empty")
    if not question or not question.strip():
        raise ValueError("Question cannot be empty")

    # The document must exist before we spin up a pipeline.
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF file not found: {pdf_path}")

    # One-shot pipeline: configure, index, then query.
    rag = RAGSDK(RAGConfig(**kwargs))

    outcome = rag.index_document(pdf_path)
    if not outcome.get("success"):
        error = outcome.get("error", "Unknown error")
        raise ValueError(f"Failed to index document: {pdf_path}. Error: {error}")

    return rag.query(question).text