amd_gaia-0.14.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_gaia-0.14.1.dist-info/METADATA +768 -0
- amd_gaia-0.14.1.dist-info/RECORD +800 -0
- amd_gaia-0.14.1.dist-info/WHEEL +5 -0
- amd_gaia-0.14.1.dist-info/entry_points.txt +5 -0
- amd_gaia-0.14.1.dist-info/licenses/LICENSE.md +21 -0
- amd_gaia-0.14.1.dist-info/top_level.txt +1 -0
- gaia/__init__.py +2 -0
- gaia/agents/__init__.py +19 -0
- gaia/agents/base/__init__.py +9 -0
- gaia/agents/base/agent.py +2072 -0
- gaia/agents/base/api_agent.py +120 -0
- gaia/agents/base/console.py +1457 -0
- gaia/agents/base/mcp_agent.py +86 -0
- gaia/agents/base/tools.py +83 -0
- gaia/agents/blender/agent.py +556 -0
- gaia/agents/blender/agent_simple.py +135 -0
- gaia/agents/blender/app.py +211 -0
- gaia/agents/blender/app_simple.py +41 -0
- gaia/agents/blender/core/__init__.py +16 -0
- gaia/agents/blender/core/materials.py +506 -0
- gaia/agents/blender/core/objects.py +316 -0
- gaia/agents/blender/core/rendering.py +225 -0
- gaia/agents/blender/core/scene.py +220 -0
- gaia/agents/blender/core/view.py +146 -0
- gaia/agents/chat/__init__.py +9 -0
- gaia/agents/chat/agent.py +975 -0
- gaia/agents/chat/app.py +1058 -0
- gaia/agents/chat/session.py +508 -0
- gaia/agents/chat/tools/__init__.py +15 -0
- gaia/agents/chat/tools/file_tools.py +96 -0
- gaia/agents/chat/tools/rag_tools.py +1729 -0
- gaia/agents/chat/tools/shell_tools.py +436 -0
- gaia/agents/code/__init__.py +7 -0
- gaia/agents/code/agent.py +547 -0
- gaia/agents/code/app.py +266 -0
- gaia/agents/code/models.py +135 -0
- gaia/agents/code/orchestration/__init__.py +24 -0
- gaia/agents/code/orchestration/checklist_executor.py +1739 -0
- gaia/agents/code/orchestration/checklist_generator.py +709 -0
- gaia/agents/code/orchestration/factories/__init__.py +9 -0
- gaia/agents/code/orchestration/factories/base.py +63 -0
- gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -0
- gaia/agents/code/orchestration/factories/python_factory.py +106 -0
- gaia/agents/code/orchestration/orchestrator.py +610 -0
- gaia/agents/code/orchestration/project_analyzer.py +391 -0
- gaia/agents/code/orchestration/steps/__init__.py +67 -0
- gaia/agents/code/orchestration/steps/base.py +188 -0
- gaia/agents/code/orchestration/steps/error_handler.py +314 -0
- gaia/agents/code/orchestration/steps/nextjs.py +828 -0
- gaia/agents/code/orchestration/steps/python.py +307 -0
- gaia/agents/code/orchestration/template_catalog.py +463 -0
- gaia/agents/code/orchestration/workflows/__init__.py +14 -0
- gaia/agents/code/orchestration/workflows/base.py +80 -0
- gaia/agents/code/orchestration/workflows/nextjs.py +186 -0
- gaia/agents/code/orchestration/workflows/python.py +94 -0
- gaia/agents/code/prompts/__init__.py +11 -0
- gaia/agents/code/prompts/base_prompt.py +77 -0
- gaia/agents/code/prompts/code_patterns.py +1925 -0
- gaia/agents/code/prompts/nextjs_prompt.py +40 -0
- gaia/agents/code/prompts/python_prompt.py +109 -0
- gaia/agents/code/schema_inference.py +365 -0
- gaia/agents/code/system_prompt.py +41 -0
- gaia/agents/code/tools/__init__.py +42 -0
- gaia/agents/code/tools/cli_tools.py +1138 -0
- gaia/agents/code/tools/code_formatting.py +319 -0
- gaia/agents/code/tools/code_tools.py +769 -0
- gaia/agents/code/tools/error_fixing.py +1347 -0
- gaia/agents/code/tools/external_tools.py +180 -0
- gaia/agents/code/tools/file_io.py +845 -0
- gaia/agents/code/tools/prisma_tools.py +190 -0
- gaia/agents/code/tools/project_management.py +1016 -0
- gaia/agents/code/tools/testing.py +321 -0
- gaia/agents/code/tools/typescript_tools.py +122 -0
- gaia/agents/code/tools/validation_parsing.py +461 -0
- gaia/agents/code/tools/validation_tools.py +803 -0
- gaia/agents/code/tools/web_dev_tools.py +1744 -0
- gaia/agents/code/validators/__init__.py +16 -0
- gaia/agents/code/validators/antipattern_checker.py +241 -0
- gaia/agents/code/validators/ast_analyzer.py +197 -0
- gaia/agents/code/validators/requirements_validator.py +145 -0
- gaia/agents/code/validators/syntax_validator.py +171 -0
- gaia/agents/docker/__init__.py +7 -0
- gaia/agents/docker/agent.py +642 -0
- gaia/agents/jira/__init__.py +11 -0
- gaia/agents/jira/agent.py +894 -0
- gaia/agents/jira/jql_templates.py +299 -0
- gaia/agents/routing/__init__.py +7 -0
- gaia/agents/routing/agent.py +512 -0
- gaia/agents/routing/system_prompt.py +75 -0
- gaia/api/__init__.py +23 -0
- gaia/api/agent_registry.py +238 -0
- gaia/api/app.py +305 -0
- gaia/api/openai_server.py +575 -0
- gaia/api/schemas.py +186 -0
- gaia/api/sse_handler.py +370 -0
- gaia/apps/__init__.py +4 -0
- gaia/apps/llm/__init__.py +6 -0
- gaia/apps/llm/app.py +169 -0
- gaia/apps/summarize/app.py +633 -0
- gaia/apps/summarize/html_viewer.py +133 -0
- gaia/apps/summarize/pdf_formatter.py +284 -0
- gaia/audio/__init__.py +2 -0
- gaia/audio/audio_client.py +439 -0
- gaia/audio/audio_recorder.py +269 -0
- gaia/audio/kokoro_tts.py +599 -0
- gaia/audio/whisper_asr.py +432 -0
- gaia/chat/__init__.py +16 -0
- gaia/chat/app.py +430 -0
- gaia/chat/prompts.py +522 -0
- gaia/chat/sdk.py +1200 -0
- gaia/cli.py +5621 -0
- gaia/eval/batch_experiment.py +2332 -0
- gaia/eval/claude.py +542 -0
- gaia/eval/config.py +37 -0
- gaia/eval/email_generator.py +512 -0
- gaia/eval/eval.py +3179 -0
- gaia/eval/groundtruth.py +1130 -0
- gaia/eval/transcript_generator.py +582 -0
- gaia/eval/webapp/README.md +168 -0
- gaia/eval/webapp/node_modules/.bin/mime +16 -0
- gaia/eval/webapp/node_modules/.bin/mime.cmd +17 -0
- gaia/eval/webapp/node_modules/.bin/mime.ps1 +28 -0
- gaia/eval/webapp/node_modules/.package-lock.json +865 -0
- gaia/eval/webapp/node_modules/accepts/HISTORY.md +243 -0
- gaia/eval/webapp/node_modules/accepts/LICENSE +23 -0
- gaia/eval/webapp/node_modules/accepts/README.md +140 -0
- gaia/eval/webapp/node_modules/accepts/index.js +238 -0
- gaia/eval/webapp/node_modules/accepts/package.json +47 -0
- gaia/eval/webapp/node_modules/array-flatten/LICENSE +21 -0
- gaia/eval/webapp/node_modules/array-flatten/README.md +43 -0
- gaia/eval/webapp/node_modules/array-flatten/array-flatten.js +64 -0
- gaia/eval/webapp/node_modules/array-flatten/package.json +39 -0
- gaia/eval/webapp/node_modules/body-parser/HISTORY.md +672 -0
- gaia/eval/webapp/node_modules/body-parser/LICENSE +23 -0
- gaia/eval/webapp/node_modules/body-parser/README.md +476 -0
- gaia/eval/webapp/node_modules/body-parser/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/body-parser/index.js +156 -0
- gaia/eval/webapp/node_modules/body-parser/lib/read.js +205 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/json.js +247 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/raw.js +101 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/text.js +121 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/urlencoded.js +307 -0
- gaia/eval/webapp/node_modules/body-parser/package.json +56 -0
- gaia/eval/webapp/node_modules/bytes/History.md +97 -0
- gaia/eval/webapp/node_modules/bytes/LICENSE +23 -0
- gaia/eval/webapp/node_modules/bytes/Readme.md +152 -0
- gaia/eval/webapp/node_modules/bytes/index.js +170 -0
- gaia/eval/webapp/node_modules/bytes/package.json +42 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.eslintrc +17 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.nycrc +9 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/CHANGELOG.md +30 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/LICENSE +21 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/README.md +62 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.js +10 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.d.ts +19 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.js +10 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.d.ts +64 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.js +15 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/package.json +85 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.d.ts +3 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/test/index.js +63 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/call-bound/.eslintrc +13 -0
- gaia/eval/webapp/node_modules/call-bound/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/call-bound/.nycrc +9 -0
- gaia/eval/webapp/node_modules/call-bound/CHANGELOG.md +42 -0
- gaia/eval/webapp/node_modules/call-bound/LICENSE +21 -0
- gaia/eval/webapp/node_modules/call-bound/README.md +53 -0
- gaia/eval/webapp/node_modules/call-bound/index.d.ts +94 -0
- gaia/eval/webapp/node_modules/call-bound/index.js +19 -0
- gaia/eval/webapp/node_modules/call-bound/package.json +99 -0
- gaia/eval/webapp/node_modules/call-bound/test/index.js +61 -0
- gaia/eval/webapp/node_modules/call-bound/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/content-disposition/HISTORY.md +60 -0
- gaia/eval/webapp/node_modules/content-disposition/LICENSE +22 -0
- gaia/eval/webapp/node_modules/content-disposition/README.md +142 -0
- gaia/eval/webapp/node_modules/content-disposition/index.js +458 -0
- gaia/eval/webapp/node_modules/content-disposition/package.json +44 -0
- gaia/eval/webapp/node_modules/content-type/HISTORY.md +29 -0
- gaia/eval/webapp/node_modules/content-type/LICENSE +22 -0
- gaia/eval/webapp/node_modules/content-type/README.md +94 -0
- gaia/eval/webapp/node_modules/content-type/index.js +225 -0
- gaia/eval/webapp/node_modules/content-type/package.json +42 -0
- gaia/eval/webapp/node_modules/cookie/LICENSE +24 -0
- gaia/eval/webapp/node_modules/cookie/README.md +317 -0
- gaia/eval/webapp/node_modules/cookie/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/cookie/index.js +334 -0
- gaia/eval/webapp/node_modules/cookie/package.json +44 -0
- gaia/eval/webapp/node_modules/cookie-signature/.npmignore +4 -0
- gaia/eval/webapp/node_modules/cookie-signature/History.md +38 -0
- gaia/eval/webapp/node_modules/cookie-signature/Readme.md +42 -0
- gaia/eval/webapp/node_modules/cookie-signature/index.js +51 -0
- gaia/eval/webapp/node_modules/cookie-signature/package.json +18 -0
- gaia/eval/webapp/node_modules/debug/.coveralls.yml +1 -0
- gaia/eval/webapp/node_modules/debug/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/debug/.npmignore +9 -0
- gaia/eval/webapp/node_modules/debug/.travis.yml +14 -0
- gaia/eval/webapp/node_modules/debug/CHANGELOG.md +362 -0
- gaia/eval/webapp/node_modules/debug/LICENSE +19 -0
- gaia/eval/webapp/node_modules/debug/Makefile +50 -0
- gaia/eval/webapp/node_modules/debug/README.md +312 -0
- gaia/eval/webapp/node_modules/debug/component.json +19 -0
- gaia/eval/webapp/node_modules/debug/karma.conf.js +70 -0
- gaia/eval/webapp/node_modules/debug/node.js +1 -0
- gaia/eval/webapp/node_modules/debug/package.json +49 -0
- gaia/eval/webapp/node_modules/debug/src/browser.js +185 -0
- gaia/eval/webapp/node_modules/debug/src/debug.js +202 -0
- gaia/eval/webapp/node_modules/debug/src/index.js +10 -0
- gaia/eval/webapp/node_modules/debug/src/inspector-log.js +15 -0
- gaia/eval/webapp/node_modules/debug/src/node.js +248 -0
- gaia/eval/webapp/node_modules/depd/History.md +103 -0
- gaia/eval/webapp/node_modules/depd/LICENSE +22 -0
- gaia/eval/webapp/node_modules/depd/Readme.md +280 -0
- gaia/eval/webapp/node_modules/depd/index.js +538 -0
- gaia/eval/webapp/node_modules/depd/lib/browser/index.js +77 -0
- gaia/eval/webapp/node_modules/depd/package.json +45 -0
- gaia/eval/webapp/node_modules/destroy/LICENSE +23 -0
- gaia/eval/webapp/node_modules/destroy/README.md +63 -0
- gaia/eval/webapp/node_modules/destroy/index.js +209 -0
- gaia/eval/webapp/node_modules/destroy/package.json +48 -0
- gaia/eval/webapp/node_modules/dunder-proto/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/dunder-proto/.nycrc +13 -0
- gaia/eval/webapp/node_modules/dunder-proto/CHANGELOG.md +24 -0
- gaia/eval/webapp/node_modules/dunder-proto/LICENSE +21 -0
- gaia/eval/webapp/node_modules/dunder-proto/README.md +54 -0
- gaia/eval/webapp/node_modules/dunder-proto/get.d.ts +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/get.js +30 -0
- gaia/eval/webapp/node_modules/dunder-proto/package.json +76 -0
- gaia/eval/webapp/node_modules/dunder-proto/set.d.ts +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/set.js +35 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/get.js +34 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/index.js +4 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/set.js +50 -0
- gaia/eval/webapp/node_modules/dunder-proto/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/ee-first/LICENSE +22 -0
- gaia/eval/webapp/node_modules/ee-first/README.md +80 -0
- gaia/eval/webapp/node_modules/ee-first/index.js +95 -0
- gaia/eval/webapp/node_modules/ee-first/package.json +29 -0
- gaia/eval/webapp/node_modules/encodeurl/LICENSE +22 -0
- gaia/eval/webapp/node_modules/encodeurl/README.md +109 -0
- gaia/eval/webapp/node_modules/encodeurl/index.js +60 -0
- gaia/eval/webapp/node_modules/encodeurl/package.json +40 -0
- gaia/eval/webapp/node_modules/es-define-property/.eslintrc +13 -0
- gaia/eval/webapp/node_modules/es-define-property/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-define-property/.nycrc +9 -0
- gaia/eval/webapp/node_modules/es-define-property/CHANGELOG.md +29 -0
- gaia/eval/webapp/node_modules/es-define-property/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-define-property/README.md +49 -0
- gaia/eval/webapp/node_modules/es-define-property/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-define-property/index.js +14 -0
- gaia/eval/webapp/node_modules/es-define-property/package.json +81 -0
- gaia/eval/webapp/node_modules/es-define-property/test/index.js +56 -0
- gaia/eval/webapp/node_modules/es-define-property/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/es-errors/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/es-errors/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-errors/CHANGELOG.md +40 -0
- gaia/eval/webapp/node_modules/es-errors/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-errors/README.md +55 -0
- gaia/eval/webapp/node_modules/es-errors/eval.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/eval.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/index.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/package.json +80 -0
- gaia/eval/webapp/node_modules/es-errors/range.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/range.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/ref.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/ref.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/syntax.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/syntax.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/test/index.js +19 -0
- gaia/eval/webapp/node_modules/es-errors/tsconfig.json +49 -0
- gaia/eval/webapp/node_modules/es-errors/type.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/type.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/uri.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/uri.js +4 -0
- gaia/eval/webapp/node_modules/es-object-atoms/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/es-object-atoms/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-object-atoms/CHANGELOG.md +37 -0
- gaia/eval/webapp/node_modules/es-object-atoms/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-object-atoms/README.md +63 -0
- gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.js +11 -0
- gaia/eval/webapp/node_modules/es-object-atoms/ToObject.d.ts +7 -0
- gaia/eval/webapp/node_modules/es-object-atoms/ToObject.js +10 -0
- gaia/eval/webapp/node_modules/es-object-atoms/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/index.js +4 -0
- gaia/eval/webapp/node_modules/es-object-atoms/isObject.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/isObject.js +6 -0
- gaia/eval/webapp/node_modules/es-object-atoms/package.json +80 -0
- gaia/eval/webapp/node_modules/es-object-atoms/test/index.js +38 -0
- gaia/eval/webapp/node_modules/es-object-atoms/tsconfig.json +6 -0
- gaia/eval/webapp/node_modules/escape-html/LICENSE +24 -0
- gaia/eval/webapp/node_modules/escape-html/Readme.md +43 -0
- gaia/eval/webapp/node_modules/escape-html/index.js +78 -0
- gaia/eval/webapp/node_modules/escape-html/package.json +24 -0
- gaia/eval/webapp/node_modules/etag/HISTORY.md +83 -0
- gaia/eval/webapp/node_modules/etag/LICENSE +22 -0
- gaia/eval/webapp/node_modules/etag/README.md +159 -0
- gaia/eval/webapp/node_modules/etag/index.js +131 -0
- gaia/eval/webapp/node_modules/etag/package.json +47 -0
- gaia/eval/webapp/node_modules/express/History.md +3656 -0
- gaia/eval/webapp/node_modules/express/LICENSE +24 -0
- gaia/eval/webapp/node_modules/express/Readme.md +260 -0
- gaia/eval/webapp/node_modules/express/index.js +11 -0
- gaia/eval/webapp/node_modules/express/lib/application.js +661 -0
- gaia/eval/webapp/node_modules/express/lib/express.js +116 -0
- gaia/eval/webapp/node_modules/express/lib/middleware/init.js +43 -0
- gaia/eval/webapp/node_modules/express/lib/middleware/query.js +47 -0
- gaia/eval/webapp/node_modules/express/lib/request.js +525 -0
- gaia/eval/webapp/node_modules/express/lib/response.js +1179 -0
- gaia/eval/webapp/node_modules/express/lib/router/index.js +673 -0
- gaia/eval/webapp/node_modules/express/lib/router/layer.js +181 -0
- gaia/eval/webapp/node_modules/express/lib/router/route.js +230 -0
- gaia/eval/webapp/node_modules/express/lib/utils.js +303 -0
- gaia/eval/webapp/node_modules/express/lib/view.js +182 -0
- gaia/eval/webapp/node_modules/express/package.json +102 -0
- gaia/eval/webapp/node_modules/finalhandler/HISTORY.md +210 -0
- gaia/eval/webapp/node_modules/finalhandler/LICENSE +22 -0
- gaia/eval/webapp/node_modules/finalhandler/README.md +147 -0
- gaia/eval/webapp/node_modules/finalhandler/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/finalhandler/index.js +341 -0
- gaia/eval/webapp/node_modules/finalhandler/package.json +47 -0
- gaia/eval/webapp/node_modules/forwarded/HISTORY.md +21 -0
- gaia/eval/webapp/node_modules/forwarded/LICENSE +22 -0
- gaia/eval/webapp/node_modules/forwarded/README.md +57 -0
- gaia/eval/webapp/node_modules/forwarded/index.js +90 -0
- gaia/eval/webapp/node_modules/forwarded/package.json +45 -0
- gaia/eval/webapp/node_modules/fresh/HISTORY.md +70 -0
- gaia/eval/webapp/node_modules/fresh/LICENSE +23 -0
- gaia/eval/webapp/node_modules/fresh/README.md +119 -0
- gaia/eval/webapp/node_modules/fresh/index.js +137 -0
- gaia/eval/webapp/node_modules/fresh/package.json +46 -0
- gaia/eval/webapp/node_modules/fs/README.md +9 -0
- gaia/eval/webapp/node_modules/fs/package.json +20 -0
- gaia/eval/webapp/node_modules/function-bind/.eslintrc +21 -0
- gaia/eval/webapp/node_modules/function-bind/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/function-bind/.github/SECURITY.md +3 -0
- gaia/eval/webapp/node_modules/function-bind/.nycrc +13 -0
- gaia/eval/webapp/node_modules/function-bind/CHANGELOG.md +136 -0
- gaia/eval/webapp/node_modules/function-bind/LICENSE +20 -0
- gaia/eval/webapp/node_modules/function-bind/README.md +46 -0
- gaia/eval/webapp/node_modules/function-bind/implementation.js +84 -0
- gaia/eval/webapp/node_modules/function-bind/index.js +5 -0
- gaia/eval/webapp/node_modules/function-bind/package.json +87 -0
- gaia/eval/webapp/node_modules/function-bind/test/.eslintrc +9 -0
- gaia/eval/webapp/node_modules/function-bind/test/index.js +252 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.eslintrc +42 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.nycrc +9 -0
- gaia/eval/webapp/node_modules/get-intrinsic/CHANGELOG.md +186 -0
- gaia/eval/webapp/node_modules/get-intrinsic/LICENSE +21 -0
- gaia/eval/webapp/node_modules/get-intrinsic/README.md +71 -0
- gaia/eval/webapp/node_modules/get-intrinsic/index.js +378 -0
- gaia/eval/webapp/node_modules/get-intrinsic/package.json +97 -0
- gaia/eval/webapp/node_modules/get-intrinsic/test/GetIntrinsic.js +274 -0
- gaia/eval/webapp/node_modules/get-proto/.eslintrc +10 -0
- gaia/eval/webapp/node_modules/get-proto/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/get-proto/.nycrc +9 -0
- gaia/eval/webapp/node_modules/get-proto/CHANGELOG.md +21 -0
- gaia/eval/webapp/node_modules/get-proto/LICENSE +21 -0
- gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.d.ts +5 -0
- gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.js +6 -0
- gaia/eval/webapp/node_modules/get-proto/README.md +50 -0
- gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.d.ts +3 -0
- gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.js +4 -0
- gaia/eval/webapp/node_modules/get-proto/index.d.ts +5 -0
- gaia/eval/webapp/node_modules/get-proto/index.js +27 -0
- gaia/eval/webapp/node_modules/get-proto/package.json +81 -0
- gaia/eval/webapp/node_modules/get-proto/test/index.js +68 -0
- gaia/eval/webapp/node_modules/get-proto/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/gopd/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/gopd/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/gopd/CHANGELOG.md +45 -0
- gaia/eval/webapp/node_modules/gopd/LICENSE +21 -0
- gaia/eval/webapp/node_modules/gopd/README.md +40 -0
- gaia/eval/webapp/node_modules/gopd/gOPD.d.ts +1 -0
- gaia/eval/webapp/node_modules/gopd/gOPD.js +4 -0
- gaia/eval/webapp/node_modules/gopd/index.d.ts +5 -0
- gaia/eval/webapp/node_modules/gopd/index.js +15 -0
- gaia/eval/webapp/node_modules/gopd/package.json +77 -0
- gaia/eval/webapp/node_modules/gopd/test/index.js +36 -0
- gaia/eval/webapp/node_modules/gopd/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/has-symbols/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/has-symbols/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/has-symbols/.nycrc +9 -0
- gaia/eval/webapp/node_modules/has-symbols/CHANGELOG.md +91 -0
- gaia/eval/webapp/node_modules/has-symbols/LICENSE +21 -0
- gaia/eval/webapp/node_modules/has-symbols/README.md +46 -0
- gaia/eval/webapp/node_modules/has-symbols/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/has-symbols/index.js +14 -0
- gaia/eval/webapp/node_modules/has-symbols/package.json +111 -0
- gaia/eval/webapp/node_modules/has-symbols/shams.d.ts +3 -0
- gaia/eval/webapp/node_modules/has-symbols/shams.js +45 -0
- gaia/eval/webapp/node_modules/has-symbols/test/index.js +22 -0
- gaia/eval/webapp/node_modules/has-symbols/test/shams/core-js.js +29 -0
- gaia/eval/webapp/node_modules/has-symbols/test/shams/get-own-property-symbols.js +29 -0
- gaia/eval/webapp/node_modules/has-symbols/test/tests.js +58 -0
- gaia/eval/webapp/node_modules/has-symbols/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/hasown/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/hasown/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/hasown/.nycrc +13 -0
- gaia/eval/webapp/node_modules/hasown/CHANGELOG.md +40 -0
- gaia/eval/webapp/node_modules/hasown/LICENSE +21 -0
- gaia/eval/webapp/node_modules/hasown/README.md +40 -0
- gaia/eval/webapp/node_modules/hasown/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/hasown/index.js +8 -0
- gaia/eval/webapp/node_modules/hasown/package.json +92 -0
- gaia/eval/webapp/node_modules/hasown/tsconfig.json +6 -0
- gaia/eval/webapp/node_modules/http-errors/HISTORY.md +180 -0
- gaia/eval/webapp/node_modules/http-errors/LICENSE +23 -0
- gaia/eval/webapp/node_modules/http-errors/README.md +169 -0
- gaia/eval/webapp/node_modules/http-errors/index.js +289 -0
- gaia/eval/webapp/node_modules/http-errors/package.json +50 -0
- gaia/eval/webapp/node_modules/iconv-lite/Changelog.md +162 -0
- gaia/eval/webapp/node_modules/iconv-lite/LICENSE +21 -0
- gaia/eval/webapp/node_modules/iconv-lite/README.md +156 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-codec.js +555 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-data.js +176 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/index.js +22 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/internal.js +188 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-codec.js +72 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data-generated.js +451 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data.js +174 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/big5-added.json +122 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp936.json +264 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp949.json +273 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp950.json +177 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/eucjp.json +182 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gb18030-ranges.json +1 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gbk-added.json +55 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/shiftjis.json +125 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/utf16.js +177 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/utf7.js +290 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/bom-handling.js +52 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/extend-node.js +217 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/index.d.ts +24 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/index.js +153 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/streams.js +121 -0
- gaia/eval/webapp/node_modules/iconv-lite/package.json +46 -0
- gaia/eval/webapp/node_modules/inherits/LICENSE +16 -0
- gaia/eval/webapp/node_modules/inherits/README.md +42 -0
- gaia/eval/webapp/node_modules/inherits/inherits.js +9 -0
- gaia/eval/webapp/node_modules/inherits/inherits_browser.js +27 -0
- gaia/eval/webapp/node_modules/inherits/package.json +29 -0
- gaia/eval/webapp/node_modules/ipaddr.js/LICENSE +19 -0
- gaia/eval/webapp/node_modules/ipaddr.js/README.md +233 -0
- gaia/eval/webapp/node_modules/ipaddr.js/ipaddr.min.js +1 -0
- gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js +673 -0
- gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js.d.ts +68 -0
- gaia/eval/webapp/node_modules/ipaddr.js/package.json +35 -0
- gaia/eval/webapp/node_modules/math-intrinsics/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/math-intrinsics/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/math-intrinsics/CHANGELOG.md +24 -0
- gaia/eval/webapp/node_modules/math-intrinsics/LICENSE +21 -0
- gaia/eval/webapp/node_modules/math-intrinsics/README.md +50 -0
- gaia/eval/webapp/node_modules/math-intrinsics/abs.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/abs.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.js +5 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.js +5 -0
- gaia/eval/webapp/node_modules/math-intrinsics/floor.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/floor.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isFinite.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isFinite.js +12 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isInteger.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isInteger.js +16 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNaN.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNaN.js +6 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.js +6 -0
- gaia/eval/webapp/node_modules/math-intrinsics/max.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/max.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/min.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/min.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/mod.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/mod.js +9 -0
- gaia/eval/webapp/node_modules/math-intrinsics/package.json +86 -0
- gaia/eval/webapp/node_modules/math-intrinsics/pow.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/pow.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/round.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/round.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/sign.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/sign.js +11 -0
- gaia/eval/webapp/node_modules/math-intrinsics/test/index.js +192 -0
- gaia/eval/webapp/node_modules/math-intrinsics/tsconfig.json +3 -0
- gaia/eval/webapp/node_modules/media-typer/HISTORY.md +22 -0
- gaia/eval/webapp/node_modules/media-typer/LICENSE +22 -0
- gaia/eval/webapp/node_modules/media-typer/README.md +81 -0
- gaia/eval/webapp/node_modules/media-typer/index.js +270 -0
- gaia/eval/webapp/node_modules/media-typer/package.json +26 -0
- gaia/eval/webapp/node_modules/merge-descriptors/HISTORY.md +21 -0
- gaia/eval/webapp/node_modules/merge-descriptors/LICENSE +23 -0
- gaia/eval/webapp/node_modules/merge-descriptors/README.md +49 -0
- gaia/eval/webapp/node_modules/merge-descriptors/index.js +60 -0
- gaia/eval/webapp/node_modules/merge-descriptors/package.json +39 -0
- gaia/eval/webapp/node_modules/methods/HISTORY.md +29 -0
- gaia/eval/webapp/node_modules/methods/LICENSE +24 -0
- gaia/eval/webapp/node_modules/methods/README.md +51 -0
- gaia/eval/webapp/node_modules/methods/index.js +69 -0
- gaia/eval/webapp/node_modules/methods/package.json +36 -0
- gaia/eval/webapp/node_modules/mime/.npmignore +0 -0
- gaia/eval/webapp/node_modules/mime/CHANGELOG.md +164 -0
- gaia/eval/webapp/node_modules/mime/LICENSE +21 -0
- gaia/eval/webapp/node_modules/mime/README.md +90 -0
- gaia/eval/webapp/node_modules/mime/cli.js +8 -0
- gaia/eval/webapp/node_modules/mime/mime.js +108 -0
- gaia/eval/webapp/node_modules/mime/package.json +44 -0
- gaia/eval/webapp/node_modules/mime/src/build.js +53 -0
- gaia/eval/webapp/node_modules/mime/src/test.js +60 -0
- gaia/eval/webapp/node_modules/mime/types.json +1 -0
- gaia/eval/webapp/node_modules/mime-db/HISTORY.md +507 -0
- gaia/eval/webapp/node_modules/mime-db/LICENSE +23 -0
- gaia/eval/webapp/node_modules/mime-db/README.md +100 -0
- gaia/eval/webapp/node_modules/mime-db/db.json +8519 -0
- gaia/eval/webapp/node_modules/mime-db/index.js +12 -0
- gaia/eval/webapp/node_modules/mime-db/package.json +60 -0
- gaia/eval/webapp/node_modules/mime-types/HISTORY.md +397 -0
- gaia/eval/webapp/node_modules/mime-types/LICENSE +23 -0
- gaia/eval/webapp/node_modules/mime-types/README.md +113 -0
- gaia/eval/webapp/node_modules/mime-types/index.js +188 -0
- gaia/eval/webapp/node_modules/mime-types/package.json +44 -0
- gaia/eval/webapp/node_modules/ms/index.js +152 -0
- gaia/eval/webapp/node_modules/ms/license.md +21 -0
- gaia/eval/webapp/node_modules/ms/package.json +37 -0
- gaia/eval/webapp/node_modules/ms/readme.md +51 -0
- gaia/eval/webapp/node_modules/negotiator/HISTORY.md +108 -0
- gaia/eval/webapp/node_modules/negotiator/LICENSE +24 -0
- gaia/eval/webapp/node_modules/negotiator/README.md +203 -0
- gaia/eval/webapp/node_modules/negotiator/index.js +82 -0
- gaia/eval/webapp/node_modules/negotiator/lib/charset.js +169 -0
- gaia/eval/webapp/node_modules/negotiator/lib/encoding.js +184 -0
- gaia/eval/webapp/node_modules/negotiator/lib/language.js +179 -0
- gaia/eval/webapp/node_modules/negotiator/lib/mediaType.js +294 -0
- gaia/eval/webapp/node_modules/negotiator/package.json +42 -0
- gaia/eval/webapp/node_modules/object-inspect/.eslintrc +53 -0
- gaia/eval/webapp/node_modules/object-inspect/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/object-inspect/.nycrc +13 -0
- gaia/eval/webapp/node_modules/object-inspect/CHANGELOG.md +424 -0
- gaia/eval/webapp/node_modules/object-inspect/LICENSE +21 -0
- gaia/eval/webapp/node_modules/object-inspect/example/all.js +23 -0
- gaia/eval/webapp/node_modules/object-inspect/example/circular.js +6 -0
- gaia/eval/webapp/node_modules/object-inspect/example/fn.js +5 -0
- gaia/eval/webapp/node_modules/object-inspect/example/inspect.js +10 -0
- gaia/eval/webapp/node_modules/object-inspect/index.js +544 -0
- gaia/eval/webapp/node_modules/object-inspect/package-support.json +20 -0
- gaia/eval/webapp/node_modules/object-inspect/package.json +105 -0
- gaia/eval/webapp/node_modules/object-inspect/readme.markdown +84 -0
- gaia/eval/webapp/node_modules/object-inspect/test/bigint.js +58 -0
- gaia/eval/webapp/node_modules/object-inspect/test/browser/dom.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/circular.js +16 -0
- gaia/eval/webapp/node_modules/object-inspect/test/deep.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/element.js +53 -0
- gaia/eval/webapp/node_modules/object-inspect/test/err.js +48 -0
- gaia/eval/webapp/node_modules/object-inspect/test/fakes.js +29 -0
- gaia/eval/webapp/node_modules/object-inspect/test/fn.js +76 -0
- gaia/eval/webapp/node_modules/object-inspect/test/global.js +17 -0
- gaia/eval/webapp/node_modules/object-inspect/test/has.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/holes.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/indent-option.js +271 -0
- gaia/eval/webapp/node_modules/object-inspect/test/inspect.js +139 -0
- gaia/eval/webapp/node_modules/object-inspect/test/lowbyte.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/number.js +58 -0
- gaia/eval/webapp/node_modules/object-inspect/test/quoteStyle.js +26 -0
- gaia/eval/webapp/node_modules/object-inspect/test/toStringTag.js +40 -0
- gaia/eval/webapp/node_modules/object-inspect/test/undef.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/values.js +261 -0
- gaia/eval/webapp/node_modules/object-inspect/test-core-js.js +26 -0
- gaia/eval/webapp/node_modules/object-inspect/util.inspect.js +1 -0
- gaia/eval/webapp/node_modules/on-finished/HISTORY.md +98 -0
- gaia/eval/webapp/node_modules/on-finished/LICENSE +23 -0
- gaia/eval/webapp/node_modules/on-finished/README.md +162 -0
- gaia/eval/webapp/node_modules/on-finished/index.js +234 -0
- gaia/eval/webapp/node_modules/on-finished/package.json +39 -0
- gaia/eval/webapp/node_modules/parseurl/HISTORY.md +58 -0
- gaia/eval/webapp/node_modules/parseurl/LICENSE +24 -0
- gaia/eval/webapp/node_modules/parseurl/README.md +133 -0
- gaia/eval/webapp/node_modules/parseurl/index.js +158 -0
- gaia/eval/webapp/node_modules/parseurl/package.json +40 -0
- gaia/eval/webapp/node_modules/path/.npmignore +1 -0
- gaia/eval/webapp/node_modules/path/LICENSE +18 -0
- gaia/eval/webapp/node_modules/path/README.md +15 -0
- gaia/eval/webapp/node_modules/path/package.json +24 -0
- gaia/eval/webapp/node_modules/path/path.js +628 -0
- gaia/eval/webapp/node_modules/path-to-regexp/LICENSE +21 -0
- gaia/eval/webapp/node_modules/path-to-regexp/Readme.md +35 -0
- gaia/eval/webapp/node_modules/path-to-regexp/index.js +156 -0
- gaia/eval/webapp/node_modules/path-to-regexp/package.json +30 -0
- gaia/eval/webapp/node_modules/process/.eslintrc +21 -0
- gaia/eval/webapp/node_modules/process/LICENSE +22 -0
- gaia/eval/webapp/node_modules/process/README.md +26 -0
- gaia/eval/webapp/node_modules/process/browser.js +184 -0
- gaia/eval/webapp/node_modules/process/index.js +2 -0
- gaia/eval/webapp/node_modules/process/package.json +27 -0
- gaia/eval/webapp/node_modules/process/test.js +199 -0
- gaia/eval/webapp/node_modules/proxy-addr/HISTORY.md +161 -0
- gaia/eval/webapp/node_modules/proxy-addr/LICENSE +22 -0
- gaia/eval/webapp/node_modules/proxy-addr/README.md +139 -0
- gaia/eval/webapp/node_modules/proxy-addr/index.js +327 -0
- gaia/eval/webapp/node_modules/proxy-addr/package.json +47 -0
- gaia/eval/webapp/node_modules/qs/.editorconfig +46 -0
- gaia/eval/webapp/node_modules/qs/.eslintrc +38 -0
- gaia/eval/webapp/node_modules/qs/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/qs/.nycrc +13 -0
- gaia/eval/webapp/node_modules/qs/CHANGELOG.md +600 -0
- gaia/eval/webapp/node_modules/qs/LICENSE.md +29 -0
- gaia/eval/webapp/node_modules/qs/README.md +709 -0
- gaia/eval/webapp/node_modules/qs/dist/qs.js +90 -0
- gaia/eval/webapp/node_modules/qs/lib/formats.js +23 -0
- gaia/eval/webapp/node_modules/qs/lib/index.js +11 -0
- gaia/eval/webapp/node_modules/qs/lib/parse.js +296 -0
- gaia/eval/webapp/node_modules/qs/lib/stringify.js +351 -0
- gaia/eval/webapp/node_modules/qs/lib/utils.js +265 -0
- gaia/eval/webapp/node_modules/qs/package.json +91 -0
- gaia/eval/webapp/node_modules/qs/test/empty-keys-cases.js +267 -0
- gaia/eval/webapp/node_modules/qs/test/parse.js +1170 -0
- gaia/eval/webapp/node_modules/qs/test/stringify.js +1298 -0
- gaia/eval/webapp/node_modules/qs/test/utils.js +136 -0
- gaia/eval/webapp/node_modules/range-parser/HISTORY.md +56 -0
- gaia/eval/webapp/node_modules/range-parser/LICENSE +23 -0
- gaia/eval/webapp/node_modules/range-parser/README.md +84 -0
- gaia/eval/webapp/node_modules/range-parser/index.js +162 -0
- gaia/eval/webapp/node_modules/range-parser/package.json +44 -0
- gaia/eval/webapp/node_modules/raw-body/HISTORY.md +308 -0
- gaia/eval/webapp/node_modules/raw-body/LICENSE +22 -0
- gaia/eval/webapp/node_modules/raw-body/README.md +223 -0
- gaia/eval/webapp/node_modules/raw-body/SECURITY.md +24 -0
- gaia/eval/webapp/node_modules/raw-body/index.d.ts +87 -0
- gaia/eval/webapp/node_modules/raw-body/index.js +336 -0
- gaia/eval/webapp/node_modules/raw-body/package.json +49 -0
- gaia/eval/webapp/node_modules/safe-buffer/LICENSE +21 -0
- gaia/eval/webapp/node_modules/safe-buffer/README.md +584 -0
- gaia/eval/webapp/node_modules/safe-buffer/index.d.ts +187 -0
- gaia/eval/webapp/node_modules/safe-buffer/index.js +65 -0
- gaia/eval/webapp/node_modules/safe-buffer/package.json +51 -0
- gaia/eval/webapp/node_modules/safer-buffer/LICENSE +21 -0
- gaia/eval/webapp/node_modules/safer-buffer/Porting-Buffer.md +268 -0
- gaia/eval/webapp/node_modules/safer-buffer/Readme.md +156 -0
- gaia/eval/webapp/node_modules/safer-buffer/dangerous.js +58 -0
- gaia/eval/webapp/node_modules/safer-buffer/package.json +34 -0
- gaia/eval/webapp/node_modules/safer-buffer/safer.js +77 -0
- gaia/eval/webapp/node_modules/safer-buffer/tests.js +406 -0
- gaia/eval/webapp/node_modules/send/HISTORY.md +526 -0
- gaia/eval/webapp/node_modules/send/LICENSE +23 -0
- gaia/eval/webapp/node_modules/send/README.md +327 -0
- gaia/eval/webapp/node_modules/send/SECURITY.md +24 -0
- gaia/eval/webapp/node_modules/send/index.js +1142 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/HISTORY.md +14 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/LICENSE +22 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/README.md +128 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/index.js +60 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/package.json +40 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/index.js +162 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/license.md +21 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/package.json +38 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/readme.md +59 -0
- gaia/eval/webapp/node_modules/send/package.json +62 -0
- gaia/eval/webapp/node_modules/serve-static/HISTORY.md +487 -0
- gaia/eval/webapp/node_modules/serve-static/LICENSE +25 -0
- gaia/eval/webapp/node_modules/serve-static/README.md +257 -0
- gaia/eval/webapp/node_modules/serve-static/index.js +209 -0
- gaia/eval/webapp/node_modules/serve-static/package.json +42 -0
- gaia/eval/webapp/node_modules/setprototypeof/LICENSE +13 -0
- gaia/eval/webapp/node_modules/setprototypeof/README.md +31 -0
- gaia/eval/webapp/node_modules/setprototypeof/index.d.ts +2 -0
- gaia/eval/webapp/node_modules/setprototypeof/index.js +17 -0
- gaia/eval/webapp/node_modules/setprototypeof/package.json +38 -0
- gaia/eval/webapp/node_modules/setprototypeof/test/index.js +24 -0
- gaia/eval/webapp/node_modules/side-channel/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel/.eslintrc +12 -0
- gaia/eval/webapp/node_modules/side-channel/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel/CHANGELOG.md +110 -0
- gaia/eval/webapp/node_modules/side-channel/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel/README.md +61 -0
- gaia/eval/webapp/node_modules/side-channel/index.d.ts +14 -0
- gaia/eval/webapp/node_modules/side-channel/index.js +43 -0
- gaia/eval/webapp/node_modules/side-channel/package.json +85 -0
- gaia/eval/webapp/node_modules/side-channel/test/index.js +104 -0
- gaia/eval/webapp/node_modules/side-channel/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-list/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-list/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/side-channel-list/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-list/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-list/CHANGELOG.md +15 -0
- gaia/eval/webapp/node_modules/side-channel-list/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-list/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-list/index.d.ts +13 -0
- gaia/eval/webapp/node_modules/side-channel-list/index.js +113 -0
- gaia/eval/webapp/node_modules/side-channel-list/list.d.ts +14 -0
- gaia/eval/webapp/node_modules/side-channel-list/package.json +77 -0
- gaia/eval/webapp/node_modules/side-channel-list/test/index.js +104 -0
- gaia/eval/webapp/node_modules/side-channel-list/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-map/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-map/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/side-channel-map/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-map/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-map/CHANGELOG.md +22 -0
- gaia/eval/webapp/node_modules/side-channel-map/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-map/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-map/index.d.ts +15 -0
- gaia/eval/webapp/node_modules/side-channel-map/index.js +68 -0
- gaia/eval/webapp/node_modules/side-channel-map/package.json +80 -0
- gaia/eval/webapp/node_modules/side-channel-map/test/index.js +114 -0
- gaia/eval/webapp/node_modules/side-channel-map/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.eslintrc +12 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/CHANGELOG.md +28 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/index.d.ts +15 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/index.js +84 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/package.json +87 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/test/index.js +114 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/statuses/HISTORY.md +82 -0
- gaia/eval/webapp/node_modules/statuses/LICENSE +23 -0
- gaia/eval/webapp/node_modules/statuses/README.md +136 -0
- gaia/eval/webapp/node_modules/statuses/codes.json +65 -0
- gaia/eval/webapp/node_modules/statuses/index.js +146 -0
- gaia/eval/webapp/node_modules/statuses/package.json +49 -0
- gaia/eval/webapp/node_modules/toidentifier/HISTORY.md +9 -0
- gaia/eval/webapp/node_modules/toidentifier/LICENSE +21 -0
- gaia/eval/webapp/node_modules/toidentifier/README.md +61 -0
- gaia/eval/webapp/node_modules/toidentifier/index.js +32 -0
- gaia/eval/webapp/node_modules/toidentifier/package.json +38 -0
- gaia/eval/webapp/node_modules/type-is/HISTORY.md +259 -0
- gaia/eval/webapp/node_modules/type-is/LICENSE +23 -0
- gaia/eval/webapp/node_modules/type-is/README.md +170 -0
- gaia/eval/webapp/node_modules/type-is/index.js +266 -0
- gaia/eval/webapp/node_modules/type-is/package.json +45 -0
- gaia/eval/webapp/node_modules/unpipe/HISTORY.md +4 -0
- gaia/eval/webapp/node_modules/unpipe/LICENSE +22 -0
- gaia/eval/webapp/node_modules/unpipe/README.md +43 -0
- gaia/eval/webapp/node_modules/unpipe/index.js +69 -0
- gaia/eval/webapp/node_modules/unpipe/package.json +27 -0
- gaia/eval/webapp/node_modules/util/LICENSE +18 -0
- gaia/eval/webapp/node_modules/util/README.md +15 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/LICENSE +16 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/README.md +42 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits.js +7 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits_browser.js +23 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/package.json +29 -0
- gaia/eval/webapp/node_modules/util/package.json +35 -0
- gaia/eval/webapp/node_modules/util/support/isBuffer.js +3 -0
- gaia/eval/webapp/node_modules/util/support/isBufferBrowser.js +6 -0
- gaia/eval/webapp/node_modules/util/util.js +586 -0
- gaia/eval/webapp/node_modules/utils-merge/.npmignore +9 -0
- gaia/eval/webapp/node_modules/utils-merge/LICENSE +20 -0
- gaia/eval/webapp/node_modules/utils-merge/README.md +34 -0
- gaia/eval/webapp/node_modules/utils-merge/index.js +23 -0
- gaia/eval/webapp/node_modules/utils-merge/package.json +40 -0
- gaia/eval/webapp/node_modules/vary/HISTORY.md +39 -0
- gaia/eval/webapp/node_modules/vary/LICENSE +22 -0
- gaia/eval/webapp/node_modules/vary/README.md +101 -0
- gaia/eval/webapp/node_modules/vary/index.js +149 -0
- gaia/eval/webapp/node_modules/vary/package.json +43 -0
- gaia/eval/webapp/package-lock.json +875 -0
- gaia/eval/webapp/package.json +21 -0
- gaia/eval/webapp/public/app.js +3403 -0
- gaia/eval/webapp/public/index.html +88 -0
- gaia/eval/webapp/public/styles.css +3661 -0
- gaia/eval/webapp/server.js +416 -0
- gaia/eval/webapp/test-setup.js +73 -0
- gaia/llm/__init__.py +2 -0
- gaia/llm/lemonade_client.py +3083 -0
- gaia/llm/lemonade_manager.py +269 -0
- gaia/llm/llm_client.py +729 -0
- gaia/llm/vlm_client.py +307 -0
- gaia/logger.py +189 -0
- gaia/mcp/agent_mcp_server.py +245 -0
- gaia/mcp/blender_mcp_client.py +138 -0
- gaia/mcp/blender_mcp_server.py +648 -0
- gaia/mcp/context7_cache.py +332 -0
- gaia/mcp/external_services.py +518 -0
- gaia/mcp/mcp_bridge.py +550 -0
- gaia/mcp/servers/__init__.py +6 -0
- gaia/mcp/servers/docker_mcp.py +83 -0
- gaia/rag/__init__.py +10 -0
- gaia/rag/app.py +293 -0
- gaia/rag/demo.py +304 -0
- gaia/rag/pdf_utils.py +235 -0
- gaia/rag/sdk.py +2194 -0
- gaia/security.py +163 -0
- gaia/talk/app.py +289 -0
- gaia/talk/sdk.py +538 -0
- gaia/util.py +46 -0
- gaia/version.py +100 -0
gaia/rag/sdk.py
ADDED
|
@@ -0,0 +1,2194 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
GAIA RAG SDK - Simple PDF document retrieval and Q&A
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import os
|
|
11
|
+
import pickle
|
|
12
|
+
import re
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from pypdf import PdfReader
|
|
22
|
+
except ImportError:
|
|
23
|
+
try:
|
|
24
|
+
from PyPDF2 import PdfReader
|
|
25
|
+
except ImportError:
|
|
26
|
+
PdfReader = None
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
from sentence_transformers import SentenceTransformer
|
|
30
|
+
except ImportError:
|
|
31
|
+
SentenceTransformer = None
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
import faiss
|
|
35
|
+
except ImportError:
|
|
36
|
+
faiss = None
|
|
37
|
+
|
|
38
|
+
from gaia.chat.sdk import ChatConfig, ChatSDK
|
|
39
|
+
from gaia.logger import get_logger
|
|
40
|
+
from gaia.security import PathValidator
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
|
|
44
|
+
class RAGConfig:
|
|
45
|
+
"""Configuration for RAG SDK."""
|
|
46
|
+
|
|
47
|
+
model: str = "Qwen3-Coder-30B-A3B-Instruct-GGUF"
|
|
48
|
+
max_tokens: int = 1024
|
|
49
|
+
chunk_size: int = 500
|
|
50
|
+
chunk_overlap: int = 100 # Increased to 20% overlap for better context preservation
|
|
51
|
+
max_chunks: int = 5 # Increased to retrieve more context
|
|
52
|
+
embedding_model: str = (
|
|
53
|
+
"nomic-embed-text-v2-moe-GGUF" # Lemonade GGUF embedding model
|
|
54
|
+
)
|
|
55
|
+
cache_dir: str = ".gaia"
|
|
56
|
+
show_stats: bool = False
|
|
57
|
+
use_local_llm: bool = True
|
|
58
|
+
base_url: str = "http://localhost:8000/api/v1" # Lemonade server API URL
|
|
59
|
+
# Memory management settings
|
|
60
|
+
max_indexed_files: int = 100 # Maximum number of files to keep indexed
|
|
61
|
+
max_total_chunks: int = 10000 # Maximum total chunks across all files
|
|
62
|
+
enable_lru_eviction: bool = (
|
|
63
|
+
True # Enable automatic eviction of least recently used documents
|
|
64
|
+
)
|
|
65
|
+
# File size limits (prevent OOM)
|
|
66
|
+
max_file_size_mb: int = 100 # Maximum file size in MB (default: 100MB)
|
|
67
|
+
warn_file_size_mb: int = 50 # Warn if file exceeds this size (default: 50MB)
|
|
68
|
+
# LLM-based chunking
|
|
69
|
+
use_llm_chunking: bool = (
|
|
70
|
+
False # Enable LLM-based intelligent chunking (requires LLM client)
|
|
71
|
+
)
|
|
72
|
+
# VLM settings (enabled if available, errors out if model can't be loaded)
|
|
73
|
+
vlm_model: str = "Qwen2.5-VL-7B-Instruct-GGUF"
|
|
74
|
+
# Security settings
|
|
75
|
+
allowed_paths: Optional[List[str]] = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
|
|
79
|
+
class RAGResponse:
|
|
80
|
+
"""Response from RAG operations with enhanced metadata."""
|
|
81
|
+
|
|
82
|
+
text: str
|
|
83
|
+
chunks: Optional[List[str]] = None
|
|
84
|
+
chunk_scores: Optional[List[float]] = None
|
|
85
|
+
stats: Optional[Dict[str, Any]] = None
|
|
86
|
+
# Enhanced metadata
|
|
87
|
+
source_files: Optional[List[str]] = None # List of source files for each chunk
|
|
88
|
+
chunk_metadata: Optional[List[Dict[str, Any]]] = None # Detailed metadata per chunk
|
|
89
|
+
query_metadata: Optional[Dict[str, Any]] = None # Query-level metadata
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class RAGSDK:
|
|
93
|
+
"""
|
|
94
|
+
Simple RAG SDK for PDF document Q&A following GAIA patterns.
|
|
95
|
+
|
|
96
|
+
Example usage:
|
|
97
|
+
```python
|
|
98
|
+
from gaia.rag.sdk import RAGSDK, RAGConfig
|
|
99
|
+
|
|
100
|
+
# Initialize
|
|
101
|
+
config = RAGConfig(show_stats=True)
|
|
102
|
+
rag = RAGSDK(config)
|
|
103
|
+
|
|
104
|
+
# Index document
|
|
105
|
+
rag.index_document("document.pdf")
|
|
106
|
+
|
|
107
|
+
# Query
|
|
108
|
+
response = rag.query("What are the key features?")
|
|
109
|
+
print(response.text)
|
|
110
|
+
```
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
def __init__(self, config: Optional[RAGConfig] = None):
|
|
114
|
+
"""Initialize RAG SDK."""
|
|
115
|
+
self.config = config or RAGConfig()
|
|
116
|
+
self.log = get_logger(__name__)
|
|
117
|
+
|
|
118
|
+
# Check dependencies
|
|
119
|
+
self._check_dependencies()
|
|
120
|
+
|
|
121
|
+
# Initialize components
|
|
122
|
+
self.embedder = None
|
|
123
|
+
self.llm_client = None
|
|
124
|
+
self.use_lemonade_embeddings = False
|
|
125
|
+
self.index = None
|
|
126
|
+
self.chunks = []
|
|
127
|
+
self.indexed_files = set()
|
|
128
|
+
|
|
129
|
+
# Per-file indexing: maps file paths to their chunk indices
|
|
130
|
+
# This enables efficient per-file searches
|
|
131
|
+
self.file_to_chunk_indices = {} # {file_path: [chunk_idx1, chunk_idx2, ...]}
|
|
132
|
+
self.chunk_to_file = {} # {chunk_idx: file_path} for reverse lookup
|
|
133
|
+
|
|
134
|
+
# Per-file FAISS indices and embeddings (CACHED for performance)
|
|
135
|
+
self.file_indices = {} # {file_path: faiss.Index}
|
|
136
|
+
self.file_embeddings = {} # {file_path: numpy.array}
|
|
137
|
+
|
|
138
|
+
# Per-file metadata (for /dump command and stats)
|
|
139
|
+
self.file_metadata = (
|
|
140
|
+
{}
|
|
141
|
+
) # {file_path: {'full_text': str, 'num_pages': int, 'vlm_pages': int, ...}}
|
|
142
|
+
|
|
143
|
+
# LRU tracking for memory management
|
|
144
|
+
self.file_access_times = {} # {file_path: last_access_time}
|
|
145
|
+
self.file_index_times = {} # {file_path: index_time}
|
|
146
|
+
|
|
147
|
+
# Create cache directory
|
|
148
|
+
os.makedirs(self.config.cache_dir, exist_ok=True)
|
|
149
|
+
|
|
150
|
+
# Initialize chat SDK for LLM responses
|
|
151
|
+
chat_config = ChatConfig(
|
|
152
|
+
model=self.config.model,
|
|
153
|
+
max_tokens=self.config.max_tokens,
|
|
154
|
+
show_stats=self.config.show_stats,
|
|
155
|
+
use_local_llm=self.config.use_local_llm,
|
|
156
|
+
)
|
|
157
|
+
self.chat = ChatSDK(chat_config)
|
|
158
|
+
|
|
159
|
+
# Initialize path validator
|
|
160
|
+
self.path_validator = PathValidator(self.config.allowed_paths)
|
|
161
|
+
|
|
162
|
+
self.log.debug("RAG SDK initialized")
|
|
163
|
+
|
|
164
|
+
def _check_dependencies(self):
|
|
165
|
+
"""Check if required dependencies are available."""
|
|
166
|
+
missing = []
|
|
167
|
+
if PdfReader is None:
|
|
168
|
+
missing.append("pypdf (or PyPDF2)")
|
|
169
|
+
if SentenceTransformer is None:
|
|
170
|
+
missing.append("sentence-transformers")
|
|
171
|
+
if faiss is None:
|
|
172
|
+
missing.append("faiss-cpu")
|
|
173
|
+
|
|
174
|
+
if missing:
|
|
175
|
+
error_msg = (
|
|
176
|
+
f"\n❌ Error: Missing required RAG dependencies: {', '.join(missing)}\n\n"
|
|
177
|
+
f"Please install the RAG dependencies:\n"
|
|
178
|
+
f" pip install -e .[rag]\n\n"
|
|
179
|
+
f"Or install packages directly:\n"
|
|
180
|
+
f" pip install {' '.join(missing)}\n"
|
|
181
|
+
)
|
|
182
|
+
raise ImportError(error_msg)
|
|
183
|
+
|
|
    def _safe_open(self, file_path: str, mode="rb"):
        """
        Safely open file with path validation and O_NOFOLLOW to prevent symlink attacks.

        Args:
            file_path: Path to file
            mode: Open mode ('rb', 'r', 'w', 'wb', etc.)

        Returns:
            File handle

        Raises:
            PermissionError: If file is outside allowed paths or is a symlink
            IOError: If file cannot be opened
        """
        # Security check: Validate path against allowed directories
        if not self.path_validator.is_path_allowed(file_path):
            raise PermissionError(f"Access denied: {file_path} is not in allowed paths")

        import stat

        # Determine flags based on mode
        if "r" in mode and "+" not in mode:
            flags = os.O_RDONLY
        elif "w" in mode:
            flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        elif "a" in mode:
            flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
        else:
            flags = os.O_RDONLY

        # CRITICAL: Add O_NOFOLLOW to reject symlinks
        # This prevents TOCTOU attacks where symlinks are swapped
        if hasattr(os, "O_NOFOLLOW"):
            flags |= os.O_NOFOLLOW

        try:
            # Open file descriptor with O_NOFOLLOW
            fd = os.open(str(file_path), flags)
        except OSError as e:
            if e.errno == 40:  # ELOOP - too many symbolic links
                raise PermissionError(f"Symlinks not allowed: {file_path}")
            raise IOError(f"Cannot open file {file_path}: {e}")

        # Verify it's a regular file (not directory or special file)
        try:
            file_stat = os.fstat(fd)
            if not stat.S_ISREG(file_stat.st_mode):
                # Raise without closing here; the except handler below closes fd once
                raise PermissionError(f"Not a regular file: {file_path}")

            # Convert to file object with appropriate mode
            mode_str = "rb" if "b" in mode else "r"
            if "w" in mode:
                mode_str = "wb" if "b" in mode else "w"
            elif "a" in mode:
                mode_str = "ab" if "b" in mode else "a"

            return os.fdopen(fd, mode_str)

        except Exception as _e:
            os.close(fd)
            raise

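    # --- Illustrative note (not part of the packaged file) ----------------------
    # With O_NOFOLLOW set, os.open() fails with ELOOP when the final path
    # component is a symlink, so _safe_open() rejects it instead of following it.
    # Standalone sketch of the behavior it relies on:
    #
    #   os.symlink("/etc/passwd", "link.pdf")
    #   os.open("link.pdf", os.O_RDONLY | os.O_NOFOLLOW)   # raises OSError (ELOOP)
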
    def _get_cache_path(self, file_path: str) -> str:
        """
        Get cache file path for a document using content-based hashing.

        Uses SHA-256 hash of actual file content for cache key.
        This ensures proper cache invalidation even for:
        - Same-size file edits
        - Files modified within same second (low mtime resolution)
        - Content changes that preserve size

        Args:
            file_path: Path to the document

        Returns:
            Path to cache file
        """
        path = Path(file_path).absolute()

        try:
            # Hash the actual file CONTENT for reliable cache invalidation
            # This is more reliable than mtime + size
            hasher = hashlib.sha256()

            # Read file in chunks to handle large files efficiently
            # Use _safe_open to prevent symlink attacks
            with self._safe_open(path, "rb") as f:
                while chunk := f.read(8192):
                    hasher.update(chunk)

            content_hash = hasher.hexdigest()

            # Include path in hash to avoid collisions between identical files
            path_hash = hashlib.sha256(str(path).encode()).hexdigest()[:16]
            cache_key = f"{path_hash}_{content_hash[:32]}"

            return os.path.join(self.config.cache_dir, f"{cache_key}.pkl")

        except (OSError, IOError) as e:
            # If file doesn't exist or can't be read, use path-based key
            # This will fail later during indexing anyway
            self.log.warning(f"Cannot read file for cache key: {e}")
            file_hash = hashlib.sha256(str(path).encode()).hexdigest()
            return os.path.join(self.config.cache_dir, f"{file_hash}_notfound.pkl")

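    # --- Illustrative note (not part of the packaged file) ----------------------
    # The cache key combines a 16-char hash of the absolute path with the first
    # 32 chars of the content hash, e.g. (standalone sketch with made-up inputs):
    #
    #   import hashlib
    #   path_hash = hashlib.sha256(b"/docs/report.pdf").hexdigest()[:16]
    #   content_hash = hashlib.sha256(b"%PDF-1.7 ...").hexdigest()[:32]
    #   cache_file = f"{path_hash}_{content_hash}.pkl"   # stored under cache_dir
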
    def _load_embedder(self):
        """Load embedding model via Lemonade server for hardware acceleration.

        Forces a fresh load with --ubatch-size 2048 to prevent llama.cpp issues
        after VLM processing. Must unload first since Lemonade skips reload
        if model already loaded.
        """
        if self.embedder is None:
            self.log.info(
                f"Loading embedding model via Lemonade: {self.config.embedding_model}"
            )

            from gaia.llm.lemonade_client import LemonadeClient

            if not hasattr(self, "llm_client") or self.llm_client is None:
                self.llm_client = LemonadeClient()

            # Force fresh load - must unload first
            try:
                self.llm_client.unload_model()
            except Exception:
                pass  # Ignore if nothing to unload

            try:
                self.llm_client.load_model(
                    self.config.embedding_model,
                    llamacpp_args="--ubatch-size 2048",
                )
                self.log.info("Loaded embedding model with ubatch-size=2048")
            except Exception as e:
                self.log.warning(f"Could not pre-load embedding model: {e}")

            self.embedder = self.llm_client
            self.use_lemonade_embeddings = True

            self.log.info("Using Lemonade server for hardware-accelerated embeddings")

    def _encode_texts(
        self, texts: List[str], show_progress: bool = False
    ) -> "np.ndarray":
        """
        Encode texts to embeddings using Lemonade server with batching and timing.

        Args:
            texts: List of text strings to encode
            show_progress: Whether to show progress

        Returns:
            numpy array of embeddings with shape (num_texts, embedding_dim)
        """

        # Batch embedding requests to avoid timeouts
        BATCH_SIZE = 25  # Smaller batches for reliability (25 chunks ~= 12KB text)
        all_embeddings = []

        total_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE
        total_start = time.time()

        for batch_idx in range(0, len(texts), BATCH_SIZE):
            batch_texts = texts[batch_idx : batch_idx + BATCH_SIZE]
            batch_num = (batch_idx // BATCH_SIZE) + 1

            batch_start = time.time()

            if show_progress or self.config.show_stats:
                self.log.info(
                    f"  📦 Embedding batch {batch_num}/{total_batches} ({len(batch_texts)} chunks)..."
                )

            # Call Lemonade embeddings API for this batch with retry
            max_retries = 2
            for attempt in range(max_retries + 1):
                try:
                    # Use longer timeout for embedding batches (180s = 3 minutes per batch)
                    response = self.embedder.embeddings(
                        batch_texts, model=self.config.embedding_model, timeout=180
                    )
                    break  # Success, exit retry loop
                except Exception as e:
                    if attempt < max_retries:
                        self.log.warning(
                            f"  ⚠️ Batch {batch_num} attempt {attempt + 1} failed, retrying: {e}"
                        )
                        time.sleep(2)  # Wait before retry
                    else:
                        self.log.error(
                            f"  ❌ Batch {batch_num} failed after {max_retries + 1} attempts"
                        )
                        raise

            batch_duration = time.time() - batch_start

            if show_progress or self.config.show_stats:
                chunks_per_sec = (
                    len(batch_texts) / batch_duration if batch_duration > 0 else 0
                )
                self.log.info(
                    f"  ✅ Batch {batch_num} complete in {batch_duration:.2f}s ({chunks_per_sec:.1f} chunks/sec)"
                )

            # Extract embeddings from response
            # Expected format: {"data": [{"embedding": [...]}, ...]}
            for item in response.get("data", []):
                embedding = item.get("embedding", [])
                all_embeddings.append(embedding)

        total_duration = time.time() - total_start
        if len(texts) > BATCH_SIZE:
            overall_rate = len(texts) / total_duration if total_duration > 0 else 0
            self.log.info(
                f"  🎯 Total embedding time: {total_duration:.2f}s ({overall_rate:.1f} chunks/sec, {total_batches} batches)"
            )

        # Convert to numpy array
        return np.array(all_embeddings, dtype=np.float32)

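    # --- Illustrative note (not part of the packaged file) ----------------------
    # Batch count is a ceiling division, and each batch response is assumed to
    # follow an OpenAI-style embeddings shape. Sketch for 60 chunks:
    #
    #   total_batches = (60 + 25 - 1) // 25          # -> 3 batches (25, 25, 10)
    #   response = {"data": [{"embedding": [0.1, 0.2, ...]}, ...]}
    #   # all_embeddings collects one vector per input chunk, then np.array(...)
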
    def _get_file_type(self, file_path: str) -> str:
        """Detect file type from extension."""
        ext = Path(file_path).suffix.lower()
        return ext if ext else ".unknown"

    def _extract_text_from_pdf(self, pdf_path: str) -> tuple:
        """
        Extract text from PDF file with VLM for images (always enabled if available).

        Returns:
            (text, num_pages, metadata) tuple where metadata contains:
            - num_pages: int
            - vlm_pages: int (number of pages enhanced with VLM)
            - total_images: int (total images processed)
        """
        import time as time_module  # pylint: disable=reimported

        try:
            extract_start = time_module.time()
            reader = PdfReader(pdf_path)
            total_pages = len(reader.pages)
            self.log.info(f"📄 Extracting text from {total_pages} pages...")

            # Initialize VLM client (auto-enabled if available)
            vlm = None
            vlm_available = False
            try:
                from gaia.llm.vlm_client import VLMClient
                from gaia.rag.pdf_utils import (
                    count_images_in_page,
                    extract_images_from_page_pymupdf,
                )

                vlm = VLMClient(
                    vlm_model=self.config.vlm_model, base_url=self.config.base_url
                )
                vlm_available = vlm.check_availability()

                if vlm_available and self.config.show_stats:
                    print("  🔍 VLM enabled: Will extract text from images")
                elif not vlm_available and self.config.show_stats:
                    print("  ⚠️ VLM not available - images will not be processed")
                    print("  📥 To enable VLM image extraction:")
                    print("     1. Open Lemonade Model Manager (http://localhost:8000)")
                    print(f"     2. Download model: {self.config.vlm_model}")

            except Exception as vlm_error:
                if self.config.show_stats:
                    print(f"  ⚠️ VLM initialization failed: {vlm_error}")
                self.log.warning(f"VLM initialization failed: {vlm_error}")
                vlm_available = False

            if self.config.show_stats:
                print(f"\n{'='*60}")
                print("  📄 COMPUTE INTENSIVE: PDF Text Extraction")
                print(f"  📊 Total pages: {total_pages}")
                print(f"  ⏱️ Estimated time: {total_pages * 0.2:.1f} seconds")
                if vlm_available:
                    print("  🖼️ VLM: Enabled for image text extraction")
                else:
                    print("  🖼️ VLM: Disabled (text-only extraction)")
                print(f"{'='*60}")

            pages_data = []
            vlm_pages_count = 0
            total_images_processed = 0

            for i, page in enumerate(reader.pages, 1):
                page_start = time_module.time()

                # Step 1: Extract text with pypdf
                pypdf_text = page.extract_text()

                # Step 2: Check for images
                has_imgs = False
                num_imgs = 0
                if vlm_available:
                    try:
                        has_imgs, num_imgs = count_images_in_page(page)
                    except Exception:  # pylint: disable=broad-except
                        pass

                # Step 3: Extract from images if present
                image_texts = []
                if has_imgs and vlm_available:
                    try:
                        images = extract_images_from_page_pymupdf(pdf_path, page_num=i)
                        if images:
                            image_texts = vlm.extract_from_page_images(
                                images, page_num=i
                            )
                            if image_texts:
                                vlm_pages_count += 1
                                total_images_processed += len(image_texts)
                    except Exception as img_error:
                        self.log.warning(
                            f"Image extraction failed on page {i}: {img_error}"
                        )

                # Step 4: Merge
                merged_text = self._merge_page_texts(
                    pypdf_text, image_texts, page_num=i
                )

                pages_data.append(
                    {
                        "page": i,
                        "text": merged_text,
                        "has_images": has_imgs,
                        "num_images": num_imgs,
                        "vlm_used": len(image_texts) > 0,
                    }
                )

                page_duration = time_module.time() - page_start

                if self.config.show_stats:
                    # Update progress with timing info
                    progress_pct = (i / total_pages) * 100
                    avg_time_per_page = (time_module.time() - extract_start) / i
                    eta = avg_time_per_page * (total_pages - i)
                    vlm_indicator = " 🖼️" if len(image_texts) > 0 else ""
                    print(
                        f"  📄 Page {i}/{total_pages} ({progress_pct:.0f}%){vlm_indicator} | "
                        f"⏱️ {page_duration:.2f}s | ETA: {eta:.1f}s" + " " * 10,
                        end="\r",
                        flush=True,
                    )

            # Cleanup VLM
            if vlm_available and vlm:
                try:
                    vlm.cleanup()
                except Exception:  # pylint: disable=broad-except
                    pass

            extract_duration = time_module.time() - extract_start

            # Build full text
            full_text = "\n\n".join(
                [f"[Page {p['page']}]\n{p['text']}" for p in pages_data]
            )

            if self.config.show_stats:
                print(
                    f"\n  ✅ Extracted {len(full_text):,} characters from {total_pages} pages"
                )
                print(
                    f"  ⏱️ Total extraction time: {extract_duration:.2f}s ({total_pages/extract_duration:.1f} pages/sec)"
                )
                print(f"  💾 Text size: {len(full_text) / 1024:.1f} KB")
                if vlm_pages_count > 0:
                    print(
                        f"  🖼️ VLM enhanced: {vlm_pages_count} pages, {total_images_processed} images"
                    )
                print(f"{'='*60}\n")

            self.log.info(
                f"📝 Extracted {len(full_text):,} characters in {extract_duration:.2f}s (VLM: {vlm_pages_count} pages)"
            )

            # Build metadata
            metadata = {
                "num_pages": total_pages,
                "vlm_pages": vlm_pages_count,
                "total_images": total_images_processed,
                "vlm_checked": True,  # Indicates this cache was created with VLM capability check
                "vlm_available": vlm_available,  # Whether VLM was actually available
            }

            return full_text, total_pages, metadata
        except Exception as e:
            self.log.error(f"Error reading PDF {pdf_path}: {e}")
            raise

    def _merge_page_texts(
        self, pypdf_text: str, image_texts: list, page_num: int
    ) -> str:
        """
        Merge pypdf text + VLM image texts.

        Args:
            pypdf_text: Text extracted by pypdf
            image_texts: List of dicts from VLM extraction (each has 'image_num' and 'text')
            page_num: Page number for logging

        Returns:
            Merged text with image content clearly marked
        """
        parts = []

        # Add pypdf text first (if any)
        if pypdf_text.strip():
            parts.append(pypdf_text.strip())

        # Add VLM-extracted image content (if any)
        if image_texts:
            parts.append("\n\n---\n")
            parts.append(f"[Page {page_num}]\n**Content Extracted from Images:**\n")

            for img_data in image_texts:
                parts.append(
                    f"\n[Page {page_num}] ### 🖼️ IMAGE {img_data['image_num']}\n\n"
                )

                # Clean up the VLM text for better structure
                image_text = img_data["text"].strip()

                # Ensure proper line breaks for list items (general pattern)
                # Look for patterns like "- text" or "* text" or "1. text"
                image_text = re.sub(r"(?<!\n)([•\-\*]|\d+\.)\s+", r"\n\1 ", image_text)

                # Add double newline after what looks like a heading
                # (line ending with colon or short line followed by longer text)
                lines = image_text.split("\n")
                formatted_lines = []
                for i, line in enumerate(lines):
                    formatted_lines.append(line)
                    # Add extra newline after lines that look like headers
                    if line.strip().endswith(":") and i < len(lines) - 1:
                        formatted_lines.append("")

                image_text = "\n".join(formatted_lines)

                parts.append(image_text)
                parts.append("\n\n")

        return "\n".join(parts)

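    # --- Illustrative note (not part of the packaged file) ----------------------
    # For a page with one image, the merged text produced above looks roughly like:
    #
    #   <pypdf text of page 3>
    #
    #   ---
    #   [Page 3]
    #   **Content Extracted from Images:**
    #   [Page 3] ### 🖼️ IMAGE 1
    #   <VLM text, with list items and headings re-flowed onto their own lines>
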
    def _llm_based_chunking(
        self, text: str, chunk_size: int, overlap: int
    ) -> List[str]:
        """
        Use LLM to intelligently identify chunk boundaries.

        The LLM analyzes the text structure and suggests optimal split points
        that preserve semantic meaning and context.
        """
        self.log.info("🤖 Using LLM for intelligent text chunking...")

        chunks = []

        # Process text in segments (to handle long documents)
        # Approximate: 1 token ≈ 4 characters
        segment_size = chunk_size * 4 * 3  # Process 3 chunks worth at a time
        text_length = len(text)
        position = 0

        while position < text_length:
            # Get a segment to process
            segment_end = min(position + segment_size, text_length)
            segment = text[position:segment_end]

            # Ask LLM to identify good chunk boundaries
            # (f-string so chunk_size and the segment are interpolated into the prompt;
            #  only the first 2000 characters of the segment are embedded to limit prompt size)
            prompt = f"""You are a document chunking expert. Your task is to identify optimal points to split the following text into chunks.

The text should be split into chunks of approximately {chunk_size} tokens (roughly {chunk_size * 4} characters each).

IMPORTANT RULES:
1. Keep semantic units together (complete thoughts, paragraphs, sections)
2. Never split in the middle of sentences
3. Preserve context - each chunk should be understandable on its own
4. Keep related information together (e.g., a heading with its content)
5. For lists, try to keep the list introduction with at least some items

Text to chunk:
---
{segment[:2000]}
{"..." if len(segment) > 2000 else ""}
---

Please identify the CHARACTER POSITIONS where the text should be split.
Return ONLY a JSON array of split positions, like: [245, 502, 847]
These positions indicate where to split the text."""

            try:
                # Get LLM response
                response_data = self.llm_client.completions(
                    model=self.config.model,
                    prompt=prompt,
                    temperature=0.0,  # Low temperature for deterministic chunking
                    max_tokens=500,
                )
                response = response_data["choices"][0]["text"]

                # Parse the split positions
                import json

                split_positions = json.loads(response)

                # Create chunks based on LLM-suggested positions
                last_pos = 0
                for split_pos in split_positions:
                    if split_pos > last_pos and split_pos < len(segment):
                        chunk = segment[last_pos:split_pos].strip()
                        if chunk:
                            chunks.append(chunk)
                        last_pos = split_pos

                # Add remaining text
                if last_pos < len(segment):
                    chunk = segment[last_pos:].strip()
                    if chunk:
                        chunks.append(chunk)

            except Exception as e:
                self.log.warning(f"LLM chunking failed for segment: {e}")
                # Fall back to simple splitting for this segment
                segment_chunks = self._fallback_chunk_segment(segment, chunk_size)
                chunks.extend(segment_chunks)

            # Move to next segment with overlap
            position = segment_end - (overlap * 4)  # Convert overlap tokens to chars

        return chunks

    def _fallback_chunk_segment(self, text: str, chunk_size: int) -> List[str]:
        """Simple fallback chunking for a text segment."""
        chunks = []
        words = text.split()
        current_chunk = []
        current_size = 0

        for word in words:
            word_size = len(word) // 4  # Rough token estimate
            if current_size + word_size > chunk_size and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_size = word_size
            else:
                current_chunk.append(word)
                current_size += word_size

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

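    # --- Illustrative note (not part of the packaged file) ----------------------
    # Both chunkers use the rough heuristic 1 token ≈ 4 characters, so for example
    # chunk_size = 500 tokens targets ≈ 2000 characters per chunk, and the LLM
    # chunker's segment_size = 500 * 4 * 3 = 6000 characters per chunking call.
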
746
|
+
def _extract_text_from_text_file(self, file_path: str) -> str:
|
|
747
|
+
"""Extract text from text-based file (txt, md, etc.)."""
|
|
748
|
+
try:
|
|
749
|
+
encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
|
|
750
|
+
text = None
|
|
751
|
+
|
|
752
|
+
for encoding in encodings:
|
|
753
|
+
try:
|
|
754
|
+
# Use _safe_open with binary mode, then decode
|
|
755
|
+
with self._safe_open(file_path, "rb") as f:
|
|
756
|
+
text = f.read().decode(encoding)
|
|
757
|
+
break
|
|
758
|
+
except (UnicodeDecodeError, AttributeError):
|
|
759
|
+
continue
|
|
760
|
+
|
|
761
|
+
if text is None:
|
|
762
|
+
raise ValueError(
|
|
763
|
+
f"Failed to decode file: {file_path}\n"
|
|
764
|
+
f"Tried encodings: {', '.join(encodings)}\n"
|
|
765
|
+
"Suggestions:\n"
|
|
766
|
+
" 1. Convert the file to UTF-8 encoding\n"
|
|
767
|
+
" 2. Check if the file is corrupted\n"
|
|
768
|
+
" 3. Ensure the file is a text file (not binary)"
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
if self.config.show_stats:
|
|
772
|
+
print(f" ✅ Loaded text file ({len(text):,} characters)")
|
|
773
|
+
|
|
774
|
+
self.log.info(f"📝 Extracted {len(text):,} characters from text file")
|
|
775
|
+
return text.strip()
|
|
776
|
+
except Exception as e:
|
|
777
|
+
self.log.error(f"Error reading text file {file_path}: {e}")
|
|
778
|
+
raise
|
|
779
|
+
|
|
780
|
+
def _extract_text_from_csv(self, csv_path: str) -> str:
|
|
781
|
+
"""Extract text from CSV file."""
|
|
782
|
+
try:
|
|
783
|
+
import csv
|
|
784
|
+
|
|
785
|
+
text_parts = []
|
|
786
|
+
encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
|
|
787
|
+
|
|
788
|
+
for encoding in encodings:
|
|
789
|
+
try:
|
|
790
|
+
# Use _safe_open with binary mode, then decode for csv.reader
|
|
791
|
+
from io import StringIO
|
|
792
|
+
|
|
793
|
+
with self._safe_open(csv_path, "rb") as f:
|
|
794
|
+
text = f.read().decode(encoding)
|
|
795
|
+
reader = csv.reader(StringIO(text))
|
|
796
|
+
rows = list(reader)
|
|
797
|
+
|
|
798
|
+
if not rows:
|
|
799
|
+
raise ValueError("CSV file is empty")
|
|
800
|
+
|
|
801
|
+
# Include header as context
|
|
802
|
+
if rows:
|
|
803
|
+
header = rows[0]
|
|
804
|
+
text_parts.append(f"Columns: {', '.join(header)}\n")
|
|
805
|
+
|
|
806
|
+
# Convert rows to readable text
|
|
807
|
+
for row in rows[1:]:
|
|
808
|
+
# Create a readable row format
|
|
809
|
+
row_text = []
|
|
810
|
+
for i, cell in enumerate(row):
|
|
811
|
+
if i < len(header):
|
|
812
|
+
row_text.append(f"{header[i]}: {cell}")
|
|
813
|
+
else:
|
|
814
|
+
row_text.append(cell)
|
|
815
|
+
text_parts.append(" | ".join(row_text))
|
|
816
|
+
|
|
817
|
+
text = "\n".join(text_parts)
|
|
818
|
+
|
|
819
|
+
if self.config.show_stats:
|
|
820
|
+
print(
|
|
821
|
+
f" ✅ Loaded CSV file ({len(rows)} rows, {len(header)} columns)"
|
|
822
|
+
)
|
|
823
|
+
|
|
824
|
+
self.log.info(f"📊 Extracted {len(rows)} rows from CSV")
|
|
825
|
+
return text
|
|
826
|
+
except UnicodeDecodeError:
|
|
827
|
+
continue
|
|
828
|
+
|
|
829
|
+
raise ValueError(
|
|
830
|
+
f"Failed to decode CSV file: {csv_path}\n"
|
|
831
|
+
f"Tried encodings: {', '.join(encodings)}\n"
|
|
832
|
+
"Suggestions:\n"
|
|
833
|
+
" 1. Save the CSV file with UTF-8 encoding in Excel/LibreOffice\n"
|
|
834
|
+
" 2. Check if the file is a valid CSV (not corrupted)\n"
|
|
835
|
+
" 3. Try opening and re-saving in a text editor"
|
|
836
|
+
)
|
|
837
|
+
except Exception as e:
|
|
838
|
+
self.log.error(f"Error reading CSV {csv_path}: {e}")
|
|
839
|
+
raise
|
|
840
|
+
|
|
841
|
+
def _extract_text_from_json(self, json_path: str) -> str:
|
|
842
|
+
"""Extract text from JSON file."""
|
|
843
|
+
try:
|
|
844
|
+
import json
|
|
845
|
+
|
|
846
|
+
# Use _safe_open to prevent symlink attacks
|
|
847
|
+
with self._safe_open(json_path, "rb") as f:
|
|
848
|
+
data = json.load(f)
|
|
849
|
+
|
|
850
|
+
# Convert JSON to readable text format
|
|
851
|
+
def json_to_text(obj, indent=0):
|
|
852
|
+
"""Recursively convert JSON to readable text."""
|
|
853
|
+
lines = []
|
|
854
|
+
prefix = " " * indent
|
|
855
|
+
|
|
856
|
+
if isinstance(obj, dict):
|
|
857
|
+
for key, value in obj.items():
|
|
858
|
+
if isinstance(value, (dict, list)):
|
|
859
|
+
lines.append(f"{prefix}{key}:")
|
|
860
|
+
lines.extend(json_to_text(value, indent + 1))
|
|
861
|
+
else:
|
|
862
|
+
lines.append(f"{prefix}{key}: {value}")
|
|
863
|
+
elif isinstance(obj, list):
|
|
864
|
+
for i, item in enumerate(obj):
|
|
865
|
+
if isinstance(item, (dict, list)):
|
|
866
|
+
lines.append(f"{prefix}Item {i + 1}:")
|
|
867
|
+
lines.extend(json_to_text(item, indent + 1))
|
|
868
|
+
else:
|
|
869
|
+
lines.append(f"{prefix}- {item}")
|
|
870
|
+
else:
|
|
871
|
+
lines.append(f"{prefix}{obj}")
|
|
872
|
+
|
|
873
|
+
return lines
|
|
874
|
+
|
|
875
|
+
text = "\n".join(json_to_text(data))
|
|
876
|
+
|
|
877
|
+
if self.config.show_stats:
|
|
878
|
+
print(f" ✅ Loaded JSON file ({len(text):,} characters)")
|
|
879
|
+
|
|
880
|
+
self.log.info(f"📝 Extracted {len(text):,} characters from JSON")
|
|
881
|
+
return text
|
|
882
|
+
except Exception as e:
|
|
883
|
+
self.log.error(f"Error reading JSON {json_path}: {e}")
|
|
884
|
+
raise
|
|
885
|
+
|
|
886
|
+
def _extract_text_from_file(self, file_path: str) -> tuple:
|
|
887
|
+
"""
|
|
888
|
+
Extract text from file based on type.
|
|
889
|
+
|
|
890
|
+
Returns:
|
|
891
|
+
(text, metadata_dict) tuple where metadata_dict contains:
|
|
892
|
+
- num_pages: int (for PDFs) or None
|
|
893
|
+
- vlm_pages: int (for PDFs with VLM) or None
|
|
894
|
+
- total_images: int (for PDFs with VLM) or None
|
|
895
|
+
"""
|
|
896
|
+
file_type = self._get_file_type(file_path)
|
|
897
|
+
metadata = {"num_pages": None, "vlm_pages": None, "total_images": None}
|
|
898
|
+
|
|
899
|
+
# PDF files
|
|
900
|
+
if file_type == ".pdf":
|
|
901
|
+
text, num_pages, pdf_metadata = self._extract_text_from_pdf(file_path)
|
|
902
|
+
metadata["num_pages"] = num_pages
|
|
903
|
+
metadata["vlm_pages"] = pdf_metadata.get("vlm_pages", 0)
|
|
904
|
+
metadata["total_images"] = pdf_metadata.get("total_images", 0)
|
|
905
|
+
return text, metadata
|
|
906
|
+
|
|
907
|
+
# Text-based files
|
|
908
|
+
elif file_type in [".txt", ".md", ".markdown", ".rst", ".log"]:
|
|
909
|
+
return self._extract_text_from_text_file(file_path), metadata
|
|
910
|
+
|
|
911
|
+
# CSV files
|
|
912
|
+
elif file_type == ".csv":
|
|
913
|
+
return self._extract_text_from_csv(file_path), metadata
|
|
914
|
+
|
|
915
|
+
# JSON files
|
|
916
|
+
elif file_type == ".json":
|
|
917
|
+
return self._extract_text_from_json(file_path), metadata
|
|
918
|
+
|
|
919
|
+
# Code files (treat as text for Q&A purposes)
|
|
920
|
+
elif file_type in [
|
|
921
|
+
# Backend languages
|
|
922
|
+
".py",
|
|
923
|
+
".pyw", # Python
|
|
924
|
+
".java", # Java
|
|
925
|
+
".cpp",
|
|
926
|
+
".cc",
|
|
927
|
+
".cxx",
|
|
928
|
+
".hpp",
|
|
929
|
+
".h", # C++
|
|
930
|
+
".c", # C
|
|
931
|
+
".cs", # C#
|
|
932
|
+
".go", # Go
|
|
933
|
+
".rs", # Rust
|
|
934
|
+
".rb", # Ruby
|
|
935
|
+
".php", # PHP
|
|
936
|
+
".swift", # Swift
|
|
937
|
+
".kt",
|
|
938
|
+
".kts", # Kotlin
|
|
939
|
+
".scala", # Scala
|
|
940
|
+
# Web - JavaScript/TypeScript
|
|
941
|
+
".js",
|
|
942
|
+
".jsx", # JavaScript
|
|
943
|
+
".ts",
|
|
944
|
+
".tsx", # TypeScript
|
|
945
|
+
".mjs",
|
|
946
|
+
".cjs", # JavaScript modules
|
|
947
|
+
# Web - Frameworks
|
|
948
|
+
".vue", # Vue.js
|
|
949
|
+
".svelte", # Svelte
|
|
950
|
+
".astro", # Astro
|
|
951
|
+
# Web - Styling
|
|
952
|
+
".css", # CSS
|
|
953
|
+
".scss",
|
|
954
|
+
".sass", # Sass
|
|
955
|
+
".less", # Less
|
|
956
|
+
".styl",
|
|
957
|
+
".stylus", # Stylus
|
|
958
|
+
# Web - Markup
|
|
959
|
+
".html",
|
|
960
|
+
".htm", # HTML
|
|
961
|
+
".svg", # SVG
|
|
962
|
+
".jsx",
|
|
963
|
+
".tsx", # JSX/TSX (already listed but emphasizing)
|
|
964
|
+
# Scripting
|
|
965
|
+
".sh",
|
|
966
|
+
".bash", # Shell
|
|
967
|
+
".ps1", # PowerShell
|
|
968
|
+
".r",
|
|
969
|
+
".R", # R
|
|
970
|
+
# Database
|
|
971
|
+
".sql", # SQL
|
|
972
|
+
# Configuration
|
|
973
|
+
".yaml",
|
|
974
|
+
".yml", # YAML
|
|
975
|
+
".xml", # XML
|
|
976
|
+
".toml", # TOML
|
|
977
|
+
".ini",
|
|
978
|
+
".cfg",
|
|
979
|
+
".conf", # Config files
|
|
980
|
+
".env", # Environment files
|
|
981
|
+
".properties", # Properties files
|
|
982
|
+
# Build & Package
|
|
983
|
+
".gradle", # Gradle
|
|
984
|
+
".cmake", # CMake
|
|
985
|
+
".mk",
|
|
986
|
+
".make", # Makefiles
|
|
987
|
+
# Documentation
|
|
988
|
+
".rst", # ReStructuredText
|
|
989
|
+
]:
|
|
990
|
+
self.log.info(f"Indexing code/web file: {file_type}")
|
|
991
|
+
return self._extract_text_from_text_file(file_path), metadata
|
|
992
|
+
|
|
993
|
+
# Unknown file type - try as text
|
|
994
|
+
else:
|
|
995
|
+
self.log.warning(
|
|
996
|
+
f"Unknown file type {file_type}, attempting to read as text"
|
|
997
|
+
)
|
|
998
|
+
return self._extract_text_from_text_file(file_path), metadata
|
|
999
|
+
|
|
1000
|
+
def _split_text_into_chunks(self, text: str) -> List[str]:
|
|
1001
|
+
"""
|
|
1002
|
+
Split text into semantic chunks using LLM intelligence when available.
|
|
1003
|
+
|
|
1004
|
+
Uses intelligent splitting that:
|
|
1005
|
+
- Leverages LLM to identify natural semantic boundaries (if available)
|
|
1006
|
+
- Falls back to structural heuristics if LLM is not available
|
|
1007
|
+
- Respects natural document boundaries (paragraphs, sections)
|
|
1008
|
+
- Keeps semantic units together
|
|
1009
|
+
- Maintains context with overlap
|
|
1010
|
+
|
|
1011
|
+
This dramatically improves Q&A quality over naive word splitting.
|
|
1012
|
+
"""
|
|
1013
|
+
self.log.info("📝 Splitting text into semantic chunks...")
|
|
1014
|
+
|
|
1015
|
+
chunks = []
|
|
1016
|
+
chunk_size_tokens = self.config.chunk_size
|
|
1017
|
+
overlap_tokens = self.config.chunk_overlap
|
|
1018
|
+
|
|
1019
|
+
# Try to use LLM for intelligent chunking if available
|
|
1020
|
+
if self.config.use_llm_chunking:
|
|
1021
|
+
# Ensure LLM client is initialized for chunking
|
|
1022
|
+
if self.llm_client is None:
|
|
1023
|
+
try:
|
|
1024
|
+
from gaia.llm.lemonade_client import LemonadeClient
|
|
1025
|
+
|
|
1026
|
+
self.llm_client = LemonadeClient()
|
|
1027
|
+
self.log.info("✅ Initialized LLM client for intelligent chunking")
|
|
1028
|
+
except Exception as e:
|
|
1029
|
+
self.log.warning(
|
|
1030
|
+
f"Failed to initialize LLM client for chunking: {e}"
|
|
1031
|
+
)
|
|
1032
|
+
|
|
1033
|
+
if self.llm_client is not None:
|
|
1034
|
+
try:
|
|
1035
|
+
return self._llm_based_chunking(
|
|
1036
|
+
text, chunk_size_tokens, overlap_tokens
|
|
1037
|
+
)
|
|
1038
|
+
except Exception as e:
|
|
1039
|
+
self.log.warning(
|
|
1040
|
+
f"LLM chunking failed, falling back to heuristic: {e}"
|
|
1041
|
+
)
|
|
1042
|
+
|
|
1043
|
+
# Fall back to heuristic-based chunking
|
|
1044
|
+
|
|
1045
|
+
# STEP 1: Identify and protect VLM content blocks as atomic units
|
|
1046
|
+
# VLM content starts with "[Page X] ### 🖼️ IMAGE" and continues until next image or end
|
|
1047
|
+
# We'll mark these sections to prevent splitting during paragraph processing
|
|
1048
|
+
vlm_pattern = r"\[Page \d+\] ### 🖼️ IMAGE \d+.*?(?=\[Page \d+\] ### 🖼️ IMAGE|\[Page \d+\]\n(?!### 🖼️)|\Z)"
|
|
1049
|
+
|
|
1050
|
+
# Find all VLM image blocks and replace them with placeholders temporarily
|
|
1051
|
+
vlm_blocks = []
|
|
1052
|
+
protected_text = text
|
|
1053
|
+
for i, match in enumerate(re.finditer(vlm_pattern, text, re.DOTALL)):
|
|
1054
|
+
placeholder = f"<<<VLM_BLOCK_{i}>>>"
|
|
1055
|
+
vlm_blocks.append(
|
|
1056
|
+
{"placeholder": placeholder, "content": match.group(0).strip()}
|
|
1057
|
+
)
|
|
1058
|
+
protected_text = protected_text.replace(match.group(0), placeholder, 1)
|
|
1059
|
+
|
|
1060
|
+
# STEP 2: Identify natural document boundaries
|
|
1061
|
+
# Look for markdown headers, section breaks, or significant whitespace
|
|
1062
|
+
# Use protected_text which has VLM blocks replaced with placeholders
|
|
1063
|
+
lines = protected_text.split("\n")
|
|
1064
|
+
sections = []
|
|
1065
|
+
current_section = []
|
|
1066
|
+
|
|
1067
|
+
for i, line in enumerate(lines):
|
|
1068
|
+
# Detect section boundaries:
|
|
1069
|
+
# 1. Markdown headers (# Header, ## Header, ### Header)
|
|
1070
|
+
# 2. Lines that look like titles (short, possibly capitalized)
|
|
1071
|
+
# 3. Horizontal rules (---, ===, ___)
|
|
1072
|
+
# 4. Significant whitespace gaps
|
|
1073
|
+
|
|
1074
|
+
is_boundary = False
|
|
1075
|
+
|
|
1076
|
+
# Check for markdown headers
|
|
1077
|
+
if line.strip().startswith("#"):
|
|
1078
|
+
is_boundary = True
|
|
1079
|
+
# Check for horizontal rules
|
|
1080
|
+
elif re.match(r"^[\-=_]{3,}$", line.strip()):
|
|
1081
|
+
is_boundary = True
|
|
1082
|
+
# Check for lines that look like section titles (short, might be all caps)
|
|
1083
|
+
elif line.strip() and len(line.strip()) < 100 and i > 0:
|
|
1084
|
+
# If previous line was empty and next line exists and is not empty
|
|
1085
|
+
prev_empty = i > 0 and not lines[i - 1].strip()
|
|
1086
|
+
next_exists = i < len(lines) - 1
|
|
1087
|
+
next_not_empty = next_exists and lines[i + 1].strip()
|
|
1088
|
+
|
|
1089
|
+
# Heuristic: likely a section header if surrounded by whitespace
|
|
1090
|
+
if prev_empty and next_not_empty:
|
|
1091
|
+
# Additional check: does it look like a title?
|
|
1092
|
+
# (starts with capital, no ending punctuation, relatively short)
|
|
1093
|
+
if line.strip()[0].isupper() and not line.strip()[-1] in ".!?,;":
|
|
1094
|
+
is_boundary = True
|
|
1095
|
+
|
|
1096
|
+
if is_boundary and current_section:
|
|
1097
|
+
# Save the current section
|
|
1098
|
+
sections.append("\n".join(current_section))
|
|
1099
|
+
current_section = [line]
|
|
1100
|
+
else:
|
|
1101
|
+
current_section.append(line)
|
|
1102
|
+
|
|
1103
|
+
# Don't forget the last section
|
|
1104
|
+
if current_section:
|
|
1105
|
+
sections.append("\n".join(current_section))
|
|
1106
|
+
|
|
1107
|
+
# If we didn't find many sections, try paragraph-based splitting
|
|
1108
|
+
if len(sections) <= 3:
|
|
1109
|
+
# Split by double newlines (paragraphs)
|
|
1110
|
+
paragraphs = re.split(r"\n\s*\n", text)
|
|
1111
|
+
# Filter out empty paragraphs
|
|
1112
|
+
paragraphs = [p.strip() for p in paragraphs if p.strip()]
|
|
1113
|
+
else:
|
|
1114
|
+
paragraphs = sections
|
|
1115
|
+
|
|
1116
|
+
# STEP 3: Mark paragraphs that are VLM content (should not be split)
|
|
1117
|
+
vlm_paragraphs = set()
|
|
1118
|
+
for idx, para in enumerate(paragraphs):
|
|
1119
|
+
# Check if this paragraph contains VLM markers
|
|
1120
|
+
if "### 🖼️ IMAGE" in para or "**Content Extracted from Images:**" in para:
|
|
1121
|
+
vlm_paragraphs.add(idx)
|
|
1122
|
+
self.log.debug(
|
|
1123
|
+
f"Paragraph {idx} marked as VLM content (will keep atomic)"
|
|
1124
|
+
)
|
|
1125
|
+
|
|
1126
|
+
current_chunk = []
|
|
1127
|
+
current_size = 0
|
|
1128
|
+
|
|
1129
|
+
for idx, para in enumerate(paragraphs):
|
|
1130
|
+
para = para.strip()
|
|
1131
|
+
if not para:
|
|
1132
|
+
continue
|
|
1133
|
+
|
|
1134
|
+
# Estimate tokens (rough: 1 token ≈ 4 characters)
|
|
1135
|
+
para_tokens = len(para) // 4
|
|
1136
|
+
|
|
1137
|
+
# Check if this is VLM content - if so, keep it atomic
|
|
1138
|
+
is_vlm_content = idx in vlm_paragraphs
|
|
1139
|
+
|
|
1140
|
+
# If single paragraph exceeds chunk size AND it's not VLM content, split by sentences
|
|
1141
|
+
if para_tokens > chunk_size_tokens and not is_vlm_content:
|
|
1142
|
+
# Split into sentences
|
|
1143
|
+
sentences = self._split_into_sentences(para)
|
|
1144
|
+
|
|
1145
|
+
for sentence in sentences:
|
|
1146
|
+
sentence_tokens = len(sentence) // 4
|
|
1147
|
+
|
|
1148
|
+
# If adding this sentence exceeds chunk size, save current chunk
|
|
1149
|
+
if (
|
|
1150
|
+
current_size + sentence_tokens > chunk_size_tokens
|
|
1151
|
+
and current_chunk
|
|
1152
|
+
):
|
|
1153
|
+
chunks.append(" ".join(current_chunk))
|
|
1154
|
+
|
|
1155
|
+
# Keep overlap (last few sentences)
|
|
1156
|
+
overlap_text = " ".join(current_chunk)
|
|
1157
|
+
overlap_actual = len(overlap_text) // 4
|
|
1158
|
+
if overlap_actual > overlap_tokens:
|
|
1159
|
+
# Trim to overlap size
|
|
1160
|
+
current_chunk = self._get_last_n_tokens(
|
|
1161
|
+
overlap_text, overlap_tokens
|
|
1162
|
+
).split()
|
|
1163
|
+
current_size = overlap_tokens
|
|
1164
|
+
else:
|
|
1165
|
+
current_chunk = []
|
|
1166
|
+
current_size = 0
|
|
1167
|
+
|
|
1168
|
+
current_chunk.append(sentence)
|
|
1169
|
+
current_size += sentence_tokens
|
|
1170
|
+
else:
|
|
1171
|
+
# Small paragraph - try to keep intact
|
|
1172
|
+
# SPECIAL CASE: If this is VLM content, keep it atomic even if it exceeds chunk size
|
|
1173
|
+
if is_vlm_content:
|
|
1174
|
+
if current_chunk:
|
|
1175
|
+
# Save current chunk before adding VLM content
|
|
1176
|
+
chunks.append(" ".join(current_chunk))
|
|
1177
|
+
current_chunk = []
|
|
1178
|
+
current_size = 0
|
|
1179
|
+
|
|
1180
|
+
# Add VLM content as its own chunk (atomic, not split)
|
|
1181
|
+
chunks.append(para)
|
|
1182
|
+
self.log.debug(
|
|
1183
|
+
f"Added VLM content as atomic chunk ({para_tokens} tokens)"
|
|
1184
|
+
)
|
|
1185
|
+
|
|
1186
|
+
elif current_size + para_tokens > chunk_size_tokens and current_chunk:
|
|
1187
|
+
# Save current chunk
|
|
1188
|
+
chunks.append(" ".join(current_chunk))
|
|
1189
|
+
|
|
1190
|
+
# Keep overlap
|
|
1191
|
+
overlap_text = " ".join(current_chunk)
|
|
1192
|
+
current_chunk = self._get_last_n_tokens(
|
|
1193
|
+
overlap_text, overlap_tokens
|
|
1194
|
+
).split()
|
|
1195
|
+
current_size = len(" ".join(current_chunk)) // 4
|
|
1196
|
+
|
|
1197
|
+
current_chunk.append(para)
|
|
1198
|
+
current_size += para_tokens
|
|
1199
|
+
else:
|
|
1200
|
+
current_chunk.append(para)
|
|
1201
|
+
current_size += para_tokens
|
|
1202
|
+
|
|
1203
|
+
# Add final chunk
|
|
1204
|
+
if current_chunk:
|
|
1205
|
+
chunks.append(" ".join(current_chunk))
|
|
1206
|
+
|
|
1207
|
+
# STEP 4: Restore VLM blocks from placeholders
|
|
1208
|
+
if vlm_blocks:
|
|
1209
|
+
restored_chunks = []
|
|
1210
|
+
for chunk in chunks:
|
|
1211
|
+
restored_chunk = chunk
|
|
1212
|
+
# Replace placeholders with actual VLM content
|
|
1213
|
+
for vlm_block in vlm_blocks:
|
|
1214
|
+
if vlm_block["placeholder"] in restored_chunk:
|
|
1215
|
+
restored_chunk = restored_chunk.replace(
|
|
1216
|
+
vlm_block["placeholder"], vlm_block["content"]
|
|
1217
|
+
)
|
|
1218
|
+
restored_chunks.append(restored_chunk)
|
|
1219
|
+
chunks = restored_chunks
|
|
1220
|
+
|
|
1221
|
+
if self.config.show_stats:
|
|
1222
|
+
avg_size = sum(len(c) for c in chunks) // len(chunks) if chunks else 0
|
|
1223
|
+
print(f" ✅ Created {len(chunks)} semantic chunks (avg {avg_size} chars)")
|
|
1224
|
+
|
|
1225
|
+
self.log.info(f"📦 Created {len(chunks)} semantic chunks")
|
|
1226
|
+
return chunks
|
|
1227
|
+
|
|
1228
|
+
def _split_into_sentences(self, text: str) -> List[str]:
|
|
1229
|
+
"""
|
|
1230
|
+
Split text into sentences using simple heuristics.
|
|
1231
|
+
|
|
1232
|
+
Better than word splitting, doesn't require NLTK dependency.
|
|
1233
|
+
"""
|
|
1234
|
+
# Split on sentence endings followed by space and capital letter
|
|
1235
|
+
|
|
1236
|
+
# Handle common abbreviations that shouldn't split
|
|
1237
|
+
text = text.replace("Dr.", "Dr<DOT>")
|
|
1238
|
+
text = text.replace("Mr.", "Mr<DOT>")
|
|
1239
|
+
text = text.replace("Mrs.", "Mrs<DOT>")
|
|
1240
|
+
text = text.replace("Ms.", "Ms<DOT>")
|
|
1241
|
+
text = text.replace("Prof.", "Prof<DOT>")
|
|
1242
|
+
text = text.replace("Sr.", "Sr<DOT>")
|
|
1243
|
+
text = text.replace("Jr.", "Jr<DOT>")
|
|
1244
|
+
text = text.replace("vs.", "vs<DOT>")
|
|
1245
|
+
text = text.replace("e.g.", "e<DOT>g<DOT>")
|
|
1246
|
+
text = text.replace("i.e.", "i<DOT>e<DOT>")
|
|
1247
|
+
text = text.replace("etc.", "etc<DOT>")
|
|
1248
|
+
|
|
1249
|
+
# Split on sentence boundaries
|
|
1250
|
+
sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
|
|
1251
|
+
|
|
1252
|
+
# Restore abbreviations
|
|
1253
|
+
sentences = [s.replace("<DOT>", ".") for s in sentences]
|
|
1254
|
+
|
|
1255
|
+
return [s.strip() for s in sentences if s.strip()]
|
|
1256
|
+
|
|
1257
|
+
def _get_last_n_tokens(self, text: str, n_tokens: int) -> str:
|
|
1258
|
+
"""Get approximately the last n tokens from text."""
|
|
1259
|
+
# Rough estimate: 1 token ≈ 4 characters
|
|
1260
|
+
target_chars = n_tokens * 4
|
|
1261
|
+
if len(text) <= target_chars:
|
|
1262
|
+
return text
|
|
1263
|
+
|
|
1264
|
+
# Try to break on word boundary
|
|
1265
|
+
trimmed = text[-target_chars:]
|
|
1266
|
+
first_space = trimmed.find(" ")
|
|
1267
|
+
if first_space > 0:
|
|
1268
|
+
return trimmed[first_space + 1 :]
|
|
1269
|
+
return trimmed
|
|
1270
|
+
|
|
1271
|
+
def _create_vector_index(self, chunks: List[str]) -> tuple:
|
|
1272
|
+
"""Create FAISS vector index from chunks with progress reporting."""
|
|
1273
|
+
import time as time_module # pylint: disable=reimported
|
|
1274
|
+
|
|
1275
|
+
self._load_embedder()
|
|
1276
|
+
|
|
1277
|
+
# Generate embeddings with detailed progress
|
|
1278
|
+
self.log.info(f"🔍 Generating embeddings for {len(chunks)} chunks...")
|
|
1279
|
+
|
|
1280
|
+
if self.config.show_stats:
|
|
1281
|
+
print(f"\n{'='*60}")
|
|
1282
|
+
print(" 🧠 COMPUTE INTENSIVE: Generating vector embeddings")
|
|
1283
|
+
print(f" 📊 Processing {len(chunks)} chunks")
|
|
1284
|
+
print(f" ⏱️ Estimated time: {len(chunks) * 0.05:.1f} seconds")
|
|
1285
|
+
print(f"{'='*60}")
|
|
1286
|
+
|
|
1287
|
+
embed_start = time_module.time()
|
|
1288
|
+
embeddings = self._encode_texts(chunks, show_progress=self.config.show_stats)
|
|
1289
|
+
embed_duration = time_module.time() - embed_start
|
|
1290
|
+
|
|
1291
|
+
if self.config.show_stats:
|
|
1292
|
+
print(
|
|
1293
|
+
f"\n ✅ Generated {embeddings.shape[0]} embeddings ({embeddings.shape[1]} dimensions)"
|
|
1294
|
+
)
|
|
1295
|
+
print(
|
|
1296
|
+
f" ⏱️ Embedding time: {embed_duration:.2f}s ({len(chunks)/embed_duration:.1f} chunks/sec)"
|
|
1297
|
+
)
|
|
1298
|
+
|
|
1299
|
+
# Create FAISS index
|
|
1300
|
+
self.log.info("🏗️ Building FAISS search index...")
|
|
1301
|
+
|
|
1302
|
+
if self.config.show_stats:
|
|
1303
|
+
print("\n 🏗️ Building FAISS search index...")
|
|
1304
|
+
|
|
1305
|
+
index_start = time_module.time()
|
|
1306
|
+
dimension = embeddings.shape[1]
|
|
1307
|
+
index = faiss.IndexFlatL2(dimension)
|
|
1308
|
+
# pylint: disable=no-value-for-parameter
|
|
1309
|
+
index.add(embeddings.astype("float32"))
|
|
1310
|
+
index_duration = time_module.time() - index_start
|
|
1311
|
+
|
|
1312
|
+
if self.config.show_stats:
|
|
1313
|
+
print(
|
|
1314
|
+
f" ✅ Built search index for {index.ntotal} vectors in {index_duration:.2f}s"
|
|
1315
|
+
)
|
|
1316
|
+
print(
|
|
1317
|
+
f" 💾 Memory: ~{(embeddings.nbytes / (1024**2)):.1f}MB for embeddings"
|
|
1318
|
+
)
|
|
1319
|
+
print(f"{'='*60}\n")
|
|
1320
|
+
|
|
1321
|
+
self.log.info(
|
|
1322
|
+
f"📚 Index ready with {index.ntotal} vectors "
|
|
1323
|
+
f"(embed: {embed_duration:.2f}s, index: {index_duration:.2f}s)"
|
|
1324
|
+
)
|
|
1325
|
+
return index, chunks
|
|
1326
|
+
|
|
1327
|
+
def remove_document(self, file_path: str) -> bool:
|
|
1328
|
+
"""
|
|
1329
|
+
Remove a document from the index.
|
|
1330
|
+
|
|
1331
|
+
Args:
|
|
1332
|
+
file_path: Path to document to remove
|
|
1333
|
+
|
|
1334
|
+
Returns:
|
|
1335
|
+
True if removal succeeded, False otherwise
|
|
1336
|
+
"""
|
|
1337
|
+
file_path = str(Path(file_path).absolute())
|
|
1338
|
+
|
|
1339
|
+
if file_path not in self.indexed_files:
|
|
1340
|
+
self.log.warning(f"Document not indexed: {file_path}")
|
|
1341
|
+
return False
|
|
1342
|
+
|
|
1343
|
+
try:
|
|
1344
|
+
# Get chunk indices for this file
|
|
1345
|
+
if file_path in self.file_to_chunk_indices:
|
|
1346
|
+
chunk_indices_set = set(self.file_to_chunk_indices[file_path])
|
|
1347
|
+
|
|
1348
|
+
# OPTIMIZED: Rebuild all structures in one O(N) pass
|
|
1349
|
+
# This is much faster than deleting in a loop (which is O(N²))
|
|
1350
|
+
new_chunks = []
|
|
1351
|
+
new_chunk_to_file = {}
|
|
1352
|
+
new_file_to_chunk_indices = {}
|
|
1353
|
+
|
|
1354
|
+
# Single pass through all chunks - O(N)
|
|
1355
|
+
for old_idx, chunk in enumerate(self.chunks):
|
|
1356
|
+
# Skip chunks from file being removed
|
|
1357
|
+
if old_idx in chunk_indices_set:
|
|
1358
|
+
continue
|
|
1359
|
+
|
|
1360
|
+
new_idx = len(new_chunks)
|
|
1361
|
+
new_chunks.append(chunk)
|
|
1362
|
+
|
|
1363
|
+
# Update chunk_to_file mapping
|
|
1364
|
+
if old_idx in self.chunk_to_file:
|
|
1365
|
+
file = self.chunk_to_file[old_idx]
|
|
1366
|
+
new_chunk_to_file[new_idx] = file
|
|
1367
|
+
|
|
1368
|
+
# Update file_to_chunk_indices for this file
|
|
1369
|
+
if file not in new_file_to_chunk_indices:
|
|
1370
|
+
new_file_to_chunk_indices[file] = []
|
|
1371
|
+
new_file_to_chunk_indices[file].append(new_idx)
|
|
1372
|
+
|
|
1373
|
+
# Atomic replacement - all or nothing
|
|
1374
|
+
self.chunks = new_chunks
|
|
1375
|
+
self.chunk_to_file = new_chunk_to_file
|
|
1376
|
+
self.file_to_chunk_indices = new_file_to_chunk_indices
|
|
1377
|
+
|
|
1378
|
+
# Remove from indexed files
|
|
1379
|
+
self.indexed_files.discard(file_path)
|
|
1380
|
+
|
|
1381
|
+
# Clean up LRU tracking
|
|
1382
|
+
if file_path in self.file_access_times:
|
|
1383
|
+
del self.file_access_times[file_path]
|
|
1384
|
+
if file_path in self.file_index_times:
|
|
1385
|
+
del self.file_index_times[file_path]
|
|
1386
|
+
|
|
1387
|
+
# Clean up cached per-file indices and embeddings
|
|
1388
|
+
if file_path in self.file_indices:
|
|
1389
|
+
del self.file_indices[file_path]
|
|
1390
|
+
if file_path in self.file_embeddings:
|
|
1391
|
+
del self.file_embeddings[file_path]
|
|
1392
|
+
|
|
1393
|
+
# Clean up cached metadata
|
|
1394
|
+
if file_path in self.file_metadata:
|
|
1395
|
+
del self.file_metadata[file_path]
|
|
1396
|
+
|
|
1397
|
+
# Rebuild index if chunks remain
|
|
1398
|
+
if self.chunks:
|
|
1399
|
+
self.index, self.chunks = self._create_vector_index(self.chunks)
|
|
1400
|
+
if self.config.show_stats:
|
|
1401
|
+
print(f"✅ Removed {Path(file_path).name} from index")
|
|
1402
|
+
print(
|
|
1403
|
+
f"📊 Remaining: {len(self.indexed_files)} documents, {len(self.chunks)} chunks"
|
|
1404
|
+
)
|
|
1405
|
+
else:
|
|
1406
|
+
self.index = None
|
|
1407
|
+
if self.config.show_stats:
|
|
1408
|
+
print("✅ Removed last document from index")
|
|
1409
|
+
|
|
1410
|
+
self.log.info(f"Successfully removed document: {file_path}")
|
|
1411
|
+
return True
|
|
1412
|
+
|
|
1413
|
+
except Exception as e:
|
|
1414
|
+
self.log.error(f"Failed to remove document {file_path}: {e}")
|
|
1415
|
+
return False
|
|
1416
|
+
|
|
1417
|
+
def reindex_document(self, file_path: str) -> Dict[str, Any]:
|
|
1418
|
+
"""
|
|
1419
|
+
Reindex a document (remove old chunks and add new ones).
|
|
1420
|
+
|
|
1421
|
+
Args:
|
|
1422
|
+
file_path: Path to document to reindex
|
|
1423
|
+
|
|
1424
|
+
Returns:
|
|
1425
|
+
Dict with indexing results and statistics (same as index_document)
|
|
1426
|
+
"""
|
|
1427
|
+
file_path = str(Path(file_path).absolute())
|
|
1428
|
+
|
|
1429
|
+
# Remove old version if it exists
|
|
1430
|
+
if file_path in self.indexed_files:
|
|
1431
|
+
self.log.info(f"Removing old version of {file_path}")
|
|
1432
|
+
if not self.remove_document(file_path):
|
|
1433
|
+
return {
|
|
1434
|
+
"success": False,
|
|
1435
|
+
"error": "Failed to remove old version",
|
|
1436
|
+
"file_name": Path(file_path).name,
|
|
1437
|
+
}
|
|
1438
|
+
|
|
1439
|
+
# Index the new version
|
|
1440
|
+
self.log.info(f"Indexing new version of {file_path}")
|
|
1441
|
+
result = self.index_document(file_path)
|
|
1442
|
+
if result.get("success"):
|
|
1443
|
+
result["reindexed"] = True
|
|
1444
|
+
return result
|
|
1445
|
+
|
|
1446
|
+
def _evict_lru_document(self) -> bool:
|
|
1447
|
+
"""
|
|
1448
|
+
Evict the least recently used document to free memory.
|
|
1449
|
+
|
|
1450
|
+
Returns:
|
|
1451
|
+
True if a document was evicted, False otherwise
|
|
1452
|
+
"""
|
|
1453
|
+
if not self.config.enable_lru_eviction or not self.file_access_times:
|
|
1454
|
+
return False
|
|
1455
|
+
|
|
1456
|
+
# Find LRU file (oldest access time)
|
|
1457
|
+
lru_file = min(self.file_access_times, key=self.file_access_times.get)
|
|
1458
|
+
|
|
1459
|
+
if self.config.show_stats:
|
|
1460
|
+
print(
|
|
1461
|
+
f"📦 Memory limit reached, evicting LRU document: {Path(lru_file).name}"
|
|
1462
|
+
)
|
|
1463
|
+
|
|
1464
|
+
# Remove the LRU document
|
|
1465
|
+
return self.remove_document(lru_file)
|
|
1466
|
+
|
|
1467
|
+
def _check_memory_limits(self) -> None:
|
|
1468
|
+
"""
|
|
1469
|
+
Check memory limits and evict documents if necessary.
|
|
1470
|
+
"""
|
|
1471
|
+
# Check total chunks limit
|
|
1472
|
+
while (
|
|
1473
|
+
self.config.max_total_chunks > 0
|
|
1474
|
+
and len(self.chunks) > self.config.max_total_chunks
|
|
1475
|
+
and len(self.indexed_files) > 1
|
|
1476
|
+
): # Keep at least one file
|
|
1477
|
+
if not self._evict_lru_document():
|
|
1478
|
+
break
|
|
1479
|
+
|
|
1480
|
+
# Check indexed files limit
|
|
1481
|
+
while (
|
|
1482
|
+
self.config.max_indexed_files > 0
|
|
1483
|
+
and len(self.indexed_files) > self.config.max_indexed_files
|
|
1484
|
+
):
|
|
1485
|
+
if not self._evict_lru_document():
|
|
1486
|
+
break
|
|
1487
|
+
|
|
1488
|
+
def index_document(self, file_path: str) -> Dict[str, Any]:
|
|
1489
|
+
"""
|
|
1490
|
+
Index a document for retrieval.
|
|
1491
|
+
|
|
1492
|
+
Supports:
|
|
1493
|
+
- Documents: PDF, TXT, MD, CSV, JSON
|
|
1494
|
+
- Backend Code: Python, Java, C/C++, Go, Rust, Ruby, PHP, Swift, Kotlin, Scala
|
|
1495
|
+
- Web Code: JavaScript/TypeScript, HTML, CSS/SCSS/SASS/LESS, Vue, Svelte, Astro
|
|
1496
|
+
- Config: YAML, XML, TOML, INI, ENV, Properties
|
|
1497
|
+
- Build: Gradle, CMake, Makefiles
|
|
1498
|
+
- Database: SQL
|
|
1499
|
+
|
|
1500
|
+
Args:
|
|
1501
|
+
file_path: Path to document or code file
|
|
1502
|
+
|
|
1503
|
+
Returns:
|
|
1504
|
+
Dict with indexing results and statistics:
|
|
1505
|
+
{
|
|
1506
|
+
"success": bool,
|
|
1507
|
+
"file_name": str,
|
|
1508
|
+
"file_type": str,
|
|
1509
|
+
"file_size_mb": float,
|
|
1510
|
+
"num_pages": int (for PDFs),
|
|
1511
|
+
"num_chunks": int,
|
|
1512
|
+
"total_indexed_files": int,
|
|
1513
|
+
"total_chunks": int,
|
|
1514
|
+
"error": str (if failed)
|
|
1515
|
+
}
|
|
1516
|
+
|
|
1517
|
+
Raises:
|
|
1518
|
+
ValueError: If file_path is empty or file doesn't exist
|
|
1519
|
+
"""
|
|
1520
|
+
# Validate input
|
|
1521
|
+
if not file_path or not file_path.strip():
|
|
1522
|
+
raise ValueError("File path cannot be empty")
|
|
1523
|
+
|
|
1524
|
+
# Initialize stats dict
|
|
1525
|
+
stats = {
|
|
1526
|
+
"success": False,
|
|
1527
|
+
"file_name": Path(file_path).name if file_path else "",
|
|
1528
|
+
"file_type": "",
|
|
1529
|
+
"file_size_mb": 0.0,
|
|
1530
|
+
"num_pages": None,
|
|
1531
|
+
"vlm_pages": None,
|
|
1532
|
+
"total_images": None,
|
|
1533
|
+
"num_chunks": 0,
|
|
1534
|
+
"total_indexed_files": len(self.indexed_files),
|
|
1535
|
+
"total_chunks": len(self.chunks),
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
# Check if file exists before processing
|
|
1539
|
+
if not os.path.exists(file_path):
|
|
1540
|
+
self.log.error(f"File not found: {file_path}")
|
|
1541
|
+
if self.config.show_stats:
|
|
1542
|
+
print(f"❌ File not found: {file_path}")
|
|
1543
|
+
print(" Please check the file path and try again")
|
|
1544
|
+
stats["error"] = f"File not found: {file_path}"
|
|
1545
|
+
return stats
|
|
1546
|
+
|
|
1547
|
+
# Check if file is empty (early validation to save time)
|
|
1548
|
+
file_size = os.path.getsize(file_path)
|
|
1549
|
+
file_size_mb = file_size / (1024 * 1024)
|
|
1550
|
+
stats["file_size_mb"] = round(file_size_mb, 2)
|
|
1551
|
+
|
|
1552
|
+
if file_size == 0:
|
|
1553
|
+
self.log.error(f"File is empty: {file_path}")
|
|
1554
|
+
if self.config.show_stats:
|
|
1555
|
+
print(f"❌ File is empty: {file_path}")
|
|
1556
|
+
print(" The file has no content to index")
|
|
1557
|
+
stats["error"] = "File is empty"
|
|
1558
|
+
return stats
|
|
1559
|
+
|
|
1560
|
+
# Enforce maximum file size limit (prevent OOM)
|
|
1561
|
+
if file_size_mb > self.config.max_file_size_mb:
|
|
1562
|
+
error_msg = (
|
|
1563
|
+
f"File too large: {Path(file_path).name} ({file_size_mb:.1f}MB)\n"
|
|
1564
|
+
f"Maximum allowed: {self.config.max_file_size_mb}MB\n"
|
|
1565
|
+
"Suggestions:\n"
|
|
1566
|
+
" 1. Split the file into smaller documents\n"
|
|
1567
|
+
" 2. Increase max_file_size_mb in RAGConfig\n"
|
|
1568
|
+
" 3. Use a more powerful system with more RAM"
|
|
1569
|
+
)
|
|
1570
|
+
self.log.error(error_msg)
|
|
1571
|
+
if self.config.show_stats:
|
|
1572
|
+
print(f"❌ {error_msg}")
|
|
1573
|
+
stats["error"] = (
|
|
1574
|
+
f"File too large ({file_size_mb:.1f}MB > {self.config.max_file_size_mb}MB)"
|
|
1575
|
+
)
|
|
1576
|
+
return stats
|
|
1577
|
+
|
|
1578
|
+
# Warn if file is large
|
|
1579
|
+
if file_size_mb > self.config.warn_file_size_mb:
|
|
1580
|
+
if self.config.show_stats:
|
|
1581
|
+
print(f"⚠️ Large file detected ({file_size_mb:.1f}MB)")
|
|
1582
|
+
print(" This may take 30-60 seconds to process...")
|
|
1583
|
+
self.log.warning(f"Processing large file: {file_size_mb:.1f}MB")
|
|
1584
|
+
|
|
1585
|
+
# Convert to absolute path only after validation
|
|
1586
|
+
file_path = str(Path(file_path).absolute())
|
|
1587
|
+
|
|
1588
|
+
# Get file type for logging
|
|
1589
|
+
file_type = self._get_file_type(file_path)
|
|
1590
|
+
stats["file_type"] = file_type
|
|
1591
|
+
stats["file_name"] = Path(file_path).name
|
|
1592
|
+
|
|
1593
|
+
# Check if already indexed
|
|
1594
|
+
if file_path in self.indexed_files:
|
|
1595
|
+
if self.config.show_stats:
|
|
1596
|
+
print(f"📋 Document already indexed: {Path(file_path).name}")
|
|
1597
|
+
self.log.info(f"Document already indexed: {file_path}")
|
|
1598
|
+
stats["success"] = True
|
|
1599
|
+
stats["already_indexed"] = True
|
|
1600
|
+
stats["total_indexed_files"] = len(self.indexed_files)
|
|
1601
|
+
stats["total_chunks"] = len(self.chunks)
|
|
1602
|
+
return stats
|
|
1603
|
+
|
|
1604
|
+
# Check cache - the cache key is based on file content hash
|
|
1605
|
+
cache_path = self._get_cache_path(file_path)
|
|
1606
|
+
|
|
1607
|
+
# Also check for cached Markdown file with hash-based name
|
|
1608
|
+
# Extract the cache key from the pickle cache path to find matching MD file
|
|
1609
|
+
cache_filename = Path(cache_path).stem # Remove .pkl extension
|
|
1610
|
+
md_cache_path = os.path.join(
|
|
1611
|
+
self.config.cache_dir, f"{cache_filename}_extracted.md"
|
|
1612
|
+
)
|
|
1613
|
+
|
|
1614
|
+
if os.path.exists(cache_path):
|
|
1615
|
+
if self.config.show_stats:
|
|
1616
|
+
print(f"💾 Loading from cache: {Path(file_path).name}")
|
|
1617
|
+
self.log.info(f"📦 Loading cached index for: {file_path}")
|
|
1618
|
+
try:
|
|
1619
|
+
with open(cache_path, "rb") as f:
|
|
1620
|
+
cached_data = pickle.load(f)
|
|
1621
|
+
cached_chunks = cached_data["chunks"]
|
|
1622
|
+
cached_full_text = cached_data.get(
|
|
1623
|
+
"full_text", ""
|
|
1624
|
+
) # May not exist in old caches
|
|
1625
|
+
cached_metadata = cached_data.get(
|
|
1626
|
+
"metadata", {}
|
|
1627
|
+
) # May not exist in old caches
|
|
1628
|
+
|
|
1629
|
+
# Check if cache might be missing VLM content
|
|
1630
|
+
# If metadata doesn't have VLM info, it's an old cache
|
|
1631
|
+
if not cached_metadata.get("vlm_checked", False):
|
|
1632
|
+
if self.config.show_stats:
|
|
1633
|
+
print(
|
|
1634
|
+
" ⚠️ Cache might be missing image text (pre-VLM cache)"
|
|
1635
|
+
)
|
|
1636
|
+
print(
|
|
1637
|
+
" 💡 Use /clear-cache to force re-extraction with VLM"
|
|
1638
|
+
)
|
|
1639
|
+
|
|
1640
|
+
# Verify Markdown cache exists alongside pickle cache
|
|
1641
|
+
if os.path.exists(md_cache_path):
|
|
1642
|
+
self.log.info(
|
|
1643
|
+
f" ✅ Markdown cache also available: {md_cache_path}"
|
|
1644
|
+
)
|
|
1645
|
+
|
|
1646
|
+
if self.config.show_stats:
|
|
1647
|
+
vlm_info = ""
|
|
1648
|
+
if cached_metadata.get("vlm_pages", 0) > 0:
|
|
1649
|
+
vlm_info = f" (VLM: {cached_metadata['vlm_pages']} pages)"
|
|
1650
|
+
print(
|
|
1651
|
+
f" ✅ Loaded {len(cached_chunks)} cached chunks{vlm_info}"
|
|
1652
|
+
)
|
|
1653
|
+
|
|
1654
|
+
# Track chunk indices for this file
|
|
1655
|
+
start_idx = len(self.chunks)
|
|
1656
|
+
file_chunk_indices = []
|
|
1657
|
+
|
|
1658
|
+
if self.index is None:
|
|
1659
|
+
# First document - use cached index directly
|
|
1660
|
+
self.chunks = cached_chunks
|
|
1661
|
+
# Track indices for all chunks (0 to len-1)
|
|
1662
|
+
for i in range(len(cached_chunks)):
|
|
1663
|
+
file_chunk_indices.append(i)
|
|
1664
|
+
self.chunk_to_file[i] = file_path
|
|
1665
|
+
self.index, self.chunks = self._create_vector_index(self.chunks)
|
|
1666
|
+
else:
|
|
1667
|
+
# Merge with existing chunks and recreate index
|
|
1668
|
+
old_count = len(self.chunks)
|
|
1669
|
+
self.chunks.extend(cached_chunks)
|
|
1670
|
+
# Track indices for new chunks (start_idx to start_idx+len-1)
|
|
1671
|
+
for i in range(len(cached_chunks)):
|
|
1672
|
+
chunk_idx = start_idx + i
|
|
1673
|
+
file_chunk_indices.append(chunk_idx)
|
|
1674
|
+
self.chunk_to_file[chunk_idx] = file_path
|
|
1675
|
+
if self.config.show_stats:
|
|
1676
|
+
print(
|
|
1677
|
+
f" 🔄 Rebuilding index ({old_count} + {len(cached_chunks)} = {len(self.chunks)} chunks)"
|
|
1678
|
+
)
|
|
1679
|
+
self.index, self.chunks = self._create_vector_index(self.chunks)
|
|
1680
|
+
|
|
1681
|
+
# Store file-to-chunk mapping
|
|
1682
|
+
self.file_to_chunk_indices[file_path] = file_chunk_indices
|
|
1683
|
+
|
|
1684
|
+
# Restore metadata in memory
|
|
1685
|
+
if cached_full_text or cached_metadata:
|
|
1686
|
+
self.file_metadata[file_path] = {
|
|
1687
|
+
"full_text": cached_full_text,
|
|
1688
|
+
**cached_metadata,
|
|
1689
|
+
}
|
|
1690
|
+
|
|
1691
|
+
self.indexed_files.add(file_path)
|
|
1692
|
+
if self.config.show_stats:
|
|
1693
|
+
print(" ✅ Successfully loaded from cache")
|
|
1694
|
+
|
|
1695
|
+
# Update stats for cache load
|
|
1696
|
+
stats["success"] = True
|
|
1697
|
+
stats["num_chunks"] = len(cached_chunks)
|
|
1698
|
+
stats["num_pages"] = cached_metadata.get("num_pages")
|
|
1699
|
+
stats["vlm_pages"] = cached_metadata.get("vlm_pages")
|
|
1700
|
+
stats["total_images"] = cached_metadata.get("total_images")
|
|
1701
|
+
stats["total_indexed_files"] = len(self.indexed_files)
|
|
1702
|
+
stats["total_chunks"] = len(self.chunks)
|
|
1703
|
+
stats["from_cache"] = True
|
|
1704
|
+
return stats
|
|
1705
|
+
except Exception as e:
|
|
1706
|
+
self.log.warning(f"Cache load failed: {e}, reindexing")
|
|
1707
|
+
if self.config.show_stats:
|
|
1708
|
+
print(" ⚠️ Cache loading failed, will reindex from scratch")
|
|
1709
|
+
|
|
1710
|
+
# Extract and process document
|
|
1711
|
+
if self.config.show_stats:
|
|
1712
|
+
print(f"🚀 Starting to index: {Path(file_path).name} ({file_type})")
|
|
1713
|
+
self.log.info(f"📄 Indexing document: {file_path} ({file_type})")
|
|
1714
|
+
|
|
1715
|
+
try:
|
|
1716
|
+
# Extract text based on file type
|
|
1717
|
+
text, file_metadata = self._extract_text_from_file(file_path)
|
|
1718
|
+
|
|
1719
|
+
# Store metadata in stats if available (for PDFs)
|
|
1720
|
+
if file_metadata.get("num_pages"):
|
|
1721
|
+
stats["num_pages"] = file_metadata["num_pages"]
|
|
1722
|
+
if file_metadata.get("vlm_pages"):
|
|
1723
|
+
stats["vlm_pages"] = file_metadata["vlm_pages"]
|
|
1724
|
+
if file_metadata.get("total_images"):
|
|
1725
|
+
stats["total_images"] = file_metadata["total_images"]
|
|
1726
|
+
|
|
1727
|
+
if not text.strip():
|
|
1728
|
+
error_msg = (
|
|
1729
|
+
f"No text content found in {file_type} file: {Path(file_path).name}\n"
|
|
1730
|
+
"Possible reasons:\n"
|
|
1731
|
+
" 1. The file contains only images or non-text content\n"
|
|
1732
|
+
" 2. The file is password-protected (PDFs)\n"
|
|
1733
|
+
" 3. The file uses an unsupported format\n"
|
|
1734
|
+
" 4. The text extraction failed\n"
|
|
1735
|
+
"Try opening the file manually to verify it contains readable text"
|
|
1736
|
+
)
|
|
1737
|
+
stats["error"] = "No text content found"
|
|
1738
|
+
raise ValueError(error_msg)
|
|
1739
|
+
|
|
1740
|
+
# Split into chunks
|
|
1741
|
+
new_chunks = self._split_text_into_chunks(text)
|
|
1742
|
+
|
|
1743
|
+
# Track which chunks belong to this file
|
|
1744
|
+
file_chunk_indices = []
|
|
1745
|
+
start_idx = len(self.chunks)
|
|
1746
|
+
|
|
1747
|
+
# Add to existing chunks or create new
|
|
1748
|
+
if self.chunks:
|
|
1749
|
+
old_count = len(self.chunks)
|
|
1750
|
+
self.chunks.extend(new_chunks)
|
|
1751
|
+
|
|
1752
|
+
# Track the indices of chunks for this file
|
|
1753
|
+
for i in range(start_idx, start_idx + len(new_chunks)):
|
|
1754
|
+
file_chunk_indices.append(i)
|
|
1755
|
+
self.chunk_to_file[i] = file_path
|
|
1756
|
+
|
|
1757
|
+
if self.config.show_stats:
|
|
1758
|
+
print(
|
|
1759
|
+
f"🔄 Rebuilding search index ({old_count} + {len(new_chunks)} = {len(self.chunks)} total chunks)"
|
|
1760
|
+
)
|
|
1761
|
+
self.index, self.chunks = self._create_vector_index(self.chunks)
|
|
1762
|
+
else:
|
|
1763
|
+
# First file being indexed
|
|
1764
|
+
for i in range(len(new_chunks)):
|
|
1765
|
+
file_chunk_indices.append(i)
|
|
1766
|
+
self.chunk_to_file[i] = file_path
|
|
1767
|
+
|
|
1768
|
+
if self.config.show_stats:
|
|
1769
|
+
print("🏗️ Building initial search index...")
|
|
1770
|
+
self.index, self.chunks = self._create_vector_index(new_chunks)
|
|
1771
|
+
|
|
1772
|
+
# Store the file-to-chunks mapping
|
|
1773
|
+
self.file_to_chunk_indices[file_path] = file_chunk_indices
|
|
1774
|
+
|
|
1775
|
+
# Build and cache per-file FAISS index for fast file-specific searches
|
|
1776
|
+
if self.config.show_stats:
|
|
1777
|
+
print("🔍 Building per-file search index...")
|
|
1778
|
+
|
|
1779
|
+
self._load_embedder()
|
|
1780
|
+
# Generate embeddings for this file's chunks only
|
|
1781
|
+
file_embeddings = self._encode_texts(new_chunks, show_progress=False)
|
|
1782
|
+
|
|
1783
|
+
# Create FAISS index for this file
|
|
1784
|
+
dimension = file_embeddings.shape[1]
|
|
1785
|
+
file_index = faiss.IndexFlatL2(dimension)
|
|
1786
|
+
# pylint: disable=no-value-for-parameter
|
|
1787
|
+
file_index.add(file_embeddings.astype("float32"))
|
|
1788
|
+
|
|
1789
|
+
# Cache the index and embeddings for this file
|
|
1790
|
+
self.file_indices[file_path] = file_index
|
|
1791
|
+
self.file_embeddings[file_path] = file_embeddings
|
|
1792
|
+
|
|
1793
|
+
if self.config.show_stats:
|
|
1794
|
+
print(f"✅ Cached per-file index with {len(new_chunks)} chunks")
|
|
1795
|
+
|
|
1796
|
+
# Cache the results for this specific document
|
|
1797
|
+
if self.config.show_stats:
|
|
1798
|
+
print("💾 Caching processed chunks...")
|
|
1799
|
+
cache_data = {
|
|
1800
|
+
"chunks": new_chunks, # Cache only new chunks for this document
|
|
1801
|
+
"full_text": text, # Cache full extracted text (for /dump)
|
|
1802
|
+
"metadata": file_metadata, # Cache metadata (num_pages, vlm_pages, etc.)
|
|
1803
|
+
}
|
|
1804
|
+
with open(cache_path, "wb") as f:
|
|
1805
|
+
pickle.dump(cache_data, f)
|
|
1806
|
+
|
|
1807
|
+
# Store metadata in memory for fast access
|
|
1808
|
+
self.file_metadata[file_path] = {
|
|
1809
|
+
"full_text": text,
|
|
1810
|
+
**file_metadata, # num_pages, vlm_pages, total_images
|
|
1811
|
+
}
|
|
1812
|
+
|
|
1813
|
+
# Auto-save markdown version to cache directory for easy access
|
|
1814
|
+
self._save_extracted_markdown(file_path, text, file_metadata)
|
|
1815
|
+
|
|
1816
|
+
self.indexed_files.add(file_path)
|
|
1817
|
+
|
|
1818
|
+
# Track index time for LRU
|
|
1819
|
+
current_time = time.time()
|
|
1820
|
+
self.file_index_times[file_path] = current_time
|
|
1821
|
+
self.file_access_times[file_path] = current_time
|
|
1822
|
+
|
|
1823
|
+
# Check memory limits and evict if necessary
|
|
1824
|
+
self._check_memory_limits()
|
|
1825
|
+
|
|
1826
|
+
if self.config.show_stats:
|
|
1827
|
+
print(f"✅ Successfully indexed {Path(file_path).name}")
|
|
1828
|
+
print(
|
|
1829
|
+
f"📊 Total: {len(self.indexed_files)} documents, {len(self.chunks)} chunks"
|
|
1830
|
+
)
|
|
1831
|
+
if self.config.enable_lru_eviction:
|
|
1832
|
+
print(
|
|
1833
|
+
f"📈 Memory usage: {len(self.chunks)}/{self.config.max_total_chunks} chunks, "
|
|
1834
|
+
f"{len(self.indexed_files)}/{self.config.max_indexed_files} files"
|
|
1835
|
+
)
|
|
1836
|
+
|
|
1837
|
+
self.log.info(f"✅ Successfully indexed {file_path}")
|
|
1838
|
+
|
|
1839
|
+
# Update final stats
|
|
1840
|
+
stats["success"] = True
|
|
1841
|
+
stats["num_chunks"] = len(new_chunks)
|
|
1842
|
+
stats["total_indexed_files"] = len(self.indexed_files)
|
|
1843
|
+
stats["total_chunks"] = len(self.chunks)
|
|
1844
|
+
return stats
|
|
1845
|
+
|
|
1846
|
+
except Exception as e:
|
|
1847
|
+
if self.config.show_stats:
|
|
1848
|
+
print(f"❌ Failed to index {Path(file_path).name}: {e}")
|
|
1849
|
+
self.log.error(f"Failed to index {file_path}: {e}")
|
|
1850
|
+
stats["error"] = str(e)
|
|
1851
|
+
return stats
|
|
1852
|
+
|
|
1853
|
+
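    # Illustrative usage sketch for the indexing path above (assumes a RAGSDK
    # instance `rag` and a hypothetical local file; the keys shown are the ones
    # populated by index_document in this module):
    #
    #     stats = rag.index_document("manual.pdf")
    #     if stats["success"]:
    #         print(stats["num_chunks"], stats.get("from_cache", False))
    #     # A second call on the same path returns early with
    #     # stats["already_indexed"] == True.
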
    def _retrieve_chunks_from_file(self, query: str, file_path: str) -> tuple:
        """
        Retrieve relevant chunks from a specific file using cached per-file index.

        This is much faster than the global search because:
        1. Uses pre-computed embeddings (no re-encoding)
        2. Searches smaller, file-specific FAISS index
        3. No need to rebuild index on each query
        """
        if self.index is None or not self.chunks:
            raise ValueError("No documents indexed. Call index_document() first.")

        if file_path not in self.file_to_chunk_indices:
            raise ValueError(f"File not indexed: {file_path}")

        # Update access time for LRU tracking
        self.file_access_times[file_path] = time.time()

        # Get chunk indices for this file
        file_chunk_indices = self.file_to_chunk_indices[file_path]
        if not file_chunk_indices:
            return [], []

        # Get chunks for this file
        file_chunks = [self.chunks[i] for i in file_chunk_indices]

        # Use CACHED per-file index (this is the performance fix!)
        if file_path not in self.file_indices:
            # Index not cached - build it now (shouldn't happen normally)
            self.log.warning(f"Per-file index not cached for {file_path}, rebuilding")
            self._load_embedder()
            file_embeddings = self._encode_texts(file_chunks, show_progress=False)
            dimension = file_embeddings.shape[1]
            file_index = faiss.IndexFlatL2(dimension)
            # pylint: disable=no-value-for-parameter
            file_index.add(file_embeddings.astype("float32"))
            self.file_indices[file_path] = file_index
            self.file_embeddings[file_path] = file_embeddings
        else:
            # Use cached index - FAST!
            file_index = self.file_indices[file_path]

        # Encode query only once
        self._load_embedder()
        query_embedding = self._encode_texts([query], show_progress=False)

        # Search in cached file-specific index
        k = min(self.config.max_chunks, len(file_chunks))
        # pylint: disable=no-value-for-parameter
        distances, indices = file_index.search(query_embedding.astype("float32"), k)

        # Get matching chunks and scores
        retrieved_chunks = []
        scores = []
        for idx, dist in zip(indices[0], distances[0]):
            if idx < len(file_chunks):  # Safety check
                retrieved_chunks.append(file_chunks[idx])
                # Convert distance to similarity score
                score = 1.0 / (1.0 + float(dist))
                scores.append(score)

        if self.config.show_stats:
            print(
                f" ✅ Found {len(retrieved_chunks)} relevant chunks from {Path(file_path).name} (using cached index)"
            )

        return retrieved_chunks, scores

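    # Worked example for the distance-to-similarity conversion used above,
    # score = 1.0 / (1.0 + distance): an exact match (L2 distance 0.0) scores
    # 1.0, distance 1.0 scores 0.5, and distance 4.0 scores 0.2, so smaller
    # distances always rank higher.
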
    def _retrieve_chunks_with_metadata(self, query: str) -> tuple:
        """
        Retrieve chunks with metadata about their source files.

        Returns:
            (chunks, scores, metadata) tuple
        """
        chunks, scores = self._retrieve_chunks(query)

        # Build metadata for each chunk
        metadata = []
        for i, (chunk, score) in enumerate(zip(chunks, scores)):
            # Find which file this chunk came from
            chunk_idx = self.chunks.index(chunk) if chunk in self.chunks else -1
            source_file = self.chunk_to_file.get(chunk_idx, "unknown")

            metadata.append(
                {
                    "chunk_index": i + 1,
                    "source_file": (
                        Path(source_file).name
                        if source_file != "unknown"
                        else "unknown"
                    ),
                    "source_path": source_file,
                    "relevance_score": float(score),
                    "chunk_length": len(chunk),
                    "estimated_tokens": len(chunk) // 4,  # Rough token estimate
                }
            )

        return chunks, scores, metadata

    def _retrieve_chunks(self, query: str) -> tuple:
        """Retrieve relevant chunks for query."""
        if self.index is None or not self.chunks:
            raise ValueError("No documents indexed. Call index_document() first.")

        self._load_embedder()

        # Generate query embedding
        if self.config.show_stats:
            print(f"🔍 Searching through {len(self.chunks)} chunks...")
        self.log.debug(f"Encoding query: {query[:50]}...")
        query_embedding = self._encode_texts([query], show_progress=False)

        # Search for similar chunks
        k = min(self.config.max_chunks, len(self.chunks))
        if self.config.show_stats:
            print(f" 🎯 Finding {k} most relevant chunks...")
        # pylint: disable=no-value-for-parameter
        distances, indices = self.index.search(query_embedding.astype("float32"), k)

        # Get chunks and scores
        retrieved_chunks = [self.chunks[i] for i in indices[0]]
        # Convert distances to similarity scores (lower distance = higher similarity)
        scores = [1.0 / (1.0 + dist) for dist in distances[0]]

        if self.config.show_stats:
            print(
                f" ✅ Retrieved {len(retrieved_chunks)} chunks (avg relevance: {sum(scores)/len(scores):.3f})"
            )

        self.log.debug(
            f"Retrieved {len(retrieved_chunks)} chunks with scores: {[f'{s:.3f}' for s in scores]}"
        )
        return retrieved_chunks, scores

    def query(self, question: str, include_metadata: bool = True) -> RAGResponse:
        """
        Query the indexed documents with enhanced metadata tracking.

        Args:
            question: Question to ask about the documents
            include_metadata: Whether to include detailed metadata in response

        Returns:
            RAGResponse with answer, retrieved chunks, and metadata
        """
        if self.index is None:
            raise ValueError("No documents indexed. Call index_document() first.")

        # Retrieve relevant chunks with metadata
        if include_metadata:
            chunks, scores, chunk_metadata = self._retrieve_chunks_with_metadata(
                question
            )
        else:
            chunks, scores = self._retrieve_chunks(question)
            chunk_metadata = None

        # Build context from retrieved chunks
        context = "\n\n".join(
            [f"Context {i+1}:\n{chunk}" for i, chunk in enumerate(chunks)]
        )

        # Create prompt
        prompt = f"""Based on the following context, please answer the question. If the answer is not in the context, say so.

Context:
{context}

Question: {question}

Answer:"""

        # Get LLM response
        response = self.chat.send(prompt)

        # Build query metadata
        query_metadata = None
        if include_metadata and chunk_metadata:
            # Get unique source files
            source_files = list(
                set(
                    m["source_file"]
                    for m in chunk_metadata
                    if m["source_file"] != "unknown"
                )
            )

            query_metadata = {
                "question": question,
                "num_chunks_retrieved": len(chunks),
                "source_files": source_files,
                "total_indexed_files": len(self.indexed_files),
                "total_indexed_chunks": len(self.chunks),
                "average_relevance_score": float(np.mean(scores)) if scores else 0.0,
                "max_relevance_score": float(max(scores)) if scores else 0.0,
                "min_relevance_score": float(min(scores)) if scores else 0.0,
            }

            # Collect source files list
            source_files_list = [m["source_file"] for m in chunk_metadata]
        else:
            source_files_list = None

        return RAGResponse(
            text=response.text,
            chunks=chunks,
            chunk_scores=scores,
            stats=response.stats,
            source_files=source_files_list,
            chunk_metadata=chunk_metadata,
            query_metadata=query_metadata,
        )

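    # Illustrative query sketch (assumes documents were indexed above; the
    # attribute names match RAGResponse as used in this module, and the
    # question string is hypothetical):
    #
    #     response = rag.query("What does chapter 2 cover?")
    #     print(response.text)
    #     for chunk, score in zip(response.chunks, response.chunk_scores):
    #         print(f"{score:.3f}: {chunk[:80]}")
    #     # response.query_metadata carries aggregate relevance scores and the
    #     # source-file list when include_metadata=True (the default).
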
    def _save_extracted_markdown(
        self, file_path: str, text: str, metadata: Dict[str, Any]
    ):
        """
        Save extracted text as markdown file in cache directory.

        This creates a human-readable markdown version of the extracted text
        that can be used for /dump commands and debugging without re-extraction.
        Uses hash-based naming to match the pickle cache for consistency.

        Args:
            file_path: Path to original document
            text: Extracted text content
            metadata: File metadata (num_pages, vlm_pages, etc.)
        """
        try:
            from datetime import datetime

            # Calculate file hash for consistency with pickle cache
            path = Path(file_path).absolute()
            hasher = hashlib.sha256()
            with self._safe_open(path, "rb") as f:
                while chunk := f.read(8192):
                    hasher.update(chunk)
            content_hash = hasher.hexdigest()

            # Use hash-based filename similar to pickle cache
            path_hash = hashlib.sha256(str(path).encode()).hexdigest()[:16]
            cache_key = f"{path_hash}_{content_hash[:32]}"
            md_filename = f"{cache_key}_extracted.md"
            md_path = os.path.join(self.config.cache_dir, md_filename)

            # Create markdown content with metadata header
            vlm_status = (
                "✅ Enabled"
                if metadata.get("vlm_available", False)
                else "❌ Not Available"
            )
            markdown_content = f"""# Extracted Text from {Path(file_path).name}

## Metadata
**Source File:** {file_path}
**File Hash (SHA-256):** {content_hash[:32]}
**Extraction Date:** {datetime.now().isoformat()}
**Pages:** {metadata.get('num_pages', 'N/A')}
**VLM Status:** {vlm_status}
**VLM Pages (with images):** {metadata.get('vlm_pages', 0)}
**Total Images Processed:** {metadata.get('total_images', 0)}

---

## Extracted Content
{text}
"""

            # Write markdown file
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(markdown_content)

            self.log.debug(f"Saved extracted markdown to {md_path}")

        except Exception as e:
            # Don't fail indexing if markdown save fails
            self.log.warning(
                f"Failed to save markdown cache for {Path(file_path).name}: {e}"
            )

    def clear_cache(self):
        """Clear the RAG cache."""
        import shutil

        if os.path.exists(self.config.cache_dir):
            shutil.rmtree(self.config.cache_dir)
        os.makedirs(self.config.cache_dir, exist_ok=True)
        self.log.info("Cache cleared")

    def get_status(self) -> Dict[str, Any]:
        """Get RAG system status."""
        return {
            "indexed_files": len(self.indexed_files),
            "total_chunks": len(self.chunks) if self.chunks else 0,
            "cache_dir": self.config.cache_dir,
            "embedding_model": self.config.embedding_model,
            "config": {
                "chunk_size": self.config.chunk_size,
                "chunk_overlap": self.config.chunk_overlap,
                "max_chunks": self.config.max_chunks,
            },
        }

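# Illustrative maintenance sketch (assumes a RAGSDK instance `rag`; shows how
# the status dictionary above and clear_cache fit together):
#
#     status = rag.get_status()
#     print(status["indexed_files"], status["total_chunks"], status["cache_dir"])
#     rag.clear_cache()  # removes cached pickles/markdown and recreates cache_dir
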
def quick_rag(pdf_path: str, question: str, **kwargs) -> str:
    """
    Convenience function for quick RAG query.

    Args:
        pdf_path: Path to PDF file
        question: Question to ask
        **kwargs: Additional config parameters

    Returns:
        Answer text

    Raises:
        ValueError: If pdf_path is empty, question is empty, or file doesn't exist
    """
    # Validate inputs
    if not pdf_path or not pdf_path.strip():
        raise ValueError("PDF path cannot be empty")

    if not question or not question.strip():
        raise ValueError("Question cannot be empty")

    # Check if file exists
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF file not found: {pdf_path}")

    config = RAGConfig(**kwargs)
    rag = RAGSDK(config)

    result = rag.index_document(pdf_path)
    if not result.get("success"):
        error = result.get("error", "Unknown error")
        raise ValueError(f"Failed to index document: {pdf_path}. Error: {error}")

    response = rag.query(question)
    return response.text
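
# Illustrative one-shot usage sketch for quick_rag (hypothetical file path and
# question; keyword arguments are forwarded to RAGConfig):
#
#     answer = quick_rag("report.pdf", "Summarize the key findings", max_chunks=5)
#     print(answer)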