amd_gaia-0.14.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_gaia-0.14.1.dist-info/METADATA +768 -0
- amd_gaia-0.14.1.dist-info/RECORD +800 -0
- amd_gaia-0.14.1.dist-info/WHEEL +5 -0
- amd_gaia-0.14.1.dist-info/entry_points.txt +5 -0
- amd_gaia-0.14.1.dist-info/licenses/LICENSE.md +21 -0
- amd_gaia-0.14.1.dist-info/top_level.txt +1 -0
- gaia/__init__.py +2 -0
- gaia/agents/__init__.py +19 -0
- gaia/agents/base/__init__.py +9 -0
- gaia/agents/base/agent.py +2072 -0
- gaia/agents/base/api_agent.py +120 -0
- gaia/agents/base/console.py +1457 -0
- gaia/agents/base/mcp_agent.py +86 -0
- gaia/agents/base/tools.py +83 -0
- gaia/agents/blender/agent.py +556 -0
- gaia/agents/blender/agent_simple.py +135 -0
- gaia/agents/blender/app.py +211 -0
- gaia/agents/blender/app_simple.py +41 -0
- gaia/agents/blender/core/__init__.py +16 -0
- gaia/agents/blender/core/materials.py +506 -0
- gaia/agents/blender/core/objects.py +316 -0
- gaia/agents/blender/core/rendering.py +225 -0
- gaia/agents/blender/core/scene.py +220 -0
- gaia/agents/blender/core/view.py +146 -0
- gaia/agents/chat/__init__.py +9 -0
- gaia/agents/chat/agent.py +975 -0
- gaia/agents/chat/app.py +1058 -0
- gaia/agents/chat/session.py +508 -0
- gaia/agents/chat/tools/__init__.py +15 -0
- gaia/agents/chat/tools/file_tools.py +96 -0
- gaia/agents/chat/tools/rag_tools.py +1729 -0
- gaia/agents/chat/tools/shell_tools.py +436 -0
- gaia/agents/code/__init__.py +7 -0
- gaia/agents/code/agent.py +547 -0
- gaia/agents/code/app.py +266 -0
- gaia/agents/code/models.py +135 -0
- gaia/agents/code/orchestration/__init__.py +24 -0
- gaia/agents/code/orchestration/checklist_executor.py +1739 -0
- gaia/agents/code/orchestration/checklist_generator.py +709 -0
- gaia/agents/code/orchestration/factories/__init__.py +9 -0
- gaia/agents/code/orchestration/factories/base.py +63 -0
- gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -0
- gaia/agents/code/orchestration/factories/python_factory.py +106 -0
- gaia/agents/code/orchestration/orchestrator.py +610 -0
- gaia/agents/code/orchestration/project_analyzer.py +391 -0
- gaia/agents/code/orchestration/steps/__init__.py +67 -0
- gaia/agents/code/orchestration/steps/base.py +188 -0
- gaia/agents/code/orchestration/steps/error_handler.py +314 -0
- gaia/agents/code/orchestration/steps/nextjs.py +828 -0
- gaia/agents/code/orchestration/steps/python.py +307 -0
- gaia/agents/code/orchestration/template_catalog.py +463 -0
- gaia/agents/code/orchestration/workflows/__init__.py +14 -0
- gaia/agents/code/orchestration/workflows/base.py +80 -0
- gaia/agents/code/orchestration/workflows/nextjs.py +186 -0
- gaia/agents/code/orchestration/workflows/python.py +94 -0
- gaia/agents/code/prompts/__init__.py +11 -0
- gaia/agents/code/prompts/base_prompt.py +77 -0
- gaia/agents/code/prompts/code_patterns.py +1925 -0
- gaia/agents/code/prompts/nextjs_prompt.py +40 -0
- gaia/agents/code/prompts/python_prompt.py +109 -0
- gaia/agents/code/schema_inference.py +365 -0
- gaia/agents/code/system_prompt.py +41 -0
- gaia/agents/code/tools/__init__.py +42 -0
- gaia/agents/code/tools/cli_tools.py +1138 -0
- gaia/agents/code/tools/code_formatting.py +319 -0
- gaia/agents/code/tools/code_tools.py +769 -0
- gaia/agents/code/tools/error_fixing.py +1347 -0
- gaia/agents/code/tools/external_tools.py +180 -0
- gaia/agents/code/tools/file_io.py +845 -0
- gaia/agents/code/tools/prisma_tools.py +190 -0
- gaia/agents/code/tools/project_management.py +1016 -0
- gaia/agents/code/tools/testing.py +321 -0
- gaia/agents/code/tools/typescript_tools.py +122 -0
- gaia/agents/code/tools/validation_parsing.py +461 -0
- gaia/agents/code/tools/validation_tools.py +803 -0
- gaia/agents/code/tools/web_dev_tools.py +1744 -0
- gaia/agents/code/validators/__init__.py +16 -0
- gaia/agents/code/validators/antipattern_checker.py +241 -0
- gaia/agents/code/validators/ast_analyzer.py +197 -0
- gaia/agents/code/validators/requirements_validator.py +145 -0
- gaia/agents/code/validators/syntax_validator.py +171 -0
- gaia/agents/docker/__init__.py +7 -0
- gaia/agents/docker/agent.py +642 -0
- gaia/agents/jira/__init__.py +11 -0
- gaia/agents/jira/agent.py +894 -0
- gaia/agents/jira/jql_templates.py +299 -0
- gaia/agents/routing/__init__.py +7 -0
- gaia/agents/routing/agent.py +512 -0
- gaia/agents/routing/system_prompt.py +75 -0
- gaia/api/__init__.py +23 -0
- gaia/api/agent_registry.py +238 -0
- gaia/api/app.py +305 -0
- gaia/api/openai_server.py +575 -0
- gaia/api/schemas.py +186 -0
- gaia/api/sse_handler.py +370 -0
- gaia/apps/__init__.py +4 -0
- gaia/apps/llm/__init__.py +6 -0
- gaia/apps/llm/app.py +169 -0
- gaia/apps/summarize/app.py +633 -0
- gaia/apps/summarize/html_viewer.py +133 -0
- gaia/apps/summarize/pdf_formatter.py +284 -0
- gaia/audio/__init__.py +2 -0
- gaia/audio/audio_client.py +439 -0
- gaia/audio/audio_recorder.py +269 -0
- gaia/audio/kokoro_tts.py +599 -0
- gaia/audio/whisper_asr.py +432 -0
- gaia/chat/__init__.py +16 -0
- gaia/chat/app.py +430 -0
- gaia/chat/prompts.py +522 -0
- gaia/chat/sdk.py +1200 -0
- gaia/cli.py +5621 -0
- gaia/eval/batch_experiment.py +2332 -0
- gaia/eval/claude.py +542 -0
- gaia/eval/config.py +37 -0
- gaia/eval/email_generator.py +512 -0
- gaia/eval/eval.py +3179 -0
- gaia/eval/groundtruth.py +1130 -0
- gaia/eval/transcript_generator.py +582 -0
- gaia/eval/webapp/README.md +168 -0
- gaia/eval/webapp/node_modules/.bin/mime +16 -0
- gaia/eval/webapp/node_modules/.bin/mime.cmd +17 -0
- gaia/eval/webapp/node_modules/.bin/mime.ps1 +28 -0
- gaia/eval/webapp/node_modules/.package-lock.json +865 -0
- gaia/eval/webapp/node_modules/accepts/HISTORY.md +243 -0
- gaia/eval/webapp/node_modules/accepts/LICENSE +23 -0
- gaia/eval/webapp/node_modules/accepts/README.md +140 -0
- gaia/eval/webapp/node_modules/accepts/index.js +238 -0
- gaia/eval/webapp/node_modules/accepts/package.json +47 -0
- gaia/eval/webapp/node_modules/array-flatten/LICENSE +21 -0
- gaia/eval/webapp/node_modules/array-flatten/README.md +43 -0
- gaia/eval/webapp/node_modules/array-flatten/array-flatten.js +64 -0
- gaia/eval/webapp/node_modules/array-flatten/package.json +39 -0
- gaia/eval/webapp/node_modules/body-parser/HISTORY.md +672 -0
- gaia/eval/webapp/node_modules/body-parser/LICENSE +23 -0
- gaia/eval/webapp/node_modules/body-parser/README.md +476 -0
- gaia/eval/webapp/node_modules/body-parser/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/body-parser/index.js +156 -0
- gaia/eval/webapp/node_modules/body-parser/lib/read.js +205 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/json.js +247 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/raw.js +101 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/text.js +121 -0
- gaia/eval/webapp/node_modules/body-parser/lib/types/urlencoded.js +307 -0
- gaia/eval/webapp/node_modules/body-parser/package.json +56 -0
- gaia/eval/webapp/node_modules/bytes/History.md +97 -0
- gaia/eval/webapp/node_modules/bytes/LICENSE +23 -0
- gaia/eval/webapp/node_modules/bytes/Readme.md +152 -0
- gaia/eval/webapp/node_modules/bytes/index.js +170 -0
- gaia/eval/webapp/node_modules/bytes/package.json +42 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.eslintrc +17 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/.nycrc +9 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/CHANGELOG.md +30 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/LICENSE +21 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/README.md +62 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.js +10 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.d.ts +19 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.js +10 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.d.ts +1 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.d.ts +64 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.js +15 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/package.json +85 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.d.ts +3 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.js +4 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/test/index.js +63 -0
- gaia/eval/webapp/node_modules/call-bind-apply-helpers/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/call-bound/.eslintrc +13 -0
- gaia/eval/webapp/node_modules/call-bound/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/call-bound/.nycrc +9 -0
- gaia/eval/webapp/node_modules/call-bound/CHANGELOG.md +42 -0
- gaia/eval/webapp/node_modules/call-bound/LICENSE +21 -0
- gaia/eval/webapp/node_modules/call-bound/README.md +53 -0
- gaia/eval/webapp/node_modules/call-bound/index.d.ts +94 -0
- gaia/eval/webapp/node_modules/call-bound/index.js +19 -0
- gaia/eval/webapp/node_modules/call-bound/package.json +99 -0
- gaia/eval/webapp/node_modules/call-bound/test/index.js +61 -0
- gaia/eval/webapp/node_modules/call-bound/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/content-disposition/HISTORY.md +60 -0
- gaia/eval/webapp/node_modules/content-disposition/LICENSE +22 -0
- gaia/eval/webapp/node_modules/content-disposition/README.md +142 -0
- gaia/eval/webapp/node_modules/content-disposition/index.js +458 -0
- gaia/eval/webapp/node_modules/content-disposition/package.json +44 -0
- gaia/eval/webapp/node_modules/content-type/HISTORY.md +29 -0
- gaia/eval/webapp/node_modules/content-type/LICENSE +22 -0
- gaia/eval/webapp/node_modules/content-type/README.md +94 -0
- gaia/eval/webapp/node_modules/content-type/index.js +225 -0
- gaia/eval/webapp/node_modules/content-type/package.json +42 -0
- gaia/eval/webapp/node_modules/cookie/LICENSE +24 -0
- gaia/eval/webapp/node_modules/cookie/README.md +317 -0
- gaia/eval/webapp/node_modules/cookie/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/cookie/index.js +334 -0
- gaia/eval/webapp/node_modules/cookie/package.json +44 -0
- gaia/eval/webapp/node_modules/cookie-signature/.npmignore +4 -0
- gaia/eval/webapp/node_modules/cookie-signature/History.md +38 -0
- gaia/eval/webapp/node_modules/cookie-signature/Readme.md +42 -0
- gaia/eval/webapp/node_modules/cookie-signature/index.js +51 -0
- gaia/eval/webapp/node_modules/cookie-signature/package.json +18 -0
- gaia/eval/webapp/node_modules/debug/.coveralls.yml +1 -0
- gaia/eval/webapp/node_modules/debug/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/debug/.npmignore +9 -0
- gaia/eval/webapp/node_modules/debug/.travis.yml +14 -0
- gaia/eval/webapp/node_modules/debug/CHANGELOG.md +362 -0
- gaia/eval/webapp/node_modules/debug/LICENSE +19 -0
- gaia/eval/webapp/node_modules/debug/Makefile +50 -0
- gaia/eval/webapp/node_modules/debug/README.md +312 -0
- gaia/eval/webapp/node_modules/debug/component.json +19 -0
- gaia/eval/webapp/node_modules/debug/karma.conf.js +70 -0
- gaia/eval/webapp/node_modules/debug/node.js +1 -0
- gaia/eval/webapp/node_modules/debug/package.json +49 -0
- gaia/eval/webapp/node_modules/debug/src/browser.js +185 -0
- gaia/eval/webapp/node_modules/debug/src/debug.js +202 -0
- gaia/eval/webapp/node_modules/debug/src/index.js +10 -0
- gaia/eval/webapp/node_modules/debug/src/inspector-log.js +15 -0
- gaia/eval/webapp/node_modules/debug/src/node.js +248 -0
- gaia/eval/webapp/node_modules/depd/History.md +103 -0
- gaia/eval/webapp/node_modules/depd/LICENSE +22 -0
- gaia/eval/webapp/node_modules/depd/Readme.md +280 -0
- gaia/eval/webapp/node_modules/depd/index.js +538 -0
- gaia/eval/webapp/node_modules/depd/lib/browser/index.js +77 -0
- gaia/eval/webapp/node_modules/depd/package.json +45 -0
- gaia/eval/webapp/node_modules/destroy/LICENSE +23 -0
- gaia/eval/webapp/node_modules/destroy/README.md +63 -0
- gaia/eval/webapp/node_modules/destroy/index.js +209 -0
- gaia/eval/webapp/node_modules/destroy/package.json +48 -0
- gaia/eval/webapp/node_modules/dunder-proto/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/dunder-proto/.nycrc +13 -0
- gaia/eval/webapp/node_modules/dunder-proto/CHANGELOG.md +24 -0
- gaia/eval/webapp/node_modules/dunder-proto/LICENSE +21 -0
- gaia/eval/webapp/node_modules/dunder-proto/README.md +54 -0
- gaia/eval/webapp/node_modules/dunder-proto/get.d.ts +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/get.js +30 -0
- gaia/eval/webapp/node_modules/dunder-proto/package.json +76 -0
- gaia/eval/webapp/node_modules/dunder-proto/set.d.ts +5 -0
- gaia/eval/webapp/node_modules/dunder-proto/set.js +35 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/get.js +34 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/index.js +4 -0
- gaia/eval/webapp/node_modules/dunder-proto/test/set.js +50 -0
- gaia/eval/webapp/node_modules/dunder-proto/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/ee-first/LICENSE +22 -0
- gaia/eval/webapp/node_modules/ee-first/README.md +80 -0
- gaia/eval/webapp/node_modules/ee-first/index.js +95 -0
- gaia/eval/webapp/node_modules/ee-first/package.json +29 -0
- gaia/eval/webapp/node_modules/encodeurl/LICENSE +22 -0
- gaia/eval/webapp/node_modules/encodeurl/README.md +109 -0
- gaia/eval/webapp/node_modules/encodeurl/index.js +60 -0
- gaia/eval/webapp/node_modules/encodeurl/package.json +40 -0
- gaia/eval/webapp/node_modules/es-define-property/.eslintrc +13 -0
- gaia/eval/webapp/node_modules/es-define-property/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-define-property/.nycrc +9 -0
- gaia/eval/webapp/node_modules/es-define-property/CHANGELOG.md +29 -0
- gaia/eval/webapp/node_modules/es-define-property/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-define-property/README.md +49 -0
- gaia/eval/webapp/node_modules/es-define-property/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-define-property/index.js +14 -0
- gaia/eval/webapp/node_modules/es-define-property/package.json +81 -0
- gaia/eval/webapp/node_modules/es-define-property/test/index.js +56 -0
- gaia/eval/webapp/node_modules/es-define-property/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/es-errors/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/es-errors/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-errors/CHANGELOG.md +40 -0
- gaia/eval/webapp/node_modules/es-errors/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-errors/README.md +55 -0
- gaia/eval/webapp/node_modules/es-errors/eval.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/eval.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/index.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/package.json +80 -0
- gaia/eval/webapp/node_modules/es-errors/range.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/range.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/ref.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/ref.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/syntax.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/syntax.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/test/index.js +19 -0
- gaia/eval/webapp/node_modules/es-errors/tsconfig.json +49 -0
- gaia/eval/webapp/node_modules/es-errors/type.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/type.js +4 -0
- gaia/eval/webapp/node_modules/es-errors/uri.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-errors/uri.js +4 -0
- gaia/eval/webapp/node_modules/es-object-atoms/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/es-object-atoms/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/es-object-atoms/CHANGELOG.md +37 -0
- gaia/eval/webapp/node_modules/es-object-atoms/LICENSE +21 -0
- gaia/eval/webapp/node_modules/es-object-atoms/README.md +63 -0
- gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.js +11 -0
- gaia/eval/webapp/node_modules/es-object-atoms/ToObject.d.ts +7 -0
- gaia/eval/webapp/node_modules/es-object-atoms/ToObject.js +10 -0
- gaia/eval/webapp/node_modules/es-object-atoms/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/index.js +4 -0
- gaia/eval/webapp/node_modules/es-object-atoms/isObject.d.ts +3 -0
- gaia/eval/webapp/node_modules/es-object-atoms/isObject.js +6 -0
- gaia/eval/webapp/node_modules/es-object-atoms/package.json +80 -0
- gaia/eval/webapp/node_modules/es-object-atoms/test/index.js +38 -0
- gaia/eval/webapp/node_modules/es-object-atoms/tsconfig.json +6 -0
- gaia/eval/webapp/node_modules/escape-html/LICENSE +24 -0
- gaia/eval/webapp/node_modules/escape-html/Readme.md +43 -0
- gaia/eval/webapp/node_modules/escape-html/index.js +78 -0
- gaia/eval/webapp/node_modules/escape-html/package.json +24 -0
- gaia/eval/webapp/node_modules/etag/HISTORY.md +83 -0
- gaia/eval/webapp/node_modules/etag/LICENSE +22 -0
- gaia/eval/webapp/node_modules/etag/README.md +159 -0
- gaia/eval/webapp/node_modules/etag/index.js +131 -0
- gaia/eval/webapp/node_modules/etag/package.json +47 -0
- gaia/eval/webapp/node_modules/express/History.md +3656 -0
- gaia/eval/webapp/node_modules/express/LICENSE +24 -0
- gaia/eval/webapp/node_modules/express/Readme.md +260 -0
- gaia/eval/webapp/node_modules/express/index.js +11 -0
- gaia/eval/webapp/node_modules/express/lib/application.js +661 -0
- gaia/eval/webapp/node_modules/express/lib/express.js +116 -0
- gaia/eval/webapp/node_modules/express/lib/middleware/init.js +43 -0
- gaia/eval/webapp/node_modules/express/lib/middleware/query.js +47 -0
- gaia/eval/webapp/node_modules/express/lib/request.js +525 -0
- gaia/eval/webapp/node_modules/express/lib/response.js +1179 -0
- gaia/eval/webapp/node_modules/express/lib/router/index.js +673 -0
- gaia/eval/webapp/node_modules/express/lib/router/layer.js +181 -0
- gaia/eval/webapp/node_modules/express/lib/router/route.js +230 -0
- gaia/eval/webapp/node_modules/express/lib/utils.js +303 -0
- gaia/eval/webapp/node_modules/express/lib/view.js +182 -0
- gaia/eval/webapp/node_modules/express/package.json +102 -0
- gaia/eval/webapp/node_modules/finalhandler/HISTORY.md +210 -0
- gaia/eval/webapp/node_modules/finalhandler/LICENSE +22 -0
- gaia/eval/webapp/node_modules/finalhandler/README.md +147 -0
- gaia/eval/webapp/node_modules/finalhandler/SECURITY.md +25 -0
- gaia/eval/webapp/node_modules/finalhandler/index.js +341 -0
- gaia/eval/webapp/node_modules/finalhandler/package.json +47 -0
- gaia/eval/webapp/node_modules/forwarded/HISTORY.md +21 -0
- gaia/eval/webapp/node_modules/forwarded/LICENSE +22 -0
- gaia/eval/webapp/node_modules/forwarded/README.md +57 -0
- gaia/eval/webapp/node_modules/forwarded/index.js +90 -0
- gaia/eval/webapp/node_modules/forwarded/package.json +45 -0
- gaia/eval/webapp/node_modules/fresh/HISTORY.md +70 -0
- gaia/eval/webapp/node_modules/fresh/LICENSE +23 -0
- gaia/eval/webapp/node_modules/fresh/README.md +119 -0
- gaia/eval/webapp/node_modules/fresh/index.js +137 -0
- gaia/eval/webapp/node_modules/fresh/package.json +46 -0
- gaia/eval/webapp/node_modules/fs/README.md +9 -0
- gaia/eval/webapp/node_modules/fs/package.json +20 -0
- gaia/eval/webapp/node_modules/function-bind/.eslintrc +21 -0
- gaia/eval/webapp/node_modules/function-bind/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/function-bind/.github/SECURITY.md +3 -0
- gaia/eval/webapp/node_modules/function-bind/.nycrc +13 -0
- gaia/eval/webapp/node_modules/function-bind/CHANGELOG.md +136 -0
- gaia/eval/webapp/node_modules/function-bind/LICENSE +20 -0
- gaia/eval/webapp/node_modules/function-bind/README.md +46 -0
- gaia/eval/webapp/node_modules/function-bind/implementation.js +84 -0
- gaia/eval/webapp/node_modules/function-bind/index.js +5 -0
- gaia/eval/webapp/node_modules/function-bind/package.json +87 -0
- gaia/eval/webapp/node_modules/function-bind/test/.eslintrc +9 -0
- gaia/eval/webapp/node_modules/function-bind/test/index.js +252 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.eslintrc +42 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/get-intrinsic/.nycrc +9 -0
- gaia/eval/webapp/node_modules/get-intrinsic/CHANGELOG.md +186 -0
- gaia/eval/webapp/node_modules/get-intrinsic/LICENSE +21 -0
- gaia/eval/webapp/node_modules/get-intrinsic/README.md +71 -0
- gaia/eval/webapp/node_modules/get-intrinsic/index.js +378 -0
- gaia/eval/webapp/node_modules/get-intrinsic/package.json +97 -0
- gaia/eval/webapp/node_modules/get-intrinsic/test/GetIntrinsic.js +274 -0
- gaia/eval/webapp/node_modules/get-proto/.eslintrc +10 -0
- gaia/eval/webapp/node_modules/get-proto/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/get-proto/.nycrc +9 -0
- gaia/eval/webapp/node_modules/get-proto/CHANGELOG.md +21 -0
- gaia/eval/webapp/node_modules/get-proto/LICENSE +21 -0
- gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.d.ts +5 -0
- gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.js +6 -0
- gaia/eval/webapp/node_modules/get-proto/README.md +50 -0
- gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.d.ts +3 -0
- gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.js +4 -0
- gaia/eval/webapp/node_modules/get-proto/index.d.ts +5 -0
- gaia/eval/webapp/node_modules/get-proto/index.js +27 -0
- gaia/eval/webapp/node_modules/get-proto/package.json +81 -0
- gaia/eval/webapp/node_modules/get-proto/test/index.js +68 -0
- gaia/eval/webapp/node_modules/get-proto/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/gopd/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/gopd/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/gopd/CHANGELOG.md +45 -0
- gaia/eval/webapp/node_modules/gopd/LICENSE +21 -0
- gaia/eval/webapp/node_modules/gopd/README.md +40 -0
- gaia/eval/webapp/node_modules/gopd/gOPD.d.ts +1 -0
- gaia/eval/webapp/node_modules/gopd/gOPD.js +4 -0
- gaia/eval/webapp/node_modules/gopd/index.d.ts +5 -0
- gaia/eval/webapp/node_modules/gopd/index.js +15 -0
- gaia/eval/webapp/node_modules/gopd/package.json +77 -0
- gaia/eval/webapp/node_modules/gopd/test/index.js +36 -0
- gaia/eval/webapp/node_modules/gopd/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/has-symbols/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/has-symbols/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/has-symbols/.nycrc +9 -0
- gaia/eval/webapp/node_modules/has-symbols/CHANGELOG.md +91 -0
- gaia/eval/webapp/node_modules/has-symbols/LICENSE +21 -0
- gaia/eval/webapp/node_modules/has-symbols/README.md +46 -0
- gaia/eval/webapp/node_modules/has-symbols/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/has-symbols/index.js +14 -0
- gaia/eval/webapp/node_modules/has-symbols/package.json +111 -0
- gaia/eval/webapp/node_modules/has-symbols/shams.d.ts +3 -0
- gaia/eval/webapp/node_modules/has-symbols/shams.js +45 -0
- gaia/eval/webapp/node_modules/has-symbols/test/index.js +22 -0
- gaia/eval/webapp/node_modules/has-symbols/test/shams/core-js.js +29 -0
- gaia/eval/webapp/node_modules/has-symbols/test/shams/get-own-property-symbols.js +29 -0
- gaia/eval/webapp/node_modules/has-symbols/test/tests.js +58 -0
- gaia/eval/webapp/node_modules/has-symbols/tsconfig.json +10 -0
- gaia/eval/webapp/node_modules/hasown/.eslintrc +5 -0
- gaia/eval/webapp/node_modules/hasown/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/hasown/.nycrc +13 -0
- gaia/eval/webapp/node_modules/hasown/CHANGELOG.md +40 -0
- gaia/eval/webapp/node_modules/hasown/LICENSE +21 -0
- gaia/eval/webapp/node_modules/hasown/README.md +40 -0
- gaia/eval/webapp/node_modules/hasown/index.d.ts +3 -0
- gaia/eval/webapp/node_modules/hasown/index.js +8 -0
- gaia/eval/webapp/node_modules/hasown/package.json +92 -0
- gaia/eval/webapp/node_modules/hasown/tsconfig.json +6 -0
- gaia/eval/webapp/node_modules/http-errors/HISTORY.md +180 -0
- gaia/eval/webapp/node_modules/http-errors/LICENSE +23 -0
- gaia/eval/webapp/node_modules/http-errors/README.md +169 -0
- gaia/eval/webapp/node_modules/http-errors/index.js +289 -0
- gaia/eval/webapp/node_modules/http-errors/package.json +50 -0
- gaia/eval/webapp/node_modules/iconv-lite/Changelog.md +162 -0
- gaia/eval/webapp/node_modules/iconv-lite/LICENSE +21 -0
- gaia/eval/webapp/node_modules/iconv-lite/README.md +156 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-codec.js +555 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-data.js +176 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/index.js +22 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/internal.js +188 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-codec.js +72 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data-generated.js +451 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data.js +174 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/big5-added.json +122 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp936.json +264 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp949.json +273 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp950.json +177 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/eucjp.json +182 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gb18030-ranges.json +1 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gbk-added.json +55 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/shiftjis.json +125 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/utf16.js +177 -0
- gaia/eval/webapp/node_modules/iconv-lite/encodings/utf7.js +290 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/bom-handling.js +52 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/extend-node.js +217 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/index.d.ts +24 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/index.js +153 -0
- gaia/eval/webapp/node_modules/iconv-lite/lib/streams.js +121 -0
- gaia/eval/webapp/node_modules/iconv-lite/package.json +46 -0
- gaia/eval/webapp/node_modules/inherits/LICENSE +16 -0
- gaia/eval/webapp/node_modules/inherits/README.md +42 -0
- gaia/eval/webapp/node_modules/inherits/inherits.js +9 -0
- gaia/eval/webapp/node_modules/inherits/inherits_browser.js +27 -0
- gaia/eval/webapp/node_modules/inherits/package.json +29 -0
- gaia/eval/webapp/node_modules/ipaddr.js/LICENSE +19 -0
- gaia/eval/webapp/node_modules/ipaddr.js/README.md +233 -0
- gaia/eval/webapp/node_modules/ipaddr.js/ipaddr.min.js +1 -0
- gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js +673 -0
- gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js.d.ts +68 -0
- gaia/eval/webapp/node_modules/ipaddr.js/package.json +35 -0
- gaia/eval/webapp/node_modules/math-intrinsics/.eslintrc +16 -0
- gaia/eval/webapp/node_modules/math-intrinsics/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/math-intrinsics/CHANGELOG.md +24 -0
- gaia/eval/webapp/node_modules/math-intrinsics/LICENSE +21 -0
- gaia/eval/webapp/node_modules/math-intrinsics/README.md +50 -0
- gaia/eval/webapp/node_modules/math-intrinsics/abs.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/abs.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.js +5 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.js +5 -0
- gaia/eval/webapp/node_modules/math-intrinsics/floor.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/floor.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isFinite.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isFinite.js +12 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isInteger.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isInteger.js +16 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNaN.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNaN.js +6 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.js +6 -0
- gaia/eval/webapp/node_modules/math-intrinsics/max.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/max.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/min.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/min.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/mod.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/mod.js +9 -0
- gaia/eval/webapp/node_modules/math-intrinsics/package.json +86 -0
- gaia/eval/webapp/node_modules/math-intrinsics/pow.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/pow.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/round.d.ts +1 -0
- gaia/eval/webapp/node_modules/math-intrinsics/round.js +4 -0
- gaia/eval/webapp/node_modules/math-intrinsics/sign.d.ts +3 -0
- gaia/eval/webapp/node_modules/math-intrinsics/sign.js +11 -0
- gaia/eval/webapp/node_modules/math-intrinsics/test/index.js +192 -0
- gaia/eval/webapp/node_modules/math-intrinsics/tsconfig.json +3 -0
- gaia/eval/webapp/node_modules/media-typer/HISTORY.md +22 -0
- gaia/eval/webapp/node_modules/media-typer/LICENSE +22 -0
- gaia/eval/webapp/node_modules/media-typer/README.md +81 -0
- gaia/eval/webapp/node_modules/media-typer/index.js +270 -0
- gaia/eval/webapp/node_modules/media-typer/package.json +26 -0
- gaia/eval/webapp/node_modules/merge-descriptors/HISTORY.md +21 -0
- gaia/eval/webapp/node_modules/merge-descriptors/LICENSE +23 -0
- gaia/eval/webapp/node_modules/merge-descriptors/README.md +49 -0
- gaia/eval/webapp/node_modules/merge-descriptors/index.js +60 -0
- gaia/eval/webapp/node_modules/merge-descriptors/package.json +39 -0
- gaia/eval/webapp/node_modules/methods/HISTORY.md +29 -0
- gaia/eval/webapp/node_modules/methods/LICENSE +24 -0
- gaia/eval/webapp/node_modules/methods/README.md +51 -0
- gaia/eval/webapp/node_modules/methods/index.js +69 -0
- gaia/eval/webapp/node_modules/methods/package.json +36 -0
- gaia/eval/webapp/node_modules/mime/.npmignore +0 -0
- gaia/eval/webapp/node_modules/mime/CHANGELOG.md +164 -0
- gaia/eval/webapp/node_modules/mime/LICENSE +21 -0
- gaia/eval/webapp/node_modules/mime/README.md +90 -0
- gaia/eval/webapp/node_modules/mime/cli.js +8 -0
- gaia/eval/webapp/node_modules/mime/mime.js +108 -0
- gaia/eval/webapp/node_modules/mime/package.json +44 -0
- gaia/eval/webapp/node_modules/mime/src/build.js +53 -0
- gaia/eval/webapp/node_modules/mime/src/test.js +60 -0
- gaia/eval/webapp/node_modules/mime/types.json +1 -0
- gaia/eval/webapp/node_modules/mime-db/HISTORY.md +507 -0
- gaia/eval/webapp/node_modules/mime-db/LICENSE +23 -0
- gaia/eval/webapp/node_modules/mime-db/README.md +100 -0
- gaia/eval/webapp/node_modules/mime-db/db.json +8519 -0
- gaia/eval/webapp/node_modules/mime-db/index.js +12 -0
- gaia/eval/webapp/node_modules/mime-db/package.json +60 -0
- gaia/eval/webapp/node_modules/mime-types/HISTORY.md +397 -0
- gaia/eval/webapp/node_modules/mime-types/LICENSE +23 -0
- gaia/eval/webapp/node_modules/mime-types/README.md +113 -0
- gaia/eval/webapp/node_modules/mime-types/index.js +188 -0
- gaia/eval/webapp/node_modules/mime-types/package.json +44 -0
- gaia/eval/webapp/node_modules/ms/index.js +152 -0
- gaia/eval/webapp/node_modules/ms/license.md +21 -0
- gaia/eval/webapp/node_modules/ms/package.json +37 -0
- gaia/eval/webapp/node_modules/ms/readme.md +51 -0
- gaia/eval/webapp/node_modules/negotiator/HISTORY.md +108 -0
- gaia/eval/webapp/node_modules/negotiator/LICENSE +24 -0
- gaia/eval/webapp/node_modules/negotiator/README.md +203 -0
- gaia/eval/webapp/node_modules/negotiator/index.js +82 -0
- gaia/eval/webapp/node_modules/negotiator/lib/charset.js +169 -0
- gaia/eval/webapp/node_modules/negotiator/lib/encoding.js +184 -0
- gaia/eval/webapp/node_modules/negotiator/lib/language.js +179 -0
- gaia/eval/webapp/node_modules/negotiator/lib/mediaType.js +294 -0
- gaia/eval/webapp/node_modules/negotiator/package.json +42 -0
- gaia/eval/webapp/node_modules/object-inspect/.eslintrc +53 -0
- gaia/eval/webapp/node_modules/object-inspect/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/object-inspect/.nycrc +13 -0
- gaia/eval/webapp/node_modules/object-inspect/CHANGELOG.md +424 -0
- gaia/eval/webapp/node_modules/object-inspect/LICENSE +21 -0
- gaia/eval/webapp/node_modules/object-inspect/example/all.js +23 -0
- gaia/eval/webapp/node_modules/object-inspect/example/circular.js +6 -0
- gaia/eval/webapp/node_modules/object-inspect/example/fn.js +5 -0
- gaia/eval/webapp/node_modules/object-inspect/example/inspect.js +10 -0
- gaia/eval/webapp/node_modules/object-inspect/index.js +544 -0
- gaia/eval/webapp/node_modules/object-inspect/package-support.json +20 -0
- gaia/eval/webapp/node_modules/object-inspect/package.json +105 -0
- gaia/eval/webapp/node_modules/object-inspect/readme.markdown +84 -0
- gaia/eval/webapp/node_modules/object-inspect/test/bigint.js +58 -0
- gaia/eval/webapp/node_modules/object-inspect/test/browser/dom.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/circular.js +16 -0
- gaia/eval/webapp/node_modules/object-inspect/test/deep.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/element.js +53 -0
- gaia/eval/webapp/node_modules/object-inspect/test/err.js +48 -0
- gaia/eval/webapp/node_modules/object-inspect/test/fakes.js +29 -0
- gaia/eval/webapp/node_modules/object-inspect/test/fn.js +76 -0
- gaia/eval/webapp/node_modules/object-inspect/test/global.js +17 -0
- gaia/eval/webapp/node_modules/object-inspect/test/has.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/holes.js +15 -0
- gaia/eval/webapp/node_modules/object-inspect/test/indent-option.js +271 -0
- gaia/eval/webapp/node_modules/object-inspect/test/inspect.js +139 -0
- gaia/eval/webapp/node_modules/object-inspect/test/lowbyte.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/number.js +58 -0
- gaia/eval/webapp/node_modules/object-inspect/test/quoteStyle.js +26 -0
- gaia/eval/webapp/node_modules/object-inspect/test/toStringTag.js +40 -0
- gaia/eval/webapp/node_modules/object-inspect/test/undef.js +12 -0
- gaia/eval/webapp/node_modules/object-inspect/test/values.js +261 -0
- gaia/eval/webapp/node_modules/object-inspect/test-core-js.js +26 -0
- gaia/eval/webapp/node_modules/object-inspect/util.inspect.js +1 -0
- gaia/eval/webapp/node_modules/on-finished/HISTORY.md +98 -0
- gaia/eval/webapp/node_modules/on-finished/LICENSE +23 -0
- gaia/eval/webapp/node_modules/on-finished/README.md +162 -0
- gaia/eval/webapp/node_modules/on-finished/index.js +234 -0
- gaia/eval/webapp/node_modules/on-finished/package.json +39 -0
- gaia/eval/webapp/node_modules/parseurl/HISTORY.md +58 -0
- gaia/eval/webapp/node_modules/parseurl/LICENSE +24 -0
- gaia/eval/webapp/node_modules/parseurl/README.md +133 -0
- gaia/eval/webapp/node_modules/parseurl/index.js +158 -0
- gaia/eval/webapp/node_modules/parseurl/package.json +40 -0
- gaia/eval/webapp/node_modules/path/.npmignore +1 -0
- gaia/eval/webapp/node_modules/path/LICENSE +18 -0
- gaia/eval/webapp/node_modules/path/README.md +15 -0
- gaia/eval/webapp/node_modules/path/package.json +24 -0
- gaia/eval/webapp/node_modules/path/path.js +628 -0
- gaia/eval/webapp/node_modules/path-to-regexp/LICENSE +21 -0
- gaia/eval/webapp/node_modules/path-to-regexp/Readme.md +35 -0
- gaia/eval/webapp/node_modules/path-to-regexp/index.js +156 -0
- gaia/eval/webapp/node_modules/path-to-regexp/package.json +30 -0
- gaia/eval/webapp/node_modules/process/.eslintrc +21 -0
- gaia/eval/webapp/node_modules/process/LICENSE +22 -0
- gaia/eval/webapp/node_modules/process/README.md +26 -0
- gaia/eval/webapp/node_modules/process/browser.js +184 -0
- gaia/eval/webapp/node_modules/process/index.js +2 -0
- gaia/eval/webapp/node_modules/process/package.json +27 -0
- gaia/eval/webapp/node_modules/process/test.js +199 -0
- gaia/eval/webapp/node_modules/proxy-addr/HISTORY.md +161 -0
- gaia/eval/webapp/node_modules/proxy-addr/LICENSE +22 -0
- gaia/eval/webapp/node_modules/proxy-addr/README.md +139 -0
- gaia/eval/webapp/node_modules/proxy-addr/index.js +327 -0
- gaia/eval/webapp/node_modules/proxy-addr/package.json +47 -0
- gaia/eval/webapp/node_modules/qs/.editorconfig +46 -0
- gaia/eval/webapp/node_modules/qs/.eslintrc +38 -0
- gaia/eval/webapp/node_modules/qs/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/qs/.nycrc +13 -0
- gaia/eval/webapp/node_modules/qs/CHANGELOG.md +600 -0
- gaia/eval/webapp/node_modules/qs/LICENSE.md +29 -0
- gaia/eval/webapp/node_modules/qs/README.md +709 -0
- gaia/eval/webapp/node_modules/qs/dist/qs.js +90 -0
- gaia/eval/webapp/node_modules/qs/lib/formats.js +23 -0
- gaia/eval/webapp/node_modules/qs/lib/index.js +11 -0
- gaia/eval/webapp/node_modules/qs/lib/parse.js +296 -0
- gaia/eval/webapp/node_modules/qs/lib/stringify.js +351 -0
- gaia/eval/webapp/node_modules/qs/lib/utils.js +265 -0
- gaia/eval/webapp/node_modules/qs/package.json +91 -0
- gaia/eval/webapp/node_modules/qs/test/empty-keys-cases.js +267 -0
- gaia/eval/webapp/node_modules/qs/test/parse.js +1170 -0
- gaia/eval/webapp/node_modules/qs/test/stringify.js +1298 -0
- gaia/eval/webapp/node_modules/qs/test/utils.js +136 -0
- gaia/eval/webapp/node_modules/range-parser/HISTORY.md +56 -0
- gaia/eval/webapp/node_modules/range-parser/LICENSE +23 -0
- gaia/eval/webapp/node_modules/range-parser/README.md +84 -0
- gaia/eval/webapp/node_modules/range-parser/index.js +162 -0
- gaia/eval/webapp/node_modules/range-parser/package.json +44 -0
- gaia/eval/webapp/node_modules/raw-body/HISTORY.md +308 -0
- gaia/eval/webapp/node_modules/raw-body/LICENSE +22 -0
- gaia/eval/webapp/node_modules/raw-body/README.md +223 -0
- gaia/eval/webapp/node_modules/raw-body/SECURITY.md +24 -0
- gaia/eval/webapp/node_modules/raw-body/index.d.ts +87 -0
- gaia/eval/webapp/node_modules/raw-body/index.js +336 -0
- gaia/eval/webapp/node_modules/raw-body/package.json +49 -0
- gaia/eval/webapp/node_modules/safe-buffer/LICENSE +21 -0
- gaia/eval/webapp/node_modules/safe-buffer/README.md +584 -0
- gaia/eval/webapp/node_modules/safe-buffer/index.d.ts +187 -0
- gaia/eval/webapp/node_modules/safe-buffer/index.js +65 -0
- gaia/eval/webapp/node_modules/safe-buffer/package.json +51 -0
- gaia/eval/webapp/node_modules/safer-buffer/LICENSE +21 -0
- gaia/eval/webapp/node_modules/safer-buffer/Porting-Buffer.md +268 -0
- gaia/eval/webapp/node_modules/safer-buffer/Readme.md +156 -0
- gaia/eval/webapp/node_modules/safer-buffer/dangerous.js +58 -0
- gaia/eval/webapp/node_modules/safer-buffer/package.json +34 -0
- gaia/eval/webapp/node_modules/safer-buffer/safer.js +77 -0
- gaia/eval/webapp/node_modules/safer-buffer/tests.js +406 -0
- gaia/eval/webapp/node_modules/send/HISTORY.md +526 -0
- gaia/eval/webapp/node_modules/send/LICENSE +23 -0
- gaia/eval/webapp/node_modules/send/README.md +327 -0
- gaia/eval/webapp/node_modules/send/SECURITY.md +24 -0
- gaia/eval/webapp/node_modules/send/index.js +1142 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/HISTORY.md +14 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/LICENSE +22 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/README.md +128 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/index.js +60 -0
- gaia/eval/webapp/node_modules/send/node_modules/encodeurl/package.json +40 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/index.js +162 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/license.md +21 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/package.json +38 -0
- gaia/eval/webapp/node_modules/send/node_modules/ms/readme.md +59 -0
- gaia/eval/webapp/node_modules/send/package.json +62 -0
- gaia/eval/webapp/node_modules/serve-static/HISTORY.md +487 -0
- gaia/eval/webapp/node_modules/serve-static/LICENSE +25 -0
- gaia/eval/webapp/node_modules/serve-static/README.md +257 -0
- gaia/eval/webapp/node_modules/serve-static/index.js +209 -0
- gaia/eval/webapp/node_modules/serve-static/package.json +42 -0
- gaia/eval/webapp/node_modules/setprototypeof/LICENSE +13 -0
- gaia/eval/webapp/node_modules/setprototypeof/README.md +31 -0
- gaia/eval/webapp/node_modules/setprototypeof/index.d.ts +2 -0
- gaia/eval/webapp/node_modules/setprototypeof/index.js +17 -0
- gaia/eval/webapp/node_modules/setprototypeof/package.json +38 -0
- gaia/eval/webapp/node_modules/setprototypeof/test/index.js +24 -0
- gaia/eval/webapp/node_modules/side-channel/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel/.eslintrc +12 -0
- gaia/eval/webapp/node_modules/side-channel/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel/CHANGELOG.md +110 -0
- gaia/eval/webapp/node_modules/side-channel/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel/README.md +61 -0
- gaia/eval/webapp/node_modules/side-channel/index.d.ts +14 -0
- gaia/eval/webapp/node_modules/side-channel/index.js +43 -0
- gaia/eval/webapp/node_modules/side-channel/package.json +85 -0
- gaia/eval/webapp/node_modules/side-channel/test/index.js +104 -0
- gaia/eval/webapp/node_modules/side-channel/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-list/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-list/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/side-channel-list/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-list/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-list/CHANGELOG.md +15 -0
- gaia/eval/webapp/node_modules/side-channel-list/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-list/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-list/index.d.ts +13 -0
- gaia/eval/webapp/node_modules/side-channel-list/index.js +113 -0
- gaia/eval/webapp/node_modules/side-channel-list/list.d.ts +14 -0
- gaia/eval/webapp/node_modules/side-channel-list/package.json +77 -0
- gaia/eval/webapp/node_modules/side-channel-list/test/index.js +104 -0
- gaia/eval/webapp/node_modules/side-channel-list/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-map/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-map/.eslintrc +11 -0
- gaia/eval/webapp/node_modules/side-channel-map/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-map/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-map/CHANGELOG.md +22 -0
- gaia/eval/webapp/node_modules/side-channel-map/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-map/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-map/index.d.ts +15 -0
- gaia/eval/webapp/node_modules/side-channel-map/index.js +68 -0
- gaia/eval/webapp/node_modules/side-channel-map/package.json +80 -0
- gaia/eval/webapp/node_modules/side-channel-map/test/index.js +114 -0
- gaia/eval/webapp/node_modules/side-channel-map/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.editorconfig +9 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.eslintrc +12 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.github/FUNDING.yml +12 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/.nycrc +13 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/CHANGELOG.md +28 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/LICENSE +21 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/README.md +62 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/index.d.ts +15 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/index.js +84 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/package.json +87 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/test/index.js +114 -0
- gaia/eval/webapp/node_modules/side-channel-weakmap/tsconfig.json +9 -0
- gaia/eval/webapp/node_modules/statuses/HISTORY.md +82 -0
- gaia/eval/webapp/node_modules/statuses/LICENSE +23 -0
- gaia/eval/webapp/node_modules/statuses/README.md +136 -0
- gaia/eval/webapp/node_modules/statuses/codes.json +65 -0
- gaia/eval/webapp/node_modules/statuses/index.js +146 -0
- gaia/eval/webapp/node_modules/statuses/package.json +49 -0
- gaia/eval/webapp/node_modules/toidentifier/HISTORY.md +9 -0
- gaia/eval/webapp/node_modules/toidentifier/LICENSE +21 -0
- gaia/eval/webapp/node_modules/toidentifier/README.md +61 -0
- gaia/eval/webapp/node_modules/toidentifier/index.js +32 -0
- gaia/eval/webapp/node_modules/toidentifier/package.json +38 -0
- gaia/eval/webapp/node_modules/type-is/HISTORY.md +259 -0
- gaia/eval/webapp/node_modules/type-is/LICENSE +23 -0
- gaia/eval/webapp/node_modules/type-is/README.md +170 -0
- gaia/eval/webapp/node_modules/type-is/index.js +266 -0
- gaia/eval/webapp/node_modules/type-is/package.json +45 -0
- gaia/eval/webapp/node_modules/unpipe/HISTORY.md +4 -0
- gaia/eval/webapp/node_modules/unpipe/LICENSE +22 -0
- gaia/eval/webapp/node_modules/unpipe/README.md +43 -0
- gaia/eval/webapp/node_modules/unpipe/index.js +69 -0
- gaia/eval/webapp/node_modules/unpipe/package.json +27 -0
- gaia/eval/webapp/node_modules/util/LICENSE +18 -0
- gaia/eval/webapp/node_modules/util/README.md +15 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/LICENSE +16 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/README.md +42 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits.js +7 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits_browser.js +23 -0
- gaia/eval/webapp/node_modules/util/node_modules/inherits/package.json +29 -0
- gaia/eval/webapp/node_modules/util/package.json +35 -0
- gaia/eval/webapp/node_modules/util/support/isBuffer.js +3 -0
- gaia/eval/webapp/node_modules/util/support/isBufferBrowser.js +6 -0
- gaia/eval/webapp/node_modules/util/util.js +586 -0
- gaia/eval/webapp/node_modules/utils-merge/.npmignore +9 -0
- gaia/eval/webapp/node_modules/utils-merge/LICENSE +20 -0
- gaia/eval/webapp/node_modules/utils-merge/README.md +34 -0
- gaia/eval/webapp/node_modules/utils-merge/index.js +23 -0
- gaia/eval/webapp/node_modules/utils-merge/package.json +40 -0
- gaia/eval/webapp/node_modules/vary/HISTORY.md +39 -0
- gaia/eval/webapp/node_modules/vary/LICENSE +22 -0
- gaia/eval/webapp/node_modules/vary/README.md +101 -0
- gaia/eval/webapp/node_modules/vary/index.js +149 -0
- gaia/eval/webapp/node_modules/vary/package.json +43 -0
- gaia/eval/webapp/package-lock.json +875 -0
- gaia/eval/webapp/package.json +21 -0
- gaia/eval/webapp/public/app.js +3403 -0
- gaia/eval/webapp/public/index.html +88 -0
- gaia/eval/webapp/public/styles.css +3661 -0
- gaia/eval/webapp/server.js +416 -0
- gaia/eval/webapp/test-setup.js +73 -0
- gaia/llm/__init__.py +2 -0
- gaia/llm/lemonade_client.py +3083 -0
- gaia/llm/lemonade_manager.py +269 -0
- gaia/llm/llm_client.py +729 -0
- gaia/llm/vlm_client.py +307 -0
- gaia/logger.py +189 -0
- gaia/mcp/agent_mcp_server.py +245 -0
- gaia/mcp/blender_mcp_client.py +138 -0
- gaia/mcp/blender_mcp_server.py +648 -0
- gaia/mcp/context7_cache.py +332 -0
- gaia/mcp/external_services.py +518 -0
- gaia/mcp/mcp_bridge.py +550 -0
- gaia/mcp/servers/__init__.py +6 -0
- gaia/mcp/servers/docker_mcp.py +83 -0
- gaia/rag/__init__.py +10 -0
- gaia/rag/app.py +293 -0
- gaia/rag/demo.py +304 -0
- gaia/rag/pdf_utils.py +235 -0
- gaia/rag/sdk.py +2194 -0
- gaia/security.py +163 -0
- gaia/talk/app.py +289 -0
- gaia/talk/sdk.py +538 -0
- gaia/util.py +46 -0
- gaia/version.py +100 -0
gaia/eval/eval.py
ADDED
|
@@ -0,0 +1,3179 @@
|
|
|
1
|
+
# Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
13
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
14
|
+
|
|
15
|
+
from gaia.eval.claude import ClaudeClient
|
|
16
|
+
from gaia.logger import get_logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Evaluator:
|
|
20
|
+
"""Evaluates AI model performance across various use cases (summarization, Q&A, RAG, etc.)."""
|
|
21
|
+
|
|
22
|
+
def __init__(self, model="claude-sonnet-4-20250514"):
|
|
23
|
+
self.log = get_logger(__name__)
|
|
24
|
+
# Increase max_tokens to 4096 to avoid truncation of complex JSON responses
|
|
25
|
+
self.claude = ClaudeClient(model=model, max_tokens=4096)
|
|
26
|
+
|
|
27
|
+
def calculate_similarity(self, text1: str, text2: str) -> float:
|
|
28
|
+
"""
|
|
29
|
+
Calculate cosine similarity between two texts using TF-IDF vectors.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
text1: First text (ground truth)
|
|
33
|
+
text2: Second text (response)
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
float: Cosine similarity score between 0 and 1
|
|
37
|
+
"""
|
|
38
|
+
if not text1.strip() or not text2.strip():
|
|
39
|
+
return 0.0
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
|
|
43
|
+
vectors = vectorizer.fit_transform([text1, text2])
|
|
44
|
+
similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
|
|
45
|
+
return float(similarity)
|
|
46
|
+
except Exception as e:
|
|
47
|
+
self.log.warning(f"Error calculating similarity: {e}")
|
|
48
|
+
return 0.0
|
|
49
|
+
|
|
50
|
+
def determine_pass_fail(
|
|
51
|
+
self, similarity: float, threshold: float, claude_analysis: Dict = None
|
|
52
|
+
) -> Dict:
|
|
53
|
+
"""
|
|
54
|
+
Determine pass/fail based on comprehensive evaluation criteria.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
similarity: Similarity score between ground truth and response
|
|
58
|
+
threshold: Similarity threshold
|
|
59
|
+
claude_analysis: Claude's qualitative analysis (correctness, completeness, etc.)
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
Dict containing pass/fail determination and reasoning
|
|
63
|
+
"""
|
|
64
|
+
# Start with similarity-based evaluation
|
|
65
|
+
similarity_pass = similarity >= threshold
|
|
66
|
+
|
|
67
|
+
# If no Claude analysis available, fall back to similarity only
|
|
68
|
+
if not claude_analysis:
|
|
69
|
+
return {
|
|
70
|
+
"is_pass": similarity_pass,
|
|
71
|
+
"pass_fail": "pass" if similarity_pass else "fail",
|
|
72
|
+
"criteria": "similarity_only",
|
|
73
|
+
"reasoning": f"Similarity score {similarity:.3f} {'meets' if similarity_pass else 'below'} threshold {threshold:.3f}",
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
# Extract Claude's ratings
|
|
77
|
+
ratings = {}
|
|
78
|
+
for criterion in ["correctness", "completeness", "conciseness", "relevance"]:
|
|
79
|
+
if criterion in claude_analysis:
|
|
80
|
+
rating = claude_analysis[criterion].get("rating", "").lower()
|
|
81
|
+
ratings[criterion] = rating
|
|
82
|
+
|
|
83
|
+
# Define scoring system: excellent=4, good=3, fair=2, poor=1
|
|
84
|
+
score_map = {"excellent": 4, "good": 3, "fair": 2, "poor": 1}
|
|
85
|
+
|
|
86
|
+
# Calculate weighted scores (correctness and completeness are more important)
|
|
87
|
+
weights = {
|
|
88
|
+
"correctness": 0.4,
|
|
89
|
+
"completeness": 0.3,
|
|
90
|
+
"conciseness": 0.15,
|
|
91
|
+
"relevance": 0.15,
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
total_score = 0
|
|
95
|
+
max_possible = 0
|
|
96
|
+
criteria_details = []
|
|
97
|
+
|
|
98
|
+
for criterion, weight in weights.items():
|
|
99
|
+
if criterion in ratings:
|
|
100
|
+
rating = ratings[criterion]
|
|
101
|
+
score = score_map.get(rating, 1)
|
|
102
|
+
weighted_score = score * weight
|
|
103
|
+
total_score += weighted_score
|
|
104
|
+
max_possible += 4 * weight
|
|
105
|
+
criteria_details.append(f"{criterion}: {rating} ({score}/4)")
|
|
106
|
+
|
|
107
|
+
# Calculate normalized score (0-1)
|
|
108
|
+
normalized_score = total_score / max_possible if max_possible > 0 else 0
|
|
109
|
+
|
|
110
|
+
# Determine pass/fail using combined criteria:
|
|
111
|
+
# 1. Must meet minimum qualitative threshold (normalized score >= 0.6)
|
|
112
|
+
# 2. Correctness must be at least "fair"
|
|
113
|
+
# 3. Either high similarity OR good qualitative scores can pass
|
|
114
|
+
|
|
115
|
+
correctness_acceptable = ratings.get("correctness", "poor") in [
|
|
116
|
+
"fair",
|
|
117
|
+
"good",
|
|
118
|
+
"excellent",
|
|
119
|
+
]
|
|
120
|
+
qualitative_pass = normalized_score >= 0.6 and correctness_acceptable
|
|
121
|
+
|
|
122
|
+
# Final determination: pass if either high similarity OR good qualitative scores
|
|
123
|
+
final_pass = similarity_pass or qualitative_pass
|
|
124
|
+
|
|
125
|
+
# Override: fail if correctness is "poor" regardless of other factors
|
|
126
|
+
if ratings.get("correctness", "") == "poor":
|
|
127
|
+
final_pass = False
|
|
128
|
+
|
|
129
|
+
reasoning_parts = [
|
|
130
|
+
f"Similarity: {similarity:.3f} ({'✓' if similarity_pass else '✗'} threshold {threshold:.3f})",
|
|
131
|
+
f"Qualitative score: {normalized_score:.2f} ({'✓' if qualitative_pass else '✗'} ≥0.6)",
|
|
132
|
+
f"Correctness: {ratings.get('correctness', 'N/A')} ({'✓' if correctness_acceptable else '✗'} ≥fair)",
|
|
133
|
+
]
|
|
134
|
+
|
|
135
|
+
return {
|
|
136
|
+
"is_pass": final_pass,
|
|
137
|
+
"pass_fail": "pass" if final_pass else "fail",
|
|
138
|
+
"criteria": "comprehensive",
|
|
139
|
+
"reasoning": "; ".join(reasoning_parts),
|
|
140
|
+
"scores": {
|
|
141
|
+
"similarity": similarity,
|
|
142
|
+
"qualitative_normalized": normalized_score,
|
|
143
|
+
"qualitative_details": criteria_details,
|
|
144
|
+
},
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
def load_results(self, results_path: str) -> Dict:
|
|
148
|
+
"""Load test results from a JSON file."""
|
|
149
|
+
try:
|
|
150
|
+
with open(results_path, "r") as f:
|
|
151
|
+
return json.load(f)
|
|
152
|
+
except Exception as e:
|
|
153
|
+
self.log.error(f"Error loading results file: {e}")
|
|
154
|
+
raise
|
|
155
|
+
|
|
156
|
+
def check_evaluation_exists(self, experiment_file: str, output_dir: str) -> bool:
|
|
157
|
+
"""Check if evaluation already exists for experiment file.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
experiment_file: Path to the experiment file
|
|
161
|
+
output_dir: Output directory for evaluations
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
True if evaluation file already exists, False otherwise
|
|
165
|
+
"""
|
|
166
|
+
experiment_path = Path(experiment_file)
|
|
167
|
+
output_base_path = Path(output_dir)
|
|
168
|
+
|
|
169
|
+
# Generate expected eval filename: <name>.experiment.eval.json
|
|
170
|
+
eval_filename = f"{experiment_path.stem}.eval.json"
|
|
171
|
+
|
|
172
|
+
# Check for hierarchical structure first
|
|
173
|
+
relative_path = None
|
|
174
|
+
if "experiments" in experiment_path.parts:
|
|
175
|
+
# Extract relative path from experiments directory
|
|
176
|
+
exp_idx = experiment_path.parts.index("experiments")
|
|
177
|
+
if exp_idx + 1 < len(experiment_path.parts):
|
|
178
|
+
relative_path = Path(*experiment_path.parts[exp_idx + 1 : -1])
|
|
179
|
+
|
|
180
|
+
# Check both locations: hierarchical and flat
|
|
181
|
+
eval_paths = []
|
|
182
|
+
if relative_path:
|
|
183
|
+
eval_paths.append(output_base_path / relative_path / eval_filename)
|
|
184
|
+
eval_paths.append(output_base_path / eval_filename)
|
|
185
|
+
|
|
186
|
+
for eval_path in eval_paths:
|
|
187
|
+
if eval_path.exists():
|
|
188
|
+
self.log.info(f"Evaluation already exists: {eval_path}")
|
|
189
|
+
return True
|
|
190
|
+
|
|
191
|
+
return False

    def evaluate(self, results_path: str) -> Dict:
        """
        Evaluate RAG results and generate metrics.

        Args:
            results_path: Path to the results JSON file

        Returns:
            Dict containing evaluation metrics
        """
        results = self.load_results(results_path)
        qa_results = results["analysis"]["qa_results"]

        # Calculate similarity scores and pass/fail during evaluation
        similarities = []
        pass_results = []
        threshold = results["metadata"]["similarity_threshold"]

        for result in qa_results:
            similarity = self.calculate_similarity(
                result["ground_truth"], result["response"]
            )
            similarities.append(similarity)
            pass_results.append(similarity >= threshold)

        # Calculate accuracy metrics
        total_questions = len(pass_results)
        passed_questions = sum(pass_results)
        failed_questions = total_questions - passed_questions
        accuracy = passed_questions / total_questions if total_questions > 0 else 0.0

        metrics = {
            "test_file": results["metadata"]["test_file"],
            "timestamp": results["metadata"]["timestamp"],
            "threshold": results["metadata"]["similarity_threshold"],
            "num_questions": len(qa_results),
            "similarity_scores": {
                "mean": float(np.mean(similarities)),
                "median": float(np.median(similarities)),
                "std": float(np.std(similarities)),
                "min": float(np.min(similarities)),
                "max": float(np.max(similarities)),
            },
            "threshold_metrics": {
                "num_passed": passed_questions,
                "num_failed": failed_questions,
                "accuracy": accuracy,
                "accuracy_percentage": accuracy * 100.0,
            },
        }

        # Calculate pass rate
        metrics["threshold_metrics"]["pass_rate"] = (
            metrics["threshold_metrics"]["num_passed"] / metrics["num_questions"]
        )

        # Add overall rating based on pass rate and mean similarity
        pass_rate = metrics["threshold_metrics"]["pass_rate"]
        mean_similarity = metrics["similarity_scores"]["mean"]

        if pass_rate >= 0.9 and mean_similarity >= 0.8:
            rating = "excellent"
        elif pass_rate >= 0.8 and mean_similarity >= 0.7:
            rating = "good"
        elif pass_rate >= 0.6 and mean_similarity >= 0.6:
            rating = "fair"
        else:
            rating = "poor"

        metrics["overall_rating"] = {
            "rating": rating,
            "pass_rate": pass_rate,
            "mean_similarity": mean_similarity,
        }

        return metrics
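    # Minimal usage sketch (the `evaluator` variable and the results path are
    # illustrative, not part of the package):
    #   metrics = evaluator.evaluate("results/run1.results.json")
    #   print(metrics["threshold_metrics"]["accuracy_percentage"])
    #   print(metrics["overall_rating"]["rating"])  # excellent/good/fair/poor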

    def analyze_with_claude(
        self, results_path: str, groundtruth_path: Optional[str] = None
    ) -> Dict:
        """
        Use Claude to perform qualitative analysis of RAG results.

        Args:
            results_path: Path to results JSON file
            groundtruth_path: Optional path to groundtruth file for comparison

        Returns:
            Dict containing Claude's analysis
        """
        # Start timing
        start_time = time.time()

        try:
            results = self.load_results(results_path)

            # Detect result type and extract appropriate data
            analysis_data = results.get("analysis", {})
            qa_results = analysis_data.get("qa_results", results.get("qa_results", []))
            summarization_results = analysis_data.get("summarization_results", [])

            # Determine evaluation type
            if qa_results:
                return self._analyze_qa_results(results, qa_results)
            elif summarization_results:
                return self._analyze_summarization_results(
                    results, summarization_results, groundtruth_path
                )
            else:
                return {
                    "overall_analysis": "No QA or summarization results found to analyze",
                    "strengths": [],
                    "weaknesses": ["No data available for analysis"],
                    "recommendations": [
                        "Ensure input data contains QA or summarization results"
                    ],
                    "use_case_fit": "Unable to determine",
                    "per_question": [],
                    "overall_rating": {
                        "rating": "error",
                        "explanation": "No analyzable results found",
                    },
                    "timing": {
                        "total_processing_time_seconds": round(
                            time.time() - start_time, 3
                        )
                    },
                }
        except Exception as e:
            self.log.error(f"Error in analyze_with_claude: {e}")
            return {
                "overall_analysis": f"Analysis failed: {str(e)}",
                "strengths": [],
                "weaknesses": ["Analysis failed to complete"],
                "recommendations": ["Check logs for error details"],
                "use_case_fit": "",
                "per_question": [],
                "overall_rating": {"rating": "error", "explanation": str(e)},
                "timing": {
                    "total_processing_time_seconds": round(time.time() - start_time, 3)
                },
            }
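    # The helpers dispatched above both stream per-item analyses to an
    # ".intermediate" directory for crash recovery and accumulate Claude token
    # usage and cost; _analyze_qa_results additionally degrades to
    # similarity-only pass/fail when Claude's JSON cannot be parsed or the API
    # reports an overload (529).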

    def _analyze_qa_results(self, results: Dict, qa_results: List) -> Dict:
        """Analyze QA results using Claude."""
        # Start timing
        analysis_start_time = time.time()

        # Initialize analysis structure
        analysis = {
            "overall_analysis": "",
            "strengths": [],
            "weaknesses": [],
            "recommendations": [],
            "use_case_fit": "",
            "per_question": [],
            "overall_rating": {"rating": "", "explanation": ""},
            "timing": {},  # Add timing information
        }

        if not qa_results:
            return {
                "overall_analysis": "No QA results found to analyze",
                "strengths": [],
                "weaknesses": ["No data available for analysis"],
                "recommendations": ["Ensure input data contains QA results"],
                "use_case_fit": "Unable to determine",
                "per_question": [],
                "overall_rating": {
                    "rating": "error",
                    "explanation": "No QA results found",
                },
            }

        try:
            per_question_timings = []  # Track timing for each question

            # Set up intermediate output directory for crash recovery
            intermediate_dir = None
            experiment_name = results.get("metadata", {}).get(
                "experiment_name", "qa_evaluation"
            )
            if hasattr(self, "intermediate_dir") and self.intermediate_dir:
                # Use existing intermediate directory if set
                intermediate_dir = (
                    Path(self.intermediate_dir)
                    / f"{experiment_name}_qa_analysis.intermediate"
                )
            else:
                # Create in temp directory
                import tempfile

                temp_dir = Path(tempfile.gettempdir()) / "gaia_eval"
                intermediate_dir = (
                    temp_dir / f"{experiment_name}_qa_analysis.intermediate"
                )

            if intermediate_dir:
                intermediate_dir.mkdir(parents=True, exist_ok=True)
                self.log.info(
                    f"Writing intermediate QA analysis results to: {intermediate_dir}"
                )

            for qa_result in qa_results:
                question_start_time = time.time()

                # Calculate similarity score between ground truth and response
                similarity_score = self.calculate_similarity(
                    qa_result["ground_truth"], qa_result["response"]
                )

                # Store initial data (pass/fail will be determined after Claude analysis)
                threshold = results["metadata"]["similarity_threshold"]

                # Restructure the qa_result into qa_inputs
                qa_inputs = {
                    "query": qa_result["query"],
                    "ground_truth": qa_result["ground_truth"],
                    "response": qa_result["response"],
                    "similarity": similarity_score,
                    "threshold": threshold,
                }

                prompt = f"""
Analyze this RAG (Retrieval Augmented Generation) system test result and provide detailed insights.

Query: {qa_inputs['query']}
Ground Truth: {qa_inputs['ground_truth']}
System Response: {qa_inputs['response']}
Similarity Score: {qa_inputs['similarity']}

Evaluate the response on these criteria, providing both a rating (excellent/good/fair/poor) and detailed explanation:
1. Correctness: Is it factually correct compared to ground truth?
2. Completeness: Does it fully answer the question?
3. Conciseness: Is it appropriately brief while maintaining accuracy?
4. Relevance: Does it directly address the query?

Return your analysis in this exact JSON format:
{{
    "correctness": {{
        "rating": "one of: excellent/good/fair/poor",
        "explanation": "analysis of factual correctness"
    }},
    "completeness": {{
        "rating": "one of: excellent/good/fair/poor",
        "explanation": "analysis of answer completeness"
    }},
    "conciseness": {{
        "rating": "one of: excellent/good/fair/poor",
        "explanation": "analysis of brevity and clarity"
    }},
    "relevance": {{
        "rating": "one of: excellent/good/fair/poor",
        "explanation": "analysis of how well it addresses the query"
    }}
}}
"""

                response_data = self.claude.get_completion_with_usage(prompt)

                try:
                    # Extract JSON and combine with qa_inputs
                    response = response_data["content"]
                    usage = response_data["usage"]
                    cost = response_data["cost"]

                    if isinstance(response, list):
                        response_text = (
                            response[0].text
                            if hasattr(response[0], "text")
                            else str(response[0])
                        )
                    else:
                        response_text = (
                            response.text
                            if hasattr(response, "text")
                            else str(response)
                        )

                    json_start = response_text.find("{")
                    json_end = response_text.rfind("}") + 1
                    if json_start >= 0 and json_end > json_start:
                        json_content = response_text[json_start:json_end]
                        qa_analysis = json.loads(json_content)

                        # Determine comprehensive pass/fail
                        pass_fail_result = self.determine_pass_fail(
                            similarity_score, threshold, qa_analysis
                        )

                        # Add all data to qa_inputs
                        qa_inputs.update(pass_fail_result)

                        # Add qa_inputs, usage, and cost as nested dictionaries
                        qa_analysis["qa_inputs"] = qa_inputs
                        qa_analysis["usage"] = usage
                        qa_analysis["cost"] = cost

                        # Add timing for this question
                        question_time = time.time() - question_start_time
                        qa_analysis["processing_time_seconds"] = round(question_time, 3)
                        per_question_timings.append(question_time)

                        analysis["per_question"].append(qa_analysis)

                        # Write intermediate result immediately for crash recovery
                        if intermediate_dir:
                            try:
                                intermediate_file = (
                                    intermediate_dir
                                    / f"qa_{len(analysis['per_question']):04d}_analysis.json"
                                )
                                intermediate_data = {
                                    "question_index": len(analysis["per_question"]) - 1,
                                    "experiment_name": experiment_name,
                                    "qa_inputs": qa_inputs,
                                    "analysis": qa_analysis,
                                    "usage": qa_analysis.get("usage", {}),
                                    "cost": qa_analysis.get("cost", {}),
                                    "processing_time_seconds": qa_analysis.get(
                                        "processing_time_seconds", 0
                                    ),
                                    "timestamp": datetime.now().isoformat(),
                                }

                                with open(
                                    intermediate_file, "w", encoding="utf-8"
                                ) as f:
                                    json.dump(intermediate_data, f, indent=2)

                                # Update progress file
                                progress_file = (
                                    intermediate_dir / "qa_analysis_progress.json"
                                )
                                progress_data = {
                                    "experiment_name": experiment_name,
                                    "total_questions": len(qa_results),
                                    "completed_questions": len(
                                        analysis["per_question"]
                                    ),
                                    "progress_percent": round(
                                        len(analysis["per_question"])
                                        / len(qa_results)
                                        * 100,
                                        1,
                                    ),
                                    "last_updated": datetime.now().isoformat(),
                                    "estimated_remaining_time": None,
                                }

                                # Calculate estimated remaining time
                                if len(per_question_timings) > 0:
                                    avg_time_per_question = sum(
                                        per_question_timings
                                    ) / len(per_question_timings)
                                    remaining_questions = len(qa_results) - len(
                                        analysis["per_question"]
                                    )
                                    estimated_remaining = (
                                        remaining_questions * avg_time_per_question
                                    )
                                    progress_data["estimated_remaining_time"] = round(
                                        estimated_remaining, 1
                                    )

                                with open(progress_file, "w", encoding="utf-8") as f:
                                    json.dump(progress_data, f, indent=2)

                                self.log.info(
                                    f"QA analysis progress: {len(analysis['per_question'])}/{len(qa_results)} questions completed ({progress_data['progress_percent']}%)"
                                )

                            except Exception as e:
                                self.log.warning(
                                    f"Failed to write intermediate QA analysis result {len(analysis['per_question'])}: {e}"
                                )

                    else:
                        self.log.error(f"No JSON found in response for question")

                        # Determine pass/fail without Claude analysis (similarity only)
                        pass_fail_result = self.determine_pass_fail(
                            similarity_score, threshold, None
                        )
                        qa_inputs.update(pass_fail_result)

                        # Add timing even for failed parsing
                        question_time = time.time() - question_start_time
                        per_question_timings.append(question_time)

                        analysis["per_question"].append(
                            {
                                "error": "Failed to parse analysis",
                                "raw_response": response_text,
                                "qa_inputs": qa_inputs,
                                "usage": usage,
                                "cost": cost,
                                "processing_time_seconds": round(question_time, 3),
                            }
                        )
                except Exception as e:
                    self.log.error(f"Error processing analysis: {e}")

                    # Determine pass/fail without Claude analysis (similarity only)
                    pass_fail_result = self.determine_pass_fail(
                        similarity_score, threshold, None
                    )
                    qa_inputs.update(pass_fail_result)

                    # Add timing even for exceptions
                    question_time = time.time() - question_start_time
                    per_question_timings.append(question_time)

                    analysis["per_question"].append(
                        {
                            "error": str(e),
                            "raw_response": str(response_data),
                            "qa_inputs": qa_inputs,
                            "usage": response_data.get("usage", {}),
                            "cost": response_data.get("cost", {}),
                            "processing_time_seconds": round(question_time, 3),
                        }
                    )

            # Calculate similarity scores and accuracy metrics (extract from per_question analysis)
            calculated_similarities = [
                q["qa_inputs"]["similarity"]
                for q in analysis["per_question"]
                if "qa_inputs" in q
            ]
            pass_results = [
                q["qa_inputs"]["is_pass"]
                for q in analysis["per_question"]
                if "qa_inputs" in q
            ]

            # Calculate accuracy metrics
            total_questions = len(pass_results)
            passed_questions = sum(pass_results)
            failed_questions = total_questions - passed_questions
            accuracy = (
                passed_questions / total_questions if total_questions > 0 else 0.0
            )

            # After analyzing all questions, get overall analysis
            overall_start_time = time.time()
            overall_prompt = f"""
Review these RAG system test results and provide an overall analysis.

Number of questions: {total_questions}
Similarity threshold: {results["metadata"]["similarity_threshold"]}
Number passed threshold: {passed_questions}
Number failed threshold: {failed_questions}
Pass rate: {accuracy:.3f}
Accuracy: {accuracy * 100:.1f}%

Similarity statistics:
- Mean: {np.mean(calculated_similarities):.3f}
- Median: {np.median(calculated_similarities):.3f}
- Min: {np.min(calculated_similarities):.3f}
- Max: {np.max(calculated_similarities):.3f}
- Standard Deviation: {np.std(calculated_similarities):.3f}

Individual analyses: {json.dumps(analysis['per_question'], indent=2)}

Provide a comprehensive analysis including:
1. Overall Rating: Rate the system (excellent/good/fair/poor) with explanation
2. Overall Analysis: General assessment of the RAG system's performance
3. Strengths: What the system does well
4. Weaknesses: Areas needing improvement
5. Recommendations: Specific suggestions for improvement
6. Use Case Fit: Types of queries the system handles well/poorly

Return your analysis in this exact JSON format:
{{
    "overall_rating": {{
        "rating": "one of: excellent/good/fair/poor",
        "explanation": "explanation of the rating",
        "metrics": {{
            "num_questions": number of questions analyzed,
            "similarity_threshold": threshold value used,
            "num_passed": number of questions that passed threshold,
            "num_failed": number of questions that failed threshold,
            "pass_rate": pass rate as decimal,
            "accuracy": accuracy as decimal,
            "accuracy_percentage": accuracy as percentage,
            "mean_similarity": average similarity score,
            "median_similarity": median similarity score,
            "min_similarity": minimum similarity score,
            "max_similarity": maximum similarity score,
            "std_similarity": standard deviation of similarity scores
        }}
    }},
    "overall_analysis": "general assessment",
    "strengths": ["strength 1", "strength 2", ...],
    "weaknesses": ["weakness 1", "weakness 2", ...],
    "recommendations": ["recommendation 1", "recommendation 2", ...],
    "use_case_fit": "analysis of suitable use cases"
}}
"""

            overall_response_data = self.claude.get_completion_with_usage(
                overall_prompt
            )

            try:
                # Extract JSON from overall response
                overall_response = overall_response_data["content"]
                overall_usage = overall_response_data["usage"]
                overall_cost = overall_response_data["cost"]

                if isinstance(overall_response, list):
                    response_text = (
                        overall_response[0].text
                        if hasattr(overall_response[0], "text")
                        else str(overall_response[0])
                    )
                else:
                    response_text = (
                        overall_response.text
                        if hasattr(overall_response, "text")
                        else str(overall_response)
                    )

                json_start = response_text.find("{")
                json_end = response_text.rfind("}") + 1
                if json_start >= 0 and json_end > json_start:
                    json_content = response_text[json_start:json_end]
                    overall_analysis = json.loads(json_content)
                    # Add overall usage and cost to the analysis
                    overall_analysis["overall_usage"] = overall_usage
                    overall_analysis["overall_cost"] = overall_cost

                    # Add overall timing
                    overall_time = time.time() - overall_start_time
                    overall_analysis["overall_processing_time_seconds"] = round(
                        overall_time, 3
                    )

                    analysis.update(overall_analysis)
                else:
                    self.log.error("No JSON found in overall analysis response")
                    analysis.update(
                        {
                            "error": "Failed to parse overall analysis",
                            "raw_response": response_text,
                            "overall_usage": overall_usage,
                            "overall_cost": overall_cost,
                        }
                    )
            except Exception as e:
                self.log.error(f"Error processing overall analysis: {e}")
                analysis.update(
                    {
                        "error": str(e),
                        "raw_response": str(overall_response_data),
                        "overall_usage": overall_response_data.get("usage", {}),
                        "overall_cost": overall_response_data.get("cost", {}),
                    }
                )

            # Calculate total cost across all questions and overall analysis
            total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
            total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}

            # Sum up costs from per-question analysis
            for question_analysis in analysis["per_question"]:
                if "usage" in question_analysis and "cost" in question_analysis:
                    usage = question_analysis["usage"]
                    cost = question_analysis["cost"]
                    total_usage["input_tokens"] += usage.get("input_tokens", 0)
                    total_usage["output_tokens"] += usage.get("output_tokens", 0)
                    total_usage["total_tokens"] += usage.get("total_tokens", 0)
                    total_cost["input_cost"] += cost.get("input_cost", 0.0)
                    total_cost["output_cost"] += cost.get("output_cost", 0.0)
                    total_cost["total_cost"] += cost.get("total_cost", 0.0)

            # Add overall analysis costs if available
            if "overall_usage" in analysis and "overall_cost" in analysis:
                overall_usage = analysis["overall_usage"]
                overall_cost = analysis["overall_cost"]
                total_usage["input_tokens"] += overall_usage.get("input_tokens", 0)
                total_usage["output_tokens"] += overall_usage.get("output_tokens", 0)
                total_usage["total_tokens"] += overall_usage.get("total_tokens", 0)
                total_cost["input_cost"] += overall_cost.get("input_cost", 0.0)
                total_cost["output_cost"] += overall_cost.get("output_cost", 0.0)
                total_cost["total_cost"] += overall_cost.get("total_cost", 0.0)

            # Add total cost summary to analysis
            analysis["total_usage"] = total_usage
            analysis["total_cost"] = total_cost

            # Add comprehensive timing information
            total_time = time.time() - analysis_start_time
            analysis["timing"] = {
                "total_processing_time_seconds": round(total_time, 3),
                "per_question_times_seconds": [
                    round(t, 3) for t in per_question_timings
                ],
                "average_per_question_seconds": (
                    round(np.mean(per_question_timings), 3)
                    if per_question_timings
                    else 0
                ),
                "max_per_question_seconds": (
                    round(max(per_question_timings), 3) if per_question_timings else 0
                ),
                "min_per_question_seconds": (
                    round(min(per_question_timings), 3) if per_question_timings else 0
                ),
            }

            # Clean up intermediate files after successful completion
            if intermediate_dir and intermediate_dir.exists():
                try:
                    import shutil

                    shutil.rmtree(intermediate_dir)
                    self.log.info(
                        f"Cleaned up intermediate QA analysis files from: {intermediate_dir}"
                    )
                except Exception as e:
                    self.log.warning(
                        f"Failed to clean up intermediate directory {intermediate_dir}: {e}"
                    )

            return analysis
        except Exception as api_error:
            if "529" in str(api_error) or "overloaded" in str(api_error).lower():
                self.log.warning(
                    "Claude API is currently overloaded. Returning partial analysis with raw data."
                )
                # Include raw QA results without Claude analysis
                for qa_result in qa_results:
                    # Calculate similarity score even when Claude analysis fails
                    similarity_score = self.calculate_similarity(
                        qa_result["ground_truth"], qa_result["response"]
                    )

                    # Determine pass/fail without Claude analysis (similarity only)
                    threshold = results["metadata"]["similarity_threshold"]

                    qa_inputs = {
                        "query": qa_result["query"],
                        "ground_truth": qa_result["ground_truth"],
                        "response": qa_result["response"],
                        "similarity": similarity_score,
                        "threshold": threshold,
                    }

                    # Add pass/fail determination
                    pass_fail_result = self.determine_pass_fail(
                        similarity_score, threshold, None
                    )
                    qa_inputs.update(pass_fail_result)
                    analysis["per_question"].append(
                        {
                            "status": "raw_data_only",
                            "analysis_error": "Claude API overloaded",
                            "qa_inputs": qa_inputs,
                        }
                    )

                analysis.update(
                    {
                        "overall_analysis": "Analysis incomplete due to Claude API overload",
                        "strengths": ["Raw data preserved"],
                        "weaknesses": [
                            "Claude analysis unavailable due to API overload"
                        ],
                        "recommendations": ["Retry analysis when API is available"],
                        "use_case_fit": "Analysis pending",
                        "overall_rating": {
                            "rating": "pending",
                            "explanation": "Claude API temporarily unavailable",
                        },
                    }
                )
                return analysis
            raise  # Re-raise if it's not an overload error

        except Exception as e:
            self.log.error(f"Error in analyze_with_claude: {e}")
            return {
                "overall_analysis": f"Analysis failed: {str(e)}",
                "strengths": [],
                "weaknesses": ["Analysis failed to complete"],
                "recommendations": ["Check logs for error details"],
                "use_case_fit": "",
                "per_question": [],
                "overall_rating": {"rating": "error", "explanation": str(e)},
            }
|
|
885
|
+
|
|
886
|
+
def _analyze_summarization_results(
|
|
887
|
+
self,
|
|
888
|
+
results: Dict,
|
|
889
|
+
summarization_results: List,
|
|
890
|
+
groundtruth_path: Optional[str] = None,
|
|
891
|
+
) -> Dict:
|
|
892
|
+
"""Analyze summarization results using Claude."""
|
|
893
|
+
# Start timing
|
|
894
|
+
analysis_start_time = time.time()
|
|
895
|
+
|
|
896
|
+
analysis = {
|
|
897
|
+
"overall_analysis": "",
|
|
898
|
+
"strengths": [],
|
|
899
|
+
"weaknesses": [],
|
|
900
|
+
"recommendations": [],
|
|
901
|
+
"use_case_fit": "",
|
|
902
|
+
"per_question": [],
|
|
903
|
+
"overall_rating": {"rating": "", "explanation": ""},
|
|
904
|
+
"timing": {}, # Add timing information
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
if not summarization_results:
|
|
908
|
+
return {
|
|
909
|
+
"overall_analysis": "No summarization results found to analyze",
|
|
910
|
+
"strengths": [],
|
|
911
|
+
"weaknesses": ["No summarization data available for analysis"],
|
|
912
|
+
"recommendations": ["Ensure input data contains summarization results"],
|
|
913
|
+
"use_case_fit": "Unable to determine",
|
|
914
|
+
"per_question": [],
|
|
915
|
+
"overall_rating": {
|
|
916
|
+
"rating": "error",
|
|
917
|
+
"explanation": "No summarization results found",
|
|
918
|
+
},
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
try:
|
|
922
|
+
# Load ground truth summaries from separate file if provided
|
|
923
|
+
ground_truth_data = None
|
|
924
|
+
if groundtruth_path and Path(groundtruth_path).exists():
|
|
925
|
+
try:
|
|
926
|
+
with open(groundtruth_path, "r", encoding="utf-8") as f:
|
|
927
|
+
ground_truth_data = json.load(f)
|
|
928
|
+
self.log.info(f"Loaded ground truth data from: {groundtruth_path}")
|
|
929
|
+
|
|
930
|
+
# Check if this is a consolidated ground truth file
|
|
931
|
+
if "consolidated_from" in ground_truth_data.get("metadata", {}):
|
|
932
|
+
self.log.info(
|
|
933
|
+
f"Using consolidated ground truth with {ground_truth_data['metadata']['consolidated_from']} transcripts"
|
|
934
|
+
)
|
|
935
|
+
|
|
936
|
+
except Exception as e:
|
|
937
|
+
self.log.warning(
|
|
938
|
+
f"Failed to load ground truth file {groundtruth_path}: {e}"
|
|
939
|
+
)
|
|
940
|
+
ground_truth_data = None
|
|
941
|
+
elif groundtruth_path:
|
|
942
|
+
self.log.warning(f"Ground truth file not found: {groundtruth_path}")
|
|
943
|
+
|
|
944
|
+
total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
|
|
945
|
+
total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
|
|
946
|
+
per_summary_timings = [] # Track timing for each summary
|
|
947
|
+
|
|
948
|
+
# Set up intermediate output directory for crash recovery
|
|
949
|
+
intermediate_dir = None
|
|
950
|
+
experiment_name = results.get("metadata", {}).get(
|
|
951
|
+
"experiment_name", "evaluation"
|
|
952
|
+
)
|
|
953
|
+
if hasattr(self, "intermediate_dir") and self.intermediate_dir:
|
|
954
|
+
# Use existing intermediate directory if set
|
|
955
|
+
intermediate_dir = (
|
|
956
|
+
Path(self.intermediate_dir)
|
|
957
|
+
/ f"{experiment_name}_analysis.intermediate"
|
|
958
|
+
)
|
|
959
|
+
else:
|
|
960
|
+
# Create in temp directory
|
|
961
|
+
import tempfile
|
|
962
|
+
|
|
963
|
+
temp_dir = Path(tempfile.gettempdir()) / "gaia_eval"
|
|
964
|
+
intermediate_dir = temp_dir / f"{experiment_name}_analysis.intermediate"
|
|
965
|
+
|
|
966
|
+
if intermediate_dir:
|
|
967
|
+
intermediate_dir.mkdir(parents=True, exist_ok=True)
|
|
968
|
+
self.log.info(
|
|
969
|
+
f"Writing intermediate analysis results to: {intermediate_dir}"
|
|
970
|
+
)
|
|
971
|
+
|
|
972
|
+
for i, summary_result in enumerate(summarization_results):
|
|
973
|
+
summary_start_time = time.time()
|
|
974
|
+
generated_summaries = summary_result.get("generated_summaries", {})
|
|
975
|
+
|
|
976
|
+
# Get ground truth summaries from embedded data or separate file
|
|
977
|
+
groundtruth_summaries = summary_result.get("groundtruth_summaries", {})
|
|
978
|
+
|
|
979
|
+
# If no embedded ground truth but we have a ground truth file, extract from it
|
|
980
|
+
if not groundtruth_summaries and ground_truth_data:
|
|
981
|
+
gt_analysis = ground_truth_data.get("analysis", {})
|
|
982
|
+
gt_summaries = gt_analysis.get("summaries", {})
|
|
983
|
+
|
|
984
|
+
# Handle both regular and consolidated ground truth formats
|
|
985
|
+
if gt_summaries:
|
|
986
|
+
# Check if this is consolidated format (summaries have transcript_id keys)
|
|
987
|
+
if "consolidated_from" in ground_truth_data.get("metadata", {}):
|
|
988
|
+
# For consolidated format, try to match by source file or use first available
|
|
989
|
+
source_file = summary_result.get("source_file", "")
|
|
990
|
+
transcript_id = None
|
|
991
|
+
|
|
992
|
+
# Try to match by source file name using metadata.source_files
|
|
993
|
+
source_files = ground_truth_data.get("metadata", {}).get(
|
|
994
|
+
"source_files", []
|
|
995
|
+
)
|
|
996
|
+
for source_mapping in source_files:
|
|
997
|
+
mapped_source = source_mapping.get("source_file", "")
|
|
998
|
+
if source_file and (
|
|
999
|
+
source_file == mapped_source
|
|
1000
|
+
or source_file.replace("/", "\\") == mapped_source
|
|
1001
|
+
or source_file.replace("\\", "/") == mapped_source
|
|
1002
|
+
):
|
|
1003
|
+
transcript_id = source_mapping.get("transcript_id")
|
|
1004
|
+
break
|
|
1005
|
+
|
|
1006
|
+
# If no match found, fail loudly - DO NOT use fallback
|
|
1007
|
+
if not transcript_id:
|
|
1008
|
+
available_sources = [
|
|
1009
|
+
s.get("source_file", "") for s in source_files
|
|
1010
|
+
]
|
|
1011
|
+
available_ids = (
|
|
1012
|
+
list(gt_summaries.keys()) if gt_summaries else []
|
|
1013
|
+
)
|
|
1014
|
+
|
|
1015
|
+
error_msg = (
|
|
1016
|
+
f"\n{'='*70}\n"
|
|
1017
|
+
f"ERROR: No matching ground truth found for experiment result\n"
|
|
1018
|
+
f"{'='*70}\n"
|
|
1019
|
+
f"Source file in experiment: {source_file}\n"
|
|
1020
|
+
f"\nAvailable source files in ground truth:\n"
|
|
1021
|
+
)
|
|
1022
|
+
for idx, src in enumerate(available_sources[:10], 1):
|
|
1023
|
+
error_msg += f" {idx}. {src}\n"
|
|
1024
|
+
if len(available_sources) > 10:
|
|
1025
|
+
error_msg += f" ... and {len(available_sources) - 10} more\n"
|
|
1026
|
+
|
|
1027
|
+
error_msg += f"\nAvailable transcript IDs:\n"
|
|
1028
|
+
for idx, tid in enumerate(available_ids[:10], 1):
|
|
1029
|
+
error_msg += f" {idx}. {tid}\n"
|
|
1030
|
+
if len(available_ids) > 10:
|
|
1031
|
+
error_msg += (
|
|
1032
|
+
f" ... and {len(available_ids) - 10} more\n"
|
|
1033
|
+
)
|
|
1034
|
+
|
|
1035
|
+
error_msg += (
|
|
1036
|
+
f"\nPossible fixes:\n"
|
|
1037
|
+
f" 1. Ensure ground truth source_file paths match experiment paths exactly\n"
|
|
1038
|
+
f" 2. Check if ground truth was generated from the same test data\n"
|
|
1039
|
+
f" 3. Verify path separators (forward vs backslash) are consistent\n"
|
|
1040
|
+
f" 4. Run fix_groundtruth_paths.py to normalize path prefixes\n"
|
|
1041
|
+
f"{'='*70}\n"
|
|
1042
|
+
)
|
|
1043
|
+
|
|
1044
|
+
self.log.error(error_msg)
|
|
1045
|
+
raise ValueError(
|
|
1046
|
+
f"No matching ground truth found for source: {source_file}. "
|
|
1047
|
+
f"Cannot evaluate without correct ground truth data."
|
|
1048
|
+
)
|
|
1049
|
+
|
|
1050
|
+
if transcript_id and transcript_id in gt_summaries:
|
|
1051
|
+
groundtruth_summaries = gt_summaries[transcript_id]
|
|
1052
|
+
self.log.debug(
|
|
1053
|
+
f"Using consolidated ground truth summaries for {transcript_id}"
|
|
1054
|
+
)
|
|
1055
|
+
else:
|
|
1056
|
+
# Regular format - summaries are directly under gt_summaries
|
|
1057
|
+
groundtruth_summaries = gt_summaries
|
|
1058
|
+
self.log.debug(
|
|
1059
|
+
f"Using regular ground truth summaries from file for summary {i}"
|
|
1060
|
+
)
|
|
1061
|
+
|
|
1062
|
+
# Analyze each summary component
|
|
1063
|
+
summary_analysis = {
|
|
1064
|
+
"summary_index": i,
|
|
1065
|
+
"source_file": summary_result.get("source_file", ""),
|
|
1066
|
+
"analysis": {},
|
|
1067
|
+
"overall_quality": "",
|
|
1068
|
+
}
|
|
1069
|
+
|
|
1070
|
+
# Compare generated vs ground truth if available
|
|
1071
|
+
if groundtruth_summaries:
|
|
1072
|
+
prompt = f"""
|
|
1073
|
+
Analyze this summarization system result by comparing the generated summary against the ground truth.
|
|
1074
|
+
|
|
1075
|
+
GENERATED SUMMARY:
|
|
1076
|
+
Executive Summary: {generated_summaries.get('executive_summary', 'N/A')}
|
|
1077
|
+
Detailed Summary: {generated_summaries.get('detailed_summary', 'N/A')}
|
|
1078
|
+
Action Items: {generated_summaries.get('action_items', [])}
|
|
1079
|
+
Key Decisions: {generated_summaries.get('key_decisions', [])}
|
|
1080
|
+
Participants: {generated_summaries.get('participants', [])}
|
|
1081
|
+
Topics Discussed: {generated_summaries.get('topics_discussed', [])}
|
|
1082
|
+
|
|
1083
|
+
GROUND TRUTH SUMMARY:
|
|
1084
|
+
Executive Summary: {groundtruth_summaries.get('executive_summary', 'N/A')}
|
|
1085
|
+
Detailed Summary: {groundtruth_summaries.get('detailed_summary', 'N/A')}
|
|
1086
|
+
Action Items: {groundtruth_summaries.get('action_items', [])}
|
|
1087
|
+
Key Decisions: {groundtruth_summaries.get('key_decisions', [])}
|
|
1088
|
+
Participants: {groundtruth_summaries.get('participants', [])}
|
|
1089
|
+
Topics Discussed: {groundtruth_summaries.get('topics_discussed', [])}
|
|
1090
|
+
|
|
1091
|
+
Evaluate the generated summary on these criteria (rate each as excellent/good/fair/poor):
|
|
1092
|
+
1. Executive Summary Accuracy: How well does the executive summary capture the key points?
|
|
1093
|
+
2. Completeness: Are all important details covered?
|
|
1094
|
+
3. Action Items Accuracy: Are action items correctly identified and detailed?
|
|
1095
|
+
4. Key Decisions Accuracy: Are key decisions properly captured?
|
|
1096
|
+
5. Participant Identification: Are participants correctly identified?
|
|
1097
|
+
6. Topic Coverage: Are all discussed topics included?
|
|
1098
|
+
|
|
1099
|
+
Return your analysis in this JSON format:
|
|
1100
|
+
{{
|
|
1101
|
+
"executive_summary_quality": {{
|
|
1102
|
+
"rating": "excellent/good/fair/poor",
|
|
1103
|
+
"explanation": "detailed analysis"
|
|
1104
|
+
}},
|
|
1105
|
+
"detail_completeness": {{
|
|
1106
|
+
"rating": "excellent/good/fair/poor",
|
|
1107
|
+
"explanation": "detailed analysis"
|
|
1108
|
+
}},
|
|
1109
|
+
"action_items_structure": {{
|
|
1110
|
+
"rating": "excellent/good/fair/poor",
|
|
1111
|
+
"explanation": "detailed analysis"
|
|
1112
|
+
}},
|
|
1113
|
+
"key_decisions_clarity": {{
|
|
1114
|
+
"rating": "excellent/good/fair/poor",
|
|
1115
|
+
"explanation": "detailed analysis"
|
|
1116
|
+
}},
|
|
1117
|
+
"participant_information": {{
|
|
1118
|
+
"rating": "excellent/good/fair/poor",
|
|
1119
|
+
"explanation": "detailed analysis"
|
|
1120
|
+
}},
|
|
1121
|
+
"topic_organization": {{
|
|
1122
|
+
"rating": "excellent/good/fair/poor",
|
|
1123
|
+
"explanation": "detailed analysis"
|
|
1124
|
+
}},
|
|
1125
|
+
"overall_quality": "excellent/good/fair/poor"
|
|
1126
|
+
}}
|
|
1127
|
+
"""
|
|
1128
|
+
else:
|
|
1129
|
+
# Analyze standalone summary quality
|
|
1130
|
+
prompt = f"""
|
|
1131
|
+
Analyze this generated meeting summary for quality and completeness.
|
|
1132
|
+
|
|
1133
|
+
GENERATED SUMMARY:
|
|
1134
|
+
Executive Summary: {generated_summaries.get('executive_summary', 'N/A')}
|
|
1135
|
+
Detailed Summary: {generated_summaries.get('detailed_summary', 'N/A')}
|
|
1136
|
+
Action Items: {generated_summaries.get('action_items', [])}
|
|
1137
|
+
Key Decisions: {generated_summaries.get('key_decisions', [])}
|
|
1138
|
+
Participants: {generated_summaries.get('participants', [])}
|
|
1139
|
+
Topics Discussed: {generated_summaries.get('topics_discussed', [])}
|
|
1140
|
+
|
|
1141
|
+
Evaluate the summary quality (rate each as excellent/good/fair/poor):
|
|
1142
|
+
1. Executive Summary Quality: Is it clear and high-level?
|
|
1143
|
+
2. Detail Completeness: Does the detailed summary provide sufficient context?
|
|
1144
|
+
3. Action Items Structure: Are action items specific and actionable?
|
|
1145
|
+
4. Key Decisions Clarity: Are decisions clearly stated?
|
|
1146
|
+
5. Participant Information: Are participants properly identified?
|
|
1147
|
+
6. Topic Organization: Are topics well-organized and comprehensive?
|
|
1148
|
+
|
|
1149
|
+
IMPORTANT: Return ONLY valid JSON with no additional text, markdown formatting, or explanations.
|
|
1150
|
+
Ensure all JSON syntax is correct - no trailing commas, proper quotes, and complete structure.
|
|
1151
|
+
|
|
1152
|
+
Return your analysis in this exact JSON format:
|
|
1153
|
+
{{
|
|
1154
|
+
"executive_summary_quality": {{
|
|
1155
|
+
"rating": "excellent/good/fair/poor",
|
|
1156
|
+
"explanation": "detailed analysis"
|
|
1157
|
+
}},
|
|
1158
|
+
"detail_completeness": {{
|
|
1159
|
+
"rating": "excellent/good/fair/poor",
|
|
1160
|
+
"explanation": "detailed analysis"
|
|
1161
|
+
}},
|
|
1162
|
+
"action_items_structure": {{
|
|
1163
|
+
"rating": "excellent/good/fair/poor",
|
|
1164
|
+
"explanation": "detailed analysis"
|
|
1165
|
+
}},
|
|
1166
|
+
"key_decisions_clarity": {{
|
|
1167
|
+
"rating": "excellent/good/fair/poor",
|
|
1168
|
+
"explanation": "detailed analysis"
|
|
1169
|
+
}},
|
|
1170
|
+
"participant_information": {{
|
|
1171
|
+
"rating": "excellent/good/fair/poor",
|
|
1172
|
+
"explanation": "detailed analysis"
|
|
1173
|
+
}},
|
|
1174
|
+
"topic_organization": {{
|
|
1175
|
+
"rating": "excellent/good/fair/poor",
|
|
1176
|
+
"explanation": "detailed analysis"
|
|
1177
|
+
}},
|
|
1178
|
+
"overall_quality": "excellent/good/fair/poor"
|
|
1179
|
+
}}
|
|
1180
|
+
"""
|
|
1181
|
+
|
|
1182
|
+
try:
|
|
1183
|
+
response_data = self.claude.get_completion_with_usage(prompt)
|
|
1184
|
+
response = response_data["content"]
|
|
1185
|
+
usage = response_data["usage"]
|
|
1186
|
+
cost = response_data["cost"]
|
|
1187
|
+
|
|
1188
|
+
# Extract text from response
|
|
1189
|
+
if isinstance(response, list):
|
|
1190
|
+
response_text = (
|
|
1191
|
+
response[0].text
|
|
1192
|
+
if hasattr(response[0], "text")
|
|
1193
|
+
else str(response[0])
|
|
1194
|
+
)
|
|
1195
|
+
else:
|
|
1196
|
+
response_text = (
|
|
1197
|
+
response.text
|
|
1198
|
+
if hasattr(response, "text")
|
|
1199
|
+
else str(response)
|
|
1200
|
+
)
|
|
1201
|
+
|
|
1202
|
+
# Parse JSON response with improved error handling
|
|
1203
|
+
# First try to extract from markdown code blocks
|
|
1204
|
+
markdown_json = re.search(
|
|
1205
|
+
r"```(?:json)?\s*(\{.*?\})\s*```", response_text, re.DOTALL
|
|
1206
|
+
)
|
|
1207
|
+
if markdown_json:
|
|
1208
|
+
json_content = markdown_json.group(1)
|
|
1209
|
+
else:
|
|
1210
|
+
# Fall back to finding raw JSON
|
|
1211
|
+
json_start = response_text.find("{")
|
|
1212
|
+
json_end = response_text.rfind("}") + 1
|
|
1213
|
+
if json_start >= 0 and json_end > json_start:
|
|
1214
|
+
json_content = response_text[json_start:json_end]
|
|
1215
|
+
else:
|
|
1216
|
+
json_content = None
|
|
1217
|
+
|
|
1218
|
+
if json_content:
|
|
1219
|
+
try:
|
|
1220
|
+
# First attempt: direct JSON parsing
|
|
1221
|
+
summary_analysis["analysis"] = json.loads(json_content)
|
|
1222
|
+
except json.JSONDecodeError as e:
|
|
1223
|
+
self.log.warning(f"Initial JSON parse failed: {e}")
|
|
1224
|
+
# Second attempt: clean up common issues
|
|
1225
|
+
# Remove any trailing commas before closing braces/brackets
|
|
1226
|
+
cleaned_json = re.sub(r",\s*([}\]])", r"\1", json_content)
|
|
1227
|
+
# Replace single quotes with double quotes (if any) - but not within strings
|
|
1228
|
+
# This is a simple heuristic, not perfect
|
|
1229
|
+
cleaned_json = cleaned_json.replace("'", '"')
|
|
1230
|
+
# Remove any control characters except newlines and tabs
|
|
1231
|
+
cleaned_json = re.sub(
|
|
1232
|
+
r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]",
|
|
1233
|
+
"",
|
|
1234
|
+
cleaned_json,
|
|
1235
|
+
)
|
|
1236
|
+
# Fix common escape issues
|
|
1237
|
+
cleaned_json = cleaned_json.replace(
|
|
1238
|
+
'\\"', '"'
|
|
1239
|
+
) # Remove escaped quotes that might be double-escaped
|
|
1240
|
+
cleaned_json = re.sub(
|
|
1241
|
+
r'(?<!\\)\\(?!["\\/bfnrt])', r"\\\\", cleaned_json
|
|
1242
|
+
) # Fix unescaped backslashes
|
|
1243
|
+
|
|
1244
|
+
try:
|
|
1245
|
+
summary_analysis["analysis"] = json.loads(cleaned_json)
|
|
1246
|
+
self.log.info("Successfully parsed JSON after cleanup")
|
|
1247
|
+
except json.JSONDecodeError as e2:
|
|
1248
|
+
self.log.error(f"JSON parse failed after cleanup: {e2}")
|
|
1249
|
+
# Third attempt: extract individual fields manually
|
|
1250
|
+
analysis_dict = {}
|
|
1251
|
+
|
|
1252
|
+
# Try to extract each field individually
|
|
1253
|
+
fields = [
|
|
1254
|
+
"executive_summary_quality",
|
|
1255
|
+
"detail_completeness",
|
|
1256
|
+
"action_items_structure",
|
|
1257
|
+
"key_decisions_clarity",
|
|
1258
|
+
"participant_information",
|
|
1259
|
+
"topic_organization",
|
|
1260
|
+
"overall_quality",
|
|
1261
|
+
]
|
|
1262
|
+
|
|
1263
|
+
for field in fields:
|
|
1264
|
+
# Find the field and extract its rating
|
|
1265
|
+
pattern = rf'"{field}":\s*(?:"([^"]+)"|{{[^}}]+}})'
|
|
1266
|
+
match = re.search(pattern, json_content)
|
|
1267
|
+
if match:
|
|
1268
|
+
if field == "overall_quality":
|
|
1269
|
+
analysis_dict[field] = (
|
|
1270
|
+
match.group(1)
|
|
1271
|
+
if match.group(1)
|
|
1272
|
+
else "unknown"
|
|
1273
|
+
)
|
|
1274
|
+
else:
|
|
1275
|
+
# Try to extract rating from nested object
|
|
1276
|
+
rating_pattern = rf'"{field}":\s*{{[^}}]*"rating":\s*"([^"]+)"'
|
|
1277
|
+
rating_match = re.search(
|
|
1278
|
+
rating_pattern, json_content
|
|
1279
|
+
)
|
|
1280
|
+
if rating_match:
|
|
1281
|
+
analysis_dict[field] = {
|
|
1282
|
+
"rating": rating_match.group(1),
|
|
1283
|
+
"explanation": "Extracted from partial JSON",
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
if analysis_dict:
|
|
1287
|
+
summary_analysis["analysis"] = analysis_dict
|
|
1288
|
+
self.log.warning(
|
|
1289
|
+
f"PARTIAL RECOVERY - Used fallback field extraction for summary {i}, extracted {len(analysis_dict)} fields"
|
|
1290
|
+
)
|
|
1291
|
+
else:
|
|
1292
|
+
# Final fallback: save raw response for debugging
|
|
1293
|
+
self.log.error(
|
|
1294
|
+
f"FALLBACK VALUES USED - Complete JSON parse failure for summary {i}"
|
|
1295
|
+
)
|
|
1296
|
+
summary_analysis["analysis"] = {
|
|
1297
|
+
"error": f"[FALLBACK - JSON PARSE FAILED] {str(e2)}",
|
|
1298
|
+
"raw_response": response_text[
|
|
1299
|
+
:1000
|
|
1300
|
+
], # First 1000 chars for debugging
|
|
1301
|
+
"_warning": "This is a fallback response - Claude's analysis could not be parsed",
|
|
1302
|
+
"executive_summary_quality": {
|
|
1303
|
+
"rating": "error",
|
|
1304
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1305
|
+
},
|
|
1306
|
+
"detail_completeness": {
|
|
1307
|
+
"rating": "error",
|
|
1308
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1309
|
+
},
|
|
1310
|
+
"action_items_structure": {
|
|
1311
|
+
"rating": "error",
|
|
1312
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1313
|
+
},
|
|
1314
|
+
"key_decisions_clarity": {
|
|
1315
|
+
"rating": "error",
|
|
1316
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1317
|
+
},
|
|
1318
|
+
"participant_information": {
|
|
1319
|
+
"rating": "error",
|
|
1320
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1321
|
+
},
|
|
1322
|
+
"topic_organization": {
|
|
1323
|
+
"rating": "error",
|
|
1324
|
+
"explanation": "[PARSE ERROR - See raw_response]",
|
|
1325
|
+
},
|
|
1326
|
+
}
|
|
1327
|
+
summary_analysis["overall_quality"] = "error"
|
|
1328
|
+
|
|
1329
|
+
# Set overall quality if successfully parsed
|
|
1330
|
+
if "analysis" in summary_analysis and isinstance(
|
|
1331
|
+
summary_analysis["analysis"], dict
|
|
1332
|
+
):
|
|
1333
|
+
summary_analysis["overall_quality"] = summary_analysis[
|
|
1334
|
+
"analysis"
|
|
1335
|
+
].get("overall_quality", "unknown")
|
|
1336
|
+
else:
|
|
1337
|
+
summary_analysis["overall_quality"] = "error"
|
|
1338
|
+
else:
|
|
1339
|
+
summary_analysis["analysis"] = {
|
|
1340
|
+
"error": "No JSON content found in Claude response",
|
|
1341
|
+
"raw_response": response_text[:500],
|
|
1342
|
+
}
|
|
1343
|
+
summary_analysis["overall_quality"] = "error"
|
|
1344
|
+
|
|
1345
|
+
# Add usage and cost
|
|
1346
|
+
summary_analysis["usage"] = usage
|
|
1347
|
+
summary_analysis["cost"] = cost
|
|
1348
|
+
|
|
1349
|
+
# Add timing for this summary
|
|
1350
|
+
summary_time = time.time() - summary_start_time
|
|
1351
|
+
summary_analysis["processing_time_seconds"] = round(summary_time, 3)
|
|
1352
|
+
per_summary_timings.append(summary_time)
|
|
1353
|
+
|
|
1354
|
+
# Accumulate totals
|
|
1355
|
+
total_usage["input_tokens"] += usage.get("input_tokens", 0)
|
|
1356
|
+
total_usage["output_tokens"] += usage.get("output_tokens", 0)
|
|
1357
|
+
total_usage["total_tokens"] += usage.get("total_tokens", 0)
|
|
1358
|
+
total_cost["input_cost"] += cost.get("input_cost", 0.0)
|
|
1359
|
+
total_cost["output_cost"] += cost.get("output_cost", 0.0)
|
|
1360
|
+
total_cost["total_cost"] += cost.get("total_cost", 0.0)
|
|
1361
|
+
|
|
1362
|
+
except Exception as e:
|
|
1363
|
+
self.log.error(f"Error analyzing summary {i}: {e}")
|
|
1364
|
+
summary_analysis["analysis"] = {"error": str(e)}
|
|
1365
|
+
summary_analysis["overall_quality"] = "error"
|
|
1366
|
+
|
|
1367
|
+
# Add timing even for errors
|
|
1368
|
+
summary_time = time.time() - summary_start_time
|
|
1369
|
+
summary_analysis["processing_time_seconds"] = round(summary_time, 3)
|
|
1370
|
+
per_summary_timings.append(summary_time)
|
|
1371
|
+
|
|
1372
|
+
analysis["per_question"].append(summary_analysis)
|
|
1373
|
+
|
|
1374
|
+
# Write intermediate result immediately for crash recovery
|
|
1375
|
+
if intermediate_dir:
|
|
1376
|
+
try:
|
|
1377
|
+
intermediate_file = (
|
|
1378
|
+
intermediate_dir / f"summary_{i+1:04d}_analysis.json"
|
|
1379
|
+
)
|
|
1380
|
+
intermediate_data = {
|
|
1381
|
+
"summary_index": i,
|
|
1382
|
+
"experiment_name": experiment_name,
|
|
1383
|
+
"source_file": summary_result.get("source_file", ""),
|
|
1384
|
+
"analysis": summary_analysis,
|
|
1385
|
+
"usage": summary_analysis.get("usage", {}),
|
|
1386
|
+
"cost": summary_analysis.get("cost", {}),
|
|
1387
|
+
"processing_time_seconds": summary_analysis.get(
|
|
1388
|
+
"processing_time_seconds", 0
|
|
1389
|
+
),
|
|
1390
|
+
"timestamp": datetime.now().isoformat(),
|
|
1391
|
+
}
|
|
1392
|
+
|
|
1393
|
+
with open(intermediate_file, "w", encoding="utf-8") as f:
|
|
1394
|
+
json.dump(intermediate_data, f, indent=2)
|
|
1395
|
+
|
|
1396
|
+
# Update progress file
|
|
1397
|
+
progress_file = intermediate_dir / "analysis_progress.json"
|
|
1398
|
+
progress_data = {
|
|
1399
|
+
"experiment_name": experiment_name,
|
|
1400
|
+
"total_summaries": len(summarization_results),
|
|
1401
|
+
"completed_summaries": i + 1,
|
|
1402
|
+
"progress_percent": round(
|
|
1403
|
+
(i + 1) / len(summarization_results) * 100, 1
|
|
1404
|
+
),
|
|
1405
|
+
"total_usage": total_usage.copy(),
|
|
1406
|
+
"total_cost": total_cost.copy(),
|
|
1407
|
+
"last_updated": datetime.now().isoformat(),
|
|
1408
|
+
"estimated_remaining_time": None,
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
# Calculate estimated remaining time
|
|
1412
|
+
if i > 0:
|
|
1413
|
+
avg_time_per_summary = sum(per_summary_timings) / len(
|
|
1414
|
+
per_summary_timings
|
|
1415
|
+
)
|
|
1416
|
+
remaining_summaries = len(summarization_results) - (i + 1)
|
|
1417
|
+
estimated_remaining = (
|
|
1418
|
+
remaining_summaries * avg_time_per_summary
|
|
1419
|
+
)
|
|
1420
|
+
progress_data["estimated_remaining_time"] = round(
|
|
1421
|
+
estimated_remaining, 1
|
|
1422
|
+
)
|
|
1423
|
+
|
|
1424
|
+
with open(progress_file, "w", encoding="utf-8") as f:
|
|
1425
|
+
json.dump(progress_data, f, indent=2)
|
|
1426
|
+
|
|
1427
|
+
self.log.info(
|
|
1428
|
+
f"Analysis progress: {i+1}/{len(summarization_results)} summaries completed ({progress_data['progress_percent']}%)"
|
|
1429
|
+
)
|
|
1430
|
+
|
|
1431
|
+
except Exception as e:
|
|
1432
|
+
self.log.warning(
|
|
1433
|
+
f"Failed to write intermediate analysis result {i+1}: {e}"
|
|
1434
|
+
)
|
|
1435
|
+
|
|
1436
|
+
# Generate overall analysis
|
|
1437
|
+
quality_ratings = [
|
|
1438
|
+
s.get("overall_quality", "unknown") for s in analysis["per_question"]
|
|
1439
|
+
]
|
|
1440
|
+
|
|
1441
|
+
# Filter out error and unknown ratings for scoring
|
|
1442
|
+
valid_quality_ratings = [
|
|
1443
|
+
rating
|
|
1444
|
+
for rating in quality_ratings
|
|
1445
|
+
if rating in ["excellent", "good", "fair", "poor"]
|
|
1446
|
+
]
|
|
1447
|
+
|
|
1448
|
+
excellent_count = valid_quality_ratings.count("excellent")
|
|
1449
|
+
good_count = valid_quality_ratings.count("good")
|
|
1450
|
+
fair_count = valid_quality_ratings.count("fair")
|
|
1451
|
+
poor_count = valid_quality_ratings.count("poor")
|
|
1452
|
+
total_summaries = len(valid_quality_ratings)
|
|
1453
|
+
error_count = quality_ratings.count("error")
|
|
1454
|
+
|
|
1455
|
+
# Log information about errors if any
|
|
1456
|
+
if error_count > 0:
|
|
1457
|
+
self.log.warning(
|
|
1458
|
+
f"Excluded {error_count} error entries from quality scoring"
|
|
1459
|
+
)
|
|
1460
|
+
|
|
1461
|
+
# Handle case where no valid summaries are available for scoring
|
|
1462
|
+
if total_summaries == 0:
|
|
1463
|
+
if error_count > 0:
|
|
1464
|
+
self.log.error(
|
|
1465
|
+
"All summaries failed analysis - cannot compute quality score"
|
|
1466
|
+
)
|
|
1467
|
+
overall_rating = "error"
|
|
1468
|
+
else:
|
|
1469
|
+
self.log.warning("No summaries found for analysis")
|
|
1470
|
+
overall_rating = "unknown"
|
|
1471
|
+
elif excellent_count >= total_summaries * 0.7:
|
|
1472
|
+
overall_rating = "excellent"
|
|
1473
|
+
elif (excellent_count + good_count) >= total_summaries * 0.7:
|
|
1474
|
+
overall_rating = "good"
|
|
1475
|
+
elif (excellent_count + good_count + fair_count) >= total_summaries * 0.7:
|
|
1476
|
+
overall_rating = "fair"
|
|
1477
|
+
else:
|
|
1478
|
+
overall_rating = "poor"
|
|
1479
|
+
|
|
1480
|
+
# Send individual analyses to Claude for comprehensive overall analysis
|
|
1481
|
+
overall_start_time = time.time()
|
|
1482
|
+
|
|
1483
|
+
# Get experiment/model information
|
|
1484
|
+
experiment_name = results.get("metadata", {}).get(
|
|
1485
|
+
"experiment_name", "Unknown Model"
|
|
1486
|
+
)
|
|
1487
|
+
model_type = results.get("metadata", {}).get("model", "")
|
|
1488
|
+
|
|
1489
|
+
overall_prompt = f"""
|
|
1490
|
+
Review these summarization test results and provide a comprehensive overall analysis.
|
|
1491
|
+
|
|
1492
|
+
Model/Experiment: {experiment_name}
|
|
1493
|
+
Number of summaries analyzed: {total_summaries}
|
|
1494
|
+
Quality distribution:
|
|
1495
|
+
- Excellent: {excellent_count} ({excellent_count/total_summaries*100:.1f}%)
|
|
1496
|
+
- Good: {good_count} ({good_count/total_summaries*100:.1f}%)
|
|
1497
|
+
- Fair: {fair_count} ({fair_count/total_summaries*100:.1f}%)
|
|
1498
|
+
- Poor: {poor_count} ({poor_count/total_summaries*100:.1f}%)
|
|
1499
|
+
|
|
1500
|
+
Overall quality rating: {overall_rating}
|
|
1501
|
+
|
|
1502
|
+
Individual summary analyses: {json.dumps(analysis['per_question'], indent=2)}
|
|
1503
|
+
|
|
1504
|
+
Based on the detailed analysis of each summary above, provide a comprehensive assessment including:
|
|
1505
|
+
|
|
1506
|
+
1. Overall Analysis: General assessment of the summarization system's performance
|
|
1507
|
+
2. Strengths: Specific aspects the model does well (be specific based on the individual analyses)
|
|
1508
|
+
3. Weaknesses: Concrete areas needing improvement (based on patterns in the individual analyses)
|
|
1509
|
+
4. Recommendations: Actionable suggestions for improvement
|
|
1510
|
+
5. Use Case Fit: Types of meetings/content this model handles well or poorly
|
|
1511
|
+
|
|
1512
|
+
Consider the following in your analysis:
|
|
1513
|
+
- Patterns in accuracy, completeness, organization across summaries
|
|
1514
|
+
- Consistency of performance
|
|
1515
|
+
- Specific failure modes observed
|
|
1516
|
+
- Model characteristics (e.g., if it's Claude, Llama, Qwen, etc.)
|
|
1517
|
+
|
|
1518
|
+
IMPORTANT: Return ONLY valid JSON with no additional text, markdown formatting, or explanations.
|
|
1519
|
+
Ensure all JSON syntax is correct - no trailing commas, proper quotes, and complete structure.
|
|
1520
|
+
|
|
1521
|
+
Return your analysis in this exact JSON format:
|
|
1522
|
+
{{
|
|
1523
|
+
"overall_analysis": "comprehensive assessment of overall performance",
|
|
1524
|
+
"strengths": ["specific strength 1", "specific strength 2", ...],
|
|
1525
|
+
"weaknesses": ["specific weakness 1", "specific weakness 2", ...],
|
|
1526
|
+
"recommendations": ["actionable recommendation 1", "actionable recommendation 2", ...],
|
|
1527
|
+
"use_case_fit": "detailed analysis of suitable use cases and limitations"
|
|
1528
|
+
}}
|
|
1529
|
+
"""
|
|
1530
|
+
|
|
1531
|
+
            try:
                overall_response_data = self.claude.get_completion_with_usage(overall_prompt)

                # Extract JSON from overall response
                overall_response = overall_response_data["content"]
                overall_usage = overall_response_data["usage"]
                overall_cost = overall_response_data["cost"]

                if isinstance(overall_response, list):
                    response_text = (
                        overall_response[0].text
                        if hasattr(overall_response[0], "text")
                        else str(overall_response[0])
                    )
                else:
                    response_text = (
                        overall_response.text
                        if hasattr(overall_response, "text")
                        else str(overall_response)
                    )

                # Try to extract JSON from various formats (markdown, plain, etc.)
                # First try to extract from markdown code blocks
                markdown_json = re.search(
                    r"```(?:json)?\s*(\{.*?\})\s*```", response_text, re.DOTALL
                )
                if markdown_json:
                    json_content = markdown_json.group(1)
                    json_found = True
                else:
                    # Fall back to finding raw JSON
                    json_start = response_text.find("{")
                    json_end = response_text.rfind("}") + 1
                    if json_start >= 0 and json_end > json_start:
                        json_content = response_text[json_start:json_end]
                        json_found = True
                    else:
                        json_found = False

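                # Illustrative example (added commentary, not in the original source):
                # a reply such as
                #   Here is the analysis:
                #   ```json
                #   {"overall_analysis": "...", "strengths": [], ...}
                #   ```
                # is handled by the fenced-block regex above, while a bare
                # "{...}" reply falls through to the find("{") / rfind("}") slice.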
                if json_found:
                    try:
                        # First attempt: direct JSON parsing
                        claude_analysis = json.loads(json_content)
                    except json.JSONDecodeError as e:
                        self.log.warning(
                            f"Initial JSON parse failed for overall analysis: {e}"
                        )
                        # Second attempt: clean up common issues
                        # Remove trailing commas before closing braces/brackets
                        cleaned_json = re.sub(r",\s*([}\]])", r"\1", json_content)
                        # Replace single quotes with double quotes (if any) - simple heuristic
                        cleaned_json = cleaned_json.replace("'", '"')
                        # Remove control characters except newlines and tabs
                        cleaned_json = re.sub(
                            r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", cleaned_json
                        )
                        # Fix common escape issues
                        cleaned_json = cleaned_json.replace(
                            '\\"', '"'
                        )  # Remove escaped quotes that might be double-escaped
                        cleaned_json = re.sub(
                            r'(?<!\\)\\(?!["\\/bfnrt])', r"\\\\", cleaned_json
                        )  # Fix unescaped backslashes

                        try:
                            claude_analysis = json.loads(cleaned_json)
                            self.log.info(
                                "Successfully parsed overall analysis JSON after cleanup"
                            )
                        except json.JSONDecodeError as e2:
                            self.log.error(
                                f"FALLBACK VALUES USED - Failed to parse Claude's overall analysis response after cleanup: {e2}"
                            )
                            self.log.error(
                                f"Raw response preview: {json_content[:500]}..."
                            )
                            # Use fallback values - CLEARLY MARKED
                            claude_analysis = {
                                "overall_analysis": f"[FALLBACK - JSON PARSE ERROR] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor. NOTE: Claude's detailed analysis could not be parsed due to malformed JSON response.",
                                "strengths": [
                                    "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]"
                                ],
                                "weaknesses": [
                                    "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]"
                                ],
                                "recommendations": [
                                    "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]",
                                    "Review raw Claude response in logs for actual recommendations",
                                ],
                                "use_case_fit": "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]",
                                "_warning": "These are fallback values - Claude's actual analysis failed to parse. Check logs for details.",
                            }

                    # Add Claude's analysis to our results
                    overall_analysis_text = claude_analysis.get(
                        "overall_analysis",
                        f"Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor.",
                    )
                    strengths = claude_analysis.get(
                        "strengths", ["Summary generation completed"]
                    )
                    weaknesses = claude_analysis.get(
                        "weaknesses", ["Areas for improvement identified"]
                    )
                    recommendations = claude_analysis.get(
                        "recommendations", ["Continue monitoring performance"]
                    )
                    use_case_fit = claude_analysis.get(
                        "use_case_fit", "Suitable for meeting summarization"
                    )

                    # Track Claude API usage for overall analysis
                    analysis["overall_usage"] = overall_usage
                    analysis["overall_cost"] = overall_cost
                    analysis["overall_processing_time_seconds"] = round(
                        time.time() - overall_start_time, 3
                    )
                else:
                    self.log.error(
                        "FALLBACK VALUES USED - No JSON content found in Claude's overall analysis response"
                    )
                    self.log.error(
                        f"Raw response preview (first 1000 chars): {response_text[:1000]}"
                    )
                    # Save the full response to a debug file
                    debug_dir = Path("debug_claude_responses")
                    debug_dir.mkdir(exist_ok=True)
                    debug_file = (
                        debug_dir
                        / f"overall_analysis_no_json_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
                    )
                    with open(debug_file, "w", encoding="utf-8") as f:
                        f.write(
                            f"No JSON found in response. JSON start: {json_start}, JSON end: {json_end}\n"
                        )
                        f.write(f"Full response:\n{response_text}")
                    self.log.error(f"Full response saved to: {debug_file}")
                    # Fall back to programmatic analysis - CLEARLY MARKED
                    overall_analysis_text = f"[FALLBACK - NO JSON FOUND] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor. NOTE: Claude's response contained no parseable JSON."
                    strengths = [
                        "[FALLBACK VALUE - Claude response had no JSON content]"
                    ]
                    weaknesses = [
                        "[FALLBACK VALUE - Manual review of logs required to see actual Claude response]"
                    ]
                    recommendations = [
                        "[FALLBACK VALUE - Check logs for Claude's actual response]"
                    ]
                    use_case_fit = "[FALLBACK VALUE - Claude's analysis not available]"

            except Exception as e:
                self.log.error(
                    f"FALLBACK VALUES USED - Exception during Claude overall analysis: {e}"
                )
                # Fall back to basic programmatic analysis if Claude fails - CLEARLY MARKED
                overall_analysis_text = f"[FALLBACK - EXCEPTION: {str(e)[:100]}] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor."
                strengths = [f"[FALLBACK VALUE - Claude API error: {str(e)[:100]}]"]
                weaknesses = ["[FALLBACK VALUE - Analysis failed due to API error]"]
                recommendations = [
                    "[FALLBACK VALUE - Check API connectivity and retry]"
                ]

                # Basic programmatic fallback analysis
                if excellent_count > 0:
                    strengths.append(
                        f"Achieved excellent quality in {excellent_count}/{total_summaries} summaries"
                    )
                if good_count > 0:
                    strengths.append(
                        f"Produced good quality summaries in {good_count}/{total_summaries} cases"
                    )

                if poor_count > 0:
                    weaknesses.append(
                        f"Generated poor quality summaries in {poor_count}/{total_summaries} cases"
                    )
                if excellent_count == 0:
                    weaknesses.append("No summaries achieved excellent quality rating")

                if poor_count > 0 or fair_count > total_summaries * 0.3:
                    recommendations.append("Review and improve prompt engineering")
                if excellent_count == 0:
                    recommendations.append("Consider using a more capable model")

                if not strengths:
                    strengths = ["Summary generation completed"]
                if not weaknesses:
                    weaknesses = ["Some areas for improvement"]
                if not recommendations:
                    recommendations = ["Continue monitoring performance"]

                use_case_fit = "Suitable for meeting summarization with review"

            analysis.update(
                {
                    "overall_analysis": overall_analysis_text,
                    "strengths": strengths,
                    "weaknesses": weaknesses,
                    "recommendations": recommendations,
                    "use_case_fit": use_case_fit,
                    "overall_rating": {
                        "rating": overall_rating,
                        "explanation": f"Based on {total_summaries} valid summaries with {excellent_count + good_count} high-quality results"
                        + (
                            f" ({error_count} errors excluded)"
                            if error_count > 0
                            else ""
                        ),
                        "metrics": {
                            "total_summaries": total_summaries,
                            "excellent_count": excellent_count,
                            "good_count": good_count,
                            "fair_count": fair_count,
                            "poor_count": poor_count,
                            "error_count": error_count,
                            "quality_score": (
                                (
                                    (
                                        excellent_count * 4
                                        + good_count * 3
                                        + fair_count * 2
                                        + poor_count * 1
                                    )
                                    / total_summaries
                                    - 1  # Convert from the 1-4 scale to a 0-3 scale
                                )
                                * 100
                                / 3  # Convert to a percentage (0-100%)
                                if total_summaries > 0
                                else None  # Return None instead of 0 when there are no valid summaries
                            ),
                        },
                    },
                }
            )

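            # Worked example (added for clarity, not in the original source): with
            # 2 excellent, 1 good, 1 fair and 0 poor summaries the weighted mean is
            # (2*4 + 1*3 + 1*2) / 4 = 3.25, so quality_score = (3.25 - 1) * 100 / 3
            # = 75.0, i.e. the 1-4 rating scale is rescaled to a 0-100% score.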
            # Add overall analysis costs to totals if available
            if "overall_usage" in analysis and "overall_cost" in analysis:
                total_usage["input_tokens"] += analysis["overall_usage"].get(
                    "input_tokens", 0
                )
                total_usage["output_tokens"] += analysis["overall_usage"].get(
                    "output_tokens", 0
                )
                total_usage["total_tokens"] += analysis["overall_usage"].get(
                    "total_tokens", 0
                )
                total_cost["input_cost"] += analysis["overall_cost"].get(
                    "input_cost", 0.0
                )
                total_cost["output_cost"] += analysis["overall_cost"].get(
                    "output_cost", 0.0
                )
                total_cost["total_cost"] += analysis["overall_cost"].get(
                    "total_cost", 0.0
                )

            # Update with final totals
            analysis["total_usage"] = total_usage
            analysis["total_cost"] = total_cost

            # Add comprehensive timing information
            total_time = time.time() - analysis_start_time
            analysis["timing"] = {
                "total_processing_time_seconds": round(total_time, 3),
                "per_summary_times_seconds": [round(t, 3) for t in per_summary_timings],
                "average_per_summary_seconds": (
                    round(np.mean(per_summary_timings), 3) if per_summary_timings else 0
                ),
                "max_per_summary_seconds": (
                    round(max(per_summary_timings), 3) if per_summary_timings else 0
                ),
                "min_per_summary_seconds": (
                    round(min(per_summary_timings), 3) if per_summary_timings else 0
                ),
            }

            # Clean up intermediate files after successful completion
            if intermediate_dir and intermediate_dir.exists():
                try:
                    import shutil

                    shutil.rmtree(intermediate_dir)
                    self.log.info(
                        f"Cleaned up intermediate analysis files from: {intermediate_dir}"
                    )
                except Exception as e:
                    self.log.warning(
                        f"Failed to clean up intermediate directory {intermediate_dir}: {e}"
                    )

            return analysis

        except Exception as e:
            self.log.error(f"Error in summarization analysis: {e}")
            return {
                "overall_analysis": f"Summarization analysis failed: {str(e)}",
                "strengths": [],
                "weaknesses": ["Analysis failed to complete"],
                "recommendations": ["Check logs for error details"],
                "use_case_fit": "",
                "per_question": [],
                "overall_rating": {"rating": "error", "explanation": str(e)},
            }

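    # Orientation note (added commentary, not in the original source): the
    # analysis dict returned above combines the per-summary results with
    # "overall_analysis", "strengths", "weaknesses", "recommendations",
    # "use_case_fit", "overall_rating", "total_usage", "total_cost" and
    # "timing"; the error path instead returns the reduced dict shown directly
    # above with rating "error".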
    def generate_enhanced_report(
        self,
        results_path: str,
        output_dir: Optional[str] = None,
        groundtruth_path: Optional[str] = None,
        base_experiment_dir: Optional[str] = None,
    ) -> Dict:
        """
        Generate a detailed evaluation report including Claude's analysis.

        Args:
            results_path: Path to the results JSON file
            output_dir: Optional directory to save the report. If None, only returns the data.
            groundtruth_path: Optional path to a groundtruth file for comparison (especially for summarization)
            base_experiment_dir: Optional base experiment directory used to mirror the directory hierarchy in the output

        Returns:
            The evaluation data dict (also written to disk when output_dir is given)
        """
        # Start timing
        report_start_time = time.time()

        try:
            if output_dir:
                output_path = Path(output_dir)
                output_path.mkdir(parents=True, exist_ok=True)

            # Get Claude analysis
            claude_analysis = self.analyze_with_claude(results_path, groundtruth_path)

            # Calculate total report generation time
            report_generation_time = time.time() - report_start_time

            # Load experiment results to extract tested model info
            with open(results_path, "r", encoding="utf-8") as f:
                experiment_results = json.load(f)

            # Extract tested model info from experiment results
            experiment_metadata = experiment_results.get("metadata", {})
            tested_model = experiment_metadata.get("model", "unknown")
            tested_model_type = experiment_metadata.get("llm_type", "unknown")
            inference_type = experiment_metadata.get("inference_type", "unknown")

            # Create evaluation data without depending on threshold_metrics
            evaluation_data = {
                "metadata": {
                    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "evaluator_model": self.claude.model,  # Model doing the evaluation
                    "tested_model": tested_model,  # The model being evaluated
                    "tested_model_type": tested_model_type,  # Provider (lemonade, anthropic, etc.)
                    "tested_model_inference": inference_type,  # local or cloud
                    "original_results_file": str(results_path),
                    "groundtruth_file": (
                        str(groundtruth_path) if groundtruth_path else None
                    ),
                    "report_generation_time_seconds": round(report_generation_time, 3),
                },
                **claude_analysis,
            }

            if output_dir:
                results_path_obj = Path(results_path)
                results_filename = results_path_obj.name

                # Preserve directory hierarchy if base_experiment_dir is provided
                if base_experiment_dir:
                    base_exp_path = Path(base_experiment_dir)
                    try:
                        # Calculate relative path from base experiment directory
                        relative_path = results_path_obj.relative_to(base_exp_path)
                        # Create the same directory structure in output
                        eval_subdir = output_path / relative_path.parent
                        eval_subdir.mkdir(parents=True, exist_ok=True)
                        json_path = eval_subdir / f"{results_path_obj.stem}.eval.json"
                    except ValueError:
                        # If results_path is not relative to base_experiment_dir, use a flat structure
                        json_path = output_path / f"{results_path_obj.stem}.eval.json"
                else:
                    # Flat structure (original behavior)
                    json_path = output_path / f"{results_path_obj.stem}.eval.json"

                with open(json_path, "w") as f:
                    json.dump(evaluation_data, f, indent=2)
                self.log.info(f"Evaluation data saved to: {json_path}")

            return evaluation_data

        except Exception as e:
            self.log.error(f"Error during evaluation: {str(e)}")
            raise

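    # Usage sketch (illustrative, not part of the original source): given results
    # at "out/claude/run1.experiment.json" and base_experiment_dir="out", calling
    # generate_enhanced_report(..., output_dir="eval") would write
    # "eval/claude/run1.experiment.eval.json", mirroring the experiment layout.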
    def create_template(
        self,
        groundtruth_file: str,
        output_dir: str = "./output/templates",
        similarity_threshold: float = 0.7,
    ) -> str:
        """
        Create a template results file from ground truth data for manual RAG evaluation.

        Args:
            groundtruth_file: Path to the ground truth JSON file
            output_dir: Directory to save the template file
            similarity_threshold: Similarity threshold for evaluation

        Returns:
            Path to the created template file
        """
        try:
            # Load ground truth data
            with open(groundtruth_file, "r", encoding="utf-8") as f:
                groundtruth_data = json.load(f)

            # Extract QA pairs from ground truth
            qa_pairs = groundtruth_data.get("analysis", {}).get("qa_pairs", [])
            if not qa_pairs:
                raise ValueError("No QA pairs found in ground truth file")

            # Create template structure
            template_data = {
                "metadata": {
                    "test_file": groundtruth_file,
                    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "similarity_threshold": similarity_threshold,
                    "instructions": "Fill in the 'response' fields with your RAG system outputs, then evaluate using gaia eval",
                },
                "analysis": {"qa_results": []},
            }

            # Convert QA pairs to the result template format
            for i, qa_pair in enumerate(qa_pairs):
                result_entry = {
                    "query": qa_pair.get("question", qa_pair.get("query", "")),
                    "ground_truth": qa_pair.get(
                        "answer",
                        qa_pair.get("response", qa_pair.get("ground_truth", "")),
                    ),
                    "response": f"[FILL IN YOUR RAG SYSTEM RESPONSE FOR QUESTION {i+1}]",
                }
                template_data["analysis"]["qa_results"].append(result_entry)

            # Create output directory
            output_path = Path(output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            # Generate output filename
            groundtruth_filename = Path(groundtruth_file).stem
            if groundtruth_filename.endswith(".groundtruth"):
                base_name = groundtruth_filename[:-12]  # Remove '.groundtruth'
            else:
                base_name = groundtruth_filename

            template_filename = f"{base_name}.template.json"
            template_path = output_path / template_filename

            # Save template file
            with open(template_path, "w", encoding="utf-8") as f:
                json.dump(template_data, f, indent=2, ensure_ascii=False)

            self.log.info(f"Created template with {len(qa_pairs)} questions")
            return str(template_path)

        except Exception as e:
            self.log.error(f"Error creating template: {e}")
            raise

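    # Illustrative template output (example values, not from the original source):
    # a groundtruth file named "meeting.groundtruth.json" with one QA pair would
    # produce "./output/templates/meeting.template.json" containing roughly:
    #   {"metadata": {"test_file": "...", "similarity_threshold": 0.7, ...},
    #    "analysis": {"qa_results": [{"query": "...", "ground_truth": "...",
    #                 "response": "[FILL IN YOUR RAG SYSTEM RESPONSE FOR QUESTION 1]"}]}}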
def create_consolidated_evaluation_report(
|
|
2001
|
+
self, evaluation_files: List[str], output_dir: str, base_experiment_dir: str
|
|
2002
|
+
) -> str:
|
|
2003
|
+
"""Create a consolidated report of all evaluations."""
|
|
2004
|
+
from datetime import datetime
|
|
2005
|
+
|
|
2006
|
+
output_base_path = Path(output_dir)
|
|
2007
|
+
|
|
2008
|
+
# Load all evaluation results
|
|
2009
|
+
all_evaluations = []
|
|
2010
|
+
total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
|
|
2011
|
+
total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
|
|
2012
|
+
|
|
2013
|
+
for eval_file in evaluation_files:
|
|
2014
|
+
# Find the actual evaluation file (could be in subdirectory)
|
|
2015
|
+
eval_paths = list(output_base_path.rglob(eval_file))
|
|
2016
|
+
if not eval_paths:
|
|
2017
|
+
self.log.warning(f"Evaluation file not found: {eval_file}")
|
|
2018
|
+
continue
|
|
2019
|
+
|
|
2020
|
+
eval_path = eval_paths[0] # Take first match
|
|
2021
|
+
|
|
2022
|
+
try:
|
|
2023
|
+
with open(eval_path, "r", encoding="utf-8") as f:
|
|
2024
|
+
evaluation_data = json.load(f)
|
|
2025
|
+
|
|
2026
|
+
# For consolidated report, only include summary statistics
|
|
2027
|
+
metadata = evaluation_data.get("metadata", {})
|
|
2028
|
+
eval_info = {
|
|
2029
|
+
"experiment_name": eval_path.stem.replace(".eval", ""),
|
|
2030
|
+
"file_path": str(eval_path.relative_to(output_base_path)),
|
|
2031
|
+
"timestamp": metadata.get("timestamp", ""),
|
|
2032
|
+
"evaluator_model": metadata.get(
|
|
2033
|
+
"evaluator_model", ""
|
|
2034
|
+
), # Model doing the evaluation
|
|
2035
|
+
"tested_model": metadata.get(
|
|
2036
|
+
"tested_model", "unknown"
|
|
2037
|
+
), # Model being tested
|
|
2038
|
+
"tested_model_type": metadata.get(
|
|
2039
|
+
"tested_model_type", "unknown"
|
|
2040
|
+
), # Provider
|
|
2041
|
+
"tested_model_inference": metadata.get(
|
|
2042
|
+
"tested_model_inference", "unknown"
|
|
2043
|
+
), # Local/cloud
|
|
2044
|
+
"overall_rating": evaluation_data.get("overall_rating", {}),
|
|
2045
|
+
"original_results_file": metadata.get("original_results_file", ""),
|
|
2046
|
+
"usage": evaluation_data.get("total_usage", {}),
|
|
2047
|
+
"cost": evaluation_data.get(
|
|
2048
|
+
"total_cost", {}
|
|
2049
|
+
), # This is evaluation cost
|
|
2050
|
+
}
|
|
2051
|
+
|
|
2052
|
+
# Load the corresponding experiment file to get inference cost
|
|
2053
|
+
experiment_name = eval_path.stem.replace(".experiment.eval", "")
|
|
2054
|
+
|
|
2055
|
+
# Preserve the subdirectory structure when looking for experiment file
|
|
2056
|
+
relative_eval_path = eval_path.relative_to(output_base_path)
|
|
2057
|
+
relative_dir = relative_eval_path.parent
|
|
2058
|
+
|
|
2059
|
+
experiment_file = (
|
|
2060
|
+
Path(base_experiment_dir)
|
|
2061
|
+
/ relative_dir
|
|
2062
|
+
/ f"{experiment_name}.experiment.json"
|
|
2063
|
+
)
|
|
2064
|
+
|
|
2065
|
+
if experiment_file.exists():
|
|
2066
|
+
try:
|
|
2067
|
+
with open(experiment_file, "r", encoding="utf-8") as f:
|
|
2068
|
+
experiment_data = json.load(f)
|
|
2069
|
+
# Add inference cost from experiment file
|
|
2070
|
+
eval_info["inference_cost"] = experiment_data.get(
|
|
2071
|
+
"metadata", {}
|
|
2072
|
+
).get("total_cost", {})
|
|
2073
|
+
eval_info["inference_usage"] = experiment_data.get(
|
|
2074
|
+
"metadata", {}
|
|
2075
|
+
).get("total_usage", {})
|
|
2076
|
+
eval_info["inference_type"] = experiment_data.get(
|
|
2077
|
+
"metadata", {}
|
|
2078
|
+
).get("inference_type", "unknown")
|
|
2079
|
+
except Exception as e:
|
|
2080
|
+
self.log.warning(
|
|
2081
|
+
f"Could not load experiment file {experiment_file}: {e}"
|
|
2082
|
+
)
|
|
2083
|
+
# Set default values for missing experiment data
|
|
2084
|
+
eval_info["inference_cost"] = {
|
|
2085
|
+
"input_cost": 0.0,
|
|
2086
|
+
"output_cost": 0.0,
|
|
2087
|
+
"total_cost": 0.0,
|
|
2088
|
+
}
|
|
2089
|
+
eval_info["inference_usage"] = {
|
|
2090
|
+
"input_tokens": 0,
|
|
2091
|
+
"output_tokens": 0,
|
|
2092
|
+
"total_tokens": 0,
|
|
2093
|
+
}
|
|
2094
|
+
eval_info["inference_type"] = "unknown"
|
|
2095
|
+
else:
|
|
2096
|
+
self.log.warning(f"Experiment file not found: {experiment_file}")
|
|
2097
|
+
# Set default values for missing experiment data
|
|
2098
|
+
eval_info["inference_cost"] = {
|
|
2099
|
+
"input_cost": 0.0,
|
|
2100
|
+
"output_cost": 0.0,
|
|
2101
|
+
"total_cost": 0.0,
|
|
2102
|
+
}
|
|
2103
|
+
eval_info["inference_usage"] = {
|
|
2104
|
+
"input_tokens": 0,
|
|
2105
|
+
"output_tokens": 0,
|
|
2106
|
+
"total_tokens": 0,
|
|
2107
|
+
}
|
|
2108
|
+
eval_info["inference_type"] = "unknown"
|
|
2109
|
+
|
|
2110
|
+
# Extract aspect summary if available (aggregate only)
|
|
2111
|
+
if evaluation_data.get("per_question"):
|
|
2112
|
+
aspect_summary = {}
|
|
2113
|
+
# Define the aspects we want to extract (matching visualization expectations)
|
|
2114
|
+
# Map old aspect names to new ones for backwards compatibility
|
|
2115
|
+
aspect_mapping = {
|
|
2116
|
+
# Old names -> New names
|
|
2117
|
+
"executive_summary_accuracy": "executive_summary_quality",
|
|
2118
|
+
"completeness": "detail_completeness",
|
|
2119
|
+
"action_items_accuracy": "action_items_structure",
|
|
2120
|
+
"key_decisions_accuracy": "key_decisions_clarity",
|
|
2121
|
+
"participant_identification": "participant_information",
|
|
2122
|
+
"topic_coverage": "topic_organization",
|
|
2123
|
+
# New names (map to themselves)
|
|
2124
|
+
"executive_summary_quality": "executive_summary_quality",
|
|
2125
|
+
"detail_completeness": "detail_completeness",
|
|
2126
|
+
"action_items_structure": "action_items_structure",
|
|
2127
|
+
"key_decisions_clarity": "key_decisions_clarity",
|
|
2128
|
+
"participant_information": "participant_information",
|
|
2129
|
+
"topic_organization": "topic_organization",
|
|
2130
|
+
}
|
|
2131
|
+
|
|
2132
|
+
aspects = [
|
|
2133
|
+
"executive_summary_quality",
|
|
2134
|
+
"detail_completeness",
|
|
2135
|
+
"action_items_structure",
|
|
2136
|
+
"key_decisions_clarity",
|
|
2137
|
+
"participant_information",
|
|
2138
|
+
"topic_organization",
|
|
2139
|
+
]
|
|
2140
|
+
|
|
2141
|
+
for aspect in aspects:
|
|
2142
|
+
aspect_ratings = []
|
|
2143
|
+
for question in evaluation_data.get("per_question", []):
|
|
2144
|
+
analysis = question.get("analysis", {})
|
|
2145
|
+
# Check for the aspect using both old and new names
|
|
2146
|
+
for old_name, new_name in aspect_mapping.items():
|
|
2147
|
+
if new_name == aspect and old_name in analysis:
|
|
2148
|
+
rating = analysis[old_name].get("rating")
|
|
2149
|
+
if rating:
|
|
2150
|
+
aspect_ratings.append(rating)
|
|
2151
|
+
break
|
|
2152
|
+
|
|
2153
|
+
if aspect_ratings:
|
|
2154
|
+
# Count occurrences of each rating
|
|
2155
|
+
rating_counts = {}
|
|
2156
|
+
for rating in aspect_ratings:
|
|
2157
|
+
rating_counts[rating] = rating_counts.get(rating, 0) + 1
|
|
2158
|
+
|
|
2159
|
+
# Find most common rating
|
|
2160
|
+
most_common = max(rating_counts.items(), key=lambda x: x[1])
|
|
2161
|
+
aspect_summary[aspect] = {
|
|
2162
|
+
"most_common_rating": most_common[0],
|
|
2163
|
+
"rating_distribution": rating_counts,
|
|
2164
|
+
}
|
|
2165
|
+
|
|
2166
|
+
if aspect_summary:
|
|
2167
|
+
eval_info["aspect_summary"] = aspect_summary
|
|
2168
|
+
|
|
2169
|
+
# Include timing summary if available
|
|
2170
|
+
if evaluation_data.get("timing"):
|
|
2171
|
+
eval_info["avg_processing_time_seconds"] = evaluation_data[
|
|
2172
|
+
"timing"
|
|
2173
|
+
].get(
|
|
2174
|
+
"average_per_summary_seconds",
|
|
2175
|
+
evaluation_data["timing"].get(
|
|
2176
|
+
"total_processing_time_seconds", 0
|
|
2177
|
+
),
|
|
2178
|
+
)
|
|
2179
|
+
|
|
2180
|
+
all_evaluations.append(eval_info)
|
|
2181
|
+
|
|
2182
|
+
# Accumulate totals
|
|
2183
|
+
usage = evaluation_data.get("total_usage", {})
|
|
2184
|
+
for key in total_usage:
|
|
2185
|
+
total_usage[key] += usage.get(key, 0)
|
|
2186
|
+
|
|
2187
|
+
cost = evaluation_data.get("total_cost", {})
|
|
2188
|
+
for key in total_cost:
|
|
2189
|
+
total_cost[key] += cost.get(key, 0.0)
|
|
2190
|
+
|
|
2191
|
+
except Exception as e:
|
|
2192
|
+
self.log.error(f"Error loading evaluation file {eval_path}: {e}")
|
|
2193
|
+
continue
|
|
2194
|
+
|
|
2195
|
+
# Create consolidated report with enhanced metadata tracking
|
|
2196
|
+
evaluation_files_metadata = []
|
|
2197
|
+
for eval_file in evaluation_files:
|
|
2198
|
+
# Find the actual evaluation file (could be in subdirectory)
|
|
2199
|
+
eval_paths = list(output_base_path.rglob(eval_file))
|
|
2200
|
+
if eval_paths:
|
|
2201
|
+
eval_path = eval_paths[0]
|
|
2202
|
+
relative_path = str(eval_path.relative_to(output_base_path))
|
|
2203
|
+
evaluation_files_metadata.append(
|
|
2204
|
+
{
|
|
2205
|
+
"file_path": relative_path,
|
|
2206
|
+
"added_at": datetime.now().isoformat(),
|
|
2207
|
+
"last_modified": datetime.fromtimestamp(
|
|
2208
|
+
eval_path.stat().st_mtime
|
|
2209
|
+
).isoformat(),
|
|
2210
|
+
"fingerprint": self.get_evaluation_fingerprint(str(eval_path)),
|
|
2211
|
+
}
|
|
2212
|
+
)
|
|
2213
|
+
|
|
2214
|
+
consolidated_report = {
|
|
2215
|
+
"metadata": {
|
|
2216
|
+
"report_type": "consolidated_evaluations",
|
|
2217
|
+
"created_at": datetime.now().isoformat(),
|
|
2218
|
+
"last_updated": datetime.now().isoformat(),
|
|
2219
|
+
"timestamp": datetime.now().strftime(
|
|
2220
|
+
"%Y-%m-%d %H:%M:%S"
|
|
2221
|
+
), # Keep for backwards compatibility
|
|
2222
|
+
"experiment_directory": base_experiment_dir,
|
|
2223
|
+
"output_directory": output_dir,
|
|
2224
|
+
"total_evaluations": len(all_evaluations),
|
|
2225
|
+
"total_usage": total_usage,
|
|
2226
|
+
"total_cost": total_cost,
|
|
2227
|
+
"evaluation_files": evaluation_files_metadata,
|
|
2228
|
+
},
|
|
2229
|
+
"evaluations": all_evaluations,
|
|
2230
|
+
}
|
|
2231
|
+
|
|
2232
|
+
# Save consolidated report
|
|
2233
|
+
consolidated_filename = "consolidated_evaluations_report.json"
|
|
2234
|
+
consolidated_path = output_base_path / consolidated_filename
|
|
2235
|
+
|
|
2236
|
+
with open(consolidated_path, "w", encoding="utf-8") as f:
|
|
2237
|
+
json.dump(consolidated_report, f, indent=2)
|
|
2238
|
+
|
|
2239
|
+
return str(consolidated_path)
|
|
2240
|
+
|
|
2241
|
+
    def get_evaluation_fingerprint(self, eval_file: str) -> str:
        """Generate fingerprint for evaluation file to detect changes.

        Args:
            eval_file: Path to the evaluation file

        Returns:
            Fingerprint string combining modification time and file size
        """
        eval_path = Path(eval_file)
        if not eval_path.exists():
            return ""

        # Use file modification time + file size as fingerprint
        stat = eval_path.stat()
        return f"{stat.st_mtime}_{stat.st_size}"

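    # Example (illustrative, not from the original source): a file last modified
    # at Unix time 1717171717.25 with a size of 48213 bytes yields the fingerprint
    # "1717171717.25_48213"; rewriting the file changes mtime and/or size, so the
    # string differs and the evaluation is treated as changed.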
    def find_changed_evaluations(self, output_dir: str) -> List[str]:
        """Find evaluations that have changed since last consolidation.

        Args:
            output_dir: Output directory containing evaluations

        Returns:
            List of paths to changed evaluation files
        """
        output_base_path = Path(output_dir)
        consolidated_path = output_base_path / "consolidated_evaluations_report.json"

        if not consolidated_path.exists():
            return [str(f) for f in output_base_path.rglob("*.eval.json")]

        # Load existing fingerprints
        try:
            with open(consolidated_path, "r", encoding="utf-8") as f:
                existing_report = json.load(f)

            existing_fingerprints = {}
            if "evaluation_files" in existing_report.get("metadata", {}):
                for item in existing_report["metadata"]["evaluation_files"]:
                    existing_fingerprints[item["file_path"]] = item.get(
                        "fingerprint", ""
                    )
        except Exception as e:
            self.log.warning(f"Error reading existing consolidated report: {e}")
            return [str(f) for f in output_base_path.rglob("*.eval.json")]

        changed_files = []
        for eval_file in output_base_path.rglob("*.eval.json"):
            relative_path = str(eval_file.relative_to(output_base_path))
            current_fingerprint = self.get_evaluation_fingerprint(str(eval_file))

            if (
                relative_path not in existing_fingerprints
                or existing_fingerprints[relative_path] != current_fingerprint
            ):
                changed_files.append(str(eval_file))

        return changed_files

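    # Usage sketch (illustrative, not from the original source): callers can do
    #   changed = self.find_changed_evaluations("eval_output")
    # and pass the result as new_eval_files to the incremental update below,
    # or use regenerate=True to rebuild the consolidated report from scratch.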
def update_consolidated_evaluation_report(
|
|
2302
|
+
self,
|
|
2303
|
+
output_dir: str,
|
|
2304
|
+
new_eval_files: List[str] = None,
|
|
2305
|
+
regenerate: bool = False,
|
|
2306
|
+
base_experiment_dir: str = None,
|
|
2307
|
+
) -> str:
|
|
2308
|
+
"""Update consolidated report with new evaluations or regenerate completely.
|
|
2309
|
+
|
|
2310
|
+
Args:
|
|
2311
|
+
output_dir: Output directory containing evaluations
|
|
2312
|
+
new_eval_files: List of new evaluation files to add (if None, auto-detect)
|
|
2313
|
+
regenerate: Force full regeneration of the report
|
|
2314
|
+
base_experiment_dir: Base experiment directory path
|
|
2315
|
+
|
|
2316
|
+
Returns:
|
|
2317
|
+
Path to the consolidated report file
|
|
2318
|
+
"""
|
|
2319
|
+
from datetime import datetime
|
|
2320
|
+
|
|
2321
|
+
output_base_path = Path(output_dir)
|
|
2322
|
+
consolidated_filename = "consolidated_evaluations_report.json"
|
|
2323
|
+
consolidated_path = output_base_path / consolidated_filename
|
|
2324
|
+
|
|
2325
|
+
if regenerate or not consolidated_path.exists():
|
|
2326
|
+
# Full regeneration (use existing logic)
|
|
2327
|
+
evaluation_files = [f.name for f in output_base_path.rglob("*.eval.json")]
|
|
2328
|
+
return self.create_consolidated_evaluation_report(
|
|
2329
|
+
evaluation_files, output_dir, base_experiment_dir or output_dir
|
|
2330
|
+
)
|
|
2331
|
+
|
|
2332
|
+
# Load existing consolidated report
|
|
2333
|
+
try:
|
|
2334
|
+
with open(consolidated_path, "r", encoding="utf-8") as f:
|
|
2335
|
+
existing_report = json.load(f)
|
|
2336
|
+
except Exception as e:
|
|
2337
|
+
self.log.error(f"Error loading existing consolidated report: {e}")
|
|
2338
|
+
# Fallback to full regeneration
|
|
2339
|
+
evaluation_files = [f.name for f in output_base_path.rglob("*.eval.json")]
|
|
2340
|
+
return self.create_consolidated_evaluation_report(
|
|
2341
|
+
evaluation_files, output_dir, base_experiment_dir or output_dir
|
|
2342
|
+
)
|
|
2343
|
+
|
|
2344
|
+
# Initialize metadata structure if missing
|
|
2345
|
+
if "evaluation_files" not in existing_report.get("metadata", {}):
|
|
2346
|
+
existing_report["metadata"]["evaluation_files"] = []
|
|
2347
|
+
|
|
2348
|
+
# Find new evaluation files
|
|
2349
|
+
if not new_eval_files:
|
|
2350
|
+
all_eval_files = list(output_base_path.rglob("*.eval.json"))
|
|
2351
|
+
existing_files = {
|
|
2352
|
+
item["file_path"]
|
|
2353
|
+
for item in existing_report["metadata"]["evaluation_files"]
|
|
2354
|
+
}
|
|
2355
|
+
new_eval_files = [
|
|
2356
|
+
str(f)
|
|
2357
|
+
for f in all_eval_files
|
|
2358
|
+
if str(f.relative_to(output_base_path)) not in existing_files
|
|
2359
|
+
]
|
|
2360
|
+
|
|
2361
|
+
if not new_eval_files:
|
|
2362
|
+
self.log.info(
|
|
2363
|
+
"No new evaluations found - consolidated report is up to date"
|
|
2364
|
+
)
|
|
2365
|
+
return str(consolidated_path)
|
|
2366
|
+
|
|
2367
|
+
self.log.info(
|
|
2368
|
+
f"Adding {len(new_eval_files)} new evaluations to consolidated report"
|
|
2369
|
+
)
|
|
2370
|
+
|
|
2371
|
+
# Process new files and update report
|
|
2372
|
+
new_evaluations = []
|
|
2373
|
+
updated_usage = existing_report["metadata"].get(
|
|
2374
|
+
"total_usage", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
|
|
2375
|
+
)
|
|
2376
|
+
updated_cost = existing_report["metadata"].get(
|
|
2377
|
+
"total_cost", {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
|
|
2378
|
+
)
|
|
2379
|
+
|
|
2380
|
+
for eval_file in new_eval_files:
|
|
2381
|
+
eval_path = Path(eval_file)
|
|
2382
|
+
relative_path = str(eval_path.relative_to(output_base_path))
|
|
2383
|
+
|
|
2384
|
+
# Add to metadata tracking
|
|
2385
|
+
existing_report["metadata"]["evaluation_files"].append(
|
|
2386
|
+
{
|
|
2387
|
+
"file_path": relative_path,
|
|
2388
|
+
"added_at": datetime.now().isoformat(),
|
|
2389
|
+
"last_modified": datetime.fromtimestamp(
|
|
2390
|
+
eval_path.stat().st_mtime
|
|
2391
|
+
).isoformat(),
|
|
2392
|
+
"fingerprint": self.get_evaluation_fingerprint(str(eval_path)),
|
|
2393
|
+
}
|
|
2394
|
+
)
|
|
2395
|
+
|
|
2396
|
+
# Load and integrate evaluation data
|
|
2397
|
+
try:
|
|
2398
|
+
with open(eval_path, "r", encoding="utf-8") as f:
|
|
2399
|
+
eval_data = json.load(f)
|
|
2400
|
+
|
|
2401
|
+
# Create evaluation summary (similar to existing logic)
|
|
2402
|
+
eval_info = {
|
|
2403
|
+
"experiment_name": eval_path.stem.replace(".eval", ""),
|
|
2404
|
+
"file_path": relative_path,
|
|
2405
|
+
"timestamp": eval_data.get("metadata", {}).get("timestamp", ""),
|
|
2406
|
+
"model": eval_data.get("metadata", {}).get("model", "unknown"),
|
|
2407
|
+
}
|
|
2408
|
+
|
|
2409
|
+
# Add overall analysis if available
|
|
2410
|
+
if "overall_analysis" in eval_data:
|
|
2411
|
+
eval_info["overall_analysis"] = (
|
|
2412
|
+
eval_data["overall_analysis"][:200] + "..."
|
|
2413
|
+
if len(eval_data["overall_analysis"]) > 200
|
|
2414
|
+
else eval_data["overall_analysis"]
|
|
2415
|
+
)
|
|
2416
|
+
|
|
2417
|
+
# Add timing info if available
|
|
2418
|
+
if eval_data.get("timing"):
|
|
2419
|
+
eval_info["avg_processing_time_seconds"] = eval_data["timing"].get(
|
|
2420
|
+
"average_per_summary_seconds",
|
|
2421
|
+
eval_data["timing"].get("total_processing_time_seconds", 0),
|
|
2422
|
+
)
|
|
2423
|
+
|
|
2424
|
+
new_evaluations.append(eval_info)
|
|
2425
|
+
|
|
2426
|
+
# Accumulate usage and cost
|
|
2427
|
+
usage = eval_data.get("total_usage", {})
|
|
2428
|
+
for key in updated_usage:
|
|
2429
|
+
updated_usage[key] += usage.get(key, 0)
|
|
2430
|
+
|
|
2431
|
+
cost = eval_data.get("total_cost", {})
|
|
2432
|
+
for key in updated_cost:
|
|
2433
|
+
updated_cost[key] += cost.get(key, 0.0)
|
|
2434
|
+
|
|
2435
|
+
except Exception as e:
|
|
2436
|
+
self.log.error(f"Error processing new evaluation file {eval_path}: {e}")
|
|
2437
|
+
continue
|
|
2438
|
+
|
|
2439
|
+
# Update the consolidated report
|
|
2440
|
+
existing_report["evaluations"].extend(new_evaluations)
|
|
2441
|
+
existing_report["metadata"]["last_updated"] = datetime.now().isoformat()
|
|
2442
|
+
existing_report["metadata"]["total_evaluations"] = len(
|
|
2443
|
+
existing_report["evaluations"]
|
|
2444
|
+
)
|
|
2445
|
+
existing_report["metadata"]["total_usage"] = updated_usage
|
|
2446
|
+
existing_report["metadata"]["total_cost"] = updated_cost
|
|
2447
|
+
|
|
2448
|
+
# Save updated report
|
|
2449
|
+
with open(consolidated_path, "w", encoding="utf-8") as f:
|
|
2450
|
+
json.dump(existing_report, f, indent=2)
|
|
2451
|
+
|
|
2452
|
+
self.log.info(
|
|
2453
|
+
f"Updated consolidated report with {len(new_evaluations)} new evaluations"
|
|
2454
|
+
)
|
|
2455
|
+
return str(consolidated_path)
|
|
2456
|
+
|
|
2457
|
+
    def _detect_evaluation_type(self, models_data: List[Dict]) -> str:
        """Detect whether this is a RAG or summarization evaluation based on the data structure."""
        if not models_data:
            return "unknown"

        # Check the first model's per_question data structure
        first_model = models_data[0]
        per_question = first_model.get("per_question", [])

        if not per_question:
            return "unknown"

        # Look at the first question to determine the evaluation type
        first_question = per_question[0]

        # Summarization evaluations have specific analysis fields
        analysis = first_question.get("analysis", {})
        # Check for new aspect names
        if any(
            key in analysis
            for key in [
                "executive_summary_quality",
                "detail_completeness",
                "action_items_structure",
                "key_decisions_clarity",
                "participant_information",
                "topic_organization",
            ]
        ):
            return "summarization"

        # Also check for old aspect names (for backwards compatibility)
        if any(
            key in analysis
            for key in [
                "executive_summary_accuracy",
                "completeness",
                "action_items_accuracy",
                "key_decisions_accuracy",
                "participant_identification",
                "topic_coverage",
            ]
        ):
            return "summarization"

        # RAG/QA evaluations have similarity scores and a different structure;
        # both Q&A and RAG evaluations are treated the same way
        if "similarity_score" in first_question or "passed_threshold" in first_question:
            return "rag"

        # Additional check for Q&A evaluations that have qa_inputs
        if "qa_inputs" in first_question:
            return "rag"

        # If we can't detect the evaluation type, log the issue for debugging
        self.log.warning(
            f"Could not detect evaluation type from data structure: {list(first_question.keys())}"
        )
        return "unknown"

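    # Illustrative per_question shapes (example data, not from the original
    # source): a summarization entry looks like
    #   {"analysis": {"detail_completeness": {"rating": "good"}, ...}}
    # while a RAG/QA entry looks like
    #   {"similarity_score": 0.82, "passed_threshold": True, ...}
    # which is what the key checks above distinguish.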
def _generate_summarization_report(self, models_data: List[Dict]) -> str:
|
|
2518
|
+
"""Generate markdown content specifically for summarization evaluation reports."""
|
|
2519
|
+
|
|
2520
|
+
# Build performance ranking based on overall quality ratings
|
|
2521
|
+
ranking = []
|
|
2522
|
+
for model in models_data:
|
|
2523
|
+
# Count quality ratings from per_question data
|
|
2524
|
+
excellent_count = 0
|
|
2525
|
+
good_count = 0
|
|
2526
|
+
fair_count = 0
|
|
2527
|
+
poor_count = 0
|
|
2528
|
+
|
|
2529
|
+
for question in model.get("per_question", []):
|
|
2530
|
+
analysis = question.get("analysis", {})
|
|
2531
|
+
overall_quality = analysis.get("overall_quality", "")
|
|
2532
|
+
if overall_quality == "excellent":
|
|
2533
|
+
excellent_count += 1
|
|
2534
|
+
elif overall_quality == "good":
|
|
2535
|
+
good_count += 1
|
|
2536
|
+
elif overall_quality == "fair":
|
|
2537
|
+
fair_count += 1
|
|
2538
|
+
elif overall_quality == "poor":
|
|
2539
|
+
poor_count += 1
|
|
2540
|
+
# Note: "error" and other invalid ratings are excluded from ranking
|
|
2541
|
+
|
|
2542
|
+
total_summaries = excellent_count + good_count + fair_count + poor_count
|
|
2543
|
+
if total_summaries > 0:
|
|
2544
|
+
quality_score_raw = (
|
|
2545
|
+
excellent_count * 4
|
|
2546
|
+
+ good_count * 3
|
|
2547
|
+
+ fair_count * 2
|
|
2548
|
+
+ poor_count * 1
|
|
2549
|
+
) / total_summaries
|
|
2550
|
+
quality_score_percentage = ((quality_score_raw - 1) / 3) * 100
|
|
2551
|
+
ranking.append(f"**{model['name']}** ({quality_score_percentage:.1f}%)")
|
|
2552
|
+
|
|
2553
|
+
ranking_text = " > ".join(ranking)
|
|
2554
|
+
|
|
2555
|
+
# Determine production readiness for summarization
|
|
2556
|
+
production_ready = any(
|
|
2557
|
+
"excellent" in str(m.get("per_question", [])) for m in models_data
|
|
2558
|
+
)
|
|
2559
|
+
production_note = (
|
|
2560
|
+
"Some models show excellent summarization capabilities."
|
|
2561
|
+
if production_ready
|
|
2562
|
+
else "All models need improvement for production summarization."
|
|
2563
|
+
)
|
|
2564
|
+
|
|
2565
|
+
# Build metrics table for summarization
|
|
2566
|
+
table_rows = []
|
|
2567
|
+
for model in models_data:
|
|
2568
|
+
# Count quality ratings
|
|
2569
|
+
excellent_count = 0
|
|
2570
|
+
good_count = 0
|
|
2571
|
+
fair_count = 0
|
|
2572
|
+
poor_count = 0
|
|
2573
|
+
|
|
2574
|
+
for question in model.get("per_question", []):
|
|
2575
|
+
analysis = question.get("analysis", {})
|
|
2576
|
+
overall_quality = analysis.get("overall_quality", "")
|
|
2577
|
+
if overall_quality == "excellent":
|
|
2578
|
+
excellent_count += 1
|
|
2579
|
+
elif overall_quality == "good":
|
|
2580
|
+
good_count += 1
|
|
2581
|
+
elif overall_quality == "fair":
|
|
2582
|
+
fair_count += 1
|
|
2583
|
+
elif overall_quality == "poor":
|
|
2584
|
+
poor_count += 1
|
|
2585
|
+
# Note: "error" and other invalid ratings are excluded from metrics
|
|
2586
|
+
|
|
2587
|
+
total_summaries = excellent_count + good_count + fair_count + poor_count
|
|
2588
|
+
excellent_rate = (
|
|
2589
|
+
(excellent_count / total_summaries * 100) if total_summaries > 0 else 0
|
|
2590
|
+
)
|
|
2591
|
+
|
|
2592
|
+
rating_map = {
|
|
2593
|
+
"excellent": "Excellent",
|
|
2594
|
+
"good": "Good",
|
|
2595
|
+
"fair": "Fair",
|
|
2596
|
+
"poor": "Poor",
|
|
2597
|
+
"unknown": "Unknown",
|
|
2598
|
+
}
|
|
2599
|
+
rating = rating_map.get(model["rating"], model["rating"].title())
|
|
2600
|
+
|
|
2601
|
+
table_rows.append(
|
|
2602
|
+
f"| **{model['name']}** | {excellent_rate:.0f}% | {excellent_count}/{total_summaries} | {good_count} | {fair_count} | {poor_count} | {rating} |"
|
|
2603
|
+
)
|
|
2604
|
+
|
|
2605
|
+
# Identify common summarization issues
|
|
2606
|
+
failure_patterns = []
|
|
2607
|
+
|
|
2608
|
+
# Analyze common weaknesses across models
|
|
2609
|
+
all_weaknesses = []
|
|
2610
|
+
for model in models_data:
|
|
2611
|
+
all_weaknesses.extend(model.get("weaknesses", []))
|
|
2612
|
+
|
|
2613
|
+
if "Manual review recommended" in str(all_weaknesses):
|
|
2614
|
+
failure_patterns.append("**Quality Consistency Issues** (Multiple Models)")
|
|
2615
|
+
failure_patterns.append("- Manual review recommended for complex summaries")
|
|
2616
|
+
failure_patterns.append(
|
|
2617
|
+
"- Inconsistent quality across different summary types"
|
|
2618
|
+
)
|
|
2619
|
+
failure_patterns.append("- Need for human validation of critical details")
|
|
2620
|
+
|
|
2621
|
+
# Check for specific summarization challenges
|
|
2622
|
+
poor_performers = [
|
|
2623
|
+
m for m in models_data if "poor" in str(m.get("per_question", []))
|
|
2624
|
+
]
|
|
2625
|
+
if poor_performers:
|
|
2626
|
+
failure_patterns.append("")
|
|
2627
|
+
failure_patterns.append(
|
|
2628
|
+
"**Content Structure Issues** "
|
|
2629
|
+
+ f"({', '.join([m['name'] for m in poor_performers])})"
|
|
2630
|
+
)
|
|
2631
|
+
failure_patterns.append("- Poor action item organization and clarity")
|
|
2632
|
+
failure_patterns.append("- Missing key decisions or incomplete details")
|
|
2633
|
+
failure_patterns.append("- Inadequate participant information capture")
|
|
2634
|
+
|
|
2635
|
+
# Model-specific analysis for summarization
|
|
2636
|
+
model_analyses = []
|
|
2637
|
+
|
|
2638
|
+
if models_data:
|
|
2639
|
+
best = models_data[0]
|
|
2640
|
+
best_strengths = (
|
|
2641
|
+
best["strengths"][:2]
|
|
2642
|
+
if best["strengths"]
|
|
2643
|
+
else ["Maintains summary structure", "Comprehensive analysis performed"]
|
|
2644
|
+
)
|
|
2645
|
+
best_weakness = (
|
|
2646
|
+
best["weaknesses"][0]
|
|
2647
|
+
if best["weaknesses"]
|
|
2648
|
+
else "Needs validation for complex scenarios"
|
|
2649
|
+
)
|
|
2650
|
+
|
|
2651
|
+
model_analyses.append(f"### **{best['name']}** - Best Performer")
|
|
2652
|
+
model_analyses.append(f"- **Strengths**: {', '.join(best_strengths)}")
|
|
2653
|
+
model_analyses.append(f"- **Weakness**: {best_weakness}")
|
|
2654
|
+
model_analyses.append(
|
|
2655
|
+
f"- **Actionable**: Implement quality validation workflows, standardize summary templates"
|
|
2656
|
+
)
|
|
2657
|
+
|
|
2658
|
+
if len(models_data) > 1:
|
|
2659
|
+
worst = models_data[-1]
|
|
2660
|
+
worst_issues = (
|
|
2661
|
+
worst["weaknesses"][:2]
|
|
2662
|
+
if worst["weaknesses"]
|
|
2663
|
+
else ["Inconsistent summary quality"]
|
|
2664
|
+
)
|
|
2665
|
+
|
|
2666
|
+
model_analyses.append("")
|
|
2667
|
+
model_analyses.append(f"### **{worst['name']}** - Needs Improvement")
|
|
2668
|
+
model_analyses.append(f"- **Issues**: {', '.join(worst_issues)}")
|
|
2669
|
+
model_analyses.append(
|
|
2670
|
+
f"- **Actionable**: Enhance prompt engineering, add structured output validation"
|
|
2671
|
+
)
|
|
2672
|
+
|
|
2673
|
+
# Cost efficiency analysis
|
|
2674
|
+
cost_analyses = []
|
|
2675
|
+
if all(m["total_cost"] > 0 for m in models_data):
|
|
2676
|
+
for model in models_data:
|
|
2677
|
+
roi_desc = (
|
|
2678
|
+
"best value"
|
|
2679
|
+
if model == models_data[0]
|
|
2680
|
+
else (
|
|
2681
|
+
"poor value"
|
|
2682
|
+
if "poor" in str(model.get("per_question", []))
|
|
2683
|
+
else "moderate value"
|
|
2684
|
+
)
|
|
2685
|
+
)
|
|
2686
|
+
cost_analyses.append(
|
|
2687
|
+
f"- **{model['name']}**: ${model['total_cost']:.3f} total cost, {roi_desc} for summarization quality"
|
|
2688
|
+
)
|
|
2689
|
+
|
|
2690
|
+
# Technical actions for summarization
|
|
2691
|
+
tech_actions = [
|
|
2692
|
+
"1. **Summary Template Standardization**: Create consistent output formats for different meeting types",
|
|
2693
|
+
"2. **Quality Validation Pipeline**: Implement automated checks for completeness and accuracy",
|
|
2694
|
+
"3. **Prompt Engineering Optimization**: Improve prompts for better action item extraction and decision clarity",
|
|
2695
|
+
]
|
|
2696
|
+
|
|
2697
|
+
tech_actions.extend(
|
|
2698
|
+
[
|
|
2699
|
+
"4. **Human-in-the-Loop Validation**: Add review workflows for critical summaries",
|
|
2700
|
+
"5. **Meeting Type Classification**: Tailor summarization approach based on meeting context",
|
|
2701
|
+
"6. **Output Formatting Enhancement**: Improve structure and readability of generated summaries",
|
|
2702
|
+
]
|
|
2703
|
+
)
|
|
2704
|
+
|
|
2705
|
+
# Investment decision for summarization
|
|
2706
|
+
if models_data:
|
|
2707
|
+
best_model = models_data[0]
|
|
2708
|
+
if "excellent" in str(best_model.get("per_question", [])):
|
|
2709
|
+
investment_decision = f"**{best_model['name']}** shows production potential with proper validation workflows."
|
|
2710
|
+
timeline = "2-4 weeks for validation pipeline implementation."
|
|
2711
|
+
else:
|
|
2712
|
+
investment_decision = (
|
|
2713
|
+
"All models require improvement before reliable production use."
|
|
2714
|
+
)
|
|
2715
|
+
timeline = "4-8 weeks for prompt optimization and quality improvements."
|
|
2716
|
+
else:
|
|
2717
|
+
investment_decision = (
|
|
2718
|
+
"Unable to recommend specific model - insufficient evaluation data."
|
|
2719
|
+
)
|
|
2720
|
+
timeline = "Timeline uncertain due to limited baseline data."
|
|
2721
|
+
|
|
2722
|
+
# Build the complete summarization report
|
|
2723
|
+
report = f"""# Meeting Summarization Performance Analysis: {len(models_data)} LLM Comparison
|
|
2724
|
+
|
|
2725
|
+
## Executive Summary
|
|
2726
|
+
Performance ranking: {ranking_text}
|
|
2727
|
+
|
|
2728
|
+
{production_note}
|
|
2729
|
+
|
|
2730
|
+
## Key Performance Metrics
|
|
2731
|
+
|
|
2732
|
+
| Model | Excellent Rate | Excellent/Total | Good | Fair | Poor | Rating |
|
|
2733
|
+
|-------|----------------|-----------------|------|------|------|---------|
|
|
2734
|
+
{chr(10).join(table_rows)}
|
|
2735
|
+
|
|
2736
|
+
## Common Challenges
|
|
2737
|
+
|
|
2738
|
+
{chr(10).join(failure_patterns)}
|
|
2739
|
+
|
|
2740
|
+
## Model-Specific Analysis
|
|
2741
|
+
|
|
2742
|
+
{chr(10).join(model_analyses)}
|
|
2743
|
+
|
|
2744
|
+
## Cost Efficiency Analysis
|
|
2745
|
+
{chr(10).join(cost_analyses) if cost_analyses else "Cost data not available for analysis"}
|
|
2746
|
+
|
|
2747
|
+
## Immediate Improvement Actions
|
|
2748
|
+
|
|
2749
|
+
### High Priority (Quality Enhancement)
|
|
2750
|
+
{chr(10).join(tech_actions[:3])}
|
|
2751
|
+
|
|
2752
|
+
### Medium Priority (Process Optimization)
|
|
2753
|
+
{chr(10).join(tech_actions[3:])}
|
|
2754
|
+
|
|
2755
|
+
## Bottom Line
|
|
2756
|
+
**Investment decision**: {investment_decision} **Timeline**: {timeline}"""
|
|
2757
|
+
|
|
2758
|
+
return report
|
|
2759
|
+
|
|
2760
|
+
def generate_summary_report(
|
|
2761
|
+
self, eval_dir: str, output_path: str = "LLM_Evaluation_Report.md"
|
|
2762
|
+
) -> Dict:
|
|
2763
|
+
"""
|
|
2764
|
+
Generate a comprehensive summary report from multiple evaluation files.
|
|
2765
|
+
|
|
2766
|
+
Args:
|
|
2767
|
+
eval_dir: Directory containing .eval.json files
|
|
2768
|
+
output_path: Path to save the markdown report
|
|
2769
|
+
|
|
2770
|
+
Returns:
|
|
2771
|
+
Dict containing summary data
|
|
2772
|
+
"""
|
|
2773
|
+
try:
|
|
2774
|
+
eval_path = Path(eval_dir)
|
|
2775
|
+
if not eval_path.exists():
|
|
2776
|
+
raise FileNotFoundError(f"Evaluation directory not found: {eval_dir}")
|
|
2777
|
+
|
|
2778
|
+
# Find all .eval.json files (recursively)
|
|
2779
|
+
eval_files = list(eval_path.rglob("*.eval.json"))
|
|
2780
|
+
if not eval_files:
|
|
2781
|
+
raise FileNotFoundError(f"No .eval.json files found in {eval_dir}")
|
|
2782
|
+
|
|
2783
|
+
self.log.info(f"Found {len(eval_files)} evaluation files")
|
|
2784
|
+
|
|
2785
|
+
# Parse evaluation data
|
|
2786
|
+
models_data = []
|
|
2787
|
+
for eval_file in eval_files:
|
|
2788
|
+
try:
|
|
2789
|
+
with open(eval_file, "r", encoding="utf-8") as f:
|
|
2790
|
+
eval_data = json.load(f)
|
|
2791
|
+
|
|
2792
|
+
# Extract model name from filename or metadata
|
|
2793
|
+
filename = eval_file.stem
|
|
2794
|
+
model_name = filename.replace(".eval", "")
|
|
2795
|
+
|
|
2796
|
+
# Extract key metrics
|
|
2797
|
+
overall_rating = eval_data.get("overall_rating", {})
|
|
2798
|
+
metrics = overall_rating.get("metrics", {})
|
|
2799
|
+
total_cost = eval_data.get("total_cost", {})
|
|
2800
|
+
|
|
2801
|
+
# Calculate quality score for summarization evaluations
|
|
2802
|
+
quality_score = 0.0
|
|
2803
|
+
overall_rating_metrics = overall_rating.get("metrics", {})
|
|
2804
|
+
if overall_rating_metrics:
|
|
2805
|
+
# Use existing quality_score if available (could be None for error cases)
|
|
2806
|
+
quality_score = overall_rating_metrics.get("quality_score", 0.0)
|
|
2807
|
+
if quality_score is None:
|
|
2808
|
+
quality_score = 0.0 # Treat None as 0 for ranking purposes
|
|
2809
|
+
else:
|
|
2810
|
+
# Calculate from per_question data if metrics not available
|
|
2811
|
+
excellent_count = 0
|
|
2812
|
+
good_count = 0
|
|
2813
|
+
fair_count = 0
|
|
2814
|
+
poor_count = 0
|
|
2815
|
+
|
|
2816
|
+
for question in eval_data.get("per_question", []):
|
|
2817
|
+
analysis = question.get("analysis", {})
|
|
2818
|
+
overall_quality = analysis.get("overall_quality", "")
|
|
2819
|
+
if overall_quality == "excellent":
|
|
2820
|
+
excellent_count += 1
|
|
2821
|
+
elif overall_quality == "good":
|
|
2822
|
+
good_count += 1
|
|
2823
|
+
elif overall_quality == "fair":
|
|
2824
|
+
fair_count += 1
|
|
2825
|
+
elif overall_quality == "poor":
|
|
2826
|
+
poor_count += 1
|
|
2827
|
+
# Note: "error" and other invalid ratings are excluded from quality score calculation
|
|
2828
|
+
|
|
2829
|
+
total_summaries = (
|
|
2830
|
+
excellent_count + good_count + fair_count + poor_count
|
|
2831
|
+
)
|
|
2832
|
+
if total_summaries > 0:
|
|
2833
|
+
quality_score_raw = (
|
|
2834
|
+
excellent_count * 4
|
|
2835
|
+
+ good_count * 3
|
|
2836
|
+
+ fair_count * 2
|
|
2837
|
+
+ poor_count * 1
|
|
2838
|
+
) / total_summaries
|
|
2839
|
+
quality_score = ((quality_score_raw - 1) / 3) * 100

                    model_info = {
                        "name": model_name,
                        "filename": eval_file.name,
                        "pass_rate": metrics.get("pass_rate", 0),
                        "accuracy": metrics.get("accuracy_percentage", 0),
                        "mean_similarity": metrics.get("mean_similarity", 0),
                        "std_similarity": metrics.get("std_similarity", 0),
                        "min_similarity": metrics.get("min_similarity", 0),
                        "max_similarity": metrics.get("max_similarity", 0),
                        "num_questions": metrics.get("num_questions", 0),
                        "num_passed": metrics.get("num_passed", 0),
                        "num_failed": metrics.get("num_failed", 0),
                        "threshold": metrics.get("similarity_threshold", 0.7),
                        "rating": overall_rating.get("rating", "unknown"),
                        "quality_score": quality_score,  # Add quality score to model info
                        "total_cost": total_cost.get("total_cost", 0),
                        "analysis": eval_data.get("overall_analysis", ""),
                        "strengths": eval_data.get("strengths", []),
                        "weaknesses": eval_data.get("weaknesses", []),
                        "recommendations": eval_data.get("recommendations", []),
                        "per_question": eval_data.get("per_question", []),
                    }
                    models_data.append(model_info)

                except Exception as e:
                    self.log.warning(f"Error processing {eval_file}: {e}")
                    continue

            if not models_data:
                raise ValueError("No valid evaluation data found")

            # Detect evaluation type first
            evaluation_type = self._detect_evaluation_type(models_data)

            # Sort by appropriate metric based on evaluation type
            if evaluation_type == "summarization":
                # Sort by quality score (descending) for summarization
                models_data.sort(key=lambda x: x["quality_score"], reverse=True)
            else:
                # Sort by pass rate (descending) for RAG and unknown types
                models_data.sort(key=lambda x: x["pass_rate"], reverse=True)

            if evaluation_type == "summarization":
                report_content = self._generate_summarization_report(models_data)
            elif evaluation_type == "rag":
                report_content = self._generate_markdown_report(models_data)
            else:
                # Handle unknown evaluation type
                self.log.error(
                    f"Unknown evaluation type detected: {evaluation_type}. Cannot generate report."
                )
                raise ValueError(
                    f"Unsupported evaluation type: {evaluation_type}. Expected 'summarization' or 'rag'."
                )

            # Save report
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(report_content)

            self.log.info(f"Summary report saved to: {output_path}")

            return {
                "models_analyzed": len(models_data),
                "report_path": output_path,
                "summary_data": models_data,
                "evaluation_type": evaluation_type,
            }

        except Exception as e:
            self.log.error(f"Error generating summary report: {e}")
            raise
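
    # A minimal sketch of the .eval.json shape the summary parser above reads.
    # The keys mirror the eval_data.get(...) calls in that method; the file name
    # and all values are hypothetical:
    #
    #   output/eval/llama-3.1-8b.eval.json
    #   {
    #       "overall_rating": {"rating": "good", "metrics": {"pass_rate": 0.6}},
    #       "total_cost": {"total_cost": 0.12},
    #       "overall_analysis": "...",
    #       "strengths": [], "weaknesses": [], "recommendations": [],
    #       "per_question": [{"analysis": {"overall_quality": "good"}}]
    #   }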

    def _generate_markdown_report(self, models_data: List[Dict]) -> str:
        """Generate markdown content for the summary report."""

        # Create executive summary
        best_model = models_data[0] if models_data else None
        worst_model = models_data[-1] if models_data else None

        # Build performance ranking
        ranking = []
        for i, model in enumerate(models_data):
            ranking.append(f"**{model['name']}** ({model['pass_rate']:.0%})")
        ranking_text = " > ".join(ranking)

        # Determine if any model meets production standards
        production_ready = any(
            m["pass_rate"] >= 0.7 and m["mean_similarity"] >= 0.7 for m in models_data
        )
        production_note = (
            "None achieve production standards (70% pass rate + 0.7 similarity)."
            if not production_ready
            else "Some models approach production readiness."
        )

        # Build metrics table
        table_rows = []
        for model in models_data:
            rating_map = {
                "excellent": "Excellent",
                "good": "Good",
                "fair": "Fair",
                "poor": "Poor",
                "unknown": "Unknown",
            }
            rating = rating_map.get(model["rating"], model["rating"].title())
            table_rows.append(
                f"| **{model['name']}** | {model['pass_rate']:.0%} | {model['mean_similarity']:.3f} | {model['std_similarity']:.3f} | {rating} |"
            )

        # Identify failure patterns
        failure_patterns = []

        # Knowledge retrieval gaps (check if models consistently fail on specific question types)
        knowledge_issues = [m for m in models_data if m["mean_similarity"] < 0.4]
        if len(knowledge_issues) >= 2:
            failure_patterns.append("**Knowledge Retrieval Gaps** (All Models)")
            failure_patterns.append("- Unable to access specific document sections")
            failure_patterns.append("- Missing organizational information")
            failure_patterns.append(
                "- Poor semantic matching between queries and knowledge base"
            )

        # Factual accuracy issues
        accuracy_issues = [m for m in models_data if m["pass_rate"] < 0.5]
        if accuracy_issues:
            failure_patterns.append("")
            failure_patterns.append(
                "**Factual Accuracy Issues** "
                + f"({', '.join([m['name'] for m in accuracy_issues])})"
            )
            # Add specific issues from analysis
            for model in accuracy_issues[:3]:  # Limit to top 3 worst performers
                if (
                    "jurisdictional" in model["analysis"].lower()
                    or "confusion" in model["analysis"].lower()
                ):
                    failure_patterns.append(
                        f"- **{model['name']}**: Jurisdictional confusion (US vs Canadian regulations)"
                    )
                if (
                    "incorrect" in model["analysis"].lower()
                    or "wrong" in model["analysis"].lower()
                ):
                    failure_patterns.append(
                        f"- **{model['name']}**: Incorrect core values, wrong regulatory stages"
                    )

        # Completeness problems
        if len([m for m in models_data if m["mean_similarity"] < 0.5]) >= 2:
            failure_patterns.append("")
            failure_patterns.append("**Completeness Problems** (All Models)")
            failure_patterns.append("- Partial answers missing key regulatory details")
            failure_patterns.append(
                "- Incomplete permit types (missing multiple authorization categories)"
            )
            failure_patterns.append("- Poor handling of comprehensive queries")

        # Model-specific analysis
        model_analyses = []

        if models_data:
            best = models_data[0]
            best_strengths = (
                best["strengths"][:2]
                if best["strengths"]
                else ["Good performance when information is available"]
            )
            best_weakness = (
                best["weaknesses"][0]
                if best["weaknesses"]
                else "Inconsistent retrieval quality"
            )

            model_analyses.append(f"### **{best['name']}** - Best Performer")
            model_analyses.append(f"- **Strengths**: {', '.join(best_strengths)}")
            model_analyses.append(f"- **Weakness**: {best_weakness}")
            model_analyses.append(
                f"- **Actionable**: Improve retrieval consistency, expand knowledge base coverage"
            )

            if len(models_data) > 1:
                worst = models_data[-1]
                worst_issues = (
                    worst["weaknesses"][:2]
                    if worst["weaknesses"]
                    else ["Poor overall performance"]
                )

                model_analyses.append("")
                model_analyses.append(f"### **{worst['name']}** - Needs Improvement")
                model_analyses.append(f"- **Issues**: {', '.join(worst_issues)}")
                model_analyses.append(
                    f"- **Actionable**: Requires significant system improvements before production use"
                )

        # Cost efficiency analysis
        cost_analyses = []
        if all(m["total_cost"] > 0 for m in models_data):
            for model in models_data:
                roi_desc = (
                    "best ROI"
                    if model == models_data[0]
                    else ("poor ROI" if model["pass_rate"] < 0.3 else "moderate ROI")
                )
                cost_analyses.append(
                    f"- **{model['name']}**: ${model['total_cost']:.3f} total cost, {roi_desc} at {model['pass_rate']:.0%} accuracy"
                )

        # Technical actions
        tech_actions = [
            "1. **Document Indexing Overhaul**: Fix content gaps, improve chunking strategy",
            "2. **Embedding Model Upgrade**: Current semantic matching insufficient (mean similarity <0.4)",
            "3. **Context Validation**: Implement regulatory framework filters",
        ]

        if any("runtime" in str(m["weaknesses"]).lower() for m in models_data):
            tech_actions.append(
                "4. **Token Limit Fixes**: Address runtime errors and token constraints"
            )

        tech_actions.extend(
            [
                "5. **Response Validation**: Add factual accuracy checks before output",
                "6. **Retrieval Redundancy**: Multi-step retrieval for complex queries",
            ]
        )

        # Investment decision
        if best_model:
            if best_model["pass_rate"] >= 0.5:
                investment_decision = f"Focus resources on **{best_model['name']}** optimization rather than fixing underperforming models."
            else:
                investment_decision = "All models require significant improvement before production deployment."

            timeline = "3-6 months minimum before regulatory compliance readiness."
        else:
            investment_decision = (
                "Unable to recommend specific model - all require substantial work."
            )
            timeline = "Timeline uncertain due to poor baseline performance."

        # Build the complete report
        report = f"""# RAG System Performance Analysis: {len(models_data)} LLM Comparison

## Executive Summary
Performance ranking: {ranking_text}

{production_note}

## Key Performance Metrics

| Model | Pass Rate | Mean Similarity | Std Dev | Rating |
|-------|-----------|----------------|---------|---------|
{chr(10).join(table_rows)}

## Critical Failure Patterns

{chr(10).join(failure_patterns)}

## Model-Specific Analysis

{chr(10).join(model_analyses)}

## Cost Efficiency Analysis
{chr(10).join(cost_analyses) if cost_analyses else "Cost data not available for analysis"}

## Immediate Technical Actions

### High Priority (Critical Fixes)
{chr(10).join(tech_actions[:3])}

### Medium Priority (Performance Optimization)
{chr(10).join(tech_actions[3:])}

## Bottom Line
**Investment decision**: {investment_decision} **Timeline**: {timeline}"""

        return report
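
    # For reference, each models_data entry consumed by the report builder above
    # is assembled in the summary method earlier in this class; a representative
    # (hypothetical) entry looks like:
    #
    #   {"name": "llama-3.1-8b", "pass_rate": 0.62, "mean_similarity": 0.71,
    #    "std_similarity": 0.08, "rating": "good", "total_cost": 0.134,
    #    "analysis": "...", "strengths": [...], "weaknesses": [...]}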


if __name__ == "__main__":
    # Example usage
    evaluator = Evaluator()
    results_file = "./output/rag/introduction.results.json"

    try:
        evaluation_data = evaluator.generate_enhanced_report(
            results_file, output_dir="./output/eval"
        )

        # Print key metrics from the analysis
        overall_rating = evaluation_data.get("overall_rating", {})
        print("\nStatus:", overall_rating.get("rating", "N/A"))
        print("Explanation:", overall_rating.get("explanation", ""))

        # Print metrics if available
        metrics = overall_rating.get("metrics", {})
        if metrics:
            print("\nMetrics:")
            print(f"Number of questions: {metrics.get('num_questions', 'N/A')}")
            print(
                f"Similarity threshold: {metrics.get('similarity_threshold', 'N/A'):.3f}"
            )
            print(f"Pass rate: {metrics.get('pass_rate', 'N/A'):.3f}")
            print(f"Passed threshold: {metrics.get('num_passed', 'N/A')}")
            print(f"Failed threshold: {metrics.get('num_failed', 'N/A')}")
            print("\nSimilarity Statistics:")
            print(f"Mean: {metrics.get('mean_similarity', 'N/A'):.3f}")
            print(f"Median: {metrics.get('median_similarity', 'N/A'):.3f}")
            print(f"Min: {metrics.get('min_similarity', 'N/A'):.3f}")
            print(f"Max: {metrics.get('max_similarity', 'N/A'):.3f}")
            print(f"Standard deviation: {metrics.get('std_similarity', 'N/A'):.3f}")

        print("\nAnalysis:", evaluation_data.get("overall_analysis", "N/A"))

        # Print cost information if available
        if evaluation_data.get("total_usage") and evaluation_data.get("total_cost"):
            total_usage = evaluation_data["total_usage"]
            total_cost = evaluation_data["total_cost"]
            print("\nCost Analysis:")
            print(
                f"Token usage: {total_usage['input_tokens']:,} input + {total_usage['output_tokens']:,} output = {total_usage['total_tokens']:,} total"
            )
            print(
                f"Total cost: ${total_cost['input_cost']:.4f} input + ${total_cost['output_cost']:.4f} output = ${total_cost['total_cost']:.4f} total"
            )
            if evaluation_data.get("per_question"):
                print(
                    f"Average cost per question: ${total_cost['total_cost']/len(evaluation_data['per_question']):.4f}"
                )

        if evaluation_data.get("strengths"):
            print("\nStrengths:")
            for strength in evaluation_data["strengths"]:
                print(f"- {strength}")

    except Exception as e:
        print(f"Error during evaluation: {e}")
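
# A hedged end-to-end sketch: run generate_enhanced_report(...) once per model as
# above, then point the summary method documented earlier at the directory that
# holds the per-model .eval.json files (eval_dir), together with an output_path
# for the markdown comparison report. That generate_enhanced_report emits the
# .eval.json files consumed by the summary step is an assumption here rather than
# a documented guarantee.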