amd-gaia 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (800)
  1. amd_gaia-0.14.1.dist-info/METADATA +768 -0
  2. amd_gaia-0.14.1.dist-info/RECORD +800 -0
  3. amd_gaia-0.14.1.dist-info/WHEEL +5 -0
  4. amd_gaia-0.14.1.dist-info/entry_points.txt +5 -0
  5. amd_gaia-0.14.1.dist-info/licenses/LICENSE.md +21 -0
  6. amd_gaia-0.14.1.dist-info/top_level.txt +1 -0
  7. gaia/__init__.py +2 -0
  8. gaia/agents/__init__.py +19 -0
  9. gaia/agents/base/__init__.py +9 -0
  10. gaia/agents/base/agent.py +2072 -0
  11. gaia/agents/base/api_agent.py +120 -0
  12. gaia/agents/base/console.py +1457 -0
  13. gaia/agents/base/mcp_agent.py +86 -0
  14. gaia/agents/base/tools.py +83 -0
  15. gaia/agents/blender/agent.py +556 -0
  16. gaia/agents/blender/agent_simple.py +135 -0
  17. gaia/agents/blender/app.py +211 -0
  18. gaia/agents/blender/app_simple.py +41 -0
  19. gaia/agents/blender/core/__init__.py +16 -0
  20. gaia/agents/blender/core/materials.py +506 -0
  21. gaia/agents/blender/core/objects.py +316 -0
  22. gaia/agents/blender/core/rendering.py +225 -0
  23. gaia/agents/blender/core/scene.py +220 -0
  24. gaia/agents/blender/core/view.py +146 -0
  25. gaia/agents/chat/__init__.py +9 -0
  26. gaia/agents/chat/agent.py +975 -0
  27. gaia/agents/chat/app.py +1058 -0
  28. gaia/agents/chat/session.py +508 -0
  29. gaia/agents/chat/tools/__init__.py +15 -0
  30. gaia/agents/chat/tools/file_tools.py +96 -0
  31. gaia/agents/chat/tools/rag_tools.py +1729 -0
  32. gaia/agents/chat/tools/shell_tools.py +436 -0
  33. gaia/agents/code/__init__.py +7 -0
  34. gaia/agents/code/agent.py +547 -0
  35. gaia/agents/code/app.py +266 -0
  36. gaia/agents/code/models.py +135 -0
  37. gaia/agents/code/orchestration/__init__.py +24 -0
  38. gaia/agents/code/orchestration/checklist_executor.py +1739 -0
  39. gaia/agents/code/orchestration/checklist_generator.py +709 -0
  40. gaia/agents/code/orchestration/factories/__init__.py +9 -0
  41. gaia/agents/code/orchestration/factories/base.py +63 -0
  42. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -0
  43. gaia/agents/code/orchestration/factories/python_factory.py +106 -0
  44. gaia/agents/code/orchestration/orchestrator.py +610 -0
  45. gaia/agents/code/orchestration/project_analyzer.py +391 -0
  46. gaia/agents/code/orchestration/steps/__init__.py +67 -0
  47. gaia/agents/code/orchestration/steps/base.py +188 -0
  48. gaia/agents/code/orchestration/steps/error_handler.py +314 -0
  49. gaia/agents/code/orchestration/steps/nextjs.py +828 -0
  50. gaia/agents/code/orchestration/steps/python.py +307 -0
  51. gaia/agents/code/orchestration/template_catalog.py +463 -0
  52. gaia/agents/code/orchestration/workflows/__init__.py +14 -0
  53. gaia/agents/code/orchestration/workflows/base.py +80 -0
  54. gaia/agents/code/orchestration/workflows/nextjs.py +186 -0
  55. gaia/agents/code/orchestration/workflows/python.py +94 -0
  56. gaia/agents/code/prompts/__init__.py +11 -0
  57. gaia/agents/code/prompts/base_prompt.py +77 -0
  58. gaia/agents/code/prompts/code_patterns.py +1925 -0
  59. gaia/agents/code/prompts/nextjs_prompt.py +40 -0
  60. gaia/agents/code/prompts/python_prompt.py +109 -0
  61. gaia/agents/code/schema_inference.py +365 -0
  62. gaia/agents/code/system_prompt.py +41 -0
  63. gaia/agents/code/tools/__init__.py +42 -0
  64. gaia/agents/code/tools/cli_tools.py +1138 -0
  65. gaia/agents/code/tools/code_formatting.py +319 -0
  66. gaia/agents/code/tools/code_tools.py +769 -0
  67. gaia/agents/code/tools/error_fixing.py +1347 -0
  68. gaia/agents/code/tools/external_tools.py +180 -0
  69. gaia/agents/code/tools/file_io.py +845 -0
  70. gaia/agents/code/tools/prisma_tools.py +190 -0
  71. gaia/agents/code/tools/project_management.py +1016 -0
  72. gaia/agents/code/tools/testing.py +321 -0
  73. gaia/agents/code/tools/typescript_tools.py +122 -0
  74. gaia/agents/code/tools/validation_parsing.py +461 -0
  75. gaia/agents/code/tools/validation_tools.py +803 -0
  76. gaia/agents/code/tools/web_dev_tools.py +1744 -0
  77. gaia/agents/code/validators/__init__.py +16 -0
  78. gaia/agents/code/validators/antipattern_checker.py +241 -0
  79. gaia/agents/code/validators/ast_analyzer.py +197 -0
  80. gaia/agents/code/validators/requirements_validator.py +145 -0
  81. gaia/agents/code/validators/syntax_validator.py +171 -0
  82. gaia/agents/docker/__init__.py +7 -0
  83. gaia/agents/docker/agent.py +642 -0
  84. gaia/agents/jira/__init__.py +11 -0
  85. gaia/agents/jira/agent.py +894 -0
  86. gaia/agents/jira/jql_templates.py +299 -0
  87. gaia/agents/routing/__init__.py +7 -0
  88. gaia/agents/routing/agent.py +512 -0
  89. gaia/agents/routing/system_prompt.py +75 -0
  90. gaia/api/__init__.py +23 -0
  91. gaia/api/agent_registry.py +238 -0
  92. gaia/api/app.py +305 -0
  93. gaia/api/openai_server.py +575 -0
  94. gaia/api/schemas.py +186 -0
  95. gaia/api/sse_handler.py +370 -0
  96. gaia/apps/__init__.py +4 -0
  97. gaia/apps/llm/__init__.py +6 -0
  98. gaia/apps/llm/app.py +169 -0
  99. gaia/apps/summarize/app.py +633 -0
  100. gaia/apps/summarize/html_viewer.py +133 -0
  101. gaia/apps/summarize/pdf_formatter.py +284 -0
  102. gaia/audio/__init__.py +2 -0
  103. gaia/audio/audio_client.py +439 -0
  104. gaia/audio/audio_recorder.py +269 -0
  105. gaia/audio/kokoro_tts.py +599 -0
  106. gaia/audio/whisper_asr.py +432 -0
  107. gaia/chat/__init__.py +16 -0
  108. gaia/chat/app.py +430 -0
  109. gaia/chat/prompts.py +522 -0
  110. gaia/chat/sdk.py +1200 -0
  111. gaia/cli.py +5621 -0
  112. gaia/eval/batch_experiment.py +2332 -0
  113. gaia/eval/claude.py +542 -0
  114. gaia/eval/config.py +37 -0
  115. gaia/eval/email_generator.py +512 -0
  116. gaia/eval/eval.py +3179 -0
  117. gaia/eval/groundtruth.py +1130 -0
  118. gaia/eval/transcript_generator.py +582 -0
  119. gaia/eval/webapp/README.md +168 -0
  120. gaia/eval/webapp/node_modules/.bin/mime +16 -0
  121. gaia/eval/webapp/node_modules/.bin/mime.cmd +17 -0
  122. gaia/eval/webapp/node_modules/.bin/mime.ps1 +28 -0
  123. gaia/eval/webapp/node_modules/.package-lock.json +865 -0
  124. gaia/eval/webapp/node_modules/accepts/HISTORY.md +243 -0
  125. gaia/eval/webapp/node_modules/accepts/LICENSE +23 -0
  126. gaia/eval/webapp/node_modules/accepts/README.md +140 -0
  127. gaia/eval/webapp/node_modules/accepts/index.js +238 -0
  128. gaia/eval/webapp/node_modules/accepts/package.json +47 -0
  129. gaia/eval/webapp/node_modules/array-flatten/LICENSE +21 -0
  130. gaia/eval/webapp/node_modules/array-flatten/README.md +43 -0
  131. gaia/eval/webapp/node_modules/array-flatten/array-flatten.js +64 -0
  132. gaia/eval/webapp/node_modules/array-flatten/package.json +39 -0
  133. gaia/eval/webapp/node_modules/body-parser/HISTORY.md +672 -0
  134. gaia/eval/webapp/node_modules/body-parser/LICENSE +23 -0
  135. gaia/eval/webapp/node_modules/body-parser/README.md +476 -0
  136. gaia/eval/webapp/node_modules/body-parser/SECURITY.md +25 -0
  137. gaia/eval/webapp/node_modules/body-parser/index.js +156 -0
  138. gaia/eval/webapp/node_modules/body-parser/lib/read.js +205 -0
  139. gaia/eval/webapp/node_modules/body-parser/lib/types/json.js +247 -0
  140. gaia/eval/webapp/node_modules/body-parser/lib/types/raw.js +101 -0
  141. gaia/eval/webapp/node_modules/body-parser/lib/types/text.js +121 -0
  142. gaia/eval/webapp/node_modules/body-parser/lib/types/urlencoded.js +307 -0
  143. gaia/eval/webapp/node_modules/body-parser/package.json +56 -0
  144. gaia/eval/webapp/node_modules/bytes/History.md +97 -0
  145. gaia/eval/webapp/node_modules/bytes/LICENSE +23 -0
  146. gaia/eval/webapp/node_modules/bytes/Readme.md +152 -0
  147. gaia/eval/webapp/node_modules/bytes/index.js +170 -0
  148. gaia/eval/webapp/node_modules/bytes/package.json +42 -0
  149. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.eslintrc +17 -0
  150. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.github/FUNDING.yml +12 -0
  151. gaia/eval/webapp/node_modules/call-bind-apply-helpers/.nycrc +9 -0
  152. gaia/eval/webapp/node_modules/call-bind-apply-helpers/CHANGELOG.md +30 -0
  153. gaia/eval/webapp/node_modules/call-bind-apply-helpers/LICENSE +21 -0
  154. gaia/eval/webapp/node_modules/call-bind-apply-helpers/README.md +62 -0
  155. gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.d.ts +1 -0
  156. gaia/eval/webapp/node_modules/call-bind-apply-helpers/actualApply.js +10 -0
  157. gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.d.ts +19 -0
  158. gaia/eval/webapp/node_modules/call-bind-apply-helpers/applyBind.js +10 -0
  159. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.d.ts +1 -0
  160. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionApply.js +4 -0
  161. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.d.ts +1 -0
  162. gaia/eval/webapp/node_modules/call-bind-apply-helpers/functionCall.js +4 -0
  163. gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.d.ts +64 -0
  164. gaia/eval/webapp/node_modules/call-bind-apply-helpers/index.js +15 -0
  165. gaia/eval/webapp/node_modules/call-bind-apply-helpers/package.json +85 -0
  166. gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.d.ts +3 -0
  167. gaia/eval/webapp/node_modules/call-bind-apply-helpers/reflectApply.js +4 -0
  168. gaia/eval/webapp/node_modules/call-bind-apply-helpers/test/index.js +63 -0
  169. gaia/eval/webapp/node_modules/call-bind-apply-helpers/tsconfig.json +9 -0
  170. gaia/eval/webapp/node_modules/call-bound/.eslintrc +13 -0
  171. gaia/eval/webapp/node_modules/call-bound/.github/FUNDING.yml +12 -0
  172. gaia/eval/webapp/node_modules/call-bound/.nycrc +9 -0
  173. gaia/eval/webapp/node_modules/call-bound/CHANGELOG.md +42 -0
  174. gaia/eval/webapp/node_modules/call-bound/LICENSE +21 -0
  175. gaia/eval/webapp/node_modules/call-bound/README.md +53 -0
  176. gaia/eval/webapp/node_modules/call-bound/index.d.ts +94 -0
  177. gaia/eval/webapp/node_modules/call-bound/index.js +19 -0
  178. gaia/eval/webapp/node_modules/call-bound/package.json +99 -0
  179. gaia/eval/webapp/node_modules/call-bound/test/index.js +61 -0
  180. gaia/eval/webapp/node_modules/call-bound/tsconfig.json +10 -0
  181. gaia/eval/webapp/node_modules/content-disposition/HISTORY.md +60 -0
  182. gaia/eval/webapp/node_modules/content-disposition/LICENSE +22 -0
  183. gaia/eval/webapp/node_modules/content-disposition/README.md +142 -0
  184. gaia/eval/webapp/node_modules/content-disposition/index.js +458 -0
  185. gaia/eval/webapp/node_modules/content-disposition/package.json +44 -0
  186. gaia/eval/webapp/node_modules/content-type/HISTORY.md +29 -0
  187. gaia/eval/webapp/node_modules/content-type/LICENSE +22 -0
  188. gaia/eval/webapp/node_modules/content-type/README.md +94 -0
  189. gaia/eval/webapp/node_modules/content-type/index.js +225 -0
  190. gaia/eval/webapp/node_modules/content-type/package.json +42 -0
  191. gaia/eval/webapp/node_modules/cookie/LICENSE +24 -0
  192. gaia/eval/webapp/node_modules/cookie/README.md +317 -0
  193. gaia/eval/webapp/node_modules/cookie/SECURITY.md +25 -0
  194. gaia/eval/webapp/node_modules/cookie/index.js +334 -0
  195. gaia/eval/webapp/node_modules/cookie/package.json +44 -0
  196. gaia/eval/webapp/node_modules/cookie-signature/.npmignore +4 -0
  197. gaia/eval/webapp/node_modules/cookie-signature/History.md +38 -0
  198. gaia/eval/webapp/node_modules/cookie-signature/Readme.md +42 -0
  199. gaia/eval/webapp/node_modules/cookie-signature/index.js +51 -0
  200. gaia/eval/webapp/node_modules/cookie-signature/package.json +18 -0
  201. gaia/eval/webapp/node_modules/debug/.coveralls.yml +1 -0
  202. gaia/eval/webapp/node_modules/debug/.eslintrc +11 -0
  203. gaia/eval/webapp/node_modules/debug/.npmignore +9 -0
  204. gaia/eval/webapp/node_modules/debug/.travis.yml +14 -0
  205. gaia/eval/webapp/node_modules/debug/CHANGELOG.md +362 -0
  206. gaia/eval/webapp/node_modules/debug/LICENSE +19 -0
  207. gaia/eval/webapp/node_modules/debug/Makefile +50 -0
  208. gaia/eval/webapp/node_modules/debug/README.md +312 -0
  209. gaia/eval/webapp/node_modules/debug/component.json +19 -0
  210. gaia/eval/webapp/node_modules/debug/karma.conf.js +70 -0
  211. gaia/eval/webapp/node_modules/debug/node.js +1 -0
  212. gaia/eval/webapp/node_modules/debug/package.json +49 -0
  213. gaia/eval/webapp/node_modules/debug/src/browser.js +185 -0
  214. gaia/eval/webapp/node_modules/debug/src/debug.js +202 -0
  215. gaia/eval/webapp/node_modules/debug/src/index.js +10 -0
  216. gaia/eval/webapp/node_modules/debug/src/inspector-log.js +15 -0
  217. gaia/eval/webapp/node_modules/debug/src/node.js +248 -0
  218. gaia/eval/webapp/node_modules/depd/History.md +103 -0
  219. gaia/eval/webapp/node_modules/depd/LICENSE +22 -0
  220. gaia/eval/webapp/node_modules/depd/Readme.md +280 -0
  221. gaia/eval/webapp/node_modules/depd/index.js +538 -0
  222. gaia/eval/webapp/node_modules/depd/lib/browser/index.js +77 -0
  223. gaia/eval/webapp/node_modules/depd/package.json +45 -0
  224. gaia/eval/webapp/node_modules/destroy/LICENSE +23 -0
  225. gaia/eval/webapp/node_modules/destroy/README.md +63 -0
  226. gaia/eval/webapp/node_modules/destroy/index.js +209 -0
  227. gaia/eval/webapp/node_modules/destroy/package.json +48 -0
  228. gaia/eval/webapp/node_modules/dunder-proto/.eslintrc +5 -0
  229. gaia/eval/webapp/node_modules/dunder-proto/.github/FUNDING.yml +12 -0
  230. gaia/eval/webapp/node_modules/dunder-proto/.nycrc +13 -0
  231. gaia/eval/webapp/node_modules/dunder-proto/CHANGELOG.md +24 -0
  232. gaia/eval/webapp/node_modules/dunder-proto/LICENSE +21 -0
  233. gaia/eval/webapp/node_modules/dunder-proto/README.md +54 -0
  234. gaia/eval/webapp/node_modules/dunder-proto/get.d.ts +5 -0
  235. gaia/eval/webapp/node_modules/dunder-proto/get.js +30 -0
  236. gaia/eval/webapp/node_modules/dunder-proto/package.json +76 -0
  237. gaia/eval/webapp/node_modules/dunder-proto/set.d.ts +5 -0
  238. gaia/eval/webapp/node_modules/dunder-proto/set.js +35 -0
  239. gaia/eval/webapp/node_modules/dunder-proto/test/get.js +34 -0
  240. gaia/eval/webapp/node_modules/dunder-proto/test/index.js +4 -0
  241. gaia/eval/webapp/node_modules/dunder-proto/test/set.js +50 -0
  242. gaia/eval/webapp/node_modules/dunder-proto/tsconfig.json +9 -0
  243. gaia/eval/webapp/node_modules/ee-first/LICENSE +22 -0
  244. gaia/eval/webapp/node_modules/ee-first/README.md +80 -0
  245. gaia/eval/webapp/node_modules/ee-first/index.js +95 -0
  246. gaia/eval/webapp/node_modules/ee-first/package.json +29 -0
  247. gaia/eval/webapp/node_modules/encodeurl/LICENSE +22 -0
  248. gaia/eval/webapp/node_modules/encodeurl/README.md +109 -0
  249. gaia/eval/webapp/node_modules/encodeurl/index.js +60 -0
  250. gaia/eval/webapp/node_modules/encodeurl/package.json +40 -0
  251. gaia/eval/webapp/node_modules/es-define-property/.eslintrc +13 -0
  252. gaia/eval/webapp/node_modules/es-define-property/.github/FUNDING.yml +12 -0
  253. gaia/eval/webapp/node_modules/es-define-property/.nycrc +9 -0
  254. gaia/eval/webapp/node_modules/es-define-property/CHANGELOG.md +29 -0
  255. gaia/eval/webapp/node_modules/es-define-property/LICENSE +21 -0
  256. gaia/eval/webapp/node_modules/es-define-property/README.md +49 -0
  257. gaia/eval/webapp/node_modules/es-define-property/index.d.ts +3 -0
  258. gaia/eval/webapp/node_modules/es-define-property/index.js +14 -0
  259. gaia/eval/webapp/node_modules/es-define-property/package.json +81 -0
  260. gaia/eval/webapp/node_modules/es-define-property/test/index.js +56 -0
  261. gaia/eval/webapp/node_modules/es-define-property/tsconfig.json +10 -0
  262. gaia/eval/webapp/node_modules/es-errors/.eslintrc +5 -0
  263. gaia/eval/webapp/node_modules/es-errors/.github/FUNDING.yml +12 -0
  264. gaia/eval/webapp/node_modules/es-errors/CHANGELOG.md +40 -0
  265. gaia/eval/webapp/node_modules/es-errors/LICENSE +21 -0
  266. gaia/eval/webapp/node_modules/es-errors/README.md +55 -0
  267. gaia/eval/webapp/node_modules/es-errors/eval.d.ts +3 -0
  268. gaia/eval/webapp/node_modules/es-errors/eval.js +4 -0
  269. gaia/eval/webapp/node_modules/es-errors/index.d.ts +3 -0
  270. gaia/eval/webapp/node_modules/es-errors/index.js +4 -0
  271. gaia/eval/webapp/node_modules/es-errors/package.json +80 -0
  272. gaia/eval/webapp/node_modules/es-errors/range.d.ts +3 -0
  273. gaia/eval/webapp/node_modules/es-errors/range.js +4 -0
  274. gaia/eval/webapp/node_modules/es-errors/ref.d.ts +3 -0
  275. gaia/eval/webapp/node_modules/es-errors/ref.js +4 -0
  276. gaia/eval/webapp/node_modules/es-errors/syntax.d.ts +3 -0
  277. gaia/eval/webapp/node_modules/es-errors/syntax.js +4 -0
  278. gaia/eval/webapp/node_modules/es-errors/test/index.js +19 -0
  279. gaia/eval/webapp/node_modules/es-errors/tsconfig.json +49 -0
  280. gaia/eval/webapp/node_modules/es-errors/type.d.ts +3 -0
  281. gaia/eval/webapp/node_modules/es-errors/type.js +4 -0
  282. gaia/eval/webapp/node_modules/es-errors/uri.d.ts +3 -0
  283. gaia/eval/webapp/node_modules/es-errors/uri.js +4 -0
  284. gaia/eval/webapp/node_modules/es-object-atoms/.eslintrc +16 -0
  285. gaia/eval/webapp/node_modules/es-object-atoms/.github/FUNDING.yml +12 -0
  286. gaia/eval/webapp/node_modules/es-object-atoms/CHANGELOG.md +37 -0
  287. gaia/eval/webapp/node_modules/es-object-atoms/LICENSE +21 -0
  288. gaia/eval/webapp/node_modules/es-object-atoms/README.md +63 -0
  289. gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.d.ts +3 -0
  290. gaia/eval/webapp/node_modules/es-object-atoms/RequireObjectCoercible.js +11 -0
  291. gaia/eval/webapp/node_modules/es-object-atoms/ToObject.d.ts +7 -0
  292. gaia/eval/webapp/node_modules/es-object-atoms/ToObject.js +10 -0
  293. gaia/eval/webapp/node_modules/es-object-atoms/index.d.ts +3 -0
  294. gaia/eval/webapp/node_modules/es-object-atoms/index.js +4 -0
  295. gaia/eval/webapp/node_modules/es-object-atoms/isObject.d.ts +3 -0
  296. gaia/eval/webapp/node_modules/es-object-atoms/isObject.js +6 -0
  297. gaia/eval/webapp/node_modules/es-object-atoms/package.json +80 -0
  298. gaia/eval/webapp/node_modules/es-object-atoms/test/index.js +38 -0
  299. gaia/eval/webapp/node_modules/es-object-atoms/tsconfig.json +6 -0
  300. gaia/eval/webapp/node_modules/escape-html/LICENSE +24 -0
  301. gaia/eval/webapp/node_modules/escape-html/Readme.md +43 -0
  302. gaia/eval/webapp/node_modules/escape-html/index.js +78 -0
  303. gaia/eval/webapp/node_modules/escape-html/package.json +24 -0
  304. gaia/eval/webapp/node_modules/etag/HISTORY.md +83 -0
  305. gaia/eval/webapp/node_modules/etag/LICENSE +22 -0
  306. gaia/eval/webapp/node_modules/etag/README.md +159 -0
  307. gaia/eval/webapp/node_modules/etag/index.js +131 -0
  308. gaia/eval/webapp/node_modules/etag/package.json +47 -0
  309. gaia/eval/webapp/node_modules/express/History.md +3656 -0
  310. gaia/eval/webapp/node_modules/express/LICENSE +24 -0
  311. gaia/eval/webapp/node_modules/express/Readme.md +260 -0
  312. gaia/eval/webapp/node_modules/express/index.js +11 -0
  313. gaia/eval/webapp/node_modules/express/lib/application.js +661 -0
  314. gaia/eval/webapp/node_modules/express/lib/express.js +116 -0
  315. gaia/eval/webapp/node_modules/express/lib/middleware/init.js +43 -0
  316. gaia/eval/webapp/node_modules/express/lib/middleware/query.js +47 -0
  317. gaia/eval/webapp/node_modules/express/lib/request.js +525 -0
  318. gaia/eval/webapp/node_modules/express/lib/response.js +1179 -0
  319. gaia/eval/webapp/node_modules/express/lib/router/index.js +673 -0
  320. gaia/eval/webapp/node_modules/express/lib/router/layer.js +181 -0
  321. gaia/eval/webapp/node_modules/express/lib/router/route.js +230 -0
  322. gaia/eval/webapp/node_modules/express/lib/utils.js +303 -0
  323. gaia/eval/webapp/node_modules/express/lib/view.js +182 -0
  324. gaia/eval/webapp/node_modules/express/package.json +102 -0
  325. gaia/eval/webapp/node_modules/finalhandler/HISTORY.md +210 -0
  326. gaia/eval/webapp/node_modules/finalhandler/LICENSE +22 -0
  327. gaia/eval/webapp/node_modules/finalhandler/README.md +147 -0
  328. gaia/eval/webapp/node_modules/finalhandler/SECURITY.md +25 -0
  329. gaia/eval/webapp/node_modules/finalhandler/index.js +341 -0
  330. gaia/eval/webapp/node_modules/finalhandler/package.json +47 -0
  331. gaia/eval/webapp/node_modules/forwarded/HISTORY.md +21 -0
  332. gaia/eval/webapp/node_modules/forwarded/LICENSE +22 -0
  333. gaia/eval/webapp/node_modules/forwarded/README.md +57 -0
  334. gaia/eval/webapp/node_modules/forwarded/index.js +90 -0
  335. gaia/eval/webapp/node_modules/forwarded/package.json +45 -0
  336. gaia/eval/webapp/node_modules/fresh/HISTORY.md +70 -0
  337. gaia/eval/webapp/node_modules/fresh/LICENSE +23 -0
  338. gaia/eval/webapp/node_modules/fresh/README.md +119 -0
  339. gaia/eval/webapp/node_modules/fresh/index.js +137 -0
  340. gaia/eval/webapp/node_modules/fresh/package.json +46 -0
  341. gaia/eval/webapp/node_modules/fs/README.md +9 -0
  342. gaia/eval/webapp/node_modules/fs/package.json +20 -0
  343. gaia/eval/webapp/node_modules/function-bind/.eslintrc +21 -0
  344. gaia/eval/webapp/node_modules/function-bind/.github/FUNDING.yml +12 -0
  345. gaia/eval/webapp/node_modules/function-bind/.github/SECURITY.md +3 -0
  346. gaia/eval/webapp/node_modules/function-bind/.nycrc +13 -0
  347. gaia/eval/webapp/node_modules/function-bind/CHANGELOG.md +136 -0
  348. gaia/eval/webapp/node_modules/function-bind/LICENSE +20 -0
  349. gaia/eval/webapp/node_modules/function-bind/README.md +46 -0
  350. gaia/eval/webapp/node_modules/function-bind/implementation.js +84 -0
  351. gaia/eval/webapp/node_modules/function-bind/index.js +5 -0
  352. gaia/eval/webapp/node_modules/function-bind/package.json +87 -0
  353. gaia/eval/webapp/node_modules/function-bind/test/.eslintrc +9 -0
  354. gaia/eval/webapp/node_modules/function-bind/test/index.js +252 -0
  355. gaia/eval/webapp/node_modules/get-intrinsic/.eslintrc +42 -0
  356. gaia/eval/webapp/node_modules/get-intrinsic/.github/FUNDING.yml +12 -0
  357. gaia/eval/webapp/node_modules/get-intrinsic/.nycrc +9 -0
  358. gaia/eval/webapp/node_modules/get-intrinsic/CHANGELOG.md +186 -0
  359. gaia/eval/webapp/node_modules/get-intrinsic/LICENSE +21 -0
  360. gaia/eval/webapp/node_modules/get-intrinsic/README.md +71 -0
  361. gaia/eval/webapp/node_modules/get-intrinsic/index.js +378 -0
  362. gaia/eval/webapp/node_modules/get-intrinsic/package.json +97 -0
  363. gaia/eval/webapp/node_modules/get-intrinsic/test/GetIntrinsic.js +274 -0
  364. gaia/eval/webapp/node_modules/get-proto/.eslintrc +10 -0
  365. gaia/eval/webapp/node_modules/get-proto/.github/FUNDING.yml +12 -0
  366. gaia/eval/webapp/node_modules/get-proto/.nycrc +9 -0
  367. gaia/eval/webapp/node_modules/get-proto/CHANGELOG.md +21 -0
  368. gaia/eval/webapp/node_modules/get-proto/LICENSE +21 -0
  369. gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.d.ts +5 -0
  370. gaia/eval/webapp/node_modules/get-proto/Object.getPrototypeOf.js +6 -0
  371. gaia/eval/webapp/node_modules/get-proto/README.md +50 -0
  372. gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.d.ts +3 -0
  373. gaia/eval/webapp/node_modules/get-proto/Reflect.getPrototypeOf.js +4 -0
  374. gaia/eval/webapp/node_modules/get-proto/index.d.ts +5 -0
  375. gaia/eval/webapp/node_modules/get-proto/index.js +27 -0
  376. gaia/eval/webapp/node_modules/get-proto/package.json +81 -0
  377. gaia/eval/webapp/node_modules/get-proto/test/index.js +68 -0
  378. gaia/eval/webapp/node_modules/get-proto/tsconfig.json +9 -0
  379. gaia/eval/webapp/node_modules/gopd/.eslintrc +16 -0
  380. gaia/eval/webapp/node_modules/gopd/.github/FUNDING.yml +12 -0
  381. gaia/eval/webapp/node_modules/gopd/CHANGELOG.md +45 -0
  382. gaia/eval/webapp/node_modules/gopd/LICENSE +21 -0
  383. gaia/eval/webapp/node_modules/gopd/README.md +40 -0
  384. gaia/eval/webapp/node_modules/gopd/gOPD.d.ts +1 -0
  385. gaia/eval/webapp/node_modules/gopd/gOPD.js +4 -0
  386. gaia/eval/webapp/node_modules/gopd/index.d.ts +5 -0
  387. gaia/eval/webapp/node_modules/gopd/index.js +15 -0
  388. gaia/eval/webapp/node_modules/gopd/package.json +77 -0
  389. gaia/eval/webapp/node_modules/gopd/test/index.js +36 -0
  390. gaia/eval/webapp/node_modules/gopd/tsconfig.json +9 -0
  391. gaia/eval/webapp/node_modules/has-symbols/.eslintrc +11 -0
  392. gaia/eval/webapp/node_modules/has-symbols/.github/FUNDING.yml +12 -0
  393. gaia/eval/webapp/node_modules/has-symbols/.nycrc +9 -0
  394. gaia/eval/webapp/node_modules/has-symbols/CHANGELOG.md +91 -0
  395. gaia/eval/webapp/node_modules/has-symbols/LICENSE +21 -0
  396. gaia/eval/webapp/node_modules/has-symbols/README.md +46 -0
  397. gaia/eval/webapp/node_modules/has-symbols/index.d.ts +3 -0
  398. gaia/eval/webapp/node_modules/has-symbols/index.js +14 -0
  399. gaia/eval/webapp/node_modules/has-symbols/package.json +111 -0
  400. gaia/eval/webapp/node_modules/has-symbols/shams.d.ts +3 -0
  401. gaia/eval/webapp/node_modules/has-symbols/shams.js +45 -0
  402. gaia/eval/webapp/node_modules/has-symbols/test/index.js +22 -0
  403. gaia/eval/webapp/node_modules/has-symbols/test/shams/core-js.js +29 -0
  404. gaia/eval/webapp/node_modules/has-symbols/test/shams/get-own-property-symbols.js +29 -0
  405. gaia/eval/webapp/node_modules/has-symbols/test/tests.js +58 -0
  406. gaia/eval/webapp/node_modules/has-symbols/tsconfig.json +10 -0
  407. gaia/eval/webapp/node_modules/hasown/.eslintrc +5 -0
  408. gaia/eval/webapp/node_modules/hasown/.github/FUNDING.yml +12 -0
  409. gaia/eval/webapp/node_modules/hasown/.nycrc +13 -0
  410. gaia/eval/webapp/node_modules/hasown/CHANGELOG.md +40 -0
  411. gaia/eval/webapp/node_modules/hasown/LICENSE +21 -0
  412. gaia/eval/webapp/node_modules/hasown/README.md +40 -0
  413. gaia/eval/webapp/node_modules/hasown/index.d.ts +3 -0
  414. gaia/eval/webapp/node_modules/hasown/index.js +8 -0
  415. gaia/eval/webapp/node_modules/hasown/package.json +92 -0
  416. gaia/eval/webapp/node_modules/hasown/tsconfig.json +6 -0
  417. gaia/eval/webapp/node_modules/http-errors/HISTORY.md +180 -0
  418. gaia/eval/webapp/node_modules/http-errors/LICENSE +23 -0
  419. gaia/eval/webapp/node_modules/http-errors/README.md +169 -0
  420. gaia/eval/webapp/node_modules/http-errors/index.js +289 -0
  421. gaia/eval/webapp/node_modules/http-errors/package.json +50 -0
  422. gaia/eval/webapp/node_modules/iconv-lite/Changelog.md +162 -0
  423. gaia/eval/webapp/node_modules/iconv-lite/LICENSE +21 -0
  424. gaia/eval/webapp/node_modules/iconv-lite/README.md +156 -0
  425. gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-codec.js +555 -0
  426. gaia/eval/webapp/node_modules/iconv-lite/encodings/dbcs-data.js +176 -0
  427. gaia/eval/webapp/node_modules/iconv-lite/encodings/index.js +22 -0
  428. gaia/eval/webapp/node_modules/iconv-lite/encodings/internal.js +188 -0
  429. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-codec.js +72 -0
  430. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data-generated.js +451 -0
  431. gaia/eval/webapp/node_modules/iconv-lite/encodings/sbcs-data.js +174 -0
  432. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/big5-added.json +122 -0
  433. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp936.json +264 -0
  434. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp949.json +273 -0
  435. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/cp950.json +177 -0
  436. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/eucjp.json +182 -0
  437. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gb18030-ranges.json +1 -0
  438. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/gbk-added.json +55 -0
  439. gaia/eval/webapp/node_modules/iconv-lite/encodings/tables/shiftjis.json +125 -0
  440. gaia/eval/webapp/node_modules/iconv-lite/encodings/utf16.js +177 -0
  441. gaia/eval/webapp/node_modules/iconv-lite/encodings/utf7.js +290 -0
  442. gaia/eval/webapp/node_modules/iconv-lite/lib/bom-handling.js +52 -0
  443. gaia/eval/webapp/node_modules/iconv-lite/lib/extend-node.js +217 -0
  444. gaia/eval/webapp/node_modules/iconv-lite/lib/index.d.ts +24 -0
  445. gaia/eval/webapp/node_modules/iconv-lite/lib/index.js +153 -0
  446. gaia/eval/webapp/node_modules/iconv-lite/lib/streams.js +121 -0
  447. gaia/eval/webapp/node_modules/iconv-lite/package.json +46 -0
  448. gaia/eval/webapp/node_modules/inherits/LICENSE +16 -0
  449. gaia/eval/webapp/node_modules/inherits/README.md +42 -0
  450. gaia/eval/webapp/node_modules/inherits/inherits.js +9 -0
  451. gaia/eval/webapp/node_modules/inherits/inherits_browser.js +27 -0
  452. gaia/eval/webapp/node_modules/inherits/package.json +29 -0
  453. gaia/eval/webapp/node_modules/ipaddr.js/LICENSE +19 -0
  454. gaia/eval/webapp/node_modules/ipaddr.js/README.md +233 -0
  455. gaia/eval/webapp/node_modules/ipaddr.js/ipaddr.min.js +1 -0
  456. gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js +673 -0
  457. gaia/eval/webapp/node_modules/ipaddr.js/lib/ipaddr.js.d.ts +68 -0
  458. gaia/eval/webapp/node_modules/ipaddr.js/package.json +35 -0
  459. gaia/eval/webapp/node_modules/math-intrinsics/.eslintrc +16 -0
  460. gaia/eval/webapp/node_modules/math-intrinsics/.github/FUNDING.yml +12 -0
  461. gaia/eval/webapp/node_modules/math-intrinsics/CHANGELOG.md +24 -0
  462. gaia/eval/webapp/node_modules/math-intrinsics/LICENSE +21 -0
  463. gaia/eval/webapp/node_modules/math-intrinsics/README.md +50 -0
  464. gaia/eval/webapp/node_modules/math-intrinsics/abs.d.ts +1 -0
  465. gaia/eval/webapp/node_modules/math-intrinsics/abs.js +4 -0
  466. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.d.ts +3 -0
  467. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxArrayLength.js +4 -0
  468. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.d.ts +3 -0
  469. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxSafeInteger.js +5 -0
  470. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.d.ts +3 -0
  471. gaia/eval/webapp/node_modules/math-intrinsics/constants/maxValue.js +5 -0
  472. gaia/eval/webapp/node_modules/math-intrinsics/floor.d.ts +1 -0
  473. gaia/eval/webapp/node_modules/math-intrinsics/floor.js +4 -0
  474. gaia/eval/webapp/node_modules/math-intrinsics/isFinite.d.ts +3 -0
  475. gaia/eval/webapp/node_modules/math-intrinsics/isFinite.js +12 -0
  476. gaia/eval/webapp/node_modules/math-intrinsics/isInteger.d.ts +3 -0
  477. gaia/eval/webapp/node_modules/math-intrinsics/isInteger.js +16 -0
  478. gaia/eval/webapp/node_modules/math-intrinsics/isNaN.d.ts +1 -0
  479. gaia/eval/webapp/node_modules/math-intrinsics/isNaN.js +6 -0
  480. gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.d.ts +3 -0
  481. gaia/eval/webapp/node_modules/math-intrinsics/isNegativeZero.js +6 -0
  482. gaia/eval/webapp/node_modules/math-intrinsics/max.d.ts +1 -0
  483. gaia/eval/webapp/node_modules/math-intrinsics/max.js +4 -0
  484. gaia/eval/webapp/node_modules/math-intrinsics/min.d.ts +1 -0
  485. gaia/eval/webapp/node_modules/math-intrinsics/min.js +4 -0
  486. gaia/eval/webapp/node_modules/math-intrinsics/mod.d.ts +3 -0
  487. gaia/eval/webapp/node_modules/math-intrinsics/mod.js +9 -0
  488. gaia/eval/webapp/node_modules/math-intrinsics/package.json +86 -0
  489. gaia/eval/webapp/node_modules/math-intrinsics/pow.d.ts +1 -0
  490. gaia/eval/webapp/node_modules/math-intrinsics/pow.js +4 -0
  491. gaia/eval/webapp/node_modules/math-intrinsics/round.d.ts +1 -0
  492. gaia/eval/webapp/node_modules/math-intrinsics/round.js +4 -0
  493. gaia/eval/webapp/node_modules/math-intrinsics/sign.d.ts +3 -0
  494. gaia/eval/webapp/node_modules/math-intrinsics/sign.js +11 -0
  495. gaia/eval/webapp/node_modules/math-intrinsics/test/index.js +192 -0
  496. gaia/eval/webapp/node_modules/math-intrinsics/tsconfig.json +3 -0
  497. gaia/eval/webapp/node_modules/media-typer/HISTORY.md +22 -0
  498. gaia/eval/webapp/node_modules/media-typer/LICENSE +22 -0
  499. gaia/eval/webapp/node_modules/media-typer/README.md +81 -0
  500. gaia/eval/webapp/node_modules/media-typer/index.js +270 -0
  501. gaia/eval/webapp/node_modules/media-typer/package.json +26 -0
  502. gaia/eval/webapp/node_modules/merge-descriptors/HISTORY.md +21 -0
  503. gaia/eval/webapp/node_modules/merge-descriptors/LICENSE +23 -0
  504. gaia/eval/webapp/node_modules/merge-descriptors/README.md +49 -0
  505. gaia/eval/webapp/node_modules/merge-descriptors/index.js +60 -0
  506. gaia/eval/webapp/node_modules/merge-descriptors/package.json +39 -0
  507. gaia/eval/webapp/node_modules/methods/HISTORY.md +29 -0
  508. gaia/eval/webapp/node_modules/methods/LICENSE +24 -0
  509. gaia/eval/webapp/node_modules/methods/README.md +51 -0
  510. gaia/eval/webapp/node_modules/methods/index.js +69 -0
  511. gaia/eval/webapp/node_modules/methods/package.json +36 -0
  512. gaia/eval/webapp/node_modules/mime/.npmignore +0 -0
  513. gaia/eval/webapp/node_modules/mime/CHANGELOG.md +164 -0
  514. gaia/eval/webapp/node_modules/mime/LICENSE +21 -0
  515. gaia/eval/webapp/node_modules/mime/README.md +90 -0
  516. gaia/eval/webapp/node_modules/mime/cli.js +8 -0
  517. gaia/eval/webapp/node_modules/mime/mime.js +108 -0
  518. gaia/eval/webapp/node_modules/mime/package.json +44 -0
  519. gaia/eval/webapp/node_modules/mime/src/build.js +53 -0
  520. gaia/eval/webapp/node_modules/mime/src/test.js +60 -0
  521. gaia/eval/webapp/node_modules/mime/types.json +1 -0
  522. gaia/eval/webapp/node_modules/mime-db/HISTORY.md +507 -0
  523. gaia/eval/webapp/node_modules/mime-db/LICENSE +23 -0
  524. gaia/eval/webapp/node_modules/mime-db/README.md +100 -0
  525. gaia/eval/webapp/node_modules/mime-db/db.json +8519 -0
  526. gaia/eval/webapp/node_modules/mime-db/index.js +12 -0
  527. gaia/eval/webapp/node_modules/mime-db/package.json +60 -0
  528. gaia/eval/webapp/node_modules/mime-types/HISTORY.md +397 -0
  529. gaia/eval/webapp/node_modules/mime-types/LICENSE +23 -0
  530. gaia/eval/webapp/node_modules/mime-types/README.md +113 -0
  531. gaia/eval/webapp/node_modules/mime-types/index.js +188 -0
  532. gaia/eval/webapp/node_modules/mime-types/package.json +44 -0
  533. gaia/eval/webapp/node_modules/ms/index.js +152 -0
  534. gaia/eval/webapp/node_modules/ms/license.md +21 -0
  535. gaia/eval/webapp/node_modules/ms/package.json +37 -0
  536. gaia/eval/webapp/node_modules/ms/readme.md +51 -0
  537. gaia/eval/webapp/node_modules/negotiator/HISTORY.md +108 -0
  538. gaia/eval/webapp/node_modules/negotiator/LICENSE +24 -0
  539. gaia/eval/webapp/node_modules/negotiator/README.md +203 -0
  540. gaia/eval/webapp/node_modules/negotiator/index.js +82 -0
  541. gaia/eval/webapp/node_modules/negotiator/lib/charset.js +169 -0
  542. gaia/eval/webapp/node_modules/negotiator/lib/encoding.js +184 -0
  543. gaia/eval/webapp/node_modules/negotiator/lib/language.js +179 -0
  544. gaia/eval/webapp/node_modules/negotiator/lib/mediaType.js +294 -0
  545. gaia/eval/webapp/node_modules/negotiator/package.json +42 -0
  546. gaia/eval/webapp/node_modules/object-inspect/.eslintrc +53 -0
  547. gaia/eval/webapp/node_modules/object-inspect/.github/FUNDING.yml +12 -0
  548. gaia/eval/webapp/node_modules/object-inspect/.nycrc +13 -0
  549. gaia/eval/webapp/node_modules/object-inspect/CHANGELOG.md +424 -0
  550. gaia/eval/webapp/node_modules/object-inspect/LICENSE +21 -0
  551. gaia/eval/webapp/node_modules/object-inspect/example/all.js +23 -0
  552. gaia/eval/webapp/node_modules/object-inspect/example/circular.js +6 -0
  553. gaia/eval/webapp/node_modules/object-inspect/example/fn.js +5 -0
  554. gaia/eval/webapp/node_modules/object-inspect/example/inspect.js +10 -0
  555. gaia/eval/webapp/node_modules/object-inspect/index.js +544 -0
  556. gaia/eval/webapp/node_modules/object-inspect/package-support.json +20 -0
  557. gaia/eval/webapp/node_modules/object-inspect/package.json +105 -0
  558. gaia/eval/webapp/node_modules/object-inspect/readme.markdown +84 -0
  559. gaia/eval/webapp/node_modules/object-inspect/test/bigint.js +58 -0
  560. gaia/eval/webapp/node_modules/object-inspect/test/browser/dom.js +15 -0
  561. gaia/eval/webapp/node_modules/object-inspect/test/circular.js +16 -0
  562. gaia/eval/webapp/node_modules/object-inspect/test/deep.js +12 -0
  563. gaia/eval/webapp/node_modules/object-inspect/test/element.js +53 -0
  564. gaia/eval/webapp/node_modules/object-inspect/test/err.js +48 -0
  565. gaia/eval/webapp/node_modules/object-inspect/test/fakes.js +29 -0
  566. gaia/eval/webapp/node_modules/object-inspect/test/fn.js +76 -0
  567. gaia/eval/webapp/node_modules/object-inspect/test/global.js +17 -0
  568. gaia/eval/webapp/node_modules/object-inspect/test/has.js +15 -0
  569. gaia/eval/webapp/node_modules/object-inspect/test/holes.js +15 -0
  570. gaia/eval/webapp/node_modules/object-inspect/test/indent-option.js +271 -0
  571. gaia/eval/webapp/node_modules/object-inspect/test/inspect.js +139 -0
  572. gaia/eval/webapp/node_modules/object-inspect/test/lowbyte.js +12 -0
  573. gaia/eval/webapp/node_modules/object-inspect/test/number.js +58 -0
  574. gaia/eval/webapp/node_modules/object-inspect/test/quoteStyle.js +26 -0
  575. gaia/eval/webapp/node_modules/object-inspect/test/toStringTag.js +40 -0
  576. gaia/eval/webapp/node_modules/object-inspect/test/undef.js +12 -0
  577. gaia/eval/webapp/node_modules/object-inspect/test/values.js +261 -0
  578. gaia/eval/webapp/node_modules/object-inspect/test-core-js.js +26 -0
  579. gaia/eval/webapp/node_modules/object-inspect/util.inspect.js +1 -0
  580. gaia/eval/webapp/node_modules/on-finished/HISTORY.md +98 -0
  581. gaia/eval/webapp/node_modules/on-finished/LICENSE +23 -0
  582. gaia/eval/webapp/node_modules/on-finished/README.md +162 -0
  583. gaia/eval/webapp/node_modules/on-finished/index.js +234 -0
  584. gaia/eval/webapp/node_modules/on-finished/package.json +39 -0
  585. gaia/eval/webapp/node_modules/parseurl/HISTORY.md +58 -0
  586. gaia/eval/webapp/node_modules/parseurl/LICENSE +24 -0
  587. gaia/eval/webapp/node_modules/parseurl/README.md +133 -0
  588. gaia/eval/webapp/node_modules/parseurl/index.js +158 -0
  589. gaia/eval/webapp/node_modules/parseurl/package.json +40 -0
  590. gaia/eval/webapp/node_modules/path/.npmignore +1 -0
  591. gaia/eval/webapp/node_modules/path/LICENSE +18 -0
  592. gaia/eval/webapp/node_modules/path/README.md +15 -0
  593. gaia/eval/webapp/node_modules/path/package.json +24 -0
  594. gaia/eval/webapp/node_modules/path/path.js +628 -0
  595. gaia/eval/webapp/node_modules/path-to-regexp/LICENSE +21 -0
  596. gaia/eval/webapp/node_modules/path-to-regexp/Readme.md +35 -0
  597. gaia/eval/webapp/node_modules/path-to-regexp/index.js +156 -0
  598. gaia/eval/webapp/node_modules/path-to-regexp/package.json +30 -0
  599. gaia/eval/webapp/node_modules/process/.eslintrc +21 -0
  600. gaia/eval/webapp/node_modules/process/LICENSE +22 -0
  601. gaia/eval/webapp/node_modules/process/README.md +26 -0
  602. gaia/eval/webapp/node_modules/process/browser.js +184 -0
  603. gaia/eval/webapp/node_modules/process/index.js +2 -0
  604. gaia/eval/webapp/node_modules/process/package.json +27 -0
  605. gaia/eval/webapp/node_modules/process/test.js +199 -0
  606. gaia/eval/webapp/node_modules/proxy-addr/HISTORY.md +161 -0
  607. gaia/eval/webapp/node_modules/proxy-addr/LICENSE +22 -0
  608. gaia/eval/webapp/node_modules/proxy-addr/README.md +139 -0
  609. gaia/eval/webapp/node_modules/proxy-addr/index.js +327 -0
  610. gaia/eval/webapp/node_modules/proxy-addr/package.json +47 -0
  611. gaia/eval/webapp/node_modules/qs/.editorconfig +46 -0
  612. gaia/eval/webapp/node_modules/qs/.eslintrc +38 -0
  613. gaia/eval/webapp/node_modules/qs/.github/FUNDING.yml +12 -0
  614. gaia/eval/webapp/node_modules/qs/.nycrc +13 -0
  615. gaia/eval/webapp/node_modules/qs/CHANGELOG.md +600 -0
  616. gaia/eval/webapp/node_modules/qs/LICENSE.md +29 -0
  617. gaia/eval/webapp/node_modules/qs/README.md +709 -0
  618. gaia/eval/webapp/node_modules/qs/dist/qs.js +90 -0
  619. gaia/eval/webapp/node_modules/qs/lib/formats.js +23 -0
  620. gaia/eval/webapp/node_modules/qs/lib/index.js +11 -0
  621. gaia/eval/webapp/node_modules/qs/lib/parse.js +296 -0
  622. gaia/eval/webapp/node_modules/qs/lib/stringify.js +351 -0
  623. gaia/eval/webapp/node_modules/qs/lib/utils.js +265 -0
  624. gaia/eval/webapp/node_modules/qs/package.json +91 -0
  625. gaia/eval/webapp/node_modules/qs/test/empty-keys-cases.js +267 -0
  626. gaia/eval/webapp/node_modules/qs/test/parse.js +1170 -0
  627. gaia/eval/webapp/node_modules/qs/test/stringify.js +1298 -0
  628. gaia/eval/webapp/node_modules/qs/test/utils.js +136 -0
  629. gaia/eval/webapp/node_modules/range-parser/HISTORY.md +56 -0
  630. gaia/eval/webapp/node_modules/range-parser/LICENSE +23 -0
  631. gaia/eval/webapp/node_modules/range-parser/README.md +84 -0
  632. gaia/eval/webapp/node_modules/range-parser/index.js +162 -0
  633. gaia/eval/webapp/node_modules/range-parser/package.json +44 -0
  634. gaia/eval/webapp/node_modules/raw-body/HISTORY.md +308 -0
  635. gaia/eval/webapp/node_modules/raw-body/LICENSE +22 -0
  636. gaia/eval/webapp/node_modules/raw-body/README.md +223 -0
  637. gaia/eval/webapp/node_modules/raw-body/SECURITY.md +24 -0
  638. gaia/eval/webapp/node_modules/raw-body/index.d.ts +87 -0
  639. gaia/eval/webapp/node_modules/raw-body/index.js +336 -0
  640. gaia/eval/webapp/node_modules/raw-body/package.json +49 -0
  641. gaia/eval/webapp/node_modules/safe-buffer/LICENSE +21 -0
  642. gaia/eval/webapp/node_modules/safe-buffer/README.md +584 -0
  643. gaia/eval/webapp/node_modules/safe-buffer/index.d.ts +187 -0
  644. gaia/eval/webapp/node_modules/safe-buffer/index.js +65 -0
  645. gaia/eval/webapp/node_modules/safe-buffer/package.json +51 -0
  646. gaia/eval/webapp/node_modules/safer-buffer/LICENSE +21 -0
  647. gaia/eval/webapp/node_modules/safer-buffer/Porting-Buffer.md +268 -0
  648. gaia/eval/webapp/node_modules/safer-buffer/Readme.md +156 -0
  649. gaia/eval/webapp/node_modules/safer-buffer/dangerous.js +58 -0
  650. gaia/eval/webapp/node_modules/safer-buffer/package.json +34 -0
  651. gaia/eval/webapp/node_modules/safer-buffer/safer.js +77 -0
  652. gaia/eval/webapp/node_modules/safer-buffer/tests.js +406 -0
  653. gaia/eval/webapp/node_modules/send/HISTORY.md +526 -0
  654. gaia/eval/webapp/node_modules/send/LICENSE +23 -0
  655. gaia/eval/webapp/node_modules/send/README.md +327 -0
  656. gaia/eval/webapp/node_modules/send/SECURITY.md +24 -0
  657. gaia/eval/webapp/node_modules/send/index.js +1142 -0
  658. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/HISTORY.md +14 -0
  659. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/LICENSE +22 -0
  660. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/README.md +128 -0
  661. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/index.js +60 -0
  662. gaia/eval/webapp/node_modules/send/node_modules/encodeurl/package.json +40 -0
  663. gaia/eval/webapp/node_modules/send/node_modules/ms/index.js +162 -0
  664. gaia/eval/webapp/node_modules/send/node_modules/ms/license.md +21 -0
  665. gaia/eval/webapp/node_modules/send/node_modules/ms/package.json +38 -0
  666. gaia/eval/webapp/node_modules/send/node_modules/ms/readme.md +59 -0
  667. gaia/eval/webapp/node_modules/send/package.json +62 -0
  668. gaia/eval/webapp/node_modules/serve-static/HISTORY.md +487 -0
  669. gaia/eval/webapp/node_modules/serve-static/LICENSE +25 -0
  670. gaia/eval/webapp/node_modules/serve-static/README.md +257 -0
  671. gaia/eval/webapp/node_modules/serve-static/index.js +209 -0
  672. gaia/eval/webapp/node_modules/serve-static/package.json +42 -0
  673. gaia/eval/webapp/node_modules/setprototypeof/LICENSE +13 -0
  674. gaia/eval/webapp/node_modules/setprototypeof/README.md +31 -0
  675. gaia/eval/webapp/node_modules/setprototypeof/index.d.ts +2 -0
  676. gaia/eval/webapp/node_modules/setprototypeof/index.js +17 -0
  677. gaia/eval/webapp/node_modules/setprototypeof/package.json +38 -0
  678. gaia/eval/webapp/node_modules/setprototypeof/test/index.js +24 -0
  679. gaia/eval/webapp/node_modules/side-channel/.editorconfig +9 -0
  680. gaia/eval/webapp/node_modules/side-channel/.eslintrc +12 -0
  681. gaia/eval/webapp/node_modules/side-channel/.github/FUNDING.yml +12 -0
  682. gaia/eval/webapp/node_modules/side-channel/.nycrc +13 -0
  683. gaia/eval/webapp/node_modules/side-channel/CHANGELOG.md +110 -0
  684. gaia/eval/webapp/node_modules/side-channel/LICENSE +21 -0
  685. gaia/eval/webapp/node_modules/side-channel/README.md +61 -0
  686. gaia/eval/webapp/node_modules/side-channel/index.d.ts +14 -0
  687. gaia/eval/webapp/node_modules/side-channel/index.js +43 -0
  688. gaia/eval/webapp/node_modules/side-channel/package.json +85 -0
  689. gaia/eval/webapp/node_modules/side-channel/test/index.js +104 -0
  690. gaia/eval/webapp/node_modules/side-channel/tsconfig.json +9 -0
  691. gaia/eval/webapp/node_modules/side-channel-list/.editorconfig +9 -0
  692. gaia/eval/webapp/node_modules/side-channel-list/.eslintrc +11 -0
  693. gaia/eval/webapp/node_modules/side-channel-list/.github/FUNDING.yml +12 -0
  694. gaia/eval/webapp/node_modules/side-channel-list/.nycrc +13 -0
  695. gaia/eval/webapp/node_modules/side-channel-list/CHANGELOG.md +15 -0
  696. gaia/eval/webapp/node_modules/side-channel-list/LICENSE +21 -0
  697. gaia/eval/webapp/node_modules/side-channel-list/README.md +62 -0
  698. gaia/eval/webapp/node_modules/side-channel-list/index.d.ts +13 -0
  699. gaia/eval/webapp/node_modules/side-channel-list/index.js +113 -0
  700. gaia/eval/webapp/node_modules/side-channel-list/list.d.ts +14 -0
  701. gaia/eval/webapp/node_modules/side-channel-list/package.json +77 -0
  702. gaia/eval/webapp/node_modules/side-channel-list/test/index.js +104 -0
  703. gaia/eval/webapp/node_modules/side-channel-list/tsconfig.json +9 -0
  704. gaia/eval/webapp/node_modules/side-channel-map/.editorconfig +9 -0
  705. gaia/eval/webapp/node_modules/side-channel-map/.eslintrc +11 -0
  706. gaia/eval/webapp/node_modules/side-channel-map/.github/FUNDING.yml +12 -0
  707. gaia/eval/webapp/node_modules/side-channel-map/.nycrc +13 -0
  708. gaia/eval/webapp/node_modules/side-channel-map/CHANGELOG.md +22 -0
  709. gaia/eval/webapp/node_modules/side-channel-map/LICENSE +21 -0
  710. gaia/eval/webapp/node_modules/side-channel-map/README.md +62 -0
  711. gaia/eval/webapp/node_modules/side-channel-map/index.d.ts +15 -0
  712. gaia/eval/webapp/node_modules/side-channel-map/index.js +68 -0
  713. gaia/eval/webapp/node_modules/side-channel-map/package.json +80 -0
  714. gaia/eval/webapp/node_modules/side-channel-map/test/index.js +114 -0
  715. gaia/eval/webapp/node_modules/side-channel-map/tsconfig.json +9 -0
  716. gaia/eval/webapp/node_modules/side-channel-weakmap/.editorconfig +9 -0
  717. gaia/eval/webapp/node_modules/side-channel-weakmap/.eslintrc +12 -0
  718. gaia/eval/webapp/node_modules/side-channel-weakmap/.github/FUNDING.yml +12 -0
  719. gaia/eval/webapp/node_modules/side-channel-weakmap/.nycrc +13 -0
  720. gaia/eval/webapp/node_modules/side-channel-weakmap/CHANGELOG.md +28 -0
  721. gaia/eval/webapp/node_modules/side-channel-weakmap/LICENSE +21 -0
  722. gaia/eval/webapp/node_modules/side-channel-weakmap/README.md +62 -0
  723. gaia/eval/webapp/node_modules/side-channel-weakmap/index.d.ts +15 -0
  724. gaia/eval/webapp/node_modules/side-channel-weakmap/index.js +84 -0
  725. gaia/eval/webapp/node_modules/side-channel-weakmap/package.json +87 -0
  726. gaia/eval/webapp/node_modules/side-channel-weakmap/test/index.js +114 -0
  727. gaia/eval/webapp/node_modules/side-channel-weakmap/tsconfig.json +9 -0
  728. gaia/eval/webapp/node_modules/statuses/HISTORY.md +82 -0
  729. gaia/eval/webapp/node_modules/statuses/LICENSE +23 -0
  730. gaia/eval/webapp/node_modules/statuses/README.md +136 -0
  731. gaia/eval/webapp/node_modules/statuses/codes.json +65 -0
  732. gaia/eval/webapp/node_modules/statuses/index.js +146 -0
  733. gaia/eval/webapp/node_modules/statuses/package.json +49 -0
  734. gaia/eval/webapp/node_modules/toidentifier/HISTORY.md +9 -0
  735. gaia/eval/webapp/node_modules/toidentifier/LICENSE +21 -0
  736. gaia/eval/webapp/node_modules/toidentifier/README.md +61 -0
  737. gaia/eval/webapp/node_modules/toidentifier/index.js +32 -0
  738. gaia/eval/webapp/node_modules/toidentifier/package.json +38 -0
  739. gaia/eval/webapp/node_modules/type-is/HISTORY.md +259 -0
  740. gaia/eval/webapp/node_modules/type-is/LICENSE +23 -0
  741. gaia/eval/webapp/node_modules/type-is/README.md +170 -0
  742. gaia/eval/webapp/node_modules/type-is/index.js +266 -0
  743. gaia/eval/webapp/node_modules/type-is/package.json +45 -0
  744. gaia/eval/webapp/node_modules/unpipe/HISTORY.md +4 -0
  745. gaia/eval/webapp/node_modules/unpipe/LICENSE +22 -0
  746. gaia/eval/webapp/node_modules/unpipe/README.md +43 -0
  747. gaia/eval/webapp/node_modules/unpipe/index.js +69 -0
  748. gaia/eval/webapp/node_modules/unpipe/package.json +27 -0
  749. gaia/eval/webapp/node_modules/util/LICENSE +18 -0
  750. gaia/eval/webapp/node_modules/util/README.md +15 -0
  751. gaia/eval/webapp/node_modules/util/node_modules/inherits/LICENSE +16 -0
  752. gaia/eval/webapp/node_modules/util/node_modules/inherits/README.md +42 -0
  753. gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits.js +7 -0
  754. gaia/eval/webapp/node_modules/util/node_modules/inherits/inherits_browser.js +23 -0
  755. gaia/eval/webapp/node_modules/util/node_modules/inherits/package.json +29 -0
  756. gaia/eval/webapp/node_modules/util/package.json +35 -0
  757. gaia/eval/webapp/node_modules/util/support/isBuffer.js +3 -0
  758. gaia/eval/webapp/node_modules/util/support/isBufferBrowser.js +6 -0
  759. gaia/eval/webapp/node_modules/util/util.js +586 -0
  760. gaia/eval/webapp/node_modules/utils-merge/.npmignore +9 -0
  761. gaia/eval/webapp/node_modules/utils-merge/LICENSE +20 -0
  762. gaia/eval/webapp/node_modules/utils-merge/README.md +34 -0
  763. gaia/eval/webapp/node_modules/utils-merge/index.js +23 -0
  764. gaia/eval/webapp/node_modules/utils-merge/package.json +40 -0
  765. gaia/eval/webapp/node_modules/vary/HISTORY.md +39 -0
  766. gaia/eval/webapp/node_modules/vary/LICENSE +22 -0
  767. gaia/eval/webapp/node_modules/vary/README.md +101 -0
  768. gaia/eval/webapp/node_modules/vary/index.js +149 -0
  769. gaia/eval/webapp/node_modules/vary/package.json +43 -0
  770. gaia/eval/webapp/package-lock.json +875 -0
  771. gaia/eval/webapp/package.json +21 -0
  772. gaia/eval/webapp/public/app.js +3403 -0
  773. gaia/eval/webapp/public/index.html +88 -0
  774. gaia/eval/webapp/public/styles.css +3661 -0
  775. gaia/eval/webapp/server.js +416 -0
  776. gaia/eval/webapp/test-setup.js +73 -0
  777. gaia/llm/__init__.py +2 -0
  778. gaia/llm/lemonade_client.py +3083 -0
  779. gaia/llm/lemonade_manager.py +269 -0
  780. gaia/llm/llm_client.py +729 -0
  781. gaia/llm/vlm_client.py +307 -0
  782. gaia/logger.py +189 -0
  783. gaia/mcp/agent_mcp_server.py +245 -0
  784. gaia/mcp/blender_mcp_client.py +138 -0
  785. gaia/mcp/blender_mcp_server.py +648 -0
  786. gaia/mcp/context7_cache.py +332 -0
  787. gaia/mcp/external_services.py +518 -0
  788. gaia/mcp/mcp_bridge.py +550 -0
  789. gaia/mcp/servers/__init__.py +6 -0
  790. gaia/mcp/servers/docker_mcp.py +83 -0
  791. gaia/rag/__init__.py +10 -0
  792. gaia/rag/app.py +293 -0
  793. gaia/rag/demo.py +304 -0
  794. gaia/rag/pdf_utils.py +235 -0
  795. gaia/rag/sdk.py +2194 -0
  796. gaia/security.py +163 -0
  797. gaia/talk/app.py +289 -0
  798. gaia/talk/sdk.py +538 -0
  799. gaia/util.py +46 -0
  800. gaia/version.py +100 -0
gaia/eval/eval.py ADDED
@@ -0,0 +1,3179 @@
+# Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+import json
+import re
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from gaia.eval.claude import ClaudeClient
+from gaia.logger import get_logger
+
+
+class Evaluator:
+    """Evaluates AI model performance across various use cases (summarization, Q&A, RAG, etc.)."""
+
+    def __init__(self, model="claude-sonnet-4-20250514"):
+        self.log = get_logger(__name__)
+        # Increase max_tokens to 4096 to avoid truncation of complex JSON responses
+        self.claude = ClaudeClient(model=model, max_tokens=4096)
+
+    def calculate_similarity(self, text1: str, text2: str) -> float:
+        """
+        Calculate cosine similarity between two texts using TF-IDF vectors.
+
+        Args:
+            text1: First text (ground truth)
+            text2: Second text (response)
+
+        Returns:
+            float: Cosine similarity score between 0 and 1
+        """
+        if not text1.strip() or not text2.strip():
+            return 0.0
+
+        try:
+            vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
+            vectors = vectorizer.fit_transform([text1, text2])
+            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
+            return float(similarity)
+        except Exception as e:
+            self.log.warning(f"Error calculating similarity: {e}")
+            return 0.0
+
+    def determine_pass_fail(
+        self, similarity: float, threshold: float, claude_analysis: Dict = None
+    ) -> Dict:
+        """
+        Determine pass/fail based on comprehensive evaluation criteria.
+
+        Args:
+            similarity: Similarity score between ground truth and response
+            threshold: Similarity threshold
+            claude_analysis: Claude's qualitative analysis (correctness, completeness, etc.)
+
+        Returns:
+            Dict containing pass/fail determination and reasoning
+        """
+        # Start with similarity-based evaluation
+        similarity_pass = similarity >= threshold
+
+        # If no Claude analysis available, fall back to similarity only
+        if not claude_analysis:
+            return {
+                "is_pass": similarity_pass,
+                "pass_fail": "pass" if similarity_pass else "fail",
+                "criteria": "similarity_only",
+                "reasoning": f"Similarity score {similarity:.3f} {'meets' if similarity_pass else 'below'} threshold {threshold:.3f}",
+            }
+
+        # Extract Claude's ratings
+        ratings = {}
+        for criterion in ["correctness", "completeness", "conciseness", "relevance"]:
+            if criterion in claude_analysis:
+                rating = claude_analysis[criterion].get("rating", "").lower()
+                ratings[criterion] = rating
+
+        # Define scoring system: excellent=4, good=3, fair=2, poor=1
+        score_map = {"excellent": 4, "good": 3, "fair": 2, "poor": 1}
+
+        # Calculate weighted scores (correctness and completeness are more important)
+        weights = {
+            "correctness": 0.4,
+            "completeness": 0.3,
+            "conciseness": 0.15,
+            "relevance": 0.15,
+        }
+
+        total_score = 0
+        max_possible = 0
+        criteria_details = []
+
+        for criterion, weight in weights.items():
+            if criterion in ratings:
+                rating = ratings[criterion]
+                score = score_map.get(rating, 1)
+                weighted_score = score * weight
+                total_score += weighted_score
+                max_possible += 4 * weight
+                criteria_details.append(f"{criterion}: {rating} ({score}/4)")
+
+        # Calculate normalized score (0-1)
+        normalized_score = total_score / max_possible if max_possible > 0 else 0
+
+        # Determine pass/fail using combined criteria:
+        # 1. Must meet minimum qualitative threshold (normalized score >= 0.6)
+        # 2. Correctness must be at least "fair"
+        # 3. Either high similarity OR good qualitative scores can pass
+
+        correctness_acceptable = ratings.get("correctness", "poor") in [
+            "fair",
+            "good",
+            "excellent",
+        ]
+        qualitative_pass = normalized_score >= 0.6 and correctness_acceptable
+
+        # Final determination: pass if either high similarity OR good qualitative scores
+        final_pass = similarity_pass or qualitative_pass
+
+        # Override: fail if correctness is "poor" regardless of other factors
+        if ratings.get("correctness", "") == "poor":
+            final_pass = False
+
+        reasoning_parts = [
+            f"Similarity: {similarity:.3f} ({'✓' if similarity_pass else '✗'} threshold {threshold:.3f})",
+            f"Qualitative score: {normalized_score:.2f} ({'✓' if qualitative_pass else '✗'} ≥0.6)",
+            f"Correctness: {ratings.get('correctness', 'N/A')} ({'✓' if correctness_acceptable else '✗'} ≥fair)",
+        ]
+
+        return {
+            "is_pass": final_pass,
+            "pass_fail": "pass" if final_pass else "fail",
+            "criteria": "comprehensive",
+            "reasoning": "; ".join(reasoning_parts),
+            "scores": {
+                "similarity": similarity,
+                "qualitative_normalized": normalized_score,
+                "qualitative_details": criteria_details,
+            },
+        }
+
+    def load_results(self, results_path: str) -> Dict:
+        """Load test results from a JSON file."""
+        try:
+            with open(results_path, "r") as f:
+                return json.load(f)
+        except Exception as e:
+            self.log.error(f"Error loading results file: {e}")
+            raise
+
+    def check_evaluation_exists(self, experiment_file: str, output_dir: str) -> bool:
+        """Check if evaluation already exists for experiment file.
+
+        Args:
+            experiment_file: Path to the experiment file
+            output_dir: Output directory for evaluations
+
+        Returns:
+            True if evaluation file already exists, False otherwise
+        """
+        experiment_path = Path(experiment_file)
+        output_base_path = Path(output_dir)
+
+        # Generate expected eval filename: <name>.experiment.eval.json
+        eval_filename = f"{experiment_path.stem}.eval.json"
+
+        # Check for hierarchical structure first
+        relative_path = None
+        if "experiments" in experiment_path.parts:
+            # Extract relative path from experiments directory
+            exp_idx = experiment_path.parts.index("experiments")
+            if exp_idx + 1 < len(experiment_path.parts):
+                relative_path = Path(*experiment_path.parts[exp_idx + 1 : -1])
+
+        # Check both locations: hierarchical and flat
+        eval_paths = []
+        if relative_path:
+            eval_paths.append(output_base_path / relative_path / eval_filename)
+        eval_paths.append(output_base_path / eval_filename)
+
+        for eval_path in eval_paths:
+            if eval_path.exists():
+                self.log.info(f"Evaluation already exists: {eval_path}")
+                return True
+
+        return False
+
+    def evaluate(self, results_path: str) -> Dict:
+        """
+        Evaluate RAG results and generate metrics.
+
+        Args:
+            results_path: Path to the results JSON file
+
+        Returns:
+            Dict containing evaluation metrics
+        """
+        results = self.load_results(results_path)
+        qa_results = results["analysis"]["qa_results"]
+
+        # Calculate similarity scores and pass/fail during evaluation
+        similarities = []
+        pass_results = []
+        threshold = results["metadata"]["similarity_threshold"]
+
+        for result in qa_results:
+            similarity = self.calculate_similarity(
+                result["ground_truth"], result["response"]
+            )
+            similarities.append(similarity)
+            pass_results.append(similarity >= threshold)
+
+        # Calculate accuracy metrics
+        total_questions = len(pass_results)
+        passed_questions = sum(pass_results)
+        failed_questions = total_questions - passed_questions
+        accuracy = passed_questions / total_questions if total_questions > 0 else 0.0
+
+        metrics = {
+            "test_file": results["metadata"]["test_file"],
+            "timestamp": results["metadata"]["timestamp"],
+            "threshold": results["metadata"]["similarity_threshold"],
+            "num_questions": len(qa_results),
+            "similarity_scores": {
+                "mean": float(np.mean(similarities)),
+                "median": float(np.median(similarities)),
+                "std": float(np.std(similarities)),
+                "min": float(np.min(similarities)),
+                "max": float(np.max(similarities)),
+            },
+            "threshold_metrics": {
+                "num_passed": passed_questions,
+                "num_failed": failed_questions,
+                "accuracy": accuracy,
240
+ "accuracy_percentage": accuracy * 100.0,
241
+ },
242
+ }
243
+
244
+ # Calculate pass rate
245
+ metrics["threshold_metrics"]["pass_rate"] = (
246
+ metrics["threshold_metrics"]["num_passed"] / metrics["num_questions"]
247
+ )
248
+
249
+ # Add overall rating based on pass rate and mean similarity
250
+ pass_rate = metrics["threshold_metrics"]["pass_rate"]
251
+ mean_similarity = metrics["similarity_scores"]["mean"]
252
+
253
+ if pass_rate >= 0.9 and mean_similarity >= 0.8:
254
+ rating = "excellent"
255
+ elif pass_rate >= 0.8 and mean_similarity >= 0.7:
256
+ rating = "good"
257
+ elif pass_rate >= 0.6 and mean_similarity >= 0.6:
258
+ rating = "fair"
259
+ else:
260
+ rating = "poor"
261
+
262
+ metrics["overall_rating"] = {
263
+ "rating": rating,
264
+ "pass_rate": pass_rate,
265
+ "mean_similarity": mean_similarity,
266
+ }
267
+
268
+ return metrics
269
+
270
+ def analyze_with_claude(
271
+ self, results_path: str, groundtruth_path: Optional[str] = None
272
+ ) -> Dict:
273
+ """
274
+ Use Claude to perform qualitative analysis of RAG results.
275
+
276
+ Args:
277
+ results_path: Path to results JSON file
278
+ groundtruth_path: Optional path to groundtruth file for comparison
279
+
280
+ Returns:
281
+ Dict containing Claude's analysis
282
+ """
283
+ # Start timing
284
+ start_time = time.time()
285
+
286
+ try:
287
+ results = self.load_results(results_path)
288
+
289
+ # Detect result type and extract appropriate data
290
+ analysis_data = results.get("analysis", {})
291
+ qa_results = analysis_data.get("qa_results", results.get("qa_results", []))
292
+ summarization_results = analysis_data.get("summarization_results", [])
293
+
294
+ # Determine evaluation type
295
+ if qa_results:
296
+ return self._analyze_qa_results(results, qa_results)
297
+ elif summarization_results:
298
+ return self._analyze_summarization_results(
299
+ results, summarization_results, groundtruth_path
300
+ )
301
+ else:
302
+ return {
303
+ "overall_analysis": "No QA or summarization results found to analyze",
304
+ "strengths": [],
305
+ "weaknesses": ["No data available for analysis"],
306
+ "recommendations": [
307
+ "Ensure input data contains QA or summarization results"
308
+ ],
309
+ "use_case_fit": "Unable to determine",
310
+ "per_question": [],
311
+ "overall_rating": {
312
+ "rating": "error",
313
+ "explanation": "No analyzable results found",
314
+ },
315
+ "timing": {
316
+ "total_processing_time_seconds": round(
317
+ time.time() - start_time, 3
318
+ )
319
+ },
320
+ }
321
+ except Exception as e:
322
+ self.log.error(f"Error in analyze_with_claude: {e}")
323
+ return {
324
+ "overall_analysis": f"Analysis failed: {str(e)}",
325
+ "strengths": [],
326
+ "weaknesses": ["Analysis failed to complete"],
327
+ "recommendations": ["Check logs for error details"],
328
+ "use_case_fit": "",
329
+ "per_question": [],
330
+ "overall_rating": {"rating": "error", "explanation": str(e)},
331
+ "timing": {
332
+ "total_processing_time_seconds": round(time.time() - start_time, 3)
333
+ },
334
+ }
335
+
336
+ def _analyze_qa_results(self, results: Dict, qa_results: List) -> Dict:
337
+ """Analyze QA results using Claude."""
338
+ # Start timing
339
+ analysis_start_time = time.time()
340
+
341
+ # Initialize analysis structure
342
+ analysis = {
343
+ "overall_analysis": "",
344
+ "strengths": [],
345
+ "weaknesses": [],
346
+ "recommendations": [],
347
+ "use_case_fit": "",
348
+ "per_question": [],
349
+ "overall_rating": {"rating": "", "explanation": ""},
350
+ "timing": {}, # Add timing information
351
+ }
352
+
353
+ if not qa_results:
354
+ return {
355
+ "overall_analysis": "No QA results found to analyze",
356
+ "strengths": [],
357
+ "weaknesses": ["No data available for analysis"],
358
+ "recommendations": ["Ensure input data contains QA results"],
359
+ "use_case_fit": "Unable to determine",
360
+ "per_question": [],
361
+ "overall_rating": {
362
+ "rating": "error",
363
+ "explanation": "No QA results found",
364
+ },
365
+ }
366
+
367
+ try:
368
+ per_question_timings = [] # Track timing for each question
369
+
370
+ # Set up intermediate output directory for crash recovery
371
+ intermediate_dir = None
372
+ experiment_name = results.get("metadata", {}).get(
373
+ "experiment_name", "qa_evaluation"
374
+ )
375
+ if hasattr(self, "intermediate_dir") and self.intermediate_dir:
376
+ # Use existing intermediate directory if set
377
+ intermediate_dir = (
378
+ Path(self.intermediate_dir)
379
+ / f"{experiment_name}_qa_analysis.intermediate"
380
+ )
381
+ else:
382
+ # Create in temp directory
383
+ import tempfile
384
+
385
+ temp_dir = Path(tempfile.gettempdir()) / "gaia_eval"
386
+ intermediate_dir = (
387
+ temp_dir / f"{experiment_name}_qa_analysis.intermediate"
388
+ )
389
+
390
+ if intermediate_dir:
391
+ intermediate_dir.mkdir(parents=True, exist_ok=True)
392
+ self.log.info(
393
+ f"Writing intermediate QA analysis results to: {intermediate_dir}"
394
+ )
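+ # Files written below for crash recovery (names match the code that follows):
+ # <intermediate_dir>/qa_0001_analysis.json, qa_0002_analysis.json, ...
+ # <intermediate_dir>/qa_analysis_progress.json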
395
+
396
+ for qa_result in qa_results:
397
+ question_start_time = time.time()
398
+
399
+ # Calculate similarity score between ground truth and response
400
+ similarity_score = self.calculate_similarity(
401
+ qa_result["ground_truth"], qa_result["response"]
402
+ )
403
+
404
+ # Store initial data (pass/fail will be determined after Claude analysis)
405
+ threshold = results["metadata"]["similarity_threshold"]
406
+
407
+ # Restructure the qa_result into qa_inputs
408
+ qa_inputs = {
409
+ "query": qa_result["query"],
410
+ "ground_truth": qa_result["ground_truth"],
411
+ "response": qa_result["response"],
412
+ "similarity": similarity_score,
413
+ "threshold": threshold,
414
+ }
415
+
416
+ prompt = f"""
417
+ Analyze this RAG (Retrieval Augmented Generation) system test result and provide detailed insights.
418
+
419
+ Query: {qa_inputs['query']}
420
+ Ground Truth: {qa_inputs['ground_truth']}
421
+ System Response: {qa_inputs['response']}
422
+ Similarity Score: {qa_inputs['similarity']}
423
+
424
+ Evaluate the response on these criteria, providing both a rating (excellent/good/fair/poor) and detailed explanation:
425
+ 1. Correctness: Is it factually correct compared to ground truth?
426
+ 2. Completeness: Does it fully answer the question?
427
+ 3. Conciseness: Is it appropriately brief while maintaining accuracy?
428
+ 4. Relevance: Does it directly address the query?
429
+
430
+ Return your analysis in this exact JSON format:
431
+ {{
432
+ "correctness": {{
433
+ "rating": "one of: excellent/good/fair/poor",
434
+ "explanation": "analysis of factual correctness"
435
+ }},
436
+ "completeness": {{
437
+ "rating": "one of: excellent/good/fair/poor",
438
+ "explanation": "analysis of answer completeness"
439
+ }},
440
+ "conciseness": {{
441
+ "rating": "one of: excellent/good/fair/poor",
442
+ "explanation": "analysis of brevity and clarity"
443
+ }},
444
+ "relevance": {{
445
+ "rating": "one of: excellent/good/fair/poor",
446
+ "explanation": "analysis of how well it addresses the query"
447
+ }}
448
+ }}
449
+ """
450
+
451
+ response_data = self.claude.get_completion_with_usage(prompt)
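+ # Per the accesses below, get_completion_with_usage is expected to return a dict
+ # of the form {"content": <reply>, "usage": {"input_tokens": ..., ...},
+ # "cost": {"input_cost": ..., "output_cost": ..., "total_cost": ...}}.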
452
+
453
+ try:
454
+ # Extract JSON and combine with qa_inputs
455
+ response = response_data["content"]
456
+ usage = response_data["usage"]
457
+ cost = response_data["cost"]
458
+
459
+ if isinstance(response, list):
460
+ response_text = (
461
+ response[0].text
462
+ if hasattr(response[0], "text")
463
+ else str(response[0])
464
+ )
465
+ else:
466
+ response_text = (
467
+ response.text
468
+ if hasattr(response, "text")
469
+ else str(response)
470
+ )
471
+
472
+ json_start = response_text.find("{")
473
+ json_end = response_text.rfind("}") + 1
474
+ if json_start >= 0 and json_end > json_start:
475
+ json_content = response_text[json_start:json_end]
476
+ qa_analysis = json.loads(json_content)
477
+
478
+ # Determine comprehensive pass/fail
479
+ pass_fail_result = self.determine_pass_fail(
480
+ similarity_score, threshold, qa_analysis
481
+ )
482
+
483
+ # Add all data to qa_inputs
484
+ qa_inputs.update(pass_fail_result)
485
+
486
+ # Add qa_inputs, usage, and cost as nested dictionaries
487
+ qa_analysis["qa_inputs"] = qa_inputs
488
+ qa_analysis["usage"] = usage
489
+ qa_analysis["cost"] = cost
490
+
491
+ # Add timing for this question
492
+ question_time = time.time() - question_start_time
493
+ qa_analysis["processing_time_seconds"] = round(question_time, 3)
494
+ per_question_timings.append(question_time)
495
+
496
+ analysis["per_question"].append(qa_analysis)
497
+
498
+ # Write intermediate result immediately for crash recovery
499
+ if intermediate_dir:
500
+ try:
501
+ intermediate_file = (
502
+ intermediate_dir
503
+ / f"qa_{len(analysis['per_question']):04d}_analysis.json"
504
+ )
505
+ intermediate_data = {
506
+ "question_index": len(analysis["per_question"]) - 1,
507
+ "experiment_name": experiment_name,
508
+ "qa_inputs": qa_inputs,
509
+ "analysis": qa_analysis,
510
+ "usage": qa_analysis.get("usage", {}),
511
+ "cost": qa_analysis.get("cost", {}),
512
+ "processing_time_seconds": qa_analysis.get(
513
+ "processing_time_seconds", 0
514
+ ),
515
+ "timestamp": datetime.now().isoformat(),
516
+ }
517
+
518
+ with open(
519
+ intermediate_file, "w", encoding="utf-8"
520
+ ) as f:
521
+ json.dump(intermediate_data, f, indent=2)
522
+
523
+ # Update progress file
524
+ progress_file = (
525
+ intermediate_dir / "qa_analysis_progress.json"
526
+ )
527
+ progress_data = {
528
+ "experiment_name": experiment_name,
529
+ "total_questions": len(qa_results),
530
+ "completed_questions": len(
531
+ analysis["per_question"]
532
+ ),
533
+ "progress_percent": round(
534
+ len(analysis["per_question"])
535
+ / len(qa_results)
536
+ * 100,
537
+ 1,
538
+ ),
539
+ "last_updated": datetime.now().isoformat(),
540
+ "estimated_remaining_time": None,
541
+ }
542
+
543
+ # Calculate estimated remaining time
544
+ if len(per_question_timings) > 0:
545
+ avg_time_per_question = sum(
546
+ per_question_timings
547
+ ) / len(per_question_timings)
548
+ remaining_questions = len(qa_results) - len(
549
+ analysis["per_question"]
550
+ )
551
+ estimated_remaining = (
552
+ remaining_questions * avg_time_per_question
553
+ )
554
+ progress_data["estimated_remaining_time"] = round(
555
+ estimated_remaining, 1
556
+ )
557
+
558
+ with open(progress_file, "w", encoding="utf-8") as f:
559
+ json.dump(progress_data, f, indent=2)
560
+
561
+ self.log.info(
562
+ f"QA analysis progress: {len(analysis['per_question'])}/{len(qa_results)} questions completed ({progress_data['progress_percent']}%)"
563
+ )
564
+
565
+ except Exception as e:
566
+ self.log.warning(
567
+ f"Failed to write intermediate QA analysis result {len(analysis['per_question'])}: {e}"
568
+ )
569
+
570
+ else:
571
+ self.log.error(f"No JSON found in response for question")
572
+
573
+ # Determine pass/fail without Claude analysis (similarity only)
574
+ pass_fail_result = self.determine_pass_fail(
575
+ similarity_score, threshold, None
576
+ )
577
+ qa_inputs.update(pass_fail_result)
578
+
579
+ # Add timing even for failed parsing
580
+ question_time = time.time() - question_start_time
581
+ per_question_timings.append(question_time)
582
+
583
+ analysis["per_question"].append(
584
+ {
585
+ "error": "Failed to parse analysis",
586
+ "raw_response": response_text,
587
+ "qa_inputs": qa_inputs,
588
+ "usage": usage,
589
+ "cost": cost,
590
+ "processing_time_seconds": round(question_time, 3),
591
+ }
592
+ )
593
+ except Exception as e:
594
+ self.log.error(f"Error processing analysis: {e}")
595
+
596
+ # Determine pass/fail without Claude analysis (similarity only)
597
+ pass_fail_result = self.determine_pass_fail(
598
+ similarity_score, threshold, None
599
+ )
600
+ qa_inputs.update(pass_fail_result)
601
+
602
+ # Add timing even for exceptions
603
+ question_time = time.time() - question_start_time
604
+ per_question_timings.append(question_time)
605
+
606
+ analysis["per_question"].append(
607
+ {
608
+ "error": str(e),
609
+ "raw_response": str(response_data),
610
+ "qa_inputs": qa_inputs,
611
+ "usage": response_data.get("usage", {}),
612
+ "cost": response_data.get("cost", {}),
613
+ "processing_time_seconds": round(question_time, 3),
614
+ }
615
+ )
616
+
617
+ # Calculate similarity scores and accuracy metrics (extract from per_question analysis)
618
+ calculated_similarities = [
619
+ q["qa_inputs"]["similarity"]
620
+ for q in analysis["per_question"]
621
+ if "qa_inputs" in q
622
+ ]
623
+ pass_results = [
624
+ q["qa_inputs"]["is_pass"]
625
+ for q in analysis["per_question"]
626
+ if "qa_inputs" in q
627
+ ]
628
+
629
+ # Calculate accuracy metrics
630
+ total_questions = len(pass_results)
631
+ passed_questions = sum(pass_results)
632
+ failed_questions = total_questions - passed_questions
633
+ accuracy = (
634
+ passed_questions / total_questions if total_questions > 0 else 0.0
635
+ )
636
+
637
+ # After analyzing all questions, get overall analysis
638
+ overall_start_time = time.time()
639
+ overall_prompt = f"""
640
+ Review these RAG system test results and provide an overall analysis.
641
+
642
+ Number of questions: {total_questions}
643
+ Similarity threshold: {results["metadata"]["similarity_threshold"]}
644
+ Number passed threshold: {passed_questions}
645
+ Number failed threshold: {failed_questions}
646
+ Pass rate: {accuracy:.3f}
647
+ Accuracy: {accuracy * 100:.1f}%
648
+
649
+ Similarity statistics:
650
+ - Mean: {np.mean(calculated_similarities):.3f}
651
+ - Median: {np.median(calculated_similarities):.3f}
652
+ - Min: {np.min(calculated_similarities):.3f}
653
+ - Max: {np.max(calculated_similarities):.3f}
654
+ - Standard Deviation: {np.std(calculated_similarities):.3f}
655
+
656
+ Individual analyses: {json.dumps(analysis['per_question'], indent=2)}
657
+
658
+ Provide a comprehensive analysis including:
659
+ 1. Overall Rating: Rate the system (excellent/good/fair/poor) with explanation
660
+ 2. Overall Analysis: General assessment of the RAG system's performance
661
+ 3. Strengths: What the system does well
662
+ 4. Weaknesses: Areas needing improvement
663
+ 5. Recommendations: Specific suggestions for improvement
664
+ 6. Use Case Fit: Types of queries the system handles well/poorly
665
+
666
+ Return your analysis in this exact JSON format:
667
+ {{
668
+ "overall_rating": {{
669
+ "rating": "one of: excellent/good/fair/poor",
670
+ "explanation": "explanation of the rating",
671
+ "metrics": {{
672
+ "num_questions": number of questions analyzed,
673
+ "similarity_threshold": threshold value used,
674
+ "num_passed": number of questions that passed threshold,
675
+ "num_failed": number of questions that failed threshold,
676
+ "pass_rate": pass rate as decimal,
677
+ "accuracy": accuracy as decimal,
678
+ "accuracy_percentage": accuracy as percentage,
679
+ "mean_similarity": average similarity score,
680
+ "median_similarity": median similarity score,
681
+ "min_similarity": minimum similarity score,
682
+ "max_similarity": maximum similarity score,
683
+ "std_similarity": standard deviation of similarity scores
684
+ }}
685
+ }},
686
+ "overall_analysis": "general assessment",
687
+ "strengths": ["strength 1", "strength 2", ...],
688
+ "weaknesses": ["weakness 1", "weakness 2", ...],
689
+ "recommendations": ["recommendation 1", "recommendation 2", ...],
690
+ "use_case_fit": "analysis of suitable use cases"
691
+ }}
692
+ """
693
+
694
+ overall_response_data = self.claude.get_completion_with_usage(
695
+ overall_prompt
696
+ )
697
+
698
+ try:
699
+ # Extract JSON from overall response
700
+ overall_response = overall_response_data["content"]
701
+ overall_usage = overall_response_data["usage"]
702
+ overall_cost = overall_response_data["cost"]
703
+
704
+ if isinstance(overall_response, list):
705
+ response_text = (
706
+ overall_response[0].text
707
+ if hasattr(overall_response[0], "text")
708
+ else str(overall_response[0])
709
+ )
710
+ else:
711
+ response_text = (
712
+ overall_response.text
713
+ if hasattr(overall_response, "text")
714
+ else str(overall_response)
715
+ )
716
+
717
+ json_start = response_text.find("{")
718
+ json_end = response_text.rfind("}") + 1
719
+ if json_start >= 0 and json_end > json_start:
720
+ json_content = response_text[json_start:json_end]
721
+ overall_analysis = json.loads(json_content)
722
+ # Add overall usage and cost to the analysis
723
+ overall_analysis["overall_usage"] = overall_usage
724
+ overall_analysis["overall_cost"] = overall_cost
725
+
726
+ # Add overall timing
727
+ overall_time = time.time() - overall_start_time
728
+ overall_analysis["overall_processing_time_seconds"] = round(
729
+ overall_time, 3
730
+ )
731
+
732
+ analysis.update(overall_analysis)
733
+ else:
734
+ self.log.error("No JSON found in overall analysis response")
735
+ analysis.update(
736
+ {
737
+ "error": "Failed to parse overall analysis",
738
+ "raw_response": response_text,
739
+ "overall_usage": overall_usage,
740
+ "overall_cost": overall_cost,
741
+ }
742
+ )
743
+ except Exception as e:
744
+ self.log.error(f"Error processing overall analysis: {e}")
745
+ analysis.update(
746
+ {
747
+ "error": str(e),
748
+ "raw_response": str(overall_response_data),
749
+ "overall_usage": overall_response_data.get("usage", {}),
750
+ "overall_cost": overall_response_data.get("cost", {}),
751
+ }
752
+ )
753
+
754
+ # Calculate total cost across all questions and overall analysis
755
+ total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
756
+ total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
757
+
758
+ # Sum up costs from per-question analysis
759
+ for question_analysis in analysis["per_question"]:
760
+ if "usage" in question_analysis and "cost" in question_analysis:
761
+ usage = question_analysis["usage"]
762
+ cost = question_analysis["cost"]
763
+ total_usage["input_tokens"] += usage.get("input_tokens", 0)
764
+ total_usage["output_tokens"] += usage.get("output_tokens", 0)
765
+ total_usage["total_tokens"] += usage.get("total_tokens", 0)
766
+ total_cost["input_cost"] += cost.get("input_cost", 0.0)
767
+ total_cost["output_cost"] += cost.get("output_cost", 0.0)
768
+ total_cost["total_cost"] += cost.get("total_cost", 0.0)
769
+
770
+ # Add overall analysis costs if available
771
+ if "overall_usage" in analysis and "overall_cost" in analysis:
772
+ overall_usage = analysis["overall_usage"]
773
+ overall_cost = analysis["overall_cost"]
774
+ total_usage["input_tokens"] += overall_usage.get("input_tokens", 0)
775
+ total_usage["output_tokens"] += overall_usage.get("output_tokens", 0)
776
+ total_usage["total_tokens"] += overall_usage.get("total_tokens", 0)
777
+ total_cost["input_cost"] += overall_cost.get("input_cost", 0.0)
778
+ total_cost["output_cost"] += overall_cost.get("output_cost", 0.0)
779
+ total_cost["total_cost"] += overall_cost.get("total_cost", 0.0)
780
+
781
+ # Add total cost summary to analysis
782
+ analysis["total_usage"] = total_usage
783
+ analysis["total_cost"] = total_cost
784
+
785
+ # Add comprehensive timing information
786
+ total_time = time.time() - analysis_start_time
787
+ analysis["timing"] = {
788
+ "total_processing_time_seconds": round(total_time, 3),
789
+ "per_question_times_seconds": [
790
+ round(t, 3) for t in per_question_timings
791
+ ],
792
+ "average_per_question_seconds": (
793
+ round(np.mean(per_question_timings), 3)
794
+ if per_question_timings
795
+ else 0
796
+ ),
797
+ "max_per_question_seconds": (
798
+ round(max(per_question_timings), 3) if per_question_timings else 0
799
+ ),
800
+ "min_per_question_seconds": (
801
+ round(min(per_question_timings), 3) if per_question_timings else 0
802
+ ),
803
+ }
804
+
805
+ # Clean up intermediate files after successful completion
806
+ if intermediate_dir and intermediate_dir.exists():
807
+ try:
808
+ import shutil
809
+
810
+ shutil.rmtree(intermediate_dir)
811
+ self.log.info(
812
+ f"Cleaned up intermediate QA analysis files from: {intermediate_dir}"
813
+ )
814
+ except Exception as e:
815
+ self.log.warning(
816
+ f"Failed to clean up intermediate directory {intermediate_dir}: {e}"
817
+ )
818
+
819
+ return analysis
820
+ except Exception as api_error:
821
+ if "529" in str(api_error) or "overloaded" in str(api_error).lower():
822
+ self.log.warning(
823
+ "Claude API is currently overloaded. Returning partial analysis with raw data."
824
+ )
825
+ # Include raw QA results without Claude analysis
826
+ for qa_result in qa_results:
827
+ # Calculate similarity score even when Claude analysis fails
828
+ similarity_score = self.calculate_similarity(
829
+ qa_result["ground_truth"], qa_result["response"]
830
+ )
831
+
832
+ # Determine pass/fail without Claude analysis (similarity only)
833
+ threshold = results["metadata"]["similarity_threshold"]
834
+
835
+ qa_inputs = {
836
+ "query": qa_result["query"],
837
+ "ground_truth": qa_result["ground_truth"],
838
+ "response": qa_result["response"],
839
+ "similarity": similarity_score,
840
+ "threshold": threshold,
841
+ }
842
+
843
+ # Add pass/fail determination
844
+ pass_fail_result = self.determine_pass_fail(
845
+ similarity_score, threshold, None
846
+ )
847
+ qa_inputs.update(pass_fail_result)
848
+ analysis["per_question"].append(
849
+ {
850
+ "status": "raw_data_only",
851
+ "analysis_error": "Claude API overloaded",
852
+ "qa_inputs": qa_inputs,
853
+ }
854
+ )
855
+
856
+ analysis.update(
857
+ {
858
+ "overall_analysis": "Analysis incomplete due to Claude API overload",
859
+ "strengths": ["Raw data preserved"],
860
+ "weaknesses": [
861
+ "Claude analysis unavailable due to API overload"
862
+ ],
863
+ "recommendations": ["Retry analysis when API is available"],
864
+ "use_case_fit": "Analysis pending",
865
+ "overall_rating": {
866
+ "rating": "pending",
867
+ "explanation": "Claude API temporarily unavailable",
868
+ },
869
+ }
870
+ )
871
+ return analysis
872
+ raise # Re-raise if it's not an overload error
873
+
874
+ except Exception as e:
875
+ self.log.error(f"Error in analyze_with_claude: {e}")
876
+ return {
877
+ "overall_analysis": f"Analysis failed: {str(e)}",
878
+ "strengths": [],
879
+ "weaknesses": ["Analysis failed to complete"],
880
+ "recommendations": ["Check logs for error details"],
881
+ "use_case_fit": "",
882
+ "per_question": [],
883
+ "overall_rating": {"rating": "error", "explanation": str(e)},
884
+ }
885
+
886
+ def _analyze_summarization_results(
887
+ self,
888
+ results: Dict,
889
+ summarization_results: List,
890
+ groundtruth_path: Optional[str] = None,
891
+ ) -> Dict:
892
+ """Analyze summarization results using Claude."""
893
+ # Start timing
894
+ analysis_start_time = time.time()
895
+
896
+ analysis = {
897
+ "overall_analysis": "",
898
+ "strengths": [],
899
+ "weaknesses": [],
900
+ "recommendations": [],
901
+ "use_case_fit": "",
902
+ "per_question": [],
903
+ "overall_rating": {"rating": "", "explanation": ""},
904
+ "timing": {}, # Add timing information
905
+ }
906
+
907
+ if not summarization_results:
908
+ return {
909
+ "overall_analysis": "No summarization results found to analyze",
910
+ "strengths": [],
911
+ "weaknesses": ["No summarization data available for analysis"],
912
+ "recommendations": ["Ensure input data contains summarization results"],
913
+ "use_case_fit": "Unable to determine",
914
+ "per_question": [],
915
+ "overall_rating": {
916
+ "rating": "error",
917
+ "explanation": "No summarization results found",
918
+ },
919
+ }
920
+
921
+ try:
922
+ # Load ground truth summaries from separate file if provided
923
+ ground_truth_data = None
924
+ if groundtruth_path and Path(groundtruth_path).exists():
925
+ try:
926
+ with open(groundtruth_path, "r", encoding="utf-8") as f:
927
+ ground_truth_data = json.load(f)
928
+ self.log.info(f"Loaded ground truth data from: {groundtruth_path}")
929
+
930
+ # Check if this is a consolidated ground truth file
931
+ if "consolidated_from" in ground_truth_data.get("metadata", {}):
932
+ self.log.info(
933
+ f"Using consolidated ground truth with {ground_truth_data['metadata']['consolidated_from']} transcripts"
934
+ )
935
+
936
+ except Exception as e:
937
+ self.log.warning(
938
+ f"Failed to load ground truth file {groundtruth_path}: {e}"
939
+ )
940
+ ground_truth_data = None
941
+ elif groundtruth_path:
942
+ self.log.warning(f"Ground truth file not found: {groundtruth_path}")
943
+
944
+ total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
945
+ total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
946
+ per_summary_timings = [] # Track timing for each summary
947
+
948
+ # Set up intermediate output directory for crash recovery
949
+ intermediate_dir = None
950
+ experiment_name = results.get("metadata", {}).get(
951
+ "experiment_name", "evaluation"
952
+ )
953
+ if hasattr(self, "intermediate_dir") and self.intermediate_dir:
954
+ # Use existing intermediate directory if set
955
+ intermediate_dir = (
956
+ Path(self.intermediate_dir)
957
+ / f"{experiment_name}_analysis.intermediate"
958
+ )
959
+ else:
960
+ # Create in temp directory
961
+ import tempfile
962
+
963
+ temp_dir = Path(tempfile.gettempdir()) / "gaia_eval"
964
+ intermediate_dir = temp_dir / f"{experiment_name}_analysis.intermediate"
965
+
966
+ if intermediate_dir:
967
+ intermediate_dir.mkdir(parents=True, exist_ok=True)
968
+ self.log.info(
969
+ f"Writing intermediate analysis results to: {intermediate_dir}"
970
+ )
971
+
972
+ for i, summary_result in enumerate(summarization_results):
973
+ summary_start_time = time.time()
974
+ generated_summaries = summary_result.get("generated_summaries", {})
975
+
976
+ # Get ground truth summaries from embedded data or separate file
977
+ groundtruth_summaries = summary_result.get("groundtruth_summaries", {})
978
+
979
+ # If no embedded ground truth but we have a ground truth file, extract from it
980
+ if not groundtruth_summaries and ground_truth_data:
981
+ gt_analysis = ground_truth_data.get("analysis", {})
982
+ gt_summaries = gt_analysis.get("summaries", {})
983
+
984
+ # Handle both regular and consolidated ground truth formats
985
+ if gt_summaries:
986
+ # Check if this is consolidated format (summaries have transcript_id keys)
987
+ if "consolidated_from" in ground_truth_data.get("metadata", {}):
988
+ # For consolidated format, try to match by source file or use first available
989
+ source_file = summary_result.get("source_file", "")
990
+ transcript_id = None
991
+
992
+ # Try to match by source file name using metadata.source_files
993
+ source_files = ground_truth_data.get("metadata", {}).get(
994
+ "source_files", []
995
+ )
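+ # Consolidated ground-truth layout assumed by the matching below
+ # (inferred from the accesses in this block; exact IDs are illustrative):
+ # metadata.source_files = [{"source_file": "...", "transcript_id": "t01"}, ...]
+ # analysis.summaries = {"t01": {...}, "t02": {...}}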
996
+ for source_mapping in source_files:
997
+ mapped_source = source_mapping.get("source_file", "")
998
+ if source_file and (
999
+ source_file == mapped_source
1000
+ or source_file.replace("/", "\\") == mapped_source
1001
+ or source_file.replace("\\", "/") == mapped_source
1002
+ ):
1003
+ transcript_id = source_mapping.get("transcript_id")
1004
+ break
1005
+
1006
+ # If no match found, fail loudly - DO NOT use fallback
1007
+ if not transcript_id:
1008
+ available_sources = [
1009
+ s.get("source_file", "") for s in source_files
1010
+ ]
1011
+ available_ids = (
1012
+ list(gt_summaries.keys()) if gt_summaries else []
1013
+ )
1014
+
1015
+ error_msg = (
1016
+ f"\n{'='*70}\n"
1017
+ f"ERROR: No matching ground truth found for experiment result\n"
1018
+ f"{'='*70}\n"
1019
+ f"Source file in experiment: {source_file}\n"
1020
+ f"\nAvailable source files in ground truth:\n"
1021
+ )
1022
+ for idx, src in enumerate(available_sources[:10], 1):
1023
+ error_msg += f" {idx}. {src}\n"
1024
+ if len(available_sources) > 10:
1025
+ error_msg += f" ... and {len(available_sources) - 10} more\n"
1026
+
1027
+ error_msg += f"\nAvailable transcript IDs:\n"
1028
+ for idx, tid in enumerate(available_ids[:10], 1):
1029
+ error_msg += f" {idx}. {tid}\n"
1030
+ if len(available_ids) > 10:
1031
+ error_msg += (
1032
+ f" ... and {len(available_ids) - 10} more\n"
1033
+ )
1034
+
1035
+ error_msg += (
1036
+ f"\nPossible fixes:\n"
1037
+ f" 1. Ensure ground truth source_file paths match experiment paths exactly\n"
1038
+ f" 2. Check if ground truth was generated from the same test data\n"
1039
+ f" 3. Verify path separators (forward vs backslash) are consistent\n"
1040
+ f" 4. Run fix_groundtruth_paths.py to normalize path prefixes\n"
1041
+ f"{'='*70}\n"
1042
+ )
1043
+
1044
+ self.log.error(error_msg)
1045
+ raise ValueError(
1046
+ f"No matching ground truth found for source: {source_file}. "
1047
+ f"Cannot evaluate without correct ground truth data."
1048
+ )
1049
+
1050
+ if transcript_id and transcript_id in gt_summaries:
1051
+ groundtruth_summaries = gt_summaries[transcript_id]
1052
+ self.log.debug(
1053
+ f"Using consolidated ground truth summaries for {transcript_id}"
1054
+ )
1055
+ else:
1056
+ # Regular format - summaries are directly under gt_summaries
1057
+ groundtruth_summaries = gt_summaries
1058
+ self.log.debug(
1059
+ f"Using regular ground truth summaries from file for summary {i}"
1060
+ )
1061
+
1062
+ # Analyze each summary component
1063
+ summary_analysis = {
1064
+ "summary_index": i,
1065
+ "source_file": summary_result.get("source_file", ""),
1066
+ "analysis": {},
1067
+ "overall_quality": "",
1068
+ }
1069
+
1070
+ # Compare generated vs ground truth if available
1071
+ if groundtruth_summaries:
1072
+ prompt = f"""
1073
+ Analyze this summarization system result by comparing the generated summary against the ground truth.
1074
+
1075
+ GENERATED SUMMARY:
1076
+ Executive Summary: {generated_summaries.get('executive_summary', 'N/A')}
1077
+ Detailed Summary: {generated_summaries.get('detailed_summary', 'N/A')}
1078
+ Action Items: {generated_summaries.get('action_items', [])}
1079
+ Key Decisions: {generated_summaries.get('key_decisions', [])}
1080
+ Participants: {generated_summaries.get('participants', [])}
1081
+ Topics Discussed: {generated_summaries.get('topics_discussed', [])}
1082
+
1083
+ GROUND TRUTH SUMMARY:
1084
+ Executive Summary: {groundtruth_summaries.get('executive_summary', 'N/A')}
1085
+ Detailed Summary: {groundtruth_summaries.get('detailed_summary', 'N/A')}
1086
+ Action Items: {groundtruth_summaries.get('action_items', [])}
1087
+ Key Decisions: {groundtruth_summaries.get('key_decisions', [])}
1088
+ Participants: {groundtruth_summaries.get('participants', [])}
1089
+ Topics Discussed: {groundtruth_summaries.get('topics_discussed', [])}
1090
+
1091
+ Evaluate the generated summary on these criteria (rate each as excellent/good/fair/poor):
1092
+ 1. Executive Summary Accuracy: How well does the executive summary capture the key points?
1093
+ 2. Completeness: Are all important details covered?
1094
+ 3. Action Items Accuracy: Are action items correctly identified and detailed?
1095
+ 4. Key Decisions Accuracy: Are key decisions properly captured?
1096
+ 5. Participant Identification: Are participants correctly identified?
1097
+ 6. Topic Coverage: Are all discussed topics included?
1098
+
1099
+ Return your analysis in this JSON format:
1100
+ {{
1101
+ "executive_summary_quality": {{
1102
+ "rating": "excellent/good/fair/poor",
1103
+ "explanation": "detailed analysis"
1104
+ }},
1105
+ "detail_completeness": {{
1106
+ "rating": "excellent/good/fair/poor",
1107
+ "explanation": "detailed analysis"
1108
+ }},
1109
+ "action_items_structure": {{
1110
+ "rating": "excellent/good/fair/poor",
1111
+ "explanation": "detailed analysis"
1112
+ }},
1113
+ "key_decisions_clarity": {{
1114
+ "rating": "excellent/good/fair/poor",
1115
+ "explanation": "detailed analysis"
1116
+ }},
1117
+ "participant_information": {{
1118
+ "rating": "excellent/good/fair/poor",
1119
+ "explanation": "detailed analysis"
1120
+ }},
1121
+ "topic_organization": {{
1122
+ "rating": "excellent/good/fair/poor",
1123
+ "explanation": "detailed analysis"
1124
+ }},
1125
+ "overall_quality": "excellent/good/fair/poor"
1126
+ }}
1127
+ """
1128
+ else:
1129
+ # Analyze standalone summary quality
1130
+ prompt = f"""
1131
+ Analyze this generated meeting summary for quality and completeness.
1132
+
1133
+ GENERATED SUMMARY:
1134
+ Executive Summary: {generated_summaries.get('executive_summary', 'N/A')}
1135
+ Detailed Summary: {generated_summaries.get('detailed_summary', 'N/A')}
1136
+ Action Items: {generated_summaries.get('action_items', [])}
1137
+ Key Decisions: {generated_summaries.get('key_decisions', [])}
1138
+ Participants: {generated_summaries.get('participants', [])}
1139
+ Topics Discussed: {generated_summaries.get('topics_discussed', [])}
1140
+
1141
+ Evaluate the summary quality (rate each as excellent/good/fair/poor):
1142
+ 1. Executive Summary Quality: Is it clear and high-level?
1143
+ 2. Detail Completeness: Does the detailed summary provide sufficient context?
1144
+ 3. Action Items Structure: Are action items specific and actionable?
1145
+ 4. Key Decisions Clarity: Are decisions clearly stated?
1146
+ 5. Participant Information: Are participants properly identified?
1147
+ 6. Topic Organization: Are topics well-organized and comprehensive?
1148
+
1149
+ IMPORTANT: Return ONLY valid JSON with no additional text, markdown formatting, or explanations.
1150
+ Ensure all JSON syntax is correct - no trailing commas, proper quotes, and complete structure.
1151
+
1152
+ Return your analysis in this exact JSON format:
1153
+ {{
1154
+ "executive_summary_quality": {{
1155
+ "rating": "excellent/good/fair/poor",
1156
+ "explanation": "detailed analysis"
1157
+ }},
1158
+ "detail_completeness": {{
1159
+ "rating": "excellent/good/fair/poor",
1160
+ "explanation": "detailed analysis"
1161
+ }},
1162
+ "action_items_structure": {{
1163
+ "rating": "excellent/good/fair/poor",
1164
+ "explanation": "detailed analysis"
1165
+ }},
1166
+ "key_decisions_clarity": {{
1167
+ "rating": "excellent/good/fair/poor",
1168
+ "explanation": "detailed analysis"
1169
+ }},
1170
+ "participant_information": {{
1171
+ "rating": "excellent/good/fair/poor",
1172
+ "explanation": "detailed analysis"
1173
+ }},
1174
+ "topic_organization": {{
1175
+ "rating": "excellent/good/fair/poor",
1176
+ "explanation": "detailed analysis"
1177
+ }},
1178
+ "overall_quality": "excellent/good/fair/poor"
1179
+ }}
1180
+ """
1181
+
1182
+ try:
1183
+ response_data = self.claude.get_completion_with_usage(prompt)
1184
+ response = response_data["content"]
1185
+ usage = response_data["usage"]
1186
+ cost = response_data["cost"]
1187
+
1188
+ # Extract text from response
1189
+ if isinstance(response, list):
1190
+ response_text = (
1191
+ response[0].text
1192
+ if hasattr(response[0], "text")
1193
+ else str(response[0])
1194
+ )
1195
+ else:
1196
+ response_text = (
1197
+ response.text
1198
+ if hasattr(response, "text")
1199
+ else str(response)
1200
+ )
1201
+
1202
+ # Parse JSON response with improved error handling
1203
+ # First try to extract from markdown code blocks
1204
+ markdown_json = re.search(
1205
+ r"```(?:json)?\s*(\{.*?\})\s*```", response_text, re.DOTALL
1206
+ )
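+ # e.g. a (hypothetical) reply of ```json\n{"overall_quality": "good", ...}\n```
+ # matches here and only the {...} payload is captured.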
1207
+ if markdown_json:
1208
+ json_content = markdown_json.group(1)
1209
+ else:
1210
+ # Fall back to finding raw JSON
1211
+ json_start = response_text.find("{")
1212
+ json_end = response_text.rfind("}") + 1
1213
+ if json_start >= 0 and json_end > json_start:
1214
+ json_content = response_text[json_start:json_end]
1215
+ else:
1216
+ json_content = None
1217
+
1218
+ if json_content:
1219
+ try:
1220
+ # First attempt: direct JSON parsing
1221
+ summary_analysis["analysis"] = json.loads(json_content)
1222
+ except json.JSONDecodeError as e:
1223
+ self.log.warning(f"Initial JSON parse failed: {e}")
1224
+ # Second attempt: clean up common issues
1225
+ # Remove any trailing commas before closing braces/brackets
1226
+ cleaned_json = re.sub(r",\s*([}\]])", r"\1", json_content)
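+ # e.g. a malformed '{"rating": "good",}' becomes '{"rating": "good"}'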
1227
+ # Replace single quotes with double quotes (if any) - but not within strings
1228
+ # This is a simple heuristic, not perfect
1229
+ cleaned_json = cleaned_json.replace("'", '"')
1230
+ # Remove any control characters except newlines and tabs
1231
+ cleaned_json = re.sub(
1232
+ r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]",
1233
+ "",
1234
+ cleaned_json,
1235
+ )
1236
+ # Fix common escape issues
1237
+ cleaned_json = cleaned_json.replace(
1238
+ '\\"', '"'
1239
+ ) # Remove escaped quotes that might be double-escaped
1240
+ cleaned_json = re.sub(
1241
+ r'(?<!\\)\\(?!["\\/bfnrt])', r"\\\\", cleaned_json
1242
+ ) # Fix unescaped backslashes
1243
+
1244
+ try:
1245
+ summary_analysis["analysis"] = json.loads(cleaned_json)
1246
+ self.log.info("Successfully parsed JSON after cleanup")
1247
+ except json.JSONDecodeError as e2:
1248
+ self.log.error(f"JSON parse failed after cleanup: {e2}")
1249
+ # Third attempt: extract individual fields manually
1250
+ analysis_dict = {}
1251
+
1252
+ # Try to extract each field individually
1253
+ fields = [
1254
+ "executive_summary_quality",
1255
+ "detail_completeness",
1256
+ "action_items_structure",
1257
+ "key_decisions_clarity",
1258
+ "participant_information",
1259
+ "topic_organization",
1260
+ "overall_quality",
1261
+ ]
1262
+
1263
+ for field in fields:
1264
+ # Find the field and extract its rating
1265
+ pattern = rf'"{field}":\s*(?:"([^"]+)"|{{[^}}]+}})'
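+ # e.g. for field "overall_quality" this matches either
+ # "overall_quality": "good" (group(1) == "good") or a nested
+ # "overall_quality": {...} object (group(1) is None).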
1266
+ match = re.search(pattern, json_content)
1267
+ if match:
1268
+ if field == "overall_quality":
1269
+ analysis_dict[field] = (
1270
+ match.group(1)
1271
+ if match.group(1)
1272
+ else "unknown"
1273
+ )
1274
+ else:
1275
+ # Try to extract rating from nested object
1276
+ rating_pattern = rf'"{field}":\s*{{[^}}]*"rating":\s*"([^"]+)"'
1277
+ rating_match = re.search(
1278
+ rating_pattern, json_content
1279
+ )
1280
+ if rating_match:
1281
+ analysis_dict[field] = {
1282
+ "rating": rating_match.group(1),
1283
+ "explanation": "Extracted from partial JSON",
1284
+ }
1285
+
1286
+ if analysis_dict:
1287
+ summary_analysis["analysis"] = analysis_dict
1288
+ self.log.warning(
1289
+ f"PARTIAL RECOVERY - Used fallback field extraction for summary {i}, extracted {len(analysis_dict)} fields"
1290
+ )
1291
+ else:
1292
+ # Final fallback: save raw response for debugging
1293
+ self.log.error(
1294
+ f"FALLBACK VALUES USED - Complete JSON parse failure for summary {i}"
1295
+ )
1296
+ summary_analysis["analysis"] = {
1297
+ "error": f"[FALLBACK - JSON PARSE FAILED] {str(e2)}",
1298
+ "raw_response": response_text[
1299
+ :1000
1300
+ ], # First 1000 chars for debugging
1301
+ "_warning": "This is a fallback response - Claude's analysis could not be parsed",
1302
+ "executive_summary_quality": {
1303
+ "rating": "error",
1304
+ "explanation": "[PARSE ERROR - See raw_response]",
1305
+ },
1306
+ "detail_completeness": {
1307
+ "rating": "error",
1308
+ "explanation": "[PARSE ERROR - See raw_response]",
1309
+ },
1310
+ "action_items_structure": {
1311
+ "rating": "error",
1312
+ "explanation": "[PARSE ERROR - See raw_response]",
1313
+ },
1314
+ "key_decisions_clarity": {
1315
+ "rating": "error",
1316
+ "explanation": "[PARSE ERROR - See raw_response]",
1317
+ },
1318
+ "participant_information": {
1319
+ "rating": "error",
1320
+ "explanation": "[PARSE ERROR - See raw_response]",
1321
+ },
1322
+ "topic_organization": {
1323
+ "rating": "error",
1324
+ "explanation": "[PARSE ERROR - See raw_response]",
1325
+ },
1326
+ }
1327
+ summary_analysis["overall_quality"] = "error"
1328
+
1329
+ # Set overall quality if successfully parsed
1330
+ if "analysis" in summary_analysis and isinstance(
1331
+ summary_analysis["analysis"], dict
1332
+ ):
1333
+ summary_analysis["overall_quality"] = summary_analysis[
1334
+ "analysis"
1335
+ ].get("overall_quality", "unknown")
1336
+ else:
1337
+ summary_analysis["overall_quality"] = "error"
1338
+ else:
1339
+ summary_analysis["analysis"] = {
1340
+ "error": "No JSON content found in Claude response",
1341
+ "raw_response": response_text[:500],
1342
+ }
1343
+ summary_analysis["overall_quality"] = "error"
1344
+
1345
+ # Add usage and cost
1346
+ summary_analysis["usage"] = usage
1347
+ summary_analysis["cost"] = cost
1348
+
1349
+ # Add timing for this summary
1350
+ summary_time = time.time() - summary_start_time
1351
+ summary_analysis["processing_time_seconds"] = round(summary_time, 3)
1352
+ per_summary_timings.append(summary_time)
1353
+
1354
+ # Accumulate totals
1355
+ total_usage["input_tokens"] += usage.get("input_tokens", 0)
1356
+ total_usage["output_tokens"] += usage.get("output_tokens", 0)
1357
+ total_usage["total_tokens"] += usage.get("total_tokens", 0)
1358
+ total_cost["input_cost"] += cost.get("input_cost", 0.0)
1359
+ total_cost["output_cost"] += cost.get("output_cost", 0.0)
1360
+ total_cost["total_cost"] += cost.get("total_cost", 0.0)
1361
+
1362
+ except Exception as e:
1363
+ self.log.error(f"Error analyzing summary {i}: {e}")
1364
+ summary_analysis["analysis"] = {"error": str(e)}
1365
+ summary_analysis["overall_quality"] = "error"
1366
+
1367
+ # Add timing even for errors
1368
+ summary_time = time.time() - summary_start_time
1369
+ summary_analysis["processing_time_seconds"] = round(summary_time, 3)
1370
+ per_summary_timings.append(summary_time)
1371
+
1372
+ analysis["per_question"].append(summary_analysis)
1373
+
1374
+ # Write intermediate result immediately for crash recovery
1375
+ if intermediate_dir:
1376
+ try:
1377
+ intermediate_file = (
1378
+ intermediate_dir / f"summary_{i+1:04d}_analysis.json"
1379
+ )
1380
+ intermediate_data = {
1381
+ "summary_index": i,
1382
+ "experiment_name": experiment_name,
1383
+ "source_file": summary_result.get("source_file", ""),
1384
+ "analysis": summary_analysis,
1385
+ "usage": summary_analysis.get("usage", {}),
1386
+ "cost": summary_analysis.get("cost", {}),
1387
+ "processing_time_seconds": summary_analysis.get(
1388
+ "processing_time_seconds", 0
1389
+ ),
1390
+ "timestamp": datetime.now().isoformat(),
1391
+ }
1392
+
1393
+ with open(intermediate_file, "w", encoding="utf-8") as f:
1394
+ json.dump(intermediate_data, f, indent=2)
1395
+
1396
+ # Update progress file
1397
+ progress_file = intermediate_dir / "analysis_progress.json"
1398
+ progress_data = {
1399
+ "experiment_name": experiment_name,
1400
+ "total_summaries": len(summarization_results),
1401
+ "completed_summaries": i + 1,
1402
+ "progress_percent": round(
1403
+ (i + 1) / len(summarization_results) * 100, 1
1404
+ ),
1405
+ "total_usage": total_usage.copy(),
1406
+ "total_cost": total_cost.copy(),
1407
+ "last_updated": datetime.now().isoformat(),
1408
+ "estimated_remaining_time": None,
1409
+ }
1410
+
1411
+ # Calculate estimated remaining time
1412
+ if i > 0:
1413
+ avg_time_per_summary = sum(per_summary_timings) / len(
1414
+ per_summary_timings
1415
+ )
1416
+ remaining_summaries = len(summarization_results) - (i + 1)
1417
+ estimated_remaining = (
1418
+ remaining_summaries * avg_time_per_summary
1419
+ )
1420
+ progress_data["estimated_remaining_time"] = round(
1421
+ estimated_remaining, 1
1422
+ )
1423
+
1424
+ with open(progress_file, "w", encoding="utf-8") as f:
1425
+ json.dump(progress_data, f, indent=2)
1426
+
1427
+ self.log.info(
1428
+ f"Analysis progress: {i+1}/{len(summarization_results)} summaries completed ({progress_data['progress_percent']}%)"
1429
+ )
1430
+
1431
+ except Exception as e:
1432
+ self.log.warning(
1433
+ f"Failed to write intermediate analysis result {i+1}: {e}"
1434
+ )
1435
+
1436
+ # Generate overall analysis
1437
+ quality_ratings = [
1438
+ s.get("overall_quality", "unknown") for s in analysis["per_question"]
1439
+ ]
1440
+
1441
+ # Filter out error and unknown ratings for scoring
1442
+ valid_quality_ratings = [
1443
+ rating
1444
+ for rating in quality_ratings
1445
+ if rating in ["excellent", "good", "fair", "poor"]
1446
+ ]
1447
+
1448
+ excellent_count = valid_quality_ratings.count("excellent")
1449
+ good_count = valid_quality_ratings.count("good")
1450
+ fair_count = valid_quality_ratings.count("fair")
1451
+ poor_count = valid_quality_ratings.count("poor")
1452
+ total_summaries = len(valid_quality_ratings)
1453
+ error_count = quality_ratings.count("error")
1454
+
1455
+ # Log information about errors if any
1456
+ if error_count > 0:
1457
+ self.log.warning(
1458
+ f"Excluded {error_count} error entries from quality scoring"
1459
+ )
1460
+
1461
+ # Handle case where no valid summaries are available for scoring
1462
+ if total_summaries == 0:
1463
+ if error_count > 0:
1464
+ self.log.error(
1465
+ "All summaries failed analysis - cannot compute quality score"
1466
+ )
1467
+ overall_rating = "error"
1468
+ else:
1469
+ self.log.warning("No summaries found for analysis")
1470
+ overall_rating = "unknown"
1471
+ elif excellent_count >= total_summaries * 0.7:
1472
+ overall_rating = "excellent"
1473
+ elif (excellent_count + good_count) >= total_summaries * 0.7:
1474
+ overall_rating = "good"
1475
+ elif (excellent_count + good_count + fair_count) >= total_summaries * 0.7:
1476
+ overall_rating = "fair"
1477
+ else:
1478
+ overall_rating = "poor"
1479
+
1480
+ # Send individual analyses to Claude for comprehensive overall analysis
1481
+ overall_start_time = time.time()
1482
+
1483
+ # Get experiment/model information
1484
+ experiment_name = results.get("metadata", {}).get(
1485
+ "experiment_name", "Unknown Model"
1486
+ )
1487
+ model_type = results.get("metadata", {}).get("model", "")
1488
+
1489
+ overall_prompt = f"""
1490
+ Review these summarization test results and provide a comprehensive overall analysis.
1491
+
1492
+ Model/Experiment: {experiment_name}
1493
+ Number of summaries analyzed: {total_summaries}
1494
+ Quality distribution:
1495
+ - Excellent: {excellent_count} ({excellent_count/total_summaries*100:.1f}%)
1496
+ - Good: {good_count} ({good_count/total_summaries*100:.1f}%)
1497
+ - Fair: {fair_count} ({fair_count/total_summaries*100:.1f}%)
1498
+ - Poor: {poor_count} ({poor_count/total_summaries*100:.1f}%)
1499
+
1500
+ Overall quality rating: {overall_rating}
1501
+
1502
+ Individual summary analyses: {json.dumps(analysis['per_question'], indent=2)}
1503
+
1504
+ Based on the detailed analysis of each summary above, provide a comprehensive assessment including:
1505
+
1506
+ 1. Overall Analysis: General assessment of the summarization system's performance
1507
+ 2. Strengths: Specific aspects the model does well (be specific based on the individual analyses)
1508
+ 3. Weaknesses: Concrete areas needing improvement (based on patterns in the individual analyses)
1509
+ 4. Recommendations: Actionable suggestions for improvement
1510
+ 5. Use Case Fit: Types of meetings/content this model handles well or poorly
1511
+
1512
+ Consider the following in your analysis:
1513
+ - Patterns in accuracy, completeness, organization across summaries
1514
+ - Consistency of performance
1515
+ - Specific failure modes observed
1516
+ - Model characteristics (e.g., if it's Claude, Llama, Qwen, etc.)
1517
+
1518
+ IMPORTANT: Return ONLY valid JSON with no additional text, markdown formatting, or explanations.
1519
+ Ensure all JSON syntax is correct - no trailing commas, proper quotes, and complete structure.
1520
+
1521
+ Return your analysis in this exact JSON format:
1522
+ {{
1523
+ "overall_analysis": "comprehensive assessment of overall performance",
1524
+ "strengths": ["specific strength 1", "specific strength 2", ...],
1525
+ "weaknesses": ["specific weakness 1", "specific weakness 2", ...],
1526
+ "recommendations": ["actionable recommendation 1", "actionable recommendation 2", ...],
1527
+ "use_case_fit": "detailed analysis of suitable use cases and limitations"
1528
+ }}
1529
+ """
1530
+
1531
+ try:
1532
+ overall_response_data = self.claude.get_completion_with_usage(
1533
+ overall_prompt
1534
+ )
1535
+
1536
+ # Extract JSON from overall response
1537
+ overall_response = overall_response_data["content"]
1538
+ overall_usage = overall_response_data["usage"]
1539
+ overall_cost = overall_response_data["cost"]
1540
+
1541
+ if isinstance(overall_response, list):
1542
+ response_text = (
1543
+ overall_response[0].text
1544
+ if hasattr(overall_response[0], "text")
1545
+ else str(overall_response[0])
1546
+ )
1547
+ else:
1548
+ response_text = (
1549
+ overall_response.text
1550
+ if hasattr(overall_response, "text")
1551
+ else str(overall_response)
1552
+ )
1553
+
1554
+ # Try to extract JSON from various formats (markdown, plain, etc.)
1555
+ # First try to extract from markdown code blocks
1556
+ markdown_json = re.search(
1557
+ r"```(?:json)?\s*(\{.*?\})\s*```", response_text, re.DOTALL
1558
+ )
1559
+ if markdown_json:
1560
+ json_content = markdown_json.group(1)
1561
+ json_found = True
1562
+ else:
1563
+ # Fall back to finding raw JSON
1564
+ json_start = response_text.find("{")
1565
+ json_end = response_text.rfind("}") + 1
1566
+ if json_start >= 0 and json_end > json_start:
1567
+ json_content = response_text[json_start:json_end]
1568
+ json_found = True
1569
+ else:
1570
+ json_found = False
1571
+
1572
+ if json_found:
1573
+ try:
1574
+ # First attempt: direct JSON parsing
1575
+ claude_analysis = json.loads(json_content)
1576
+ except json.JSONDecodeError as e:
1577
+ self.log.warning(
1578
+ f"Initial JSON parse failed for overall analysis: {e}"
1579
+ )
1580
+ # Second attempt: clean up common issues
1581
+ # Remove trailing commas before closing braces/brackets
1582
+ cleaned_json = re.sub(r",\s*([}\]])", r"\1", json_content)
1583
+ # Replace single quotes with double quotes (if any) - simple heuristic
1584
+ cleaned_json = cleaned_json.replace("'", '"')
1585
+ # Remove control characters except newlines and tabs
1586
+ cleaned_json = re.sub(
1587
+ r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", cleaned_json
1588
+ )
1589
+ # Fix common escape issues
1590
+ cleaned_json = cleaned_json.replace(
1591
+ '\\"', '"'
1592
+ ) # Remove escaped quotes that might be double-escaped
1593
+ cleaned_json = re.sub(
1594
+ r'(?<!\\)\\(?!["\\/bfnrt])', r"\\\\", cleaned_json
1595
+ ) # Fix unescaped backslashes
1596
+
1597
+ try:
1598
+ claude_analysis = json.loads(cleaned_json)
1599
+ self.log.info(
1600
+ "Successfully parsed overall analysis JSON after cleanup"
1601
+ )
1602
+ except json.JSONDecodeError as e2:
1603
+ self.log.error(
1604
+ f"FALLBACK VALUES USED - Failed to parse Claude's overall analysis response after cleanup: {e2}"
1605
+ )
1606
+ self.log.error(
1607
+ f"Raw response preview: {json_content[:500]}..."
1608
+ )
1609
+ # Use fallback values - CLEARLY MARKED
1610
+ claude_analysis = {
1611
+ "overall_analysis": f"[FALLBACK - JSON PARSE ERROR] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor. NOTE: Claude's detailed analysis could not be parsed due to malformed JSON response.",
1612
+ "strengths": [
1613
+ "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]"
1614
+ ],
1615
+ "weaknesses": [
1616
+ "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]"
1617
+ ],
1618
+ "recommendations": [
1619
+ "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]",
1620
+ "Review raw Claude response in logs for actual recommendations",
1621
+ ],
1622
+ "use_case_fit": "[FALLBACK VALUE - Real analysis unavailable due to JSON parse error]",
1623
+ "_warning": "These are fallback values - Claude's actual analysis failed to parse. Check logs for details.",
1624
+ }
1625
+
1626
+ # Add Claude's analysis to our results
1627
+ overall_analysis_text = claude_analysis.get(
1628
+ "overall_analysis",
1629
+ f"Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor.",
1630
+ )
1631
+ strengths = claude_analysis.get(
1632
+ "strengths", ["Summary generation completed"]
1633
+ )
1634
+ weaknesses = claude_analysis.get(
1635
+ "weaknesses", ["Areas for improvement identified"]
1636
+ )
1637
+ recommendations = claude_analysis.get(
1638
+ "recommendations", ["Continue monitoring performance"]
1639
+ )
1640
+ use_case_fit = claude_analysis.get(
1641
+ "use_case_fit", "Suitable for meeting summarization"
1642
+ )
1643
+
1644
+ # Track Claude API usage for overall analysis
1645
+ analysis["overall_usage"] = overall_usage
1646
+ analysis["overall_cost"] = overall_cost
1647
+ analysis["overall_processing_time_seconds"] = round(
1648
+ time.time() - overall_start_time, 3
1649
+ )
1650
+ else:
1651
+ self.log.error(
1652
+ "FALLBACK VALUES USED - No JSON content found in Claude's overall analysis response"
1653
+ )
1654
+ self.log.error(
1655
+ f"Raw response preview (first 1000 chars): {response_text[:1000]}"
1656
+ )
1657
+ # Save full response to debug file
1658
+ debug_dir = Path("debug_claude_responses")
1659
+ debug_dir.mkdir(exist_ok=True)
1660
+ debug_file = (
1661
+ debug_dir
1662
+ / f"overall_analysis_no_json_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
1663
+ )
1664
+ with open(debug_file, "w", encoding="utf-8") as f:
1665
+ f.write(
1666
+ f"No JSON found in response. JSON start: {json_start}, JSON end: {json_end}\n"
1667
+ )
1668
+ f.write(f"Full response:\n{response_text}")
1669
+ self.log.error(f"Full response saved to: {debug_file}")
1670
+ # Fallback to programmatic analysis - CLEARLY MARKED
1671
+ overall_analysis_text = f"[FALLBACK - NO JSON FOUND] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor. NOTE: Claude's response contained no parseable JSON."
1672
+ strengths = [
1673
+ "[FALLBACK VALUE - Claude response had no JSON content]"
1674
+ ]
1675
+ weaknesses = [
1676
+ "[FALLBACK VALUE - Manual review of logs required to see actual Claude response]"
1677
+ ]
1678
+ recommendations = [
1679
+ "[FALLBACK VALUE - Check logs for Claude's actual response]"
1680
+ ]
1681
+ use_case_fit = "[FALLBACK VALUE - Claude's analysis not available]"
1682
+
1683
+ except Exception as e:
1684
+ self.log.error(
1685
+ f"FALLBACK VALUES USED - Exception during Claude overall analysis: {e}"
1686
+ )
1687
+ # Fallback to basic programmatic analysis if Claude fails - CLEARLY MARKED
1688
+ overall_analysis_text = f"[FALLBACK - EXCEPTION: {str(e)[:100]}] Analyzed {total_summaries} summaries. Quality distribution: {excellent_count} excellent, {good_count} good, {fair_count} fair, {poor_count} poor."
1689
+ strengths = [f"[FALLBACK VALUE - Claude API error: {str(e)[:100]}]"]
1690
+ weaknesses = ["[FALLBACK VALUE - Analysis failed due to API error]"]
1691
+ recommendations = [
1692
+ "[FALLBACK VALUE - Check API connectivity and retry]"
1693
+ ]
1694
+
1695
+ # Basic programmatic fallback analysis
1696
+ if excellent_count > 0:
1697
+ strengths.append(
1698
+ f"Achieved excellent quality in {excellent_count}/{total_summaries} summaries"
1699
+ )
1700
+ if good_count > 0:
1701
+ strengths.append(
1702
+ f"Produced good quality summaries in {good_count}/{total_summaries} cases"
1703
+ )
1704
+
1705
+ if poor_count > 0:
1706
+ weaknesses.append(
1707
+ f"Generated poor quality summaries in {poor_count}/{total_summaries} cases"
1708
+ )
1709
+ if excellent_count == 0:
1710
+ weaknesses.append("No summaries achieved excellent quality rating")
1711
+
1712
+ if poor_count > 0 or fair_count > total_summaries * 0.3:
1713
+ recommendations.append("Review and improve prompt engineering")
1714
+ if excellent_count == 0:
1715
+ recommendations.append("Consider using a more capable model")
1716
+
1717
+ if not strengths:
1718
+ strengths = ["Summary generation completed"]
1719
+ if not weaknesses:
1720
+ weaknesses = ["Some areas for improvement"]
1721
+ if not recommendations:
1722
+ recommendations = ["Continue monitoring performance"]
1723
+
1724
+ use_case_fit = "Suitable for meeting summarization with review"
1725
+
1726
+ analysis.update(
1727
+ {
1728
+ "overall_analysis": overall_analysis_text,
1729
+ "strengths": strengths,
1730
+ "weaknesses": weaknesses,
1731
+ "recommendations": recommendations,
1732
+ "use_case_fit": use_case_fit,
1733
+ "overall_rating": {
1734
+ "rating": overall_rating,
1735
+ "explanation": f"Based on {total_summaries} valid summaries with {excellent_count + good_count} high-quality results"
1736
+ + (
1737
+ f" ({error_count} errors excluded)"
1738
+ if error_count > 0
1739
+ else ""
1740
+ ),
1741
+ "metrics": {
1742
+ "total_summaries": total_summaries,
1743
+ "excellent_count": excellent_count,
1744
+ "good_count": good_count,
1745
+ "fair_count": fair_count,
1746
+ "poor_count": poor_count,
1747
+ "error_count": error_count,
1748
+ "quality_score": (
1749
+ (
1750
+ (
1751
+ excellent_count * 4
1752
+ + good_count * 3
1753
+ + fair_count * 2
1754
+ + poor_count * 1
1755
+ )
1756
+ / total_summaries
1757
+ - 1 # Convert from 1-4 scale to 0-3 scale
1758
+ )
1759
+ * 100
1760
+ / 3 # Convert to percentage (0-100%)
1761
+ if total_summaries > 0
1762
+ else None # Return None instead of 0 when no valid summaries
1763
+ ),
1764
+ },
1765
+ },
1766
+ }
1767
+ )
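For readers checking the quality_score arithmetic above, a small worked example (not part of the module): three excellent summaries and one good summary give a raw average of 3.75 on the 1-4 scale, which maps to roughly 91.7% on the 0-100 scale.

    # Worked example of the quality_score conversion used above (illustrative only).
    excellent, good, fair, poor = 3, 1, 0, 0
    total = excellent + good + fair + poor
    raw = (excellent * 4 + good * 3 + fair * 2 + poor * 1) / total  # 3.75 on the 1-4 scale
    score = (raw - 1) * 100 / 3  # shift to 0-3, then scale to 0-100%
    print(round(score, 1))  # 91.7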
1768
+
1769
+ # Add overall analysis costs to totals if available
1770
+ if "overall_usage" in analysis and "overall_cost" in analysis:
1771
+ total_usage["input_tokens"] += analysis["overall_usage"].get(
1772
+ "input_tokens", 0
1773
+ )
1774
+ total_usage["output_tokens"] += analysis["overall_usage"].get(
1775
+ "output_tokens", 0
1776
+ )
1777
+ total_usage["total_tokens"] += analysis["overall_usage"].get(
1778
+ "total_tokens", 0
1779
+ )
1780
+ total_cost["input_cost"] += analysis["overall_cost"].get(
1781
+ "input_cost", 0.0
1782
+ )
1783
+ total_cost["output_cost"] += analysis["overall_cost"].get(
1784
+ "output_cost", 0.0
1785
+ )
1786
+ total_cost["total_cost"] += analysis["overall_cost"].get(
1787
+ "total_cost", 0.0
1788
+ )
1789
+
1790
+ # Update with final totals
1791
+ analysis["total_usage"] = total_usage
1792
+ analysis["total_cost"] = total_cost
1793
+
1794
+ # Add comprehensive timing information
1795
+ total_time = time.time() - analysis_start_time
1796
+ analysis["timing"] = {
1797
+ "total_processing_time_seconds": round(total_time, 3),
1798
+ "per_summary_times_seconds": [round(t, 3) for t in per_summary_timings],
1799
+ "average_per_summary_seconds": (
1800
+ round(np.mean(per_summary_timings), 3) if per_summary_timings else 0
1801
+ ),
1802
+ "max_per_summary_seconds": (
1803
+ round(max(per_summary_timings), 3) if per_summary_timings else 0
1804
+ ),
1805
+ "min_per_summary_seconds": (
1806
+ round(min(per_summary_timings), 3) if per_summary_timings else 0
1807
+ ),
1808
+ }
1809
+
1810
+ # Clean up intermediate files after successful completion
1811
+ if intermediate_dir and intermediate_dir.exists():
1812
+ try:
1813
+ import shutil
1814
+
1815
+ shutil.rmtree(intermediate_dir)
1816
+ self.log.info(
1817
+ f"Cleaned up intermediate analysis files from: {intermediate_dir}"
1818
+ )
1819
+ except Exception as e:
1820
+ self.log.warning(
1821
+ f"Failed to clean up intermediate directory {intermediate_dir}: {e}"
1822
+ )
1823
+
1824
+ return analysis
1825
+
1826
+ except Exception as e:
1827
+ self.log.error(f"Error in summarization analysis: {e}")
1828
+ return {
1829
+ "overall_analysis": f"Summarization analysis failed: {str(e)}",
1830
+ "strengths": [],
1831
+ "weaknesses": ["Analysis failed to complete"],
1832
+ "recommendations": ["Check logs for error details"],
1833
+ "use_case_fit": "",
1834
+ "per_question": [],
1835
+ "overall_rating": {"rating": "error", "explanation": str(e)},
1836
+ }
1837
+
1838
+ def generate_enhanced_report(
1839
+ self,
1840
+ results_path: str,
1841
+ output_dir: Optional[str] = None,
1842
+ groundtruth_path: Optional[str] = None,
1843
+ base_experiment_dir: Optional[str] = None,
1844
+ ) -> Dict:
1845
+ """
1846
+ Generate a detailed evaluation report including Claude's analysis.
1847
+
1848
+ Args:
1849
+ results_path: Path to results JSON file
1850
+ output_dir: Optional directory in which to save the report JSON; the evaluation data is returned in either case
1851
+ groundtruth_path: Optional path to a groundtruth file for comparison (used mainly for summarization)
+ base_experiment_dir: Optional base experiment directory; when provided, its subdirectory layout is mirrored under output_dir
+
+ Returns:
+ The evaluation data dictionary, which is also written to disk when output_dir is given
1852
+ """
1853
+ # Start timing
1854
+ report_start_time = time.time()
1855
+
1856
+ try:
1857
+ if output_dir:
1858
+ output_path = Path(output_dir)
1859
+ output_path.mkdir(parents=True, exist_ok=True)
1860
+
1861
+ # Get Claude analysis
1862
+ claude_analysis = self.analyze_with_claude(results_path, groundtruth_path)
1863
+
1864
+ # Calculate total report generation time
1865
+ report_generation_time = time.time() - report_start_time
1866
+
1867
+ # Load experiment results to extract tested model info
1868
+ with open(results_path, "r", encoding="utf-8") as f:
1869
+ experiment_results = json.load(f)
1870
+
1871
+ # Extract tested model info from experiment results
1872
+ experiment_metadata = experiment_results.get("metadata", {})
1873
+ tested_model = experiment_metadata.get("model", "unknown")
1874
+ tested_model_type = experiment_metadata.get("llm_type", "unknown")
1875
+ inference_type = experiment_metadata.get("inference_type", "unknown")
1876
+
1877
+ # Create evaluation data without depending on threshold_metrics
1878
+ evaluation_data = {
1879
+ "metadata": {
1880
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
1881
+ "evaluator_model": self.claude.model, # Model doing the evaluation
1882
+ "tested_model": tested_model, # The model being evaluated
1883
+ "tested_model_type": tested_model_type, # Provider (lemonade, anthropic, etc.)
1884
+ "tested_model_inference": inference_type, # local or cloud
1885
+ "original_results_file": str(results_path),
1886
+ "groundtruth_file": (
1887
+ str(groundtruth_path) if groundtruth_path else None
1888
+ ),
1889
+ "report_generation_time_seconds": round(report_generation_time, 3),
1890
+ },
1891
+ **claude_analysis,
1892
+ }
1893
+
1894
+ if output_dir:
1895
+ results_path_obj = Path(results_path)
1896
+ results_filename = results_path_obj.name
1897
+
1898
+ # Preserve directory hierarchy if base_experiment_dir is provided
1899
+ if base_experiment_dir:
1900
+ base_exp_path = Path(base_experiment_dir)
1901
+ try:
1902
+ # Calculate relative path from base experiment directory
1903
+ relative_path = results_path_obj.relative_to(base_exp_path)
1904
+ # Create the same directory structure in output
1905
+ eval_subdir = output_path / relative_path.parent
1906
+ eval_subdir.mkdir(parents=True, exist_ok=True)
1907
+ json_path = eval_subdir / f"{results_path_obj.stem}.eval.json"
1908
+ except ValueError:
1909
+ # If results_path is not relative to base_experiment_dir, use flat structure
1910
+ json_path = output_path / f"{results_path_obj.stem}.eval.json"
1911
+ else:
1912
+ # Flat structure (original behavior)
1913
+ json_path = output_path / f"{results_path_obj.stem}.eval.json"
1914
+
1915
+ with open(json_path, "w", encoding="utf-8") as f:
1916
+ json.dump(evaluation_data, f, indent=2)
1917
+ self.log.info(f"Evaluation data saved to: {json_path}")
1918
+
1919
+ return evaluation_data
1920
+
1921
+ except Exception as e:
1922
+ self.log.error(f"Error during evaluation: {str(e)}")
1923
+ raise
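The base_experiment_dir parameter is what preserves the experiment folder hierarchy in the evaluation output. A minimal batch-caller sketch, assuming the Evaluator class from this module, placeholder paths, and the *.experiment.json naming used elsewhere in this file:

    # Hypothetical batch run: evaluate every experiment result under a base
    # directory while mirroring its folder layout via base_experiment_dir.
    from pathlib import Path

    evaluator = Evaluator()
    base_dir = Path("./output/experiments")  # assumed layout
    for results_file in base_dir.rglob("*.experiment.json"):
        evaluator.generate_enhanced_report(
            str(results_file),
            output_dir="./output/eval",
            base_experiment_dir=str(base_dir),
        )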
1924
+
1925
+ def create_template(
1926
+ self,
1927
+ groundtruth_file: str,
1928
+ output_dir: str = "./output/templates",
1929
+ similarity_threshold: float = 0.7,
1930
+ ) -> str:
1931
+ """
1932
+ Create a template results file from ground truth data for manual RAG evaluation.
1933
+
1934
+ Args:
1935
+ groundtruth_file: Path to the ground truth JSON file
1936
+ output_dir: Directory to save the template file
1937
+ similarity_threshold: Similarity threshold for evaluation
1938
+
1939
+ Returns:
1940
+ Path to the created template file
1941
+ """
1942
+ try:
1943
+ # Load ground truth data
1944
+ with open(groundtruth_file, "r", encoding="utf-8") as f:
1945
+ groundtruth_data = json.load(f)
1946
+
1947
+ # Extract QA pairs from ground truth
1948
+ qa_pairs = groundtruth_data.get("analysis", {}).get("qa_pairs", [])
1949
+ if not qa_pairs:
1950
+ raise ValueError("No QA pairs found in ground truth file")
1951
+
1952
+ # Create template structure
1953
+ template_data = {
1954
+ "metadata": {
1955
+ "test_file": groundtruth_file,
1956
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
1957
+ "similarity_threshold": similarity_threshold,
1958
+ "instructions": "Fill in the 'response' fields with your RAG system outputs, then evaluate using gaia eval",
1959
+ },
1960
+ "analysis": {"qa_results": []},
1961
+ }
1962
+
1963
+ # Convert QA pairs to result template format
1964
+ for i, qa_pair in enumerate(qa_pairs):
1965
+ result_entry = {
1966
+ "query": qa_pair.get("question", qa_pair.get("query", "")),
1967
+ "ground_truth": qa_pair.get(
1968
+ "answer",
1969
+ qa_pair.get("response", qa_pair.get("ground_truth", "")),
1970
+ ),
1971
+ "response": f"[FILL IN YOUR RAG SYSTEM RESPONSE FOR QUESTION {i+1}]",
1972
+ }
1973
+ template_data["analysis"]["qa_results"].append(result_entry)
1974
+
1975
+ # Create output directory
1976
+ output_path = Path(output_dir)
1977
+ output_path.mkdir(parents=True, exist_ok=True)
1978
+
1979
+ # Generate output filename
1980
+ groundtruth_filename = Path(groundtruth_file).stem
1981
+ if groundtruth_filename.endswith(".groundtruth"):
1982
+ base_name = groundtruth_filename[:-12] # Remove '.groundtruth'
1983
+ else:
1984
+ base_name = groundtruth_filename
1985
+
1986
+ template_filename = f"{base_name}.template.json"
1987
+ template_path = output_path / template_filename
1988
+
1989
+ # Save template file
1990
+ with open(template_path, "w", encoding="utf-8") as f:
1991
+ json.dump(template_data, f, indent=2, ensure_ascii=False)
1992
+
1993
+ self.log.info(f"Created template with {len(qa_pairs)} questions")
1994
+ return str(template_path)
1995
+
1996
+ except Exception as e:
1997
+ self.log.error(f"Error creating template: {e}")
1998
+ raise
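A short usage sketch for the manual-evaluation workflow this method supports; the file paths are placeholders and the Evaluator constructor is assumed to use its defaults:

    # Hypothetical paths; create_template() writes a *.template.json whose "response"
    # fields are filled in by hand (or by an external RAG system) before evaluation.
    evaluator = Evaluator()
    template_path = evaluator.create_template(
        groundtruth_file="./output/groundtruth/intro.groundtruth.json",
        output_dir="./output/templates",
    )
    print(template_path)  # e.g. ./output/templates/intro.template.json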
1999
+
2000
+ def create_consolidated_evaluation_report(
2001
+ self, evaluation_files: List[str], output_dir: str, base_experiment_dir: str
2002
+ ) -> str:
2003
+ """Create a consolidated report of all evaluations."""
2004
+ from datetime import datetime
2005
+
2006
+ output_base_path = Path(output_dir)
2007
+
2008
+ # Load all evaluation results
2009
+ all_evaluations = []
2010
+ total_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
2011
+ total_cost = {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
2012
+
2013
+ for eval_file in evaluation_files:
2014
+ # Find the actual evaluation file (could be in subdirectory)
2015
+ eval_paths = list(output_base_path.rglob(eval_file))
2016
+ if not eval_paths:
2017
+ self.log.warning(f"Evaluation file not found: {eval_file}")
2018
+ continue
2019
+
2020
+ eval_path = eval_paths[0] # Take first match
2021
+
2022
+ try:
2023
+ with open(eval_path, "r", encoding="utf-8") as f:
2024
+ evaluation_data = json.load(f)
2025
+
2026
+ # For consolidated report, only include summary statistics
2027
+ metadata = evaluation_data.get("metadata", {})
2028
+ eval_info = {
2029
+ "experiment_name": eval_path.stem.replace(".eval", ""),
2030
+ "file_path": str(eval_path.relative_to(output_base_path)),
2031
+ "timestamp": metadata.get("timestamp", ""),
2032
+ "evaluator_model": metadata.get(
2033
+ "evaluator_model", ""
2034
+ ), # Model doing the evaluation
2035
+ "tested_model": metadata.get(
2036
+ "tested_model", "unknown"
2037
+ ), # Model being tested
2038
+ "tested_model_type": metadata.get(
2039
+ "tested_model_type", "unknown"
2040
+ ), # Provider
2041
+ "tested_model_inference": metadata.get(
2042
+ "tested_model_inference", "unknown"
2043
+ ), # Local/cloud
2044
+ "overall_rating": evaluation_data.get("overall_rating", {}),
2045
+ "original_results_file": metadata.get("original_results_file", ""),
2046
+ "usage": evaluation_data.get("total_usage", {}),
2047
+ "cost": evaluation_data.get(
2048
+ "total_cost", {}
2049
+ ), # This is evaluation cost
2050
+ }
2051
+
2052
+ # Load the corresponding experiment file to get inference cost
2053
+ experiment_name = eval_path.stem.replace(".experiment.eval", "")
2054
+
2055
+ # Preserve the subdirectory structure when looking for experiment file
2056
+ relative_eval_path = eval_path.relative_to(output_base_path)
2057
+ relative_dir = relative_eval_path.parent
2058
+
2059
+ experiment_file = (
2060
+ Path(base_experiment_dir)
2061
+ / relative_dir
2062
+ / f"{experiment_name}.experiment.json"
2063
+ )
2064
+
2065
+ if experiment_file.exists():
2066
+ try:
2067
+ with open(experiment_file, "r", encoding="utf-8") as f:
2068
+ experiment_data = json.load(f)
2069
+ # Add inference cost from experiment file
2070
+ eval_info["inference_cost"] = experiment_data.get(
2071
+ "metadata", {}
2072
+ ).get("total_cost", {})
2073
+ eval_info["inference_usage"] = experiment_data.get(
2074
+ "metadata", {}
2075
+ ).get("total_usage", {})
2076
+ eval_info["inference_type"] = experiment_data.get(
2077
+ "metadata", {}
2078
+ ).get("inference_type", "unknown")
2079
+ except Exception as e:
2080
+ self.log.warning(
2081
+ f"Could not load experiment file {experiment_file}: {e}"
2082
+ )
2083
+ # Set default values for missing experiment data
2084
+ eval_info["inference_cost"] = {
2085
+ "input_cost": 0.0,
2086
+ "output_cost": 0.0,
2087
+ "total_cost": 0.0,
2088
+ }
2089
+ eval_info["inference_usage"] = {
2090
+ "input_tokens": 0,
2091
+ "output_tokens": 0,
2092
+ "total_tokens": 0,
2093
+ }
2094
+ eval_info["inference_type"] = "unknown"
2095
+ else:
2096
+ self.log.warning(f"Experiment file not found: {experiment_file}")
2097
+ # Set default values for missing experiment data
2098
+ eval_info["inference_cost"] = {
2099
+ "input_cost": 0.0,
2100
+ "output_cost": 0.0,
2101
+ "total_cost": 0.0,
2102
+ }
2103
+ eval_info["inference_usage"] = {
2104
+ "input_tokens": 0,
2105
+ "output_tokens": 0,
2106
+ "total_tokens": 0,
2107
+ }
2108
+ eval_info["inference_type"] = "unknown"
2109
+
2110
+ # Extract aspect summary if available (aggregate only)
2111
+ if evaluation_data.get("per_question"):
2112
+ aspect_summary = {}
2113
+ # Define the aspects we want to extract (matching visualization expectations)
2114
+ # Map old aspect names to new ones for backwards compatibility
2115
+ aspect_mapping = {
2116
+ # Old names -> New names
2117
+ "executive_summary_accuracy": "executive_summary_quality",
2118
+ "completeness": "detail_completeness",
2119
+ "action_items_accuracy": "action_items_structure",
2120
+ "key_decisions_accuracy": "key_decisions_clarity",
2121
+ "participant_identification": "participant_information",
2122
+ "topic_coverage": "topic_organization",
2123
+ # New names (map to themselves)
2124
+ "executive_summary_quality": "executive_summary_quality",
2125
+ "detail_completeness": "detail_completeness",
2126
+ "action_items_structure": "action_items_structure",
2127
+ "key_decisions_clarity": "key_decisions_clarity",
2128
+ "participant_information": "participant_information",
2129
+ "topic_organization": "topic_organization",
2130
+ }
2131
+
2132
+ aspects = [
2133
+ "executive_summary_quality",
2134
+ "detail_completeness",
2135
+ "action_items_structure",
2136
+ "key_decisions_clarity",
2137
+ "participant_information",
2138
+ "topic_organization",
2139
+ ]
2140
+
2141
+ for aspect in aspects:
2142
+ aspect_ratings = []
2143
+ for question in evaluation_data.get("per_question", []):
2144
+ analysis = question.get("analysis", {})
2145
+ # Check for the aspect using both old and new names
2146
+ for old_name, new_name in aspect_mapping.items():
2147
+ if new_name == aspect and old_name in analysis:
2148
+ rating = analysis[old_name].get("rating")
2149
+ if rating:
2150
+ aspect_ratings.append(rating)
2151
+ break
2152
+
2153
+ if aspect_ratings:
2154
+ # Count occurrences of each rating
2155
+ rating_counts = {}
2156
+ for rating in aspect_ratings:
2157
+ rating_counts[rating] = rating_counts.get(rating, 0) + 1
2158
+
2159
+ # Find most common rating
2160
+ most_common = max(rating_counts.items(), key=lambda x: x[1])
2161
+ aspect_summary[aspect] = {
2162
+ "most_common_rating": most_common[0],
2163
+ "rating_distribution": rating_counts,
2164
+ }
2165
+
2166
+ if aspect_summary:
2167
+ eval_info["aspect_summary"] = aspect_summary
2168
+
2169
+ # Include timing summary if available
2170
+ if evaluation_data.get("timing"):
2171
+ eval_info["avg_processing_time_seconds"] = evaluation_data[
2172
+ "timing"
2173
+ ].get(
2174
+ "average_per_summary_seconds",
2175
+ evaluation_data["timing"].get(
2176
+ "total_processing_time_seconds", 0
2177
+ ),
2178
+ )
2179
+
2180
+ all_evaluations.append(eval_info)
2181
+
2182
+ # Accumulate totals
2183
+ usage = evaluation_data.get("total_usage", {})
2184
+ for key in total_usage:
2185
+ total_usage[key] += usage.get(key, 0)
2186
+
2187
+ cost = evaluation_data.get("total_cost", {})
2188
+ for key in total_cost:
2189
+ total_cost[key] += cost.get(key, 0.0)
2190
+
2191
+ except Exception as e:
2192
+ self.log.error(f"Error loading evaluation file {eval_path}: {e}")
2193
+ continue
2194
+
2195
+ # Create consolidated report with enhanced metadata tracking
2196
+ evaluation_files_metadata = []
2197
+ for eval_file in evaluation_files:
2198
+ # Find the actual evaluation file (could be in subdirectory)
2199
+ eval_paths = list(output_base_path.rglob(eval_file))
2200
+ if eval_paths:
2201
+ eval_path = eval_paths[0]
2202
+ relative_path = str(eval_path.relative_to(output_base_path))
2203
+ evaluation_files_metadata.append(
2204
+ {
2205
+ "file_path": relative_path,
2206
+ "added_at": datetime.now().isoformat(),
2207
+ "last_modified": datetime.fromtimestamp(
2208
+ eval_path.stat().st_mtime
2209
+ ).isoformat(),
2210
+ "fingerprint": self.get_evaluation_fingerprint(str(eval_path)),
2211
+ }
2212
+ )
2213
+
2214
+ consolidated_report = {
2215
+ "metadata": {
2216
+ "report_type": "consolidated_evaluations",
2217
+ "created_at": datetime.now().isoformat(),
2218
+ "last_updated": datetime.now().isoformat(),
2219
+ "timestamp": datetime.now().strftime(
2220
+ "%Y-%m-%d %H:%M:%S"
2221
+ ), # Keep for backwards compatibility
2222
+ "experiment_directory": base_experiment_dir,
2223
+ "output_directory": output_dir,
2224
+ "total_evaluations": len(all_evaluations),
2225
+ "total_usage": total_usage,
2226
+ "total_cost": total_cost,
2227
+ "evaluation_files": evaluation_files_metadata,
2228
+ },
2229
+ "evaluations": all_evaluations,
2230
+ }
2231
+
2232
+ # Save consolidated report
2233
+ consolidated_filename = "consolidated_evaluations_report.json"
2234
+ consolidated_path = output_base_path / consolidated_filename
2235
+
2236
+ with open(consolidated_path, "w", encoding="utf-8") as f:
2237
+ json.dump(consolidated_report, f, indent=2)
2238
+
2239
+ return str(consolidated_path)
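To make the output easier to picture, here is an approximate, heavily trimmed sketch of the JSON this method writes; all values are placeholders and most per-evaluation fields are omitted:

    # Approximate shape of consolidated_evaluations_report.json (placeholder values).
    example_report = {
        "metadata": {
            "report_type": "consolidated_evaluations",
            "total_evaluations": 1,
            "total_usage": {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
            "total_cost": {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0},
            "evaluation_files": [
                {"file_path": "model_a/run1.experiment.eval.json", "fingerprint": "1717171717.0_20480"},
            ],
        },
        "evaluations": [
            {
                "experiment_name": "run1.experiment",
                "tested_model": "some-model",
                "overall_rating": {"rating": "good"},
            },
        ],
    }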
2240
+
2241
+ def get_evaluation_fingerprint(self, eval_file: str) -> str:
2242
+ """Generate fingerprint for evaluation file to detect changes.
2243
+
2244
+ Args:
2245
+ eval_file: Path to the evaluation file
2246
+
2247
+ Returns:
2248
+ Fingerprint string combining modification time and file size
2249
+ """
2250
+ eval_path = Path(eval_file)
2251
+ if not eval_path.exists():
2252
+ return ""
2253
+
2254
+ # Use file modification time + file size as fingerprint
2255
+ stat = eval_path.stat()
2256
+ return f"{stat.st_mtime}_{stat.st_size}"
2257
+
2258
+ def find_changed_evaluations(self, output_dir: str) -> List[str]:
2259
+ """Find evaluations that have changed since last consolidation.
2260
+
2261
+ Args:
2262
+ output_dir: Output directory containing evaluations
2263
+
2264
+ Returns:
2265
+ List of paths to changed evaluation files
2266
+ """
2267
+ output_base_path = Path(output_dir)
2268
+ consolidated_path = output_base_path / "consolidated_evaluations_report.json"
2269
+
2270
+ if not consolidated_path.exists():
2271
+ return [str(f) for f in output_base_path.rglob("*.eval.json")]
2272
+
2273
+ # Load existing fingerprints
2274
+ try:
2275
+ with open(consolidated_path, "r", encoding="utf-8") as f:
2276
+ existing_report = json.load(f)
2277
+
2278
+ existing_fingerprints = {}
2279
+ if "evaluation_files" in existing_report.get("metadata", {}):
2280
+ for item in existing_report["metadata"]["evaluation_files"]:
2281
+ existing_fingerprints[item["file_path"]] = item.get(
2282
+ "fingerprint", ""
2283
+ )
2284
+ except Exception as e:
2285
+ self.log.warning(f"Error reading existing consolidated report: {e}")
2286
+ return [str(f) for f in output_base_path.rglob("*.eval.json")]
2287
+
2288
+ changed_files = []
2289
+ for eval_file in output_base_path.rglob("*.eval.json"):
2290
+ relative_path = str(eval_file.relative_to(output_base_path))
2291
+ current_fingerprint = self.get_evaluation_fingerprint(str(eval_file))
2292
+
2293
+ if (
2294
+ relative_path not in existing_fingerprints
2295
+ or existing_fingerprints[relative_path] != current_fingerprint
2296
+ ):
2297
+ changed_files.append(str(eval_file))
2298
+
2299
+ return changed_files
2300
+
2301
+ def update_consolidated_evaluation_report(
2302
+ self,
2303
+ output_dir: str,
2304
+ new_eval_files: Optional[List[str]] = None,
2305
+ regenerate: bool = False,
2306
+ base_experiment_dir: Optional[str] = None,
2307
+ ) -> str:
2308
+ """Update consolidated report with new evaluations or regenerate completely.
2309
+
2310
+ Args:
2311
+ output_dir: Output directory containing evaluations
2312
+ new_eval_files: List of new evaluation files to add (if None, auto-detect)
2313
+ regenerate: Force full regeneration of the report
2314
+ base_experiment_dir: Base experiment directory path
2315
+
2316
+ Returns:
2317
+ Path to the consolidated report file
2318
+ """
2319
+ from datetime import datetime
2320
+
2321
+ output_base_path = Path(output_dir)
2322
+ consolidated_filename = "consolidated_evaluations_report.json"
2323
+ consolidated_path = output_base_path / consolidated_filename
2324
+
2325
+ if regenerate or not consolidated_path.exists():
2326
+ # Full regeneration (use existing logic)
2327
+ evaluation_files = [f.name for f in output_base_path.rglob("*.eval.json")]
2328
+ return self.create_consolidated_evaluation_report(
2329
+ evaluation_files, output_dir, base_experiment_dir or output_dir
2330
+ )
2331
+
2332
+ # Load existing consolidated report
2333
+ try:
2334
+ with open(consolidated_path, "r", encoding="utf-8") as f:
2335
+ existing_report = json.load(f)
2336
+ except Exception as e:
2337
+ self.log.error(f"Error loading existing consolidated report: {e}")
2338
+ # Fallback to full regeneration
2339
+ evaluation_files = [f.name for f in output_base_path.rglob("*.eval.json")]
2340
+ return self.create_consolidated_evaluation_report(
2341
+ evaluation_files, output_dir, base_experiment_dir or output_dir
2342
+ )
2343
+
2344
+ # Initialize metadata structure if missing
2345
+ if "evaluation_files" not in existing_report.get("metadata", {}):
2346
+ existing_report["metadata"]["evaluation_files"] = []
2347
+
2348
+ # Find new evaluation files
2349
+ if not new_eval_files:
2350
+ all_eval_files = list(output_base_path.rglob("*.eval.json"))
2351
+ existing_files = {
2352
+ item["file_path"]
2353
+ for item in existing_report["metadata"]["evaluation_files"]
2354
+ }
2355
+ new_eval_files = [
2356
+ str(f)
2357
+ for f in all_eval_files
2358
+ if str(f.relative_to(output_base_path)) not in existing_files
2359
+ ]
2360
+
2361
+ if not new_eval_files:
2362
+ self.log.info(
2363
+ "No new evaluations found - consolidated report is up to date"
2364
+ )
2365
+ return str(consolidated_path)
2366
+
2367
+ self.log.info(
2368
+ f"Adding {len(new_eval_files)} new evaluations to consolidated report"
2369
+ )
2370
+
2371
+ # Process new files and update report
2372
+ new_evaluations = []
2373
+ updated_usage = existing_report["metadata"].get(
2374
+ "total_usage", {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
2375
+ )
2376
+ updated_cost = existing_report["metadata"].get(
2377
+ "total_cost", {"input_cost": 0.0, "output_cost": 0.0, "total_cost": 0.0}
2378
+ )
2379
+
2380
+ for eval_file in new_eval_files:
2381
+ eval_path = Path(eval_file)
2382
+ relative_path = str(eval_path.relative_to(output_base_path))
2383
+
2384
+ # Add to metadata tracking
2385
+ existing_report["metadata"]["evaluation_files"].append(
2386
+ {
2387
+ "file_path": relative_path,
2388
+ "added_at": datetime.now().isoformat(),
2389
+ "last_modified": datetime.fromtimestamp(
2390
+ eval_path.stat().st_mtime
2391
+ ).isoformat(),
2392
+ "fingerprint": self.get_evaluation_fingerprint(str(eval_path)),
2393
+ }
2394
+ )
2395
+
2396
+ # Load and integrate evaluation data
2397
+ try:
2398
+ with open(eval_path, "r", encoding="utf-8") as f:
2399
+ eval_data = json.load(f)
2400
+
2401
+ # Create evaluation summary (similar to existing logic)
2402
+ eval_info = {
2403
+ "experiment_name": eval_path.stem.replace(".eval", ""),
2404
+ "file_path": relative_path,
2405
+ "timestamp": eval_data.get("metadata", {}).get("timestamp", ""),
2406
+ "model": eval_data.get("metadata", {}).get("model", "unknown"),
2407
+ }
2408
+
2409
+ # Add overall analysis if available
2410
+ if "overall_analysis" in eval_data:
2411
+ eval_info["overall_analysis"] = (
2412
+ eval_data["overall_analysis"][:200] + "..."
2413
+ if len(eval_data["overall_analysis"]) > 200
2414
+ else eval_data["overall_analysis"]
2415
+ )
2416
+
2417
+ # Add timing info if available
2418
+ if eval_data.get("timing"):
2419
+ eval_info["avg_processing_time_seconds"] = eval_data["timing"].get(
2420
+ "average_per_summary_seconds",
2421
+ eval_data["timing"].get("total_processing_time_seconds", 0),
2422
+ )
2423
+
2424
+ new_evaluations.append(eval_info)
2425
+
2426
+ # Accumulate usage and cost
2427
+ usage = eval_data.get("total_usage", {})
2428
+ for key in updated_usage:
2429
+ updated_usage[key] += usage.get(key, 0)
2430
+
2431
+ cost = eval_data.get("total_cost", {})
2432
+ for key in updated_cost:
2433
+ updated_cost[key] += cost.get(key, 0.0)
2434
+
2435
+ except Exception as e:
2436
+ self.log.error(f"Error processing new evaluation file {eval_path}: {e}")
2437
+ continue
2438
+
2439
+ # Update the consolidated report
2440
+ existing_report["evaluations"].extend(new_evaluations)
2441
+ existing_report["metadata"]["last_updated"] = datetime.now().isoformat()
2442
+ existing_report["metadata"]["total_evaluations"] = len(
2443
+ existing_report["evaluations"]
2444
+ )
2445
+ existing_report["metadata"]["total_usage"] = updated_usage
2446
+ existing_report["metadata"]["total_cost"] = updated_cost
2447
+
2448
+ # Save updated report
2449
+ with open(consolidated_path, "w", encoding="utf-8") as f:
2450
+ json.dump(existing_report, f, indent=2)
2451
+
2452
+ self.log.info(
2453
+ f"Updated consolidated report with {len(new_evaluations)} new evaluations"
2454
+ )
2455
+ return str(consolidated_path)
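Putting the change-detection helpers together, a minimal sketch of an incremental refresh; the directory names are placeholders that follow the layout assumed elsewhere in this module:

    # Hypothetical incremental refresh: only re-consolidate when new or modified
    # .eval.json files are detected under the output directory.
    evaluator = Evaluator()
    output_dir = "./output/eval"
    experiment_dir = "./output/experiments"

    changed = evaluator.find_changed_evaluations(output_dir)
    if changed:
        report_path = evaluator.update_consolidated_evaluation_report(
            output_dir,
            new_eval_files=changed,
            base_experiment_dir=experiment_dir,
        )
        print(f"Consolidated report refreshed: {report_path}")
    else:
        print("Consolidated report is already up to date")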
2456
+
2457
+ def _detect_evaluation_type(self, models_data: List[Dict]) -> str:
2458
+ """Detect whether this is a RAG or summarization evaluation based on the data structure."""
2459
+ if not models_data:
2460
+ return "unknown"
2461
+
2462
+ # Check first model's per_question data structure
2463
+ first_model = models_data[0]
2464
+ per_question = first_model.get("per_question", [])
2465
+
2466
+ if not per_question:
2467
+ return "unknown"
2468
+
2469
+ # Look at the first question to determine evaluation type
2470
+ first_question = per_question[0]
2471
+
2472
+ # Summarization evaluations have specific analysis fields
2473
+ analysis = first_question.get("analysis", {})
2474
+ # Check for new aspect names
2475
+ if any(
2476
+ key in analysis
2477
+ for key in [
2478
+ "executive_summary_quality",
2479
+ "detail_completeness",
2480
+ "action_items_structure",
2481
+ "key_decisions_clarity",
2482
+ "participant_information",
2483
+ "topic_organization",
2484
+ ]
2485
+ ):
2486
+ return "summarization"
2487
+
2488
+ # Also check for old aspect names (for backwards compatibility)
2489
+ if any(
2490
+ key in analysis
2491
+ for key in [
2492
+ "executive_summary_accuracy",
2493
+ "completeness",
2494
+ "action_items_accuracy",
2495
+ "key_decisions_accuracy",
2496
+ "participant_identification",
2497
+ "topic_coverage",
2498
+ ]
2499
+ ):
2500
+ return "summarization"
2501
+
2502
+ # RAG/QA evaluations have similarity scores and different structure
2503
+ # Both Q&A and RAG evaluations are treated the same way
2504
+ if "similarity_score" in first_question or "passed_threshold" in first_question:
2505
+ return "rag"
2506
+
2507
+ # Additional check for Q&A evaluations that have qa_inputs
2508
+ if "qa_inputs" in first_question:
2509
+ return "rag"
2510
+
2511
+ # If we can't detect the evaluation type, log the issue for debugging
2512
+ self.log.warning(
2513
+ f"Could not detect evaluation type from data structure: {list(first_question.keys())}"
2514
+ )
2515
+ return "unknown"
2516
+
2517
+ def _generate_summarization_report(self, models_data: List[Dict]) -> str:
2518
+ """Generate markdown content specifically for summarization evaluation reports."""
2519
+
2520
+ # Build performance ranking based on overall quality ratings
2521
+ ranking = []
2522
+ for model in models_data:
2523
+ # Count quality ratings from per_question data
2524
+ excellent_count = 0
2525
+ good_count = 0
2526
+ fair_count = 0
2527
+ poor_count = 0
2528
+
2529
+ for question in model.get("per_question", []):
2530
+ analysis = question.get("analysis", {})
2531
+ overall_quality = analysis.get("overall_quality", "")
2532
+ if overall_quality == "excellent":
2533
+ excellent_count += 1
2534
+ elif overall_quality == "good":
2535
+ good_count += 1
2536
+ elif overall_quality == "fair":
2537
+ fair_count += 1
2538
+ elif overall_quality == "poor":
2539
+ poor_count += 1
2540
+ # Note: "error" and other invalid ratings are excluded from ranking
2541
+
2542
+ total_summaries = excellent_count + good_count + fair_count + poor_count
2543
+ if total_summaries > 0:
2544
+ quality_score_raw = (
2545
+ excellent_count * 4
2546
+ + good_count * 3
2547
+ + fair_count * 2
2548
+ + poor_count * 1
2549
+ ) / total_summaries
2550
+ quality_score_percentage = ((quality_score_raw - 1) / 3) * 100
2551
+ ranking.append(f"**{model['name']}** ({quality_score_percentage:.1f}%)")
2552
+
2553
+ ranking_text = " > ".join(ranking)
2554
+
2555
+ # Determine production readiness for summarization
2556
+ production_ready = any(
2557
+ "excellent" in str(m.get("per_question", [])) for m in models_data
2558
+ )
2559
+ production_note = (
2560
+ "Some models show excellent summarization capabilities."
2561
+ if production_ready
2562
+ else "All models need improvement for production summarization."
2563
+ )
2564
+
2565
+ # Build metrics table for summarization
2566
+ table_rows = []
2567
+ for model in models_data:
2568
+ # Count quality ratings
2569
+ excellent_count = 0
2570
+ good_count = 0
2571
+ fair_count = 0
2572
+ poor_count = 0
2573
+
2574
+ for question in model.get("per_question", []):
2575
+ analysis = question.get("analysis", {})
2576
+ overall_quality = analysis.get("overall_quality", "")
2577
+ if overall_quality == "excellent":
2578
+ excellent_count += 1
2579
+ elif overall_quality == "good":
2580
+ good_count += 1
2581
+ elif overall_quality == "fair":
2582
+ fair_count += 1
2583
+ elif overall_quality == "poor":
2584
+ poor_count += 1
2585
+ # Note: "error" and other invalid ratings are excluded from metrics
2586
+
2587
+ total_summaries = excellent_count + good_count + fair_count + poor_count
2588
+ excellent_rate = (
2589
+ (excellent_count / total_summaries * 100) if total_summaries > 0 else 0
2590
+ )
2591
+
2592
+ rating_map = {
2593
+ "excellent": "Excellent",
2594
+ "good": "Good",
2595
+ "fair": "Fair",
2596
+ "poor": "Poor",
2597
+ "unknown": "Unknown",
2598
+ }
2599
+ rating = rating_map.get(model["rating"], model["rating"].title())
2600
+
2601
+ table_rows.append(
2602
+ f"| **{model['name']}** | {excellent_rate:.0f}% | {excellent_count}/{total_summaries} | {good_count} | {fair_count} | {poor_count} | {rating} |"
2603
+ )
2604
+
2605
+ # Identify common summarization issues
2606
+ failure_patterns = []
2607
+
2608
+ # Analyze common weaknesses across models
2609
+ all_weaknesses = []
2610
+ for model in models_data:
2611
+ all_weaknesses.extend(model.get("weaknesses", []))
2612
+
2613
+ if "Manual review recommended" in str(all_weaknesses):
2614
+ failure_patterns.append("**Quality Consistency Issues** (Multiple Models)")
2615
+ failure_patterns.append("- Manual review recommended for complex summaries")
2616
+ failure_patterns.append(
2617
+ "- Inconsistent quality across different summary types"
2618
+ )
2619
+ failure_patterns.append("- Need for human validation of critical details")
2620
+
2621
+ # Check for specific summarization challenges
2622
+ poor_performers = [
2623
+ m for m in models_data if "poor" in str(m.get("per_question", []))
2624
+ ]
2625
+ if poor_performers:
2626
+ failure_patterns.append("")
2627
+ failure_patterns.append(
2628
+ "**Content Structure Issues** "
2629
+ + f"({', '.join([m['name'] for m in poor_performers])})"
2630
+ )
2631
+ failure_patterns.append("- Poor action item organization and clarity")
2632
+ failure_patterns.append("- Missing key decisions or incomplete details")
2633
+ failure_patterns.append("- Inadequate participant information capture")
2634
+
2635
+ # Model-specific analysis for summarization
2636
+ model_analyses = []
2637
+
2638
+ if models_data:
2639
+ best = models_data[0]
2640
+ best_strengths = (
2641
+ best["strengths"][:2]
2642
+ if best["strengths"]
2643
+ else ["Maintains summary structure", "Comprehensive analysis performed"]
2644
+ )
2645
+ best_weakness = (
2646
+ best["weaknesses"][0]
2647
+ if best["weaknesses"]
2648
+ else "Needs validation for complex scenarios"
2649
+ )
2650
+
2651
+ model_analyses.append(f"### **{best['name']}** - Best Performer")
2652
+ model_analyses.append(f"- **Strengths**: {', '.join(best_strengths)}")
2653
+ model_analyses.append(f"- **Weakness**: {best_weakness}")
2654
+ model_analyses.append(
2655
+ f"- **Actionable**: Implement quality validation workflows, standardize summary templates"
2656
+ )
2657
+
2658
+ if len(models_data) > 1:
2659
+ worst = models_data[-1]
2660
+ worst_issues = (
2661
+ worst["weaknesses"][:2]
2662
+ if worst["weaknesses"]
2663
+ else ["Inconsistent summary quality"]
2664
+ )
2665
+
2666
+ model_analyses.append("")
2667
+ model_analyses.append(f"### **{worst['name']}** - Needs Improvement")
2668
+ model_analyses.append(f"- **Issues**: {', '.join(worst_issues)}")
2669
+ model_analyses.append(
2670
+ f"- **Actionable**: Enhance prompt engineering, add structured output validation"
2671
+ )
2672
+
2673
+ # Cost efficiency analysis
2674
+ cost_analyses = []
2675
+ if all(m["total_cost"] > 0 for m in models_data):
2676
+ for model in models_data:
2677
+ roi_desc = (
2678
+ "best value"
2679
+ if model == models_data[0]
2680
+ else (
2681
+ "poor value"
2682
+ if "poor" in str(model.get("per_question", []))
2683
+ else "moderate value"
2684
+ )
2685
+ )
2686
+ cost_analyses.append(
2687
+ f"- **{model['name']}**: ${model['total_cost']:.3f} total cost, {roi_desc} for summarization quality"
2688
+ )
2689
+
2690
+ # Technical actions for summarization
2691
+ tech_actions = [
2692
+ "1. **Summary Template Standardization**: Create consistent output formats for different meeting types",
2693
+ "2. **Quality Validation Pipeline**: Implement automated checks for completeness and accuracy",
2694
+ "3. **Prompt Engineering Optimization**: Improve prompts for better action item extraction and decision clarity",
2695
+ ]
2696
+
2697
+ tech_actions.extend(
2698
+ [
2699
+ "4. **Human-in-the-Loop Validation**: Add review workflows for critical summaries",
2700
+ "5. **Meeting Type Classification**: Tailor summarization approach based on meeting context",
2701
+ "6. **Output Formatting Enhancement**: Improve structure and readability of generated summaries",
2702
+ ]
2703
+ )
2704
+
2705
+ # Investment decision for summarization
2706
+ if models_data:
2707
+ best_model = models_data[0]
2708
+ if "excellent" in str(best_model.get("per_question", [])):
2709
+ investment_decision = f"**{best_model['name']}** shows production potential with proper validation workflows."
2710
+ timeline = "2-4 weeks for validation pipeline implementation."
2711
+ else:
2712
+ investment_decision = (
2713
+ "All models require improvement before reliable production use."
2714
+ )
2715
+ timeline = "4-8 weeks for prompt optimization and quality improvements."
2716
+ else:
2717
+ investment_decision = (
2718
+ "Unable to recommend specific model - insufficient evaluation data."
2719
+ )
2720
+ timeline = "Timeline uncertain due to limited baseline data."
2721
+
2722
+ # Build the complete summarization report
2723
+ report = f"""# Meeting Summarization Performance Analysis: {len(models_data)} LLM Comparison
2724
+
2725
+ ## Executive Summary
2726
+ Performance ranking: {ranking_text}
2727
+
2728
+ {production_note}
2729
+
2730
+ ## Key Performance Metrics
2731
+
2732
+ | Model | Excellent Rate | Excellent/Total | Good | Fair | Poor | Rating |
2733
+ |-------|----------------|-----------------|------|------|------|---------|
2734
+ {chr(10).join(table_rows)}
2735
+
2736
+ ## Common Challenges
2737
+
2738
+ {chr(10).join(failure_patterns)}
2739
+
2740
+ ## Model-Specific Analysis
2741
+
2742
+ {chr(10).join(model_analyses)}
2743
+
2744
+ ## Cost Efficiency Analysis
2745
+ {chr(10).join(cost_analyses) if cost_analyses else "Cost data not available for analysis"}
2746
+
2747
+ ## Immediate Improvement Actions
2748
+
2749
+ ### High Priority (Quality Enhancement)
2750
+ {chr(10).join(tech_actions[:3])}
2751
+
2752
+ ### Medium Priority (Process Optimization)
2753
+ {chr(10).join(tech_actions[3:])}
2754
+
2755
+ ## Bottom Line
2756
+ **Investment decision**: {investment_decision} **Timeline**: {timeline}"""
2757
+
2758
+ return report
2759
+
2760
+ def generate_summary_report(
2761
+ self, eval_dir: str, output_path: str = "LLM_Evaluation_Report.md"
2762
+ ) -> Dict:
2763
+ """
2764
+ Generate a comprehensive summary report from multiple evaluation files.
2765
+
2766
+ Args:
2767
+ eval_dir: Directory containing .eval.json files
2768
+ output_path: Path to save the markdown report
2769
+
2770
+ Returns:
2771
+ Dict containing summary data
2772
+ """
2773
+ try:
2774
+ eval_path = Path(eval_dir)
2775
+ if not eval_path.exists():
2776
+ raise FileNotFoundError(f"Evaluation directory not found: {eval_dir}")
2777
+
2778
+ # Find all .eval.json files (recursively)
2779
+ eval_files = list(eval_path.rglob("*.eval.json"))
2780
+ if not eval_files:
2781
+ raise FileNotFoundError(f"No .eval.json files found in {eval_dir}")
2782
+
2783
+ self.log.info(f"Found {len(eval_files)} evaluation files")
2784
+
2785
+ # Parse evaluation data
2786
+ models_data = []
2787
+ for eval_file in eval_files:
2788
+ try:
2789
+ with open(eval_file, "r", encoding="utf-8") as f:
2790
+ eval_data = json.load(f)
2791
+
2792
+ # Extract model name from filename or metadata
2793
+ filename = eval_file.stem
2794
+ model_name = filename.replace(".eval", "")
2795
+
2796
+ # Extract key metrics
2797
+ overall_rating = eval_data.get("overall_rating", {})
2798
+ metrics = overall_rating.get("metrics", {})
2799
+ total_cost = eval_data.get("total_cost", {})
2800
+
2801
+ # Calculate quality score for summarization evaluations
2802
+ quality_score = 0.0
2803
+ overall_rating_metrics = overall_rating.get("metrics", {})
2804
+ if overall_rating_metrics:
2805
+ # Use existing quality_score if available (could be None for error cases)
2806
+ quality_score = overall_rating_metrics.get("quality_score", 0.0)
2807
+ if quality_score is None:
2808
+ quality_score = 0.0 # Treat None as 0 for ranking purposes
2809
+ else:
2810
+ # Calculate from per_question data if metrics not available
2811
+ excellent_count = 0
2812
+ good_count = 0
2813
+ fair_count = 0
2814
+ poor_count = 0
2815
+
2816
+ for question in eval_data.get("per_question", []):
2817
+ analysis = question.get("analysis", {})
2818
+ overall_quality = analysis.get("overall_quality", "")
2819
+ if overall_quality == "excellent":
2820
+ excellent_count += 1
2821
+ elif overall_quality == "good":
2822
+ good_count += 1
2823
+ elif overall_quality == "fair":
2824
+ fair_count += 1
2825
+ elif overall_quality == "poor":
2826
+ poor_count += 1
2827
+ # Note: "error" and other invalid ratings are excluded from quality score calculation
2828
+
2829
+ total_summaries = (
2830
+ excellent_count + good_count + fair_count + poor_count
2831
+ )
2832
+ if total_summaries > 0:
2833
+ quality_score_raw = (
2834
+ excellent_count * 4
2835
+ + good_count * 3
2836
+ + fair_count * 2
2837
+ + poor_count * 1
2838
+ ) / total_summaries
2839
+ quality_score = ((quality_score_raw - 1) / 3) * 100
2840
+
2841
+ model_info = {
2842
+ "name": model_name,
2843
+ "filename": eval_file.name,
2844
+ "pass_rate": metrics.get("pass_rate", 0),
2845
+ "accuracy": metrics.get("accuracy_percentage", 0),
2846
+ "mean_similarity": metrics.get("mean_similarity", 0),
2847
+ "std_similarity": metrics.get("std_similarity", 0),
2848
+ "min_similarity": metrics.get("min_similarity", 0),
2849
+ "max_similarity": metrics.get("max_similarity", 0),
2850
+ "num_questions": metrics.get("num_questions", 0),
2851
+ "num_passed": metrics.get("num_passed", 0),
2852
+ "num_failed": metrics.get("num_failed", 0),
2853
+ "threshold": metrics.get("similarity_threshold", 0.7),
2854
+ "rating": overall_rating.get("rating", "unknown"),
2855
+ "quality_score": quality_score, # Add quality score to model info
2856
+ "total_cost": total_cost.get("total_cost", 0),
2857
+ "analysis": eval_data.get("overall_analysis", ""),
2858
+ "strengths": eval_data.get("strengths", []),
2859
+ "weaknesses": eval_data.get("weaknesses", []),
2860
+ "recommendations": eval_data.get("recommendations", []),
2861
+ "per_question": eval_data.get("per_question", []),
2862
+ }
2863
+ models_data.append(model_info)
2864
+
2865
+ except Exception as e:
2866
+ self.log.warning(f"Error processing {eval_file}: {e}")
2867
+ continue
2868
+
2869
+ if not models_data:
2870
+ raise ValueError("No valid evaluation data found")
2871
+
2872
+ # Detect evaluation type first
2873
+ evaluation_type = self._detect_evaluation_type(models_data)
2874
+
2875
+ # Sort by appropriate metric based on evaluation type
2876
+ if evaluation_type == "summarization":
2877
+ # Sort by quality score (descending) for summarization
2878
+ models_data.sort(key=lambda x: x["quality_score"], reverse=True)
2879
+ else:
2880
+ # Sort by pass rate (descending) for RAG and unknown types
2881
+ models_data.sort(key=lambda x: x["pass_rate"], reverse=True)
2882
+
2883
+ if evaluation_type == "summarization":
2884
+ report_content = self._generate_summarization_report(models_data)
2885
+ elif evaluation_type == "rag":
2886
+ report_content = self._generate_markdown_report(models_data)
2887
+ else:
2888
+ # Handle unknown evaluation type
2889
+ self.log.error(
2890
+ f"Unknown evaluation type detected: {evaluation_type}. Cannot generate report."
2891
+ )
2892
+ raise ValueError(
2893
+ f"Unsupported evaluation type: {evaluation_type}. Expected 'summarization' or 'rag'."
2894
+ )
2895
+
2896
+ # Save report
2897
+ with open(output_path, "w", encoding="utf-8") as f:
2898
+ f.write(report_content)
2899
+
2900
+ self.log.info(f"Summary report saved to: {output_path}")
2901
+
2902
+ return {
2903
+ "models_analyzed": len(models_data),
2904
+ "report_path": output_path,
2905
+ "summary_data": models_data,
2906
+ "evaluation_type": evaluation_type,
2907
+ }
2908
+
2909
+ except Exception as e:
2910
+ self.log.error(f"Error generating summary report: {e}")
2911
+ raise
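A short usage sketch, assuming the Evaluator class from this module and placeholder paths:

    # Hypothetical end-to-end call: aggregate every *.eval.json under a directory
    # into one markdown comparison report.
    evaluator = Evaluator()
    summary = evaluator.generate_summary_report(
        eval_dir="./output/eval",
        output_path="./output/eval/LLM_Evaluation_Report.md",
    )
    print(summary["evaluation_type"], summary["models_analyzed"])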
2912
+
2913
+ def _generate_markdown_report(self, models_data: List[Dict]) -> str:
2914
+ """Generate markdown content for the summary report."""
2915
+
2916
+ # Create executive summary
2917
+ best_model = models_data[0] if models_data else None
2918
+ worst_model = models_data[-1] if models_data else None
2919
+
2920
+ # Build performance ranking
2921
+ ranking = []
2922
+ for model in models_data:
2923
+ ranking.append(f"**{model['name']}** ({model['pass_rate']:.0%})")
2924
+ ranking_text = " > ".join(ranking)
2925
+
2926
+ # Determine if any model meets production standards
2927
+ production_ready = any(
2928
+ m["pass_rate"] >= 0.7 and m["mean_similarity"] >= 0.7 for m in models_data
2929
+ )
2930
+ production_note = (
2931
+ "None achieve production standards (70% pass rate + 0.7 similarity)."
2932
+ if not production_ready
2933
+ else "Some models approach production readiness."
2934
+ )
2935
+
2936
+ # Build metrics table
2937
+ table_rows = []
2938
+ for model in models_data:
2939
+ rating_map = {
2940
+ "excellent": "Excellent",
2941
+ "good": "Good",
2942
+ "fair": "Fair",
2943
+ "poor": "Poor",
2944
+ "unknown": "Unknown",
2945
+ }
2946
+ rating = rating_map.get(model["rating"], model["rating"].title())
2947
+ table_rows.append(
2948
+ f"| **{model['name']}** | {model['pass_rate']:.0%} | {model['mean_similarity']:.3f} | {model['std_similarity']:.3f} | {rating} |"
2949
+ )
2950
+
2951
+ # Identify failure patterns
2952
+ failure_patterns = []
2953
+
2954
+ # Knowledge retrieval gaps (check if models consistently fail on specific question types)
2955
+ knowledge_issues = [m for m in models_data if m["mean_similarity"] < 0.4]
2956
+ if len(knowledge_issues) >= 2:
2957
+ failure_patterns.append("**Knowledge Retrieval Gaps** (All Models)")
2958
+ failure_patterns.append("- Unable to access specific document sections")
2959
+ failure_patterns.append("- Missing organizational information")
2960
+ failure_patterns.append(
2961
+ "- Poor semantic matching between queries and knowledge base"
2962
+ )
2963
+
2964
+ # Factual accuracy issues
2965
+ accuracy_issues = [m for m in models_data if m["pass_rate"] < 0.5]
2966
+ if accuracy_issues:
2967
+ failure_patterns.append("")
2968
+ failure_patterns.append(
2969
+ "**Factual Accuracy Issues** "
2970
+ + f"({', '.join([m['name'] for m in accuracy_issues])})"
2971
+ )
2972
+ # Add specific issues from analysis
2973
+ for model in accuracy_issues[:3]: # Limit to top 3 worst performers
2974
+ if (
2975
+ "jurisdictional" in model["analysis"].lower()
2976
+ or "confusion" in model["analysis"].lower()
2977
+ ):
2978
+ failure_patterns.append(
2979
+ f"- **{model['name']}**: Jurisdictional confusion (US vs Canadian regulations)"
2980
+ )
2981
+ if (
2982
+ "incorrect" in model["analysis"].lower()
2983
+ or "wrong" in model["analysis"].lower()
2984
+ ):
2985
+ failure_patterns.append(
2986
+ f"- **{model['name']}**: Incorrect core values, wrong regulatory stages"
2987
+ )
2988
+
2989
+ # Completeness problems
2990
+ if len([m for m in models_data if m["mean_similarity"] < 0.5]) >= 2:
2991
+ failure_patterns.append("")
2992
+ failure_patterns.append("**Completeness Problems** (All Models)")
2993
+ failure_patterns.append("- Partial answers missing key regulatory details")
2994
+ failure_patterns.append(
2995
+ "- Incomplete permit types (missing multiple authorization categories)"
2996
+ )
2997
+ failure_patterns.append("- Poor handling of comprehensive queries")
2998
+
2999
+ # Model-specific analysis
3000
+ model_analyses = []
3001
+
3002
+ if models_data:
3003
+ best = models_data[0]
3004
+ best_strengths = (
3005
+ best["strengths"][:2]
3006
+ if best["strengths"]
3007
+ else ["Good performance when information is available"]
3008
+ )
3009
+ best_weakness = (
3010
+ best["weaknesses"][0]
3011
+ if best["weaknesses"]
3012
+ else "Inconsistent retrieval quality"
3013
+ )
3014
+
3015
+ model_analyses.append(f"### **{best['name']}** - Best Performer")
3016
+ model_analyses.append(f"- **Strengths**: {', '.join(best_strengths)}")
3017
+ model_analyses.append(f"- **Weakness**: {best_weakness}")
3018
+ model_analyses.append(
3019
+ f"- **Actionable**: Improve retrieval consistency, expand knowledge base coverage"
3020
+ )
3021
+
3022
+ if len(models_data) > 1:
3023
+ worst = models_data[-1]
3024
+ worst_issues = (
3025
+ worst["weaknesses"][:2]
3026
+ if worst["weaknesses"]
3027
+ else ["Poor overall performance"]
3028
+ )
3029
+
3030
+ model_analyses.append("")
3031
+ model_analyses.append(f"### **{worst['name']}** - Needs Improvement")
3032
+ model_analyses.append(f"- **Issues**: {', '.join(worst_issues)}")
3033
+ model_analyses.append(
3034
+ f"- **Actionable**: Requires significant system improvements before production use"
3035
+ )
3036
+
3037
+ # Cost efficiency analysis
3038
+ cost_analyses = []
3039
+ if all(m["total_cost"] > 0 for m in models_data):
3040
+ for model in models_data:
3041
+ roi_desc = (
3042
+ "best ROI"
3043
+ if model == models_data[0]
3044
+ else ("poor ROI" if model["pass_rate"] < 0.3 else "moderate ROI")
3045
+ )
3046
+ cost_analyses.append(
3047
+ f"- **{model['name']}**: ${model['total_cost']:.3f} total cost, {roi_desc} at {model['pass_rate']:.0%} accuracy"
3048
+ )
3049
+
3050
+ # Technical actions
3051
+ tech_actions = [
3052
+ "1. **Document Indexing Overhaul**: Fix content gaps, improve chunking strategy",
3053
+ "2. **Embedding Model Upgrade**: Current semantic matching insufficient (mean similarity <0.4)",
3054
+ "3. **Context Validation**: Implement regulatory framework filters",
3055
+ ]
3056
+
3057
+ if any("runtime" in str(m["weaknesses"]).lower() for m in models_data):
3058
+ tech_actions.append(
3059
+ "4. **Token Limit Fixes**: Address runtime errors and token constraints"
3060
+ )
3061
+
3062
+ tech_actions.extend(
3063
+ [
3064
+ "5. **Response Validation**: Add factual accuracy checks before output",
3065
+ "6. **Retrieval Redundancy**: Multi-step retrieval for complex queries",
3066
+ ]
3067
+ )
3068
+
3069
+ # Investment decision
3070
+ if best_model:
3071
+ if best_model["pass_rate"] >= 0.5:
3072
+ investment_decision = f"Focus resources on **{best_model['name']}** optimization rather than fixing underperforming models."
3073
+ else:
3074
+ investment_decision = "All models require significant improvement before production deployment."
3075
+
3076
+ timeline = "3-6 months minimum before regulatory compliance readiness."
3077
+ else:
3078
+ investment_decision = (
3079
+ "Unable to recommend specific model - all require substantial work."
3080
+ )
3081
+ timeline = "Timeline uncertain due to poor baseline performance."
3082
+
3083
+ # Build the complete report
3084
+ report = f"""# RAG System Performance Analysis: {len(models_data)} LLM Comparison
3085
+
3086
+ ## Executive Summary
3087
+ Performance ranking: {ranking_text}
3088
+
3089
+ {production_note}
3090
+
3091
+ ## Key Performance Metrics
3092
+
3093
+ | Model | Pass Rate | Mean Similarity | Std Dev | Rating |
3094
+ |-------|-----------|----------------|---------|---------|
3095
+ {chr(10).join(table_rows)}
3096
+
3097
+ ## Critical Failure Patterns
3098
+
3099
+ {chr(10).join(failure_patterns)}
3100
+
3101
+ ## Model-Specific Analysis
3102
+
3103
+ {chr(10).join(model_analyses)}
3104
+
3105
+ ## Cost Efficiency Analysis
3106
+ {chr(10).join(cost_analyses) if cost_analyses else "Cost data not available for analysis"}
3107
+
3108
+ ## Immediate Technical Actions
3109
+
3110
+ ### High Priority (Critical Fixes)
3111
+ {chr(10).join(tech_actions[:3])}
3112
+
3113
+ ### Medium Priority (Performance Optimization)
3114
+ {chr(10).join(tech_actions[3:])}
3115
+
3116
+ ## Bottom Line
3117
+ **Investment decision**: {investment_decision} **Timeline**: {timeline}"""
3118
+
3119
+ return report
3120
+
3121
+
3122
+ if __name__ == "__main__":
3123
+ # Example usage
3124
+ evaluator = Evaluator()
3125
+ results_file = "./output/rag/introduction.results.json"
3126
+
3127
+ try:
3128
+ evaluation_data = evaluator.generate_enhanced_report(
3129
+ results_file, output_dir="./output/eval"
3130
+ )
3131
+
3132
+ # Print key metrics from the analysis
3133
+ overall_rating = evaluation_data.get("overall_rating", {})
3134
+ print("\nStatus:", overall_rating.get("rating", "N/A"))
3135
+ print("Explanation:", overall_rating.get("explanation", ""))
3136
+
3137
+ # Print metrics if available
3138
+ metrics = overall_rating.get("metrics", {})
3139
+ if metrics:
3140
+ print("\nMetrics:")
3141
+ print(f"Number of questions: {metrics.get('num_questions', 'N/A')}")
3142
+ print(
3143
+ f"Similarity threshold: {metrics.get('similarity_threshold', 'N/A'):.3f}"
3144
+ )
3145
+ print(f"Pass rate: {metrics.get('pass_rate', 'N/A'):.3f}")
3146
+ print(f"Passed threshold: {metrics.get('num_passed', 'N/A')}")
3147
+ print(f"Failed threshold: {metrics.get('num_failed', 'N/A')}")
3148
+ print("\nSimilarity Statistics:")
3149
+ print(f"Mean: {metrics.get('mean_similarity', 'N/A'):.3f}")
3150
+ print(f"Median: {metrics.get('median_similarity', 'N/A'):.3f}")
3151
+ print(f"Min: {metrics.get('min_similarity', 'N/A'):.3f}")
3152
+ print(f"Max: {metrics.get('max_similarity', 'N/A'):.3f}")
3153
+ print(f"Standard deviation: {metrics.get('std_similarity', 'N/A'):.3f}")
3154
+
3155
+ print("\nAnalysis:", evaluation_data.get("overall_analysis", "N/A"))
3156
+
3157
+ # Print cost information if available
3158
+ if evaluation_data.get("total_usage") and evaluation_data.get("total_cost"):
3159
+ total_usage = evaluation_data["total_usage"]
3160
+ total_cost = evaluation_data["total_cost"]
3161
+ print("\nCost Analysis:")
3162
+ print(
3163
+ f"Token usage: {total_usage['input_tokens']:,} input + {total_usage['output_tokens']:,} output = {total_usage['total_tokens']:,} total"
3164
+ )
3165
+ print(
3166
+ f"Total cost: ${total_cost['input_cost']:.4f} input + ${total_cost['output_cost']:.4f} output = ${total_cost['total_cost']:.4f} total"
3167
+ )
3168
+ if evaluation_data.get("per_question"):
3169
+ print(
3170
+ f"Average cost per question: ${total_cost['total_cost']/len(evaluation_data['per_question']):.4f}"
3171
+ )
3172
+
3173
+ if evaluation_data.get("strengths"):
3174
+ print("\nStrengths:")
3175
+ for strength in evaluation_data["strengths"]:
3176
+ print(f"- {strength}")
3177
+
3178
+ except Exception as e:
3179
+ print(f"Error during evaluation: {e}")