solace-agent-mesh 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.

Potentially problematic release: this version of solace-agent-mesh might be problematic.

Files changed (184)
  1. solace_agent_mesh/agent/adk/callbacks.py +0 -5
  2. solace_agent_mesh/agent/adk/models/lite_llm.py +123 -8
  3. solace_agent_mesh/agent/adk/models/oauth2_token_manager.py +245 -0
  4. solace_agent_mesh/agent/protocol/event_handlers.py +213 -31
  5. solace_agent_mesh/agent/proxies/__init__.py +0 -0
  6. solace_agent_mesh/agent/proxies/a2a/__init__.py +3 -0
  7. solace_agent_mesh/agent/proxies/a2a/app.py +55 -0
  8. solace_agent_mesh/agent/proxies/a2a/component.py +1115 -0
  9. solace_agent_mesh/agent/proxies/a2a/config.py +140 -0
  10. solace_agent_mesh/agent/proxies/a2a/oauth_token_cache.py +104 -0
  11. solace_agent_mesh/agent/proxies/base/__init__.py +3 -0
  12. solace_agent_mesh/agent/proxies/base/app.py +99 -0
  13. solace_agent_mesh/agent/proxies/base/component.py +650 -0
  14. solace_agent_mesh/agent/proxies/base/config.py +85 -0
  15. solace_agent_mesh/agent/proxies/base/proxy_task_context.py +17 -0
  16. solace_agent_mesh/agent/sac/app.py +58 -5
  17. solace_agent_mesh/agent/sac/component.py +238 -75
  18. solace_agent_mesh/agent/sac/task_execution_context.py +46 -0
  19. solace_agent_mesh/agent/tools/audio_tools.py +125 -8
  20. solace_agent_mesh/agent/tools/web_tools.py +10 -5
  21. solace_agent_mesh/agent/utils/artifact_helpers.py +141 -3
  22. solace_agent_mesh/assets/docs/404.html +3 -3
  23. solace_agent_mesh/assets/docs/assets/js/5c2bd65f.eda4bcb2.js +1 -0
  24. solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.f4b15f3b.js +1 -0
  25. solace_agent_mesh/assets/docs/assets/js/71da7b71.38583438.js +1 -0
  26. solace_agent_mesh/assets/docs/assets/js/77cf947d.48cb18a2.js +1 -0
  27. solace_agent_mesh/assets/docs/assets/js/924ffdeb.8095e148.js +1 -0
  28. solace_agent_mesh/assets/docs/assets/js/9e9d0a82.570c057b.js +1 -0
  29. solace_agent_mesh/assets/docs/assets/js/{ad71b5ed.60668e9e.js → ad71b5ed.af3ecfd1.js} +1 -1
  30. solace_agent_mesh/assets/docs/assets/js/ceb2a7a6.5d92d7d0.js +1 -0
  31. solace_agent_mesh/assets/docs/assets/js/{da0b5bad.9d369087.js → da0b5bad.d08a9466.js} +1 -1
  32. solace_agent_mesh/assets/docs/assets/js/db924877.e98d12a1.js +1 -0
  33. solace_agent_mesh/assets/docs/assets/js/de915948.27d6b065.js +1 -0
  34. solace_agent_mesh/assets/docs/assets/js/{e3d9abda.2b916f9e.js → e3d9abda.6b9493d0.js} +1 -1
  35. solace_agent_mesh/assets/docs/assets/js/e6f9706b.e74a984d.js +1 -0
  36. solace_agent_mesh/assets/docs/assets/js/f284c35a.42f59cdd.js +1 -0
  37. solace_agent_mesh/assets/docs/assets/js/ff4d71f2.15b02f97.js +1 -0
  38. solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js → main.b12eac43.js} +2 -2
  39. solace_agent_mesh/assets/docs/assets/js/runtime~main.e268214e.js +1 -0
  40. solace_agent_mesh/assets/docs/docs/documentation/components/agents/index.html +15 -4
  41. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/artifact-management/index.html +4 -4
  42. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/audio-tools/index.html +4 -4
  43. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/data-analysis-tools/index.html +4 -4
  44. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/embeds/index.html +4 -4
  45. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/index.html +4 -4
  46. solace_agent_mesh/assets/docs/docs/documentation/components/cli/index.html +4 -4
  47. solace_agent_mesh/assets/docs/docs/documentation/components/gateways/index.html +4 -4
  48. solace_agent_mesh/assets/docs/docs/documentation/components/index.html +4 -4
  49. solace_agent_mesh/assets/docs/docs/documentation/components/orchestrator/index.html +4 -4
  50. solace_agent_mesh/assets/docs/docs/documentation/components/plugins/index.html +4 -4
  51. solace_agent_mesh/assets/docs/docs/documentation/components/proxies/index.html +262 -0
  52. solace_agent_mesh/assets/docs/docs/documentation/deploying/debugging/index.html +3 -3
  53. solace_agent_mesh/assets/docs/docs/documentation/deploying/deployment-options/index.html +31 -3
  54. solace_agent_mesh/assets/docs/docs/documentation/deploying/index.html +3 -3
  55. solace_agent_mesh/assets/docs/docs/documentation/deploying/observability/index.html +3 -3
  56. solace_agent_mesh/assets/docs/docs/documentation/developing/create-agents/index.html +4 -4
  57. solace_agent_mesh/assets/docs/docs/documentation/developing/create-gateways/index.html +5 -5
  58. solace_agent_mesh/assets/docs/docs/documentation/developing/creating-python-tools/index.html +4 -4
  59. solace_agent_mesh/assets/docs/docs/documentation/developing/creating-service-providers/index.html +4 -4
  60. solace_agent_mesh/assets/docs/docs/documentation/developing/evaluations/index.html +135 -0
  61. solace_agent_mesh/assets/docs/docs/documentation/developing/index.html +6 -4
  62. solace_agent_mesh/assets/docs/docs/documentation/developing/structure/index.html +4 -4
  63. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/bedrock-agents/index.html +4 -4
  64. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/custom-agent/index.html +4 -4
  65. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/event-mesh-gateway/index.html +5 -5
  66. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mcp-integration/index.html +4 -4
  67. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mongodb-integration/index.html +4 -4
  68. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rag-integration/index.html +4 -4
  69. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rest-gateway/index.html +4 -4
  70. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/slack-integration/index.html +4 -4
  71. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/sql-database/index.html +4 -4
  72. solace_agent_mesh/assets/docs/docs/documentation/enterprise/index.html +3 -3
  73. solace_agent_mesh/assets/docs/docs/documentation/enterprise/installation/index.html +3 -3
  74. solace_agent_mesh/assets/docs/docs/documentation/enterprise/rbac-setup-guide/index.html +3 -3
  75. solace_agent_mesh/assets/docs/docs/documentation/enterprise/single-sign-on/index.html +3 -3
  76. solace_agent_mesh/assets/docs/docs/documentation/getting-started/architecture/index.html +3 -3
  77. solace_agent_mesh/assets/docs/docs/documentation/getting-started/index.html +3 -3
  78. solace_agent_mesh/assets/docs/docs/documentation/getting-started/introduction/index.html +3 -3
  79. solace_agent_mesh/assets/docs/docs/documentation/getting-started/try-agent-mesh/index.html +3 -3
  80. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/configurations/index.html +6 -5
  81. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/index.html +3 -3
  82. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/installation/index.html +3 -3
  83. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/large_language_models/index.html +100 -3
  84. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/run-project/index.html +3 -3
  85. solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-gateway-upgrade-to-0.3.0/index.html +3 -3
  86. solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-technical-migration-map/index.html +3 -3
  87. solace_agent_mesh/assets/docs/lunr-index-1761248203150.json +1 -0
  88. solace_agent_mesh/assets/docs/lunr-index.json +1 -1
  89. solace_agent_mesh/assets/docs/search-doc-1761248203150.json +1 -0
  90. solace_agent_mesh/assets/docs/search-doc.json +1 -1
  91. solace_agent_mesh/assets/docs/sitemap.xml +1 -1
  92. solace_agent_mesh/cli/__init__.py +1 -1
  93. solace_agent_mesh/cli/commands/add_cmd/agent_cmd.py +2 -69
  94. solace_agent_mesh/cli/commands/eval_cmd.py +11 -49
  95. solace_agent_mesh/cli/commands/init_cmd/__init__.py +0 -5
  96. solace_agent_mesh/cli/commands/init_cmd/env_step.py +10 -12
  97. solace_agent_mesh/cli/commands/init_cmd/orchestrator_step.py +9 -61
  98. solace_agent_mesh/cli/commands/init_cmd/webui_gateway_step.py +9 -49
  99. solace_agent_mesh/cli/commands/plugin_cmd/add_cmd.py +1 -2
  100. solace_agent_mesh/client/webui/frontend/static/assets/{authCallback-DwrxZE0E.js → authCallback-BTf6dqwp.js} +1 -1
  101. solace_agent_mesh/client/webui/frontend/static/assets/{client-DarGQzyw.js → client-CaY59VuC.js} +1 -1
  102. solace_agent_mesh/client/webui/frontend/static/assets/main-B32noGmR.js +342 -0
  103. solace_agent_mesh/client/webui/frontend/static/assets/main-DHJKSW1S.css +1 -0
  104. solace_agent_mesh/client/webui/frontend/static/assets/{vendor-BKIeiHj_.js → vendor-BEmvJSYz.js} +1 -1
  105. solace_agent_mesh/client/webui/frontend/static/auth-callback.html +3 -3
  106. solace_agent_mesh/client/webui/frontend/static/index.html +4 -4
  107. solace_agent_mesh/common/a2a/__init__.py +24 -0
  108. solace_agent_mesh/common/a2a/artifact.py +39 -0
  109. solace_agent_mesh/common/a2a/events.py +29 -0
  110. solace_agent_mesh/common/a2a/message.py +68 -0
  111. solace_agent_mesh/common/a2a/protocol.py +151 -1
  112. solace_agent_mesh/common/agent_registry.py +83 -3
  113. solace_agent_mesh/common/constants.py +3 -1
  114. solace_agent_mesh/common/sac/sam_component_base.py +383 -4
  115. solace_agent_mesh/common/utils/pydantic_utils.py +12 -0
  116. solace_agent_mesh/config_portal/backend/common.py +1 -1
  117. solace_agent_mesh/config_portal/frontend/static/client/assets/_index-ByU1X1HD.js +98 -0
  118. solace_agent_mesh/config_portal/frontend/static/client/assets/{manifest-44d62be6.js → manifest-61038fc6.js} +1 -1
  119. solace_agent_mesh/config_portal/frontend/static/client/index.html +1 -1
  120. solace_agent_mesh/evaluation/evaluator.py +128 -104
  121. solace_agent_mesh/evaluation/message_organizer.py +116 -110
  122. solace_agent_mesh/evaluation/report_data_processor.py +84 -86
  123. solace_agent_mesh/evaluation/report_generator.py +73 -79
  124. solace_agent_mesh/evaluation/run.py +421 -235
  125. solace_agent_mesh/evaluation/shared/__init__.py +92 -0
  126. solace_agent_mesh/evaluation/shared/constants.py +47 -0
  127. solace_agent_mesh/evaluation/shared/exceptions.py +50 -0
  128. solace_agent_mesh/evaluation/shared/helpers.py +35 -0
  129. solace_agent_mesh/evaluation/shared/test_case_loader.py +167 -0
  130. solace_agent_mesh/evaluation/shared/test_suite_loader.py +280 -0
  131. solace_agent_mesh/evaluation/subscriber.py +111 -232
  132. solace_agent_mesh/evaluation/summary_builder.py +227 -117
  133. solace_agent_mesh/gateway/base/app.py +16 -1
  134. solace_agent_mesh/gateway/base/component.py +112 -39
  135. solace_agent_mesh/gateway/http_sse/alembic/versions/20251015_add_session_performance_indexes.py +70 -0
  136. solace_agent_mesh/gateway/http_sse/component.py +99 -3
  137. solace_agent_mesh/gateway/http_sse/dependencies.py +4 -4
  138. solace_agent_mesh/gateway/http_sse/main.py +1 -0
  139. solace_agent_mesh/gateway/http_sse/repository/chat_task_repository.py +12 -13
  140. solace_agent_mesh/gateway/http_sse/repository/feedback_repository.py +15 -18
  141. solace_agent_mesh/gateway/http_sse/repository/interfaces.py +25 -18
  142. solace_agent_mesh/gateway/http_sse/repository/session_repository.py +30 -26
  143. solace_agent_mesh/gateway/http_sse/repository/task_repository.py +35 -44
  144. solace_agent_mesh/gateway/http_sse/routers/agent_cards.py +4 -3
  145. solace_agent_mesh/gateway/http_sse/routers/artifacts.py +95 -203
  146. solace_agent_mesh/gateway/http_sse/routers/dto/responses/session_responses.py +4 -3
  147. solace_agent_mesh/gateway/http_sse/routers/sessions.py +2 -2
  148. solace_agent_mesh/gateway/http_sse/routers/tasks.py +33 -41
  149. solace_agent_mesh/gateway/http_sse/routers/users.py +47 -1
  150. solace_agent_mesh/gateway/http_sse/routers/visualization.py +17 -11
  151. solace_agent_mesh/gateway/http_sse/services/data_retention_service.py +4 -4
  152. solace_agent_mesh/gateway/http_sse/services/feedback_service.py +51 -43
  153. solace_agent_mesh/gateway/http_sse/services/session_service.py +20 -20
  154. solace_agent_mesh/gateway/http_sse/services/task_logger_service.py +8 -8
  155. solace_agent_mesh/gateway/http_sse/shared/base_repository.py +45 -71
  156. solace_agent_mesh/gateway/http_sse/shared/types.py +0 -18
  157. solace_agent_mesh/templates/gateway_config_template.yaml +0 -5
  158. solace_agent_mesh/templates/logging_config_template.ini +10 -6
  159. solace_agent_mesh/templates/plugin_gateway_config_template.yaml +0 -3
  160. solace_agent_mesh/templates/shared_config.yaml +40 -0
  161. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/METADATA +47 -21
  162. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/RECORD +166 -145
  163. solace_agent_mesh/assets/docs/assets/js/5c2bd65f.e49689dd.js +0 -1
  164. solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.39d5851d.js +0 -1
  165. solace_agent_mesh/assets/docs/assets/js/71da7b71.804d6567.js +0 -1
  166. solace_agent_mesh/assets/docs/assets/js/77cf947d.64c9bd6c.js +0 -1
  167. solace_agent_mesh/assets/docs/assets/js/9e9d0a82.dd810042.js +0 -1
  168. solace_agent_mesh/assets/docs/assets/js/db924877.cbc66f02.js +0 -1
  169. solace_agent_mesh/assets/docs/assets/js/de915948.139b4b9c.js +0 -1
  170. solace_agent_mesh/assets/docs/assets/js/e6f9706b.582a78ca.js +0 -1
  171. solace_agent_mesh/assets/docs/assets/js/f284c35a.5766a13d.js +0 -1
  172. solace_agent_mesh/assets/docs/assets/js/ff4d71f2.9c0297a6.js +0 -1
  173. solace_agent_mesh/assets/docs/assets/js/runtime~main.18dc45dd.js +0 -1
  174. solace_agent_mesh/assets/docs/lunr-index-1760121512891.json +0 -1
  175. solace_agent_mesh/assets/docs/search-doc-1760121512891.json +0 -1
  176. solace_agent_mesh/client/webui/frontend/static/assets/main-2nd1gbaH.js +0 -339
  177. solace_agent_mesh/client/webui/frontend/static/assets/main-DoKXctCM.css +0 -1
  178. solace_agent_mesh/config_portal/frontend/static/client/assets/_index-BNuqpWDc.js +0 -98
  179. solace_agent_mesh/evaluation/config_loader.py +0 -657
  180. solace_agent_mesh/evaluation/test_case_loader.py +0 -714
  181. /solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js.LICENSE.txt → main.b12eac43.js.LICENSE.txt} +0 -0
  182. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/WHEEL +0 -0
  183. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/entry_points.txt +0 -0
  184. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/licenses/LICENSE +0 -0
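
Note: the biggest downstream-facing change in this list is the evaluation refactor. evaluation/config_loader.py (-657 lines) and evaluation/test_case_loader.py (-714 lines) are removed outright, replaced by the new evaluation/shared/ package; the a2a and base proxy packages under solace_agent_mesh/agent/proxies/ and the OAuth2 token manager are entirely new in 1.6.1. A minimal migration sketch, assuming the names re-exported in the evaluator.py diff below (EvaluationConfigLoader, load_test_case) are the intended public surface; the config file name is illustrative:

    # 1.5.1 reached the loaders through sys.path manipulation:
    #   from evaluation.config_loader import ConfigLoader
    #   from evaluation.test_case_loader import load_test_case
    # 1.6.1 exposes them as a package:
    from solace_agent_mesh.evaluation.shared import (
        EvaluationConfigLoader,
        load_test_case,
    )

    loader = EvaluationConfigLoader("test_suite_config.json")  # hypothetical file name
    config = loader.load_configuration()
    test_case = load_test_case(config.test_case_files[0])
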
@@ -1 +1 @@
- window.__remixManifest={"entry":{"module":"/assets/entry.client-mvZjNKiz.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":[]},"routes":{"root":{"id":"root","path":"","hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/root-BWvk5-gF.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":["/assets/root-DxRwaWiE.css"]},"routes/_index":{"id":"routes/_index","parentId":"root","index":true,"hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/_index-BNuqpWDc.js","imports":["/assets/index-DzNKzXrc.js"],"css":[]}},"url":"/assets/manifest-44d62be6.js","version":"44d62be6"};
+ window.__remixManifest={"entry":{"module":"/assets/entry.client-mvZjNKiz.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":[]},"routes":{"root":{"id":"root","path":"","hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/root-BWvk5-gF.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":["/assets/root-DxRwaWiE.css"]},"routes/_index":{"id":"routes/_index","parentId":"root","index":true,"hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/_index-ByU1X1HD.js","imports":["/assets/index-DzNKzXrc.js"],"css":[]}},"url":"/assets/manifest-61038fc6.js","version":"61038fc6"};
@@ -1,5 +1,5 @@
  <!DOCTYPE html>
- <html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/assets/root-DxRwaWiE.css"/><link rel="preconnect" href="https://fonts.googleapis.com"/><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="anonymous"/><link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&amp;display=swap"/></head><body><p>Loading...</p><link rel="modulepreload" href="/assets/manifest-44d62be6.js"/><link rel="modulepreload" href="/assets/entry.client-mvZjNKiz.js"/><link rel="modulepreload" href="/assets/index-DzNKzXrc.js"/><link rel="modulepreload" href="/assets/components-Rk0n-9cK.js"/><link rel="modulepreload" href="/assets/root-BWvk5-gF.js"/><script>window.__remixContext = {"basename":"/","future":{"v3_fetcherPersist":false,"v3_relativeSplatPath":false,"v3_throwAbortReason":false,"v3_routeConfig":false,"v3_singleFetch":false,"v3_lazyRouteDiscovery":false,"unstable_optimizeDeps":false},"isSpaMode":true,"state":{"loaderData":{"root":null,"routes/_index":null},"actionData":null,"errors":null}};</script><script type="module" async="">import "/assets/manifest-44d62be6.js";
+ <html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/assets/root-DxRwaWiE.css"/><link rel="preconnect" href="https://fonts.googleapis.com"/><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="anonymous"/><link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&amp;display=swap"/></head><body><p>Loading...</p><link rel="modulepreload" href="/assets/manifest-61038fc6.js"/><link rel="modulepreload" href="/assets/entry.client-mvZjNKiz.js"/><link rel="modulepreload" href="/assets/index-DzNKzXrc.js"/><link rel="modulepreload" href="/assets/components-Rk0n-9cK.js"/><link rel="modulepreload" href="/assets/root-BWvk5-gF.js"/><script>window.__remixContext = {"basename":"/","future":{"v3_fetcherPersist":false,"v3_relativeSplatPath":false,"v3_throwAbortReason":false,"v3_routeConfig":false,"v3_singleFetch":false,"v3_lazyRouteDiscovery":false,"unstable_optimizeDeps":false},"isSpaMode":true,"state":{"loaderData":{"root":null,"routes/_index":null},"actionData":null,"errors":null}};</script><script type="module" async="">import "/assets/manifest-61038fc6.js";
  import * as route0 from "/assets/root-BWvk5-gF.js";

  window.__remixRouteModules = {"root":route0};
@@ -5,27 +5,25 @@ This module evaluates AI model performance against test cases using multiple eva

  import concurrent.futures
  import json
- import os
+ import logging
  import re
- import sys
  from abc import ABC, abstractmethod
  from collections import defaultdict
  from dataclasses import dataclass, field
- from typing import Dict, List, Optional, Any, Tuple
- import logging
+ from pathlib import Path

+ import litellm
  import numpy as np
  from rouge import Rouge
- import litellm
-
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
- from evaluation.config_loader import ConfigLoader
- from evaluation.test_case_loader import load_test_case

- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
+ from .shared import (
+     EvaluationConfigLoader,
+     EvaluationOptions,
+     TestSuiteConfiguration,
+     load_test_case,
+ )

- SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+ log = logging.getLogger(__name__)


  @dataclass
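
The hunks that follow modernize the annotations: typing.Dict/List/Optional/Tuple become the built-in generics and PEP 604 unions. One caveat a reviewer may want to flag: the new signatures spell dict[str, any] with the lowercase built-in any, not typing.Any. That still imports and runs (the annotation object is simply stored on the function), but static type checkers reject it. A small self-contained illustration; the function names are ours, not the package's:

    from typing import Any, Dict, Optional

    # 1.5.1 spelling:
    def old_style(scores: Optional[Dict[str, Any]]) -> Optional[float]: ...

    # 1.6.1 spelling as intended by PEP 585/604:
    def new_style(scores: dict[str, Any] | None) -> float | None: ...

    # 1.6.1 spelling as actually released: `any` is the builtin function,
    # so this evaluates at runtime, but mypy/pyright flag it as an invalid type.
    def as_released(scores: dict[str, any] | None) -> float | None: ...

    # Roughly: dict[str, <built-in function any>] | None
    print(as_released.__annotations__["scores"])
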
@@ -35,14 +33,14 @@ class EvaluationResult:
      run_number: int
      test_case_id: str
      test_case_path: str
-     tool_match_score: Optional[float] = None
-     response_match_score: Optional[float] = None
-     llm_eval_score: Optional[float] = None
-     llm_eval_reasoning: Optional[str] = None
-     duration_seconds: Optional[float] = None
-     errors: List[str] = field(default_factory=list)
-
-     def to_dict(self) -> Dict[str, Any]:
+     tool_match_score: float | None = None
+     response_match_score: float | None = None
+     llm_eval_score: float | None = None
+     llm_eval_reasoning: str | None = None
+     duration_seconds: float | None = None
+     errors: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, any]:
          """Convert to dictionary format for JSON serialization."""
          result = {
              "run": self.run_number,
@@ -74,10 +72,10 @@ class ScoreStatistics:
      """Statistical summary of evaluation scores."""

      average: float
-     distribution: Dict[str, float]
+     distribution: dict[str, float]

      @classmethod
-     def from_scores(cls, scores: List[float]) -> "ScoreStatistics":
+     def from_scores(cls, scores: list[float]) -> "ScoreStatistics":
          """Create statistics from a list of scores."""
          if not scores:
              return cls(
@@ -103,13 +101,13 @@ class TestCaseResults:

      test_case_id: str
      category: str
-     runs: List[EvaluationResult]
+     runs: list[EvaluationResult]
      average_duration: float
      tool_match_scores: ScoreStatistics
      response_match_scores: ScoreStatistics
      llm_eval_scores: ScoreStatistics

-     def to_dict(self) -> Dict[str, Any]:
+     def to_dict(self) -> dict[str, any]:
          """Convert to dictionary format for JSON serialization."""
          return {
              "test_case_id": self.test_case_id,
@@ -136,10 +134,10 @@ class ModelResults:
      """Complete evaluation results for a model."""

      model_name: str
-     total_execution_time: Optional[float]
-     test_cases: List[TestCaseResults]
+     total_execution_time: float | None
+     test_cases: list[TestCaseResults]

-     def to_dict(self) -> Dict[str, Any]:
+     def to_dict(self) -> dict[str, any]:
          """Convert to dictionary format for JSON serialization."""
          return {
              "model_name": self.model_name,
@@ -152,71 +150,63 @@ class ConfigurationService:
      """Handles configuration loading and validation."""

      def __init__(self, config_path: str):
-         self.config_loader = ConfigLoader(config_path)
+         self.config_loader = EvaluationConfigLoader(config_path)
          self._config_cache = None
          self._evaluation_settings_cache = None

-     def get_config(self) -> Dict[str, Any]:
+     def get_config(self) -> TestSuiteConfiguration:
          """Get the main configuration."""
          if self._config_cache is None:
-             self._config_cache = self.config_loader.load_config()
+             self._config_cache = self.config_loader.load_configuration()
          return self._config_cache

-     def get_evaluation_settings(self) -> Dict[str, Any]:
+     def get_evaluation_settings(self) -> EvaluationOptions:
          """Get evaluation settings."""
          if self._evaluation_settings_cache is None:
-             self._evaluation_settings_cache = (
-                 self.config_loader.get_evaluation_settings()
-             )
+             self._evaluation_settings_cache = self.config_loader.get_evaluation_options()
          return self._evaluation_settings_cache

-     def get_results_path(self) -> str:
-         """Get the base results path."""
-         config = self.get_config()
-         results_dir_name = config["results_dir_name"]
-         return os.path.join(SCRIPT_DIR, "results", results_dir_name)
-

  class FileService:
      """Handles file I/O operations."""

      @staticmethod
-     def load_json(filepath: str) -> Any:
+     def load_json(filepath: Path) -> any:
          """Load JSON data from file."""
          try:
-             with open(filepath, "r") as f:
+             with filepath.open() as f:
                  return json.load(f)
          except (FileNotFoundError, json.JSONDecodeError) as e:
-             logger.error(f"Failed to load JSON from {filepath}: {e}")
+             log.error(f"Failed to load JSON from {filepath}: {e}")
              raise

      @staticmethod
-     def save_json(data: Any, filepath: str):
+     def save_json(data: any, filepath: Path):
          """Save data as JSON to file."""
          try:
-             os.makedirs(os.path.dirname(filepath), exist_ok=True)
-             with open(filepath, "w") as f:
+             filepath.parent.mkdir(parents=True, exist_ok=True)
+             with filepath.open("w") as f:
                  json.dump(data, f, indent=4)
          except Exception as e:
-             logger.error(f"Failed to save JSON to {filepath}: {e}")
+             log.error(f"Failed to save JSON to {filepath}: {e}")
              raise

      @staticmethod
-     def file_exists(filepath: str) -> bool:
+     def file_exists(filepath: Path) -> bool:
          """Check if file exists."""
-         return os.path.exists(filepath)
+         return filepath.exists()


  class StatisticsService:
      """Handles statistical calculations and aggregations."""

      @staticmethod
-     def calculate_score_statistics(scores: List[float]) -> ScoreStatistics:
+     def calculate_score_statistics(scores: list[float]) -> ScoreStatistics:
          """Calculate statistical summary for a list of scores."""
          return ScoreStatistics.from_scores(scores)

      @staticmethod
-     def calculate_average_duration(durations: List[float]) -> float:
+     def calculate_average_duration(durations: list[float]) -> float:
          """Calculate average duration from a list of durations."""
          if not durations:
              return 0.0
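
The FileService changes above are a one-to-one os.path to pathlib migration. A runnable sketch of the mapping under an assumed temp directory; the paths are examples, not the package's:

    import json
    import tempfile
    from pathlib import Path

    base = Path(tempfile.mkdtemp())
    filepath = base / "model_a" / "results.json"  # example path

    # os.makedirs(os.path.dirname(p), exist_ok=True)  ->  p.parent.mkdir(parents=True, exist_ok=True)
    # open(p, "w")                                    ->  p.open("w")
    # os.path.exists(p)                               ->  p.exists()
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with filepath.open("w") as f:
        json.dump({"ok": True}, f, indent=4)
    assert filepath.exists()

Also worth noting: removing get_results_path() drops the last SCRIPT_DIR dependency, so results no longer resolve relative to the installed package (see the new main() at the end of this diff).
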
@@ -228,8 +218,8 @@ class EvaluationStrategy(ABC):

      @abstractmethod
      def evaluate(
-         self, test_case: Dict[str, Any], summary_data: Dict[str, Any]
-     ) -> Optional[float]:
+         self, test_case: dict[str, any], summary_data: dict[str, any]
+     ) -> float | None:
          """Evaluate a test case run and return a score."""
          pass

@@ -238,8 +228,8 @@ class ToolMatchEvaluator(EvaluationStrategy):
      """Evaluates tool usage against expected tools."""

      def evaluate(
-         self, test_case: Dict[str, Any], summary_data: Dict[str, Any]
-     ) -> Optional[float]:
+         self, test_case: dict[str, any], summary_data: dict[str, any]
+     ) -> float | None:
          """Evaluate tool matching score."""
          try:
              expected_tools = test_case["evaluation"]["expected_tools"]
@@ -257,7 +247,7 @@ class ToolMatchEvaluator(EvaluationStrategy):
              return len(found_tools) / len(expected_set)

          except (KeyError, TypeError) as e:
-             logger.warning(f"Error in tool match evaluation: {e}")
+             log.warning(f"Error in tool match evaluation: {e}")
              return None


@@ -268,8 +258,8 @@ class ResponseMatchEvaluator(EvaluationStrategy):
          self.rouge = Rouge()

      def evaluate(
-         self, test_case: Dict[str, Any], summary_data: Dict[str, Any]
-     ) -> Optional[float]:
+         self, test_case: dict[str, any], summary_data: dict[str, any]
+     ) -> float | None:
          """Evaluate response matching score using a weighted ROUGE average."""
          try:
              expected_response = test_case["evaluation"]["expected_response"]
@@ -290,14 +280,14 @@ class ResponseMatchEvaluator(EvaluationStrategy):
              return weighted_score

          except (ValueError, KeyError, TypeError) as e:
-             logger.warning(f"Error in response match evaluation: {e}")
+             log.warning(f"Error in response match evaluation: {e}")
              return 0.0


  class LLMEvaluator(EvaluationStrategy):
      """Evaluates responses using an LLM judge."""

-     def __init__(self, llm_config: Dict[str, Any]):
+     def __init__(self, llm_config: dict[str, any]):
          self.model = llm_config.get("LLM_SERVICE_PLANNING_MODEL_NAME")
          self.api_key = llm_config.get("LLM_SERVICE_API_KEY")
          self.api_base = llm_config.get("LLM_SERVICE_ENDPOINT")
@@ -308,8 +298,8 @@ class LLMEvaluator(EvaluationStrategy):
          )

      def evaluate(
-         self, test_case: Dict[str, Any], summary_data: Dict[str, Any]
-     ) -> Optional[Dict[str, Any]]:
+         self, test_case: dict[str, any], summary_data: dict[str, any]
+     ) -> dict[str, any] | None:
          """Evaluate response using LLM and return score with reasoning."""
          try:
              query = test_case["query"]
@@ -342,7 +332,7 @@ class LLMEvaluator(EvaluationStrategy):
              return {"score": score, "reasoning": reasoning}

          except Exception as e:
-             logger.error(f"Error in LLM evaluation: {e}")
+             log.error(f"Error in LLM evaluation: {e}")
              return None

      def _build_evaluation_prompt(
@@ -351,8 +341,8 @@ class LLMEvaluator(EvaluationStrategy):
          expected_response: str,
          actual_response: str,
          criterion: str,
-         input_artifacts: List[Dict],
-         output_artifacts: List[Dict],
+         input_artifacts: list[dict],
+         output_artifacts: list[dict],
      ) -> str:
          """Build the evaluation prompt for the LLM."""
          return f"""
@@ -367,7 +357,7 @@ class LLMEvaluator(EvaluationStrategy):
  Format your response exactly as:
  Score: [0.0-1.0]
  Reasoning: [Your detailed explanation of why you gave this score, considering both the response and any artifacts created]
- 
+
  Provide a score from 0.0 to 1.0 where:
  - 1.0 = Excellent: Fully meets the criterion and expectations
  - 0.8-0.9 = Good: Mostly meets the criterion with minor issues
@@ -415,7 +405,7 @@ class LLMEvaluator(EvaluationStrategy):
  class RunEvaluator:
      """Evaluates individual test runs."""

-     def __init__(self, evaluation_settings: Dict[str, Any]):
+     def __init__(self, evaluation_settings: dict[str, any]):
          self.evaluation_settings = evaluation_settings
          self.file_service = FileService()

@@ -437,24 +427,25 @@ class RunEvaluator:
              llm_config = evaluation_settings["llm_evaluator"]["env"]
              self.llm_evaluator = LLMEvaluator(llm_config)
          except Exception as e:
-             logger.error(f"Failed to initialize LLM evaluator: {e}")
+             log.error(f"Failed to initialize LLM evaluator: {e}")

      def evaluate_run(
          self,
          run_number: int,
-         run_path: str,
-         test_case: Dict[str, Any],
+         run_path: Path,
+         test_case: dict[str, any],
          test_case_path: str,
-     ) -> Optional[EvaluationResult]:
+     ) -> EvaluationResult | None:
          """Evaluate a single test run."""
-         logger.info(
+         log.info(
              f" - Evaluating run {run_number} for test case {test_case['test_case_id']}"
          )

          # Load summary data
-         summary_path = os.path.join(run_path, "summary.json")
+         summary_path = run_path / "summary.json"
+         log.info(f"Summary file path: {summary_path}")
          if not self.file_service.file_exists(summary_path):
-             logger.warning(
+             log.warning(
                  f" Summary file not found for run {run_number}, skipping."
              )
              return None
@@ -462,7 +453,7 @@ class RunEvaluator:
          try:
              summary_data = self.file_service.load_json(summary_path)
          except Exception as e:
-             logger.error(f" Error loading summary.json for run {run_number}: {e}")
+             log.error(f" Error loading summary.json for run {run_number}: {e}")
              return None

          # Create evaluation result
@@ -496,7 +487,7 @@ class RunEvaluator:
  class ModelEvaluator:
      """Evaluates all runs for a single model."""

-     def __init__(self, config: Dict[str, Any], evaluation_settings: Dict[str, Any]):
+     def __init__(self, config: dict[str, any], evaluation_settings: dict[str, any]):
          self.config = config
          self.evaluation_settings = evaluation_settings
          self.run_evaluator = RunEvaluator(evaluation_settings)
@@ -504,9 +495,9 @@ class ModelEvaluator:

      def evaluate_model(self, model_name: str, base_results_path: str) -> ModelResults:
          """Evaluate all test cases for a model."""
-         logger.info(f"Evaluating model: {model_name}")
+         log.info(f"Evaluating model: {model_name}")

-         model_results_path = os.path.join(base_results_path, model_name)
+         model_results_path = Path(base_results_path) / model_name

          # Collect all evaluation tasks
          tasks = self._collect_evaluation_tasks(model_results_path)
@@ -525,7 +516,7 @@ class ModelEvaluator:
                      if result:
                          model_results_data[result.test_case_id].append(result)
                  except Exception as e:
-                     logger.error(f"An error occurred during evaluation: {e}")
+                     log.error(f"An error occurred during evaluation: {e}")

          # Aggregate results by test case
          test_case_results = []
@@ -541,24 +532,24 @@ class ModelEvaluator:
          )

      def _collect_evaluation_tasks(
-         self, model_results_path: str
-     ) -> List[Tuple[int, str, Dict[str, Any], str]]:
+         self, model_results_path: Path
+     ) -> list[tuple[int, Path, dict[str, any], str]]:
          """Collect all evaluation tasks for the model."""
          tasks = []

          for test_case_path in self.config["test_cases"]:
              test_case = load_test_case(test_case_path)
-             test_case_id = test_case["test_case_id"]
-             test_case_results_path = os.path.join(model_results_path, test_case_id)
+             test_case_name = Path(test_case_path).stem.replace(".test", "")
+             test_case_results_path = model_results_path / test_case_name

              for i in range(1, self.config["runs"] + 1):
-                 run_path = os.path.join(test_case_results_path, f"run_{i}")
+                 run_path = test_case_results_path / f"run_{i}"
                  tasks.append((i, run_path, test_case, test_case_path))

          return tasks

      def _aggregate_test_case_results(
-         self, test_case_id: str, runs: List[EvaluationResult]
+         self, test_case_id: str, runs: list[EvaluationResult]
      ) -> TestCaseResults:
          """Aggregate results for a test case across multiple runs."""
          # Load test case to get category
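
A behavioral change hides in _collect_evaluation_tasks above: the per-test-case results directory is now derived from the test-case file name (Path(...).stem with a ".test" suffix stripped) rather than from the test_case_id field inside the file, so the two can diverge when the id does not match the file name. The stem logic, runnable in isolation; the file names and the ".test.json" convention are inferred from the replace() call, not confirmed by the diff:

    from pathlib import Path

    for test_case_path in ["cases/weather_query.test.json", "cases/sql_lookup.json"]:
        name = Path(test_case_path).stem.replace(".test", "")
        print(name)  # -> weather_query, sql_lookup
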
@@ -604,11 +595,11 @@ class ResultsWriter:

      def write_model_results(self, model_results: ModelResults, base_results_path: str):
          """Write model results to file."""
-         results_path = os.path.join(
-             base_results_path, model_results.model_name, "results.json"
+         results_path = (
+             Path(base_results_path) / model_results.model_name / "results.json"
          )
          self.file_service.save_json(model_results.to_dict(), results_path)
-         logger.info(
+         log.info(
              f"Results for model {model_results.model_name} written to {results_path}"
          )

@@ -623,10 +614,13 @@ class EvaluationOrchestrator:
      def run_evaluation(
          self,
          base_results_path: str,
-         model_execution_times: Optional[Dict[str, float]] = None,
+         model_execution_times: dict[str, float] | None = None,
      ):
          """Main entry point for the evaluation process."""
-         logger.info("--- Starting evaluation ---")
+         log.info("Starting evaluation")
+
+         # Resolve to an absolute path to ensure consistency
+         base_results_path = str(Path(base_results_path).resolve())

          if model_execution_times is None:
              model_execution_times = {}
@@ -634,32 +628,62 @@ class EvaluationOrchestrator:
          config = self.config_service.get_config()
          evaluation_settings = self.config_service.get_evaluation_settings()

-         model_evaluator = ModelEvaluator(config, evaluation_settings)
+         # Convert evaluation settings to dict format for backwards compatibility
+         settings_dict = {
+             "tool_match": {"enabled": evaluation_settings.tool_matching_enabled},
+             "response_match": {"enabled": evaluation_settings.response_matching_enabled},
+             "llm_evaluator": {
+                 "enabled": evaluation_settings.llm_evaluation_enabled,
+                 "env": evaluation_settings.llm_evaluator_environment.variables if evaluation_settings.llm_evaluator_environment else {}
+             }
+         }

-         for model_config in config["llm_models"]:
-             model_name = model_config["name"]
+         # Convert config to dict format for backwards compatibility
+         config_dict = {
+             "test_cases": config.test_case_files,
+             "runs": config.run_count
+         }

-             # Evaluate the model
-             model_results = model_evaluator.evaluate_model(
-                 model_name, base_results_path
-             )
+         model_evaluator = ModelEvaluator(config_dict, settings_dict)

-             # Add execution time if available
+         if config.remote:
+             # Handle remote evaluation
+             model_name = "remote"
+             model_results = model_evaluator.evaluate_model(model_name, base_results_path)
              execution_time = model_execution_times.get(model_name)
              if execution_time is not None:
                  model_results.total_execution_time = execution_time
-
-             # Write results to file
              self.results_writer.write_model_results(model_results, base_results_path)
+         else:
+             # Handle local evaluation
+             for model_config in config.model_configurations:
+                 model_name = model_config.name
+
+                 # Evaluate the model
+                 model_results = model_evaluator.evaluate_model(
+                     model_name, base_results_path
+                 )
+
+                 # Add execution time if available
+                 execution_time = model_execution_times.get(model_name)
+                 if execution_time is not None:
+                     model_results.total_execution_time = execution_time
+
+                 # Write results to file
+                 self.results_writer.write_model_results(model_results, base_results_path)

-         logger.info("--- Evaluation finished ---")
+         log.info("--- Evaluation finished ---")


- def main(config_path: str = "evaluation/test_suite_config.json"):
+ def main(config_path: str):
      """Main entry point for command-line usage."""
      orchestrator = EvaluationOrchestrator(config_path)
-     results_path = orchestrator.config_service.get_results_path()
-     orchestrator.run_evaluation(results_path)
+     # Results path should be based on the current working directory, not the package location.
+     # This main function is for standalone testing.
+     config = orchestrator.config_service.get_config()
+     results_path = Path.cwd() / "results" / config.results_directory
+     results_path.mkdir(parents=True, exist_ok=True)
+     orchestrator.run_evaluation(str(results_path))


  if __name__ == "__main__":
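
The reworked run_evaluation() above now dispatches on config.remote: a remote suite is evaluated once under the pseudo-model name "remote", while a local suite loops over config.model_configurations. A condensed, runnable sketch of that control flow; the dataclasses are stand-ins for the loader types in evaluation/shared, whose real fields we only see through this diff:

    from dataclasses import dataclass, field

    @dataclass
    class ModelConfig:
        name: str

    @dataclass
    class SuiteConfig:
        remote: bool = False
        model_configurations: list[ModelConfig] = field(default_factory=list)

    def run_evaluation(config: SuiteConfig, base_results_path: str) -> None:
        # Remote suites produce a single result set; local suites one per model.
        names = ["remote"] if config.remote else [m.name for m in config.model_configurations]
        for name in names:
            print(f"evaluating {name} -> {base_results_path}/{name}/results.json")

    run_evaluation(SuiteConfig(remote=True), "results/demo")
    run_evaluation(SuiteConfig(model_configurations=[ModelConfig("model_a")]), "results/demo")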