solace-agent-mesh 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of solace-agent-mesh might be problematic.

Files changed (180)
  1. solace_agent_mesh/agent/adk/callbacks.py +0 -5
  2. solace_agent_mesh/agent/adk/models/lite_llm.py +123 -8
  3. solace_agent_mesh/agent/adk/models/oauth2_token_manager.py +245 -0
  4. solace_agent_mesh/agent/protocol/event_handlers.py +40 -1
  5. solace_agent_mesh/agent/proxies/__init__.py +0 -0
  6. solace_agent_mesh/agent/proxies/a2a/__init__.py +3 -0
  7. solace_agent_mesh/agent/proxies/a2a/app.py +55 -0
  8. solace_agent_mesh/agent/proxies/a2a/component.py +1115 -0
  9. solace_agent_mesh/agent/proxies/a2a/config.py +140 -0
  10. solace_agent_mesh/agent/proxies/a2a/oauth_token_cache.py +104 -0
  11. solace_agent_mesh/agent/proxies/base/__init__.py +3 -0
  12. solace_agent_mesh/agent/proxies/base/app.py +99 -0
  13. solace_agent_mesh/agent/proxies/base/component.py +619 -0
  14. solace_agent_mesh/agent/proxies/base/config.py +85 -0
  15. solace_agent_mesh/agent/proxies/base/proxy_task_context.py +17 -0
  16. solace_agent_mesh/agent/sac/app.py +9 -3
  17. solace_agent_mesh/agent/sac/component.py +160 -8
  18. solace_agent_mesh/agent/tools/audio_tools.py +125 -8
  19. solace_agent_mesh/agent/tools/web_tools.py +10 -5
  20. solace_agent_mesh/agent/utils/artifact_helpers.py +141 -3
  21. solace_agent_mesh/assets/docs/404.html +3 -3
  22. solace_agent_mesh/assets/docs/assets/js/5c2bd65f.eda4bcb2.js +1 -0
  23. solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.f4b15f3b.js +1 -0
  24. solace_agent_mesh/assets/docs/assets/js/71da7b71.38583438.js +1 -0
  25. solace_agent_mesh/assets/docs/assets/js/77cf947d.48cb18a2.js +1 -0
  26. solace_agent_mesh/assets/docs/assets/js/924ffdeb.8095e148.js +1 -0
  27. solace_agent_mesh/assets/docs/assets/js/9e9d0a82.570c057b.js +1 -0
  28. solace_agent_mesh/assets/docs/assets/js/{ad71b5ed.60668e9e.js → ad71b5ed.af3ecfd1.js} +1 -1
  29. solace_agent_mesh/assets/docs/assets/js/ceb2a7a6.5d92d7d0.js +1 -0
  30. solace_agent_mesh/assets/docs/assets/js/{da0b5bad.9d369087.js → da0b5bad.d08a9466.js} +1 -1
  31. solace_agent_mesh/assets/docs/assets/js/db924877.e98d12a1.js +1 -0
  32. solace_agent_mesh/assets/docs/assets/js/de915948.27d6b065.js +1 -0
  33. solace_agent_mesh/assets/docs/assets/js/e6f9706b.e74a984d.js +1 -0
  34. solace_agent_mesh/assets/docs/assets/js/f284c35a.42f59cdd.js +1 -0
  35. solace_agent_mesh/assets/docs/assets/js/ff4d71f2.15b02f97.js +1 -0
  36. solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js → main.20feee82.js} +2 -2
  37. solace_agent_mesh/assets/docs/assets/js/runtime~main.0d198646.js +1 -0
  38. solace_agent_mesh/assets/docs/docs/documentation/components/agents/index.html +15 -4
  39. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/artifact-management/index.html +4 -4
  40. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/audio-tools/index.html +4 -4
  41. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/data-analysis-tools/index.html +4 -4
  42. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/embeds/index.html +4 -4
  43. solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/index.html +4 -4
  44. solace_agent_mesh/assets/docs/docs/documentation/components/cli/index.html +4 -4
  45. solace_agent_mesh/assets/docs/docs/documentation/components/gateways/index.html +4 -4
  46. solace_agent_mesh/assets/docs/docs/documentation/components/index.html +4 -4
  47. solace_agent_mesh/assets/docs/docs/documentation/components/orchestrator/index.html +4 -4
  48. solace_agent_mesh/assets/docs/docs/documentation/components/plugins/index.html +4 -4
  49. solace_agent_mesh/assets/docs/docs/documentation/components/proxies/index.html +262 -0
  50. solace_agent_mesh/assets/docs/docs/documentation/deploying/debugging/index.html +3 -3
  51. solace_agent_mesh/assets/docs/docs/documentation/deploying/deployment-options/index.html +31 -3
  52. solace_agent_mesh/assets/docs/docs/documentation/deploying/index.html +3 -3
  53. solace_agent_mesh/assets/docs/docs/documentation/deploying/observability/index.html +3 -3
  54. solace_agent_mesh/assets/docs/docs/documentation/developing/create-agents/index.html +4 -4
  55. solace_agent_mesh/assets/docs/docs/documentation/developing/create-gateways/index.html +5 -5
  56. solace_agent_mesh/assets/docs/docs/documentation/developing/creating-python-tools/index.html +4 -4
  57. solace_agent_mesh/assets/docs/docs/documentation/developing/creating-service-providers/index.html +4 -4
  58. solace_agent_mesh/assets/docs/docs/documentation/developing/evaluations/index.html +135 -0
  59. solace_agent_mesh/assets/docs/docs/documentation/developing/index.html +6 -4
  60. solace_agent_mesh/assets/docs/docs/documentation/developing/structure/index.html +4 -4
  61. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/bedrock-agents/index.html +4 -4
  62. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/custom-agent/index.html +4 -4
  63. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/event-mesh-gateway/index.html +5 -5
  64. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mcp-integration/index.html +4 -4
  65. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mongodb-integration/index.html +4 -4
  66. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rag-integration/index.html +4 -4
  67. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rest-gateway/index.html +4 -4
  68. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/slack-integration/index.html +4 -4
  69. solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/sql-database/index.html +4 -4
  70. solace_agent_mesh/assets/docs/docs/documentation/enterprise/index.html +3 -3
  71. solace_agent_mesh/assets/docs/docs/documentation/enterprise/installation/index.html +3 -3
  72. solace_agent_mesh/assets/docs/docs/documentation/enterprise/rbac-setup-guide/index.html +3 -3
  73. solace_agent_mesh/assets/docs/docs/documentation/enterprise/single-sign-on/index.html +3 -3
  74. solace_agent_mesh/assets/docs/docs/documentation/getting-started/architecture/index.html +3 -3
  75. solace_agent_mesh/assets/docs/docs/documentation/getting-started/index.html +3 -3
  76. solace_agent_mesh/assets/docs/docs/documentation/getting-started/introduction/index.html +3 -3
  77. solace_agent_mesh/assets/docs/docs/documentation/getting-started/try-agent-mesh/index.html +3 -3
  78. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/configurations/index.html +6 -5
  79. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/index.html +3 -3
  80. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/installation/index.html +3 -3
  81. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/large_language_models/index.html +100 -3
  82. solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/run-project/index.html +3 -3
  83. solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-gateway-upgrade-to-0.3.0/index.html +3 -3
  84. solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-technical-migration-map/index.html +3 -3
  85. solace_agent_mesh/assets/docs/lunr-index-1761165361160.json +1 -0
  86. solace_agent_mesh/assets/docs/lunr-index.json +1 -1
  87. solace_agent_mesh/assets/docs/search-doc-1761165361160.json +1 -0
  88. solace_agent_mesh/assets/docs/search-doc.json +1 -1
  89. solace_agent_mesh/assets/docs/sitemap.xml +1 -1
  90. solace_agent_mesh/cli/__init__.py +1 -1
  91. solace_agent_mesh/cli/commands/add_cmd/agent_cmd.py +2 -69
  92. solace_agent_mesh/cli/commands/eval_cmd.py +11 -49
  93. solace_agent_mesh/cli/commands/init_cmd/__init__.py +0 -5
  94. solace_agent_mesh/cli/commands/init_cmd/env_step.py +10 -12
  95. solace_agent_mesh/cli/commands/init_cmd/orchestrator_step.py +9 -61
  96. solace_agent_mesh/cli/commands/init_cmd/webui_gateway_step.py +9 -49
  97. solace_agent_mesh/cli/commands/plugin_cmd/add_cmd.py +1 -2
  98. solace_agent_mesh/client/webui/frontend/static/assets/{authCallback-DwrxZE0E.js → authCallback-BTf6dqwp.js} +1 -1
  99. solace_agent_mesh/client/webui/frontend/static/assets/{client-DarGQzyw.js → client-CaY59VuC.js} +1 -1
  100. solace_agent_mesh/client/webui/frontend/static/assets/main-BGTaW0uv.js +342 -0
  101. solace_agent_mesh/client/webui/frontend/static/assets/main-DHJKSW1S.css +1 -0
  102. solace_agent_mesh/client/webui/frontend/static/assets/{vendor-BKIeiHj_.js → vendor-BEmvJSYz.js} +1 -1
  103. solace_agent_mesh/client/webui/frontend/static/auth-callback.html +3 -3
  104. solace_agent_mesh/client/webui/frontend/static/index.html +4 -4
  105. solace_agent_mesh/common/a2a/__init__.py +24 -0
  106. solace_agent_mesh/common/a2a/artifact.py +39 -0
  107. solace_agent_mesh/common/a2a/events.py +29 -0
  108. solace_agent_mesh/common/a2a/message.py +68 -0
  109. solace_agent_mesh/common/a2a/protocol.py +73 -1
  110. solace_agent_mesh/common/agent_registry.py +83 -3
  111. solace_agent_mesh/common/constants.py +3 -1
  112. solace_agent_mesh/common/utils/pydantic_utils.py +12 -0
  113. solace_agent_mesh/config_portal/backend/common.py +1 -1
  114. solace_agent_mesh/config_portal/frontend/static/client/assets/_index-ByU1X1HD.js +98 -0
  115. solace_agent_mesh/config_portal/frontend/static/client/assets/{manifest-44d62be6.js → manifest-61038fc6.js} +1 -1
  116. solace_agent_mesh/config_portal/frontend/static/client/index.html +1 -1
  117. solace_agent_mesh/evaluation/evaluator.py +128 -104
  118. solace_agent_mesh/evaluation/message_organizer.py +116 -110
  119. solace_agent_mesh/evaluation/report_data_processor.py +84 -86
  120. solace_agent_mesh/evaluation/report_generator.py +73 -79
  121. solace_agent_mesh/evaluation/run.py +421 -235
  122. solace_agent_mesh/evaluation/shared/__init__.py +92 -0
  123. solace_agent_mesh/evaluation/shared/constants.py +47 -0
  124. solace_agent_mesh/evaluation/shared/exceptions.py +50 -0
  125. solace_agent_mesh/evaluation/shared/helpers.py +35 -0
  126. solace_agent_mesh/evaluation/shared/test_case_loader.py +167 -0
  127. solace_agent_mesh/evaluation/shared/test_suite_loader.py +280 -0
  128. solace_agent_mesh/evaluation/subscriber.py +111 -232
  129. solace_agent_mesh/evaluation/summary_builder.py +227 -117
  130. solace_agent_mesh/gateway/base/app.py +1 -1
  131. solace_agent_mesh/gateway/base/component.py +8 -1
  132. solace_agent_mesh/gateway/http_sse/alembic/versions/20251015_add_session_performance_indexes.py +70 -0
  133. solace_agent_mesh/gateway/http_sse/component.py +98 -2
  134. solace_agent_mesh/gateway/http_sse/dependencies.py +4 -4
  135. solace_agent_mesh/gateway/http_sse/main.py +2 -1
  136. solace_agent_mesh/gateway/http_sse/repository/chat_task_repository.py +12 -13
  137. solace_agent_mesh/gateway/http_sse/repository/feedback_repository.py +15 -18
  138. solace_agent_mesh/gateway/http_sse/repository/interfaces.py +25 -18
  139. solace_agent_mesh/gateway/http_sse/repository/session_repository.py +30 -26
  140. solace_agent_mesh/gateway/http_sse/repository/task_repository.py +35 -44
  141. solace_agent_mesh/gateway/http_sse/routers/agent_cards.py +4 -3
  142. solace_agent_mesh/gateway/http_sse/routers/artifacts.py +95 -203
  143. solace_agent_mesh/gateway/http_sse/routers/dto/responses/session_responses.py +4 -3
  144. solace_agent_mesh/gateway/http_sse/routers/sessions.py +2 -2
  145. solace_agent_mesh/gateway/http_sse/routers/tasks.py +33 -41
  146. solace_agent_mesh/gateway/http_sse/routers/visualization.py +17 -11
  147. solace_agent_mesh/gateway/http_sse/services/data_retention_service.py +4 -4
  148. solace_agent_mesh/gateway/http_sse/services/feedback_service.py +51 -43
  149. solace_agent_mesh/gateway/http_sse/services/session_service.py +20 -20
  150. solace_agent_mesh/gateway/http_sse/services/task_logger_service.py +8 -8
  151. solace_agent_mesh/gateway/http_sse/shared/base_repository.py +45 -71
  152. solace_agent_mesh/gateway/http_sse/shared/types.py +0 -18
  153. solace_agent_mesh/templates/gateway_config_template.yaml +0 -5
  154. solace_agent_mesh/templates/logging_config_template.ini +10 -6
  155. solace_agent_mesh/templates/plugin_gateway_config_template.yaml +0 -3
  156. solace_agent_mesh/templates/shared_config.yaml +40 -0
  157. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/METADATA +47 -21
  158. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/RECORD +162 -141
  159. solace_agent_mesh/assets/docs/assets/js/5c2bd65f.e49689dd.js +0 -1
  160. solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.39d5851d.js +0 -1
  161. solace_agent_mesh/assets/docs/assets/js/71da7b71.804d6567.js +0 -1
  162. solace_agent_mesh/assets/docs/assets/js/77cf947d.64c9bd6c.js +0 -1
  163. solace_agent_mesh/assets/docs/assets/js/9e9d0a82.dd810042.js +0 -1
  164. solace_agent_mesh/assets/docs/assets/js/db924877.cbc66f02.js +0 -1
  165. solace_agent_mesh/assets/docs/assets/js/de915948.139b4b9c.js +0 -1
  166. solace_agent_mesh/assets/docs/assets/js/e6f9706b.582a78ca.js +0 -1
  167. solace_agent_mesh/assets/docs/assets/js/f284c35a.5766a13d.js +0 -1
  168. solace_agent_mesh/assets/docs/assets/js/ff4d71f2.9c0297a6.js +0 -1
  169. solace_agent_mesh/assets/docs/assets/js/runtime~main.18dc45dd.js +0 -1
  170. solace_agent_mesh/assets/docs/lunr-index-1760121512891.json +0 -1
  171. solace_agent_mesh/assets/docs/search-doc-1760121512891.json +0 -1
  172. solace_agent_mesh/client/webui/frontend/static/assets/main-2nd1gbaH.js +0 -339
  173. solace_agent_mesh/client/webui/frontend/static/assets/main-DoKXctCM.css +0 -1
  174. solace_agent_mesh/config_portal/frontend/static/client/assets/_index-BNuqpWDc.js +0 -98
  175. solace_agent_mesh/evaluation/config_loader.py +0 -657
  176. solace_agent_mesh/evaluation/test_case_loader.py +0 -714
  177. /solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js.LICENSE.txt → main.20feee82.js.LICENSE.txt} +0 -0
  178. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/WHEEL +0 -0
  179. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/entry_points.txt +0 -0
  180. {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,64 +4,95 @@ This module orchestrates the evaluation of AI models against test cases.
 """
 
 import json
+import logging
+import mimetypes
 import os
+import shutil
+import subprocess
 import sys
+import threading
 import time
-import subprocess
-import requests
 import uuid
-import shutil
-import mimetypes
-import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from importlib import metadata
 from pathlib import Path
 
-from typing import Dict, List, Optional, Tuple, Any
-from dataclasses import dataclass
+import click
+import requests
 from dotenv import load_dotenv
-from .config_loader import ConfigLoader
-from .message_organizer import MessageOrganizer
-from .summary_builder import SummaryBuilder
-from .subscriber import Subscriber
+
 from .evaluator import EvaluationOrchestrator
+from .message_organizer import MessageOrganizer
 from .report_generator import ReportGenerator
+from .shared import (
+    DEFAULT_STARTUP_WAIT_TIME,
+    DEFAULT_TEST_TIMEOUT,
+    EVALUATION_DIR,
+    MAX_ARTIFACT_SIZE_MB,
+    EvaluationConfigLoader,
+    TestSuiteConfiguration,
+    get_local_base_url,
+)
+from .subscriber import Subscriber
+from .summary_builder import SummaryBuilder
 
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+log = logging.getLogger(__name__)
 
 
-@dataclass
-class EvaluationConfig:
-    """Centralized configuration with validation and defaults."""
+def _error_exit(message: str):
+    """Logs an error message and exits."""
+    log.error(message)
+    sys.exit(1)
 
-    # Constants
-    DEFAULT_STARTUP_WAIT_TIME = 60
-    DEFAULT_TEST_TIMEOUT = 60
 
-    def __init__(self, config_data: Dict[str, Any]):
-        load_dotenv()
-        host = os.getenv("REST_API_HOST", "0.0.0.0")
-        port = os.getenv("REST_API_PORT", "8080")
-        self.API_BASE_URL = f"http://{host}:{port}/api/v2"
-        self.config_data = config_data
-        self.agents = config_data.get("agents", [])
-        self.test_cases = config_data.get("test_cases", [])
-        self.llm_models = config_data.get("llm_models", [])
-        self.runs = config_data.get("runs", 1)
-        self.results_dir_name = config_data.get("results_dir_name", "tests")
-
-        self._validate_config()
-
-    def _validate_config(self):
-        """Validate required configuration fields."""
-        if not self.agents:
-            raise ValueError("'agents' configuration is required and cannot be empty")
-        if not self.test_cases:
-            raise ValueError(
-                "'test_cases' configuration is required and cannot be empty"
-            )
-        if not self.llm_models:
-            raise ValueError(
-                "'llm_models' configuration is required and cannot be empty"
+def _ensure_eval_backend_config_exists():
+    """Checks for eval_backend.yaml and creates it from a template if missing."""
+    project_root = Path.cwd()
+    configs_dir = project_root / "configs"
+    eval_backend_config_path = configs_dir / "eval_backend.yaml"
+
+    if eval_backend_config_path.exists():
+        return
+
+    click.echo(
+        f"'{eval_backend_config_path.relative_to(project_root)}' not found. Creating it..."
+    )
+
+    if not (configs_dir / "shared_config.yaml").exists():
+        _error_exit(
+            "Error: 'configs/shared_config.yaml' not found. Please run 'sam init' first."
+        )
+
+    try:
+        # This is a simplified way to get the template content.
+        # In a real CLI, you'd use a more robust method like `importlib.resources`.
+        template_path = Path(__file__).parent.parent / "templates" / "eval_backend_template.yaml"
+        with open(template_path, encoding="utf-8") as f:
+            template_content = f.read()
+
+        with open(eval_backend_config_path, "w", encoding="utf-8") as f:
+            f.write(template_content)
+        click.echo(
+            click.style(
+                f"Successfully created '{eval_backend_config_path.relative_to(project_root)}'.",
+                fg="green",
             )
+        )
+    except Exception as e:
+        _error_exit(f"Failed to create eval_backend.yaml: {e}")
+
+
+def _ensure_sam_rest_gateway_installed():
+    """Checks if the sam-rest-gateway package is installed for local evaluation."""
+    try:
+        metadata.distribution("sam-rest-gateway")
+    except metadata.PackageNotFoundError:
+        _error_exit(
+            "Error: 'sam-rest-gateway' is not installed. "
+            "Please install it using: "
+            'pip install "sam-rest-gateway @ git+https://github.com/SolaceLabs/solace-agent-mesh-core-plugins#subdirectory=sam-rest-gateway"'
+        )
 
 
 @dataclass
@@ -70,7 +101,7 @@ class TestRun:
 
     agent: str
     query: str
-    artifacts: List[str]
+    artifacts: list[str]
     wait_time: int
     test_case_file: str
     run_num: int
@@ -78,192 +109,220 @@
     @property
     def test_case_id(self) -> str:
         """Extract test case ID from filename."""
-        base_name = os.path.basename(self.test_case_file)
-        return os.path.splitext(base_name)[0].replace(".test", "")
+        return Path(self.test_case_file).stem.replace(".test", "")
 
 
 class ProcessManager:
     """Manages subprocess lifecycle for the Solace AI Connector."""
 
-    def __init__(self, config: EvaluationConfig, verbose: bool = False):
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
-        self.process: Optional[subprocess.Popen] = None
-        self.namespace: Optional[str] = None
+        self.process: subprocess.Popen | None = None
+        self.namespace: str | None = None
         self.verbose = verbose
 
-    def start_services(self) -> Tuple[subprocess.Popen, str]:
+    def start_services(self) -> tuple[subprocess.Popen, str]:
         """Start the Solace AI Connector and return process and namespace."""
         load_dotenv()
         self.namespace = f"eval-{uuid.uuid4()}"
         os.environ["NAMESPACE"] = self.namespace
 
-        agent_files = self.config.agents
+        # Set broker environment variables from the required configuration
+        log.info("Setting broker configuration from test suite...")
+        for key, value in self.config.broker.dict().items():
+            if value is not None:
+                env_key = f"SOLACE_BROKER_{key.upper()}"
+                os.environ[env_key] = str(value)
+                log.info(f" - Set {env_key}")
+
+        agent_files = self.config.agent_configs
 
         command = [sys.executable, "-m", "solace_ai_connector.main", *agent_files]
 
-        print("Starting Solace AI Connector as a subprocess...")
-        project_root = os.path.abspath(os.path.join(SCRIPT_DIR, ".."))
+        log.info("Starting Solace AI Connector as a subprocess...")
+        project_root = Path(EVALUATION_DIR).parent.resolve()
 
         self.process = subprocess.Popen(
             command, stdout=sys.stdout, stderr=sys.stderr, cwd=project_root
         )
 
-        print("Waiting for server to become healthy...")
-        self._wait_for_server_ready()
+        log.info("Waiting for server to become healthy...")
+        self._wait_for_server_ready(get_local_base_url())
 
         return self.process, self.namespace
 
-    def _wait_for_server_ready(self):
+    def _wait_for_server_ready(self, base_url: str):
         """Poll the health endpoint until the server is ready."""
         start_time = time.time()
-        health_url = f"{self.config.API_BASE_URL.replace('/api/v2', '')}/health"
+        health_url = f"{base_url}/health"
 
-        while time.time() - start_time < self.config.DEFAULT_STARTUP_WAIT_TIME:
+        while time.time() - start_time < DEFAULT_STARTUP_WAIT_TIME:
             try:
                 response = requests.get(health_url)
                 if response.status_code == 200:
-                    print("Server is healthy.")
-                    time.sleep(1)  # Wait an extra second as requested
+                    log.info("Server is healthy.")
+                    time.sleep(5)
                     return
             except requests.ConnectionError:
                 # Server is not yet available, wait and retry
                 time.sleep(1)
             except Exception as e:
-                print(f"An unexpected error occurred during health check: {e}")
+                log.error(f"An unexpected error occurred during health check: {e}")
                 time.sleep(1)
 
         raise RuntimeError(
-            f"Server did not become healthy within {self.config.DEFAULT_STARTUP_WAIT_TIME} seconds."
+            f"Server did not become healthy within {DEFAULT_STARTUP_WAIT_TIME} seconds."
        )
 
-    def stop_services(self, subscriber: Optional[Subscriber] = None):
+    def stop_services(self, subscriber: Subscriber | None = None):
         """Clean up running processes."""
         if subscriber:
-            print("--- Terminating subscriber ---")
+            log.info("Terminating subscriber")
             subscriber.stop()
             subscriber.join()
-            print("Subscriber terminated.")
+            log.info("Subscriber terminated.")
 
         if self.process:
-            print("--- Terminating subprocess ---")
+            log.info("Terminating subprocess")
             self.process.terminate()
             try:
                 self.process.wait(timeout=5)
-                print("Subprocess terminated.")
+                log.info("Subprocess terminated.")
             except subprocess.TimeoutExpired:
-                print("Subprocess did not terminate gracefully, killing.")
+                log.info("Subprocess did not terminate gracefully, killing.")
                 self.process.kill()
 
-        print("Process cleanup completed.")
+        log.info("Process cleanup completed.")
 
 
 class TaskService:
     """Handles task submission and tracking."""
 
-    def __init__(self, config: EvaluationConfig, verbose: bool = False):
-        self.config = config
-        self.base_url = config.API_BASE_URL
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.verbose = verbose
+        self.config = config
+        if config.remote:
+            self.base_url = config.remote.environment.get("EVAL_REMOTE_URL")
+        else:
+            self.base_url = get_local_base_url()
 
     def submit_task(
-        self, agent_name: str, message: str, artifact_paths: Optional[List[str]] = None
-    ) -> Optional[str]:
+        self, agent_name: str, message: str, artifact_paths: list[str] | None = None
+    ) -> str | None:
         """Submit a test case to the agent and return the task ID."""
-        print("--- Sending test request ---")
-        url = f"{self.base_url}/tasks"
+        log.info("Sending test request")
+        url = f"{self.base_url}/api/v2/tasks"
         data = {
             "agent_name": agent_name,
             "prompt": message,
         }
 
+        headers = {}
+        if self.config.remote:
+            auth_token = self.config.remote.environment.get("EVAL_AUTH_TOKEN")
+            if auth_token:
+                headers["Authorization"] = f"Bearer {auth_token}"
+
         files_to_upload = []
         if artifact_paths:
             files_to_upload = self._prepare_file_uploads(artifact_paths)
 
         try:
             with requests.Session() as session:
-                response = session.post(url, data=data, files=files_to_upload)
+                response = session.post(url, data=data, files=files_to_upload, headers=headers)
 
             response.raise_for_status()
             task_id = response.json()["taskId"]
-            print(f"Task submitted with ID: {task_id}")
+            log.info(f"Task submitted with ID: {task_id}")
             return task_id
 
         except requests.RequestException as e:
-            print(f"Failed to submit task: {e}")
+            log.error(f"Failed to submit task: {e}")
             return None
         finally:
             self._close_file_uploads(files_to_upload)
 
-    def _prepare_file_uploads(self, artifact_paths: List[str]) -> List[Tuple]:
+    def _prepare_file_uploads(self, artifact_paths: list[str]) -> list[tuple]:
         """Prepare file uploads for the request."""
         files_to_upload = []
-        for path in artifact_paths:
+        for path_str in artifact_paths:
+            path = Path(path_str)
+            # Check file size before reading
+            try:
+                file_size_mb = path.stat().st_size / (1024 * 1024)
+                if file_size_mb > MAX_ARTIFACT_SIZE_MB:
+                    log.warning(
+                        f"Artifact '{path.name}' is {file_size_mb:.2f} MB, "
+                        f"which is larger than the recommended maximum of {MAX_ARTIFACT_SIZE_MB} MB. "
+                        "This may cause memory issues."
+                    )
+            except OSError as e:
+                log.error(f"Could not get size of artifact {path}: {e}")
+                continue
+
             mimetype, _ = mimetypes.guess_type(path)
             if mimetype is None:
                 mimetype = "text/plain"
-            files_to_upload.append(
-                ("files", (os.path.basename(path), open(path, "rb"), mimetype))
-            )
+            # Read file content with context manager
+            with path.open("rb") as f:
+                file_content = f.read()
+            files_to_upload.append(("files", (path.name, file_content, mimetype)))
         return files_to_upload
 
-    def _close_file_uploads(self, files_to_upload: List[Tuple]):
-        """Close file handles after upload."""
-        for _, file_tuple in files_to_upload:
-            file_tuple[1].close()
+    def _close_file_uploads(self, files_to_upload: list[tuple]):
+        """Close file handles after upload (no longer needed)."""
+        # No longer needed
+        pass
 
 
 class FileService:
     """Handles file operations and path management."""
 
     @staticmethod
-    def ensure_directory(path: str):
+    def ensure_directory(path: Path):
         """Ensure directory exists, create if necessary."""
-        os.makedirs(path, exist_ok=True)
+        path.mkdir(parents=True, exist_ok=True)
 
     @staticmethod
-    def remove_directory(path: str):
+    def remove_directory(path: Path):
         """Remove directory and all contents."""
-        if os.path.exists(path):
+        if path.exists():
             shutil.rmtree(path)
 
     @staticmethod
-    def save_json(data: Any, filepath: str):
+    def save_json(data: any, filepath: Path):
         """Save data as JSON to file."""
-        with open(filepath, "w") as f:
+        with filepath.open("w") as f:
             json.dump(data, f, indent=4)
 
     @staticmethod
-    def load_json(filepath: str) -> Any:
+    def load_json(filepath: Path) -> any:
         """Load JSON data from file."""
-        with open(filepath, "r") as f:
+        with filepath.open() as f:
             return json.load(f)
 
 
 class TestRunBuilder:
     """Builds test run configurations from test cases."""
 
-    def __init__(self, config: EvaluationConfig):
+    def __init__(self, config: TestSuiteConfiguration):
         self.config = config
 
-    def build_test_runs(self) -> List[TestRun]:
+    def build_test_runs(self) -> list[TestRun]:
         """Build all test runs from configuration."""
         test_runs = []
 
-        for test_case_path in self.config.test_cases:
-            test_case = FileService.load_json(test_case_path)
+        for test_case_path in self.config.test_case_files:
+            test_case = FileService.load_json(Path(test_case_path))
 
             artifact_paths = self._get_artifact_paths(test_case, test_case_path)
 
-            test_case_file = os.path.basename(test_case_path)
-            for run_num in range(1, self.config.runs + 1):
+            for run_num in range(1, self.config.run_count + 1):
                 test_run = TestRun(
                     agent=test_case["target_agent"],
                     query=test_case["query"],
                     artifacts=artifact_paths,
-                    wait_time=test_case.get(
-                        "wait_time", self.config.DEFAULT_TEST_TIMEOUT
-                    ),
+                    wait_time=test_case.get("wait_time", DEFAULT_TEST_TIMEOUT),
                     test_case_file=test_case_path,
                     run_num=run_num,
                 )
@@ -271,14 +330,14 @@ class TestRunBuilder:
 
         return test_runs
 
-    def _get_artifact_paths(self, test_case: Dict, test_case_path: str) -> List[str]:
+    def _get_artifact_paths(self, test_case: dict, test_case_path: str) -> list[str]:
         """Extract artifact paths from test case."""
         artifact_paths = []
         if "artifacts" in test_case:
-            test_case_dir = os.path.dirname(test_case_path)
+            test_case_dir = Path(test_case_path).parent
             for artifact in test_case["artifacts"]:
                 if artifact.get("type") == "file":
-                    artifact_paths.append(os.path.join(test_case_dir, artifact["path"]))
+                    artifact_paths.append(str(test_case_dir / artifact["path"]))
         return artifact_paths
 
 
@@ -293,13 +352,14 @@ class TestExecutor:
     def execute_test(
         self,
         test_run: TestRun,
-        model_results_path: str,
-        task_mappings: Dict[str, str],
+        model_results_path: Path,
+        task_mappings: dict[str, str],
         subscriber: Subscriber,
+        task_mappings_lock: threading.Lock,
     ) -> bool:
         """Execute a single test case and wait for completion."""
-        print(
-            f"--- Starting test: {test_run.test_case_file} (run {test_run.run_num}) ---"
+        log.info(
+            f"Starting test: {test_run.test_case_file} (run {test_run.run_num})"
        )
 
        # Submit the task
@@ -308,25 +368,23 @@
         )
 
         if not task_id:
-            print(
+            log.error(
                 f"Failed to start test case: {test_run.test_case_file} (run {test_run.run_num})"
             )
             return False
 
         # Set up result directory
-        run_dir = os.path.join(
-            model_results_path, test_run.test_case_id, f"run_{test_run.run_num}"
-        )
+        test_case_name = Path(test_run.test_case_file).stem.replace(".test", "")
+        run_dir = model_results_path / test_case_name / f"run_{test_run.run_num}"
         self.file_service.ensure_directory(run_dir)
 
         # Save test case path for summary builder
         test_info = {"path": test_run.test_case_file}
-        self.file_service.save_json(
-            test_info, os.path.join(run_dir, "test_case_info.json")
-        )
+        self.file_service.save_json(test_info, run_dir / "test_case_info.json")
 
         # Track the task
-        task_mappings[task_id] = run_dir
+        with task_mappings_lock:
+            task_mappings[task_id] = str(run_dir)
         subscriber.active_tasks.add(task_id)
 
         # Wait for completion
@@ -336,26 +394,26 @@
         self, task_id: str, wait_time: int, subscriber: Subscriber
     ) -> bool:
         """Wait for task completion with timeout."""
-        print(
+        log.info(
             f"Waiting for task {task_id} to complete (timeout: {wait_time} seconds)..."
         )
 
         start_time = time.time()
         while task_id in subscriber.active_tasks:
             if time.time() - start_time > wait_time:
-                print(f"Task {task_id} timed out after {wait_time} seconds")
+                log.warning(f"Task {task_id} timed out after {wait_time} seconds")
                 subscriber.active_tasks.discard(task_id)
                 return False
             time.sleep(1)
 
-        print(f"Task {task_id} completed successfully")
+        log.info(f"Task {task_id} completed successfully")
         return True
 
 
 class ModelEvaluator:
     """Handles the evaluation of a single model."""
 
-    def __init__(self, config: EvaluationConfig, verbose: bool = False):
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
         self.process_manager = ProcessManager(config, verbose=verbose)
         self.task_service = TaskService(config, verbose=verbose)
@@ -363,20 +421,21 @@
         self.test_builder = TestRunBuilder(config)
         self.test_executor = TestExecutor(self.task_service, self.file_service, verbose=verbose)
         self.verbose = verbose
+        self._task_mappings_lock = threading.Lock()
 
     def evaluate_model(
-        self, model_config: Dict[str, Any], base_results_path: str
+        self, model_config: dict[str, any], base_results_path: Path
     ) -> float:
         """Evaluate a single model and return execution time."""
-        model_name = model_config["name"]
-        print(f"--- Starting evaluation for model: {model_name} ---")
+        model_name = model_config.name
+        log.info(f"Starting evaluation for model: {model_name}")
         start_time = time.time()
 
         # Set environment variables for the model
         self._set_model_environment(model_config)
 
         # Set up paths
-        model_results_path = os.path.join(base_results_path, model_name)
+        model_results_path = base_results_path / model_name
         self.file_service.ensure_directory(model_results_path)
 
         # Start services
@@ -388,10 +447,10 @@
         try:
             # Execute tests
             successful_tests = self._execute_all_tests(model_results_path, subscriber)
-            print(f"--- Completed {successful_tests} tests successfully ---")
+            log.info(f"Completed {successful_tests} tests successfully")
 
         except Exception as e:
-            print(f"Error during test case execution for model {model_name}: {e}")
+            log.error(f"Error during test case execution for model {model_name}: {e}")
         finally:
             # Cleanup
             task_mappings = getattr(self, "_task_mappings", {})
@@ -401,52 +460,84 @@
 
         end_time = time.time()
         execution_time = end_time - start_time
-        print(
-            f"--- Evaluation for model: {model_name} complete in {execution_time:.2f} seconds ---"
+        log.info(
+            f"Evaluation for model: {model_name} complete in {execution_time:.2f} seconds"
         )
 
         return execution_time
 
-    def _set_model_environment(self, model_config: Dict[str, Any]):
+    def _set_model_environment(self, model_config: dict[str, any]):
         """Set environment variables for the model."""
-        for key, value in model_config.get("env", {}).items():
-            os.environ[key] = value
+        for key, value in model_config.environment.variables.items():
+            if value is not None:
+                os.environ[key] = value
 
-    def _setup_subscriber(self, namespace: str, model_results_path: str) -> Subscriber:
+    def _setup_subscriber(self, namespace: str, model_results_path: Path) -> Subscriber:
         """Set up and start the subscriber."""
         subscription_ready_event = threading.Event()
         subscriber = Subscriber(
-            namespace, set(), None, subscription_ready_event, model_results_path
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            model_results_path,
         )
         subscriber.start()
 
-        print("Waiting for subscriber to be ready...")
+        log.info("Waiting for subscriber to be ready...")
         subscription_ready_event.wait()
-        print("Subscriber is ready.")
+        log.info("Subscriber is ready.")
 
         return subscriber
 
     def _execute_all_tests(
-        self, model_results_path: str, subscriber: Subscriber
+        self, model_results_path: Path, subscriber: Subscriber
     ) -> int:
-        """Execute all test cases and return count of successful tests."""
+        """Execute all test cases in parallel and return count of successful tests."""
         test_runs = self.test_builder.build_test_runs()
 
         self._task_mappings = {}
         total_tests = len(test_runs)
         successful_tests = 0
 
-        print(f"--- Starting sequential execution of {total_tests} tests ---")
+        log.info(
+            f"Starting parallel execution of {total_tests} tests with {self.config.workers} workers."
+        )
 
-        for i, test_run in enumerate(test_runs, 1):
-            print(f"--- Test {i}/{total_tests} ---")
-            success = self.test_executor.execute_test(
-                test_run, model_results_path, self._task_mappings, subscriber
-            )
-            if success:
-                successful_tests += 1
-            else:
-                print(f"Test {i} failed or timed out")
+        with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+            # Create a dictionary to map futures to their test_run
+            future_to_run = {
+                executor.submit(
+                    self.test_executor.execute_test,
+                    test_run,
+                    model_results_path,
+                    self._task_mappings,
+                    subscriber,
+                    self._task_mappings_lock,  # Pass the lock to the worker
+                ): test_run
+                for test_run in test_runs
+            }
+
+            # Process results as they complete
+            for i, future in enumerate(as_completed(future_to_run), 1):
+                test_run = future_to_run[future]
+                log.info(
+                    f"Processing result for test {i}/{total_tests}: {test_run.test_case_id}"
+                )
+                try:
+                    success = future.result()
+                    if success:
+                        successful_tests += 1
+                    else:
+                        log.warning(
+                            f"Test {test_run.test_case_id} (run {test_run.run_num}) failed or timed out."
+                        )
+                except Exception as e:
+                    log.error(
+                        f"Test {test_run.test_case_id} (run {test_run.run_num}) generated an exception: {e}",
+                        exc_info=True,
+                    )
 
         return successful_tests
 
@@ -454,16 +545,16 @@
         self,
         app_process: subprocess.Popen,
         subscriber: Subscriber,
-        model_results_path: str,
-        task_mappings: Dict[str, str],
+        model_results_path: Path,
+        task_mappings: dict[str, str],
     ):
         """Clean up after model evaluation."""
         self.process_manager.stop_services(subscriber)
 
         # Save task mappings
-        mappings_file = os.path.join(model_results_path, "task_mappings.json")
+        mappings_file = model_results_path / "task_mappings.json"
         self.file_service.save_json(task_mappings, mappings_file)
-        print(f"Task mappings saved to {mappings_file}")
+        log.info(f"Task mappings saved to {mappings_file}")
 
 
 class ResultsProcessor:
@@ -471,52 +562,49 @@ class ResultsProcessor:
 
     def __init__(self, file_service: FileService, verbose: bool = False):
         self.file_service = file_service
-        self.summary_builder = SummaryBuilder()
+        self.summary_builder: SummaryBuilder | None = None
         self.verbose = verbose
 
-    def summarize_results(self, base_results_path: str):
+    def summarize_results(self, base_results_path: Path, config: TestSuiteConfiguration):
         """Generate summaries for all test results."""
-        print("--- Summarizing results ---")
+        log.info("Summarizing results")
 
-        for model_name in os.listdir(base_results_path):
-            model_path = os.path.join(base_results_path, model_name)
-            if not os.path.isdir(model_path):
-                continue
+        self.summary_builder = SummaryBuilder(config)
 
+        for model_path in base_results_path.iterdir():
+            if not model_path.is_dir():
+                continue
             self._process_model_results(model_path)
 
-    def _process_model_results(self, model_path: str):
+    def _process_model_results(self, model_path: Path):
         """Process results for a single model."""
-        for test_case_name in os.listdir(model_path):
-            test_case_path = os.path.join(model_path, test_case_name)
-            if not os.path.isdir(test_case_path):
+        for test_case_path in model_path.iterdir():
+            if not test_case_path.is_dir():
                 continue
-
             self._process_test_case_results(test_case_path)
 
-    def _process_test_case_results(self, test_case_path: str):
+    def _process_test_case_results(self, test_case_path: Path):
         """Process results for a single test case."""
-        for run_name in os.listdir(test_case_path):
-            run_path = os.path.join(test_case_path, run_name)
-            if not os.path.isdir(run_path):
+        for run_path in test_case_path.iterdir():
+            if not run_path.is_dir():
                 continue
 
-            messages_file = os.path.join(run_path, "messages.json")
-            if os.path.exists(messages_file):
+            messages_file = run_path / "messages.json"
+            if messages_file.exists():
                 summary_data = self.summary_builder.summarize_run(messages_file)
-                summary_file = os.path.join(run_path, "summary.json")
+                summary_file = run_path / "summary.json"
                 self.file_service.save_json(summary_data, summary_file)
-                print(f"Summary created for {run_path}")
+                log.info(f"Summary created for {run_path}")
 
 
 class EvaluationRunner:
     """Main orchestrator that coordinates the entire evaluation process."""
 
     def __init__(self, verbose: bool = False):
-        self.config: Optional[EvaluationConfig] = None
+        self.config: TestSuiteConfiguration | None = None
         self.file_service = FileService()
         self.results_processor = ResultsProcessor(self.file_service, verbose=verbose)
-        self.report_generator: Optional[ReportGenerator] = None
+        self.report_generator: ReportGenerator | None = None
         self.verbose = verbose
 
     def run_evaluation(self, config_path: str):
@@ -528,91 +616,188 @@
             self._load_configuration(config_path)
 
             # Set up results directory in the current working directory
-            base_results_path = Path.cwd() / "results" / self.config.results_dir_name
+            base_results_path = Path.cwd() / "results" / self.config.results_directory
             self._setup_results_directory(base_results_path)
 
             # Run model evaluations
-            model_execution_times = self._evaluate_all_models(str(base_results_path))
+            if self.config.remote:
+                model_execution_times = self._run_remote_evaluation(base_results_path)
+            else:
+                model_execution_times = self._run_local_evaluation(base_results_path)
 
             # Post-process results
             self._post_process_results(
-                str(base_results_path), model_execution_times, config_path
+                base_results_path, model_execution_times, config_path
             )
 
             # Save overall statistics
-            self._save_execution_stats(str(base_results_path), start_time)
+            self._save_execution_stats(base_results_path, start_time)
 
             # Generate reports
             self._generate_reports(config_path, base_results_path)
 
-            # Display verbose summary if enabled
-            if self.verbose:
-                self._display_verbose_summary(base_results_path)
+            # Display summary
+            self._display_summary(base_results_path)
 
         except Exception as e:
-            print(f"Evaluation failed: {e}")
+            log.error(f"Evaluation failed: {e}")
             raise
 
     def _load_configuration(self, config_path: str):
         """Load and validate the evaluation configuration."""
-        config_loader = ConfigLoader(config_path)
-        config_data = config_loader.load_config()
-        self.config = EvaluationConfig(config_data)
+        config_loader = EvaluationConfigLoader(config_path)
+        self.config = config_loader.load_configuration()
         self.report_generator = ReportGenerator(config_path)
-        print("Configuration loaded and validated successfully.")
+        log.info("Configuration loaded and validated successfully.")
 
     def _setup_results_directory(self, base_results_path: Path):
         """Set up the results directory."""
         # Clean up existing results
-        self.file_service.remove_directory(str(base_results_path))
-        self.file_service.ensure_directory(str(base_results_path))
+        self.file_service.remove_directory(base_results_path)
+        self.file_service.ensure_directory(base_results_path)
 
-        print(f"Results directory set up at: {base_results_path}")
+        log.info(f"Results directory set up at: {base_results_path}")
 
-    def _evaluate_all_models(self, base_results_path: str) -> Dict[str, float]:
-        """Evaluate all configured models."""
+    def _run_local_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run the full local evaluation with service management."""
+        _ensure_eval_backend_config_exists()
+        _ensure_sam_rest_gateway_installed()
+        log.info("Starting local evaluation")
         model_execution_times = {}
 
-        for model_config in self.config.llm_models:
+        # This loop iterates through the models defined in the config
+        for model_config in self.config.model_configurations:
+            # ModelEvaluator manages the lifecycle of local services for each model
             model_evaluator = ModelEvaluator(self.config, verbose=self.verbose)
             execution_time = model_evaluator.evaluate_model(
                 model_config, base_results_path
             )
-            model_execution_times[model_config["name"]] = execution_time
+            model_execution_times[model_config.name] = execution_time
 
         return model_execution_times
 
+    def _run_remote_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run evaluation against a remote endpoint in parallel."""
+        remote_url = self.config.remote.environment.get("EVAL_REMOTE_URL")
+        log.info(f"Starting remote evaluation against: {remote_url}")
+        start_time = time.time()
+
+        # Check if the remote server is healthy before proceeding
+        process_manager = ProcessManager(self.config, self.verbose)
+        process_manager._wait_for_server_ready(remote_url)
+
+        # Instantiate services with the remote configuration
+        task_service = TaskService(self.config, self.verbose)
+        test_builder = TestRunBuilder(self.config)
+        test_executor = TestExecutor(task_service, self.file_service, self.verbose)
+
+        # In remote mode, there's no model loop. We create a single "remote" results directory.
+        remote_results_path = base_results_path / "remote"
+        self.file_service.ensure_directory(remote_results_path)
+
+        # The subscriber needs to be configured for remote use.
+        subscriber = self._setup_remote_subscriber(str(remote_results_path))
+
+        task_mappings = {}
+        try:
+            test_runs = test_builder.build_test_runs()
+            successful_tests = 0
+            task_mappings_lock = threading.Lock()
+
+            log.info(
+                f"Starting parallel execution of {len(test_runs)} remote tests with {self.config.workers} workers."
+            )
+
+            with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+                future_to_run = {
+                    executor.submit(
+                        test_executor.execute_test,
+                        test_run,
+                        remote_results_path,
+                        task_mappings,
+                        subscriber,
+                        task_mappings_lock,
+                    ): test_run
+                    for test_run in test_runs
+                }
+
+                for i, future in enumerate(as_completed(future_to_run), 1):
+                    test_run = future_to_run[future]
+                    log.info(
+                        f"Processing result for remote test {i}/{len(test_runs)}: {test_run.test_case_id}"
+                    )
+                    try:
+                        success = future.result()
+                        if success:
+                            successful_tests += 1
+                    except Exception as e:
+                        log.error(
+                            f"Remote test {test_run.test_case_id} generated an exception: {e}",
+                            exc_info=True,
+                        )
+
+            log.info(f"Completed {successful_tests} remote tests successfully")
+
+        finally:
+            if subscriber:
+                subscriber.stop()
+                subscriber.join()
+
+            # Save task mappings for remote run
+            mappings_file = remote_results_path / "task_mappings.json"
+            self.file_service.save_json(task_mappings, mappings_file)
+
+        execution_time = time.time() - start_time
+        return {"remote": execution_time}
+
+    def _setup_remote_subscriber(self, results_path: str) -> Subscriber:
+        """Set up a subscriber for remote evaluation."""
+        subscription_ready_event = threading.Event()
+        namespace = self.config.remote.environment.get("EVAL_NAMESPACE")
+        subscriber = Subscriber(
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            results_path,
+        )
+        subscriber.start()
+        subscription_ready_event.wait()
+        log.info("Remote subscriber is ready.")
+        return subscriber
+
     def _post_process_results(
         self,
-        base_results_path: str,
-        model_execution_times: Dict[str, float],
+        base_results_path: Path,
+        model_execution_times: dict[str, float],
         config_path: str,
     ):
         """Post-process evaluation results."""
         # Categorize messages using the refactored categorizer
-        print("--- Categorizing messages ---")
+        log.info("Categorizing messages")
         message_organizer = MessageOrganizer()
-        categorization_results = message_organizer.categorize_all_messages(
-            base_results_path
-        )
-        print("--- Message categorization finished ---")
+        message_organizer.categorize_all_messages(base_results_path)
+        log.info("Message categorization finished")
 
         # Generate summaries
-        self.results_processor.summarize_results(base_results_path)
+        self.results_processor.summarize_results(base_results_path, self.config)
 
         # Run evaluation
-        print("--- Starting evaluation of results ---")
+        log.info("Starting evaluation of results")
         evaluation_orchestrator = EvaluationOrchestrator(config_path)
-        evaluation_orchestrator.run_evaluation(base_results_path, model_execution_times)
-        print("--- Evaluation of results finished ---")
+        evaluation_orchestrator.run_evaluation(
+            base_results_path, model_execution_times
+        )
+        log.info("Evaluation of results finished")
 
     def _generate_reports(self, config_path: str, base_results_path: Path):
         """Generate evaluation reports."""
         if self.report_generator:
             self.report_generator.generate_report(base_results_path)
 
-    def _display_verbose_summary(self, base_results_path: Path):
-        """Display a verbose summary of the evaluation results in the terminal."""
+    def _display_summary(self, base_results_path: Path):
+        """Display a summary of the evaluation results in the terminal."""
 
         # Pre-process data to find column widths
         summary_data = []
@@ -628,7 +813,7 @@
                 continue
 
             try:
-                results_data = self.file_service.load_json(str(results_file))
+                results_data = self.file_service.load_json(results_file)
                 model_name = results_data.get("model_name", model_dir.name)
                 max_model_len = max(max_model_len, len(model_name))
 
@@ -656,44 +841,42 @@
                 summary_data.append((model_name, test_case_id, scores))
 
             except Exception as e:
-                print(f"Error processing results for {model_dir.name}: {e}")
+                log.error(f"Error processing results for {model_dir.name}: {e}")
 
-        # Print formatted output
         if not summary_data:
-            print("No summary data to display.")
+            log.warning("No summary data to display.")
             return
 
-        # Define headers and find max score lengths
-        headers = ["Tool Match", "Response Match", "LLM Eval"]
-
-        # Print header
+        # Define header line
         header_line = (
             f"{'Model':<{max_model_len}} | {'Test Case':<{max_test_case_len}} | "
             f"{'Tool Match':<12} | {'Response Match':<16} | {'LLM Eval':<10}"
         )
-        print(header_line)
-        print("-" * len(header_line))
+        click.echo(click.style(header_line, fg="white", bold=True))
+        click.echo(click.style("-" * len(header_line), fg="white", bold=True))
 
-        # Print data rows
         for model_name, test_case_id, scores in summary_data:
             tool_score = scores.get("Tool Match", "N/A")
             response_score = scores.get("Response Match", "N/A")
             llm_score = scores.get("LLM Eval", "N/A")
 
-            print(
-                f"{model_name:<{max_model_len}} | {test_case_id:<{max_test_case_len}} | "
-                f"{tool_score:<12} | {response_score:<16} | {llm_score:<10}"
+            click.echo(
+                click.style(
+                    f"{model_name:<{max_model_len}} | {test_case_id:<{max_test_case_len}} | "
+                    f"{tool_score:<12} | {response_score:<16} | {llm_score:<10}",
+                    fg="white",
+                )
             )
 
-    def _get_model_stats(self, model_path: str) -> Dict[str, Any]:
+    def _get_model_stats(self, model_path: Path) -> dict[str, any]:
         """Process results for a single model and return stats."""
         model_stats = {}
-        results_file = os.path.join(model_path, "results.json")
-        if not os.path.exists(results_file):
+        results_file = model_path / "results.json"
+        if not results_file.exists():
             return model_stats
 
         results_data = self.file_service.load_json(results_file)
-        model_name = results_data.get("model_name", os.path.basename(model_path))
+        model_name = results_data.get("model_name", model_path.name)
         model_stats[model_name] = {}
 
         for test_case in results_data.get("test_cases", []):
@@ -718,32 +901,35 @@
             model_stats[model_name][test_case_id] = scores
         return model_stats
 
-    def _save_execution_stats(self, base_results_path: str, start_time: float):
+    def _save_execution_stats(self, base_results_path: Path, start_time: float):
         """Save overall execution statistics."""
         end_time = time.time()
         total_execution_time = end_time - start_time
         stats = {"total_execution_time": total_execution_time, "models": {}}
 
         try:
-            for model_dir in os.listdir(base_results_path):
-                model_path = os.path.join(base_results_path, model_dir)
-                if not os.path.isdir(model_path):
+            for model_path in base_results_path.iterdir():
+                if not model_path.is_dir():
                     continue
                 model_stats = self._get_model_stats(model_path)
                 stats["models"].update(model_stats)
 
         except Exception as e:
-            print(f"Error processing results for stats: {e}")
+            log.error(f"Error processing results for stats: {e}")
 
-        stats_path = os.path.join(base_results_path, "stats.json")
+        stats_path = base_results_path / "stats.json"
         self.file_service.save_json(stats, stats_path)
 
-        print(f"Overall stats written to {stats_path}")
-        print(f"Total execution time: {total_execution_time:.2f} seconds")
+        log.info(f"Overall stats written to {stats_path}")
+        log.info(f"Total execution time: {total_execution_time:.2f} seconds")
 
 
 def main(config_path: str, verbose: bool = False):
     """Main entry point for the evaluation script."""
+    if verbose:
+        logging.basicConfig(level=logging.INFO)
+        log.info("Verbose logging enabled.")
+
     orchestrator = EvaluationRunner(verbose=verbose)
     orchestrator.run_evaluation(config_path)