solace-agent-mesh 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of solace-agent-mesh might be problematic.
- solace_agent_mesh/agent/adk/callbacks.py +0 -5
- solace_agent_mesh/agent/adk/models/lite_llm.py +123 -8
- solace_agent_mesh/agent/adk/models/oauth2_token_manager.py +245 -0
- solace_agent_mesh/agent/protocol/event_handlers.py +213 -31
- solace_agent_mesh/agent/proxies/__init__.py +0 -0
- solace_agent_mesh/agent/proxies/a2a/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/a2a/app.py +55 -0
- solace_agent_mesh/agent/proxies/a2a/component.py +1115 -0
- solace_agent_mesh/agent/proxies/a2a/config.py +140 -0
- solace_agent_mesh/agent/proxies/a2a/oauth_token_cache.py +104 -0
- solace_agent_mesh/agent/proxies/base/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/base/app.py +99 -0
- solace_agent_mesh/agent/proxies/base/component.py +650 -0
- solace_agent_mesh/agent/proxies/base/config.py +85 -0
- solace_agent_mesh/agent/proxies/base/proxy_task_context.py +17 -0
- solace_agent_mesh/agent/sac/app.py +58 -5
- solace_agent_mesh/agent/sac/component.py +238 -75
- solace_agent_mesh/agent/sac/task_execution_context.py +46 -0
- solace_agent_mesh/agent/tools/audio_tools.py +125 -8
- solace_agent_mesh/agent/tools/web_tools.py +10 -5
- solace_agent_mesh/agent/utils/artifact_helpers.py +141 -3
- solace_agent_mesh/assets/docs/404.html +3 -3
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.eda4bcb2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.f4b15f3b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/71da7b71.38583438.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/77cf947d.48cb18a2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/924ffdeb.8095e148.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.570c057b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{ad71b5ed.60668e9e.js → ad71b5ed.af3ecfd1.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/ceb2a7a6.5d92d7d0.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{da0b5bad.9d369087.js → da0b5bad.d08a9466.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.e98d12a1.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/de915948.27d6b065.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{e3d9abda.2b916f9e.js → e3d9abda.6b9493d0.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.e74a984d.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/f284c35a.42f59cdd.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.15b02f97.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js → main.b12eac43.js} +2 -2
- solace_agent_mesh/assets/docs/assets/js/runtime~main.e268214e.js +1 -0
- solace_agent_mesh/assets/docs/docs/documentation/components/agents/index.html +15 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/artifact-management/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/audio-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/data-analysis-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/embeds/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/cli/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/gateways/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/orchestrator/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/plugins/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/proxies/index.html +262 -0
- solace_agent_mesh/assets/docs/docs/documentation/deploying/debugging/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/deployment-options/index.html +31 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/observability/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-gateways/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-python-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-service-providers/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/evaluations/index.html +135 -0
- solace_agent_mesh/assets/docs/docs/documentation/developing/index.html +6 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/structure/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/bedrock-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/custom-agent/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/event-mesh-gateway/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mcp-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mongodb-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rag-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rest-gateway/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/slack-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/sql-database/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/rbac-setup-guide/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/single-sign-on/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/architecture/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/introduction/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/try-agent-mesh/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/configurations/index.html +6 -5
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/large_language_models/index.html +100 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/run-project/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-gateway-upgrade-to-0.3.0/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-technical-migration-map/index.html +3 -3
- solace_agent_mesh/assets/docs/lunr-index-1761248203150.json +1 -0
- solace_agent_mesh/assets/docs/lunr-index.json +1 -1
- solace_agent_mesh/assets/docs/search-doc-1761248203150.json +1 -0
- solace_agent_mesh/assets/docs/search-doc.json +1 -1
- solace_agent_mesh/assets/docs/sitemap.xml +1 -1
- solace_agent_mesh/cli/__init__.py +1 -1
- solace_agent_mesh/cli/commands/add_cmd/agent_cmd.py +2 -69
- solace_agent_mesh/cli/commands/eval_cmd.py +11 -49
- solace_agent_mesh/cli/commands/init_cmd/__init__.py +0 -5
- solace_agent_mesh/cli/commands/init_cmd/env_step.py +10 -12
- solace_agent_mesh/cli/commands/init_cmd/orchestrator_step.py +9 -61
- solace_agent_mesh/cli/commands/init_cmd/webui_gateway_step.py +9 -49
- solace_agent_mesh/cli/commands/plugin_cmd/add_cmd.py +1 -2
- solace_agent_mesh/client/webui/frontend/static/assets/{authCallback-DwrxZE0E.js → authCallback-BTf6dqwp.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/{client-DarGQzyw.js → client-CaY59VuC.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-B32noGmR.js +342 -0
- solace_agent_mesh/client/webui/frontend/static/assets/main-DHJKSW1S.css +1 -0
- solace_agent_mesh/client/webui/frontend/static/assets/{vendor-BKIeiHj_.js → vendor-BEmvJSYz.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/auth-callback.html +3 -3
- solace_agent_mesh/client/webui/frontend/static/index.html +4 -4
- solace_agent_mesh/common/a2a/__init__.py +24 -0
- solace_agent_mesh/common/a2a/artifact.py +39 -0
- solace_agent_mesh/common/a2a/events.py +29 -0
- solace_agent_mesh/common/a2a/message.py +68 -0
- solace_agent_mesh/common/a2a/protocol.py +151 -1
- solace_agent_mesh/common/agent_registry.py +83 -3
- solace_agent_mesh/common/constants.py +3 -1
- solace_agent_mesh/common/sac/sam_component_base.py +383 -4
- solace_agent_mesh/common/utils/pydantic_utils.py +12 -0
- solace_agent_mesh/config_portal/backend/common.py +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-ByU1X1HD.js +98 -0
- solace_agent_mesh/config_portal/frontend/static/client/assets/{manifest-44d62be6.js → manifest-61038fc6.js} +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/index.html +1 -1
- solace_agent_mesh/evaluation/evaluator.py +128 -104
- solace_agent_mesh/evaluation/message_organizer.py +116 -110
- solace_agent_mesh/evaluation/report_data_processor.py +84 -86
- solace_agent_mesh/evaluation/report_generator.py +73 -79
- solace_agent_mesh/evaluation/run.py +421 -235
- solace_agent_mesh/evaluation/shared/__init__.py +92 -0
- solace_agent_mesh/evaluation/shared/constants.py +47 -0
- solace_agent_mesh/evaluation/shared/exceptions.py +50 -0
- solace_agent_mesh/evaluation/shared/helpers.py +35 -0
- solace_agent_mesh/evaluation/shared/test_case_loader.py +167 -0
- solace_agent_mesh/evaluation/shared/test_suite_loader.py +280 -0
- solace_agent_mesh/evaluation/subscriber.py +111 -232
- solace_agent_mesh/evaluation/summary_builder.py +227 -117
- solace_agent_mesh/gateway/base/app.py +16 -1
- solace_agent_mesh/gateway/base/component.py +112 -39
- solace_agent_mesh/gateway/http_sse/alembic/versions/20251015_add_session_performance_indexes.py +70 -0
- solace_agent_mesh/gateway/http_sse/component.py +99 -3
- solace_agent_mesh/gateway/http_sse/dependencies.py +4 -4
- solace_agent_mesh/gateway/http_sse/main.py +1 -0
- solace_agent_mesh/gateway/http_sse/repository/chat_task_repository.py +12 -13
- solace_agent_mesh/gateway/http_sse/repository/feedback_repository.py +15 -18
- solace_agent_mesh/gateway/http_sse/repository/interfaces.py +25 -18
- solace_agent_mesh/gateway/http_sse/repository/session_repository.py +30 -26
- solace_agent_mesh/gateway/http_sse/repository/task_repository.py +35 -44
- solace_agent_mesh/gateway/http_sse/routers/agent_cards.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/artifacts.py +95 -203
- solace_agent_mesh/gateway/http_sse/routers/dto/responses/session_responses.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/sessions.py +2 -2
- solace_agent_mesh/gateway/http_sse/routers/tasks.py +33 -41
- solace_agent_mesh/gateway/http_sse/routers/users.py +47 -1
- solace_agent_mesh/gateway/http_sse/routers/visualization.py +17 -11
- solace_agent_mesh/gateway/http_sse/services/data_retention_service.py +4 -4
- solace_agent_mesh/gateway/http_sse/services/feedback_service.py +51 -43
- solace_agent_mesh/gateway/http_sse/services/session_service.py +20 -20
- solace_agent_mesh/gateway/http_sse/services/task_logger_service.py +8 -8
- solace_agent_mesh/gateway/http_sse/shared/base_repository.py +45 -71
- solace_agent_mesh/gateway/http_sse/shared/types.py +0 -18
- solace_agent_mesh/templates/gateway_config_template.yaml +0 -5
- solace_agent_mesh/templates/logging_config_template.ini +10 -6
- solace_agent_mesh/templates/plugin_gateway_config_template.yaml +0 -3
- solace_agent_mesh/templates/shared_config.yaml +40 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/METADATA +47 -21
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/RECORD +166 -145
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.e49689dd.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.39d5851d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/71da7b71.804d6567.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/77cf947d.64c9bd6c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.dd810042.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.cbc66f02.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/de915948.139b4b9c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.582a78ca.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/f284c35a.5766a13d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.9c0297a6.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/runtime~main.18dc45dd.js +0 -1
- solace_agent_mesh/assets/docs/lunr-index-1760121512891.json +0 -1
- solace_agent_mesh/assets/docs/search-doc-1760121512891.json +0 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-2nd1gbaH.js +0 -339
- solace_agent_mesh/client/webui/frontend/static/assets/main-DoKXctCM.css +0 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-BNuqpWDc.js +0 -98
- solace_agent_mesh/evaluation/config_loader.py +0 -657
- solace_agent_mesh/evaluation/test_case_loader.py +0 -714
- /solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js.LICENSE.txt → main.b12eac43.js.LICENSE.txt} +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/WHEEL +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/entry_points.txt +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.1.dist-info}/licenses/LICENSE +0 -0
solace_agent_mesh/evaluation/run.py

@@ -4,64 +4,95 @@ This module orchestrates the evaluation of AI models against test cases.
 """

 import json
+import logging
+import mimetypes
 import os
+import shutil
+import subprocess
 import sys
+import threading
 import time
-import subprocess
-import requests
 import uuid
-import
-import
-import
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from importlib import metadata
 from pathlib import Path

-
-
+import click
+import requests
 from dotenv import load_dotenv
-
-from .message_organizer import MessageOrganizer
-from .summary_builder import SummaryBuilder
-from .subscriber import Subscriber
+
 from .evaluator import EvaluationOrchestrator
+from .message_organizer import MessageOrganizer
 from .report_generator import ReportGenerator
+from .shared import (
+    DEFAULT_STARTUP_WAIT_TIME,
+    DEFAULT_TEST_TIMEOUT,
+    EVALUATION_DIR,
+    MAX_ARTIFACT_SIZE_MB,
+    EvaluationConfigLoader,
+    TestSuiteConfiguration,
+    get_local_base_url,
+)
+from .subscriber import Subscriber
+from .summary_builder import SummaryBuilder

-
+log = logging.getLogger(__name__)


-
-
-
+def _error_exit(message: str):
+    """Logs an error message and exits."""
+    log.error(message)
+    sys.exit(1)

-# Constants
-DEFAULT_STARTUP_WAIT_TIME = 60
-DEFAULT_TEST_TIMEOUT = 60

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _ensure_eval_backend_config_exists():
+    """Checks for eval_backend.yaml and creates it from a template if missing."""
+    project_root = Path.cwd()
+    configs_dir = project_root / "configs"
+    eval_backend_config_path = configs_dir / "eval_backend.yaml"
+
+    if eval_backend_config_path.exists():
+        return
+
+    click.echo(
+        f"'{eval_backend_config_path.relative_to(project_root)}' not found. Creating it..."
+    )
+
+    if not (configs_dir / "shared_config.yaml").exists():
+        _error_exit(
+            "Error: 'configs/shared_config.yaml' not found. Please run 'sam init' first."
+        )
+
+    try:
+        # This is a simplified way to get the template content.
+        # In a real CLI, you'd use a more robust method like `importlib.resources`.
+        template_path = Path(__file__).parent.parent / "templates" / "eval_backend_template.yaml"
+        with open(template_path, encoding="utf-8") as f:
+            template_content = f.read()
+
+        with open(eval_backend_config_path, "w", encoding="utf-8") as f:
+            f.write(template_content)
+        click.echo(
+            click.style(
+                f"Successfully created '{eval_backend_config_path.relative_to(project_root)}'.",
+                fg="green",
             )
+        )
+    except Exception as e:
+        _error_exit(f"Failed to create eval_backend.yaml: {e}")
+
+
+def _ensure_sam_rest_gateway_installed():
+    """Checks if the sam-rest-gateway package is installed for local evaluation."""
+    try:
+        metadata.distribution("sam-rest-gateway")
+    except metadata.PackageNotFoundError:
+        _error_exit(
+            "Error: 'sam-rest-gateway' is not installed. "
+            "Please install it using: "
+            'pip install "sam-rest-gateway @ git+https://github.com/SolaceLabs/solace-agent-mesh-core-plugins#subdirectory=sam-rest-gateway"'
+        )


 @dataclass
@@ -70,7 +101,7 @@ class TestRun:

     agent: str
     query: str
-    artifacts:
+    artifacts: list[str]
     wait_time: int
     test_case_file: str
     run_num: int
@@ -78,192 +109,220 @@ class TestRun:
     @property
     def test_case_id(self) -> str:
         """Extract test case ID from filename."""
-
-        return os.path.splitext(base_name)[0].replace(".test", "")
+        return Path(self.test_case_file).stem.replace(".test", "")


 class ProcessManager:
     """Manages subprocess lifecycle for the Solace AI Connector."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
-        self.process:
-        self.namespace:
+        self.process: subprocess.Popen | None = None
+        self.namespace: str | None = None
         self.verbose = verbose

-    def start_services(self) ->
+    def start_services(self) -> tuple[subprocess.Popen, str]:
         """Start the Solace AI Connector and return process and namespace."""
         load_dotenv()
         self.namespace = f"eval-{uuid.uuid4()}"
         os.environ["NAMESPACE"] = self.namespace

-
+        # Set broker environment variables from the required configuration
+        log.info("Setting broker configuration from test suite...")
+        for key, value in self.config.broker.dict().items():
+            if value is not None:
+                env_key = f"SOLACE_BROKER_{key.upper()}"
+                os.environ[env_key] = str(value)
+                log.info(f" - Set {env_key}")
+
+        agent_files = self.config.agent_configs

         command = [sys.executable, "-m", "solace_ai_connector.main", *agent_files]

-
-        project_root =
+        log.info("Starting Solace AI Connector as a subprocess...")
+        project_root = Path(EVALUATION_DIR).parent.resolve()

         self.process = subprocess.Popen(
             command, stdout=sys.stdout, stderr=sys.stderr, cwd=project_root
         )

-
-        self._wait_for_server_ready()
+        log.info("Waiting for server to become healthy...")
+        self._wait_for_server_ready(get_local_base_url())

         return self.process, self.namespace

-    def _wait_for_server_ready(self):
+    def _wait_for_server_ready(self, base_url: str):
         """Poll the health endpoint until the server is ready."""
         start_time = time.time()
-        health_url = f"{
+        health_url = f"{base_url}/health"

-        while time.time() - start_time <
+        while time.time() - start_time < DEFAULT_STARTUP_WAIT_TIME:
             try:
                 response = requests.get(health_url)
                 if response.status_code == 200:
-
-                    time.sleep(
+                    log.info("Server is healthy.")
+                    time.sleep(5)
                     return
             except requests.ConnectionError:
                 # Server is not yet available, wait and retry
                 time.sleep(1)
             except Exception as e:
-
+                log.error(f"An unexpected error occurred during health check: {e}")
                 time.sleep(1)

         raise RuntimeError(
-            f"Server did not become healthy within {
+            f"Server did not become healthy within {DEFAULT_STARTUP_WAIT_TIME} seconds."
         )

-    def stop_services(self, subscriber:
+    def stop_services(self, subscriber: Subscriber | None = None):
         """Clean up running processes."""
         if subscriber:
-
+            log.info("Terminating subscriber")
             subscriber.stop()
             subscriber.join()
-
+            log.info("Subscriber terminated.")

         if self.process:
-
+            log.info("Terminating subprocess")
             self.process.terminate()
             try:
                 self.process.wait(timeout=5)
-
+                log.info("Subprocess terminated.")
             except subprocess.TimeoutExpired:
-
+                log.info("Subprocess did not terminate gracefully, killing.")
                 self.process.kill()

-
+        log.info("Process cleanup completed.")


 class TaskService:
     """Handles task submission and tracking."""

-    def __init__(self, config:
-        self.config = config
-        self.base_url = config.API_BASE_URL
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.verbose = verbose
+        self.config = config
+        if config.remote:
+            self.base_url = config.remote.environment.get("EVAL_REMOTE_URL")
+        else:
+            self.base_url = get_local_base_url()

     def submit_task(
-        self, agent_name: str, message: str, artifact_paths:
-    ) ->
+        self, agent_name: str, message: str, artifact_paths: list[str] | None = None
+    ) -> str | None:
         """Submit a test case to the agent and return the task ID."""
-
-        url = f"{self.base_url}/tasks"
+        log.info("Sending test request")
+        url = f"{self.base_url}/api/v2/tasks"
         data = {
             "agent_name": agent_name,
             "prompt": message,
         }

+        headers = {}
+        if self.config.remote:
+            auth_token = self.config.remote.environment.get("EVAL_AUTH_TOKEN")
+            if auth_token:
+                headers["Authorization"] = f"Bearer {auth_token}"
+
         files_to_upload = []
         if artifact_paths:
             files_to_upload = self._prepare_file_uploads(artifact_paths)

         try:
             with requests.Session() as session:
-                response = session.post(url, data=data, files=files_to_upload)
+                response = session.post(url, data=data, files=files_to_upload, headers=headers)

             response.raise_for_status()
             task_id = response.json()["taskId"]
-
+            log.info(f"Task submitted with ID: {task_id}")
             return task_id

         except requests.RequestException as e:
-
+            log.error(f"Failed to submit task: {e}")
             return None
         finally:
             self._close_file_uploads(files_to_upload)

-    def _prepare_file_uploads(self, artifact_paths:
+    def _prepare_file_uploads(self, artifact_paths: list[str]) -> list[tuple]:
         """Prepare file uploads for the request."""
         files_to_upload = []
-        for
+        for path_str in artifact_paths:
+            path = Path(path_str)
+            # Check file size before reading
+            try:
+                file_size_mb = path.stat().st_size / (1024 * 1024)
+                if file_size_mb > MAX_ARTIFACT_SIZE_MB:
+                    log.warning(
+                        f"Artifact '{path.name}' is {file_size_mb:.2f} MB, "
+                        f"which is larger than the recommended maximum of {MAX_ARTIFACT_SIZE_MB} MB. "
+                        "This may cause memory issues."
+                    )
+            except OSError as e:
+                log.error(f"Could not get size of artifact {path}: {e}")
+                continue
+
             mimetype, _ = mimetypes.guess_type(path)
             if mimetype is None:
                 mimetype = "text/plain"
-
-
-
+            # Read file content with context manager
+            with path.open("rb") as f:
+                file_content = f.read()
+            files_to_upload.append(("files", (path.name, file_content, mimetype)))
         return files_to_upload

-    def _close_file_uploads(self, files_to_upload:
-        """Close file handles after upload."""
-
-
+    def _close_file_uploads(self, files_to_upload: list[tuple]):
+        """Close file handles after upload (no longer needed)."""
+        # No longer needed
+        pass


 class FileService:
     """Handles file operations and path management."""

     @staticmethod
-    def ensure_directory(path:
+    def ensure_directory(path: Path):
         """Ensure directory exists, create if necessary."""
-
+        path.mkdir(parents=True, exist_ok=True)

     @staticmethod
-    def remove_directory(path:
+    def remove_directory(path: Path):
         """Remove directory and all contents."""
-        if
+        if path.exists():
             shutil.rmtree(path)

     @staticmethod
-    def save_json(data:
+    def save_json(data: any, filepath: Path):
         """Save data as JSON to file."""
-        with open(
+        with filepath.open("w") as f:
             json.dump(data, f, indent=4)

     @staticmethod
-    def load_json(filepath:
+    def load_json(filepath: Path) -> any:
         """Load JSON data from file."""
-        with open(
+        with filepath.open() as f:
             return json.load(f)


 class TestRunBuilder:
     """Builds test run configurations from test cases."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration):
         self.config = config

-    def build_test_runs(self) ->
+    def build_test_runs(self) -> list[TestRun]:
         """Build all test runs from configuration."""
         test_runs = []

-        for test_case_path in self.config.
-            test_case = FileService.load_json(test_case_path)
+        for test_case_path in self.config.test_case_files:
+            test_case = FileService.load_json(Path(test_case_path))

             artifact_paths = self._get_artifact_paths(test_case, test_case_path)

-
-            for run_num in range(1, self.config.runs + 1):
+            for run_num in range(1, self.config.run_count + 1):
                 test_run = TestRun(
                     agent=test_case["target_agent"],
                     query=test_case["query"],
                     artifacts=artifact_paths,
-                    wait_time=test_case.get(
-                        "wait_time", self.config.DEFAULT_TEST_TIMEOUT
-                    ),
+                    wait_time=test_case.get("wait_time", DEFAULT_TEST_TIMEOUT),
                     test_case_file=test_case_path,
                     run_num=run_num,
                 )
@@ -271,14 +330,14 @@ class TestRunBuilder:

         return test_runs

-    def _get_artifact_paths(self, test_case:
+    def _get_artifact_paths(self, test_case: dict, test_case_path: str) -> list[str]:
         """Extract artifact paths from test case."""
         artifact_paths = []
         if "artifacts" in test_case:
-            test_case_dir =
+            test_case_dir = Path(test_case_path).parent
             for artifact in test_case["artifacts"]:
                 if artifact.get("type") == "file":
-                    artifact_paths.append(
+                    artifact_paths.append(str(test_case_dir / artifact["path"]))
         return artifact_paths


@@ -293,13 +352,14 @@ class TestExecutor:
     def execute_test(
         self,
         test_run: TestRun,
-        model_results_path:
-        task_mappings:
+        model_results_path: Path,
+        task_mappings: dict[str, str],
         subscriber: Subscriber,
+        task_mappings_lock: threading.Lock,
     ) -> bool:
         """Execute a single test case and wait for completion."""
-
-            f"
+        log.info(
+            f"Starting test: {test_run.test_case_file} (run {test_run.run_num})"
         )

         # Submit the task
@@ -308,25 +368,23 @@ class TestExecutor:
         )

         if not task_id:
-
+            log.error(
                 f"Failed to start test case: {test_run.test_case_file} (run {test_run.run_num})"
             )
             return False

         # Set up result directory
-
-
-        )
+        test_case_name = Path(test_run.test_case_file).stem.replace(".test", "")
+        run_dir = model_results_path / test_case_name / f"run_{test_run.run_num}"
         self.file_service.ensure_directory(run_dir)

         # Save test case path for summary builder
         test_info = {"path": test_run.test_case_file}
-        self.file_service.save_json(
-            test_info, os.path.join(run_dir, "test_case_info.json")
-        )
+        self.file_service.save_json(test_info, run_dir / "test_case_info.json")

         # Track the task
-
+        with task_mappings_lock:
+            task_mappings[task_id] = str(run_dir)
         subscriber.active_tasks.add(task_id)

         # Wait for completion
@@ -336,26 +394,26 @@ class TestExecutor:
         self, task_id: str, wait_time: int, subscriber: Subscriber
     ) -> bool:
         """Wait for task completion with timeout."""
-
+        log.info(
             f"Waiting for task {task_id} to complete (timeout: {wait_time} seconds)..."
         )

         start_time = time.time()
         while task_id in subscriber.active_tasks:
             if time.time() - start_time > wait_time:
-
+                log.warning(f"Task {task_id} timed out after {wait_time} seconds")
                 subscriber.active_tasks.discard(task_id)
                 return False
             time.sleep(1)

-
+        log.info(f"Task {task_id} completed successfully")
        return True


 class ModelEvaluator:
     """Handles the evaluation of a single model."""

-    def __init__(self, config:
+    def __init__(self, config: TestSuiteConfiguration, verbose: bool = False):
         self.config = config
         self.process_manager = ProcessManager(config, verbose=verbose)
         self.task_service = TaskService(config, verbose=verbose)
@@ -363,20 +421,21 @@ class ModelEvaluator:
         self.test_builder = TestRunBuilder(config)
         self.test_executor = TestExecutor(self.task_service, self.file_service, verbose=verbose)
         self.verbose = verbose
+        self._task_mappings_lock = threading.Lock()

     def evaluate_model(
-        self, model_config:
+        self, model_config: dict[str, any], base_results_path: Path
     ) -> float:
         """Evaluate a single model and return execution time."""
-        model_name = model_config
-
+        model_name = model_config.name
+        log.info(f"Starting evaluation for model: {model_name}")
         start_time = time.time()

         # Set environment variables for the model
         self._set_model_environment(model_config)

         # Set up paths
-        model_results_path =
+        model_results_path = base_results_path / model_name
         self.file_service.ensure_directory(model_results_path)

         # Start services
@@ -388,10 +447,10 @@ class ModelEvaluator:
         try:
             # Execute tests
             successful_tests = self._execute_all_tests(model_results_path, subscriber)
-
+            log.info(f"Completed {successful_tests} tests successfully")

         except Exception as e:
-
+            log.error(f"Error during test case execution for model {model_name}: {e}")
         finally:
             # Cleanup
             task_mappings = getattr(self, "_task_mappings", {})
@@ -401,52 +460,84 @@ class ModelEvaluator:

         end_time = time.time()
         execution_time = end_time - start_time
-
-            f"
+        log.info(
+            f"Evaluation for model: {model_name} complete in {execution_time:.2f} seconds"
         )

         return execution_time

-    def _set_model_environment(self, model_config:
+    def _set_model_environment(self, model_config: dict[str, any]):
         """Set environment variables for the model."""
-        for key, value in model_config.
-
+        for key, value in model_config.environment.variables.items():
+            if value is not None:
+                os.environ[key] = value

-    def _setup_subscriber(self, namespace: str, model_results_path:
+    def _setup_subscriber(self, namespace: str, model_results_path: Path) -> Subscriber:
         """Set up and start the subscriber."""
         subscription_ready_event = threading.Event()
         subscriber = Subscriber(
-
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            model_results_path,
         )
         subscriber.start()

-
+        log.info("Waiting for subscriber to be ready...")
         subscription_ready_event.wait()
-
+        log.info("Subscriber is ready.")

         return subscriber

     def _execute_all_tests(
-        self, model_results_path:
+        self, model_results_path: Path, subscriber: Subscriber
     ) -> int:
-        """Execute all test cases and return count of successful tests."""
+        """Execute all test cases in parallel and return count of successful tests."""
         test_runs = self.test_builder.build_test_runs()

         self._task_mappings = {}
         total_tests = len(test_runs)
         successful_tests = 0

-
+        log.info(
+            f"Starting parallel execution of {total_tests} tests with {self.config.workers} workers."
+        )

-
-
-
-
-
-
-
-
-
+        with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+            # Create a dictionary to map futures to their test_run
+            future_to_run = {
+                executor.submit(
+                    self.test_executor.execute_test,
+                    test_run,
+                    model_results_path,
+                    self._task_mappings,
+                    subscriber,
+                    self._task_mappings_lock,  # Pass the lock to the worker
+                ): test_run
+                for test_run in test_runs
+            }
+
+            # Process results as they complete
+            for i, future in enumerate(as_completed(future_to_run), 1):
+                test_run = future_to_run[future]
+                log.info(
+                    f"Processing result for test {i}/{total_tests}: {test_run.test_case_id}"
+                )
+                try:
+                    success = future.result()
+                    if success:
+                        successful_tests += 1
+                    else:
+                        log.warning(
+                            f"Test {test_run.test_case_id} (run {test_run.run_num}) failed or timed out."
+                        )
+                except Exception as e:
+                    log.error(
+                        f"Test {test_run.test_case_id} (run {test_run.run_num}) generated an exception: {e}",
+                        exc_info=True,
+                    )

         return successful_tests

@@ -454,16 +545,16 @@ class ModelEvaluator:
         self,
         app_process: subprocess.Popen,
         subscriber: Subscriber,
-        model_results_path:
-        task_mappings:
+        model_results_path: Path,
+        task_mappings: dict[str, str],
     ):
         """Clean up after model evaluation."""
         self.process_manager.stop_services(subscriber)

         # Save task mappings
-        mappings_file =
+        mappings_file = model_results_path / "task_mappings.json"
         self.file_service.save_json(task_mappings, mappings_file)
-
+        log.info(f"Task mappings saved to {mappings_file}")


 class ResultsProcessor:
@@ -471,52 +562,49 @@ class ResultsProcessor:

     def __init__(self, file_service: FileService, verbose: bool = False):
         self.file_service = file_service
-        self.summary_builder =
+        self.summary_builder: SummaryBuilder | None = None
         self.verbose = verbose

-    def summarize_results(self, base_results_path:
+    def summarize_results(self, base_results_path: Path, config: TestSuiteConfiguration):
         """Generate summaries for all test results."""
-
+        log.info("Summarizing results")

-
-            model_path = os.path.join(base_results_path, model_name)
-            if not os.path.isdir(model_path):
-                continue
+        self.summary_builder = SummaryBuilder(config)

+        for model_path in base_results_path.iterdir():
+            if not model_path.is_dir():
+                continue
             self._process_model_results(model_path)

-    def _process_model_results(self, model_path:
+    def _process_model_results(self, model_path: Path):
         """Process results for a single model."""
-        for
-
-            if not os.path.isdir(test_case_path):
+        for test_case_path in model_path.iterdir():
+            if not test_case_path.is_dir():
                 continue
-
            self._process_test_case_results(test_case_path)

-    def _process_test_case_results(self, test_case_path:
+    def _process_test_case_results(self, test_case_path: Path):
         """Process results for a single test case."""
-        for
-
-            if not os.path.isdir(run_path):
+        for run_path in test_case_path.iterdir():
+            if not run_path.is_dir():
                 continue

-            messages_file =
-            if
+            messages_file = run_path / "messages.json"
+            if messages_file.exists():
                 summary_data = self.summary_builder.summarize_run(messages_file)
-                summary_file =
+                summary_file = run_path / "summary.json"
                 self.file_service.save_json(summary_data, summary_file)
-
+                log.info(f"Summary created for {run_path}")


 class EvaluationRunner:
     """Main orchestrator that coordinates the entire evaluation process."""

     def __init__(self, verbose: bool = False):
-        self.config:
+        self.config: TestSuiteConfiguration | None = None
         self.file_service = FileService()
         self.results_processor = ResultsProcessor(self.file_service, verbose=verbose)
-        self.report_generator:
+        self.report_generator: ReportGenerator | None = None
         self.verbose = verbose

     def run_evaluation(self, config_path: str):
@@ -528,91 +616,188 @@ class EvaluationRunner:
             self._load_configuration(config_path)

             # Set up results directory in the current working directory
-            base_results_path = Path.cwd() / "results" / self.config.
+            base_results_path = Path.cwd() / "results" / self.config.results_directory
             self._setup_results_directory(base_results_path)

             # Run model evaluations
-
+            if self.config.remote:
+                model_execution_times = self._run_remote_evaluation(base_results_path)
+            else:
+                model_execution_times = self._run_local_evaluation(base_results_path)

             # Post-process results
             self._post_process_results(
-
+                base_results_path, model_execution_times, config_path
             )

             # Save overall statistics
-            self._save_execution_stats(
+            self._save_execution_stats(base_results_path, start_time)

             # Generate reports
             self._generate_reports(config_path, base_results_path)

-            # Display
-
-                self._display_verbose_summary(base_results_path)
+            # Display summary
+            self._display_summary(base_results_path)

         except Exception as e:
-
+            log.error(f"Evaluation failed: {e}")
             raise

     def _load_configuration(self, config_path: str):
         """Load and validate the evaluation configuration."""
-        config_loader =
-
-        self.config = EvaluationConfig(config_data)
+        config_loader = EvaluationConfigLoader(config_path)
+        self.config = config_loader.load_configuration()
         self.report_generator = ReportGenerator(config_path)
-
+        log.info("Configuration loaded and validated successfully.")

     def _setup_results_directory(self, base_results_path: Path):
         """Set up the results directory."""
         # Clean up existing results
-        self.file_service.remove_directory(
-        self.file_service.ensure_directory(
+        self.file_service.remove_directory(base_results_path)
+        self.file_service.ensure_directory(base_results_path)

-
+        log.info(f"Results directory set up at: {base_results_path}")

-    def
-        """
+    def _run_local_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run the full local evaluation with service management."""
+        _ensure_eval_backend_config_exists()
+        _ensure_sam_rest_gateway_installed()
+        log.info("Starting local evaluation")
         model_execution_times = {}

-
+        # This loop iterates through the models defined in the config
+        for model_config in self.config.model_configurations:
+            # ModelEvaluator manages the lifecycle of local services for each model
             model_evaluator = ModelEvaluator(self.config, verbose=self.verbose)
             execution_time = model_evaluator.evaluate_model(
                 model_config, base_results_path
             )
-            model_execution_times[model_config
+            model_execution_times[model_config.name] = execution_time

         return model_execution_times

+    def _run_remote_evaluation(self, base_results_path: Path) -> dict[str, float]:
+        """Run evaluation against a remote endpoint in parallel."""
+        remote_url = self.config.remote.environment.get("EVAL_REMOTE_URL")
+        log.info(f"Starting remote evaluation against: {remote_url}")
+        start_time = time.time()
+
+        # Check if the remote server is healthy before proceeding
+        process_manager = ProcessManager(self.config, self.verbose)
+        process_manager._wait_for_server_ready(remote_url)
+
+        # Instantiate services with the remote configuration
+        task_service = TaskService(self.config, self.verbose)
+        test_builder = TestRunBuilder(self.config)
+        test_executor = TestExecutor(task_service, self.file_service, self.verbose)
+
+        # In remote mode, there's no model loop. We create a single "remote" results directory.
+        remote_results_path = base_results_path / "remote"
+        self.file_service.ensure_directory(remote_results_path)
+
+        # The subscriber needs to be configured for remote use.
+        subscriber = self._setup_remote_subscriber(str(remote_results_path))
+
+        task_mappings = {}
+        try:
+            test_runs = test_builder.build_test_runs()
+            successful_tests = 0
+            task_mappings_lock = threading.Lock()
+
+            log.info(
+                f"Starting parallel execution of {len(test_runs)} remote tests with {self.config.workers} workers."
+            )
+
+            with ThreadPoolExecutor(max_workers=self.config.workers) as executor:
+                future_to_run = {
+                    executor.submit(
+                        test_executor.execute_test,
+                        test_run,
+                        remote_results_path,
+                        task_mappings,
+                        subscriber,
+                        task_mappings_lock,
+                    ): test_run
+                    for test_run in test_runs
+                }
+
+                for i, future in enumerate(as_completed(future_to_run), 1):
+                    test_run = future_to_run[future]
+                    log.info(
+                        f"Processing result for remote test {i}/{len(test_runs)}: {test_run.test_case_id}"
+                    )
+                    try:
+                        success = future.result()
+                        if success:
+                            successful_tests += 1
+                    except Exception as e:
+                        log.error(
+                            f"Remote test {test_run.test_case_id} generated an exception: {e}",
+                            exc_info=True,
+                        )
+
+            log.info(f"Completed {successful_tests} remote tests successfully")
+
+        finally:
+            if subscriber:
+                subscriber.stop()
+                subscriber.join()
+
+        # Save task mappings for remote run
+        mappings_file = remote_results_path / "task_mappings.json"
+        self.file_service.save_json(task_mappings, mappings_file)
+
+        execution_time = time.time() - start_time
+        return {"remote": execution_time}
+
+    def _setup_remote_subscriber(self, results_path: str) -> Subscriber:
+        """Set up a subscriber for remote evaluation."""
+        subscription_ready_event = threading.Event()
+        namespace = self.config.remote.environment.get("EVAL_NAMESPACE")
+        subscriber = Subscriber(
+            self.config.broker,
+            namespace,
+            set(),
+            None,
+            subscription_ready_event,
+            results_path,
+        )
+        subscriber.start()
+        subscription_ready_event.wait()
+        log.info("Remote subscriber is ready.")
+        return subscriber
+
     def _post_process_results(
         self,
-        base_results_path:
-        model_execution_times:
+        base_results_path: Path,
+        model_execution_times: dict[str, float],
         config_path: str,
     ):
         """Post-process evaluation results."""
         # Categorize messages using the refactored categorizer
-
+        log.info("Categorizing messages")
         message_organizer = MessageOrganizer()
-
-
-        )
-        print("--- Message categorization finished ---")
+        message_organizer.categorize_all_messages(base_results_path)
+        log.info("Message categorization finished")

         # Generate summaries
-        self.results_processor.summarize_results(base_results_path)
+        self.results_processor.summarize_results(base_results_path, self.config)

         # Run evaluation
-
+        log.info("Starting evaluation of results")
         evaluation_orchestrator = EvaluationOrchestrator(config_path)
-        evaluation_orchestrator.run_evaluation(
-
+        evaluation_orchestrator.run_evaluation(
+            base_results_path, model_execution_times
+        )
+        log.info("Evaluation of results finished")

     def _generate_reports(self, config_path: str, base_results_path: Path):
         """Generate evaluation reports."""
         if self.report_generator:
             self.report_generator.generate_report(base_results_path)

-    def
-        """Display a
+    def _display_summary(self, base_results_path: Path):
+        """Display a summary of the evaluation results in the terminal."""

         # Pre-process data to find column widths
         summary_data = []
@@ -628,7 +813,7 @@ class EvaluationRunner:
                 continue

             try:
-                results_data = self.file_service.load_json(
+                results_data = self.file_service.load_json(results_file)
                 model_name = results_data.get("model_name", model_dir.name)
                 max_model_len = max(max_model_len, len(model_name))

@@ -656,44 +841,42 @@ class EvaluationRunner:
                 summary_data.append((model_name, test_case_id, scores))

             except Exception as e:
-
+                log.error(f"Error processing results for {model_dir.name}: {e}")

-        # Print formatted output
         if not summary_data:
-
+            log.warning("No summary data to display.")
             return

-        # Define
-        headers = ["Tool Match", "Response Match", "LLM Eval"]
-
-        # Print header
+        # Define header line
         header_line = (
             f"{'Model':<{max_model_len}} | {'Test Case':<{max_test_case_len}} | "
             f"{'Tool Match':<12} | {'Response Match':<16} | {'LLM Eval':<10}"
         )
-
-
+        click.echo(click.style(header_line, fg="white", bold=True))
+        click.echo(click.style("-" * len(header_line), fg="white", bold=True))

-        # Print data rows
         for model_name, test_case_id, scores in summary_data:
             tool_score = scores.get("Tool Match", "N/A")
             response_score = scores.get("Response Match", "N/A")
             llm_score = scores.get("LLM Eval", "N/A")

-
-
-
+            click.echo(
+                click.style(
+                    f"{model_name:<{max_model_len}} | {test_case_id:<{max_test_case_len}} | "
+                    f"{tool_score:<12} | {response_score:<16} | {llm_score:<10}",
+                    fg="white",
+                )
            )

-    def _get_model_stats(self, model_path:
+    def _get_model_stats(self, model_path: Path) -> dict[str, any]:
         """Process results for a single model and return stats."""
         model_stats = {}
-        results_file =
-        if not
+        results_file = model_path / "results.json"
+        if not results_file.exists():
             return model_stats

         results_data = self.file_service.load_json(results_file)
-        model_name = results_data.get("model_name",
+        model_name = results_data.get("model_name", model_path.name)
         model_stats[model_name] = {}

         for test_case in results_data.get("test_cases", []):
@@ -718,32 +901,35 @@ class EvaluationRunner:
             model_stats[model_name][test_case_id] = scores
         return model_stats

-    def _save_execution_stats(self, base_results_path:
+    def _save_execution_stats(self, base_results_path: Path, start_time: float):
         """Save overall execution statistics."""
         end_time = time.time()
         total_execution_time = end_time - start_time
         stats = {"total_execution_time": total_execution_time, "models": {}}

         try:
-            for
-
-                if not os.path.isdir(model_path):
+            for model_path in base_results_path.iterdir():
+                if not model_path.is_dir():
                     continue
                 model_stats = self._get_model_stats(model_path)
                 stats["models"].update(model_stats)

         except Exception as e:
-
+            log.error(f"Error processing results for stats: {e}")

-        stats_path =
+        stats_path = base_results_path / "stats.json"
         self.file_service.save_json(stats, stats_path)

-
-
+        log.info(f"Overall stats written to {stats_path}")
+        log.info(f"Total execution time: {total_execution_time:.2f} seconds")


 def main(config_path: str, verbose: bool = False):
     """Main entry point for the evaluation script."""
+    if verbose:
+        logging.basicConfig(level=logging.INFO)
+        log.info("Verbose logging enabled.")
+
     orchestrator = EvaluationRunner(verbose=verbose)
     orchestrator.run_evaluation(config_path)
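For reference, a minimal sketch of how the refactored entry point above might be driven. The path configs/eval_suite.yaml is hypothetical; the suite schema (broker, model_configurations, workers, run_count, optional remote settings) is validated by EvaluationConfigLoader in solace_agent_mesh.evaluation.shared and is not reproduced here.

# Hedged sketch: invokes the evaluation runner added in this release.
# "configs/eval_suite.yaml" is an assumed, illustrative suite file path.
from solace_agent_mesh.evaluation.run import main

# Runs locally, or against a remote endpoint if the suite defines a `remote` section.
main("configs/eval_suite.yaml", verbose=True)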