solace-agent-mesh 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of solace-agent-mesh might be problematic. Click here for more details.
- solace_agent_mesh/agent/adk/callbacks.py +0 -5
- solace_agent_mesh/agent/adk/models/lite_llm.py +123 -8
- solace_agent_mesh/agent/adk/models/oauth2_token_manager.py +245 -0
- solace_agent_mesh/agent/protocol/event_handlers.py +40 -1
- solace_agent_mesh/agent/proxies/__init__.py +0 -0
- solace_agent_mesh/agent/proxies/a2a/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/a2a/app.py +55 -0
- solace_agent_mesh/agent/proxies/a2a/component.py +1115 -0
- solace_agent_mesh/agent/proxies/a2a/config.py +140 -0
- solace_agent_mesh/agent/proxies/a2a/oauth_token_cache.py +104 -0
- solace_agent_mesh/agent/proxies/base/__init__.py +3 -0
- solace_agent_mesh/agent/proxies/base/app.py +99 -0
- solace_agent_mesh/agent/proxies/base/component.py +619 -0
- solace_agent_mesh/agent/proxies/base/config.py +85 -0
- solace_agent_mesh/agent/proxies/base/proxy_task_context.py +17 -0
- solace_agent_mesh/agent/sac/app.py +9 -3
- solace_agent_mesh/agent/sac/component.py +160 -8
- solace_agent_mesh/agent/tools/audio_tools.py +125 -8
- solace_agent_mesh/agent/tools/web_tools.py +10 -5
- solace_agent_mesh/agent/utils/artifact_helpers.py +141 -3
- solace_agent_mesh/assets/docs/404.html +3 -3
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.eda4bcb2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.f4b15f3b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/71da7b71.38583438.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/77cf947d.48cb18a2.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/924ffdeb.8095e148.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.570c057b.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{ad71b5ed.60668e9e.js → ad71b5ed.af3ecfd1.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/ceb2a7a6.5d92d7d0.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{da0b5bad.9d369087.js → da0b5bad.d08a9466.js} +1 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.e98d12a1.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/de915948.27d6b065.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.e74a984d.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/f284c35a.42f59cdd.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.15b02f97.js +1 -0
- solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js → main.20feee82.js} +2 -2
- solace_agent_mesh/assets/docs/assets/js/runtime~main.0d198646.js +1 -0
- solace_agent_mesh/assets/docs/docs/documentation/components/agents/index.html +15 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/artifact-management/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/audio-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/data-analysis-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/embeds/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/builtin-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/cli/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/gateways/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/orchestrator/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/plugins/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/components/proxies/index.html +262 -0
- solace_agent_mesh/assets/docs/docs/documentation/deploying/debugging/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/deployment-options/index.html +31 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/deploying/observability/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/create-gateways/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-python-tools/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/creating-service-providers/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/evaluations/index.html +135 -0
- solace_agent_mesh/assets/docs/docs/documentation/developing/index.html +6 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/structure/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/bedrock-agents/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/custom-agent/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/event-mesh-gateway/index.html +5 -5
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mcp-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/mongodb-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rag-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/rest-gateway/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/slack-integration/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/developing/tutorials/sql-database/index.html +4 -4
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/rbac-setup-guide/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/enterprise/single-sign-on/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/architecture/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/introduction/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/getting-started/try-agent-mesh/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/configurations/index.html +6 -5
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/installation/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/large_language_models/index.html +100 -3
- solace_agent_mesh/assets/docs/docs/documentation/installing-and-configuring/run-project/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-gateway-upgrade-to-0.3.0/index.html +3 -3
- solace_agent_mesh/assets/docs/docs/documentation/migrations/a2a-upgrade/a2a-technical-migration-map/index.html +3 -3
- solace_agent_mesh/assets/docs/lunr-index-1761165361160.json +1 -0
- solace_agent_mesh/assets/docs/lunr-index.json +1 -1
- solace_agent_mesh/assets/docs/search-doc-1761165361160.json +1 -0
- solace_agent_mesh/assets/docs/search-doc.json +1 -1
- solace_agent_mesh/assets/docs/sitemap.xml +1 -1
- solace_agent_mesh/cli/__init__.py +1 -1
- solace_agent_mesh/cli/commands/add_cmd/agent_cmd.py +2 -69
- solace_agent_mesh/cli/commands/eval_cmd.py +11 -49
- solace_agent_mesh/cli/commands/init_cmd/__init__.py +0 -5
- solace_agent_mesh/cli/commands/init_cmd/env_step.py +10 -12
- solace_agent_mesh/cli/commands/init_cmd/orchestrator_step.py +9 -61
- solace_agent_mesh/cli/commands/init_cmd/webui_gateway_step.py +9 -49
- solace_agent_mesh/cli/commands/plugin_cmd/add_cmd.py +1 -2
- solace_agent_mesh/client/webui/frontend/static/assets/{authCallback-DwrxZE0E.js → authCallback-BTf6dqwp.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/{client-DarGQzyw.js → client-CaY59VuC.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-BGTaW0uv.js +342 -0
- solace_agent_mesh/client/webui/frontend/static/assets/main-DHJKSW1S.css +1 -0
- solace_agent_mesh/client/webui/frontend/static/assets/{vendor-BKIeiHj_.js → vendor-BEmvJSYz.js} +1 -1
- solace_agent_mesh/client/webui/frontend/static/auth-callback.html +3 -3
- solace_agent_mesh/client/webui/frontend/static/index.html +4 -4
- solace_agent_mesh/common/a2a/__init__.py +24 -0
- solace_agent_mesh/common/a2a/artifact.py +39 -0
- solace_agent_mesh/common/a2a/events.py +29 -0
- solace_agent_mesh/common/a2a/message.py +68 -0
- solace_agent_mesh/common/a2a/protocol.py +73 -1
- solace_agent_mesh/common/agent_registry.py +83 -3
- solace_agent_mesh/common/constants.py +3 -1
- solace_agent_mesh/common/utils/pydantic_utils.py +12 -0
- solace_agent_mesh/config_portal/backend/common.py +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-ByU1X1HD.js +98 -0
- solace_agent_mesh/config_portal/frontend/static/client/assets/{manifest-44d62be6.js → manifest-61038fc6.js} +1 -1
- solace_agent_mesh/config_portal/frontend/static/client/index.html +1 -1
- solace_agent_mesh/evaluation/evaluator.py +128 -104
- solace_agent_mesh/evaluation/message_organizer.py +116 -110
- solace_agent_mesh/evaluation/report_data_processor.py +84 -86
- solace_agent_mesh/evaluation/report_generator.py +73 -79
- solace_agent_mesh/evaluation/run.py +421 -235
- solace_agent_mesh/evaluation/shared/__init__.py +92 -0
- solace_agent_mesh/evaluation/shared/constants.py +47 -0
- solace_agent_mesh/evaluation/shared/exceptions.py +50 -0
- solace_agent_mesh/evaluation/shared/helpers.py +35 -0
- solace_agent_mesh/evaluation/shared/test_case_loader.py +167 -0
- solace_agent_mesh/evaluation/shared/test_suite_loader.py +280 -0
- solace_agent_mesh/evaluation/subscriber.py +111 -232
- solace_agent_mesh/evaluation/summary_builder.py +227 -117
- solace_agent_mesh/gateway/base/app.py +1 -1
- solace_agent_mesh/gateway/base/component.py +8 -1
- solace_agent_mesh/gateway/http_sse/alembic/versions/20251015_add_session_performance_indexes.py +70 -0
- solace_agent_mesh/gateway/http_sse/component.py +98 -2
- solace_agent_mesh/gateway/http_sse/dependencies.py +4 -4
- solace_agent_mesh/gateway/http_sse/main.py +2 -1
- solace_agent_mesh/gateway/http_sse/repository/chat_task_repository.py +12 -13
- solace_agent_mesh/gateway/http_sse/repository/feedback_repository.py +15 -18
- solace_agent_mesh/gateway/http_sse/repository/interfaces.py +25 -18
- solace_agent_mesh/gateway/http_sse/repository/session_repository.py +30 -26
- solace_agent_mesh/gateway/http_sse/repository/task_repository.py +35 -44
- solace_agent_mesh/gateway/http_sse/routers/agent_cards.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/artifacts.py +95 -203
- solace_agent_mesh/gateway/http_sse/routers/dto/responses/session_responses.py +4 -3
- solace_agent_mesh/gateway/http_sse/routers/sessions.py +2 -2
- solace_agent_mesh/gateway/http_sse/routers/tasks.py +33 -41
- solace_agent_mesh/gateway/http_sse/routers/visualization.py +17 -11
- solace_agent_mesh/gateway/http_sse/services/data_retention_service.py +4 -4
- solace_agent_mesh/gateway/http_sse/services/feedback_service.py +51 -43
- solace_agent_mesh/gateway/http_sse/services/session_service.py +20 -20
- solace_agent_mesh/gateway/http_sse/services/task_logger_service.py +8 -8
- solace_agent_mesh/gateway/http_sse/shared/base_repository.py +45 -71
- solace_agent_mesh/gateway/http_sse/shared/types.py +0 -18
- solace_agent_mesh/templates/gateway_config_template.yaml +0 -5
- solace_agent_mesh/templates/logging_config_template.ini +10 -6
- solace_agent_mesh/templates/plugin_gateway_config_template.yaml +0 -3
- solace_agent_mesh/templates/shared_config.yaml +40 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/METADATA +47 -21
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/RECORD +162 -141
- solace_agent_mesh/assets/docs/assets/js/5c2bd65f.e49689dd.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/6ad8f0bd.39d5851d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/71da7b71.804d6567.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/77cf947d.64c9bd6c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/9e9d0a82.dd810042.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/db924877.cbc66f02.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/de915948.139b4b9c.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/e6f9706b.582a78ca.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/f284c35a.5766a13d.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/ff4d71f2.9c0297a6.js +0 -1
- solace_agent_mesh/assets/docs/assets/js/runtime~main.18dc45dd.js +0 -1
- solace_agent_mesh/assets/docs/lunr-index-1760121512891.json +0 -1
- solace_agent_mesh/assets/docs/search-doc-1760121512891.json +0 -1
- solace_agent_mesh/client/webui/frontend/static/assets/main-2nd1gbaH.js +0 -339
- solace_agent_mesh/client/webui/frontend/static/assets/main-DoKXctCM.css +0 -1
- solace_agent_mesh/config_portal/frontend/static/client/assets/_index-BNuqpWDc.js +0 -98
- solace_agent_mesh/evaluation/config_loader.py +0 -657
- solace_agent_mesh/evaluation/test_case_loader.py +0 -714
- /solace_agent_mesh/assets/docs/assets/js/{main.bd3c34f3.js.LICENSE.txt → main.20feee82.js.LICENSE.txt} +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/WHEEL +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/entry_points.txt +0 -0
- {solace_agent_mesh-1.5.1.dist-info → solace_agent_mesh-1.6.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
window.__remixManifest={"entry":{"module":"/assets/entry.client-mvZjNKiz.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":[]},"routes":{"root":{"id":"root","path":"","hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/root-BWvk5-gF.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":["/assets/root-DxRwaWiE.css"]},"routes/_index":{"id":"routes/_index","parentId":"root","index":true,"hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/_index-
|
|
1
|
+
window.__remixManifest={"entry":{"module":"/assets/entry.client-mvZjNKiz.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":[]},"routes":{"root":{"id":"root","path":"","hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/root-BWvk5-gF.js","imports":["/assets/index-DzNKzXrc.js","/assets/components-Rk0n-9cK.js"],"css":["/assets/root-DxRwaWiE.css"]},"routes/_index":{"id":"routes/_index","parentId":"root","index":true,"hasAction":false,"hasLoader":false,"hasClientAction":false,"hasClientLoader":false,"hasErrorBoundary":false,"module":"/assets/_index-ByU1X1HD.js","imports":["/assets/index-DzNKzXrc.js"],"css":[]}},"url":"/assets/manifest-61038fc6.js","version":"61038fc6"};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<!DOCTYPE html>
|
|
2
|
-
<html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/assets/root-DxRwaWiE.css"/><link rel="preconnect" href="https://fonts.googleapis.com"/><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="anonymous"/><link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&display=swap"/></head><body><p>Loading...</p><link rel="modulepreload" href="/assets/manifest-
|
|
2
|
+
<html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/assets/root-DxRwaWiE.css"/><link rel="preconnect" href="https://fonts.googleapis.com"/><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="anonymous"/><link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:ital,opsz,wght@0,14..32,100..900;1,14..32,100..900&display=swap"/></head><body><p>Loading...</p><link rel="modulepreload" href="/assets/manifest-61038fc6.js"/><link rel="modulepreload" href="/assets/entry.client-mvZjNKiz.js"/><link rel="modulepreload" href="/assets/index-DzNKzXrc.js"/><link rel="modulepreload" href="/assets/components-Rk0n-9cK.js"/><link rel="modulepreload" href="/assets/root-BWvk5-gF.js"/><script>window.__remixContext = {"basename":"/","future":{"v3_fetcherPersist":false,"v3_relativeSplatPath":false,"v3_throwAbortReason":false,"v3_routeConfig":false,"v3_singleFetch":false,"v3_lazyRouteDiscovery":false,"unstable_optimizeDeps":false},"isSpaMode":true,"state":{"loaderData":{"root":null,"routes/_index":null},"actionData":null,"errors":null}};</script><script type="module" async="">import "/assets/manifest-61038fc6.js";
|
|
3
3
|
import * as route0 from "/assets/root-BWvk5-gF.js";
|
|
4
4
|
|
|
5
5
|
window.__remixRouteModules = {"root":route0};
|
|
@@ -5,27 +5,25 @@ This module evaluates AI model performance against test cases using multiple eva
|
|
|
5
5
|
|
|
6
6
|
import concurrent.futures
|
|
7
7
|
import json
|
|
8
|
-
import
|
|
8
|
+
import logging
|
|
9
9
|
import re
|
|
10
|
-
import sys
|
|
11
10
|
from abc import ABC, abstractmethod
|
|
12
11
|
from collections import defaultdict
|
|
13
12
|
from dataclasses import dataclass, field
|
|
14
|
-
from
|
|
15
|
-
import logging
|
|
13
|
+
from pathlib import Path
|
|
16
14
|
|
|
15
|
+
import litellm
|
|
17
16
|
import numpy as np
|
|
18
17
|
from rouge import Rouge
|
|
19
|
-
import litellm
|
|
20
|
-
|
|
21
|
-
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
22
|
-
from evaluation.config_loader import ConfigLoader
|
|
23
|
-
from evaluation.test_case_loader import load_test_case
|
|
24
18
|
|
|
25
|
-
|
|
26
|
-
|
|
19
|
+
from .shared import (
|
|
20
|
+
EvaluationConfigLoader,
|
|
21
|
+
EvaluationOptions,
|
|
22
|
+
TestSuiteConfiguration,
|
|
23
|
+
load_test_case,
|
|
24
|
+
)
|
|
27
25
|
|
|
28
|
-
|
|
26
|
+
log = logging.getLogger(__name__)
|
|
29
27
|
|
|
30
28
|
|
|
31
29
|
@dataclass
|
|
@@ -35,14 +33,14 @@ class EvaluationResult:
|
|
|
35
33
|
run_number: int
|
|
36
34
|
test_case_id: str
|
|
37
35
|
test_case_path: str
|
|
38
|
-
tool_match_score:
|
|
39
|
-
response_match_score:
|
|
40
|
-
llm_eval_score:
|
|
41
|
-
llm_eval_reasoning:
|
|
42
|
-
duration_seconds:
|
|
43
|
-
errors:
|
|
44
|
-
|
|
45
|
-
def to_dict(self) ->
|
|
36
|
+
tool_match_score: float | None = None
|
|
37
|
+
response_match_score: float | None = None
|
|
38
|
+
llm_eval_score: float | None = None
|
|
39
|
+
llm_eval_reasoning: str | None = None
|
|
40
|
+
duration_seconds: float | None = None
|
|
41
|
+
errors: list[str] = field(default_factory=list)
|
|
42
|
+
|
|
43
|
+
def to_dict(self) -> dict[str, any]:
|
|
46
44
|
"""Convert to dictionary format for JSON serialization."""
|
|
47
45
|
result = {
|
|
48
46
|
"run": self.run_number,
|
|
@@ -74,10 +72,10 @@ class ScoreStatistics:
|
|
|
74
72
|
"""Statistical summary of evaluation scores."""
|
|
75
73
|
|
|
76
74
|
average: float
|
|
77
|
-
distribution:
|
|
75
|
+
distribution: dict[str, float]
|
|
78
76
|
|
|
79
77
|
@classmethod
|
|
80
|
-
def from_scores(cls, scores:
|
|
78
|
+
def from_scores(cls, scores: list[float]) -> "ScoreStatistics":
|
|
81
79
|
"""Create statistics from a list of scores."""
|
|
82
80
|
if not scores:
|
|
83
81
|
return cls(
|
|
@@ -103,13 +101,13 @@ class TestCaseResults:
|
|
|
103
101
|
|
|
104
102
|
test_case_id: str
|
|
105
103
|
category: str
|
|
106
|
-
runs:
|
|
104
|
+
runs: list[EvaluationResult]
|
|
107
105
|
average_duration: float
|
|
108
106
|
tool_match_scores: ScoreStatistics
|
|
109
107
|
response_match_scores: ScoreStatistics
|
|
110
108
|
llm_eval_scores: ScoreStatistics
|
|
111
109
|
|
|
112
|
-
def to_dict(self) ->
|
|
110
|
+
def to_dict(self) -> dict[str, any]:
|
|
113
111
|
"""Convert to dictionary format for JSON serialization."""
|
|
114
112
|
return {
|
|
115
113
|
"test_case_id": self.test_case_id,
|
|
@@ -136,10 +134,10 @@ class ModelResults:
|
|
|
136
134
|
"""Complete evaluation results for a model."""
|
|
137
135
|
|
|
138
136
|
model_name: str
|
|
139
|
-
total_execution_time:
|
|
140
|
-
test_cases:
|
|
137
|
+
total_execution_time: float | None
|
|
138
|
+
test_cases: list[TestCaseResults]
|
|
141
139
|
|
|
142
|
-
def to_dict(self) ->
|
|
140
|
+
def to_dict(self) -> dict[str, any]:
|
|
143
141
|
"""Convert to dictionary format for JSON serialization."""
|
|
144
142
|
return {
|
|
145
143
|
"model_name": self.model_name,
|
|
@@ -152,71 +150,63 @@ class ConfigurationService:
|
|
|
152
150
|
"""Handles configuration loading and validation."""
|
|
153
151
|
|
|
154
152
|
def __init__(self, config_path: str):
|
|
155
|
-
self.config_loader =
|
|
153
|
+
self.config_loader = EvaluationConfigLoader(config_path)
|
|
156
154
|
self._config_cache = None
|
|
157
155
|
self._evaluation_settings_cache = None
|
|
158
156
|
|
|
159
|
-
def get_config(self) ->
|
|
157
|
+
def get_config(self) -> TestSuiteConfiguration:
|
|
160
158
|
"""Get the main configuration."""
|
|
161
159
|
if self._config_cache is None:
|
|
162
|
-
self._config_cache = self.config_loader.
|
|
160
|
+
self._config_cache = self.config_loader.load_configuration()
|
|
163
161
|
return self._config_cache
|
|
164
162
|
|
|
165
|
-
def get_evaluation_settings(self) ->
|
|
163
|
+
def get_evaluation_settings(self) -> EvaluationOptions:
|
|
166
164
|
"""Get evaluation settings."""
|
|
167
165
|
if self._evaluation_settings_cache is None:
|
|
168
|
-
self._evaluation_settings_cache = (
|
|
169
|
-
self.config_loader.get_evaluation_settings()
|
|
170
|
-
)
|
|
166
|
+
self._evaluation_settings_cache = self.config_loader.get_evaluation_options()
|
|
171
167
|
return self._evaluation_settings_cache
|
|
172
168
|
|
|
173
|
-
def get_results_path(self) -> str:
|
|
174
|
-
"""Get the base results path."""
|
|
175
|
-
config = self.get_config()
|
|
176
|
-
results_dir_name = config["results_dir_name"]
|
|
177
|
-
return os.path.join(SCRIPT_DIR, "results", results_dir_name)
|
|
178
|
-
|
|
179
169
|
|
|
180
170
|
class FileService:
|
|
181
171
|
"""Handles file I/O operations."""
|
|
182
172
|
|
|
183
173
|
@staticmethod
|
|
184
|
-
def load_json(filepath:
|
|
174
|
+
def load_json(filepath: Path) -> any:
|
|
185
175
|
"""Load JSON data from file."""
|
|
186
176
|
try:
|
|
187
|
-
with open(
|
|
177
|
+
with filepath.open() as f:
|
|
188
178
|
return json.load(f)
|
|
189
179
|
except (FileNotFoundError, json.JSONDecodeError) as e:
|
|
190
|
-
|
|
180
|
+
log.error(f"Failed to load JSON from {filepath}: {e}")
|
|
191
181
|
raise
|
|
192
182
|
|
|
193
183
|
@staticmethod
|
|
194
|
-
def save_json(data:
|
|
184
|
+
def save_json(data: any, filepath: Path):
|
|
195
185
|
"""Save data as JSON to file."""
|
|
196
186
|
try:
|
|
197
|
-
|
|
198
|
-
with open(
|
|
187
|
+
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
188
|
+
with filepath.open("w") as f:
|
|
199
189
|
json.dump(data, f, indent=4)
|
|
200
190
|
except Exception as e:
|
|
201
|
-
|
|
191
|
+
log.error(f"Failed to save JSON to {filepath}: {e}")
|
|
202
192
|
raise
|
|
203
193
|
|
|
204
194
|
@staticmethod
|
|
205
|
-
def file_exists(filepath:
|
|
195
|
+
def file_exists(filepath: Path) -> bool:
|
|
206
196
|
"""Check if file exists."""
|
|
207
|
-
return
|
|
197
|
+
return filepath.exists()
|
|
208
198
|
|
|
209
199
|
|
|
210
200
|
class StatisticsService:
|
|
211
201
|
"""Handles statistical calculations and aggregations."""
|
|
212
202
|
|
|
213
203
|
@staticmethod
|
|
214
|
-
def calculate_score_statistics(scores:
|
|
204
|
+
def calculate_score_statistics(scores: list[float]) -> ScoreStatistics:
|
|
215
205
|
"""Calculate statistical summary for a list of scores."""
|
|
216
206
|
return ScoreStatistics.from_scores(scores)
|
|
217
207
|
|
|
218
208
|
@staticmethod
|
|
219
|
-
def calculate_average_duration(durations:
|
|
209
|
+
def calculate_average_duration(durations: list[float]) -> float:
|
|
220
210
|
"""Calculate average duration from a list of durations."""
|
|
221
211
|
if not durations:
|
|
222
212
|
return 0.0
|
|
@@ -228,8 +218,8 @@ class EvaluationStrategy(ABC):
|
|
|
228
218
|
|
|
229
219
|
@abstractmethod
|
|
230
220
|
def evaluate(
|
|
231
|
-
self, test_case:
|
|
232
|
-
) ->
|
|
221
|
+
self, test_case: dict[str, any], summary_data: dict[str, any]
|
|
222
|
+
) -> float | None:
|
|
233
223
|
"""Evaluate a test case run and return a score."""
|
|
234
224
|
pass
|
|
235
225
|
|
|
@@ -238,8 +228,8 @@ class ToolMatchEvaluator(EvaluationStrategy):
|
|
|
238
228
|
"""Evaluates tool usage against expected tools."""
|
|
239
229
|
|
|
240
230
|
def evaluate(
|
|
241
|
-
self, test_case:
|
|
242
|
-
) ->
|
|
231
|
+
self, test_case: dict[str, any], summary_data: dict[str, any]
|
|
232
|
+
) -> float | None:
|
|
243
233
|
"""Evaluate tool matching score."""
|
|
244
234
|
try:
|
|
245
235
|
expected_tools = test_case["evaluation"]["expected_tools"]
|
|
@@ -257,7 +247,7 @@ class ToolMatchEvaluator(EvaluationStrategy):
|
|
|
257
247
|
return len(found_tools) / len(expected_set)
|
|
258
248
|
|
|
259
249
|
except (KeyError, TypeError) as e:
|
|
260
|
-
|
|
250
|
+
log.warning(f"Error in tool match evaluation: {e}")
|
|
261
251
|
return None
|
|
262
252
|
|
|
263
253
|
|
|
@@ -268,8 +258,8 @@ class ResponseMatchEvaluator(EvaluationStrategy):
|
|
|
268
258
|
self.rouge = Rouge()
|
|
269
259
|
|
|
270
260
|
def evaluate(
|
|
271
|
-
self, test_case:
|
|
272
|
-
) ->
|
|
261
|
+
self, test_case: dict[str, any], summary_data: dict[str, any]
|
|
262
|
+
) -> float | None:
|
|
273
263
|
"""Evaluate response matching score using a weighted ROUGE average."""
|
|
274
264
|
try:
|
|
275
265
|
expected_response = test_case["evaluation"]["expected_response"]
|
|
@@ -290,14 +280,14 @@ class ResponseMatchEvaluator(EvaluationStrategy):
|
|
|
290
280
|
return weighted_score
|
|
291
281
|
|
|
292
282
|
except (ValueError, KeyError, TypeError) as e:
|
|
293
|
-
|
|
283
|
+
log.warning(f"Error in response match evaluation: {e}")
|
|
294
284
|
return 0.0
|
|
295
285
|
|
|
296
286
|
|
|
297
287
|
class LLMEvaluator(EvaluationStrategy):
|
|
298
288
|
"""Evaluates responses using an LLM judge."""
|
|
299
289
|
|
|
300
|
-
def __init__(self, llm_config:
|
|
290
|
+
def __init__(self, llm_config: dict[str, any]):
|
|
301
291
|
self.model = llm_config.get("LLM_SERVICE_PLANNING_MODEL_NAME")
|
|
302
292
|
self.api_key = llm_config.get("LLM_SERVICE_API_KEY")
|
|
303
293
|
self.api_base = llm_config.get("LLM_SERVICE_ENDPOINT")
|
|
@@ -308,8 +298,8 @@ class LLMEvaluator(EvaluationStrategy):
|
|
|
308
298
|
)
|
|
309
299
|
|
|
310
300
|
def evaluate(
|
|
311
|
-
self, test_case:
|
|
312
|
-
) ->
|
|
301
|
+
self, test_case: dict[str, any], summary_data: dict[str, any]
|
|
302
|
+
) -> dict[str, any] | None:
|
|
313
303
|
"""Evaluate response using LLM and return score with reasoning."""
|
|
314
304
|
try:
|
|
315
305
|
query = test_case["query"]
|
|
@@ -342,7 +332,7 @@ class LLMEvaluator(EvaluationStrategy):
|
|
|
342
332
|
return {"score": score, "reasoning": reasoning}
|
|
343
333
|
|
|
344
334
|
except Exception as e:
|
|
345
|
-
|
|
335
|
+
log.error(f"Error in LLM evaluation: {e}")
|
|
346
336
|
return None
|
|
347
337
|
|
|
348
338
|
def _build_evaluation_prompt(
|
|
@@ -351,8 +341,8 @@ class LLMEvaluator(EvaluationStrategy):
|
|
|
351
341
|
expected_response: str,
|
|
352
342
|
actual_response: str,
|
|
353
343
|
criterion: str,
|
|
354
|
-
input_artifacts:
|
|
355
|
-
output_artifacts:
|
|
344
|
+
input_artifacts: list[dict],
|
|
345
|
+
output_artifacts: list[dict],
|
|
356
346
|
) -> str:
|
|
357
347
|
"""Build the evaluation prompt for the LLM."""
|
|
358
348
|
return f"""
|
|
@@ -367,7 +357,7 @@ class LLMEvaluator(EvaluationStrategy):
|
|
|
367
357
|
Format your response exactly as:
|
|
368
358
|
Score: [0.0-1.0]
|
|
369
359
|
Reasoning: [Your detailed explanation of why you gave this score, considering both the response and any artifacts created]
|
|
370
|
-
|
|
360
|
+
|
|
371
361
|
Provide a score from 0.0 to 1.0 where:
|
|
372
362
|
- 1.0 = Excellent: Fully meets the criterion and expectations
|
|
373
363
|
- 0.8-0.9 = Good: Mostly meets the criterion with minor issues
|
|
@@ -415,7 +405,7 @@ class LLMEvaluator(EvaluationStrategy):
|
|
|
415
405
|
class RunEvaluator:
|
|
416
406
|
"""Evaluates individual test runs."""
|
|
417
407
|
|
|
418
|
-
def __init__(self, evaluation_settings:
|
|
408
|
+
def __init__(self, evaluation_settings: dict[str, any]):
|
|
419
409
|
self.evaluation_settings = evaluation_settings
|
|
420
410
|
self.file_service = FileService()
|
|
421
411
|
|
|
@@ -437,24 +427,25 @@ class RunEvaluator:
|
|
|
437
427
|
llm_config = evaluation_settings["llm_evaluator"]["env"]
|
|
438
428
|
self.llm_evaluator = LLMEvaluator(llm_config)
|
|
439
429
|
except Exception as e:
|
|
440
|
-
|
|
430
|
+
log.error(f"Failed to initialize LLM evaluator: {e}")
|
|
441
431
|
|
|
442
432
|
def evaluate_run(
|
|
443
433
|
self,
|
|
444
434
|
run_number: int,
|
|
445
|
-
run_path:
|
|
446
|
-
test_case:
|
|
435
|
+
run_path: Path,
|
|
436
|
+
test_case: dict[str, any],
|
|
447
437
|
test_case_path: str,
|
|
448
|
-
) ->
|
|
438
|
+
) -> EvaluationResult | None:
|
|
449
439
|
"""Evaluate a single test run."""
|
|
450
|
-
|
|
440
|
+
log.info(
|
|
451
441
|
f" - Evaluating run {run_number} for test case {test_case['test_case_id']}"
|
|
452
442
|
)
|
|
453
443
|
|
|
454
444
|
# Load summary data
|
|
455
|
-
summary_path =
|
|
445
|
+
summary_path = run_path / "summary.json"
|
|
446
|
+
log.info(f"Summary file path: {summary_path}")
|
|
456
447
|
if not self.file_service.file_exists(summary_path):
|
|
457
|
-
|
|
448
|
+
log.warning(
|
|
458
449
|
f" Summary file not found for run {run_number}, skipping."
|
|
459
450
|
)
|
|
460
451
|
return None
|
|
@@ -462,7 +453,7 @@ class RunEvaluator:
|
|
|
462
453
|
try:
|
|
463
454
|
summary_data = self.file_service.load_json(summary_path)
|
|
464
455
|
except Exception as e:
|
|
465
|
-
|
|
456
|
+
log.error(f" Error loading summary.json for run {run_number}: {e}")
|
|
466
457
|
return None
|
|
467
458
|
|
|
468
459
|
# Create evaluation result
|
|
@@ -496,7 +487,7 @@ class RunEvaluator:
|
|
|
496
487
|
class ModelEvaluator:
|
|
497
488
|
"""Evaluates all runs for a single model."""
|
|
498
489
|
|
|
499
|
-
def __init__(self, config:
|
|
490
|
+
def __init__(self, config: dict[str, any], evaluation_settings: dict[str, any]):
|
|
500
491
|
self.config = config
|
|
501
492
|
self.evaluation_settings = evaluation_settings
|
|
502
493
|
self.run_evaluator = RunEvaluator(evaluation_settings)
|
|
@@ -504,9 +495,9 @@ class ModelEvaluator:
|
|
|
504
495
|
|
|
505
496
|
def evaluate_model(self, model_name: str, base_results_path: str) -> ModelResults:
|
|
506
497
|
"""Evaluate all test cases for a model."""
|
|
507
|
-
|
|
498
|
+
log.info(f"Evaluating model: {model_name}")
|
|
508
499
|
|
|
509
|
-
model_results_path =
|
|
500
|
+
model_results_path = Path(base_results_path) / model_name
|
|
510
501
|
|
|
511
502
|
# Collect all evaluation tasks
|
|
512
503
|
tasks = self._collect_evaluation_tasks(model_results_path)
|
|
@@ -525,7 +516,7 @@ class ModelEvaluator:
|
|
|
525
516
|
if result:
|
|
526
517
|
model_results_data[result.test_case_id].append(result)
|
|
527
518
|
except Exception as e:
|
|
528
|
-
|
|
519
|
+
log.error(f"An error occurred during evaluation: {e}")
|
|
529
520
|
|
|
530
521
|
# Aggregate results by test case
|
|
531
522
|
test_case_results = []
|
|
@@ -541,24 +532,24 @@ class ModelEvaluator:
|
|
|
541
532
|
)
|
|
542
533
|
|
|
543
534
|
def _collect_evaluation_tasks(
|
|
544
|
-
self, model_results_path:
|
|
545
|
-
) ->
|
|
535
|
+
self, model_results_path: Path
|
|
536
|
+
) -> list[tuple[int, Path, dict[str, any], str]]:
|
|
546
537
|
"""Collect all evaluation tasks for the model."""
|
|
547
538
|
tasks = []
|
|
548
539
|
|
|
549
540
|
for test_case_path in self.config["test_cases"]:
|
|
550
541
|
test_case = load_test_case(test_case_path)
|
|
551
|
-
|
|
552
|
-
test_case_results_path =
|
|
542
|
+
test_case_name = Path(test_case_path).stem.replace(".test", "")
|
|
543
|
+
test_case_results_path = model_results_path / test_case_name
|
|
553
544
|
|
|
554
545
|
for i in range(1, self.config["runs"] + 1):
|
|
555
|
-
run_path =
|
|
546
|
+
run_path = test_case_results_path / f"run_{i}"
|
|
556
547
|
tasks.append((i, run_path, test_case, test_case_path))
|
|
557
548
|
|
|
558
549
|
return tasks
|
|
559
550
|
|
|
560
551
|
def _aggregate_test_case_results(
|
|
561
|
-
self, test_case_id: str, runs:
|
|
552
|
+
self, test_case_id: str, runs: list[EvaluationResult]
|
|
562
553
|
) -> TestCaseResults:
|
|
563
554
|
"""Aggregate results for a test case across multiple runs."""
|
|
564
555
|
# Load test case to get category
|
|
@@ -604,11 +595,11 @@ class ResultsWriter:
|
|
|
604
595
|
|
|
605
596
|
def write_model_results(self, model_results: ModelResults, base_results_path: str):
|
|
606
597
|
"""Write model results to file."""
|
|
607
|
-
results_path =
|
|
608
|
-
base_results_path
|
|
598
|
+
results_path = (
|
|
599
|
+
Path(base_results_path) / model_results.model_name / "results.json"
|
|
609
600
|
)
|
|
610
601
|
self.file_service.save_json(model_results.to_dict(), results_path)
|
|
611
|
-
|
|
602
|
+
log.info(
|
|
612
603
|
f"Results for model {model_results.model_name} written to {results_path}"
|
|
613
604
|
)
|
|
614
605
|
|
|
@@ -623,10 +614,13 @@ class EvaluationOrchestrator:
|
|
|
623
614
|
def run_evaluation(
|
|
624
615
|
self,
|
|
625
616
|
base_results_path: str,
|
|
626
|
-
model_execution_times:
|
|
617
|
+
model_execution_times: dict[str, float] | None = None,
|
|
627
618
|
):
|
|
628
619
|
"""Main entry point for the evaluation process."""
|
|
629
|
-
|
|
620
|
+
log.info("Starting evaluation")
|
|
621
|
+
|
|
622
|
+
# Resolve to an absolute path to ensure consistency
|
|
623
|
+
base_results_path = str(Path(base_results_path).resolve())
|
|
630
624
|
|
|
631
625
|
if model_execution_times is None:
|
|
632
626
|
model_execution_times = {}
|
|
@@ -634,32 +628,62 @@ class EvaluationOrchestrator:
|
|
|
634
628
|
config = self.config_service.get_config()
|
|
635
629
|
evaluation_settings = self.config_service.get_evaluation_settings()
|
|
636
630
|
|
|
637
|
-
|
|
631
|
+
# Convert evaluation settings to dict format for backwards compatibility
|
|
632
|
+
settings_dict = {
|
|
633
|
+
"tool_match": {"enabled": evaluation_settings.tool_matching_enabled},
|
|
634
|
+
"response_match": {"enabled": evaluation_settings.response_matching_enabled},
|
|
635
|
+
"llm_evaluator": {
|
|
636
|
+
"enabled": evaluation_settings.llm_evaluation_enabled,
|
|
637
|
+
"env": evaluation_settings.llm_evaluator_environment.variables if evaluation_settings.llm_evaluator_environment else {}
|
|
638
|
+
}
|
|
639
|
+
}
|
|
638
640
|
|
|
639
|
-
for
|
|
640
|
-
|
|
641
|
+
# Convert config to dict format for backwards compatibility
|
|
642
|
+
config_dict = {
|
|
643
|
+
"test_cases": config.test_case_files,
|
|
644
|
+
"runs": config.run_count
|
|
645
|
+
}
|
|
641
646
|
|
|
642
|
-
|
|
643
|
-
model_results = model_evaluator.evaluate_model(
|
|
644
|
-
model_name, base_results_path
|
|
645
|
-
)
|
|
647
|
+
model_evaluator = ModelEvaluator(config_dict, settings_dict)
|
|
646
648
|
|
|
647
|
-
|
|
649
|
+
if config.remote:
|
|
650
|
+
# Handle remote evaluation
|
|
651
|
+
model_name = "remote"
|
|
652
|
+
model_results = model_evaluator.evaluate_model(model_name, base_results_path)
|
|
648
653
|
execution_time = model_execution_times.get(model_name)
|
|
649
654
|
if execution_time is not None:
|
|
650
655
|
model_results.total_execution_time = execution_time
|
|
651
|
-
|
|
652
|
-
# Write results to file
|
|
653
656
|
self.results_writer.write_model_results(model_results, base_results_path)
|
|
657
|
+
else:
|
|
658
|
+
# Handle local evaluation
|
|
659
|
+
for model_config in config.model_configurations:
|
|
660
|
+
model_name = model_config.name
|
|
661
|
+
|
|
662
|
+
# Evaluate the model
|
|
663
|
+
model_results = model_evaluator.evaluate_model(
|
|
664
|
+
model_name, base_results_path
|
|
665
|
+
)
|
|
666
|
+
|
|
667
|
+
# Add execution time if available
|
|
668
|
+
execution_time = model_execution_times.get(model_name)
|
|
669
|
+
if execution_time is not None:
|
|
670
|
+
model_results.total_execution_time = execution_time
|
|
671
|
+
|
|
672
|
+
# Write results to file
|
|
673
|
+
self.results_writer.write_model_results(model_results, base_results_path)
|
|
654
674
|
|
|
655
|
-
|
|
675
|
+
log.info("--- Evaluation finished ---")
|
|
656
676
|
|
|
657
677
|
|
|
658
|
-
def main(config_path: str
|
|
678
|
+
def main(config_path: str):
|
|
659
679
|
"""Main entry point for command-line usage."""
|
|
660
680
|
orchestrator = EvaluationOrchestrator(config_path)
|
|
661
|
-
|
|
662
|
-
|
|
681
|
+
# Results path should be based on the current working directory, not the package location.
|
|
682
|
+
# This main function is for standalone testing.
|
|
683
|
+
config = orchestrator.config_service.get_config()
|
|
684
|
+
results_path = Path.cwd() / "results" / config.results_directory
|
|
685
|
+
results_path.mkdir(parents=True, exist_ok=True)
|
|
686
|
+
orchestrator.run_evaluation(str(results_path))
|
|
663
687
|
|
|
664
688
|
|
|
665
689
|
if __name__ == "__main__":
|