vanna 0.7.9__py3-none-any.whl → 2.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vanna/__init__.py +167 -395
- vanna/agents/__init__.py +7 -0
- vanna/capabilities/__init__.py +17 -0
- vanna/capabilities/agent_memory/__init__.py +21 -0
- vanna/capabilities/agent_memory/base.py +103 -0
- vanna/capabilities/agent_memory/models.py +53 -0
- vanna/capabilities/file_system/__init__.py +14 -0
- vanna/capabilities/file_system/base.py +71 -0
- vanna/capabilities/file_system/models.py +25 -0
- vanna/capabilities/sql_runner/__init__.py +13 -0
- vanna/capabilities/sql_runner/base.py +37 -0
- vanna/capabilities/sql_runner/models.py +13 -0
- vanna/components/__init__.py +92 -0
- vanna/components/base.py +11 -0
- vanna/components/rich/__init__.py +83 -0
- vanna/components/rich/containers/__init__.py +7 -0
- vanna/components/rich/containers/card.py +20 -0
- vanna/components/rich/data/__init__.py +9 -0
- vanna/components/rich/data/chart.py +17 -0
- vanna/components/rich/data/dataframe.py +93 -0
- vanna/components/rich/feedback/__init__.py +21 -0
- vanna/components/rich/feedback/badge.py +16 -0
- vanna/components/rich/feedback/icon_text.py +14 -0
- vanna/components/rich/feedback/log_viewer.py +41 -0
- vanna/components/rich/feedback/notification.py +19 -0
- vanna/components/rich/feedback/progress.py +37 -0
- vanna/components/rich/feedback/status_card.py +28 -0
- vanna/components/rich/feedback/status_indicator.py +14 -0
- vanna/components/rich/interactive/__init__.py +21 -0
- vanna/components/rich/interactive/button.py +95 -0
- vanna/components/rich/interactive/task_list.py +58 -0
- vanna/components/rich/interactive/ui_state.py +93 -0
- vanna/components/rich/specialized/__init__.py +7 -0
- vanna/components/rich/specialized/artifact.py +20 -0
- vanna/components/rich/text.py +16 -0
- vanna/components/simple/__init__.py +15 -0
- vanna/components/simple/image.py +15 -0
- vanna/components/simple/link.py +15 -0
- vanna/components/simple/text.py +11 -0
- vanna/core/__init__.py +193 -0
- vanna/core/_compat.py +19 -0
- vanna/core/agent/__init__.py +10 -0
- vanna/core/agent/agent.py +1407 -0
- vanna/core/agent/config.py +123 -0
- vanna/core/audit/__init__.py +28 -0
- vanna/core/audit/base.py +299 -0
- vanna/core/audit/models.py +131 -0
- vanna/core/component_manager.py +329 -0
- vanna/core/components.py +53 -0
- vanna/core/enhancer/__init__.py +11 -0
- vanna/core/enhancer/base.py +94 -0
- vanna/core/enhancer/default.py +118 -0
- vanna/core/enricher/__init__.py +10 -0
- vanna/core/enricher/base.py +59 -0
- vanna/core/errors.py +47 -0
- vanna/core/evaluation/__init__.py +81 -0
- vanna/core/evaluation/base.py +186 -0
- vanna/core/evaluation/dataset.py +254 -0
- vanna/core/evaluation/evaluators.py +376 -0
- vanna/core/evaluation/report.py +289 -0
- vanna/core/evaluation/runner.py +313 -0
- vanna/core/filter/__init__.py +10 -0
- vanna/core/filter/base.py +67 -0
- vanna/core/lifecycle/__init__.py +10 -0
- vanna/core/lifecycle/base.py +83 -0
- vanna/core/llm/__init__.py +16 -0
- vanna/core/llm/base.py +40 -0
- vanna/core/llm/models.py +61 -0
- vanna/core/middleware/__init__.py +10 -0
- vanna/core/middleware/base.py +69 -0
- vanna/core/observability/__init__.py +11 -0
- vanna/core/observability/base.py +88 -0
- vanna/core/observability/models.py +47 -0
- vanna/core/recovery/__init__.py +11 -0
- vanna/core/recovery/base.py +84 -0
- vanna/core/recovery/models.py +32 -0
- vanna/core/registry.py +278 -0
- vanna/core/rich_component.py +156 -0
- vanna/core/simple_component.py +27 -0
- vanna/core/storage/__init__.py +14 -0
- vanna/core/storage/base.py +46 -0
- vanna/core/storage/models.py +46 -0
- vanna/core/system_prompt/__init__.py +13 -0
- vanna/core/system_prompt/base.py +36 -0
- vanna/core/system_prompt/default.py +157 -0
- vanna/core/tool/__init__.py +18 -0
- vanna/core/tool/base.py +70 -0
- vanna/core/tool/models.py +84 -0
- vanna/core/user/__init__.py +17 -0
- vanna/core/user/base.py +29 -0
- vanna/core/user/models.py +25 -0
- vanna/core/user/request_context.py +70 -0
- vanna/core/user/resolver.py +42 -0
- vanna/core/validation.py +164 -0
- vanna/core/workflow/__init__.py +12 -0
- vanna/core/workflow/base.py +254 -0
- vanna/core/workflow/default.py +789 -0
- vanna/examples/__init__.py +1 -0
- vanna/examples/__main__.py +44 -0
- vanna/examples/anthropic_quickstart.py +80 -0
- vanna/examples/artifact_example.py +293 -0
- vanna/examples/claude_sqlite_example.py +236 -0
- vanna/examples/coding_agent_example.py +300 -0
- vanna/examples/custom_system_prompt_example.py +174 -0
- vanna/examples/default_workflow_handler_example.py +208 -0
- vanna/examples/email_auth_example.py +340 -0
- vanna/examples/evaluation_example.py +269 -0
- vanna/examples/extensibility_example.py +262 -0
- vanna/examples/minimal_example.py +67 -0
- vanna/examples/mock_auth_example.py +227 -0
- vanna/examples/mock_custom_tool.py +311 -0
- vanna/examples/mock_quickstart.py +79 -0
- vanna/examples/mock_quota_example.py +145 -0
- vanna/examples/mock_rich_components_demo.py +396 -0
- vanna/examples/mock_sqlite_example.py +223 -0
- vanna/examples/openai_quickstart.py +83 -0
- vanna/examples/primitive_components_demo.py +305 -0
- vanna/examples/quota_lifecycle_example.py +139 -0
- vanna/examples/visualization_example.py +251 -0
- vanna/integrations/__init__.py +17 -0
- vanna/integrations/anthropic/__init__.py +9 -0
- vanna/integrations/anthropic/llm.py +270 -0
- vanna/integrations/azureopenai/__init__.py +9 -0
- vanna/integrations/azureopenai/llm.py +329 -0
- vanna/integrations/azuresearch/__init__.py +7 -0
- vanna/integrations/azuresearch/agent_memory.py +413 -0
- vanna/integrations/bigquery/__init__.py +5 -0
- vanna/integrations/bigquery/sql_runner.py +81 -0
- vanna/integrations/chromadb/__init__.py +104 -0
- vanna/integrations/chromadb/agent_memory.py +416 -0
- vanna/integrations/clickhouse/__init__.py +5 -0
- vanna/integrations/clickhouse/sql_runner.py +82 -0
- vanna/integrations/duckdb/__init__.py +5 -0
- vanna/integrations/duckdb/sql_runner.py +65 -0
- vanna/integrations/faiss/__init__.py +7 -0
- vanna/integrations/faiss/agent_memory.py +431 -0
- vanna/integrations/google/__init__.py +9 -0
- vanna/integrations/google/gemini.py +370 -0
- vanna/integrations/hive/__init__.py +5 -0
- vanna/integrations/hive/sql_runner.py +87 -0
- vanna/integrations/local/__init__.py +17 -0
- vanna/integrations/local/agent_memory/__init__.py +7 -0
- vanna/integrations/local/agent_memory/in_memory.py +285 -0
- vanna/integrations/local/audit.py +59 -0
- vanna/integrations/local/file_system.py +242 -0
- vanna/integrations/local/file_system_conversation_store.py +255 -0
- vanna/integrations/local/storage.py +62 -0
- vanna/integrations/marqo/__init__.py +7 -0
- vanna/integrations/marqo/agent_memory.py +354 -0
- vanna/integrations/milvus/__init__.py +7 -0
- vanna/integrations/milvus/agent_memory.py +458 -0
- vanna/integrations/mock/__init__.py +9 -0
- vanna/integrations/mock/llm.py +65 -0
- vanna/integrations/mssql/__init__.py +5 -0
- vanna/integrations/mssql/sql_runner.py +66 -0
- vanna/integrations/mysql/__init__.py +5 -0
- vanna/integrations/mysql/sql_runner.py +92 -0
- vanna/integrations/ollama/__init__.py +7 -0
- vanna/integrations/ollama/llm.py +252 -0
- vanna/integrations/openai/__init__.py +10 -0
- vanna/integrations/openai/llm.py +267 -0
- vanna/integrations/openai/responses.py +163 -0
- vanna/integrations/opensearch/__init__.py +7 -0
- vanna/integrations/opensearch/agent_memory.py +411 -0
- vanna/integrations/oracle/__init__.py +5 -0
- vanna/integrations/oracle/sql_runner.py +75 -0
- vanna/integrations/pinecone/__init__.py +7 -0
- vanna/integrations/pinecone/agent_memory.py +329 -0
- vanna/integrations/plotly/__init__.py +5 -0
- vanna/integrations/plotly/chart_generator.py +313 -0
- vanna/integrations/postgres/__init__.py +9 -0
- vanna/integrations/postgres/sql_runner.py +112 -0
- vanna/integrations/premium/agent_memory/__init__.py +7 -0
- vanna/integrations/premium/agent_memory/premium.py +186 -0
- vanna/integrations/presto/__init__.py +5 -0
- vanna/integrations/presto/sql_runner.py +107 -0
- vanna/integrations/qdrant/__init__.py +7 -0
- vanna/integrations/qdrant/agent_memory.py +439 -0
- vanna/integrations/snowflake/__init__.py +5 -0
- vanna/integrations/snowflake/sql_runner.py +147 -0
- vanna/integrations/sqlite/__init__.py +9 -0
- vanna/integrations/sqlite/sql_runner.py +65 -0
- vanna/integrations/weaviate/__init__.py +7 -0
- vanna/integrations/weaviate/agent_memory.py +428 -0
- vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_embeddings.py +11 -11
- vanna/legacy/__init__.py +403 -0
- vanna/legacy/adapter.py +463 -0
- vanna/{advanced → legacy/advanced}/__init__.py +3 -1
- vanna/{anthropic → legacy/anthropic}/anthropic_chat.py +9 -7
- vanna/{azuresearch → legacy/azuresearch}/azuresearch_vector.py +79 -41
- vanna/{base → legacy/base}/base.py +224 -217
- vanna/legacy/bedrock/__init__.py +1 -0
- vanna/{bedrock → legacy/bedrock}/bedrock_converse.py +13 -12
- vanna/{chromadb → legacy/chromadb}/chromadb_vector.py +3 -1
- vanna/legacy/cohere/__init__.py +2 -0
- vanna/{cohere → legacy/cohere}/cohere_chat.py +19 -14
- vanna/{cohere → legacy/cohere}/cohere_embeddings.py +25 -19
- vanna/{deepseek → legacy/deepseek}/deepseek_chat.py +5 -6
- vanna/legacy/faiss/__init__.py +1 -0
- vanna/{faiss → legacy/faiss}/faiss.py +113 -59
- vanna/{flask → legacy/flask}/__init__.py +84 -43
- vanna/{flask → legacy/flask}/assets.py +5 -5
- vanna/{flask → legacy/flask}/auth.py +5 -4
- vanna/{google → legacy/google}/bigquery_vector.py +75 -42
- vanna/{google → legacy/google}/gemini_chat.py +7 -3
- vanna/{hf → legacy/hf}/hf.py +0 -1
- vanna/{milvus → legacy/milvus}/milvus_vector.py +58 -35
- vanna/{mock → legacy/mock}/llm.py +0 -1
- vanna/legacy/mock/vectordb.py +67 -0
- vanna/legacy/ollama/ollama.py +110 -0
- vanna/{openai → legacy/openai}/openai_chat.py +2 -6
- vanna/legacy/opensearch/opensearch_vector.py +369 -0
- vanna/legacy/opensearch/opensearch_vector_semantic.py +200 -0
- vanna/legacy/oracle/oracle_vector.py +584 -0
- vanna/{pgvector → legacy/pgvector}/pgvector.py +42 -13
- vanna/{qdrant → legacy/qdrant}/qdrant.py +2 -6
- vanna/legacy/qianfan/Qianfan_Chat.py +170 -0
- vanna/legacy/qianfan/Qianfan_embeddings.py +36 -0
- vanna/legacy/qianwen/QianwenAI_chat.py +132 -0
- vanna/{remote.py → legacy/remote.py} +28 -26
- vanna/{utils.py → legacy/utils.py} +6 -11
- vanna/{vannadb → legacy/vannadb}/vannadb_vector.py +115 -46
- vanna/{vllm → legacy/vllm}/vllm.py +5 -6
- vanna/{weaviate → legacy/weaviate}/weaviate_vector.py +59 -40
- vanna/{xinference → legacy/xinference}/xinference.py +6 -6
- vanna/py.typed +0 -0
- vanna/servers/__init__.py +16 -0
- vanna/servers/__main__.py +8 -0
- vanna/servers/base/__init__.py +18 -0
- vanna/servers/base/chat_handler.py +65 -0
- vanna/servers/base/models.py +111 -0
- vanna/servers/base/rich_chat_handler.py +141 -0
- vanna/servers/base/templates.py +331 -0
- vanna/servers/cli/__init__.py +7 -0
- vanna/servers/cli/server_runner.py +204 -0
- vanna/servers/fastapi/__init__.py +7 -0
- vanna/servers/fastapi/app.py +163 -0
- vanna/servers/fastapi/routes.py +183 -0
- vanna/servers/flask/__init__.py +7 -0
- vanna/servers/flask/app.py +132 -0
- vanna/servers/flask/routes.py +137 -0
- vanna/tools/__init__.py +41 -0
- vanna/tools/agent_memory.py +322 -0
- vanna/tools/file_system.py +879 -0
- vanna/tools/python.py +222 -0
- vanna/tools/run_sql.py +165 -0
- vanna/tools/visualize_data.py +195 -0
- vanna/utils/__init__.py +0 -0
- vanna/web_components/__init__.py +44 -0
- vanna-2.0.0rc1.dist-info/METADATA +868 -0
- vanna-2.0.0rc1.dist-info/RECORD +289 -0
- vanna-2.0.0rc1.dist-info/entry_points.txt +3 -0
- vanna/bedrock/__init__.py +0 -1
- vanna/cohere/__init__.py +0 -2
- vanna/faiss/__init__.py +0 -1
- vanna/mock/vectordb.py +0 -55
- vanna/ollama/ollama.py +0 -103
- vanna/opensearch/opensearch_vector.py +0 -392
- vanna/opensearch/opensearch_vector_semantic.py +0 -175
- vanna/oracle/oracle_vector.py +0 -585
- vanna/qianfan/Qianfan_Chat.py +0 -165
- vanna/qianfan/Qianfan_embeddings.py +0 -36
- vanna/qianwen/QianwenAI_chat.py +0 -133
- vanna-0.7.9.dist-info/METADATA +0 -408
- vanna-0.7.9.dist-info/RECORD +0 -79
- /vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_Chat.py +0 -0
- /vanna/{ZhipuAI → legacy/ZhipuAI}/__init__.py +0 -0
- /vanna/{anthropic → legacy/anthropic}/__init__.py +0 -0
- /vanna/{azuresearch → legacy/azuresearch}/__init__.py +0 -0
- /vanna/{base → legacy/base}/__init__.py +0 -0
- /vanna/{chromadb → legacy/chromadb}/__init__.py +0 -0
- /vanna/{deepseek → legacy/deepseek}/__init__.py +0 -0
- /vanna/{exceptions → legacy/exceptions}/__init__.py +0 -0
- /vanna/{google → legacy/google}/__init__.py +0 -0
- /vanna/{hf → legacy/hf}/__init__.py +0 -0
- /vanna/{local.py → legacy/local.py} +0 -0
- /vanna/{marqo → legacy/marqo}/__init__.py +0 -0
- /vanna/{marqo → legacy/marqo}/marqo.py +0 -0
- /vanna/{milvus → legacy/milvus}/__init__.py +0 -0
- /vanna/{mistral → legacy/mistral}/__init__.py +0 -0
- /vanna/{mistral → legacy/mistral}/mistral.py +0 -0
- /vanna/{mock → legacy/mock}/__init__.py +0 -0
- /vanna/{mock → legacy/mock}/embedding.py +0 -0
- /vanna/{ollama → legacy/ollama}/__init__.py +0 -0
- /vanna/{openai → legacy/openai}/__init__.py +0 -0
- /vanna/{openai → legacy/openai}/openai_embeddings.py +0 -0
- /vanna/{opensearch → legacy/opensearch}/__init__.py +0 -0
- /vanna/{oracle → legacy/oracle}/__init__.py +0 -0
- /vanna/{pgvector → legacy/pgvector}/__init__.py +0 -0
- /vanna/{pinecone → legacy/pinecone}/__init__.py +0 -0
- /vanna/{pinecone → legacy/pinecone}/pinecone_vector.py +0 -0
- /vanna/{qdrant → legacy/qdrant}/__init__.py +0 -0
- /vanna/{qianfan → legacy/qianfan}/__init__.py +0 -0
- /vanna/{qianwen → legacy/qianwen}/QianwenAI_embeddings.py +0 -0
- /vanna/{qianwen → legacy/qianwen}/__init__.py +0 -0
- /vanna/{types → legacy/types}/__init__.py +0 -0
- /vanna/{vannadb → legacy/vannadb}/__init__.py +0 -0
- /vanna/{vllm → legacy/vllm}/__init__.py +0 -0
- /vanna/{weaviate → legacy/weaviate}/__init__.py +0 -0
- /vanna/{xinference → legacy/xinference}/__init__.py +0 -0
- {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/WHEEL +0 -0
- {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/licenses/LICENSE +0 -0
vanna/core/evaluation/evaluators.py
@@ -0,0 +1,376 @@
+"""
+Built-in evaluators for common evaluation tasks.
+
+This module provides ready-to-use evaluators for:
+- Trajectory evaluation (tools called, order, efficiency)
+- Output evaluation (content matching, quality)
+- LLM-as-judge evaluation (custom criteria)
+- Efficiency evaluation (time, tokens, cost)
+"""
+
+from typing import Dict, Any, Optional
+from datetime import datetime
+
+from .base import Evaluator, TestCase, AgentResult, EvaluationResult
+from vanna.core import LlmService
+
+
+class TrajectoryEvaluator(Evaluator):
+    """Evaluate the path the agent took (tools called, order, etc).
+
+    Checks if the agent called the expected tools and didn't call
+    unexpected ones. Useful for verifying agent reasoning and planning.
+    """
+
+    @property
+    def name(self) -> str:
+        return "trajectory"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate tool call trajectory."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        expected = test_case.expected_outcome
+        if not expected:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=True,
+                score=1.0,
+                reasoning="No expected outcome specified, passing by default",
+            )
+
+        tools_called = agent_result.get_tool_names_called()
+        issues = []
+        score = 1.0
+
+        # Check expected tools were called
+        if expected.tools_called:
+            for expected_tool in expected.tools_called:
+                if expected_tool not in tools_called:
+                    issues.append(f"Expected tool '{expected_tool}' was not called")
+                    score -= 0.5 / len(expected.tools_called)
+
+        # Check unexpected tools were not called
+        if expected.tools_not_called:
+            for unexpected_tool in expected.tools_not_called:
+                if unexpected_tool in tools_called:
+                    issues.append(f"Unexpected tool '{unexpected_tool}' was called")
+                    score -= 0.5 / len(expected.tools_not_called)
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7  # 70% threshold
+
+        reasoning = "Trajectory evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "All expected tools called, no unexpected tools"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "tools_called": tools_called,
+                "num_tools_called": len(tools_called),
+                "issues": issues,
+            },
+        )
+
+
+class OutputEvaluator(Evaluator):
+    """Evaluate the final output quality.
+
+    Checks if the output contains expected content and doesn't
+    contain forbidden content. Case-insensitive substring matching.
+    """
+
+    @property
+    def name(self) -> str:
+        return "output"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate output content."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        expected = test_case.expected_outcome
+        if not expected:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=True,
+                score=1.0,
+                reasoning="No expected outcome specified, passing by default",
+            )
+
+        final_answer = agent_result.get_final_answer().lower()
+        issues = []
+        score = 1.0
+
+        # Check expected content is present
+        if expected.final_answer_contains:
+            for expected_content in expected.final_answer_contains:
+                if expected_content.lower() not in final_answer:
+                    issues.append(
+                        f"Expected content '{expected_content}' not found in output"
+                    )
+                    score -= 0.5 / len(expected.final_answer_contains)
+
+        # Check forbidden content is absent
+        if expected.final_answer_not_contains:
+            for forbidden_content in expected.final_answer_not_contains:
+                if forbidden_content.lower() in final_answer:
+                    issues.append(
+                        f"Forbidden content '{forbidden_content}' found in output"
+                    )
+                    score -= 0.5 / len(expected.final_answer_not_contains)
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7  # 70% threshold
+
+        reasoning = "Output evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "All expected content present, no forbidden content"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "output_length": len(final_answer),
+                "issues": issues,
+            },
+        )
+
+
+class LLMAsJudgeEvaluator(Evaluator):
+    """Use an LLM to judge agent performance based on custom criteria.
+
+    This evaluator uses a separate LLM to assess the quality of the
+    agent's output based on natural language criteria.
+    """
+
+    def __init__(self, judge_llm: LlmService, criteria: str):
+        """Initialize LLM-as-judge evaluator.
+
+        Args:
+            judge_llm: The LLM service to use for judging
+            criteria: Natural language description of what to evaluate
+        """
+        self.judge_llm = judge_llm
+        self.criteria = criteria
+
+    @property
+    def name(self) -> str:
+        return "llm_judge"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate using LLM as judge."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        final_answer = agent_result.get_final_answer()
+
+        # Build prompt for judge
+        judge_prompt = f"""You are evaluating an AI agent's response to a user query.
+
+User Query: {test_case.message}
+
+Agent's Response:
+{final_answer}
+
+Evaluation Criteria:
+{self.criteria}
+
+Please evaluate the response and provide:
+1. A score from 0.0 to 1.0 (where 1.0 is perfect)
+2. Whether it passes (score >= 0.7)
+3. Brief reasoning for your evaluation
+
+Respond in this format:
+SCORE: <number>
+PASSED: <yes/no>
+REASONING: <your explanation>
+"""
+
+        try:
+            # Call judge LLM
+            from vanna.core.llm import LlmRequest, LlmMessage
+
+            request = LlmRequest(
+                user=test_case.user,
+                messages=[LlmMessage(role="user", content=judge_prompt)],
+                temperature=0.0,  # Deterministic judging
+            )
+
+            response = await self.judge_llm.send_request(request)
+            judgment = response.content or ""
+
+            # Parse response
+            score = self._parse_score(judgment)
+            passed = self._parse_passed(judgment)
+            reasoning = self._parse_reasoning(judgment)
+
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=passed,
+                score=score,
+                reasoning=reasoning,
+                metrics={"judge_response": judgment},
+            )
+
+        except Exception as e:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"LLM judge evaluation failed: {str(e)}",
+            )
+
+    def _parse_score(self, judgment: str) -> float:
+        """Parse score from judge response."""
+        try:
+            for line in judgment.split("\n"):
+                if line.startswith("SCORE:"):
+                    score_str = line.replace("SCORE:", "").strip()
+                    return float(score_str)
+        except Exception:
+            pass
+        return 0.5  # Default if parsing fails
+
+    def _parse_passed(self, judgment: str) -> bool:
+        """Parse pass/fail from judge response."""
+        for line in judgment.split("\n"):
+            if line.startswith("PASSED:"):
+                passed_str = line.replace("PASSED:", "").strip().lower()
+                return passed_str in ["yes", "true", "pass"]
+        return False
+
+    def _parse_reasoning(self, judgment: str) -> str:
+        """Parse reasoning from judge response."""
+        for line in judgment.split("\n"):
+            if line.startswith("REASONING:"):
+                return line.replace("REASONING:", "").strip()
+        return judgment  # Return full judgment if no reasoning line found
+
+
+class EfficiencyEvaluator(Evaluator):
+    """Evaluate resource usage (time, tokens, cost).
+
+    Checks if the agent completed within acceptable resource limits.
+    """
+
+    def __init__(
+        self,
+        max_execution_time_ms: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        max_cost_usd: Optional[float] = None,
+    ):
+        """Initialize efficiency evaluator.
+
+        Args:
+            max_execution_time_ms: Maximum allowed execution time in milliseconds
+            max_tokens: Maximum allowed token usage
+            max_cost_usd: Maximum allowed cost in USD
+        """
+        self.max_execution_time_ms = max_execution_time_ms
+        self.max_tokens = max_tokens
+        self.max_cost_usd = max_cost_usd
+
+    @property
+    def name(self) -> str:
+        return "efficiency"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate resource efficiency."""
+        issues = []
+        score = 1.0
+
+        # Check execution time
+        if self.max_execution_time_ms:
+            if agent_result.execution_time_ms > self.max_execution_time_ms:
+                issues.append(
+                    f"Execution time {agent_result.execution_time_ms:.0f}ms "
+                    f"exceeded limit {self.max_execution_time_ms:.0f}ms"
+                )
+                score -= 0.33
+
+        # Check token usage
+        if self.max_tokens:
+            if agent_result.total_tokens > self.max_tokens:
+                issues.append(
+                    f"Token usage {agent_result.total_tokens} exceeded limit {self.max_tokens}"
+                )
+                score -= 0.33
+
+        # Check cost (would need cost calculation from metadata)
+        # For now, skip cost evaluation
+
+        # Check from expected outcome if specified
+        expected = test_case.expected_outcome
+        if expected and expected.max_execution_time_ms:
+            if agent_result.execution_time_ms > expected.max_execution_time_ms:
+                issues.append(
+                    f"Execution time {agent_result.execution_time_ms:.0f}ms "
+                    f"exceeded test case limit {expected.max_execution_time_ms:.0f}ms"
+                )
+                score -= 0.34
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7
+
+        reasoning = "Efficiency evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "Within resource limits"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "execution_time_ms": agent_result.execution_time_ms,
+                "total_tokens": agent_result.total_tokens,
+                "issues": issues,
+            },
+        )
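
The hunk above adds four ready-made evaluators; its +376 line count matches `vanna/core/evaluation/evaluators.py` in the file list. A minimal usage sketch (not part of the diff) is shown below; it assumes the `TestCase` and `AgentResult` objects come from `vanna/core/evaluation/base.py`, which this diff lists but does not show, so their constructors may differ:

```python
# Hypothetical sketch: TestCase/AgentResult are built elsewhere
# (vanna/core/evaluation/base.py, not shown in this diff).
import asyncio

from vanna.core.evaluation.evaluators import OutputEvaluator, TrajectoryEvaluator


async def run_checks(test_case, agent_result):
    # Each evaluator returns an EvaluationResult carrying passed/score/reasoning.
    for evaluator in (TrajectoryEvaluator(), OutputEvaluator()):
        result = await evaluator.evaluate(test_case, agent_result)
        print(f"{evaluator.name}: passed={result.passed}, score={result.score:.2f}")
        print(f"  {result.reasoning}")


# asyncio.run(run_checks(my_test_case, my_agent_result))
```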
vanna/core/evaluation/report.py
@@ -0,0 +1,289 @@
+"""
+Evaluation reporting with HTML, CSV, and console output.
+
+This module provides classes for generating evaluation reports,
+including comparison reports for evaluating multiple agent variants.
+"""
+
+import csv
+from typing import List, Dict, Optional, Any
+from dataclasses import dataclass, field
+from datetime import datetime
+
+from .base import TestCaseResult, AgentVariant, Evaluator, TestCase
+
+
+@dataclass
+class EvaluationReport:
+    """Report for a single agent's evaluation results.
+
+    Attributes:
+        agent_name: Name of the agent evaluated
+        results: List of results for each test case
+        evaluators: List of evaluators used
+        metadata: Additional metadata about the agent/run
+        timestamp: When the evaluation was run
+    """
+
+    agent_name: str
+    results: List[TestCaseResult]
+    evaluators: List[Evaluator]
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: datetime = field(default_factory=datetime.now)
+
+    def pass_rate(self) -> float:
+        """Calculate overall pass rate (0.0 to 1.0)."""
+        if not self.results:
+            return 0.0
+        passed = sum(1 for r in self.results if r.overall_passed())
+        return passed / len(self.results)
+
+    def average_score(self) -> float:
+        """Calculate average score across all test cases."""
+        if not self.results:
+            return 0.0
+        return sum(r.overall_score() for r in self.results) / len(self.results)
+
+    def average_time(self) -> float:
+        """Calculate average execution time in milliseconds."""
+        if not self.results:
+            return 0.0
+        return sum(r.execution_time_ms for r in self.results) / len(self.results)
+
+    def total_tokens(self) -> int:
+        """Calculate total tokens used across all test cases."""
+        return sum(r.agent_result.total_tokens for r in self.results)
+
+    def get_failures(self) -> List[TestCaseResult]:
+        """Get all failed test cases."""
+        return [r for r in self.results if not r.overall_passed()]
+
+    def print_summary(self) -> None:
+        """Print summary to console."""
+        print(f"\n{'=' * 80}")
+        print(f"EVALUATION REPORT: {self.agent_name}")
+        print(f"{'=' * 80}")
+        print(f"Timestamp: {self.timestamp.isoformat()}")
+        print(f"Test Cases: {len(self.results)}")
+        print(f"Pass Rate: {self.pass_rate():.1%}")
+        print(f"Average Score: {self.average_score():.2f}")
+        print(f"Average Time: {self.average_time():.0f}ms")
+        print(f"Total Tokens: {self.total_tokens()}")
+        print(f"{'=' * 80}\n")
+
+        failures = self.get_failures()
+        if failures:
+            print(f"FAILURES ({len(failures)}):")
+            for result in failures:
+                print(f"\n  Test Case: {result.test_case.id}")
+                print(f"  Message: {result.test_case.message}")
+                print(f"  Score: {result.overall_score():.2f}")
+                for eval_result in result.evaluations:
+                    if not eval_result.passed:
+                        print(
+                            f"    [{eval_result.evaluator_name}] {eval_result.reasoning}"
+                        )
+
+
+@dataclass
+class ComparisonReport:
+    """Report comparing multiple agent variants.
+
+    This is the primary report type for LLM comparison use cases.
+
+    Attributes:
+        variants: List of agent variants compared
+        reports: Dict mapping variant name to EvaluationReport
+        test_cases: Test cases used for comparison
+        timestamp: When the comparison was run
+    """
+
+    variants: List[AgentVariant]
+    reports: Dict[str, EvaluationReport]
+    test_cases: List[TestCase]
+    timestamp: datetime = field(default_factory=datetime.now)
+
+    def print_summary(self) -> None:
+        """Print comparison summary to console."""
+        print("\n" + "=" * 80)
+        print("AGENT COMPARISON SUMMARY")
+        print("=" * 80)
+        print(f"Timestamp: {self.timestamp.isoformat()}")
+        print(f"Variants: {len(self.variants)}")
+        print(f"Test Cases: {len(self.test_cases)}")
+
+        # Table of results
+        print(
+            f"\n{'Agent':<25} {'Pass Rate':<12} {'Avg Score':<12} {'Avg Time':<12} {'Tokens':<12}"
+        )
+        print("-" * 80)
+
+        for variant_name, report in self.reports.items():
+            print(
+                f"{variant_name:<25} "
+                f"{report.pass_rate():<12.1%} "
+                f"{report.average_score():<12.2f} "
+                f"{report.average_time():<12.0f} "
+                f"{report.total_tokens():<12,}"
+            )
+
+        print("=" * 80 + "\n")
+
+    def get_best_variant(self, metric: str = "score") -> str:
+        """Get the best performing variant by metric.
+
+        Args:
+            metric: Metric to optimize ('score', 'speed', 'pass_rate')
+
+        Returns:
+            Name of the best variant
+        """
+        if metric == "score":
+            return max(self.reports.items(), key=lambda x: x[1].average_score())[0]
+        elif metric == "speed":
+            return min(self.reports.items(), key=lambda x: x[1].average_time())[0]
+        elif metric == "pass_rate":
+            return max(self.reports.items(), key=lambda x: x[1].pass_rate())[0]
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+    def save_csv(self, path: str) -> None:
+        """Save detailed CSV for further analysis.
+
+        Each row represents one test case × one variant combination.
+        """
+        with open(path, "w", newline="") as f:
+            writer = csv.writer(f)
+
+            # Header
+            writer.writerow(
+                [
+                    "variant",
+                    "test_case_id",
+                    "test_message",
+                    "passed",
+                    "score",
+                    "execution_time_ms",
+                    "tokens",
+                    "error",
+                    "evaluator_scores",
+                ]
+            )
+
+            # Data rows
+            for variant_name, report in self.reports.items():
+                for result in report.results:
+                    evaluator_scores = {
+                        e.evaluator_name: e.score for e in result.evaluations
+                    }
+
+                    writer.writerow(
+                        [
+                            variant_name,
+                            result.test_case.id,
+                            result.test_case.message[:50],  # Truncate
+                            result.overall_passed(),
+                            result.overall_score(),
+                            result.execution_time_ms,
+                            result.agent_result.total_tokens,
+                            result.agent_result.error or "",
+                            str(evaluator_scores),
+                        ]
+                    )
+
+    def save_html(self, path: str) -> None:
+        """Save interactive HTML comparison report.
+
+        Generates a rich HTML report with:
+        - Summary statistics
+        - Charts comparing variants
+        - Side-by-side test case results
+        """
+        html = self._generate_html()
+        with open(path, "w") as f:
+            f.write(html)
+
+    def _generate_html(self) -> str:
+        """Generate HTML content for report."""
+        # Build HTML report
+        html_parts = [
+            "<!DOCTYPE html>",
+            "<html>",
+            "<head>",
+            "<title>Agent Comparison Report</title>",
+            "<style>",
+            "body { font-family: Arial, sans-serif; margin: 20px; }",
+            "h1 { color: #333; }",
+            "table { border-collapse: collapse; width: 100%; margin: 20px 0; }",
+            "th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }",
+            "th { background-color: #4CAF50; color: white; }",
+            "tr:nth-child(even) { background-color: #f2f2f2; }",
+            ".passed { color: green; font-weight: bold; }",
+            ".failed { color: red; font-weight: bold; }",
+            ".best { background-color: #d4edda !important; }",
+            "</style>",
+            "</head>",
+            "<body>",
+            f"<h1>Agent Comparison Report</h1>",
+            f"<p>Generated: {self.timestamp.isoformat()}</p>",
+            f"<p>Variants: {len(self.variants)} | Test Cases: {len(self.test_cases)}</p>",
+        ]
+
+        # Summary table
+        html_parts.append("<h2>Summary</h2>")
+        html_parts.append("<table>")
+        html_parts.append(
+            "<tr><th>Agent</th><th>Pass Rate</th><th>Avg Score</th><th>Avg Time (ms)</th><th>Total Tokens</th></tr>"
+        )
+
+        best_by_score = self.get_best_variant("score")
+
+        for variant_name, report in self.reports.items():
+            row_class = "best" if variant_name == best_by_score else ""
+            html_parts.append(
+                f"<tr class='{row_class}'>"
+                f"<td>{variant_name}</td>"
+                f"<td>{report.pass_rate():.1%}</td>"
+                f"<td>{report.average_score():.2f}</td>"
+                f"<td>{report.average_time():.0f}</td>"
+                f"<td>{report.total_tokens():,}</td>"
+                f"</tr>"
+            )
+
+        html_parts.append("</table>")
+
+        # Test case details
+        html_parts.append("<h2>Test Case Details</h2>")
+
+        for i, test_case in enumerate(self.test_cases):
+            html_parts.append(f"<h3>Test Case {i + 1}: {test_case.id}</h3>")
+            html_parts.append(f"<p><strong>Message:</strong> {test_case.message}</p>")
+
+            html_parts.append("<table>")
+            html_parts.append(
+                "<tr><th>Variant</th><th>Result</th><th>Score</th><th>Time (ms)</th></tr>"
+            )
+
+            for variant_name, report in self.reports.items():
+                result = next(
+                    (r for r in report.results if r.test_case.id == test_case.id), None
+                )
+                if result:
+                    passed_class = "passed" if result.overall_passed() else "failed"
+                    passed_text = "PASS" if result.overall_passed() else "FAIL"
+
+                    html_parts.append(
+                        f"<tr>"
+                        f"<td>{variant_name}</td>"
+                        f"<td class='{passed_class}'>{passed_text}</td>"
+                        f"<td>{result.overall_score():.2f}</td>"
+                        f"<td>{result.execution_time_ms:.0f}</td>"
+                        f"</tr>"
+                    )
+
+            html_parts.append("</table>")
+
+        html_parts.append("</body>")
+        html_parts.append("</html>")
+
+        return "\n".join(html_parts)
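
This second hunk's +289 line count matches `vanna/core/evaluation/report.py` in the file list. A hedged sketch of consuming a finished comparison follows (not part of the diff); it uses only the methods defined above and assumes the `ComparisonReport` instance was produced elsewhere, presumably by `vanna/core/evaluation/runner.py`, which is listed but not shown:

```python
# Hypothetical sketch: building the ComparisonReport (running agents and
# evaluators) is handled outside this module, e.g. by the evaluation runner.
from vanna.core.evaluation.report import ComparisonReport


def summarize(comparison: ComparisonReport) -> None:
    # Console table of pass rate / avg score / avg time / tokens per variant.
    comparison.print_summary()
    # Pick a winner by one of the supported metrics: "score", "speed", "pass_rate".
    best = comparison.get_best_variant(metric="pass_rate")
    print(f"Best variant by pass rate: {best}")
    # Persist the per-test-case CSV and the styled HTML report.
    comparison.save_csv("comparison.csv")
    comparison.save_html("comparison.html")
```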