vanna 0.7.9__py3-none-any.whl → 2.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. vanna/__init__.py +167 -395
  2. vanna/agents/__init__.py +7 -0
  3. vanna/capabilities/__init__.py +17 -0
  4. vanna/capabilities/agent_memory/__init__.py +21 -0
  5. vanna/capabilities/agent_memory/base.py +103 -0
  6. vanna/capabilities/agent_memory/models.py +53 -0
  7. vanna/capabilities/file_system/__init__.py +14 -0
  8. vanna/capabilities/file_system/base.py +71 -0
  9. vanna/capabilities/file_system/models.py +25 -0
  10. vanna/capabilities/sql_runner/__init__.py +13 -0
  11. vanna/capabilities/sql_runner/base.py +37 -0
  12. vanna/capabilities/sql_runner/models.py +13 -0
  13. vanna/components/__init__.py +92 -0
  14. vanna/components/base.py +11 -0
  15. vanna/components/rich/__init__.py +83 -0
  16. vanna/components/rich/containers/__init__.py +7 -0
  17. vanna/components/rich/containers/card.py +20 -0
  18. vanna/components/rich/data/__init__.py +9 -0
  19. vanna/components/rich/data/chart.py +17 -0
  20. vanna/components/rich/data/dataframe.py +93 -0
  21. vanna/components/rich/feedback/__init__.py +21 -0
  22. vanna/components/rich/feedback/badge.py +16 -0
  23. vanna/components/rich/feedback/icon_text.py +14 -0
  24. vanna/components/rich/feedback/log_viewer.py +41 -0
  25. vanna/components/rich/feedback/notification.py +19 -0
  26. vanna/components/rich/feedback/progress.py +37 -0
  27. vanna/components/rich/feedback/status_card.py +28 -0
  28. vanna/components/rich/feedback/status_indicator.py +14 -0
  29. vanna/components/rich/interactive/__init__.py +21 -0
  30. vanna/components/rich/interactive/button.py +95 -0
  31. vanna/components/rich/interactive/task_list.py +58 -0
  32. vanna/components/rich/interactive/ui_state.py +93 -0
  33. vanna/components/rich/specialized/__init__.py +7 -0
  34. vanna/components/rich/specialized/artifact.py +20 -0
  35. vanna/components/rich/text.py +16 -0
  36. vanna/components/simple/__init__.py +15 -0
  37. vanna/components/simple/image.py +15 -0
  38. vanna/components/simple/link.py +15 -0
  39. vanna/components/simple/text.py +11 -0
  40. vanna/core/__init__.py +193 -0
  41. vanna/core/_compat.py +19 -0
  42. vanna/core/agent/__init__.py +10 -0
  43. vanna/core/agent/agent.py +1407 -0
  44. vanna/core/agent/config.py +123 -0
  45. vanna/core/audit/__init__.py +28 -0
  46. vanna/core/audit/base.py +299 -0
  47. vanna/core/audit/models.py +131 -0
  48. vanna/core/component_manager.py +329 -0
  49. vanna/core/components.py +53 -0
  50. vanna/core/enhancer/__init__.py +11 -0
  51. vanna/core/enhancer/base.py +94 -0
  52. vanna/core/enhancer/default.py +118 -0
  53. vanna/core/enricher/__init__.py +10 -0
  54. vanna/core/enricher/base.py +59 -0
  55. vanna/core/errors.py +47 -0
  56. vanna/core/evaluation/__init__.py +81 -0
  57. vanna/core/evaluation/base.py +186 -0
  58. vanna/core/evaluation/dataset.py +254 -0
  59. vanna/core/evaluation/evaluators.py +376 -0
  60. vanna/core/evaluation/report.py +289 -0
  61. vanna/core/evaluation/runner.py +313 -0
  62. vanna/core/filter/__init__.py +10 -0
  63. vanna/core/filter/base.py +67 -0
  64. vanna/core/lifecycle/__init__.py +10 -0
  65. vanna/core/lifecycle/base.py +83 -0
  66. vanna/core/llm/__init__.py +16 -0
  67. vanna/core/llm/base.py +40 -0
  68. vanna/core/llm/models.py +61 -0
  69. vanna/core/middleware/__init__.py +10 -0
  70. vanna/core/middleware/base.py +69 -0
  71. vanna/core/observability/__init__.py +11 -0
  72. vanna/core/observability/base.py +88 -0
  73. vanna/core/observability/models.py +47 -0
  74. vanna/core/recovery/__init__.py +11 -0
  75. vanna/core/recovery/base.py +84 -0
  76. vanna/core/recovery/models.py +32 -0
  77. vanna/core/registry.py +278 -0
  78. vanna/core/rich_component.py +156 -0
  79. vanna/core/simple_component.py +27 -0
  80. vanna/core/storage/__init__.py +14 -0
  81. vanna/core/storage/base.py +46 -0
  82. vanna/core/storage/models.py +46 -0
  83. vanna/core/system_prompt/__init__.py +13 -0
  84. vanna/core/system_prompt/base.py +36 -0
  85. vanna/core/system_prompt/default.py +157 -0
  86. vanna/core/tool/__init__.py +18 -0
  87. vanna/core/tool/base.py +70 -0
  88. vanna/core/tool/models.py +84 -0
  89. vanna/core/user/__init__.py +17 -0
  90. vanna/core/user/base.py +29 -0
  91. vanna/core/user/models.py +25 -0
  92. vanna/core/user/request_context.py +70 -0
  93. vanna/core/user/resolver.py +42 -0
  94. vanna/core/validation.py +164 -0
  95. vanna/core/workflow/__init__.py +12 -0
  96. vanna/core/workflow/base.py +254 -0
  97. vanna/core/workflow/default.py +789 -0
  98. vanna/examples/__init__.py +1 -0
  99. vanna/examples/__main__.py +44 -0
  100. vanna/examples/anthropic_quickstart.py +80 -0
  101. vanna/examples/artifact_example.py +293 -0
  102. vanna/examples/claude_sqlite_example.py +236 -0
  103. vanna/examples/coding_agent_example.py +300 -0
  104. vanna/examples/custom_system_prompt_example.py +174 -0
  105. vanna/examples/default_workflow_handler_example.py +208 -0
  106. vanna/examples/email_auth_example.py +340 -0
  107. vanna/examples/evaluation_example.py +269 -0
  108. vanna/examples/extensibility_example.py +262 -0
  109. vanna/examples/minimal_example.py +67 -0
  110. vanna/examples/mock_auth_example.py +227 -0
  111. vanna/examples/mock_custom_tool.py +311 -0
  112. vanna/examples/mock_quickstart.py +79 -0
  113. vanna/examples/mock_quota_example.py +145 -0
  114. vanna/examples/mock_rich_components_demo.py +396 -0
  115. vanna/examples/mock_sqlite_example.py +223 -0
  116. vanna/examples/openai_quickstart.py +83 -0
  117. vanna/examples/primitive_components_demo.py +305 -0
  118. vanna/examples/quota_lifecycle_example.py +139 -0
  119. vanna/examples/visualization_example.py +251 -0
  120. vanna/integrations/__init__.py +17 -0
  121. vanna/integrations/anthropic/__init__.py +9 -0
  122. vanna/integrations/anthropic/llm.py +270 -0
  123. vanna/integrations/azureopenai/__init__.py +9 -0
  124. vanna/integrations/azureopenai/llm.py +329 -0
  125. vanna/integrations/azuresearch/__init__.py +7 -0
  126. vanna/integrations/azuresearch/agent_memory.py +413 -0
  127. vanna/integrations/bigquery/__init__.py +5 -0
  128. vanna/integrations/bigquery/sql_runner.py +81 -0
  129. vanna/integrations/chromadb/__init__.py +104 -0
  130. vanna/integrations/chromadb/agent_memory.py +416 -0
  131. vanna/integrations/clickhouse/__init__.py +5 -0
  132. vanna/integrations/clickhouse/sql_runner.py +82 -0
  133. vanna/integrations/duckdb/__init__.py +5 -0
  134. vanna/integrations/duckdb/sql_runner.py +65 -0
  135. vanna/integrations/faiss/__init__.py +7 -0
  136. vanna/integrations/faiss/agent_memory.py +431 -0
  137. vanna/integrations/google/__init__.py +9 -0
  138. vanna/integrations/google/gemini.py +370 -0
  139. vanna/integrations/hive/__init__.py +5 -0
  140. vanna/integrations/hive/sql_runner.py +87 -0
  141. vanna/integrations/local/__init__.py +17 -0
  142. vanna/integrations/local/agent_memory/__init__.py +7 -0
  143. vanna/integrations/local/agent_memory/in_memory.py +285 -0
  144. vanna/integrations/local/audit.py +59 -0
  145. vanna/integrations/local/file_system.py +242 -0
  146. vanna/integrations/local/file_system_conversation_store.py +255 -0
  147. vanna/integrations/local/storage.py +62 -0
  148. vanna/integrations/marqo/__init__.py +7 -0
  149. vanna/integrations/marqo/agent_memory.py +354 -0
  150. vanna/integrations/milvus/__init__.py +7 -0
  151. vanna/integrations/milvus/agent_memory.py +458 -0
  152. vanna/integrations/mock/__init__.py +9 -0
  153. vanna/integrations/mock/llm.py +65 -0
  154. vanna/integrations/mssql/__init__.py +5 -0
  155. vanna/integrations/mssql/sql_runner.py +66 -0
  156. vanna/integrations/mysql/__init__.py +5 -0
  157. vanna/integrations/mysql/sql_runner.py +92 -0
  158. vanna/integrations/ollama/__init__.py +7 -0
  159. vanna/integrations/ollama/llm.py +252 -0
  160. vanna/integrations/openai/__init__.py +10 -0
  161. vanna/integrations/openai/llm.py +267 -0
  162. vanna/integrations/openai/responses.py +163 -0
  163. vanna/integrations/opensearch/__init__.py +7 -0
  164. vanna/integrations/opensearch/agent_memory.py +411 -0
  165. vanna/integrations/oracle/__init__.py +5 -0
  166. vanna/integrations/oracle/sql_runner.py +75 -0
  167. vanna/integrations/pinecone/__init__.py +7 -0
  168. vanna/integrations/pinecone/agent_memory.py +329 -0
  169. vanna/integrations/plotly/__init__.py +5 -0
  170. vanna/integrations/plotly/chart_generator.py +313 -0
  171. vanna/integrations/postgres/__init__.py +9 -0
  172. vanna/integrations/postgres/sql_runner.py +112 -0
  173. vanna/integrations/premium/agent_memory/__init__.py +7 -0
  174. vanna/integrations/premium/agent_memory/premium.py +186 -0
  175. vanna/integrations/presto/__init__.py +5 -0
  176. vanna/integrations/presto/sql_runner.py +107 -0
  177. vanna/integrations/qdrant/__init__.py +7 -0
  178. vanna/integrations/qdrant/agent_memory.py +439 -0
  179. vanna/integrations/snowflake/__init__.py +5 -0
  180. vanna/integrations/snowflake/sql_runner.py +147 -0
  181. vanna/integrations/sqlite/__init__.py +9 -0
  182. vanna/integrations/sqlite/sql_runner.py +65 -0
  183. vanna/integrations/weaviate/__init__.py +7 -0
  184. vanna/integrations/weaviate/agent_memory.py +428 -0
  185. vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_embeddings.py +11 -11
  186. vanna/legacy/__init__.py +403 -0
  187. vanna/legacy/adapter.py +463 -0
  188. vanna/{advanced → legacy/advanced}/__init__.py +3 -1
  189. vanna/{anthropic → legacy/anthropic}/anthropic_chat.py +9 -7
  190. vanna/{azuresearch → legacy/azuresearch}/azuresearch_vector.py +79 -41
  191. vanna/{base → legacy/base}/base.py +224 -217
  192. vanna/legacy/bedrock/__init__.py +1 -0
  193. vanna/{bedrock → legacy/bedrock}/bedrock_converse.py +13 -12
  194. vanna/{chromadb → legacy/chromadb}/chromadb_vector.py +3 -1
  195. vanna/legacy/cohere/__init__.py +2 -0
  196. vanna/{cohere → legacy/cohere}/cohere_chat.py +19 -14
  197. vanna/{cohere → legacy/cohere}/cohere_embeddings.py +25 -19
  198. vanna/{deepseek → legacy/deepseek}/deepseek_chat.py +5 -6
  199. vanna/legacy/faiss/__init__.py +1 -0
  200. vanna/{faiss → legacy/faiss}/faiss.py +113 -59
  201. vanna/{flask → legacy/flask}/__init__.py +84 -43
  202. vanna/{flask → legacy/flask}/assets.py +5 -5
  203. vanna/{flask → legacy/flask}/auth.py +5 -4
  204. vanna/{google → legacy/google}/bigquery_vector.py +75 -42
  205. vanna/{google → legacy/google}/gemini_chat.py +7 -3
  206. vanna/{hf → legacy/hf}/hf.py +0 -1
  207. vanna/{milvus → legacy/milvus}/milvus_vector.py +58 -35
  208. vanna/{mock → legacy/mock}/llm.py +0 -1
  209. vanna/legacy/mock/vectordb.py +67 -0
  210. vanna/legacy/ollama/ollama.py +110 -0
  211. vanna/{openai → legacy/openai}/openai_chat.py +2 -6
  212. vanna/legacy/opensearch/opensearch_vector.py +369 -0
  213. vanna/legacy/opensearch/opensearch_vector_semantic.py +200 -0
  214. vanna/legacy/oracle/oracle_vector.py +584 -0
  215. vanna/{pgvector → legacy/pgvector}/pgvector.py +42 -13
  216. vanna/{qdrant → legacy/qdrant}/qdrant.py +2 -6
  217. vanna/legacy/qianfan/Qianfan_Chat.py +170 -0
  218. vanna/legacy/qianfan/Qianfan_embeddings.py +36 -0
  219. vanna/legacy/qianwen/QianwenAI_chat.py +132 -0
  220. vanna/{remote.py → legacy/remote.py} +28 -26
  221. vanna/{utils.py → legacy/utils.py} +6 -11
  222. vanna/{vannadb → legacy/vannadb}/vannadb_vector.py +115 -46
  223. vanna/{vllm → legacy/vllm}/vllm.py +5 -6
  224. vanna/{weaviate → legacy/weaviate}/weaviate_vector.py +59 -40
  225. vanna/{xinference → legacy/xinference}/xinference.py +6 -6
  226. vanna/py.typed +0 -0
  227. vanna/servers/__init__.py +16 -0
  228. vanna/servers/__main__.py +8 -0
  229. vanna/servers/base/__init__.py +18 -0
  230. vanna/servers/base/chat_handler.py +65 -0
  231. vanna/servers/base/models.py +111 -0
  232. vanna/servers/base/rich_chat_handler.py +141 -0
  233. vanna/servers/base/templates.py +331 -0
  234. vanna/servers/cli/__init__.py +7 -0
  235. vanna/servers/cli/server_runner.py +204 -0
  236. vanna/servers/fastapi/__init__.py +7 -0
  237. vanna/servers/fastapi/app.py +163 -0
  238. vanna/servers/fastapi/routes.py +183 -0
  239. vanna/servers/flask/__init__.py +7 -0
  240. vanna/servers/flask/app.py +132 -0
  241. vanna/servers/flask/routes.py +137 -0
  242. vanna/tools/__init__.py +41 -0
  243. vanna/tools/agent_memory.py +322 -0
  244. vanna/tools/file_system.py +879 -0
  245. vanna/tools/python.py +222 -0
  246. vanna/tools/run_sql.py +165 -0
  247. vanna/tools/visualize_data.py +195 -0
  248. vanna/utils/__init__.py +0 -0
  249. vanna/web_components/__init__.py +44 -0
  250. vanna-2.0.0rc1.dist-info/METADATA +868 -0
  251. vanna-2.0.0rc1.dist-info/RECORD +289 -0
  252. vanna-2.0.0rc1.dist-info/entry_points.txt +3 -0
  253. vanna/bedrock/__init__.py +0 -1
  254. vanna/cohere/__init__.py +0 -2
  255. vanna/faiss/__init__.py +0 -1
  256. vanna/mock/vectordb.py +0 -55
  257. vanna/ollama/ollama.py +0 -103
  258. vanna/opensearch/opensearch_vector.py +0 -392
  259. vanna/opensearch/opensearch_vector_semantic.py +0 -175
  260. vanna/oracle/oracle_vector.py +0 -585
  261. vanna/qianfan/Qianfan_Chat.py +0 -165
  262. vanna/qianfan/Qianfan_embeddings.py +0 -36
  263. vanna/qianwen/QianwenAI_chat.py +0 -133
  264. vanna-0.7.9.dist-info/METADATA +0 -408
  265. vanna-0.7.9.dist-info/RECORD +0 -79
  266. /vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_Chat.py +0 -0
  267. /vanna/{ZhipuAI → legacy/ZhipuAI}/__init__.py +0 -0
  268. /vanna/{anthropic → legacy/anthropic}/__init__.py +0 -0
  269. /vanna/{azuresearch → legacy/azuresearch}/__init__.py +0 -0
  270. /vanna/{base → legacy/base}/__init__.py +0 -0
  271. /vanna/{chromadb → legacy/chromadb}/__init__.py +0 -0
  272. /vanna/{deepseek → legacy/deepseek}/__init__.py +0 -0
  273. /vanna/{exceptions → legacy/exceptions}/__init__.py +0 -0
  274. /vanna/{google → legacy/google}/__init__.py +0 -0
  275. /vanna/{hf → legacy/hf}/__init__.py +0 -0
  276. /vanna/{local.py → legacy/local.py} +0 -0
  277. /vanna/{marqo → legacy/marqo}/__init__.py +0 -0
  278. /vanna/{marqo → legacy/marqo}/marqo.py +0 -0
  279. /vanna/{milvus → legacy/milvus}/__init__.py +0 -0
  280. /vanna/{mistral → legacy/mistral}/__init__.py +0 -0
  281. /vanna/{mistral → legacy/mistral}/mistral.py +0 -0
  282. /vanna/{mock → legacy/mock}/__init__.py +0 -0
  283. /vanna/{mock → legacy/mock}/embedding.py +0 -0
  284. /vanna/{ollama → legacy/ollama}/__init__.py +0 -0
  285. /vanna/{openai → legacy/openai}/__init__.py +0 -0
  286. /vanna/{openai → legacy/openai}/openai_embeddings.py +0 -0
  287. /vanna/{opensearch → legacy/opensearch}/__init__.py +0 -0
  288. /vanna/{oracle → legacy/oracle}/__init__.py +0 -0
  289. /vanna/{pgvector → legacy/pgvector}/__init__.py +0 -0
  290. /vanna/{pinecone → legacy/pinecone}/__init__.py +0 -0
  291. /vanna/{pinecone → legacy/pinecone}/pinecone_vector.py +0 -0
  292. /vanna/{qdrant → legacy/qdrant}/__init__.py +0 -0
  293. /vanna/{qianfan → legacy/qianfan}/__init__.py +0 -0
  294. /vanna/{qianwen → legacy/qianwen}/QianwenAI_embeddings.py +0 -0
  295. /vanna/{qianwen → legacy/qianwen}/__init__.py +0 -0
  296. /vanna/{types → legacy/types}/__init__.py +0 -0
  297. /vanna/{vannadb → legacy/vannadb}/__init__.py +0 -0
  298. /vanna/{vllm → legacy/vllm}/__init__.py +0 -0
  299. /vanna/{weaviate → legacy/weaviate}/__init__.py +0 -0
  300. /vanna/{xinference → legacy/xinference}/__init__.py +0 -0
  301. {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/WHEEL +0 -0
  302. {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/licenses/LICENSE +0 -0
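
Two structural changes dominate this list: every 0.7.9 module (base, flask, openai, chromadb, and the other provider packages) moves under vanna/legacy/, and a new vanna/core + vanna/integrations + vanna/tools tree takes over the top level. A minimal before/after import sketch, assuming the relocated modules keep their public class names (the diff shows file renames, not API changes, so treat this as illustrative):

    # vanna 0.7.9
    from vanna.openai.openai_chat import OpenAI_Chat
    from vanna.chromadb.chromadb_vector import ChromaDB_VectorStore

    # vanna 2.0.0rc1, per the vanna/{openai → legacy/openai} renames above
    from vanna.legacy.openai.openai_chat import OpenAI_Chat
    from vanna.legacy.chromadb.chromadb_vector import ChromaDB_VectorStore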
vanna/core/evaluation/evaluators.py (new file)
@@ -0,0 +1,376 @@
+"""
+Built-in evaluators for common evaluation tasks.
+
+This module provides ready-to-use evaluators for:
+- Trajectory evaluation (tools called, order, efficiency)
+- Output evaluation (content matching, quality)
+- LLM-as-judge evaluation (custom criteria)
+- Efficiency evaluation (time, tokens, cost)
+"""
+
+from typing import Dict, Any, Optional
+from datetime import datetime
+
+from .base import Evaluator, TestCase, AgentResult, EvaluationResult
+from vanna.core import LlmService
+
+
+class TrajectoryEvaluator(Evaluator):
+    """Evaluate the path the agent took (tools called, order, etc).
+
+    Checks if the agent called the expected tools and didn't call
+    unexpected ones. Useful for verifying agent reasoning and planning.
+    """
+
+    @property
+    def name(self) -> str:
+        return "trajectory"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate tool call trajectory."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        expected = test_case.expected_outcome
+        if not expected:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=True,
+                score=1.0,
+                reasoning="No expected outcome specified, passing by default",
+            )
+
+        tools_called = agent_result.get_tool_names_called()
+        issues = []
+        score = 1.0
+
+        # Check expected tools were called
+        if expected.tools_called:
+            for expected_tool in expected.tools_called:
+                if expected_tool not in tools_called:
+                    issues.append(f"Expected tool '{expected_tool}' was not called")
+                    score -= 0.5 / len(expected.tools_called)
+
+        # Check unexpected tools were not called
+        if expected.tools_not_called:
+            for unexpected_tool in expected.tools_not_called:
+                if unexpected_tool in tools_called:
+                    issues.append(f"Unexpected tool '{unexpected_tool}' was called")
+                    score -= 0.5 / len(expected.tools_not_called)
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7  # 70% threshold
+
+        reasoning = "Trajectory evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "All expected tools called, no unexpected tools"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "tools_called": tools_called,
+                "num_tools_called": len(tools_called),
+                "issues": issues,
+            },
+        )
+
+
+class OutputEvaluator(Evaluator):
+    """Evaluate the final output quality.
+
+    Checks if the output contains expected content and doesn't
+    contain forbidden content. Case-insensitive substring matching.
+    """
+
+    @property
+    def name(self) -> str:
+        return "output"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate output content."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        expected = test_case.expected_outcome
+        if not expected:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=True,
+                score=1.0,
+                reasoning="No expected outcome specified, passing by default",
+            )
+
+        final_answer = agent_result.get_final_answer().lower()
+        issues = []
+        score = 1.0
+
+        # Check expected content is present
+        if expected.final_answer_contains:
+            for expected_content in expected.final_answer_contains:
+                if expected_content.lower() not in final_answer:
+                    issues.append(
+                        f"Expected content '{expected_content}' not found in output"
+                    )
+                    score -= 0.5 / len(expected.final_answer_contains)
+
+        # Check forbidden content is absent
+        if expected.final_answer_not_contains:
+            for forbidden_content in expected.final_answer_not_contains:
+                if forbidden_content.lower() in final_answer:
+                    issues.append(
+                        f"Forbidden content '{forbidden_content}' found in output"
+                    )
+                    score -= 0.5 / len(expected.final_answer_not_contains)
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7  # 70% threshold
+
+        reasoning = "Output evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "All expected content present, no forbidden content"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "output_length": len(final_answer),
+                "issues": issues,
+            },
+        )
+
+
+class LLMAsJudgeEvaluator(Evaluator):
+    """Use an LLM to judge agent performance based on custom criteria.
+
+    This evaluator uses a separate LLM to assess the quality of the
+    agent's output based on natural language criteria.
+    """
+
+    def __init__(self, judge_llm: LlmService, criteria: str):
+        """Initialize LLM-as-judge evaluator.
+
+        Args:
+            judge_llm: The LLM service to use for judging
+            criteria: Natural language description of what to evaluate
+        """
+        self.judge_llm = judge_llm
+        self.criteria = criteria
+
+    @property
+    def name(self) -> str:
+        return "llm_judge"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate using LLM as judge."""
+        if agent_result.error:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"Agent execution failed: {agent_result.error}",
+            )
+
+        final_answer = agent_result.get_final_answer()
+
+        # Build prompt for judge
+        judge_prompt = f"""You are evaluating an AI agent's response to a user query.
+
+User Query: {test_case.message}
+
+Agent's Response:
+{final_answer}
+
+Evaluation Criteria:
+{self.criteria}
+
+Please evaluate the response and provide:
+1. A score from 0.0 to 1.0 (where 1.0 is perfect)
+2. Whether it passes (score >= 0.7)
+3. Brief reasoning for your evaluation
+
+Respond in this format:
+SCORE: <number>
+PASSED: <yes/no>
+REASONING: <your explanation>
+"""
+
+        try:
+            # Call judge LLM
+            from vanna.core.llm import LlmRequest, LlmMessage
+
+            request = LlmRequest(
+                user=test_case.user,
+                messages=[LlmMessage(role="user", content=judge_prompt)],
+                temperature=0.0,  # Deterministic judging
+            )
+
+            response = await self.judge_llm.send_request(request)
+            judgment = response.content or ""
+
+            # Parse response
+            score = self._parse_score(judgment)
+            passed = self._parse_passed(judgment)
+            reasoning = self._parse_reasoning(judgment)
+
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=passed,
+                score=score,
+                reasoning=reasoning,
+                metrics={"judge_response": judgment},
+            )
+
+        except Exception as e:
+            return EvaluationResult(
+                test_case_id=test_case.id,
+                evaluator_name=self.name,
+                passed=False,
+                score=0.0,
+                reasoning=f"LLM judge evaluation failed: {str(e)}",
+            )
+
+    def _parse_score(self, judgment: str) -> float:
+        """Parse score from judge response."""
+        try:
+            for line in judgment.split("\n"):
+                if line.startswith("SCORE:"):
+                    score_str = line.replace("SCORE:", "").strip()
+                    return float(score_str)
+        except Exception:
+            pass
+        return 0.5  # Default if parsing fails
+
+    def _parse_passed(self, judgment: str) -> bool:
+        """Parse pass/fail from judge response."""
+        for line in judgment.split("\n"):
+            if line.startswith("PASSED:"):
+                passed_str = line.replace("PASSED:", "").strip().lower()
+                return passed_str in ["yes", "true", "pass"]
+        return False
+
+    def _parse_reasoning(self, judgment: str) -> str:
+        """Parse reasoning from judge response."""
+        for line in judgment.split("\n"):
+            if line.startswith("REASONING:"):
+                return line.replace("REASONING:", "").strip()
+        return judgment  # Return full judgment if no reasoning line found
+
+
+class EfficiencyEvaluator(Evaluator):
+    """Evaluate resource usage (time, tokens, cost).
+
+    Checks if the agent completed within acceptable resource limits.
+    """
+
+    def __init__(
+        self,
+        max_execution_time_ms: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        max_cost_usd: Optional[float] = None,
+    ):
+        """Initialize efficiency evaluator.
+
+        Args:
+            max_execution_time_ms: Maximum allowed execution time in milliseconds
+            max_tokens: Maximum allowed token usage
+            max_cost_usd: Maximum allowed cost in USD
+        """
+        self.max_execution_time_ms = max_execution_time_ms
+        self.max_tokens = max_tokens
+        self.max_cost_usd = max_cost_usd
+
+    @property
+    def name(self) -> str:
+        return "efficiency"
+
+    async def evaluate(
+        self, test_case: TestCase, agent_result: AgentResult
+    ) -> EvaluationResult:
+        """Evaluate resource efficiency."""
+        issues = []
+        score = 1.0
+
+        # Check execution time
+        if self.max_execution_time_ms:
+            if agent_result.execution_time_ms > self.max_execution_time_ms:
+                issues.append(
+                    f"Execution time {agent_result.execution_time_ms:.0f}ms "
+                    f"exceeded limit {self.max_execution_time_ms:.0f}ms"
+                )
+                score -= 0.33
+
+        # Check token usage
+        if self.max_tokens:
+            if agent_result.total_tokens > self.max_tokens:
+                issues.append(
+                    f"Token usage {agent_result.total_tokens} exceeded limit {self.max_tokens}"
+                )
+                score -= 0.33
+
+        # Check cost (would need cost calculation from metadata)
+        # For now, skip cost evaluation
+
+        # Check from expected outcome if specified
+        expected = test_case.expected_outcome
+        if expected and expected.max_execution_time_ms:
+            if agent_result.execution_time_ms > expected.max_execution_time_ms:
+                issues.append(
+                    f"Execution time {agent_result.execution_time_ms:.0f}ms "
+                    f"exceeded test case limit {expected.max_execution_time_ms:.0f}ms"
+                )
+                score -= 0.34
+
+        score = max(0.0, min(1.0, score))
+        passed = score >= 0.7
+
+        reasoning = "Efficiency evaluation: "
+        if issues:
+            reasoning += "; ".join(issues)
+        else:
+            reasoning += "Within resource limits"
+
+        return EvaluationResult(
+            test_case_id=test_case.id,
+            evaluator_name=self.name,
+            passed=passed,
+            score=score,
+            reasoning=reasoning,
+            metrics={
+                "execution_time_ms": agent_result.execution_time_ms,
+                "total_tokens": agent_result.total_tokens,
+                "issues": issues,
+            },
+        )
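
TrajectoryEvaluator and OutputEvaluator share one scoring rule: the score starts at 1.0, each class of check (expected items present, forbidden items absent) carries 0.5 of the weight split evenly across its items, the result is clamped to [0.0, 1.0], and 0.7 is the pass threshold. A standalone illustration of that arithmetic (not part of the package; just the rule extracted for clarity):

    def check_score(expected: list[str], forbidden: list[str], observed: list[str]) -> float:
        # Mirrors the 0.5-per-check-class scoring used by the evaluators above.
        score = 1.0
        for item in expected:
            if item not in observed:
                score -= 0.5 / len(expected)
        for item in forbidden:
            if item in observed:
                score -= 0.5 / len(forbidden)
        return max(0.0, min(1.0, score))

    # Two of three expected tools called, none forbidden:
    # 1.0 - 0.5/3 ≈ 0.83, which clears the 0.7 pass threshold.
    print(check_score(["run_sql", "visualize_data", "save_file"], [], ["run_sql", "visualize_data"]))

Note also the contract LLMAsJudgeEvaluator imposes on its judge model: _parse_score, _parse_passed, and _parse_reasoning look for literal SCORE:, PASSED:, and REASONING: lines, and fall back to 0.5, False, and the raw judgment text respectively when the corresponding line is missing.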
vanna/core/evaluation/report.py (new file)
@@ -0,0 +1,289 @@
+"""
+Evaluation reporting with HTML, CSV, and console output.
+
+This module provides classes for generating evaluation reports,
+including comparison reports for evaluating multiple agent variants.
+"""
+
+import csv
+from typing import List, Dict, Optional, Any
+from dataclasses import dataclass, field
+from datetime import datetime
+
+from .base import TestCaseResult, AgentVariant, Evaluator, TestCase
+
+
+@dataclass
+class EvaluationReport:
+    """Report for a single agent's evaluation results.
+
+    Attributes:
+        agent_name: Name of the agent evaluated
+        results: List of results for each test case
+        evaluators: List of evaluators used
+        metadata: Additional metadata about the agent/run
+        timestamp: When the evaluation was run
+    """
+
+    agent_name: str
+    results: List[TestCaseResult]
+    evaluators: List[Evaluator]
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    timestamp: datetime = field(default_factory=datetime.now)
+
+    def pass_rate(self) -> float:
+        """Calculate overall pass rate (0.0 to 1.0)."""
+        if not self.results:
+            return 0.0
+        passed = sum(1 for r in self.results if r.overall_passed())
+        return passed / len(self.results)
+
+    def average_score(self) -> float:
+        """Calculate average score across all test cases."""
+        if not self.results:
+            return 0.0
+        return sum(r.overall_score() for r in self.results) / len(self.results)
+
+    def average_time(self) -> float:
+        """Calculate average execution time in milliseconds."""
+        if not self.results:
+            return 0.0
+        return sum(r.execution_time_ms for r in self.results) / len(self.results)
+
+    def total_tokens(self) -> int:
+        """Calculate total tokens used across all test cases."""
+        return sum(r.agent_result.total_tokens for r in self.results)
+
+    def get_failures(self) -> List[TestCaseResult]:
+        """Get all failed test cases."""
+        return [r for r in self.results if not r.overall_passed()]
+
+    def print_summary(self) -> None:
+        """Print summary to console."""
+        print(f"\n{'=' * 80}")
+        print(f"EVALUATION REPORT: {self.agent_name}")
+        print(f"{'=' * 80}")
+        print(f"Timestamp: {self.timestamp.isoformat()}")
+        print(f"Test Cases: {len(self.results)}")
+        print(f"Pass Rate: {self.pass_rate():.1%}")
+        print(f"Average Score: {self.average_score():.2f}")
+        print(f"Average Time: {self.average_time():.0f}ms")
+        print(f"Total Tokens: {self.total_tokens()}")
+        print(f"{'=' * 80}\n")
+
+        failures = self.get_failures()
+        if failures:
+            print(f"FAILURES ({len(failures)}):")
+            for result in failures:
+                print(f"\n  Test Case: {result.test_case.id}")
+                print(f"  Message: {result.test_case.message}")
+                print(f"  Score: {result.overall_score():.2f}")
+                for eval_result in result.evaluations:
+                    if not eval_result.passed:
+                        print(
+                            f"    [{eval_result.evaluator_name}] {eval_result.reasoning}"
+                        )
+
+
+@dataclass
+class ComparisonReport:
+    """Report comparing multiple agent variants.
+
+    This is the primary report type for LLM comparison use cases.
+
+    Attributes:
+        variants: List of agent variants compared
+        reports: Dict mapping variant name to EvaluationReport
+        test_cases: Test cases used for comparison
+        timestamp: When the comparison was run
+    """
+
+    variants: List[AgentVariant]
+    reports: Dict[str, EvaluationReport]
+    test_cases: List[TestCase]
+    timestamp: datetime = field(default_factory=datetime.now)
+
+    def print_summary(self) -> None:
+        """Print comparison summary to console."""
+        print("\n" + "=" * 80)
+        print("AGENT COMPARISON SUMMARY")
+        print("=" * 80)
+        print(f"Timestamp: {self.timestamp.isoformat()}")
+        print(f"Variants: {len(self.variants)}")
+        print(f"Test Cases: {len(self.test_cases)}")
+
+        # Table of results
+        print(
+            f"\n{'Agent':<25} {'Pass Rate':<12} {'Avg Score':<12} {'Avg Time':<12} {'Tokens':<12}"
+        )
+        print("-" * 80)
+
+        for variant_name, report in self.reports.items():
+            print(
+                f"{variant_name:<25} "
+                f"{report.pass_rate():<12.1%} "
+                f"{report.average_score():<12.2f} "
+                f"{report.average_time():<12.0f} "
+                f"{report.total_tokens():<12,}"
+            )
+
+        print("=" * 80 + "\n")
+
+    def get_best_variant(self, metric: str = "score") -> str:
+        """Get the best performing variant by metric.
+
+        Args:
+            metric: Metric to optimize ('score', 'speed', 'pass_rate')
+
+        Returns:
+            Name of the best variant
+        """
+        if metric == "score":
+            return max(self.reports.items(), key=lambda x: x[1].average_score())[0]
+        elif metric == "speed":
+            return min(self.reports.items(), key=lambda x: x[1].average_time())[0]
+        elif metric == "pass_rate":
+            return max(self.reports.items(), key=lambda x: x[1].pass_rate())[0]
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+    def save_csv(self, path: str) -> None:
+        """Save detailed CSV for further analysis.
+
+        Each row represents one test case × one variant combination.
+        """
+        with open(path, "w", newline="") as f:
+            writer = csv.writer(f)
+
+            # Header
+            writer.writerow(
+                [
+                    "variant",
+                    "test_case_id",
+                    "test_message",
+                    "passed",
+                    "score",
+                    "execution_time_ms",
+                    "tokens",
+                    "error",
+                    "evaluator_scores",
+                ]
+            )
+
+            # Data rows
+            for variant_name, report in self.reports.items():
+                for result in report.results:
+                    evaluator_scores = {
+                        e.evaluator_name: e.score for e in result.evaluations
+                    }
+
+                    writer.writerow(
+                        [
+                            variant_name,
+                            result.test_case.id,
+                            result.test_case.message[:50],  # Truncate
+                            result.overall_passed(),
+                            result.overall_score(),
+                            result.execution_time_ms,
+                            result.agent_result.total_tokens,
+                            result.agent_result.error or "",
+                            str(evaluator_scores),
+                        ]
+                    )
+
+    def save_html(self, path: str) -> None:
+        """Save interactive HTML comparison report.
+
+        Generates a rich HTML report with:
+        - Summary statistics
+        - Charts comparing variants
+        - Side-by-side test case results
+        """
+        html = self._generate_html()
+        with open(path, "w") as f:
+            f.write(html)
+
+    def _generate_html(self) -> str:
+        """Generate HTML content for report."""
+        # Build HTML report
+        html_parts = [
+            "<!DOCTYPE html>",
+            "<html>",
+            "<head>",
+            "<title>Agent Comparison Report</title>",
+            "<style>",
+            "body { font-family: Arial, sans-serif; margin: 20px; }",
+            "h1 { color: #333; }",
+            "table { border-collapse: collapse; width: 100%; margin: 20px 0; }",
+            "th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }",
+            "th { background-color: #4CAF50; color: white; }",
+            "tr:nth-child(even) { background-color: #f2f2f2; }",
+            ".passed { color: green; font-weight: bold; }",
+            ".failed { color: red; font-weight: bold; }",
+            ".best { background-color: #d4edda !important; }",
+            "</style>",
+            "</head>",
+            "<body>",
+            "<h1>Agent Comparison Report</h1>",
+            f"<p>Generated: {self.timestamp.isoformat()}</p>",
+            f"<p>Variants: {len(self.variants)} | Test Cases: {len(self.test_cases)}</p>",
+        ]
+
+        # Summary table
+        html_parts.append("<h2>Summary</h2>")
+        html_parts.append("<table>")
+        html_parts.append(
+            "<tr><th>Agent</th><th>Pass Rate</th><th>Avg Score</th><th>Avg Time (ms)</th><th>Total Tokens</th></tr>"
+        )
+
+        best_by_score = self.get_best_variant("score")
+
+        for variant_name, report in self.reports.items():
+            row_class = "best" if variant_name == best_by_score else ""
+            html_parts.append(
+                f"<tr class='{row_class}'>"
+                f"<td>{variant_name}</td>"
+                f"<td>{report.pass_rate():.1%}</td>"
+                f"<td>{report.average_score():.2f}</td>"
+                f"<td>{report.average_time():.0f}</td>"
+                f"<td>{report.total_tokens():,}</td>"
+                f"</tr>"
+            )
+
+        html_parts.append("</table>")
+
+        # Test case details
+        html_parts.append("<h2>Test Case Details</h2>")
+
+        for i, test_case in enumerate(self.test_cases):
+            html_parts.append(f"<h3>Test Case {i + 1}: {test_case.id}</h3>")
+            html_parts.append(f"<p><strong>Message:</strong> {test_case.message}</p>")
+
+            html_parts.append("<table>")
+            html_parts.append(
+                "<tr><th>Variant</th><th>Result</th><th>Score</th><th>Time (ms)</th></tr>"
+            )
+
+            for variant_name, report in self.reports.items():
+                result = next(
+                    (r for r in report.results if r.test_case.id == test_case.id), None
+                )
+                if result:
+                    passed_class = "passed" if result.overall_passed() else "failed"
+                    passed_text = "PASS" if result.overall_passed() else "FAIL"
+
+                    html_parts.append(
+                        f"<tr>"
+                        f"<td>{variant_name}</td>"
+                        f"<td class='{passed_class}'>{passed_text}</td>"
+                        f"<td>{result.overall_score():.2f}</td>"
+                        f"<td>{result.execution_time_ms:.0f}</td>"
+                        f"</tr>"
+                    )
+
+            html_parts.append("</table>")
+
+        html_parts.append("</body>")
+        html_parts.append("</html>")
+
+        return "\n".join(html_parts)
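
Because EvaluationReport is a plain dataclass and only calls overall_passed(), overall_score(), execution_time_ms, and agent_result.total_tokens on its result objects, duck-typed stand-ins are enough to exercise the reporting path. A hedged sketch (the real TestCaseResult and related types live in vanna/core/evaluation/base.py, which this diff does not show, so the stub fields below are inferred from the usage above):

    from dataclasses import dataclass, field

    from vanna.core.evaluation.report import EvaluationReport

    @dataclass
    class _Eval:            # stands in for EvaluationResult
        evaluator_name: str
        score: float
        passed: bool = True
        reasoning: str = ""

    @dataclass
    class _AgentResult:     # stands in for AgentResult
        total_tokens: int
        error: str = ""

    @dataclass
    class _Case:            # stands in for TestCase
        id: str
        message: str

    @dataclass
    class _Result:          # stands in for TestCaseResult
        test_case: _Case
        agent_result: _AgentResult
        execution_time_ms: float
        evaluations: list = field(default_factory=list)

        def overall_passed(self) -> bool:
            return all(e.passed for e in self.evaluations)

        def overall_score(self) -> float:
            return sum(e.score for e in self.evaluations) / len(self.evaluations)

    result = _Result(
        test_case=_Case(id="tc-1", message="How many customers signed up last month?"),
        agent_result=_AgentResult(total_tokens=1200),
        execution_time_ms=850.0,
        evaluations=[_Eval("trajectory", 1.0), _Eval("output", 0.9)],
    )
    report = EvaluationReport(agent_name="claude-variant", results=[result], evaluators=[])
    report.print_summary()  # pass rate 100%, avg score 0.95, 850ms, 1200 tokens

Dataclasses don't enforce field types at runtime, so passing stubs where List[TestCaseResult] is annotated works here; a typed test would build the real objects from base.py instead.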