vanna 0.7.9__py3-none-any.whl → 2.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. vanna/__init__.py +167 -395
  2. vanna/agents/__init__.py +7 -0
  3. vanna/capabilities/__init__.py +17 -0
  4. vanna/capabilities/agent_memory/__init__.py +21 -0
  5. vanna/capabilities/agent_memory/base.py +103 -0
  6. vanna/capabilities/agent_memory/models.py +53 -0
  7. vanna/capabilities/file_system/__init__.py +14 -0
  8. vanna/capabilities/file_system/base.py +71 -0
  9. vanna/capabilities/file_system/models.py +25 -0
  10. vanna/capabilities/sql_runner/__init__.py +13 -0
  11. vanna/capabilities/sql_runner/base.py +37 -0
  12. vanna/capabilities/sql_runner/models.py +13 -0
  13. vanna/components/__init__.py +92 -0
  14. vanna/components/base.py +11 -0
  15. vanna/components/rich/__init__.py +83 -0
  16. vanna/components/rich/containers/__init__.py +7 -0
  17. vanna/components/rich/containers/card.py +20 -0
  18. vanna/components/rich/data/__init__.py +9 -0
  19. vanna/components/rich/data/chart.py +17 -0
  20. vanna/components/rich/data/dataframe.py +93 -0
  21. vanna/components/rich/feedback/__init__.py +21 -0
  22. vanna/components/rich/feedback/badge.py +16 -0
  23. vanna/components/rich/feedback/icon_text.py +14 -0
  24. vanna/components/rich/feedback/log_viewer.py +41 -0
  25. vanna/components/rich/feedback/notification.py +19 -0
  26. vanna/components/rich/feedback/progress.py +37 -0
  27. vanna/components/rich/feedback/status_card.py +28 -0
  28. vanna/components/rich/feedback/status_indicator.py +14 -0
  29. vanna/components/rich/interactive/__init__.py +21 -0
  30. vanna/components/rich/interactive/button.py +95 -0
  31. vanna/components/rich/interactive/task_list.py +58 -0
  32. vanna/components/rich/interactive/ui_state.py +93 -0
  33. vanna/components/rich/specialized/__init__.py +7 -0
  34. vanna/components/rich/specialized/artifact.py +20 -0
  35. vanna/components/rich/text.py +16 -0
  36. vanna/components/simple/__init__.py +15 -0
  37. vanna/components/simple/image.py +15 -0
  38. vanna/components/simple/link.py +15 -0
  39. vanna/components/simple/text.py +11 -0
  40. vanna/core/__init__.py +193 -0
  41. vanna/core/_compat.py +19 -0
  42. vanna/core/agent/__init__.py +10 -0
  43. vanna/core/agent/agent.py +1407 -0
  44. vanna/core/agent/config.py +123 -0
  45. vanna/core/audit/__init__.py +28 -0
  46. vanna/core/audit/base.py +299 -0
  47. vanna/core/audit/models.py +131 -0
  48. vanna/core/component_manager.py +329 -0
  49. vanna/core/components.py +53 -0
  50. vanna/core/enhancer/__init__.py +11 -0
  51. vanna/core/enhancer/base.py +94 -0
  52. vanna/core/enhancer/default.py +118 -0
  53. vanna/core/enricher/__init__.py +10 -0
  54. vanna/core/enricher/base.py +59 -0
  55. vanna/core/errors.py +47 -0
  56. vanna/core/evaluation/__init__.py +81 -0
  57. vanna/core/evaluation/base.py +186 -0
  58. vanna/core/evaluation/dataset.py +254 -0
  59. vanna/core/evaluation/evaluators.py +376 -0
  60. vanna/core/evaluation/report.py +289 -0
  61. vanna/core/evaluation/runner.py +313 -0
  62. vanna/core/filter/__init__.py +10 -0
  63. vanna/core/filter/base.py +67 -0
  64. vanna/core/lifecycle/__init__.py +10 -0
  65. vanna/core/lifecycle/base.py +83 -0
  66. vanna/core/llm/__init__.py +16 -0
  67. vanna/core/llm/base.py +40 -0
  68. vanna/core/llm/models.py +61 -0
  69. vanna/core/middleware/__init__.py +10 -0
  70. vanna/core/middleware/base.py +69 -0
  71. vanna/core/observability/__init__.py +11 -0
  72. vanna/core/observability/base.py +88 -0
  73. vanna/core/observability/models.py +47 -0
  74. vanna/core/recovery/__init__.py +11 -0
  75. vanna/core/recovery/base.py +84 -0
  76. vanna/core/recovery/models.py +32 -0
  77. vanna/core/registry.py +278 -0
  78. vanna/core/rich_component.py +156 -0
  79. vanna/core/simple_component.py +27 -0
  80. vanna/core/storage/__init__.py +14 -0
  81. vanna/core/storage/base.py +46 -0
  82. vanna/core/storage/models.py +46 -0
  83. vanna/core/system_prompt/__init__.py +13 -0
  84. vanna/core/system_prompt/base.py +36 -0
  85. vanna/core/system_prompt/default.py +157 -0
  86. vanna/core/tool/__init__.py +18 -0
  87. vanna/core/tool/base.py +70 -0
  88. vanna/core/tool/models.py +84 -0
  89. vanna/core/user/__init__.py +17 -0
  90. vanna/core/user/base.py +29 -0
  91. vanna/core/user/models.py +25 -0
  92. vanna/core/user/request_context.py +70 -0
  93. vanna/core/user/resolver.py +42 -0
  94. vanna/core/validation.py +164 -0
  95. vanna/core/workflow/__init__.py +12 -0
  96. vanna/core/workflow/base.py +254 -0
  97. vanna/core/workflow/default.py +789 -0
  98. vanna/examples/__init__.py +1 -0
  99. vanna/examples/__main__.py +44 -0
  100. vanna/examples/anthropic_quickstart.py +80 -0
  101. vanna/examples/artifact_example.py +293 -0
  102. vanna/examples/claude_sqlite_example.py +236 -0
  103. vanna/examples/coding_agent_example.py +300 -0
  104. vanna/examples/custom_system_prompt_example.py +174 -0
  105. vanna/examples/default_workflow_handler_example.py +208 -0
  106. vanna/examples/email_auth_example.py +340 -0
  107. vanna/examples/evaluation_example.py +269 -0
  108. vanna/examples/extensibility_example.py +262 -0
  109. vanna/examples/minimal_example.py +67 -0
  110. vanna/examples/mock_auth_example.py +227 -0
  111. vanna/examples/mock_custom_tool.py +311 -0
  112. vanna/examples/mock_quickstart.py +79 -0
  113. vanna/examples/mock_quota_example.py +145 -0
  114. vanna/examples/mock_rich_components_demo.py +396 -0
  115. vanna/examples/mock_sqlite_example.py +223 -0
  116. vanna/examples/openai_quickstart.py +83 -0
  117. vanna/examples/primitive_components_demo.py +305 -0
  118. vanna/examples/quota_lifecycle_example.py +139 -0
  119. vanna/examples/visualization_example.py +251 -0
  120. vanna/integrations/__init__.py +17 -0
  121. vanna/integrations/anthropic/__init__.py +9 -0
  122. vanna/integrations/anthropic/llm.py +270 -0
  123. vanna/integrations/azureopenai/__init__.py +9 -0
  124. vanna/integrations/azureopenai/llm.py +329 -0
  125. vanna/integrations/azuresearch/__init__.py +7 -0
  126. vanna/integrations/azuresearch/agent_memory.py +413 -0
  127. vanna/integrations/bigquery/__init__.py +5 -0
  128. vanna/integrations/bigquery/sql_runner.py +81 -0
  129. vanna/integrations/chromadb/__init__.py +104 -0
  130. vanna/integrations/chromadb/agent_memory.py +416 -0
  131. vanna/integrations/clickhouse/__init__.py +5 -0
  132. vanna/integrations/clickhouse/sql_runner.py +82 -0
  133. vanna/integrations/duckdb/__init__.py +5 -0
  134. vanna/integrations/duckdb/sql_runner.py +65 -0
  135. vanna/integrations/faiss/__init__.py +7 -0
  136. vanna/integrations/faiss/agent_memory.py +431 -0
  137. vanna/integrations/google/__init__.py +9 -0
  138. vanna/integrations/google/gemini.py +370 -0
  139. vanna/integrations/hive/__init__.py +5 -0
  140. vanna/integrations/hive/sql_runner.py +87 -0
  141. vanna/integrations/local/__init__.py +17 -0
  142. vanna/integrations/local/agent_memory/__init__.py +7 -0
  143. vanna/integrations/local/agent_memory/in_memory.py +285 -0
  144. vanna/integrations/local/audit.py +59 -0
  145. vanna/integrations/local/file_system.py +242 -0
  146. vanna/integrations/local/file_system_conversation_store.py +255 -0
  147. vanna/integrations/local/storage.py +62 -0
  148. vanna/integrations/marqo/__init__.py +7 -0
  149. vanna/integrations/marqo/agent_memory.py +354 -0
  150. vanna/integrations/milvus/__init__.py +7 -0
  151. vanna/integrations/milvus/agent_memory.py +458 -0
  152. vanna/integrations/mock/__init__.py +9 -0
  153. vanna/integrations/mock/llm.py +65 -0
  154. vanna/integrations/mssql/__init__.py +5 -0
  155. vanna/integrations/mssql/sql_runner.py +66 -0
  156. vanna/integrations/mysql/__init__.py +5 -0
  157. vanna/integrations/mysql/sql_runner.py +92 -0
  158. vanna/integrations/ollama/__init__.py +7 -0
  159. vanna/integrations/ollama/llm.py +252 -0
  160. vanna/integrations/openai/__init__.py +10 -0
  161. vanna/integrations/openai/llm.py +267 -0
  162. vanna/integrations/openai/responses.py +163 -0
  163. vanna/integrations/opensearch/__init__.py +7 -0
  164. vanna/integrations/opensearch/agent_memory.py +411 -0
  165. vanna/integrations/oracle/__init__.py +5 -0
  166. vanna/integrations/oracle/sql_runner.py +75 -0
  167. vanna/integrations/pinecone/__init__.py +7 -0
  168. vanna/integrations/pinecone/agent_memory.py +329 -0
  169. vanna/integrations/plotly/__init__.py +5 -0
  170. vanna/integrations/plotly/chart_generator.py +313 -0
  171. vanna/integrations/postgres/__init__.py +9 -0
  172. vanna/integrations/postgres/sql_runner.py +112 -0
  173. vanna/integrations/premium/agent_memory/__init__.py +7 -0
  174. vanna/integrations/premium/agent_memory/premium.py +186 -0
  175. vanna/integrations/presto/__init__.py +5 -0
  176. vanna/integrations/presto/sql_runner.py +107 -0
  177. vanna/integrations/qdrant/__init__.py +7 -0
  178. vanna/integrations/qdrant/agent_memory.py +439 -0
  179. vanna/integrations/snowflake/__init__.py +5 -0
  180. vanna/integrations/snowflake/sql_runner.py +147 -0
  181. vanna/integrations/sqlite/__init__.py +9 -0
  182. vanna/integrations/sqlite/sql_runner.py +65 -0
  183. vanna/integrations/weaviate/__init__.py +7 -0
  184. vanna/integrations/weaviate/agent_memory.py +428 -0
  185. vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_embeddings.py +11 -11
  186. vanna/legacy/__init__.py +403 -0
  187. vanna/legacy/adapter.py +463 -0
  188. vanna/{advanced → legacy/advanced}/__init__.py +3 -1
  189. vanna/{anthropic → legacy/anthropic}/anthropic_chat.py +9 -7
  190. vanna/{azuresearch → legacy/azuresearch}/azuresearch_vector.py +79 -41
  191. vanna/{base → legacy/base}/base.py +224 -217
  192. vanna/legacy/bedrock/__init__.py +1 -0
  193. vanna/{bedrock → legacy/bedrock}/bedrock_converse.py +13 -12
  194. vanna/{chromadb → legacy/chromadb}/chromadb_vector.py +3 -1
  195. vanna/legacy/cohere/__init__.py +2 -0
  196. vanna/{cohere → legacy/cohere}/cohere_chat.py +19 -14
  197. vanna/{cohere → legacy/cohere}/cohere_embeddings.py +25 -19
  198. vanna/{deepseek → legacy/deepseek}/deepseek_chat.py +5 -6
  199. vanna/legacy/faiss/__init__.py +1 -0
  200. vanna/{faiss → legacy/faiss}/faiss.py +113 -59
  201. vanna/{flask → legacy/flask}/__init__.py +84 -43
  202. vanna/{flask → legacy/flask}/assets.py +5 -5
  203. vanna/{flask → legacy/flask}/auth.py +5 -4
  204. vanna/{google → legacy/google}/bigquery_vector.py +75 -42
  205. vanna/{google → legacy/google}/gemini_chat.py +7 -3
  206. vanna/{hf → legacy/hf}/hf.py +0 -1
  207. vanna/{milvus → legacy/milvus}/milvus_vector.py +58 -35
  208. vanna/{mock → legacy/mock}/llm.py +0 -1
  209. vanna/legacy/mock/vectordb.py +67 -0
  210. vanna/legacy/ollama/ollama.py +110 -0
  211. vanna/{openai → legacy/openai}/openai_chat.py +2 -6
  212. vanna/legacy/opensearch/opensearch_vector.py +369 -0
  213. vanna/legacy/opensearch/opensearch_vector_semantic.py +200 -0
  214. vanna/legacy/oracle/oracle_vector.py +584 -0
  215. vanna/{pgvector → legacy/pgvector}/pgvector.py +42 -13
  216. vanna/{qdrant → legacy/qdrant}/qdrant.py +2 -6
  217. vanna/legacy/qianfan/Qianfan_Chat.py +170 -0
  218. vanna/legacy/qianfan/Qianfan_embeddings.py +36 -0
  219. vanna/legacy/qianwen/QianwenAI_chat.py +132 -0
  220. vanna/{remote.py → legacy/remote.py} +28 -26
  221. vanna/{utils.py → legacy/utils.py} +6 -11
  222. vanna/{vannadb → legacy/vannadb}/vannadb_vector.py +115 -46
  223. vanna/{vllm → legacy/vllm}/vllm.py +5 -6
  224. vanna/{weaviate → legacy/weaviate}/weaviate_vector.py +59 -40
  225. vanna/{xinference → legacy/xinference}/xinference.py +6 -6
  226. vanna/py.typed +0 -0
  227. vanna/servers/__init__.py +16 -0
  228. vanna/servers/__main__.py +8 -0
  229. vanna/servers/base/__init__.py +18 -0
  230. vanna/servers/base/chat_handler.py +65 -0
  231. vanna/servers/base/models.py +111 -0
  232. vanna/servers/base/rich_chat_handler.py +141 -0
  233. vanna/servers/base/templates.py +331 -0
  234. vanna/servers/cli/__init__.py +7 -0
  235. vanna/servers/cli/server_runner.py +204 -0
  236. vanna/servers/fastapi/__init__.py +7 -0
  237. vanna/servers/fastapi/app.py +163 -0
  238. vanna/servers/fastapi/routes.py +183 -0
  239. vanna/servers/flask/__init__.py +7 -0
  240. vanna/servers/flask/app.py +132 -0
  241. vanna/servers/flask/routes.py +137 -0
  242. vanna/tools/__init__.py +41 -0
  243. vanna/tools/agent_memory.py +322 -0
  244. vanna/tools/file_system.py +879 -0
  245. vanna/tools/python.py +222 -0
  246. vanna/tools/run_sql.py +165 -0
  247. vanna/tools/visualize_data.py +195 -0
  248. vanna/utils/__init__.py +0 -0
  249. vanna/web_components/__init__.py +44 -0
  250. vanna-2.0.0rc1.dist-info/METADATA +868 -0
  251. vanna-2.0.0rc1.dist-info/RECORD +289 -0
  252. vanna-2.0.0rc1.dist-info/entry_points.txt +3 -0
  253. vanna/bedrock/__init__.py +0 -1
  254. vanna/cohere/__init__.py +0 -2
  255. vanna/faiss/__init__.py +0 -1
  256. vanna/mock/vectordb.py +0 -55
  257. vanna/ollama/ollama.py +0 -103
  258. vanna/opensearch/opensearch_vector.py +0 -392
  259. vanna/opensearch/opensearch_vector_semantic.py +0 -175
  260. vanna/oracle/oracle_vector.py +0 -585
  261. vanna/qianfan/Qianfan_Chat.py +0 -165
  262. vanna/qianfan/Qianfan_embeddings.py +0 -36
  263. vanna/qianwen/QianwenAI_chat.py +0 -133
  264. vanna-0.7.9.dist-info/METADATA +0 -408
  265. vanna-0.7.9.dist-info/RECORD +0 -79
  266. /vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_Chat.py +0 -0
  267. /vanna/{ZhipuAI → legacy/ZhipuAI}/__init__.py +0 -0
  268. /vanna/{anthropic → legacy/anthropic}/__init__.py +0 -0
  269. /vanna/{azuresearch → legacy/azuresearch}/__init__.py +0 -0
  270. /vanna/{base → legacy/base}/__init__.py +0 -0
  271. /vanna/{chromadb → legacy/chromadb}/__init__.py +0 -0
  272. /vanna/{deepseek → legacy/deepseek}/__init__.py +0 -0
  273. /vanna/{exceptions → legacy/exceptions}/__init__.py +0 -0
  274. /vanna/{google → legacy/google}/__init__.py +0 -0
  275. /vanna/{hf → legacy/hf}/__init__.py +0 -0
  276. /vanna/{local.py → legacy/local.py} +0 -0
  277. /vanna/{marqo → legacy/marqo}/__init__.py +0 -0
  278. /vanna/{marqo → legacy/marqo}/marqo.py +0 -0
  279. /vanna/{milvus → legacy/milvus}/__init__.py +0 -0
  280. /vanna/{mistral → legacy/mistral}/__init__.py +0 -0
  281. /vanna/{mistral → legacy/mistral}/mistral.py +0 -0
  282. /vanna/{mock → legacy/mock}/__init__.py +0 -0
  283. /vanna/{mock → legacy/mock}/embedding.py +0 -0
  284. /vanna/{ollama → legacy/ollama}/__init__.py +0 -0
  285. /vanna/{openai → legacy/openai}/__init__.py +0 -0
  286. /vanna/{openai → legacy/openai}/openai_embeddings.py +0 -0
  287. /vanna/{opensearch → legacy/opensearch}/__init__.py +0 -0
  288. /vanna/{oracle → legacy/oracle}/__init__.py +0 -0
  289. /vanna/{pgvector → legacy/pgvector}/__init__.py +0 -0
  290. /vanna/{pinecone → legacy/pinecone}/__init__.py +0 -0
  291. /vanna/{pinecone → legacy/pinecone}/pinecone_vector.py +0 -0
  292. /vanna/{qdrant → legacy/qdrant}/__init__.py +0 -0
  293. /vanna/{qianfan → legacy/qianfan}/__init__.py +0 -0
  294. /vanna/{qianwen → legacy/qianwen}/QianwenAI_embeddings.py +0 -0
  295. /vanna/{qianwen → legacy/qianwen}/__init__.py +0 -0
  296. /vanna/{types → legacy/types}/__init__.py +0 -0
  297. /vanna/{vannadb → legacy/vannadb}/__init__.py +0 -0
  298. /vanna/{vllm → legacy/vllm}/__init__.py +0 -0
  299. /vanna/{weaviate → legacy/weaviate}/__init__.py +0 -0
  300. /vanna/{xinference → legacy/xinference}/__init__.py +0 -0
  301. {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/WHEEL +0 -0
  302. {vanna-0.7.9.dist-info → vanna-2.0.0rc1.dist-info}/licenses/LICENSE +0 -0
vanna/core/errors.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ Exception classes for the Vanna Agents framework.
3
+
4
+ This module defines all custom exceptions used throughout the framework.
5
+ """
6
+
7
+
8
class AgentError(Exception):
    """Root exception type for the agent framework.

    The other exception classes in this module derive from this class, so
    catching ``AgentError`` also catches each of them.
    """
12
+
13
+
14
class ToolExecutionError(AgentError):
    """Raised to signal an error during tool execution."""
18
+
19
+
20
class ToolNotFoundError(AgentError):
    """Raised to signal that a tool was not found in the registry."""
24
+
25
+
26
class PermissionError(AgentError):
    """Raised to signal that the user lacks required permissions.

    Note: this class has the same name as Python's built-in
    ``PermissionError`` but is unrelated to it; it derives from
    ``AgentError``. Importing it by name into a module shadows the
    built-in in that module.
    """
30
+
31
+
32
class ConversationNotFoundError(AgentError):
    """Raised to signal that a conversation was not found."""
36
+
37
+
38
class LlmServiceError(AgentError):
    """Raised to signal an error communicating with the LLM service."""
42
+
43
+
44
class ValidationError(AgentError):
    """Raised to signal a data validation error.

    This is the framework's own exception type, derived from ``AgentError``.
    """
@@ -0,0 +1,81 @@
1
+ """
2
+ Evaluation framework for Vanna Agents.
3
+
4
+ This module provides a complete evaluation system for testing and comparing
5
+ agent variants, with special focus on LLM comparison use cases.
6
+
7
+ Key Features:
8
+ - Parallel execution for efficient I/O-bound operations
9
+ - Multiple built-in evaluators (trajectory, output, LLM-as-judge, efficiency)
10
+ - Rich reporting (HTML, CSV, console)
11
+ - Dataset loaders (YAML, JSON)
12
+ - Agent variant comparison
13
+
14
+ Example:
15
+ >>> from vanna.core.evaluation import (
16
+ ... EvaluationRunner,
17
+ ... EvaluationDataset,
18
+ ... AgentVariant,
19
+ ... TrajectoryEvaluator,
20
+ ... OutputEvaluator,
21
+ ... )
22
+ >>>
23
+ >>> # Load test dataset
24
+ >>> dataset = EvaluationDataset.from_yaml("tests/sql_tasks.yaml")
25
+ >>>
26
+ >>> # Create agent variants
27
+ >>> variants = [
28
+ ... AgentVariant("claude", claude_agent),
29
+ ... AgentVariant("gpt", gpt_agent),
30
+ ... ]
31
+ >>>
32
+ >>> # Run comparison
33
+ >>> runner = EvaluationRunner(
34
+ ... evaluators=[TrajectoryEvaluator(), OutputEvaluator()],
35
+ ... max_concurrency=20
36
+ ... )
37
+ >>> comparison = await runner.compare_agents(variants, dataset.test_cases)
38
+ >>> comparison.print_summary()
39
+ """
40
+
41
+ from .base import (
42
+ Evaluator,
43
+ TestCase,
44
+ ExpectedOutcome,
45
+ AgentResult,
46
+ EvaluationResult,
47
+ TestCaseResult,
48
+ AgentVariant,
49
+ )
50
+ from .runner import EvaluationRunner
51
+ from .evaluators import (
52
+ TrajectoryEvaluator,
53
+ OutputEvaluator,
54
+ LLMAsJudgeEvaluator,
55
+ EfficiencyEvaluator,
56
+ )
57
+ from .report import EvaluationReport, ComparisonReport
58
+ from .dataset import EvaluationDataset
59
+
60
# Names re-exported by this package, grouped by the submodule they are
# imported from above.
__all__ = [
    # .base: core abstractions and result containers
    "Evaluator",
    "TestCase",
    "ExpectedOutcome",
    "AgentResult",
    "EvaluationResult",
    "TestCaseResult",
    "AgentVariant",
    # .runner
    "EvaluationRunner",
    # .evaluators: built-in evaluator implementations
    "TrajectoryEvaluator",
    "OutputEvaluator",
    "LLMAsJudgeEvaluator",
    "EfficiencyEvaluator",
    # .report
    "EvaluationReport",
    "ComparisonReport",
    # .dataset
    "EvaluationDataset",
]
@@ -0,0 +1,186 @@
1
+ """
2
+ Core evaluation abstractions for the Vanna Agents framework.
3
+
4
+ This module provides the base classes and models for evaluating agent behavior,
5
+ including test cases, expected outcomes, and evaluation results.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Any, Dict, List, Optional, Callable
10
+ from dataclasses import dataclass, field
11
+ from datetime import datetime
12
+ from pydantic import BaseModel, Field
13
+
14
+ from vanna.core import User, UiComponent
15
+
16
+
17
class ExpectedOutcome(BaseModel):
    """Defines what we expect from the agent for a test case.

    Every expectation field is optional and defaults to ``None``.

    Attributes:
        tools_called: List of tool names that should be called.
        tools_not_called: List of tool names that should NOT be called.
        final_answer_contains: Keywords/phrases that should appear in output.
        final_answer_not_contains: Keywords/phrases that should NOT appear.
        min_components: Minimum number of UI components expected.
        max_components: Maximum number of UI components expected.
        max_execution_time_ms: Maximum allowed execution time, in
            milliseconds.
        metadata: Free-form additional data attached to the expectation.
    """

    tools_called: Optional[List[str]] = None
    tools_not_called: Optional[List[str]] = None
    final_answer_contains: Optional[List[str]] = None
    final_answer_not_contains: Optional[List[str]] = None
    min_components: Optional[int] = None
    max_components: Optional[int] = None
    max_execution_time_ms: Optional[float] = None
    metadata: Dict[str, Any] = {}
38
+
39
+
40
class TestCase(BaseModel):
    """One evaluation scenario: a message sent to the agent by a user.

    Attributes:
        id: Unique identifier for the test case.
        user: User context the test runs under.
        message: The message to send to the agent.
        conversation_id: Conversation ID for multi-turn tests; ``None`` by
            default.
        expected_outcome: What the agent is expected to do/produce; ``None``
            by default.
        metadata: Additional metadata for categorization/filtering.
    """

    # Required fields
    id: str
    user: User
    message: str

    # Optional fields
    conversation_id: Optional[str] = None
    expected_outcome: Optional[ExpectedOutcome] = None
    metadata: Dict[str, Any] = {}
58
+
59
+
60
@dataclass
class AgentResult:
    """The result of running an agent on a test case.

    Captures everything that happened during agent execution
    for later evaluation.

    Attributes:
        test_case_id: ID of the test case this result belongs to.
        components: UI components collected from the agent run.
        tool_calls: One dictionary per tool call; the ``"tool_name"`` key is
            read by :meth:`get_tool_names_called`.
        llm_requests: One dictionary per LLM request.
        execution_time_ms: Execution time in milliseconds.
        total_tokens: Token count for the run.
        error: Error description, or ``None``.
        metadata: Additional data attached to the result.
    """

    test_case_id: str
    components: List[UiComponent]
    tool_calls: List[Dict[str, Any]] = field(default_factory=list)
    llm_requests: List[Dict[str, Any]] = field(default_factory=list)
    execution_time_ms: float = 0.0
    total_tokens: int = 0
    error: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_final_answer(self) -> str:
        """Extract the final answer from components.

        Collects the content of every component whose ``rich_component`` has
        type ``"text"`` and joins the pieces with newlines. Content is taken
        from ``rich_component.data["content"]`` when present and truthy,
        otherwise from a ``content`` attribute on the rich component.

        Returns:
            The concatenated text, or an empty string when no text content
            was found.
        """
        texts = []
        for component in self.components:
            rich_comp = getattr(component, "rich_component", None)
            if not hasattr(rich_comp, "type"):
                continue

            # Accept both enum-like types (with ``.value``) and plain strings.
            comp_type = rich_comp.type
            if getattr(comp_type, "value", comp_type) != "text":
                continue

            # ``data`` may be missing or None; fall back to the attribute.
            data = getattr(rich_comp, "data", None) or {}
            content = data.get("content") or getattr(rich_comp, "content", "")
            if content:
                texts.append(content)
        return "\n".join(texts)

    def get_tool_names_called(self) -> List[str]:
        """Get list of tool names that were called.

        Returns:
            The ``"tool_name"`` value of each entry in ``tool_calls``, in
            order; entries without that key contribute an empty string.
        """
        return [call.get("tool_name", "") for call in self.tool_calls]
95
+
96
+
97
class EvaluationResult(BaseModel):
    """Result of evaluating a single test case.

    Attributes:
        test_case_id: ID of the test case evaluated
        evaluator_name: Name of the evaluator that produced this result
        passed: Whether the test case passed
        score: Score from 0.0 to 1.0
        reasoning: Explanation of the evaluation
        metrics: Additional metrics captured during evaluation
        timestamp: When the evaluation was performed; defaults to the time
            the instance is created
    """

    test_case_id: str
    evaluator_name: str
    passed: bool
    score: float  # 0.0 to 1.0
    reasoning: str
    metrics: Dict[str, Any] = {}
    # Use a factory so the default is computed per instance. A plain
    # ``datetime.now()`` default is evaluated once, when the class body runs,
    # and would stamp every result with the import time.
    timestamp: datetime = Field(default_factory=datetime.now)
117
+
118
+
119
@dataclass
class TestCaseResult:
    """Bundle of a test case, the agent's result for it, and its evaluations.

    Attributes:
        test_case: The test case that was run.
        agent_result: What the agent produced for that test case.
        evaluations: One result per evaluator applied.
        execution_time_ms: Execution time in milliseconds.
    """

    test_case: TestCase
    agent_result: AgentResult
    evaluations: List[EvaluationResult]
    execution_time_ms: float

    def overall_passed(self) -> bool:
        """Return True unless at least one evaluation did not pass.

        With no evaluations at all this returns True.
        """
        for evaluation in self.evaluations:
            if not evaluation.passed:
                return False
        return True

    def overall_score(self) -> float:
        """Return the mean score over all evaluations (0.0 when there are none)."""
        if not self.evaluations:
            return 0.0

        total = 0
        for evaluation in self.evaluations:
            total = total + evaluation.score
        return total / len(self.evaluations)
137
+
138
+
139
@dataclass
class AgentVariant:
    """A named agent configuration to evaluate (different LLM, config, etc).

    Lets several agent setups, especially different LLMs or model versions,
    be compared against each other.

    Attributes:
        name: Human-readable label for this variant.
        agent: The agent instance to evaluate.
        metadata: Additional info (model name, provider, config, etc);
            defaults to a fresh empty dict per instance.
    """

    name: str
    # Typed as Any rather than the agent class to avoid a circular import.
    agent: Any
    metadata: Dict[str, Any] = field(default_factory=dict)
155
+
156
+
157
class Evaluator(ABC):
    """Abstract interface for judging one agent execution.

    An evaluator inspects what the agent did for a test case and decides
    whether expectations were met. Several evaluators can be combined to
    cover different aspects (trajectory, output quality, efficiency, etc).
    Subclasses must implement both ``name`` and ``evaluate``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of this evaluator."""

    @abstractmethod
    async def evaluate(
        self,
        test_case: TestCase,
        agent_result: AgentResult,
    ) -> EvaluationResult:
        """Evaluate a single test case execution.

        Args:
            test_case: The test case that was executed
            agent_result: The result from running the agent

        Returns:
            EvaluationResult with pass/fail, score, and reasoning
        """
@@ -0,0 +1,254 @@
1
+ """
2
+ Dataset loaders for evaluation test cases.
3
+
4
+ This module provides utilities for loading test case datasets from
5
+ YAML and JSON files.
6
+ """
7
+
8
+ import json
9
+ import yaml
10
+ from typing import Any, Dict, List
11
+ from pathlib import Path
12
+
13
+ from .base import TestCase, ExpectedOutcome
14
+ from vanna.core import User
15
+
16
+
17
class EvaluationDataset:
    """Collection of test cases with metadata.

    Datasets can be loaded from and saved to YAML or JSON files. Files are
    read and written as UTF-8.

    Example YAML format:
        dataset:
          name: "SQL Generation Tasks"
          description: "Test cases for SQL generation"
          test_cases:
            - id: "sql_001"
              user_id: "test_user"
              message: "Show me total sales by region"
              expected_outcome:
                tools_called: ["generate_sql", "execute_query"]
                final_answer_contains: ["SELECT", "GROUP BY", "region"]
    """

    def __init__(self, name: str, test_cases: List[TestCase], description: str = ""):
        """Initialize evaluation dataset.

        Args:
            name: Name of the dataset
            test_cases: List of test cases
            description: Optional description
        """
        self.name = name
        self.test_cases = test_cases
        self.description = description

    @classmethod
    def from_yaml(cls, path: str) -> "EvaluationDataset":
        """Load dataset from YAML file.

        The file is parsed with ``yaml.safe_load``. An empty document yields
        an empty, unnamed dataset.

        Args:
            path: Path to YAML file

        Returns:
            EvaluationDataset instance
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        return cls._from_dict(data)

    @classmethod
    def from_json(cls, path: str) -> "EvaluationDataset":
        """Load dataset from JSON file.

        Args:
            path: Path to JSON file

        Returns:
            EvaluationDataset instance
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: Dict[str, Any]) -> "EvaluationDataset":
        """Create dataset from dictionary.

        The dataset fields are read from ``data["dataset"]`` when that key
        exists, otherwise from ``data`` itself. Missing fields fall back to
        ``"Unnamed Dataset"``, an empty description and no test cases.

        Args:
            data: Dictionary with dataset structure. ``None`` (what an empty
                YAML document parses to) is treated as an empty dictionary.

        Returns:
            EvaluationDataset instance
        """
        if data is None:
            data = {}

        dataset_config = data.get("dataset", data)
        if dataset_config is None:
            # "dataset:" present but left empty in the file.
            dataset_config = {}

        name = dataset_config.get("name", "Unnamed Dataset")
        description = dataset_config.get("description", "")

        # "or []" also covers a "test_cases:" key with a null value.
        raw_cases = dataset_config.get("test_cases") or []
        test_cases = [cls._parse_test_case(tc_data) for tc_data in raw_cases]

        return cls(name=name, test_cases=test_cases, description=description)

    @classmethod
    def _parse_test_case(cls, data: Dict[str, Any]) -> TestCase:
        """Parse a single test case from dictionary.

        ``id`` and ``message`` are required keys. The user is built from the
        optional ``user_id``, ``username``, ``email`` and ``user_groups``
        keys, with defaults derived from the user ID.

        Args:
            data: Test case dictionary

        Returns:
            TestCase instance

        Raises:
            KeyError: If ``id`` or ``message`` is missing.
        """
        # Create user
        user_id = data.get("user_id", "test_user")
        user = User(
            id=user_id,
            username=data.get("username", user_id),
            email=data.get("email", f"{user_id}@example.com"),
            group_memberships=data.get("user_groups", []),
        )

        # Parse expected outcome if present; a null value counts as absent.
        expected_outcome = None
        outcome_data = data.get("expected_outcome")
        if outcome_data is not None:
            expected_outcome = ExpectedOutcome(
                tools_called=outcome_data.get("tools_called"),
                tools_not_called=outcome_data.get("tools_not_called"),
                final_answer_contains=outcome_data.get("final_answer_contains"),
                final_answer_not_contains=outcome_data.get("final_answer_not_contains"),
                min_components=outcome_data.get("min_components"),
                max_components=outcome_data.get("max_components"),
                max_execution_time_ms=outcome_data.get("max_execution_time_ms"),
                metadata=outcome_data.get("metadata", {}),
            )

        return TestCase(
            id=data["id"],
            user=user,
            message=data["message"],
            conversation_id=data.get("conversation_id"),
            expected_outcome=expected_outcome,
            metadata=data.get("metadata", {}),
        )

    def save_yaml(self, path: str) -> None:
        """Save dataset to YAML file.

        Args:
            path: Path to save YAML file
        """
        data = self._to_dict()
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(data, f, default_flow_style=False, sort_keys=False)

    def save_json(self, path: str) -> None:
        """Save dataset to JSON file.

        Args:
            path: Path to save JSON file
        """
        data = self._to_dict()
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def _to_dict(self) -> Dict[str, Any]:
        """Convert dataset to dictionary.

        Returns:
            Dictionary representation, nested under a top-level ``"dataset"``
            key.
        """
        return {
            "dataset": {
                "name": self.name,
                "description": self.description,
                "test_cases": [self._test_case_to_dict(tc) for tc in self.test_cases],
            }
        }

    def _test_case_to_dict(self, test_case: TestCase) -> Dict[str, Any]:
        """Convert test case to dictionary.

        Optional parts (conversation ID, expected outcome, metadata) are
        only written when they are set and non-empty.

        Args:
            test_case: TestCase to convert

        Returns:
            Dictionary representation
        """
        data: Dict[str, Any] = {
            "id": test_case.id,
            "user_id": test_case.user.id,
            "username": test_case.user.username,
            "email": test_case.user.email,
            "user_groups": test_case.user.group_memberships,
            "message": test_case.message,
        }

        if test_case.conversation_id:
            data["conversation_id"] = test_case.conversation_id

        if test_case.expected_outcome:
            outcome = test_case.expected_outcome
            outcome_dict: Dict[str, Any] = {}

            if outcome.tools_called:
                outcome_dict["tools_called"] = outcome.tools_called
            if outcome.tools_not_called:
                outcome_dict["tools_not_called"] = outcome.tools_not_called
            if outcome.final_answer_contains:
                outcome_dict["final_answer_contains"] = outcome.final_answer_contains
            if outcome.final_answer_not_contains:
                outcome_dict["final_answer_not_contains"] = (
                    outcome.final_answer_not_contains
                )
            # Numeric limits use "is not None" so that 0 is preserved.
            if outcome.min_components is not None:
                outcome_dict["min_components"] = outcome.min_components
            if outcome.max_components is not None:
                outcome_dict["max_components"] = outcome.max_components
            if outcome.max_execution_time_ms is not None:
                outcome_dict["max_execution_time_ms"] = outcome.max_execution_time_ms
            if outcome.metadata:
                outcome_dict["metadata"] = outcome.metadata

            if outcome_dict:
                data["expected_outcome"] = outcome_dict

        if test_case.metadata:
            data["metadata"] = test_case.metadata

        return data

    def filter_by_metadata(self, **kwargs: Any) -> "EvaluationDataset":
        """Filter test cases by metadata fields.

        A test case is kept when every given key maps to an equal value in
        its metadata.

        Args:
            **kwargs: Metadata fields to match

        Returns:
            New EvaluationDataset with filtered test cases
        """
        filtered = [
            tc
            for tc in self.test_cases
            if all(tc.metadata.get(k) == v for k, v in kwargs.items())
        ]

        return EvaluationDataset(
            name=f"{self.name} (filtered)",
            test_cases=filtered,
            description=f"Filtered from: {self.description}",
        )

    def __len__(self) -> int:
        """Get number of test cases."""
        return len(self.test_cases)

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"EvaluationDataset(name='{self.name}', test_cases={len(self.test_cases)})"
        )