vanna 0.7.9__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vanna/__init__.py +167 -395
- vanna/agents/__init__.py +7 -0
- vanna/capabilities/__init__.py +17 -0
- vanna/capabilities/agent_memory/__init__.py +21 -0
- vanna/capabilities/agent_memory/base.py +103 -0
- vanna/capabilities/agent_memory/models.py +53 -0
- vanna/capabilities/file_system/__init__.py +14 -0
- vanna/capabilities/file_system/base.py +71 -0
- vanna/capabilities/file_system/models.py +25 -0
- vanna/capabilities/sql_runner/__init__.py +13 -0
- vanna/capabilities/sql_runner/base.py +37 -0
- vanna/capabilities/sql_runner/models.py +13 -0
- vanna/components/__init__.py +92 -0
- vanna/components/base.py +11 -0
- vanna/components/rich/__init__.py +83 -0
- vanna/components/rich/containers/__init__.py +7 -0
- vanna/components/rich/containers/card.py +20 -0
- vanna/components/rich/data/__init__.py +9 -0
- vanna/components/rich/data/chart.py +17 -0
- vanna/components/rich/data/dataframe.py +93 -0
- vanna/components/rich/feedback/__init__.py +21 -0
- vanna/components/rich/feedback/badge.py +16 -0
- vanna/components/rich/feedback/icon_text.py +14 -0
- vanna/components/rich/feedback/log_viewer.py +41 -0
- vanna/components/rich/feedback/notification.py +19 -0
- vanna/components/rich/feedback/progress.py +37 -0
- vanna/components/rich/feedback/status_card.py +28 -0
- vanna/components/rich/feedback/status_indicator.py +14 -0
- vanna/components/rich/interactive/__init__.py +21 -0
- vanna/components/rich/interactive/button.py +95 -0
- vanna/components/rich/interactive/task_list.py +58 -0
- vanna/components/rich/interactive/ui_state.py +93 -0
- vanna/components/rich/specialized/__init__.py +7 -0
- vanna/components/rich/specialized/artifact.py +20 -0
- vanna/components/rich/text.py +16 -0
- vanna/components/simple/__init__.py +15 -0
- vanna/components/simple/image.py +15 -0
- vanna/components/simple/link.py +15 -0
- vanna/components/simple/text.py +11 -0
- vanna/core/__init__.py +193 -0
- vanna/core/_compat.py +19 -0
- vanna/core/agent/__init__.py +10 -0
- vanna/core/agent/agent.py +1407 -0
- vanna/core/agent/config.py +123 -0
- vanna/core/audit/__init__.py +28 -0
- vanna/core/audit/base.py +299 -0
- vanna/core/audit/models.py +131 -0
- vanna/core/component_manager.py +329 -0
- vanna/core/components.py +53 -0
- vanna/core/enhancer/__init__.py +11 -0
- vanna/core/enhancer/base.py +94 -0
- vanna/core/enhancer/default.py +118 -0
- vanna/core/enricher/__init__.py +10 -0
- vanna/core/enricher/base.py +59 -0
- vanna/core/errors.py +47 -0
- vanna/core/evaluation/__init__.py +81 -0
- vanna/core/evaluation/base.py +186 -0
- vanna/core/evaluation/dataset.py +254 -0
- vanna/core/evaluation/evaluators.py +376 -0
- vanna/core/evaluation/report.py +289 -0
- vanna/core/evaluation/runner.py +313 -0
- vanna/core/filter/__init__.py +10 -0
- vanna/core/filter/base.py +67 -0
- vanna/core/lifecycle/__init__.py +10 -0
- vanna/core/lifecycle/base.py +83 -0
- vanna/core/llm/__init__.py +16 -0
- vanna/core/llm/base.py +40 -0
- vanna/core/llm/models.py +61 -0
- vanna/core/middleware/__init__.py +10 -0
- vanna/core/middleware/base.py +69 -0
- vanna/core/observability/__init__.py +11 -0
- vanna/core/observability/base.py +88 -0
- vanna/core/observability/models.py +47 -0
- vanna/core/recovery/__init__.py +11 -0
- vanna/core/recovery/base.py +84 -0
- vanna/core/recovery/models.py +32 -0
- vanna/core/registry.py +278 -0
- vanna/core/rich_component.py +156 -0
- vanna/core/simple_component.py +27 -0
- vanna/core/storage/__init__.py +14 -0
- vanna/core/storage/base.py +46 -0
- vanna/core/storage/models.py +46 -0
- vanna/core/system_prompt/__init__.py +13 -0
- vanna/core/system_prompt/base.py +36 -0
- vanna/core/system_prompt/default.py +157 -0
- vanna/core/tool/__init__.py +18 -0
- vanna/core/tool/base.py +70 -0
- vanna/core/tool/models.py +84 -0
- vanna/core/user/__init__.py +17 -0
- vanna/core/user/base.py +29 -0
- vanna/core/user/models.py +25 -0
- vanna/core/user/request_context.py +70 -0
- vanna/core/user/resolver.py +42 -0
- vanna/core/validation.py +164 -0
- vanna/core/workflow/__init__.py +12 -0
- vanna/core/workflow/base.py +254 -0
- vanna/core/workflow/default.py +789 -0
- vanna/examples/__init__.py +1 -0
- vanna/examples/__main__.py +44 -0
- vanna/examples/anthropic_quickstart.py +80 -0
- vanna/examples/artifact_example.py +293 -0
- vanna/examples/claude_sqlite_example.py +236 -0
- vanna/examples/coding_agent_example.py +300 -0
- vanna/examples/custom_system_prompt_example.py +174 -0
- vanna/examples/default_workflow_handler_example.py +208 -0
- vanna/examples/email_auth_example.py +340 -0
- vanna/examples/evaluation_example.py +269 -0
- vanna/examples/extensibility_example.py +262 -0
- vanna/examples/minimal_example.py +67 -0
- vanna/examples/mock_auth_example.py +227 -0
- vanna/examples/mock_custom_tool.py +311 -0
- vanna/examples/mock_quickstart.py +79 -0
- vanna/examples/mock_quota_example.py +145 -0
- vanna/examples/mock_rich_components_demo.py +396 -0
- vanna/examples/mock_sqlite_example.py +223 -0
- vanna/examples/openai_quickstart.py +83 -0
- vanna/examples/primitive_components_demo.py +305 -0
- vanna/examples/quota_lifecycle_example.py +139 -0
- vanna/examples/visualization_example.py +251 -0
- vanna/integrations/__init__.py +17 -0
- vanna/integrations/anthropic/__init__.py +9 -0
- vanna/integrations/anthropic/llm.py +270 -0
- vanna/integrations/azureopenai/__init__.py +9 -0
- vanna/integrations/azureopenai/llm.py +329 -0
- vanna/integrations/azuresearch/__init__.py +7 -0
- vanna/integrations/azuresearch/agent_memory.py +413 -0
- vanna/integrations/bigquery/__init__.py +5 -0
- vanna/integrations/bigquery/sql_runner.py +81 -0
- vanna/integrations/chromadb/__init__.py +104 -0
- vanna/integrations/chromadb/agent_memory.py +416 -0
- vanna/integrations/clickhouse/__init__.py +5 -0
- vanna/integrations/clickhouse/sql_runner.py +82 -0
- vanna/integrations/duckdb/__init__.py +5 -0
- vanna/integrations/duckdb/sql_runner.py +65 -0
- vanna/integrations/faiss/__init__.py +7 -0
- vanna/integrations/faiss/agent_memory.py +431 -0
- vanna/integrations/google/__init__.py +9 -0
- vanna/integrations/google/gemini.py +370 -0
- vanna/integrations/hive/__init__.py +5 -0
- vanna/integrations/hive/sql_runner.py +87 -0
- vanna/integrations/local/__init__.py +17 -0
- vanna/integrations/local/agent_memory/__init__.py +7 -0
- vanna/integrations/local/agent_memory/in_memory.py +285 -0
- vanna/integrations/local/audit.py +59 -0
- vanna/integrations/local/file_system.py +242 -0
- vanna/integrations/local/file_system_conversation_store.py +255 -0
- vanna/integrations/local/storage.py +62 -0
- vanna/integrations/marqo/__init__.py +7 -0
- vanna/integrations/marqo/agent_memory.py +354 -0
- vanna/integrations/milvus/__init__.py +7 -0
- vanna/integrations/milvus/agent_memory.py +458 -0
- vanna/integrations/mock/__init__.py +9 -0
- vanna/integrations/mock/llm.py +65 -0
- vanna/integrations/mssql/__init__.py +5 -0
- vanna/integrations/mssql/sql_runner.py +66 -0
- vanna/integrations/mysql/__init__.py +5 -0
- vanna/integrations/mysql/sql_runner.py +92 -0
- vanna/integrations/ollama/__init__.py +7 -0
- vanna/integrations/ollama/llm.py +252 -0
- vanna/integrations/openai/__init__.py +10 -0
- vanna/integrations/openai/llm.py +267 -0
- vanna/integrations/openai/responses.py +163 -0
- vanna/integrations/opensearch/__init__.py +7 -0
- vanna/integrations/opensearch/agent_memory.py +411 -0
- vanna/integrations/oracle/__init__.py +5 -0
- vanna/integrations/oracle/sql_runner.py +75 -0
- vanna/integrations/pinecone/__init__.py +7 -0
- vanna/integrations/pinecone/agent_memory.py +329 -0
- vanna/integrations/plotly/__init__.py +5 -0
- vanna/integrations/plotly/chart_generator.py +313 -0
- vanna/integrations/postgres/__init__.py +9 -0
- vanna/integrations/postgres/sql_runner.py +112 -0
- vanna/integrations/premium/agent_memory/__init__.py +7 -0
- vanna/integrations/premium/agent_memory/premium.py +186 -0
- vanna/integrations/presto/__init__.py +5 -0
- vanna/integrations/presto/sql_runner.py +107 -0
- vanna/integrations/qdrant/__init__.py +7 -0
- vanna/integrations/qdrant/agent_memory.py +461 -0
- vanna/integrations/snowflake/__init__.py +5 -0
- vanna/integrations/snowflake/sql_runner.py +147 -0
- vanna/integrations/sqlite/__init__.py +9 -0
- vanna/integrations/sqlite/sql_runner.py +65 -0
- vanna/integrations/weaviate/__init__.py +7 -0
- vanna/integrations/weaviate/agent_memory.py +428 -0
- vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_embeddings.py +11 -11
- vanna/legacy/__init__.py +403 -0
- vanna/legacy/adapter.py +463 -0
- vanna/{advanced → legacy/advanced}/__init__.py +3 -1
- vanna/{anthropic → legacy/anthropic}/anthropic_chat.py +9 -7
- vanna/{azuresearch → legacy/azuresearch}/azuresearch_vector.py +79 -41
- vanna/{base → legacy/base}/base.py +224 -217
- vanna/legacy/bedrock/__init__.py +1 -0
- vanna/{bedrock → legacy/bedrock}/bedrock_converse.py +13 -12
- vanna/{chromadb → legacy/chromadb}/chromadb_vector.py +3 -1
- vanna/legacy/cohere/__init__.py +2 -0
- vanna/{cohere → legacy/cohere}/cohere_chat.py +19 -14
- vanna/{cohere → legacy/cohere}/cohere_embeddings.py +25 -19
- vanna/{deepseek → legacy/deepseek}/deepseek_chat.py +5 -6
- vanna/legacy/faiss/__init__.py +1 -0
- vanna/{faiss → legacy/faiss}/faiss.py +113 -59
- vanna/{flask → legacy/flask}/__init__.py +84 -43
- vanna/{flask → legacy/flask}/assets.py +5 -5
- vanna/{flask → legacy/flask}/auth.py +5 -4
- vanna/{google → legacy/google}/bigquery_vector.py +75 -42
- vanna/{google → legacy/google}/gemini_chat.py +7 -3
- vanna/{hf → legacy/hf}/hf.py +0 -1
- vanna/{milvus → legacy/milvus}/milvus_vector.py +58 -35
- vanna/{mock → legacy/mock}/llm.py +0 -1
- vanna/legacy/mock/vectordb.py +67 -0
- vanna/legacy/ollama/ollama.py +110 -0
- vanna/{openai → legacy/openai}/openai_chat.py +2 -6
- vanna/legacy/opensearch/opensearch_vector.py +369 -0
- vanna/legacy/opensearch/opensearch_vector_semantic.py +200 -0
- vanna/legacy/oracle/oracle_vector.py +584 -0
- vanna/{pgvector → legacy/pgvector}/pgvector.py +42 -13
- vanna/{qdrant → legacy/qdrant}/qdrant.py +2 -6
- vanna/legacy/qianfan/Qianfan_Chat.py +170 -0
- vanna/legacy/qianfan/Qianfan_embeddings.py +36 -0
- vanna/legacy/qianwen/QianwenAI_chat.py +132 -0
- vanna/{remote.py → legacy/remote.py} +28 -26
- vanna/{utils.py → legacy/utils.py} +6 -11
- vanna/{vannadb → legacy/vannadb}/vannadb_vector.py +115 -46
- vanna/{vllm → legacy/vllm}/vllm.py +5 -6
- vanna/{weaviate → legacy/weaviate}/weaviate_vector.py +59 -40
- vanna/{xinference → legacy/xinference}/xinference.py +6 -6
- vanna/py.typed +0 -0
- vanna/servers/__init__.py +16 -0
- vanna/servers/__main__.py +8 -0
- vanna/servers/base/__init__.py +18 -0
- vanna/servers/base/chat_handler.py +65 -0
- vanna/servers/base/models.py +111 -0
- vanna/servers/base/rich_chat_handler.py +141 -0
- vanna/servers/base/templates.py +331 -0
- vanna/servers/cli/__init__.py +7 -0
- vanna/servers/cli/server_runner.py +204 -0
- vanna/servers/fastapi/__init__.py +7 -0
- vanna/servers/fastapi/app.py +163 -0
- vanna/servers/fastapi/routes.py +183 -0
- vanna/servers/flask/__init__.py +7 -0
- vanna/servers/flask/app.py +132 -0
- vanna/servers/flask/routes.py +137 -0
- vanna/tools/__init__.py +41 -0
- vanna/tools/agent_memory.py +322 -0
- vanna/tools/file_system.py +879 -0
- vanna/tools/python.py +222 -0
- vanna/tools/run_sql.py +165 -0
- vanna/tools/visualize_data.py +195 -0
- vanna/utils/__init__.py +0 -0
- vanna/web_components/__init__.py +44 -0
- vanna-2.0.0.dist-info/METADATA +485 -0
- vanna-2.0.0.dist-info/RECORD +289 -0
- vanna-2.0.0.dist-info/entry_points.txt +3 -0
- vanna/bedrock/__init__.py +0 -1
- vanna/cohere/__init__.py +0 -2
- vanna/faiss/__init__.py +0 -1
- vanna/mock/vectordb.py +0 -55
- vanna/ollama/ollama.py +0 -103
- vanna/opensearch/opensearch_vector.py +0 -392
- vanna/opensearch/opensearch_vector_semantic.py +0 -175
- vanna/oracle/oracle_vector.py +0 -585
- vanna/qianfan/Qianfan_Chat.py +0 -165
- vanna/qianfan/Qianfan_embeddings.py +0 -36
- vanna/qianwen/QianwenAI_chat.py +0 -133
- vanna-0.7.9.dist-info/METADATA +0 -408
- vanna-0.7.9.dist-info/RECORD +0 -79
- /vanna/{ZhipuAI → legacy/ZhipuAI}/ZhipuAI_Chat.py +0 -0
- /vanna/{ZhipuAI → legacy/ZhipuAI}/__init__.py +0 -0
- /vanna/{anthropic → legacy/anthropic}/__init__.py +0 -0
- /vanna/{azuresearch → legacy/azuresearch}/__init__.py +0 -0
- /vanna/{base → legacy/base}/__init__.py +0 -0
- /vanna/{chromadb → legacy/chromadb}/__init__.py +0 -0
- /vanna/{deepseek → legacy/deepseek}/__init__.py +0 -0
- /vanna/{exceptions → legacy/exceptions}/__init__.py +0 -0
- /vanna/{google → legacy/google}/__init__.py +0 -0
- /vanna/{hf → legacy/hf}/__init__.py +0 -0
- /vanna/{local.py → legacy/local.py} +0 -0
- /vanna/{marqo → legacy/marqo}/__init__.py +0 -0
- /vanna/{marqo → legacy/marqo}/marqo.py +0 -0
- /vanna/{milvus → legacy/milvus}/__init__.py +0 -0
- /vanna/{mistral → legacy/mistral}/__init__.py +0 -0
- /vanna/{mistral → legacy/mistral}/mistral.py +0 -0
- /vanna/{mock → legacy/mock}/__init__.py +0 -0
- /vanna/{mock → legacy/mock}/embedding.py +0 -0
- /vanna/{ollama → legacy/ollama}/__init__.py +0 -0
- /vanna/{openai → legacy/openai}/__init__.py +0 -0
- /vanna/{openai → legacy/openai}/openai_embeddings.py +0 -0
- /vanna/{opensearch → legacy/opensearch}/__init__.py +0 -0
- /vanna/{oracle → legacy/oracle}/__init__.py +0 -0
- /vanna/{pgvector → legacy/pgvector}/__init__.py +0 -0
- /vanna/{pinecone → legacy/pinecone}/__init__.py +0 -0
- /vanna/{pinecone → legacy/pinecone}/pinecone_vector.py +0 -0
- /vanna/{qdrant → legacy/qdrant}/__init__.py +0 -0
- /vanna/{qianfan → legacy/qianfan}/__init__.py +0 -0
- /vanna/{qianwen → legacy/qianwen}/QianwenAI_embeddings.py +0 -0
- /vanna/{qianwen → legacy/qianwen}/__init__.py +0 -0
- /vanna/{types → legacy/types}/__init__.py +0 -0
- /vanna/{vannadb → legacy/vannadb}/__init__.py +0 -0
- /vanna/{vllm → legacy/vllm}/__init__.py +0 -0
- /vanna/{weaviate → legacy/weaviate}/__init__.py +0 -0
- /vanna/{xinference → legacy/xinference}/__init__.py +0 -0
- {vanna-0.7.9.dist-info → vanna-2.0.0.dist-info}/WHEEL +0 -0
- {vanna-0.7.9.dist-info → vanna-2.0.0.dist-info}/licenses/LICENSE +0 -0
vanna/core/errors.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Exception classes for the Vanna Agents framework.
|
|
3
|
+
|
|
4
|
+
This module defines all custom exceptions used throughout the framework.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AgentError(Exception):
|
|
9
|
+
"""Base exception for agent framework."""
|
|
10
|
+
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ToolExecutionError(AgentError):
|
|
15
|
+
"""Error during tool execution."""
|
|
16
|
+
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ToolNotFoundError(AgentError):
|
|
21
|
+
"""Tool not found in registry."""
|
|
22
|
+
|
|
23
|
+
pass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PermissionError(AgentError):
|
|
27
|
+
"""User lacks required permissions."""
|
|
28
|
+
|
|
29
|
+
pass
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ConversationNotFoundError(AgentError):
|
|
33
|
+
"""Conversation not found."""
|
|
34
|
+
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LlmServiceError(AgentError):
|
|
39
|
+
"""Error communicating with LLM service."""
|
|
40
|
+
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ValidationError(AgentError):
|
|
45
|
+
"""Data validation error."""
|
|
46
|
+
|
|
47
|
+
pass
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation framework for Vanna Agents.
|
|
3
|
+
|
|
4
|
+
This module provides a complete evaluation system for testing and comparing
|
|
5
|
+
agent variants, with special focus on LLM comparison use cases.
|
|
6
|
+
|
|
7
|
+
Key Features:
|
|
8
|
+
- Parallel execution for efficient I/O-bound operations
|
|
9
|
+
- Multiple built-in evaluators (trajectory, output, LLM-as-judge, efficiency)
|
|
10
|
+
- Rich reporting (HTML, CSV, console)
|
|
11
|
+
- Dataset loaders (YAML, JSON)
|
|
12
|
+
- Agent variant comparison
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
>>> from vanna.evaluation import (
|
|
16
|
+
... EvaluationRunner,
|
|
17
|
+
... EvaluationDataset,
|
|
18
|
+
... AgentVariant,
|
|
19
|
+
... TrajectoryEvaluator,
|
|
20
|
+
... OutputEvaluator,
|
|
21
|
+
... )
|
|
22
|
+
>>>
|
|
23
|
+
>>> # Load test dataset
|
|
24
|
+
>>> dataset = EvaluationDataset.from_yaml("tests/sql_tasks.yaml")
|
|
25
|
+
>>>
|
|
26
|
+
>>> # Create agent variants
|
|
27
|
+
>>> variants = [
|
|
28
|
+
... AgentVariant("claude", claude_agent),
|
|
29
|
+
... AgentVariant("gpt", gpt_agent),
|
|
30
|
+
... ]
|
|
31
|
+
>>>
|
|
32
|
+
>>> # Run comparison
|
|
33
|
+
>>> runner = EvaluationRunner(
|
|
34
|
+
... evaluators=[TrajectoryEvaluator(), OutputEvaluator()],
|
|
35
|
+
... max_concurrency=20
|
|
36
|
+
... )
|
|
37
|
+
>>> comparison = await runner.compare_agents(variants, dataset.test_cases)
|
|
38
|
+
>>> comparison.print_summary()
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from .base import (
|
|
42
|
+
Evaluator,
|
|
43
|
+
TestCase,
|
|
44
|
+
ExpectedOutcome,
|
|
45
|
+
AgentResult,
|
|
46
|
+
EvaluationResult,
|
|
47
|
+
TestCaseResult,
|
|
48
|
+
AgentVariant,
|
|
49
|
+
)
|
|
50
|
+
from .runner import EvaluationRunner
|
|
51
|
+
from .evaluators import (
|
|
52
|
+
TrajectoryEvaluator,
|
|
53
|
+
OutputEvaluator,
|
|
54
|
+
LLMAsJudgeEvaluator,
|
|
55
|
+
EfficiencyEvaluator,
|
|
56
|
+
)
|
|
57
|
+
from .report import EvaluationReport, ComparisonReport
|
|
58
|
+
from .dataset import EvaluationDataset
|
|
59
|
+
|
|
60
|
+
__all__ = [
|
|
61
|
+
# Base classes
|
|
62
|
+
"Evaluator",
|
|
63
|
+
"TestCase",
|
|
64
|
+
"ExpectedOutcome",
|
|
65
|
+
"AgentResult",
|
|
66
|
+
"EvaluationResult",
|
|
67
|
+
"TestCaseResult",
|
|
68
|
+
"AgentVariant",
|
|
69
|
+
# Runner
|
|
70
|
+
"EvaluationRunner",
|
|
71
|
+
# Built-in evaluators
|
|
72
|
+
"TrajectoryEvaluator",
|
|
73
|
+
"OutputEvaluator",
|
|
74
|
+
"LLMAsJudgeEvaluator",
|
|
75
|
+
"EfficiencyEvaluator",
|
|
76
|
+
# Reporting
|
|
77
|
+
"EvaluationReport",
|
|
78
|
+
"ComparisonReport",
|
|
79
|
+
# Datasets
|
|
80
|
+
"EvaluationDataset",
|
|
81
|
+
]
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core evaluation abstractions for the Vanna Agents framework.
|
|
3
|
+
|
|
4
|
+
This module provides the base classes and models for evaluating agent behavior,
|
|
5
|
+
including test cases, expected outcomes, and evaluation results.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional

from pydantic import BaseModel, Field

from vanna.core import User, UiComponent
|
|
16
|
+
|
|
17
|
+
class ExpectedOutcome(BaseModel):
    """Defines what we expect from the agent for a test case.

    Provides multiple ways to specify expectations:
    - tools_called: List of tool names that should be called
    - tools_not_called: List of tool names that should NOT be called
    - final_answer_contains: Keywords/phrases that should appear in output
    - final_answer_not_contains: Keywords/phrases that should NOT appear
    - min_components / max_components: Bounds on the number of UI components
    - max_execution_time_ms: Maximum allowed execution time
    - metadata: Free-form extra expectation data
    """

    # Every field is optional so a test case may declare only the
    # expectations it cares about; None means "not checked".
    tools_called: Optional[List[str]] = None
    tools_not_called: Optional[List[str]] = None
    final_answer_contains: Optional[List[str]] = None
    final_answer_not_contains: Optional[List[str]] = None
    min_components: Optional[int] = None
    max_components: Optional[int] = None
    max_execution_time_ms: Optional[float] = None
    # NOTE(review): mutable class-level default — presumably safe because
    # pydantic copies field defaults per instance; confirm pydantic version.
    metadata: Dict[str, Any] = {}
|
|
39
|
+
|
|
40
|
+
class TestCase(BaseModel):
    """A single evaluation test case.

    Attributes:
        id: Unique identifier for the test case
        user: User context for the test
        message: The message to send to the agent
        conversation_id: Optional conversation ID for multi-turn tests
        expected_outcome: What we expect the agent to do/produce
        metadata: Additional metadata for categorization/filtering
    """

    id: str
    user: User
    message: str
    conversation_id: Optional[str] = None
    expected_outcome: Optional[ExpectedOutcome] = None
    # NOTE(review): mutable class-level default — presumably safe because
    # pydantic copies field defaults per instance; confirm pydantic version.
    metadata: Dict[str, Any] = {}
|
|
60
|
+
@dataclass
class AgentResult:
    """The result of running an agent on a test case.

    Captures everything that happened during agent execution
    for later evaluation.

    Attributes:
        test_case_id: ID of the test case this result belongs to.
        components: UI components the agent emitted.
        tool_calls: One dict per tool invocation; expected to carry a
            "tool_name" key (see get_tool_names_called).
        llm_requests: Raw request/response records sent to the LLM.
        execution_time_ms: Wall-clock execution time in milliseconds.
        total_tokens: Total tokens consumed across all LLM requests.
        error: Error message if execution failed, else None.
        metadata: Free-form extra data about the run.
    """

    test_case_id: str
    # String forward reference: dataclasses never evaluate this annotation,
    # so the resolved UiComponent symbol is only needed for type checking.
    components: List["UiComponent"]
    tool_calls: List[Dict[str, Any]] = field(default_factory=list)
    llm_requests: List[Dict[str, Any]] = field(default_factory=list)
    execution_time_ms: float = 0.0
    total_tokens: int = 0
    error: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_final_answer(self) -> str:
        """Extract the final answer by concatenating text components.

        Components that are not rich text components (missing
        ``rich_component``, missing ``type``, or a non-"text" type) are
        skipped rather than raising, so a partially-populated result can
        still be evaluated.

        Returns:
            All text-component contents joined with newlines; empty string
            when no text content is present.
        """
        texts = []
        for component in self.components:
            rich_comp = getattr(component, "rich_component", None)
            if rich_comp is None:
                continue
            comp_type = getattr(rich_comp, "type", None)
            if getattr(comp_type, "value", None) != "text":
                continue
            # Content may live in the component's data dict or as a direct
            # attribute, depending on how the component was constructed.
            data = getattr(rich_comp, "data", None)
            content = (data.get("content") if isinstance(data, dict) else None) or getattr(
                rich_comp, "content", ""
            )
            if content:
                texts.append(content)
        return "\n".join(texts)

    def get_tool_names_called(self) -> List[str]:
        """Get list of tool names that were called ("" when a call lacks one)."""
        return [call.get("tool_name", "") for call in self.tool_calls]
+
|
|
97
|
+
class EvaluationResult(BaseModel):
    """Result of evaluating a single test case.

    Attributes:
        test_case_id: ID of the test case evaluated
        evaluator_name: Name of the evaluator that produced this result
        passed: Whether the test case passed
        score: Score from 0.0 to 1.0
        reasoning: Explanation of the evaluation
        metrics: Additional metrics captured during evaluation
        timestamp: When the evaluation was performed
    """

    test_case_id: str
    evaluator_name: str
    passed: bool
    score: float  # 0.0 to 1.0
    reasoning: str
    metrics: Dict[str, Any] = {}
    # default_factory so each result records its own creation time; a plain
    # `= datetime.now()` default is evaluated once at class-definition time
    # and would be shared by every instance in the process.
    timestamp: datetime = Field(default_factory=datetime.now)
|
|
119
|
+
@dataclass
class TestCaseResult:
    """Complete result for a single test case including all evaluations."""

    test_case: TestCase
    agent_result: AgentResult
    evaluations: List[EvaluationResult]
    execution_time_ms: float

    def overall_passed(self) -> bool:
        """Return True only when every evaluation passed (True when empty)."""
        for evaluation in self.evaluations:
            if not evaluation.passed:
                return False
        return True

    def overall_score(self) -> float:
        """Return the mean score across all evaluations (0.0 when empty)."""
        scores = [evaluation.score for evaluation in self.evaluations]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)
|
|
139
|
+
@dataclass
class AgentVariant:
    """A variant of an agent to evaluate (different LLM, config, etc).

    Used for comparing different agent configurations, especially
    different LLMs or model versions.

    Attributes:
        name: Human-readable name for this variant
        agent: The agent instance to evaluate
        metadata: Additional info (model name, provider, config, etc)
    """

    name: str
    # Typed as Any to avoid a circular import with the Agent class.
    agent: Any
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
156
|
+
|
|
157
|
+
class Evaluator(ABC):
    """Base class for evaluating agent behavior.

    Evaluators examine the agent's execution and determine if it
    met expectations. Multiple evaluators can be composed to check
    different aspects (trajectory, output quality, efficiency, etc).
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of this evaluator.

        Presumably recorded as ``evaluator_name`` on each EvaluationResult —
        TODO confirm against concrete evaluator implementations.
        """
        pass

    @abstractmethod
    async def evaluate(
        self,
        test_case: TestCase,
        agent_result: AgentResult,
    ) -> EvaluationResult:
        """Evaluate a single test case execution.

        Args:
            test_case: The test case that was executed
            agent_result: The result from running the agent

        Returns:
            EvaluationResult with pass/fail, score, and reasoning
        """
        pass
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dataset loaders for evaluation test cases.
|
|
3
|
+
|
|
4
|
+
This module provides utilities for loading test case datasets from
|
|
5
|
+
YAML and JSON files.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import yaml
|
|
10
|
+
from typing import Any, Dict, List
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .base import TestCase, ExpectedOutcome
|
|
14
|
+
from vanna.core import User
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EvaluationDataset:
    """Collection of test cases with metadata.

    Example YAML format:
        dataset:
          name: "SQL Generation Tasks"
          description: "Test cases for SQL generation"
          test_cases:
            - id: "sql_001"
              user_id: "test_user"
              message: "Show me total sales by region"
              expected_outcome:
                tools_called: ["generate_sql", "execute_query"]
                final_answer_contains: ["SELECT", "GROUP BY", "region"]
    """

    def __init__(self, name: str, test_cases: List[TestCase], description: str = ""):
        """Initialize evaluation dataset.

        Args:
            name: Name of the dataset
            test_cases: List of test cases
            description: Optional description
        """
        self.name = name
        self.test_cases = test_cases
        self.description = description

    @classmethod
    def from_yaml(cls, path: str) -> "EvaluationDataset":
        """Load dataset from YAML file.

        Args:
            path: Path to YAML file

        Returns:
            EvaluationDataset instance
        """
        # safe_load only: dataset files may come from untrusted sources and
        # must not be able to construct arbitrary Python objects.
        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return cls._from_dict(data)

    @classmethod
    def from_json(cls, path: str) -> "EvaluationDataset":
        """Load dataset from JSON file.

        Args:
            path: Path to JSON file

        Returns:
            EvaluationDataset instance
        """
        with open(path, "r") as f:
            data = json.load(f)

        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: Dict[str, Any]) -> "EvaluationDataset":
        """Create dataset from dictionary.

        Args:
            data: Dictionary with dataset structure

        Returns:
            EvaluationDataset instance
        """
        # Accept either a top-level {"dataset": {...}} wrapper (the documented
        # YAML shape) or a bare dict with the same keys.
        dataset_config = data.get("dataset", data)
        name = dataset_config.get("name", "Unnamed Dataset")
        description = dataset_config.get("description", "")

        test_cases = []
        for tc_data in dataset_config.get("test_cases", []):
            test_case = cls._parse_test_case(tc_data)
            test_cases.append(test_case)

        return cls(name=name, test_cases=test_cases, description=description)

    @classmethod
    def _parse_test_case(cls, data: Dict[str, Any]) -> TestCase:
        """Parse a single test case from dictionary.

        Args:
            data: Test case dictionary. "id" and "message" are required
                (a missing key raises KeyError); every other key is optional.

        Returns:
            TestCase instance
        """
        # Create user; username/email are synthesized from user_id when
        # absent so a minimal test case only needs to specify user_id.
        user_id = data.get("user_id", "test_user")
        user = User(
            id=user_id,
            username=data.get("username", user_id),
            email=data.get("email", f"{user_id}@example.com"),
            group_memberships=data.get("user_groups", []),
        )

        # Parse expected outcome if present
        expected_outcome = None
        if "expected_outcome" in data:
            outcome_data = data["expected_outcome"]
            expected_outcome = ExpectedOutcome(
                tools_called=outcome_data.get("tools_called"),
                tools_not_called=outcome_data.get("tools_not_called"),
                final_answer_contains=outcome_data.get("final_answer_contains"),
                final_answer_not_contains=outcome_data.get("final_answer_not_contains"),
                min_components=outcome_data.get("min_components"),
                max_components=outcome_data.get("max_components"),
                max_execution_time_ms=outcome_data.get("max_execution_time_ms"),
                metadata=outcome_data.get("metadata", {}),
            )

        return TestCase(
            id=data["id"],
            user=user,
            message=data["message"],
            conversation_id=data.get("conversation_id"),
            expected_outcome=expected_outcome,
            metadata=data.get("metadata", {}),
        )

    def save_yaml(self, path: str) -> None:
        """Save dataset to YAML file.

        Args:
            path: Path to save YAML file
        """
        data = self._to_dict()
        # sort_keys=False keeps the insertion order produced by
        # _test_case_to_dict, so saved files stay readable and diffable.
        with open(path, "w") as f:
            yaml.dump(data, f, default_flow_style=False, sort_keys=False)

    def save_json(self, path: str) -> None:
        """Save dataset to JSON file.

        Args:
            path: Path to save JSON file
        """
        data = self._to_dict()
        with open(path, "w") as f:
            json.dump(data, f, indent=2)

    def _to_dict(self) -> Dict[str, Any]:
        """Convert dataset to dictionary.

        Returns:
            Dictionary representation (mirrors the documented YAML layout)
        """
        return {
            "dataset": {
                "name": self.name,
                "description": self.description,
                "test_cases": [self._test_case_to_dict(tc) for tc in self.test_cases],
            }
        }

    def _test_case_to_dict(self, test_case: TestCase) -> Dict[str, Any]:
        """Convert test case to dictionary.

        Only fields that are set (or truthy, for collections) are emitted,
        so round-tripped files stay minimal.

        Args:
            test_case: TestCase to convert

        Returns:
            Dictionary representation
        """
        data: Dict[str, Any] = {
            "id": test_case.id,
            "user_id": test_case.user.id,
            "username": test_case.user.username,
            "email": test_case.user.email,
            "user_groups": test_case.user.group_memberships,
            "message": test_case.message,
        }

        if test_case.conversation_id:
            data["conversation_id"] = test_case.conversation_id

        if test_case.expected_outcome:
            outcome = test_case.expected_outcome
            outcome_dict: Dict[str, Any] = {}

            if outcome.tools_called:
                outcome_dict["tools_called"] = outcome.tools_called
            if outcome.tools_not_called:
                outcome_dict["tools_not_called"] = outcome.tools_not_called
            if outcome.final_answer_contains:
                outcome_dict["final_answer_contains"] = outcome.final_answer_contains
            if outcome.final_answer_not_contains:
                outcome_dict["final_answer_not_contains"] = (
                    outcome.final_answer_not_contains
                )
            # Numeric bounds use `is not None` so an explicit 0 is preserved.
            if outcome.min_components is not None:
                outcome_dict["min_components"] = outcome.min_components
            if outcome.max_components is not None:
                outcome_dict["max_components"] = outcome.max_components
            if outcome.max_execution_time_ms is not None:
                outcome_dict["max_execution_time_ms"] = outcome.max_execution_time_ms
            if outcome.metadata:
                outcome_dict["metadata"] = outcome.metadata

            if outcome_dict:
                data["expected_outcome"] = outcome_dict

        if test_case.metadata:
            data["metadata"] = test_case.metadata

        return data

    def filter_by_metadata(self, **kwargs: Any) -> "EvaluationDataset":
        """Filter test cases by metadata fields.

        Args:
            **kwargs: Metadata fields that must all match exactly

        Returns:
            New EvaluationDataset with filtered test cases
        """
        filtered = [
            tc
            for tc in self.test_cases
            if all(tc.metadata.get(k) == v for k, v in kwargs.items())
        ]

        return EvaluationDataset(
            name=f"{self.name} (filtered)",
            test_cases=filtered,
            description=f"Filtered from: {self.description}",
        )

    def __len__(self) -> int:
        """Get number of test cases."""
        return len(self.test_cases)

    def __repr__(self) -> str:
        """String representation."""
        return (
            f"EvaluationDataset(name='{self.name}', test_cases={len(self.test_cases)})"
        )