PraisonAI 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praisonai/__init__.py +54 -0
- praisonai/__main__.py +15 -0
- praisonai/acp/__init__.py +54 -0
- praisonai/acp/config.py +159 -0
- praisonai/acp/server.py +587 -0
- praisonai/acp/session.py +219 -0
- praisonai/adapters/__init__.py +50 -0
- praisonai/adapters/readers.py +395 -0
- praisonai/adapters/rerankers.py +315 -0
- praisonai/adapters/retrievers.py +394 -0
- praisonai/adapters/vector_stores.py +409 -0
- praisonai/agent_scheduler.py +337 -0
- praisonai/agents_generator.py +903 -0
- praisonai/api/call.py +292 -0
- praisonai/auto.py +1197 -0
- praisonai/capabilities/__init__.py +275 -0
- praisonai/capabilities/a2a.py +140 -0
- praisonai/capabilities/assistants.py +283 -0
- praisonai/capabilities/audio.py +320 -0
- praisonai/capabilities/batches.py +469 -0
- praisonai/capabilities/completions.py +336 -0
- praisonai/capabilities/container_files.py +155 -0
- praisonai/capabilities/containers.py +93 -0
- praisonai/capabilities/embeddings.py +158 -0
- praisonai/capabilities/files.py +467 -0
- praisonai/capabilities/fine_tuning.py +293 -0
- praisonai/capabilities/guardrails.py +182 -0
- praisonai/capabilities/images.py +330 -0
- praisonai/capabilities/mcp.py +190 -0
- praisonai/capabilities/messages.py +270 -0
- praisonai/capabilities/moderations.py +154 -0
- praisonai/capabilities/ocr.py +217 -0
- praisonai/capabilities/passthrough.py +204 -0
- praisonai/capabilities/rag.py +207 -0
- praisonai/capabilities/realtime.py +160 -0
- praisonai/capabilities/rerank.py +165 -0
- praisonai/capabilities/responses.py +266 -0
- praisonai/capabilities/search.py +109 -0
- praisonai/capabilities/skills.py +133 -0
- praisonai/capabilities/vector_store_files.py +334 -0
- praisonai/capabilities/vector_stores.py +304 -0
- praisonai/capabilities/videos.py +141 -0
- praisonai/chainlit_ui.py +304 -0
- praisonai/chat/__init__.py +106 -0
- praisonai/chat/app.py +125 -0
- praisonai/cli/__init__.py +26 -0
- praisonai/cli/app.py +213 -0
- praisonai/cli/commands/__init__.py +75 -0
- praisonai/cli/commands/acp.py +70 -0
- praisonai/cli/commands/completion.py +333 -0
- praisonai/cli/commands/config.py +166 -0
- praisonai/cli/commands/debug.py +142 -0
- praisonai/cli/commands/diag.py +55 -0
- praisonai/cli/commands/doctor.py +166 -0
- praisonai/cli/commands/environment.py +179 -0
- praisonai/cli/commands/lsp.py +112 -0
- praisonai/cli/commands/mcp.py +210 -0
- praisonai/cli/commands/profile.py +457 -0
- praisonai/cli/commands/run.py +228 -0
- praisonai/cli/commands/schedule.py +150 -0
- praisonai/cli/commands/serve.py +97 -0
- praisonai/cli/commands/session.py +212 -0
- praisonai/cli/commands/traces.py +145 -0
- praisonai/cli/commands/version.py +101 -0
- praisonai/cli/configuration/__init__.py +18 -0
- praisonai/cli/configuration/loader.py +353 -0
- praisonai/cli/configuration/paths.py +114 -0
- praisonai/cli/configuration/schema.py +164 -0
- praisonai/cli/features/__init__.py +268 -0
- praisonai/cli/features/acp.py +236 -0
- praisonai/cli/features/action_orchestrator.py +546 -0
- praisonai/cli/features/agent_scheduler.py +773 -0
- praisonai/cli/features/agent_tools.py +474 -0
- praisonai/cli/features/agents.py +375 -0
- praisonai/cli/features/at_mentions.py +471 -0
- praisonai/cli/features/auto_memory.py +182 -0
- praisonai/cli/features/autonomy_mode.py +490 -0
- praisonai/cli/features/background.py +356 -0
- praisonai/cli/features/base.py +168 -0
- praisonai/cli/features/capabilities.py +1326 -0
- praisonai/cli/features/checkpoints.py +338 -0
- praisonai/cli/features/code_intelligence.py +652 -0
- praisonai/cli/features/compaction.py +294 -0
- praisonai/cli/features/compare.py +534 -0
- praisonai/cli/features/cost_tracker.py +514 -0
- praisonai/cli/features/debug.py +810 -0
- praisonai/cli/features/deploy.py +517 -0
- praisonai/cli/features/diag.py +289 -0
- praisonai/cli/features/doctor/__init__.py +63 -0
- praisonai/cli/features/doctor/checks/__init__.py +24 -0
- praisonai/cli/features/doctor/checks/acp_checks.py +240 -0
- praisonai/cli/features/doctor/checks/config_checks.py +366 -0
- praisonai/cli/features/doctor/checks/db_checks.py +366 -0
- praisonai/cli/features/doctor/checks/env_checks.py +543 -0
- praisonai/cli/features/doctor/checks/lsp_checks.py +199 -0
- praisonai/cli/features/doctor/checks/mcp_checks.py +349 -0
- praisonai/cli/features/doctor/checks/memory_checks.py +268 -0
- praisonai/cli/features/doctor/checks/network_checks.py +251 -0
- praisonai/cli/features/doctor/checks/obs_checks.py +328 -0
- praisonai/cli/features/doctor/checks/performance_checks.py +235 -0
- praisonai/cli/features/doctor/checks/permissions_checks.py +259 -0
- praisonai/cli/features/doctor/checks/selftest_checks.py +322 -0
- praisonai/cli/features/doctor/checks/serve_checks.py +426 -0
- praisonai/cli/features/doctor/checks/skills_checks.py +231 -0
- praisonai/cli/features/doctor/checks/tools_checks.py +371 -0
- praisonai/cli/features/doctor/engine.py +266 -0
- praisonai/cli/features/doctor/formatters.py +310 -0
- praisonai/cli/features/doctor/handler.py +397 -0
- praisonai/cli/features/doctor/models.py +264 -0
- praisonai/cli/features/doctor/registry.py +239 -0
- praisonai/cli/features/endpoints.py +1019 -0
- praisonai/cli/features/eval.py +560 -0
- praisonai/cli/features/external_agents.py +231 -0
- praisonai/cli/features/fast_context.py +410 -0
- praisonai/cli/features/flow_display.py +566 -0
- praisonai/cli/features/git_integration.py +651 -0
- praisonai/cli/features/guardrail.py +171 -0
- praisonai/cli/features/handoff.py +185 -0
- praisonai/cli/features/hooks.py +583 -0
- praisonai/cli/features/image.py +384 -0
- praisonai/cli/features/interactive_runtime.py +585 -0
- praisonai/cli/features/interactive_tools.py +380 -0
- praisonai/cli/features/interactive_tui.py +603 -0
- praisonai/cli/features/jobs.py +632 -0
- praisonai/cli/features/knowledge.py +531 -0
- praisonai/cli/features/lite.py +244 -0
- praisonai/cli/features/lsp_cli.py +225 -0
- praisonai/cli/features/mcp.py +169 -0
- praisonai/cli/features/message_queue.py +587 -0
- praisonai/cli/features/metrics.py +211 -0
- praisonai/cli/features/n8n.py +673 -0
- praisonai/cli/features/observability.py +293 -0
- praisonai/cli/features/ollama.py +361 -0
- praisonai/cli/features/output_style.py +273 -0
- praisonai/cli/features/package.py +631 -0
- praisonai/cli/features/performance.py +308 -0
- praisonai/cli/features/persistence.py +636 -0
- praisonai/cli/features/profile.py +226 -0
- praisonai/cli/features/profiler/__init__.py +81 -0
- praisonai/cli/features/profiler/core.py +558 -0
- praisonai/cli/features/profiler/optimizations.py +652 -0
- praisonai/cli/features/profiler/suite.py +386 -0
- praisonai/cli/features/profiling.py +350 -0
- praisonai/cli/features/queue/__init__.py +73 -0
- praisonai/cli/features/queue/manager.py +395 -0
- praisonai/cli/features/queue/models.py +286 -0
- praisonai/cli/features/queue/persistence.py +564 -0
- praisonai/cli/features/queue/scheduler.py +484 -0
- praisonai/cli/features/queue/worker.py +372 -0
- praisonai/cli/features/recipe.py +1723 -0
- praisonai/cli/features/recipes.py +449 -0
- praisonai/cli/features/registry.py +229 -0
- praisonai/cli/features/repo_map.py +860 -0
- praisonai/cli/features/router.py +466 -0
- praisonai/cli/features/sandbox_executor.py +515 -0
- praisonai/cli/features/serve.py +829 -0
- praisonai/cli/features/session.py +222 -0
- praisonai/cli/features/skills.py +856 -0
- praisonai/cli/features/slash_commands.py +650 -0
- praisonai/cli/features/telemetry.py +179 -0
- praisonai/cli/features/templates.py +1384 -0
- praisonai/cli/features/thinking.py +305 -0
- praisonai/cli/features/todo.py +334 -0
- praisonai/cli/features/tools.py +680 -0
- praisonai/cli/features/tui/__init__.py +83 -0
- praisonai/cli/features/tui/app.py +580 -0
- praisonai/cli/features/tui/cli.py +566 -0
- praisonai/cli/features/tui/debug.py +511 -0
- praisonai/cli/features/tui/events.py +99 -0
- praisonai/cli/features/tui/mock_provider.py +328 -0
- praisonai/cli/features/tui/orchestrator.py +652 -0
- praisonai/cli/features/tui/screens/__init__.py +50 -0
- praisonai/cli/features/tui/screens/main.py +245 -0
- praisonai/cli/features/tui/screens/queue.py +174 -0
- praisonai/cli/features/tui/screens/session.py +124 -0
- praisonai/cli/features/tui/screens/settings.py +148 -0
- praisonai/cli/features/tui/widgets/__init__.py +56 -0
- praisonai/cli/features/tui/widgets/chat.py +261 -0
- praisonai/cli/features/tui/widgets/composer.py +224 -0
- praisonai/cli/features/tui/widgets/queue_panel.py +200 -0
- praisonai/cli/features/tui/widgets/status.py +167 -0
- praisonai/cli/features/tui/widgets/tool_panel.py +248 -0
- praisonai/cli/features/workflow.py +720 -0
- praisonai/cli/legacy.py +236 -0
- praisonai/cli/main.py +5559 -0
- praisonai/cli/schedule_cli.py +54 -0
- praisonai/cli/state/__init__.py +31 -0
- praisonai/cli/state/identifiers.py +161 -0
- praisonai/cli/state/sessions.py +313 -0
- praisonai/code/__init__.py +93 -0
- praisonai/code/agent_tools.py +344 -0
- praisonai/code/diff/__init__.py +21 -0
- praisonai/code/diff/diff_strategy.py +432 -0
- praisonai/code/tools/__init__.py +27 -0
- praisonai/code/tools/apply_diff.py +221 -0
- praisonai/code/tools/execute_command.py +275 -0
- praisonai/code/tools/list_files.py +274 -0
- praisonai/code/tools/read_file.py +206 -0
- praisonai/code/tools/search_replace.py +248 -0
- praisonai/code/tools/write_file.py +217 -0
- praisonai/code/utils/__init__.py +46 -0
- praisonai/code/utils/file_utils.py +307 -0
- praisonai/code/utils/ignore_utils.py +308 -0
- praisonai/code/utils/text_utils.py +276 -0
- praisonai/db/__init__.py +64 -0
- praisonai/db/adapter.py +531 -0
- praisonai/deploy/__init__.py +62 -0
- praisonai/deploy/api.py +231 -0
- praisonai/deploy/docker.py +454 -0
- praisonai/deploy/doctor.py +367 -0
- praisonai/deploy/main.py +327 -0
- praisonai/deploy/models.py +179 -0
- praisonai/deploy/providers/__init__.py +33 -0
- praisonai/deploy/providers/aws.py +331 -0
- praisonai/deploy/providers/azure.py +358 -0
- praisonai/deploy/providers/base.py +101 -0
- praisonai/deploy/providers/gcp.py +314 -0
- praisonai/deploy/schema.py +208 -0
- praisonai/deploy.py +185 -0
- praisonai/endpoints/__init__.py +53 -0
- praisonai/endpoints/a2u_server.py +410 -0
- praisonai/endpoints/discovery.py +165 -0
- praisonai/endpoints/providers/__init__.py +28 -0
- praisonai/endpoints/providers/a2a.py +253 -0
- praisonai/endpoints/providers/a2u.py +208 -0
- praisonai/endpoints/providers/agents_api.py +171 -0
- praisonai/endpoints/providers/base.py +231 -0
- praisonai/endpoints/providers/mcp.py +263 -0
- praisonai/endpoints/providers/recipe.py +206 -0
- praisonai/endpoints/providers/tools_mcp.py +150 -0
- praisonai/endpoints/registry.py +131 -0
- praisonai/endpoints/server.py +161 -0
- praisonai/inbuilt_tools/__init__.py +24 -0
- praisonai/inbuilt_tools/autogen_tools.py +117 -0
- praisonai/inc/__init__.py +2 -0
- praisonai/inc/config.py +96 -0
- praisonai/inc/models.py +155 -0
- praisonai/integrations/__init__.py +56 -0
- praisonai/integrations/base.py +303 -0
- praisonai/integrations/claude_code.py +270 -0
- praisonai/integrations/codex_cli.py +255 -0
- praisonai/integrations/cursor_cli.py +195 -0
- praisonai/integrations/gemini_cli.py +222 -0
- praisonai/jobs/__init__.py +67 -0
- praisonai/jobs/executor.py +425 -0
- praisonai/jobs/models.py +230 -0
- praisonai/jobs/router.py +314 -0
- praisonai/jobs/server.py +186 -0
- praisonai/jobs/store.py +203 -0
- praisonai/llm/__init__.py +66 -0
- praisonai/llm/registry.py +382 -0
- praisonai/mcp_server/__init__.py +152 -0
- praisonai/mcp_server/adapters/__init__.py +74 -0
- praisonai/mcp_server/adapters/agents.py +128 -0
- praisonai/mcp_server/adapters/capabilities.py +168 -0
- praisonai/mcp_server/adapters/cli_tools.py +568 -0
- praisonai/mcp_server/adapters/extended_capabilities.py +462 -0
- praisonai/mcp_server/adapters/knowledge.py +93 -0
- praisonai/mcp_server/adapters/memory.py +104 -0
- praisonai/mcp_server/adapters/prompts.py +306 -0
- praisonai/mcp_server/adapters/resources.py +124 -0
- praisonai/mcp_server/adapters/tools_bridge.py +280 -0
- praisonai/mcp_server/auth/__init__.py +48 -0
- praisonai/mcp_server/auth/api_key.py +291 -0
- praisonai/mcp_server/auth/oauth.py +460 -0
- praisonai/mcp_server/auth/oidc.py +289 -0
- praisonai/mcp_server/auth/scopes.py +260 -0
- praisonai/mcp_server/cli.py +852 -0
- praisonai/mcp_server/elicitation.py +445 -0
- praisonai/mcp_server/icons.py +302 -0
- praisonai/mcp_server/recipe_adapter.py +573 -0
- praisonai/mcp_server/recipe_cli.py +824 -0
- praisonai/mcp_server/registry.py +703 -0
- praisonai/mcp_server/sampling.py +422 -0
- praisonai/mcp_server/server.py +490 -0
- praisonai/mcp_server/tasks.py +443 -0
- praisonai/mcp_server/transports/__init__.py +18 -0
- praisonai/mcp_server/transports/http_stream.py +376 -0
- praisonai/mcp_server/transports/stdio.py +132 -0
- praisonai/persistence/__init__.py +84 -0
- praisonai/persistence/config.py +238 -0
- praisonai/persistence/conversation/__init__.py +25 -0
- praisonai/persistence/conversation/async_mysql.py +427 -0
- praisonai/persistence/conversation/async_postgres.py +410 -0
- praisonai/persistence/conversation/async_sqlite.py +371 -0
- praisonai/persistence/conversation/base.py +151 -0
- praisonai/persistence/conversation/json_store.py +250 -0
- praisonai/persistence/conversation/mysql.py +387 -0
- praisonai/persistence/conversation/postgres.py +401 -0
- praisonai/persistence/conversation/singlestore.py +240 -0
- praisonai/persistence/conversation/sqlite.py +341 -0
- praisonai/persistence/conversation/supabase.py +203 -0
- praisonai/persistence/conversation/surrealdb.py +287 -0
- praisonai/persistence/factory.py +301 -0
- praisonai/persistence/hooks/__init__.py +18 -0
- praisonai/persistence/hooks/agent_hooks.py +297 -0
- praisonai/persistence/knowledge/__init__.py +26 -0
- praisonai/persistence/knowledge/base.py +144 -0
- praisonai/persistence/knowledge/cassandra.py +232 -0
- praisonai/persistence/knowledge/chroma.py +295 -0
- praisonai/persistence/knowledge/clickhouse.py +242 -0
- praisonai/persistence/knowledge/cosmosdb_vector.py +438 -0
- praisonai/persistence/knowledge/couchbase.py +286 -0
- praisonai/persistence/knowledge/lancedb.py +216 -0
- praisonai/persistence/knowledge/langchain_adapter.py +291 -0
- praisonai/persistence/knowledge/lightrag_adapter.py +212 -0
- praisonai/persistence/knowledge/llamaindex_adapter.py +256 -0
- praisonai/persistence/knowledge/milvus.py +277 -0
- praisonai/persistence/knowledge/mongodb_vector.py +306 -0
- praisonai/persistence/knowledge/pgvector.py +335 -0
- praisonai/persistence/knowledge/pinecone.py +253 -0
- praisonai/persistence/knowledge/qdrant.py +301 -0
- praisonai/persistence/knowledge/redis_vector.py +291 -0
- praisonai/persistence/knowledge/singlestore_vector.py +299 -0
- praisonai/persistence/knowledge/surrealdb_vector.py +309 -0
- praisonai/persistence/knowledge/upstash_vector.py +266 -0
- praisonai/persistence/knowledge/weaviate.py +223 -0
- praisonai/persistence/migrations/__init__.py +10 -0
- praisonai/persistence/migrations/manager.py +251 -0
- praisonai/persistence/orchestrator.py +406 -0
- praisonai/persistence/state/__init__.py +21 -0
- praisonai/persistence/state/async_mongodb.py +200 -0
- praisonai/persistence/state/base.py +107 -0
- praisonai/persistence/state/dynamodb.py +226 -0
- praisonai/persistence/state/firestore.py +175 -0
- praisonai/persistence/state/gcs.py +155 -0
- praisonai/persistence/state/memory.py +245 -0
- praisonai/persistence/state/mongodb.py +158 -0
- praisonai/persistence/state/redis.py +190 -0
- praisonai/persistence/state/upstash.py +144 -0
- praisonai/persistence/tests/__init__.py +3 -0
- praisonai/persistence/tests/test_all_backends.py +633 -0
- praisonai/profiler.py +1214 -0
- praisonai/recipe/__init__.py +134 -0
- praisonai/recipe/bridge.py +278 -0
- praisonai/recipe/core.py +893 -0
- praisonai/recipe/exceptions.py +54 -0
- praisonai/recipe/history.py +402 -0
- praisonai/recipe/models.py +266 -0
- praisonai/recipe/operations.py +440 -0
- praisonai/recipe/policy.py +422 -0
- praisonai/recipe/registry.py +849 -0
- praisonai/recipe/runtime.py +214 -0
- praisonai/recipe/security.py +711 -0
- praisonai/recipe/serve.py +859 -0
- praisonai/recipe/server.py +613 -0
- praisonai/scheduler/__init__.py +45 -0
- praisonai/scheduler/agent_scheduler.py +552 -0
- praisonai/scheduler/base.py +124 -0
- praisonai/scheduler/daemon_manager.py +225 -0
- praisonai/scheduler/state_manager.py +155 -0
- praisonai/scheduler/yaml_loader.py +193 -0
- praisonai/scheduler.py +194 -0
- praisonai/setup/__init__.py +1 -0
- praisonai/setup/build.py +21 -0
- praisonai/setup/post_install.py +23 -0
- praisonai/setup/setup_conda_env.py +25 -0
- praisonai/setup.py +16 -0
- praisonai/templates/__init__.py +116 -0
- praisonai/templates/cache.py +364 -0
- praisonai/templates/dependency_checker.py +358 -0
- praisonai/templates/discovery.py +391 -0
- praisonai/templates/loader.py +564 -0
- praisonai/templates/registry.py +511 -0
- praisonai/templates/resolver.py +206 -0
- praisonai/templates/security.py +327 -0
- praisonai/templates/tool_override.py +498 -0
- praisonai/templates/tools_doctor.py +256 -0
- praisonai/test.py +105 -0
- praisonai/train.py +562 -0
- praisonai/train_vision.py +306 -0
- praisonai/ui/agents.py +824 -0
- praisonai/ui/callbacks.py +57 -0
- praisonai/ui/chainlit_compat.py +246 -0
- praisonai/ui/chat.py +532 -0
- praisonai/ui/code.py +717 -0
- praisonai/ui/colab.py +474 -0
- praisonai/ui/colab_chainlit.py +81 -0
- praisonai/ui/components/aicoder.py +284 -0
- praisonai/ui/context.py +283 -0
- praisonai/ui/database_config.py +56 -0
- praisonai/ui/db.py +294 -0
- praisonai/ui/realtime.py +488 -0
- praisonai/ui/realtimeclient/__init__.py +756 -0
- praisonai/ui/realtimeclient/tools.py +242 -0
- praisonai/ui/sql_alchemy.py +710 -0
- praisonai/upload_vision.py +140 -0
- praisonai/version.py +1 -0
- praisonai-3.0.0.dist-info/METADATA +3493 -0
- praisonai-3.0.0.dist-info/RECORD +393 -0
- praisonai-3.0.0.dist-info/WHEEL +5 -0
- praisonai-3.0.0.dist-info/entry_points.txt +4 -0
- praisonai-3.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation CLI feature for PraisonAI.
|
|
3
|
+
|
|
4
|
+
Provides CLI commands for running agent evaluations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Optional, List, Dict, Any
|
|
11
|
+
|
|
12
|
+
# Module-level logger named after this module; EvalHandler reports import
# failures and evaluation failures through it.
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EvalHandler:
    """Handler for evaluation CLI commands.

    Each ``run_*`` method lazily imports the evaluator it needs, builds it,
    calls ``run(print_summary=True)`` and returns ``result.to_dict()``.
    Failures are never raised to the caller: they are logged and reported as
    ``{"error": "<message>"}``.
    """

    def __init__(self, verbose: bool = False):
        """
        Initialize the evaluation handler.

        Args:
            verbose: Enable verbose output (forwarded to every evaluator and
                used by ``run_batch`` to print per-case progress)
        """
        self.verbose = verbose

    @staticmethod
    def _load_first_agent(generator_cls, agent_file):
        """Build agents from ``agent_file`` and return the first one.

        Args:
            generator_cls: The ``AgentsGenerator`` class, passed in by the
                caller so that import errors stay handled in the calling
                method.
            agent_file: Path handed to ``generator_cls``.

        Returns:
            The first element when ``generate_agents()`` returns a list, the
            returned object itself otherwise, or ``None`` when nothing was
            generated.
        """
        agents = generator_cls(agent_file).generate_agents()
        if not agents:
            return None
        return agents[0] if isinstance(agents, list) else agents

    def run_accuracy(
        self,
        agent_file: Optional[str] = None,
        input_text: str = "",
        expected_output: str = "",
        iterations: int = 1,
        model: Optional[str] = None,
        output_file: Optional[str] = None,
        prompt: Optional[str] = None,
        llm: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run accuracy evaluation on an agent.

        Args:
            agent_file: Path to agents.yaml file (optional if prompt is provided)
            input_text: Input to provide to the agent; falls back to
                ``prompt`` when empty and a prompt is given
            expected_output: Expected output to compare against
            iterations: Number of evaluation iterations
            model: LLM model for judging (also the fallback agent model in
                prompt mode when ``llm`` is not set)
            output_file: Path to save results (forwarded to the evaluator as
                ``save_results_path``)
            prompt: Direct prompt (alternative to agent_file); takes
                precedence over ``agent_file`` when both are given
            llm: LLM model for the agent (when using prompt)

        Returns:
            Evaluation result dictionary, or ``{"error": ...}`` on failure
        """
        try:
            from praisonaiagents.eval import AccuracyEvaluator
            from praisonaiagents import Agent
        except ImportError as e:
            logger.error("Failed to import evaluation modules: %s", e)
            return {"error": str(e)}

        try:
            if prompt:
                # Direct prompt mode: create a throwaway agent on the fly.
                agent = Agent(
                    name="EvalAgent",
                    role="Assistant",
                    goal="Complete the given task",
                    backstory="You are a helpful assistant.",
                    llm=llm or model or "gpt-4o-mini",
                    verbose=False
                )
                # Use the prompt as the input if no explicit input was given.
                if not input_text:
                    input_text = prompt
            elif agent_file:
                # Load from agents.yaml; loading errors get their own message
                # so the user can tell them apart from evaluation errors.
                try:
                    from praisonai.agents_generator import AgentsGenerator
                    agent = self._load_first_agent(AgentsGenerator, agent_file)
                except Exception as e:
                    return {"error": f"Failed to load agents from {agent_file}: {e}"}
                if agent is None:
                    return {"error": "No agents found in configuration"}
            else:
                return {"error": "Either --agent or --prompt must be provided"}

            evaluator = AccuracyEvaluator(
                agent=agent,
                input_text=input_text,
                expected_output=expected_output,
                num_iterations=iterations,
                model=model,
                save_results_path=output_file,
                verbose=self.verbose
            )

            result = evaluator.run(print_summary=True)
            return result.to_dict()

        except Exception as e:
            # CLI boundary: report the failure instead of crashing.
            logger.error("Accuracy evaluation failed: %s", e)
            return {"error": str(e)}

    def run_performance(
        self,
        agent_file: str,
        input_text: str = "Hello",
        iterations: int = 10,
        warmup: int = 2,
        track_memory: bool = True,
        output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run performance evaluation on an agent.

        Args:
            agent_file: Path to agents.yaml file
            input_text: Input to provide to the agent
            iterations: Number of benchmark iterations
            warmup: Number of warmup runs
            track_memory: Whether to track memory usage
            output_file: Path to save results (forwarded to the evaluator as
                ``save_results_path``)

        Returns:
            Evaluation result dictionary, or ``{"error": ...}`` on failure
        """
        try:
            from praisonaiagents.eval import PerformanceEvaluator
            from praisonai.agents_generator import AgentsGenerator
        except ImportError as e:
            logger.error("Failed to import evaluation modules: %s", e)
            return {"error": str(e)}

        try:
            agent = self._load_first_agent(AgentsGenerator, agent_file)
            if agent is None:
                return {"error": "No agents found in configuration"}

            evaluator = PerformanceEvaluator(
                agent=agent,
                input_text=input_text,
                num_iterations=iterations,
                warmup_runs=warmup,
                track_memory=track_memory,
                save_results_path=output_file,
                verbose=self.verbose
            )

            result = evaluator.run(print_summary=True)
            return result.to_dict()

        except Exception as e:
            # CLI boundary: report the failure instead of crashing.
            logger.error("Performance evaluation failed: %s", e)
            return {"error": str(e)}

    def run_reliability(
        self,
        agent_file: str,
        input_text: str,
        expected_tools: List[str],
        forbidden_tools: Optional[List[str]] = None,
        output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run reliability evaluation on an agent.

        Args:
            agent_file: Path to agents.yaml file
            input_text: Input to provide to the agent
            expected_tools: List of tools that should be called
            forbidden_tools: List of tools that should NOT be called
            output_file: Path to save results (forwarded to the evaluator as
                ``save_results_path``)

        Returns:
            Evaluation result dictionary, or ``{"error": ...}`` on failure
        """
        try:
            from praisonaiagents.eval import ReliabilityEvaluator
            from praisonai.agents_generator import AgentsGenerator
        except ImportError as e:
            logger.error("Failed to import evaluation modules: %s", e)
            return {"error": str(e)}

        try:
            agent = self._load_first_agent(AgentsGenerator, agent_file)
            if agent is None:
                return {"error": "No agents found in configuration"}

            evaluator = ReliabilityEvaluator(
                agent=agent,
                input_text=input_text,
                expected_tools=expected_tools,
                forbidden_tools=forbidden_tools,
                save_results_path=output_file,
                verbose=self.verbose
            )

            result = evaluator.run(print_summary=True)
            return result.to_dict()

        except Exception as e:
            # CLI boundary: report the failure instead of crashing.
            logger.error("Reliability evaluation failed: %s", e)
            return {"error": str(e)}

    def run_criteria(
        self,
        agent_file: str,
        input_text: str,
        criteria: str,
        scoring_type: str = "numeric",
        threshold: float = 7.0,
        iterations: int = 1,
        model: Optional[str] = None,
        output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run criteria-based evaluation on an agent.

        Args:
            agent_file: Path to agents.yaml file
            input_text: Input to provide to the agent
            criteria: Criteria to evaluate against
            scoring_type: "numeric" or "binary"
            threshold: Score threshold for passing (numeric mode)
            iterations: Number of evaluation iterations
            model: LLM model for judging
            output_file: Path to save results (forwarded to the evaluator as
                ``save_results_path``)

        Returns:
            Evaluation result dictionary, or ``{"error": ...}`` on failure
        """
        try:
            from praisonaiagents.eval import CriteriaEvaluator
            from praisonai.agents_generator import AgentsGenerator
        except ImportError as e:
            logger.error("Failed to import evaluation modules: %s", e)
            return {"error": str(e)}

        try:
            agent = self._load_first_agent(AgentsGenerator, agent_file)
            if agent is None:
                return {"error": "No agents found in configuration"}

            evaluator = CriteriaEvaluator(
                criteria=criteria,
                agent=agent,
                input_text=input_text,
                scoring_type=scoring_type,
                threshold=threshold,
                num_iterations=iterations,
                model=model,
                save_results_path=output_file,
                verbose=self.verbose
            )

            result = evaluator.run(print_summary=True)
            return result.to_dict()

        except Exception as e:
            # CLI boundary: report the failure instead of crashing.
            logger.error("Criteria evaluation failed: %s", e)
            return {"error": str(e)}

    def run_batch(
        self,
        agent_file: str,
        test_file: str,
        eval_type: str = "accuracy",
        output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Run batch evaluation from a test file.

        The test file must hold a JSON array of objects. Keys read per case:
        ``input`` (both modes); ``expected`` and ``iterations`` (accuracy);
        ``criteria``, ``scoring_type`` and ``threshold`` (criteria).

        Args:
            agent_file: Path to agents.yaml file
            test_file: Path to JSON test file with test cases
            eval_type: Type of evaluation ("accuracy", "criteria"); any other
                value yields an error result for every case
            output_file: Path to save results as JSON (a failure to save is
                logged as a warning and does not affect the return value)

        Returns:
            Batch evaluation results with ``total_tests``, ``eval_type`` and
            ``results``, or ``{"error": ...}`` when the test file cannot be
            loaded or is not a JSON array
        """
        try:
            # Explicit UTF-8 so reading does not depend on the platform locale.
            with open(test_file, "r", encoding="utf-8") as f:
                test_cases = json.load(f)
        except Exception as e:
            return {"error": f"Failed to load test file: {e}"}

        # A top-level object would otherwise be iterated key by key and crash
        # on ``str.get``; reject anything that is not an array up front.
        if not isinstance(test_cases, list):
            return {"error": "Test file must contain a JSON array of test cases"}

        total = len(test_cases)
        results = []
        for number, test_case in enumerate(test_cases, start=1):
            if self.verbose:
                print(f"Running test case {number}/{total}")

            if not isinstance(test_case, dict):
                # Record the malformed entry and keep going with the rest.
                results.append({
                    "test_case": number,
                    "input": "",
                    "result": {"error": "Test case must be a JSON object"}
                })
                continue

            case_input = test_case.get("input", "")
            if eval_type == "accuracy":
                result = self.run_accuracy(
                    agent_file=agent_file,
                    input_text=case_input,
                    expected_output=test_case.get("expected", ""),
                    iterations=test_case.get("iterations", 1)
                )
            elif eval_type == "criteria":
                result = self.run_criteria(
                    agent_file=agent_file,
                    input_text=case_input,
                    criteria=test_case.get("criteria", ""),
                    scoring_type=test_case.get("scoring_type", "numeric"),
                    threshold=test_case.get("threshold", 7.0)
                )
            else:
                result = {"error": f"Unknown eval type: {eval_type}"}

            results.append({
                "test_case": number,
                "input": case_input,
                "result": result
            })

        batch_result = {
            "total_tests": total,
            "eval_type": eval_type,
            "results": results
        }

        if output_file:
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(batch_result, f, indent=2)
            except Exception as e:
                # Saving is best-effort: the caller still gets the results.
                logger.warning("Failed to save batch results: %s", e)

        return batch_result
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def handle_eval_command(args) -> int:
    """
    Handle the eval CLI command.

    Args:
        args: Command line arguments, either a raw list of strings (parsed
            here with a fresh ``praisonai eval`` parser) or an already-parsed
            namespace carrying an ``eval_type`` attribute.

    Returns:
        Exit code: 0 on success or when help was shown; 1 when the argument
        list could not be parsed, the evaluation type is unknown, or the
        evaluation result contains an ``error`` key.
    """
    import argparse
    import os

    def _build_parser():
        # Stand-alone parser used for raw argument lists and for printing help.
        eval_cli = argparse.ArgumentParser(prog="praisonai eval")
        add_eval_parser_subcommands(eval_cli.add_subparsers(dest='eval_type'))
        return eval_cli

    parser = None

    # If args is a list, parse it first
    if isinstance(args, list):
        parser = _build_parser()
        try:
            args = parser.parse_args(args)
        except SystemExit as exc:
            # argparse exits with status 0 after printing --help; only a real
            # parse error should be reported as a failure.
            return 0 if exc.code in (0, None) else 1

    eval_type = getattr(args, 'eval_type', None)
    if not eval_type:
        # An already-parsed namespace arrives without a parser of its own,
        # so build one on demand instead of referencing an unbound name.
        if parser is None:
            parser = _build_parser()
        parser.print_help()
        print("\n[bold]Examples:[/bold]")
        print(" praisonai eval accuracy --prompt \"What is 2+2?\" --expected \"4\"")
        print(" praisonai eval performance --agent agents.yaml --input \"Hello\"")
        return 0

    handler = EvalHandler(verbose=getattr(args, 'verbose', False))

    agent_file = getattr(args, 'agent', None)
    output_file = getattr(args, 'output', None)
    prompt = getattr(args, 'prompt', None)
    llm = getattr(args, 'llm', None)

    # If no agent file and no prompt, check if agents.yaml exists
    if not agent_file and not prompt and os.path.exists('agents.yaml'):
        agent_file = 'agents.yaml'

    if eval_type == 'accuracy':
        result = handler.run_accuracy(
            agent_file=agent_file,
            input_text=getattr(args, 'input', ''),
            expected_output=getattr(args, 'expected', ''),
            iterations=getattr(args, 'iterations', 1),
            model=getattr(args, 'model', None),
            output_file=output_file,
            prompt=prompt,
            llm=llm
        )
    elif eval_type == 'performance':
        result = handler.run_performance(
            agent_file=agent_file,
            input_text=getattr(args, 'input', 'Hello'),
            iterations=getattr(args, 'iterations', 10),
            warmup=getattr(args, 'warmup', 2),
            track_memory=getattr(args, 'memory', True),
            output_file=output_file
        )
    elif eval_type == 'reliability':
        expected_tools = _split_tool_list(getattr(args, 'expected_tools', ''))
        # An absent or empty forbidden list is passed on as None, not [].
        forbidden_tools = _split_tool_list(getattr(args, 'forbidden_tools', '')) or None

        result = handler.run_reliability(
            agent_file=agent_file,
            input_text=getattr(args, 'input', ''),
            expected_tools=expected_tools,
            forbidden_tools=forbidden_tools,
            output_file=output_file
        )
    elif eval_type == 'criteria':
        result = handler.run_criteria(
            agent_file=agent_file,
            input_text=getattr(args, 'input', ''),
            criteria=getattr(args, 'criteria', ''),
            scoring_type=getattr(args, 'scoring', 'numeric'),
            threshold=getattr(args, 'threshold', 7.0),
            iterations=getattr(args, 'iterations', 1),
            model=getattr(args, 'model', None),
            output_file=output_file
        )
    elif eval_type == 'batch':
        result = handler.run_batch(
            agent_file=agent_file,
            test_file=getattr(args, 'test_file', ''),
            eval_type=getattr(args, 'batch_type', 'accuracy'),
            output_file=output_file
        )
    else:
        print(f"Unknown evaluation type: {eval_type}")
        return 1

    if 'error' in result:
        print(f"Error: {result['error']}")
        return 1
    elif not getattr(args, 'quiet', False):
        print(json.dumps(result, indent=2))

    return 0


def _split_tool_list(raw):
    """Split a comma-separated tool string into stripped, non-empty names.

    ``None`` and the empty string both yield an empty list.
    """
    return [name.strip() for name in (raw or '').split(',') if name.strip()]
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def add_eval_parser_subcommands(subparsers) -> None:
    """Add eval subcommand parsers to an existing subparsers object.

    Registers the ``accuracy``, ``performance``, ``reliability``,
    ``criteria`` and ``batch`` sub-commands directly on ``subparsers``.

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers``.
    """
    def _add_common_flags(sub):
        # Flags every eval sub-command accepts; added last in each parser.
        sub.add_argument('--output', '-o', help='Output file')
        sub.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
        sub.add_argument('--quiet', '-q', action='store_true', help='Suppress JSON output')

    accuracy_parser = subparsers.add_parser('accuracy', help='Run accuracy evaluation')
    accuracy_parser.add_argument('--agent', '-a', help='Agent config file (optional if --prompt used)')
    accuracy_parser.add_argument('--prompt', '-p', type=str, help='Direct prompt (alternative to --agent)')
    accuracy_parser.add_argument('--llm', help='LLM model for agent (when using --prompt)')
    accuracy_parser.add_argument('--input', '-i', help='Input text (defaults to --prompt if not provided)')
    accuracy_parser.add_argument('--expected', '-e', required=True, help='Expected output')
    accuracy_parser.add_argument('--iterations', '-n', type=int, default=1, help='Number of iterations')
    accuracy_parser.add_argument('--model', '-m', help='Judge model')
    _add_common_flags(accuracy_parser)

    perf_parser = subparsers.add_parser('performance', help='Run performance evaluation')
    perf_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    perf_parser.add_argument('--input', '-i', default='Hello', help='Input text')
    perf_parser.add_argument('--iterations', '-n', type=int, default=10, help='Number of iterations')
    perf_parser.add_argument('--warmup', '-w', type=int, default=2, help='Warmup runs')
    perf_parser.add_argument('--memory', action='store_true', default=True, help='Track memory')
    # --memory defaults to True, so on its own it can never switch tracking
    # off; --no-memory writes False to the same destination.
    perf_parser.add_argument('--no-memory', dest='memory', action='store_false', help='Disable memory tracking')
    _add_common_flags(perf_parser)

    rel_parser = subparsers.add_parser('reliability', help='Run reliability evaluation')
    rel_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    rel_parser.add_argument('--input', '-i', required=True, help='Input text')
    rel_parser.add_argument('--expected-tools', '-t', required=True, help='Expected tools (comma-separated)')
    rel_parser.add_argument('--forbidden-tools', '-f', help='Forbidden tools (comma-separated)')
    _add_common_flags(rel_parser)

    criteria_parser = subparsers.add_parser('criteria', help='Run criteria-based evaluation')
    criteria_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    criteria_parser.add_argument('--input', '-i', required=True, help='Input text')
    criteria_parser.add_argument('--criteria', '-c', required=True, help='Evaluation criteria')
    criteria_parser.add_argument('--scoring', '-s', choices=['numeric', 'binary'], default='numeric', help='Scoring type')
    criteria_parser.add_argument('--threshold', type=float, default=7.0, help='Pass threshold')
    criteria_parser.add_argument('--iterations', '-n', type=int, default=1, help='Number of iterations')
    criteria_parser.add_argument('--model', '-m', help='Judge model')
    _add_common_flags(criteria_parser)

    batch_parser = subparsers.add_parser('batch', help='Run batch evaluation from test file')
    batch_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    batch_parser.add_argument('--test-file', '-t', required=True, help='JSON test file')
    batch_parser.add_argument('--batch-type', '-b', choices=['accuracy', 'criteria'], default='accuracy', help='Evaluation type')
    _add_common_flags(batch_parser)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def add_eval_parser(subparsers) -> None:
    """
    Add eval subcommand parser.

    Registers an ``eval`` command on ``subparsers`` and, beneath it, the
    ``accuracy``, ``performance``, ``reliability``, ``criteria`` and
    ``batch`` sub-commands; the chosen one is stored as ``eval_type``.

    Args:
        subparsers: Argument parser subparsers
    """
    def _add_common_flags(sub):
        # Flags every eval sub-command accepts; added last in each parser.
        sub.add_argument('--output', '-o', help='Output file')
        sub.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
        sub.add_argument('--quiet', '-q', action='store_true', help='Suppress JSON output')

    eval_parser = subparsers.add_parser(
        'eval',
        help='Run agent evaluations'
    )

    eval_subparsers = eval_parser.add_subparsers(dest='eval_type')

    accuracy_parser = eval_subparsers.add_parser(
        'accuracy',
        help='Run accuracy evaluation'
    )
    accuracy_parser.add_argument('--agent', '-a', help='Agent config file (optional if --prompt used)')
    accuracy_parser.add_argument('--prompt', '-p', help='Direct prompt (alternative to --agent)')
    accuracy_parser.add_argument('--llm', help='LLM model for agent (when using --prompt)')
    accuracy_parser.add_argument('--input', '-i', help='Input text (defaults to --prompt if not provided)')
    accuracy_parser.add_argument('--expected', '-e', required=True, help='Expected output')
    accuracy_parser.add_argument('--iterations', '-n', type=int, default=1, help='Number of iterations')
    accuracy_parser.add_argument('--model', '-m', help='Judge model')
    _add_common_flags(accuracy_parser)

    perf_parser = eval_subparsers.add_parser(
        'performance',
        help='Run performance evaluation'
    )
    perf_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    perf_parser.add_argument('--input', '-i', default='Hello', help='Input text')
    perf_parser.add_argument('--iterations', '-n', type=int, default=10, help='Number of iterations')
    perf_parser.add_argument('--warmup', '-w', type=int, default=2, help='Warmup runs')
    perf_parser.add_argument('--memory', action='store_true', default=True, help='Track memory')
    # --memory defaults to True, so on its own it can never switch tracking
    # off; --no-memory writes False to the same destination.
    perf_parser.add_argument('--no-memory', dest='memory', action='store_false', help='Disable memory tracking')
    _add_common_flags(perf_parser)

    rel_parser = eval_subparsers.add_parser(
        'reliability',
        help='Run reliability evaluation'
    )
    rel_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    rel_parser.add_argument('--input', '-i', required=True, help='Input text')
    rel_parser.add_argument('--expected-tools', '-t', required=True, help='Expected tools (comma-separated)')
    rel_parser.add_argument('--forbidden-tools', '-f', help='Forbidden tools (comma-separated)')
    _add_common_flags(rel_parser)

    criteria_parser = eval_subparsers.add_parser(
        'criteria',
        help='Run criteria-based evaluation'
    )
    criteria_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    criteria_parser.add_argument('--input', '-i', required=True, help='Input text')
    criteria_parser.add_argument('--criteria', '-c', required=True, help='Evaluation criteria')
    criteria_parser.add_argument('--scoring', '-s', choices=['numeric', 'binary'], default='numeric', help='Scoring type')
    criteria_parser.add_argument('--threshold', type=float, default=7.0, help='Pass threshold')
    criteria_parser.add_argument('--iterations', '-n', type=int, default=1, help='Number of iterations')
    criteria_parser.add_argument('--model', '-m', help='Judge model')
    _add_common_flags(criteria_parser)

    batch_parser = eval_subparsers.add_parser(
        'batch',
        help='Run batch evaluation from test file'
    )
    batch_parser.add_argument('--agent', '-a', default='agents.yaml', help='Agent config file')
    batch_parser.add_argument('--test-file', '-t', required=True, help='JSON test file')
    batch_parser.add_argument('--batch-type', '-b', choices=['accuracy', 'criteria'], default='accuracy', help='Evaluation type')
    _add_common_flags(batch_parser)
|