eval-protocol 0.2.11.dev1__tar.gz → 0.2.98.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/LICENSE +1 -1
- eval_protocol-0.2.98.dev1/PKG-INFO +156 -0
- eval_protocol-0.2.98.dev1/README.md +39 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/normalize_sandbox_fusion.py +9 -10
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/subprocess_manager.py +1 -1
- eval_protocol-0.2.98.dev1/eval_protocol/__init__.py +178 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/_version.py +3 -3
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/__init__.py +101 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/base.py +25 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/bigquery.py +304 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/braintrust.py +315 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/fireworks_tracing.py +453 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/adapters/huggingface.py +5 -12
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/langchain.py +214 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/langfuse.py +552 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/langsmith.py +413 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/openai_responses.py +216 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/utils.py +98 -0
- eval_protocol-0.2.98.dev1/eval_protocol/adapters/weave.py +130 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/orchestrator.py +53 -52
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +1 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/docker_resource.py +10 -13
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/task_manager.py +19 -10
- eval_protocol-0.2.98.dev1/eval_protocol/auth.py +331 -0
- eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/data/airline_dataset.jsonl +50 -0
- eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/data/retail_dataset.jsonl +114 -0
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/aime25.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_aime25.py +24 -8
- eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_frozen_lake.py +80 -0
- eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_glm_streaming_compliance.py +3477 -0
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/gpqa.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_gpqa.py +49 -21
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/livebench_data_analysis.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_livebench_data_analysis.py +76 -33
- eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_tau_bench_airline.py +304 -0
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/tau_bench_retail.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_tau_bench_retail.py +71 -18
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli.py +306 -25
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +1 -5
- eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/create_rft.py +880 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/deploy.py +34 -11
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/deploy_mcp.py +7 -4
- eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/local_test.py +212 -0
- eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/logs.py +57 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/preview.py +3 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +2 -1
- eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/upload.py +306 -0
- eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/utils.py +511 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/common_utils.py +17 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/config.py +3 -3
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/__init__.py +5 -0
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/dynamic_data_loader.py +38 -0
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/factory_data_loader.py +38 -0
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/inline_data_loader.py +68 -0
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/jsonl_data_loader.py +42 -0
- eval_protocol-0.2.98.dev1/eval_protocol/data_loader/models.py +129 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/__init__.py +9 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +7 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/datasets/loader.py +3 -5
- eval_protocol-0.2.98.dev1/eval_protocol/directory_utils.py +39 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/evaluation.py +499 -73
- eval_protocol-0.2.98.dev1/eval_protocol/event_bus/__init__.py +38 -0
- eval_protocol-0.2.98.dev1/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
- eval_protocol-0.2.98.dev1/eval_protocol/exceptions.py +184 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/execution/pipeline.py +51 -17
- eval_protocol-0.2.98.dev1/eval_protocol/fireworks_rft.py +249 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/gcp_tools.py +3 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/clients.py +5 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generic_server.py +1 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/get_pep440_version.py +9 -1
- eval_protocol-0.2.98.dev1/eval_protocol/human_id/__init__.py +77 -0
- eval_protocol-0.2.98.dev1/eval_protocol/integrations/__init__.py +7 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/deepeval.py +11 -2
- eval_protocol-0.2.98.dev1/eval_protocol/integrations/openai_rft.py +190 -0
- eval_protocol-0.2.98.dev1/eval_protocol/integrations/tinker_cookbook.py +197 -0
- eval_protocol-0.2.98.dev1/eval_protocol/integrations/tinker_rollout_processor.py +170 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/trl.py +1 -1
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_client.py +338 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +160 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_index_manager.py +168 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/fireworks_tracing_http_handler.py +138 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/init.py +69 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/rollout_context.py +84 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/rollout_id_filter.py +28 -0
- eval_protocol-0.2.98.dev1/eval_protocol/log_utils/util.py +22 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/client/connection.py +46 -29
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/clients.py +7 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/base_policy.py +7 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/manager.py +155 -73
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/policy.py +57 -29
- eval_protocol-0.2.98.dev1/eval_protocol/mcp/execution/vllm_policy.py +186 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/mcp_multi_client.py +62 -16
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/mcpgym.py +44 -19
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/session/manager.py +1 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/simple_process_manager.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/simulation_server.py +30 -8
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_agent/main.py +18 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +13 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_env.py +25 -10
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +160 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +102 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/server.py +57 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/README.md +250 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/__init__.py +61 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +107 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +100 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +112 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/server.py +83 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tau2_mcp.py +767 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +178 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +18 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +147 -0
- eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +1689 -0
- eval_protocol-0.2.98.dev1/eval_protocol/models.py +1192 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/platform_api.py +35 -16
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/playback_policy.py +14 -38
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/__init__.py +18 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/__init__.py +13 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/app.py +305 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/auth.py +17 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/langfuse.py +546 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/litellm.py +173 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/main.py +10 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/models.py +98 -0
- eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/redis_utils.py +57 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/__init__.py +55 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/buffer.py +82 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_agent_rollout_processor.py +284 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_dataset_adapter.py +9 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_langchain_rollout_processor.py +159 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +110 -43
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +163 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +197 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/dual_mode_wrapper.py +78 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/elasticsearch_setup.py +167 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test.py +778 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test_postprocess.py +208 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test_utils.py +613 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/exception_config.py +151 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/execution.py +111 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/generate_parameter_combinations.py +145 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/github_action_rollout_processor.py +225 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/handle_persist_flow.py +225 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/integrations/openenv_trl_vllm.py +473 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/openenv_rollout_processor.py +585 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/parameterize.py +424 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/plugin.py +413 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/priority_scheduler.py +348 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/remote_rollout_processor.py +207 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/rollout_processor.py +5 -2
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/rollout_result_post_processor.py +57 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/store_experiment_link.py +41 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/store_results_url.py +49 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/tracing_utils.py +185 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/types.py +79 -0
- eval_protocol-0.2.98.dev1/eval_protocol/pytest/validate_signature.py +69 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/__init__.py +8 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/__init__.py +4 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge.py +90 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +63 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +58 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +82 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +66 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/utils.py +133 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/llm_judge.py +90 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/llm_judge_braintrust.py +63 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +223 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/evaluator/utils.py +523 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +202 -0
- eval_protocol-0.2.98.dev1/eval_protocol/quickstart/utils.py +251 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/resources.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/accuracy.py +28 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/accuracy_length.py +19 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_coding_reward.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_execution_utils.py +1 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_testing_util.py +8 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/bfcl_reward.py +3 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/code_execution.py +20 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/cpp_code.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/deepcoder_reward.py +8 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/format.py +5 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/function_calling.py +3 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/json_schema.py +36 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/language_consistency.py +25 -10
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/lean_prover.py +14 -11
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/length.py +6 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +6 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/math.py +22 -17
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +12 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/reasoning_steps.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/repetition.py +28 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/tag_count.py +28 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/server.py +5 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/stats/__init__.py +0 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/stats/confidence_intervals.py +8 -10
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/typed_interface.py +58 -12
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/types/__init__.py +2 -1
- eval_protocol-0.2.98.dev1/eval_protocol/types/errors.py +11 -0
- eval_protocol-0.2.98.dev1/eval_protocol/types/remote_rollout_processor.py +87 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/types/types.py +8 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/batch_transformation.py +1 -1
- eval_protocol-0.2.98.dev1/eval_protocol/utils/browser_utils.py +114 -0
- eval_protocol-0.2.98.dev1/eval_protocol/utils/check_server_status.py +77 -0
- eval_protocol-0.2.98.dev1/eval_protocol/utils/evaluation_row_utils.py +158 -0
- eval_protocol-0.2.98.dev1/eval_protocol/utils/logs_models.py +45 -0
- eval_protocol-0.2.98.dev1/eval_protocol/utils/logs_server.py +720 -0
- eval_protocol-0.2.98.dev1/eval_protocol/utils/show_results_url.py +74 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/static_policy.py +7 -7
- eval_protocol-0.2.98.dev1/eval_protocol/utils/subprocess_utils.py +118 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/vite_server.py +3 -3
- eval_protocol-0.2.98.dev1/eval_protocol.egg-info/PKG-INFO +156 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/SOURCES.txt +143 -17
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/requires.txt +53 -24
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/pyproject.toml +96 -36
- eval_protocol-0.2.98.dev1/tests/test_adapters_e2e.py +765 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_agent_resources.py +6 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_auth.py +10 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_batch_evaluation.py +6 -12
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli_agent.py +2 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli_args.py +0 -1
- eval_protocol-0.2.98.dev1/tests/test_cli_create_rft.py +1233 -0
- eval_protocol-0.2.98.dev1/tests/test_cli_local_test.py +285 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_code_execution.py +4 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_data_driven_task_manager.py +0 -5
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deepeval_integration.py +20 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deploy_integration.py +0 -1
- eval_protocol-0.2.98.dev1/tests/test_directory_utils.py +95 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_e2b_integration.py +1 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_e2b_js_integration.py +1 -1
- eval_protocol-0.2.98.dev1/tests/test_ep_upload_e2e.py +647 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_eval_protocol_import.py +35 -39
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation.py +71 -34
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation_integration.py +123 -29
- eval_protocol-0.2.98.dev1/tests/test_evaluation_postprocess.py +526 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation_preview_integration.py +113 -53
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_event_bus.py +75 -39
- eval_protocol-0.2.98.dev1/tests/test_event_bus_helper.py +73 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_examples_end_to_end.py +0 -1
- eval_protocol-0.2.98.dev1/tests/test_exception_config.py +114 -0
- eval_protocol-0.2.98.dev1/tests/test_exceptions.py +385 -0
- eval_protocol-0.2.98.dev1/tests/test_fireworks_api.py +68 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_function_calling.py +6 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_gcp_tools.py +0 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_generic_server.py +1 -2
- eval_protocol-0.2.98.dev1/tests/test_human_id.py +94 -0
- eval_protocol-0.2.98.dev1/tests/test_litellm_policy_provider_fields.py +95 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_logs_server.py +20 -9
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_math.py +5 -2
- eval_protocol-0.2.98.dev1/tests/test_message_field_filtering.py +64 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_models.py +363 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_models_rl.py +1 -4
- eval_protocol-0.2.98.dev1/tests/test_openai_rft_integration.py +66 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_packaging.py +0 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_parallel_rollouts.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_platform_api.py +0 -1
- eval_protocol-0.2.98.dev1/tests/test_priority_scheduler.py +322 -0
- eval_protocol-0.2.98.dev1/tests/test_quickstart_utils.py +388 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_readiness.py +0 -2
- eval_protocol-0.2.98.dev1/tests/test_retry_mechanism.py +485 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reward_protocol_import.py +35 -39
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_rl_processing.py +1 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_rollout_control_plane_integration.py +14 -14
- eval_protocol-0.2.98.dev1/tests/test_show_results_url.py +336 -0
- eval_protocol-0.2.98.dev1/tests/test_status_migration_changes.py +440 -0
- eval_protocol-0.2.98.dev1/tests/test_status_migration_integration.py +388 -0
- eval_protocol-0.2.98.dev1/tests/test_status_model.py +360 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_typed_interface_rl.py +0 -1
- eval_protocol-0.2.98.dev1/tests/test_upload_entrypoint.py +227 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_url_handling.py +1 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/__init__.py +21 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/agent/base.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/agent/llm_agent.py +9 -9
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/cli.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/config.py +1 -1
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/airline/policy.md +167 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/mock/policy.md +7 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/mock/policy_solo.md +6 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/retail/policy.md +136 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/main_policy.md +159 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/main_policy_solo.md +155 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_manual.md +206 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_workflow.md +303 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +299 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/user_simulator/simulation_guidelines.md +18 -0
- eval_protocol-0.2.98.dev1/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +30 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/message.py +14 -44
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/simulation.py +14 -44
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/tasks.py +20 -75
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/data_model.py +22 -66
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/tools.py +19 -60
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/data_model.py +2 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/environment.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/tools.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/data_model.py +24 -72
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/tools.py +8 -25
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/data_model.py +26 -78
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/environment.py +3 -9
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/const.py +1 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +3 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +6 -20
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +2 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tools.py +11 -36
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/user_data_model.py +18 -56
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/user_tools.py +29 -73
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/environment.py +3 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/server.py +7 -11
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/tool.py +3 -9
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/toolkit.py +4 -12
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/utils/interface_agent.py +4 -12
- eval_protocol-0.2.98.dev1/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_action.py +1 -4
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +2 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_env.py +5 -19
- eval_protocol-0.2.98.dev1/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/metrics/agent_metrics.py +2 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/metrics/break_down_metrics.py +6 -20
- eval_protocol-0.2.98.dev1/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/environment_manager.py +5 -15
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/orchestrator.py +23 -85
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/utils.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/registry.py +23 -27
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/run.py +23 -33
- eval_protocol-0.2.98.dev1/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/show_domain_doc.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/start_servers.py +2 -6
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/view_simulations.py +20 -50
- eval_protocol-0.2.98.dev1/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/user/base.py +3 -9
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/user/user_simulator.py +3 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/display.py +13 -45
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/pydantic_utils.py +1 -3
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/utils.py +2 -8
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/versioneer.py +1 -3
- eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-CuQbfdPD.js +46 -0
- eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-CuQbfdPD.js.map +1 -0
- eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-iZp_HgyW.css +1 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.11.dev1/PKG-INFO +0 -173
- eval_protocol-0.2.11.dev1/README.md +0 -75
- eval_protocol-0.2.11.dev1/eval_protocol/__init__.py +0 -82
- eval_protocol-0.2.11.dev1/eval_protocol/adapters/__init__.py +0 -47
- eval_protocol-0.2.11.dev1/eval_protocol/adapters/braintrust.py +0 -8
- eval_protocol-0.2.11.dev1/eval_protocol/adapters/langfuse.py +0 -392
- eval_protocol-0.2.11.dev1/eval_protocol/auth.py +0 -156
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/__init__.py +0 -9
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/registry.py +0 -330
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/run.py +0 -100
- eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/__init__.py +0 -3
- eval_protocol-0.2.11.dev1/eval_protocol/cli_commands/logs.py +0 -30
- eval_protocol-0.2.11.dev1/eval_protocol/directory_utils.py +0 -55
- eval_protocol-0.2.11.dev1/eval_protocol/event_bus/__init__.py +0 -5
- eval_protocol-0.2.11.dev1/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
- eval_protocol-0.2.11.dev1/eval_protocol/human_id/__init__.py +0 -35
- eval_protocol-0.2.11.dev1/eval_protocol/integrations/__init__.py +0 -12
- eval_protocol-0.2.11.dev1/eval_protocol/integrations/braintrust.py +0 -51
- eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/intermediary_server.py +0 -542
- eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/main.py +0 -210
- eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -304
- eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/session.py +0 -79
- eval_protocol-0.2.11.dev1/eval_protocol/models.py +0 -563
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/__init__.py +0 -19
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_agent_rollout_processor.py +0 -158
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_dataset_adapter.py +0 -10
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -118
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/evaluation_test.py +0 -962
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/plugin.py +0 -161
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/types.py +0 -52
- eval_protocol-0.2.11.dev1/eval_protocol/pytest/utils.py +0 -363
- eval_protocol-0.2.11.dev1/eval_protocol/utils/logs_server.py +0 -388
- eval_protocol-0.2.11.dev1/eval_protocol.egg-info/PKG-INFO +0 -173
- eval_protocol-0.2.11.dev1/tests/test_adapters_e2e.py +0 -447
- eval_protocol-0.2.11.dev1/tests/test_braintrust_adapter.py +0 -34
- eval_protocol-0.2.11.dev1/tests/test_braintrust_example.py +0 -49
- eval_protocol-0.2.11.dev1/tests/test_fireworks_api.py +0 -66
- eval_protocol-0.2.11.dev1/tests/test_retry_mechanism.py +0 -157
- eval_protocol-0.2.11.dev1/vendor/tau2/__init__.py +0 -1
- eval_protocol-0.2.11.dev1/vendor/tau2/data_model/__init__.py +0 -1
- eval_protocol-0.2.11.dev1/vendor/tau2/evaluator/__init__.py +0 -1
- eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D1ErODUS.js +0 -93
- eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D1ErODUS.js.map +0 -1
- eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D5KxcfFQ.css +0 -1
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/agent → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/domains → eval_protocol-0.2.98.dev1/eval_protocol/log_utils}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/domains/telecom/tasks → eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/setup.cfg +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/setup.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_config.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_format.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_length.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_server.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/environment → eval_protocol-0.2.98.dev1/vendor/tau2/agent}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/metrics → eval_protocol-0.2.98.dev1/vendor/tau2/data_model}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/orchestrator → eval_protocol-0.2.98.dev1/vendor/tau2/domains}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/scripts → eval_protocol-0.2.98.dev1/vendor/tau2/domains/telecom/tasks}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.11.dev1/vendor/tau2/user → eval_protocol-0.2.98.dev1/vendor/tau2/environment}/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/llm_utils.py +2 -2
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: eval-protocol
|
|
3
|
+
Version: 0.2.98.dev1
|
|
4
|
+
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
|
+
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: requests>=2.25.0
|
|
14
|
+
Requires-Dist: pydantic>=2.0.0
|
|
15
|
+
Requires-Dist: dataclasses-json>=0.5.7
|
|
16
|
+
Requires-Dist: uvicorn>=0.15.0
|
|
17
|
+
Requires-Dist: python-dotenv>=0.19.0
|
|
18
|
+
Requires-Dist: openai>=1.78.1
|
|
19
|
+
Requires-Dist: aiosqlite
|
|
20
|
+
Requires-Dist: aiohttp
|
|
21
|
+
Requires-Dist: mcp>=1.9.2
|
|
22
|
+
Requires-Dist: PyYAML>=5.0
|
|
23
|
+
Requires-Dist: hydra-core>=1.3.2
|
|
24
|
+
Requires-Dist: omegaconf>=2.3.0
|
|
25
|
+
Requires-Dist: httpx>=0.24.0
|
|
26
|
+
Requires-Dist: anthropic>=0.59.0
|
|
27
|
+
Requires-Dist: litellm<1.75.0
|
|
28
|
+
Requires-Dist: pytest>=6.0.0
|
|
29
|
+
Requires-Dist: pytest-asyncio>=0.21.0
|
|
30
|
+
Requires-Dist: peewee>=3.18.2
|
|
31
|
+
Requires-Dist: backoff>=2.2.0
|
|
32
|
+
Requires-Dist: questionary>=2.0.0
|
|
33
|
+
Requires-Dist: toml>=0.10.0
|
|
34
|
+
Requires-Dist: loguru>=0.6.0
|
|
35
|
+
Requires-Dist: docstring-parser>=0.15
|
|
36
|
+
Requires-Dist: rich>=12.0.0
|
|
37
|
+
Requires-Dist: psutil>=6.0.0
|
|
38
|
+
Requires-Dist: addict>=2.4.0
|
|
39
|
+
Requires-Dist: deepdiff>=6.0.0
|
|
40
|
+
Requires-Dist: websockets>=15.0.1
|
|
41
|
+
Requires-Dist: fastapi>=0.116.1
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: build; extra == "dev"
|
|
44
|
+
Requires-Dist: twine; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-httpserver; extra == "dev"
|
|
46
|
+
Requires-Dist: werkzeug>=2.0.0; extra == "dev"
|
|
47
|
+
Requires-Dist: ruff>=0.5.0; extra == "dev"
|
|
48
|
+
Requires-Dist: transformers>=4.0.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pandas>=1.5.0; extra == "dev"
|
|
50
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
51
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
52
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
53
|
+
Requires-Dist: types-docker; extra == "dev"
|
|
54
|
+
Requires-Dist: versioneer>=0.20; extra == "dev"
|
|
55
|
+
Requires-Dist: openai>=1.78.1; extra == "dev"
|
|
56
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
57
|
+
Requires-Dist: e2b; extra == "dev"
|
|
58
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
59
|
+
Requires-Dist: pytest-xdist; extra == "dev"
|
|
60
|
+
Requires-Dist: docker==7.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: ipykernel>=6.30.0; extra == "dev"
|
|
62
|
+
Requires-Dist: jupyter>=1.1.1; extra == "dev"
|
|
63
|
+
Requires-Dist: pip>=25.1.1; extra == "dev"
|
|
64
|
+
Requires-Dist: haikus==0.3.8; extra == "dev"
|
|
65
|
+
Requires-Dist: syrupy>=4.0.0; extra == "dev"
|
|
66
|
+
Requires-Dist: gymnasium>=1.2.0; extra == "dev"
|
|
67
|
+
Provides-Extra: trl
|
|
68
|
+
Requires-Dist: torch>=1.9; extra == "trl"
|
|
69
|
+
Requires-Dist: trl>=0.7.0; extra == "trl"
|
|
70
|
+
Requires-Dist: peft>=0.7.0; extra == "trl"
|
|
71
|
+
Requires-Dist: transformers>=4.0.0; extra == "trl"
|
|
72
|
+
Requires-Dist: accelerate>=0.28.0; extra == "trl"
|
|
73
|
+
Provides-Extra: openevals
|
|
74
|
+
Requires-Dist: openevals>=0.1.0; extra == "openevals"
|
|
75
|
+
Provides-Extra: fireworks
|
|
76
|
+
Requires-Dist: fireworks-ai>=0.19.19; extra == "fireworks"
|
|
77
|
+
Provides-Extra: box2d
|
|
78
|
+
Requires-Dist: swig; extra == "box2d"
|
|
79
|
+
Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
|
|
80
|
+
Requires-Dist: Pillow; extra == "box2d"
|
|
81
|
+
Provides-Extra: langfuse
|
|
82
|
+
Requires-Dist: langfuse>=2.0.0; extra == "langfuse"
|
|
83
|
+
Provides-Extra: huggingface
|
|
84
|
+
Requires-Dist: datasets>=3.0.0; extra == "huggingface"
|
|
85
|
+
Requires-Dist: transformers>=4.0.0; extra == "huggingface"
|
|
86
|
+
Provides-Extra: langsmith
|
|
87
|
+
Requires-Dist: langsmith>=0.1.86; extra == "langsmith"
|
|
88
|
+
Provides-Extra: bigquery
|
|
89
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == "bigquery"
|
|
90
|
+
Requires-Dist: google-auth>=2.0.0; extra == "bigquery"
|
|
91
|
+
Provides-Extra: svgbench
|
|
92
|
+
Requires-Dist: selenium>=4.0.0; extra == "svgbench"
|
|
93
|
+
Provides-Extra: pydantic
|
|
94
|
+
Requires-Dist: pydantic-ai>=1.0.2; extra == "pydantic"
|
|
95
|
+
Provides-Extra: supabase
|
|
96
|
+
Requires-Dist: supabase>=2.18.1; extra == "supabase"
|
|
97
|
+
Provides-Extra: chinook
|
|
98
|
+
Requires-Dist: psycopg2-binary>=2.9.10; extra == "chinook"
|
|
99
|
+
Provides-Extra: langchain
|
|
100
|
+
Requires-Dist: langchain-core>=0.3.0; extra == "langchain"
|
|
101
|
+
Provides-Extra: braintrust
|
|
102
|
+
Requires-Dist: braintrust[otel]; extra == "braintrust"
|
|
103
|
+
Provides-Extra: openenv
|
|
104
|
+
Requires-Dist: openenv-core; extra == "openenv"
|
|
105
|
+
Provides-Extra: langgraph
|
|
106
|
+
Requires-Dist: langgraph>=0.6.7; extra == "langgraph"
|
|
107
|
+
Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
|
|
108
|
+
Provides-Extra: langgraph-tools
|
|
109
|
+
Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
|
|
110
|
+
Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
|
|
111
|
+
Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
|
|
112
|
+
Provides-Extra: proxy
|
|
113
|
+
Requires-Dist: redis>=5.0.0; extra == "proxy"
|
|
114
|
+
Requires-Dist: langfuse>=2.0.0; extra == "proxy"
|
|
115
|
+
Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
|
|
116
|
+
Dynamic: license-file
|
|
117
|
+
|
|
118
|
+
# Eval Protocol
|
|
119
|
+
|
|
120
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
|
|
121
|
+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
|
|
122
|
+
|
|
123
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
124
|
+
|
|
125
|
+

|
|
126
|
+
|
|
127
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
128
|
+
|
|
129
|
+
Eval Protocol makes this possible in two ways:
|
|
130
|
+
|
|
131
|
+
1. **Expose your agent through a simple API**
|
|
132
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
133
|
+
2. **Connect with any trainer**
|
|
134
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
135
|
+
|
|
136
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
137
|
+
|
|
138
|
+
## Who This Is For
|
|
139
|
+
|
|
140
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
141
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
142
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
143
|
+
|
|
144
|
+
## Quickstart
|
|
145
|
+
|
|
146
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
147
|
+
|
|
148
|
+
## Resources
|
|
149
|
+
|
|
150
|
+
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
151
|
+
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
152
|
+
- **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Eval Protocol
|
|
2
|
+
|
|
3
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
|
|
4
|
+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
|
|
5
|
+
|
|
6
|
+
**Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
|
|
7
|
+
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
|
|
11
|
+
|
|
12
|
+
Eval Protocol makes this possible in two ways:
|
|
13
|
+
|
|
14
|
+
1. **Expose your agent through a simple API**
|
|
15
|
+
Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
|
|
16
|
+
2. **Connect with any trainer**
|
|
17
|
+
Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
|
|
18
|
+
|
|
19
|
+
The result: RL that works out-of-the-box for existing production agents.
|
|
20
|
+
|
|
21
|
+
## Who This Is For
|
|
22
|
+
|
|
23
|
+
- **Applied AI teams** adding RL to existing production agents.
|
|
24
|
+
- **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
|
|
25
|
+
- **MLOps teams** building reproducible, language-agnostic rollout pipelines.
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
- See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
|
|
30
|
+
|
|
31
|
+
## Resources
|
|
32
|
+
|
|
33
|
+
- **[Documentation](https://evalprotocol.io)** – Guides and API reference
|
|
34
|
+
- **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
|
|
35
|
+
- **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
|
|
36
|
+
|
|
37
|
+
## License
|
|
38
|
+
|
|
39
|
+
[MIT](LICENSE)
|
{eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/normalize_sandbox_fusion.py
RENAMED
|
@@ -56,7 +56,7 @@ OUTPUT_JSONL_FILE = "./development/CODING_DATASET.jsonl"
|
|
|
56
56
|
try:
|
|
57
57
|
repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
58
58
|
except OSError:
|
|
59
|
-
print("Warning: Could not load gpt2 tokenizer for Repobench-P.
|
|
59
|
+
print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.")
|
|
60
60
|
repobench_p_tokenizer = None
|
|
61
61
|
|
|
62
62
|
|
|
@@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str:
|
|
|
108
108
|
"""Format the prompt for Aider benchmark style problems."""
|
|
109
109
|
question = problem_json.get("content", "")
|
|
110
110
|
return (
|
|
111
|
-
f"{question}\n\nPlease generate the code in the following format:\n"
|
|
112
|
-
"```python\n# Your code response here\n```"
|
|
111
|
+
f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```"
|
|
113
112
|
)
|
|
114
113
|
|
|
115
114
|
|
|
@@ -327,7 +326,7 @@ def normalize_problem_to_openai_format(
|
|
|
327
326
|
try:
|
|
328
327
|
labels = json.loads(labels_data)
|
|
329
328
|
except json.JSONDecodeError:
|
|
330
|
-
print(f"Warning: Skipping ID {problem_id_str} in {filename}
|
|
329
|
+
print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.")
|
|
331
330
|
return None
|
|
332
331
|
elif isinstance(labels_data, dict):
|
|
333
332
|
labels = labels_data
|
|
@@ -426,10 +425,10 @@ def normalize_problem_to_openai_format(
|
|
|
426
425
|
)
|
|
427
426
|
return None
|
|
428
427
|
if not final_user_content.strip() or not final_assistant_content.strip():
|
|
429
|
-
print(f"Warning: Skipping ID {problem_id_str} in {filename} -
|
|
428
|
+
print(f"Warning: Skipping ID {problem_id_str} in {filename} - empty processed content.")
|
|
430
429
|
return None
|
|
431
430
|
if final_assistant_content.strip() == "import sys; sys.exit(0)":
|
|
432
|
-
print(f"Warning: Skipping ID {problem_id_str} in {filename} -
|
|
431
|
+
print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.")
|
|
433
432
|
return None
|
|
434
433
|
|
|
435
434
|
return {
|
|
@@ -439,7 +438,7 @@ def normalize_problem_to_openai_format(
|
|
|
439
438
|
]
|
|
440
439
|
}
|
|
441
440
|
except Exception as e:
|
|
442
|
-
print(f"Warning: Skipping ID {problem_id_str} in {filename} -
|
|
441
|
+
print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).")
|
|
443
442
|
import traceback
|
|
444
443
|
|
|
445
444
|
traceback.print_exc()
|
|
@@ -474,7 +473,7 @@ def main():
|
|
|
474
473
|
file_error_count += 1
|
|
475
474
|
continue
|
|
476
475
|
|
|
477
|
-
print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}:
|
|
476
|
+
print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...")
|
|
478
477
|
lines_in_file = 0
|
|
479
478
|
processed_in_file = 0
|
|
480
479
|
skipped_in_file = 0
|
|
@@ -488,7 +487,7 @@ def main():
|
|
|
488
487
|
try:
|
|
489
488
|
problem_data = json.loads(stripped_line)
|
|
490
489
|
except json.JSONDecodeError:
|
|
491
|
-
print(f"Warning: Malformed JSON on line {line_number}
|
|
490
|
+
print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.")
|
|
492
491
|
skipped_in_file += 1
|
|
493
492
|
continue
|
|
494
493
|
|
|
@@ -507,7 +506,7 @@ def main():
|
|
|
507
506
|
processed_count += processed_in_file
|
|
508
507
|
skipped_count += skipped_in_file
|
|
509
508
|
except Exception as e:
|
|
510
|
-
print(f"Error processing file {filepath}: {type(e).__name__}: {e}.
|
|
509
|
+
print(f"Error processing file {filepath}: {type(e).__name__}: {e}. Skipping rest of file.")
|
|
511
510
|
import traceback
|
|
512
511
|
|
|
513
512
|
traceback.print_exc()
|
{eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/subprocess_manager.py
RENAMED
|
@@ -139,7 +139,7 @@ def start_ngrok_and_get_url(
|
|
|
139
139
|
# Or by setting NGROK_AUTHTOKEN environment variable.
|
|
140
140
|
# Forcing it via command line is also an option but less common for persistent setup.
|
|
141
141
|
print(
|
|
142
|
-
|
|
142
|
+
"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
|
|
143
143
|
)
|
|
144
144
|
# Example if passing via env for the subprocess:
|
|
145
145
|
# ngrok_env = os.environ.copy()
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fireworks Eval Protocol - Simplify reward modeling and evaluation for LLM RL fine-tuning.
|
|
3
|
+
|
|
4
|
+
A Python library for defining, testing, deploying, and using reward functions
|
|
5
|
+
for LLM fine-tuning, including launching full RL jobs on the Fireworks platform.
|
|
6
|
+
|
|
7
|
+
The library also provides an agent evaluation framework for testing and evaluating
|
|
8
|
+
tool-augmented models using self-contained task bundles.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
from .auth import get_fireworks_account_id, get_fireworks_api_key
|
|
14
|
+
from .common_utils import load_jsonl
|
|
15
|
+
from .config import RewardKitConfig, get_config, load_config
|
|
16
|
+
from .mcp_env import (
|
|
17
|
+
AnthropicPolicy,
|
|
18
|
+
FireworksPolicy,
|
|
19
|
+
LiteLLMPolicy,
|
|
20
|
+
OpenAIPolicy,
|
|
21
|
+
make,
|
|
22
|
+
rollout,
|
|
23
|
+
test_mcp,
|
|
24
|
+
)
|
|
25
|
+
from .data_loader import DynamicDataLoader, InlineDataLoader
|
|
26
|
+
from . import mcp, rewards
|
|
27
|
+
from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata, Status
|
|
28
|
+
from .playback_policy import PlaybackPolicyBase
|
|
29
|
+
from .resources import create_llm_resource
|
|
30
|
+
from .reward_function import RewardFunction
|
|
31
|
+
from .typed_interface import reward_function
|
|
32
|
+
from .quickstart.aha_judge import aha_judge
|
|
33
|
+
from .utils.evaluation_row_utils import (
|
|
34
|
+
multi_turn_assistant_to_ground_truth,
|
|
35
|
+
assistant_to_ground_truth,
|
|
36
|
+
filter_longest_conversation,
|
|
37
|
+
)
|
|
38
|
+
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
|
|
39
|
+
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
40
|
+
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
41
|
+
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
42
|
+
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
43
|
+
from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
|
|
44
|
+
from .log_utils.elasticsearch_client import ElasticsearchConfig
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
from .types.remote_rollout_processor import (
|
|
48
|
+
InitRequest,
|
|
49
|
+
RolloutMetadata,
|
|
50
|
+
StatusResponse,
|
|
51
|
+
create_langfuse_config_tags,
|
|
52
|
+
DataLoaderConfig,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
from .adapters import OpenAIResponsesAdapter
|
|
57
|
+
except ImportError:
|
|
58
|
+
OpenAIResponsesAdapter = None
|
|
59
|
+
|
|
60
|
+
# Langfuse support is optional. Both names are imported by one statement, so
# when that import fails neither is bound; give each a ``None`` placeholder so
# every name advertised in ``__all__`` exists and ``from eval_protocol import *``
# keeps working.
try:
    from .adapters import LangfuseAdapter, create_langfuse_adapter
except ImportError:
    LangfuseAdapter = None
    create_langfuse_adapter = None
|
|
64
|
+
|
|
65
|
+
# Braintrust support is optional. Both names are imported by one statement, so
# when that import fails neither is bound; give each a ``None`` placeholder so
# every name advertised in ``__all__`` exists and ``from eval_protocol import *``
# keeps working.
try:
    from .adapters import BraintrustAdapter, create_braintrust_adapter
except ImportError:
    BraintrustAdapter = None
    create_braintrust_adapter = None
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
from .adapters import LangSmithAdapter
|
|
72
|
+
except ImportError:
|
|
73
|
+
LangSmithAdapter = None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
try:
|
|
77
|
+
from .adapters import WeaveAdapter
|
|
78
|
+
except ImportError:
|
|
79
|
+
WeaveAdapter = None
|
|
80
|
+
|
|
81
|
+
try:
    from .proxy import create_app, AuthProvider, AccountInfo  # pyright: ignore[reportAssignmentType]
except ImportError:
    # The proxy import failed: expose stand-ins with the same names so that
    # importing the package still succeeds and the failure only surfaces,
    # with an install hint, when the proxy API is actually used.
    _PROXY_EXTRA_REQUIRED = (
        "Proxy functionality requires additional dependencies. "
        "Please install with: pip install eval-protocol[proxy]"
    )

    def create_app(*args, **kwargs):
        """Stand-in for the proxy ``create_app``; always raises ImportError."""
        raise ImportError(_PROXY_EXTRA_REQUIRED)

    class AuthProvider:
        """Stand-in for the proxy ``AuthProvider``; instantiation raises ImportError."""

        def __init__(self, *args, **kwargs):
            raise ImportError(_PROXY_EXTRA_REQUIRED)

    class AccountInfo:
        """Stand-in for the proxy ``AccountInfo``; instantiation raises ImportError."""

        def __init__(self, *args, **kwargs):
            raise ImportError(_PROXY_EXTRA_REQUIRED)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
107
|
+
|
|
108
|
+
# Public API of the package. Optional adapters are listed unconditionally:
# when their import fails the name is still bound at module level to a
# placeholder, so star-imports never hit a missing attribute.
__all__ = [
    "ElasticsearchConfig",
    "ElasticsearchDirectHttpHandler",
    "RolloutIdFilter",
    "setup_rollout_logging_for_elasticsearch_handler",
    "DataLoaderConfig",
    "Status",
    "RemoteRolloutProcessor",
    "GithubActionRolloutProcessor",
    "InputMetadata",
    "EvaluationRow",
    "DefaultParameterIdGenerator",
    "DynamicDataLoader",
    "InlineDataLoader",
    "aha_judge",
    "multi_turn_assistant_to_ground_truth",
    "assistant_to_ground_truth",
    "filter_longest_conversation",
    "evaluation_test",
    "SingleTurnRolloutProcessor",
    "OpenAIResponsesAdapter",
    "LangfuseAdapter",
    "create_langfuse_adapter",
    "BraintrustAdapter",
    "create_braintrust_adapter",
    "LangSmithAdapter",
    # WeaveAdapter is imported above with a None fallback like the other
    # adapters, so it is exported alongside them.
    "WeaveAdapter",
    "FireworksTracingHttpHandler",
    # Core interfaces
    "Message",
    "MetricResult",
    "EvaluateResult",
    "reward_function",
    "RewardFunction",
    # Authentication
    "get_fireworks_api_key",
    "get_fireworks_account_id",
    # Configuration
    "load_config",
    "get_config",
    "RewardKitConfig",
    # Utilities
    "load_jsonl",
    # MCP Environment API
    "make",
    "rollout",
    "LiteLLMPolicy",
    "AnthropicPolicy",
    "FireworksPolicy",
    "OpenAIPolicy",
    "test_mcp",
    # Playback functionality
    "PlaybackPolicyBase",
    # Resource management
    "create_llm_resource",
    # Submodules
    "rewards",
    "mcp",
    # Remote server types
    "InitRequest",
    "RolloutMetadata",
    "StatusResponse",
    "create_langfuse_config_tags",
    # Proxy
    "create_app",
    "AuthProvider",
    "AccountInfo",
]
|
|
175
|
+
|
|
176
|
+
from . import _version
|
|
177
|
+
|
|
178
|
+
__version__ = _version.get_versions()["version"]
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-
|
|
11
|
+
"date": "2025-12-15T16:40:32-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "438a49431d16626a8e883cfb04afecfb188eb9dc",
|
|
15
|
+
"version": "0.2.98.dev.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Data source adapters for Eval Protocol.
|
|
2
|
+
|
|
3
|
+
This package provides adapters for integrating with various data sources
|
|
4
|
+
and converting them to EvaluationRow format for use in evaluation pipelines.
|
|
5
|
+
|
|
6
|
+
Available adapters:
|
|
7
|
+
- BaseAdapter: Abstract base class for all adapters
|
|
8
|
+
- LangfuseAdapter: Pull data from Langfuse deployments
|
|
9
|
+
- FireworksTracingAdapter: Pull data from Langfuse via Fireworks tracing proxy
|
|
10
|
+
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
|
|
11
|
+
- BigQueryAdapter: Query data from Google BigQuery
|
|
12
|
+
- TRL integration (legacy)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
# Always available
|
|
16
|
+
from .base import BaseAdapter
|
|
17
|
+
|
|
18
|
+
__all__ = ["BaseAdapter"]
|
|
19
|
+
|
|
20
|
+
# Conditional imports based on available dependencies
|
|
21
|
+
try:
|
|
22
|
+
from .langfuse import LangfuseAdapter, create_langfuse_adapter
|
|
23
|
+
|
|
24
|
+
__all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
|
|
25
|
+
except ImportError:
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
from .fireworks_tracing import FireworksTracingAdapter
|
|
29
|
+
|
|
30
|
+
__all__.extend(["FireworksTracingAdapter"])
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from .huggingface import (
|
|
34
|
+
HuggingFaceAdapter,
|
|
35
|
+
create_gsm8k_adapter,
|
|
36
|
+
create_huggingface_adapter,
|
|
37
|
+
create_math_adapter,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__.extend(
|
|
41
|
+
[
|
|
42
|
+
"HuggingFaceAdapter",
|
|
43
|
+
"create_huggingface_adapter",
|
|
44
|
+
"create_gsm8k_adapter",
|
|
45
|
+
"create_math_adapter",
|
|
46
|
+
]
|
|
47
|
+
)
|
|
48
|
+
except ImportError:
|
|
49
|
+
pass
|
|
50
|
+
|
|
51
|
+
try:
|
|
52
|
+
from .bigquery import (
|
|
53
|
+
BigQueryAdapter,
|
|
54
|
+
create_bigquery_adapter,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
__all__.extend(
|
|
58
|
+
[
|
|
59
|
+
"BigQueryAdapter",
|
|
60
|
+
"create_bigquery_adapter",
|
|
61
|
+
]
|
|
62
|
+
)
|
|
63
|
+
except ImportError:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
from .braintrust import BraintrustAdapter, create_braintrust_adapter
|
|
68
|
+
|
|
69
|
+
__all__.extend(["BraintrustAdapter", "create_braintrust_adapter"])
|
|
70
|
+
except ImportError:
|
|
71
|
+
pass
|
|
72
|
+
|
|
73
|
+
# Legacy adapters (import is guarded; skipped when their dependencies are missing)
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
from .trl import create_trl_adapter
|
|
77
|
+
|
|
78
|
+
__all__.extend(["create_trl_adapter"])
|
|
79
|
+
except ImportError:
|
|
80
|
+
pass
|
|
81
|
+
|
|
82
|
+
try:
|
|
83
|
+
from .openai_responses import OpenAIResponsesAdapter
|
|
84
|
+
|
|
85
|
+
__all__.extend(["OpenAIResponsesAdapter"])
|
|
86
|
+
except ImportError:
|
|
87
|
+
pass
|
|
88
|
+
|
|
89
|
+
try:
|
|
90
|
+
from .langsmith import LangSmithAdapter
|
|
91
|
+
|
|
92
|
+
__all__.extend(["LangSmithAdapter"])
|
|
93
|
+
except ImportError:
|
|
94
|
+
pass
|
|
95
|
+
|
|
96
|
+
try:
|
|
97
|
+
from .weave import WeaveAdapter
|
|
98
|
+
|
|
99
|
+
__all__.extend(["WeaveAdapter"])
|
|
100
|
+
except ImportError:
|
|
101
|
+
pass
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base adapter interface for Eval Protocol.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from eval_protocol.models import EvaluationRow
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseAdapter(ABC):
    """Interface shared by every Eval Protocol data-source adapter.

    Subclasses must implement :meth:`get_evaluation_rows`. The two upload
    hooks are optional; their default implementations do nothing.
    """

    @abstractmethod
    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        """Return the rows read from the data source as ``EvaluationRow`` objects."""
        ...

    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
        """Push scores for a batch of rows back to the data source.

        No-op by default; adapters that support score tracking override it.
        """
        return None

    def upload_score(self, row: EvaluationRow, model_name: str) -> None:
        """Push the score of one row back to the data source.

        No-op by default; adapters that support score tracking override it.
        """
        return None
|