benchflow 0.5.3.dev899__tar.gz → 0.5.3.dev906__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/PKG-INFO +1 -1
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/pyproject.toml +1 -1
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/registry.py +13 -13
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_registry.py +8 -3
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_setup.py +3 -1
- benchflow-0.5.3.dev906/tests/test_skillsbench_publish_scrub.py +75 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/.gitignore +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/CHANGELOG.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/LICENSE +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/README.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_hf_scores.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/tests/trajectories/test_tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev899
|
|
3
|
+
Version: 0.5.3.dev906
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -106,7 +106,7 @@ def _apt_install(*packages: str) -> str:
|
|
|
106
106
|
_BENCHFLOW_NODE_PREFIX = "/opt/benchflow/node"
|
|
107
107
|
_BENCHFLOW_JS_AGENT_PREFIX = "/opt/benchflow/js-agents"
|
|
108
108
|
_BENCHFLOW_BIN_PREFIX = "/opt/benchflow/bin"
|
|
109
|
-
|
|
109
|
+
_OPENHANDS_CLI_GIT_REV = "3ca17446c5d9c1e35e054803478a3501ec251ecf"
|
|
110
110
|
_OPENHANDS_SDK_VERSION = "1.22.1"
|
|
111
111
|
_OPENHANDS_TOOLS_VERSION = "1.22.1"
|
|
112
112
|
_JS_AGENT_PATH = (
|
|
@@ -546,15 +546,15 @@ AGENTS: dict[str, AgentConfig] = {
|
|
|
546
546
|
install_cmd=(
|
|
547
547
|
"export DEBIAN_FRONTEND=noninteractive && "
|
|
548
548
|
'export PATH="$HOME/.local/bin:$PATH" && '
|
|
549
|
-
"( command -v curl >/dev/null 2>&1 || "
|
|
549
|
+
"( command -v curl >/dev/null 2>&1 && command -v git >/dev/null 2>&1 || "
|
|
550
550
|
" if command -v apt-get >/dev/null 2>&1; then "
|
|
551
|
-
f" {_apt_install('curl', 'ca-certificates')}; "
|
|
551
|
+
f" {_apt_install('curl', 'ca-certificates', 'git')}; "
|
|
552
552
|
" elif command -v dnf >/dev/null 2>&1; then "
|
|
553
|
-
" dnf -y --allowerasing install curl ca-certificates >/dev/null 2>&1; "
|
|
553
|
+
" dnf -y --allowerasing install curl ca-certificates git >/dev/null 2>&1; "
|
|
554
554
|
" elif command -v apk >/dev/null 2>&1; then "
|
|
555
|
-
" apk add --no-cache curl ca-certificates >/dev/null 2>&1; "
|
|
555
|
+
" apk add --no-cache curl ca-certificates git >/dev/null 2>&1; "
|
|
556
556
|
" else "
|
|
557
|
-
" echo 'OpenHands install requires curl' >&2; "
|
|
557
|
+
" echo 'OpenHands GitHub install requires curl and git' >&2; "
|
|
558
558
|
" exit 127; "
|
|
559
559
|
" fi ) && "
|
|
560
560
|
"( UV_OK=0; "
|
|
@@ -569,18 +569,18 @@ AGENTS: dict[str, AgentConfig] = {
|
|
|
569
569
|
" curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && "
|
|
570
570
|
' export PATH="$HOME/.local/bin:$PATH"; '
|
|
571
571
|
" fi && "
|
|
572
|
-
# OpenHands CLI
|
|
573
|
-
#
|
|
574
|
-
#
|
|
575
|
-
#
|
|
576
|
-
# on validation errors until timeout. 1.22.x restores the intended
|
|
577
|
-
# default-to-UNKNOWN behavior without the API drift seen in 1.26.x.
|
|
572
|
+
# Pin the OpenHands CLI source so the agent workflow cannot drift
|
|
573
|
+
# with GitHub main; only override the buggy sdk/tools 1.21.0 pins.
|
|
574
|
+
# SDK 1.22.x restores default-to-UNKNOWN for the synthetic
|
|
575
|
+
# `security_risk` tool field without the API drift seen in 1.26.x.
|
|
578
576
|
f"printf 'openhands-sdk=={_OPENHANDS_SDK_VERSION}\\n"
|
|
579
577
|
f"openhands-tools=={_OPENHANDS_TOOLS_VERSION}\\n' "
|
|
580
578
|
"> /tmp/oh-sdk-overrides.txt && "
|
|
581
579
|
"uv tool install --force --refresh "
|
|
582
580
|
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
583
|
-
|
|
581
|
+
"--from "
|
|
582
|
+
f"'git+https://github.com/OpenHands/OpenHands-CLI.git@{_OPENHANDS_CLI_GIT_REV}' "
|
|
583
|
+
"openhands --python 3.12 && "
|
|
584
584
|
" uv tool list | grep -q '^openhands\\b' ) && "
|
|
585
585
|
# Let sandbox user traverse to uv-managed Python interpreter path.
|
|
586
586
|
"chmod o+x /root /root/.local /root/.local/share "
|
|
@@ -99,18 +99,23 @@ class TestOpenHandsConfig:
|
|
|
99
99
|
assert "$HOME/.agents/skills" in cfg.skill_paths
|
|
100
100
|
assert "$WORKSPACE/.agents/skills" in cfg.skill_paths
|
|
101
101
|
|
|
102
|
-
def
|
|
102
|
+
def test_openhands_install_cmd_pins_cli_git_revision(self):
|
|
103
103
|
cfg = AGENTS["openhands"]
|
|
104
104
|
assert (
|
|
105
|
-
"apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates"
|
|
105
|
+
"apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates git"
|
|
106
106
|
in cfg.install_cmd
|
|
107
107
|
)
|
|
108
108
|
assert (
|
|
109
109
|
"uv tool install --force --refresh "
|
|
110
110
|
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
111
|
-
"
|
|
111
|
+
"--from "
|
|
112
|
+
"'git+https://github.com/OpenHands/OpenHands-CLI.git@"
|
|
113
|
+
"3ca17446c5d9c1e35e054803478a3501ec251ecf' "
|
|
114
|
+
"openhands --python 3.12" in cfg.install_cmd
|
|
112
115
|
)
|
|
113
116
|
assert "OpenHands/OpenHands-CLI.git@main" not in cfg.install_cmd
|
|
117
|
+
assert "openhands==1.16.0" not in cfg.install_cmd
|
|
118
|
+
assert "command -v git" in cfg.install_cmd
|
|
114
119
|
assert "install.openhands.dev/install.sh" not in cfg.install_cmd
|
|
115
120
|
|
|
116
121
|
def test_openhands_install_cmd_overrides_buggy_sdk_pin(self):
|
|
@@ -638,7 +638,9 @@ async def test_install_agent_writes_command_stdout_and_stderr_on_failure(
|
|
|
638
638
|
assert (
|
|
639
639
|
"uv tool install --force --refresh "
|
|
640
640
|
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
641
|
-
"
|
|
641
|
+
"--from 'git+https://github.com/OpenHands/OpenHands-CLI.git@"
|
|
642
|
+
"3ca17446c5d9c1e35e054803478a3501ec251ecf' "
|
|
643
|
+
"openhands --python 3.12" in log_text
|
|
642
644
|
)
|
|
643
645
|
assert "=== stderr ===" in log_text
|
|
644
646
|
assert "uv: command not found" in log_text
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _load_publish_module():
|
|
7
|
+
path = (
|
|
8
|
+
Path(__file__).resolve().parents[1]
|
|
9
|
+
/ "experiments"
|
|
10
|
+
/ "skillsbench-fill"
|
|
11
|
+
/ "publish.py"
|
|
12
|
+
)
|
|
13
|
+
spec = importlib.util.spec_from_file_location("skillsbench_fill_publish", path)
|
|
14
|
+
assert spec and spec.loader
|
|
15
|
+
module = importlib.util.module_from_spec(spec)
|
|
16
|
+
spec.loader.exec_module(module)
|
|
17
|
+
return module
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_safe_bytes_preserves_token_usage_counters(tmp_path):
|
|
21
|
+
"""Guards Hugging Face PR #4 token-usage recovery against scrub redaction."""
|
|
22
|
+
publish = _load_publish_module()
|
|
23
|
+
result = {
|
|
24
|
+
"agent_result": {
|
|
25
|
+
"n_input_tokens": 123,
|
|
26
|
+
"n_output_tokens": 45,
|
|
27
|
+
"n_cache_read_tokens": 678,
|
|
28
|
+
"n_cache_creation_tokens": 90,
|
|
29
|
+
"total_tokens": 936,
|
|
30
|
+
"usage_source": "provider_response",
|
|
31
|
+
},
|
|
32
|
+
"final_metrics": {
|
|
33
|
+
"total_prompt_tokens": 123,
|
|
34
|
+
"total_completion_tokens": 45,
|
|
35
|
+
"total_cached_tokens": 678,
|
|
36
|
+
"total_cost_usd": 1.23,
|
|
37
|
+
},
|
|
38
|
+
"HUGGING_FACE_TOKEN": "hf_abcdefghijklmnopqrstuvwxyz",
|
|
39
|
+
}
|
|
40
|
+
src = tmp_path / "result.json"
|
|
41
|
+
src.write_text(json.dumps(result))
|
|
42
|
+
|
|
43
|
+
scrubbed = json.loads(publish.safe_bytes(src))
|
|
44
|
+
|
|
45
|
+
assert scrubbed["agent_result"]["n_input_tokens"] == 123
|
|
46
|
+
assert scrubbed["agent_result"]["n_output_tokens"] == 45
|
|
47
|
+
assert scrubbed["agent_result"]["n_cache_read_tokens"] == 678
|
|
48
|
+
assert scrubbed["agent_result"]["n_cache_creation_tokens"] == 90
|
|
49
|
+
assert scrubbed["agent_result"]["total_tokens"] == 936
|
|
50
|
+
assert scrubbed["final_metrics"]["total_prompt_tokens"] == 123
|
|
51
|
+
assert scrubbed["final_metrics"]["total_completion_tokens"] == 45
|
|
52
|
+
assert scrubbed["final_metrics"]["total_cached_tokens"] == 678
|
|
53
|
+
assert scrubbed["HUGGING_FACE_TOKEN"] == "[REDACTED]"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_safe_bytes_normalizes_config_without_leaking_secret_tokens(tmp_path):
|
|
57
|
+
"""Guards Hugging Face PR #4 token-usage recovery against credential leakage."""
|
|
58
|
+
publish = _load_publish_module()
|
|
59
|
+
config = {
|
|
60
|
+
"include_task_skills": False,
|
|
61
|
+
"agent_env": {
|
|
62
|
+
"AWS_BEARER_TOKEN_BEDROCK": "Bearer abcdefghijklmnop",
|
|
63
|
+
"OPENAI_API_KEY": "sk-abcdefghijklmnopqrstuvwxyz",
|
|
64
|
+
"BENCHFLOW_MODEL_MAX_TOKENS": 8192,
|
|
65
|
+
},
|
|
66
|
+
}
|
|
67
|
+
src = tmp_path / "config.json"
|
|
68
|
+
src.write_text(json.dumps(config))
|
|
69
|
+
|
|
70
|
+
scrubbed = json.loads(publish.safe_bytes(src, is_config=True, mode="with"))
|
|
71
|
+
|
|
72
|
+
assert scrubbed["include_task_skills"] is True
|
|
73
|
+
assert scrubbed["agent_env"]["AWS_BEARER_TOKEN_BEDROCK"] == "[REDACTED]"
|
|
74
|
+
assert scrubbed["agent_env"]["OPENAI_API_KEY"] == "[REDACTED]"
|
|
75
|
+
assert scrubbed["agent_env"]["BENCHFLOW_MODEL_MAX_TOKENS"] == 8192
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/_utils/evaluation_results.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/agents/harvey_lab_acp_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/reviewer_server.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_bedrock_patch.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_logging.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev899 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|