benchflow 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/CHANGELOG.md +10 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/PKG-INFO +1 -1
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/pyproject.toml +1 -1
- benchflow-0.5.3.dev908/src/benchflow/cli/continue_cmd.py +236 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/main.py +4 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/__init__.py +35 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/batch.py +125 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/orchestrator.py +743 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/replay_proxy.py +409 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/run_folder.py +211 -0
- benchflow-0.5.3.dev908/src/benchflow/continue_run/sandbox_proxy.py +472 -0
- benchflow-0.5.3.dev908/tests/continue_run/_helpers.py +100 -0
- benchflow-0.5.3.dev908/tests/continue_run/test_batch.py +111 -0
- benchflow-0.5.3.dev908/tests/continue_run/test_orchestrator.py +302 -0
- benchflow-0.5.3.dev908/tests/continue_run/test_replay_proxy.py +176 -0
- benchflow-0.5.3.dev908/tests/continue_run/test_run_folder.py +97 -0
- benchflow-0.5.3.dev908/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/.gitignore +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/LICENSE +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/README.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/registry.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev906/tests/environment → benchflow-0.5.3.dev908/tests/continue_run}/__init__.py +0 -0
- {benchflow-0.5.3.dev906/tests/trajectories → benchflow-0.5.3.dev908/tests/environment}/__init__.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_registry.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_setup.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hf_scores.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_publish_scrub.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_tree.py +0 -0
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **`benchflow continue <run-folder>`** — resume a previous, unfinished
|
|
8
|
+
(timed-out) `openhands` run to completion. A standalone tool (it does not
|
|
9
|
+
touch the normal run path) that reconstructs the run's exact workspace and
|
|
10
|
+
agent memory from the recorded `llm_trajectory.jsonl` via record-replay,
|
|
11
|
+
then continues with the live model — no injected prompt — and writes a new
|
|
12
|
+
HF-compatible folder with `continued_from` provenance. See
|
|
13
|
+
[`docs/continue-runs.md`](docs/continue-runs.md).
|
|
14
|
+
|
|
5
15
|
### Changed
|
|
6
16
|
|
|
7
17
|
- Document the public vs internal preview install/upgrade command matrix,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev906
|
|
3
|
+
Version: 0.5.3.dev908
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""``benchflow continue`` — resume a timed-out run to completion.
|
|
2
|
+
|
|
3
|
+
Standalone command (does not touch the normal eval/run path): reconstruct a
|
|
4
|
+
previous unfinished ``openhands`` run's exact env + memory from its recorded
|
|
5
|
+
trajectory via record-replay, continue it live, and write a new HF-compatible
|
|
6
|
+
folder linked to the parent. See :mod:`benchflow.continue_run`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Annotated
|
|
16
|
+
|
|
17
|
+
import typer
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_env_defaults() -> None:
    """Copy values from ``load_dotenv_env()`` into ``os.environ`` as defaults.

    Variables that are already present in the process environment are left
    untouched; only missing names are filled in.
    """
    # Imported lazily so importing this CLI module stays cheap.
    from benchflow._dotenv import load_dotenv_env

    for name, value in load_dotenv_env().items():
        # Existing environment values always win over the loaded defaults.
        if name not in os.environ:
            os.environ[name] = value
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def register_continue(app: typer.Typer) -> None:
    """Attach the ``continue`` and ``continue-batch`` commands to ``app``.

    Both commands are defined as closures so that the heavy
    ``benchflow.continue_run`` imports only happen when a command is actually
    invoked, not when the CLI module is imported.

    Args:
        app: The top-level benchflow Typer application to register on.
    """

    @app.command("continue")
    def continue_cmd(
        folder: Annotated[
            Path,
            typer.Argument(
                help="Original run output folder (contains config.json + "
                "trajectory/llm_trajectory.jsonl)."
            ),
        ],
        tasks_dir: Annotated[
            Path | None,
            typer.Option(
                "--tasks-dir",
                help="Directory holding the task source (instruction + verifier). "
                "Required unless the recorded task_path still exists on disk.",
            ),
        ] = None,
        model: Annotated[
            str | None,
            typer.Option(
                "--model",
                help="Override the live-continuation model (default: the "
                "original run's model). Tests use gemini-3.1-flash-lite-preview.",
            ),
        ] = None,
        timeout: Annotated[
            int | None,
            typer.Option(
                "--timeout",
                help="Wall-clock budget for the continuation, in seconds "
                "(default: the original run's timeout).",
            ),
        ] = None,
        output: Annotated[
            Path | None,
            typer.Option(
                "--output",
                help="Output jobs dir for the new run (default: "
                "<orig-parent>/continued).",
            ),
        ] = None,
        require_timeout: Annotated[
            bool,
            typer.Option(
                "--require-timeout/--no-require-timeout",
                help="Refuse runs whose recorded status is not a timeout.",
            ),
        ] = False,
        strict_divergence: Annotated[
            bool,
            typer.Option(
                "--strict-divergence/--no-strict-divergence",
                help="Abort if replay leaves the original rails (message-count "
                "mismatch) instead of warning.",
            ),
        ] = False,
        replay_only: Annotated[
            bool,
            typer.Option(
                "--replay-only/--no-replay-only",
                help="Rebuild the env via replay and stop at the cut-point "
                "(no live model needed) — useful for testing.",
            ),
        ] = False,
        proxy_mode: Annotated[
            str,
            typer.Option(
                "--proxy-mode",
                help=(
                    "Replay proxy placement: auto, host, or sandbox. Auto uses "
                    "sandbox-local replay for Daytona/Modal and host replay for Docker."
                ),
            ),
        ] = "auto",
    ) -> None:
        """Resume a previous unfinished (timed-out) openhands run to completion."""
        from benchflow.continue_run.orchestrator import continue_run
        from benchflow.continue_run.run_folder import RunFolderError

        _load_env_defaults()

        try:
            result = asyncio.run(
                continue_run(
                    folder,
                    tasks_dir=tasks_dir,
                    model=model,
                    timeout=timeout,
                    output_dir=output,
                    require_timeout=require_timeout,
                    strict_divergence=strict_divergence,
                    replay_only=replay_only,
                    proxy_mode=proxy_mode,
                )
            )
        except RunFolderError as exc:
            # An unusable run folder is a user-facing error: print it on
            # stderr and exit non-zero instead of dumping a traceback.
            typer.secho(f"benchflow continue: {exc}", fg=typer.colors.RED, err=True)
            raise typer.Exit(1) from exc

        typer.secho(
            f"\n✓ continued run written to {result.rollout_dir}", fg=typer.colors.GREEN
        )
        typer.echo(
            f"  replayed {result.n_recorded} recorded turn(s); "
            f"{result.n_live} live turn(s); {result.divergences} divergence(s)"
        )
        if result.rewards is not None:
            typer.echo(f"  rewards: {result.rewards}")
        if result.error:
            # The output folder was still written; surface the error as a warning.
            typer.secho(f"  agent error: {result.error}", fg=typer.colors.YELLOW)

    @app.command("continue-batch")
    def continue_batch_cmd(
        root: Annotated[
            Path,
            typer.Argument(
                help=(
                    "Run folder or directory tree containing timeout run folders "
                    "(config.json + trajectory/llm_trajectory.jsonl)."
                )
            ),
        ],
        tasks_dir: Annotated[
            Path | None,
            typer.Option(
                "--tasks-dir",
                help="Directory holding task sources; required unless recorded task_path exists.",
            ),
        ] = None,
        model: Annotated[
            str | None,
            typer.Option("--model", help="Override live-continuation model."),
        ] = None,
        timeout: Annotated[
            int | None,
            typer.Option("--timeout", help="Wall-clock budget per continuation."),
        ] = None,
        output: Annotated[
            Path | None,
            typer.Option("--output", help="Output jobs dir for continued runs."),
        ] = None,
        concurrency: Annotated[
            int,
            typer.Option(
                "--concurrency",
                # continue_batch() raises ValueError for values below 1; let
                # the CLI reject them up front with a normal usage error.
                min=1,
                help="Maximum number of continuation runs in flight.",
            ),
        ] = 100,
        limit: Annotated[
            int | None,
            typer.Option("--limit", help="Limit discovered timeout folders."),
        ] = None,
        strict_divergence: Annotated[
            bool,
            typer.Option(
                "--strict-divergence/--no-strict-divergence",
                help="Abort a run if replay leaves the original rails.",
            ),
        ] = False,
        proxy_mode: Annotated[
            str,
            typer.Option(
                "--proxy-mode",
                help=(
                    "Replay proxy placement: auto, host, or sandbox. For PR5 "
                    "Daytona runs, use the default auto or sandbox."
                ),
            ),
        ] = "auto",
    ) -> None:
        """Continue all timed-out OpenHands runs under a directory tree."""
        import json

        from benchflow.continue_run.batch import (
            continue_batch,
            discover_timeout_run_folders,
            summarize_batch,
        )

        _load_env_defaults()
        folders = discover_timeout_run_folders(root, limit=limit)
        if not folders:
            # Nothing to do is not an error: warn and exit with status 0.
            typer.secho("No timeout run folders found.", fg=typer.colors.YELLOW)
            return

        typer.echo(
            f"Continuing {len(folders)} timeout run(s) with concurrency={concurrency}"
        )
        results = asyncio.run(
            continue_batch(
                folders,
                concurrency=concurrency,
                tasks_dir=tasks_dir,
                model=model,
                timeout=timeout,
                output_dir=output,
                # Batch mode always insists on timed-out runs.
                require_timeout=True,
                strict_divergence=strict_divergence,
                proxy_mode=proxy_mode,
            )
        )
        summary = summarize_batch(results)
        typer.echo(json.dumps(summary, indent=2))
        if summary["failed"]:
            # Any failed continuation makes the whole batch exit non-zero.
            raise typer.Exit(1)
|
|
@@ -21,6 +21,7 @@ from benchflow._utils.config import (
|
|
|
21
21
|
normalize_sandbox_user,
|
|
22
22
|
)
|
|
23
23
|
from benchflow.agents.registry import parse_agent_spec
|
|
24
|
+
from benchflow.cli.continue_cmd import register_continue
|
|
24
25
|
from benchflow.cli.trace_import import register_tasks_generate
|
|
25
26
|
from benchflow.evaluation import DEFAULT_AGENT, effective_model
|
|
26
27
|
from benchflow.skill_policy import SKILL_MODE_NO_SKILL
|
|
@@ -40,6 +41,9 @@ app = typer.Typer(
|
|
|
40
41
|
no_args_is_help=True,
|
|
41
42
|
)
|
|
42
43
|
|
|
44
|
+
# Standalone `benchflow continue <orig-run-folder>` — resume a timed-out run.
|
|
45
|
+
register_continue(app)
|
|
46
|
+
|
|
43
47
|
|
|
44
48
|
def _version_callback(value: bool) -> None:
|
|
45
49
|
if value:
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Resume a previous, unfinished (timed-out) agent run to completion.
|
|
2
|
+
|
|
3
|
+
``benchflow continue <orig-run-output-folder>`` is a *standalone* tool that does
|
|
4
|
+
**not** touch benchflow's normal run path. It reconstructs a timed-out run's
|
|
5
|
+
exact workspace and agent memory from the recorded trajectory (record-replay),
|
|
6
|
+
then lets the agent continue as if the timeout had simply been larger, and
|
|
7
|
+
writes a new HF-compatible result folder linked to the parent.
|
|
8
|
+
|
|
9
|
+
The mechanism (agreed design):
|
|
10
|
+
|
|
11
|
+
1. Load the original run folder (``config.json`` + ``trajectory/llm_trajectory.jsonl``).
|
|
12
|
+
2. Boot a fresh *pristine* sandbox from the same base image.
|
|
13
|
+
3. Stand up a :class:`~benchflow.continue_run.replay_proxy.ReplayProxy` that
|
|
14
|
+
OpenHands talks to via ``LLM_BASE_URL``. It serves the recorded LLM
|
|
15
|
+
responses **in order**, so the agent re-executes its own past decisions for
|
|
16
|
+
real — rebuilding the byte-exact workspace and its exact internal state.
|
|
17
|
+
4. When the recorded responses run out (the timeout cut-point), the proxy flips
|
|
18
|
+
to the **live** model and the agent continues — no injected prompt.
|
|
19
|
+
5. Re-verify and write a new folder with ``continued_from`` provenance.
|
|
20
|
+
|
|
21
|
+
Only the ``openhands`` agent is supported for now (the LLM-proxy seam this relies
|
|
22
|
+
on is wired for openhands via ``LLM_BASE_URL``).
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from benchflow.continue_run.run_folder import (
|
|
26
|
+
RunFolder,
|
|
27
|
+
RunFolderError,
|
|
28
|
+
load_run_folder,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"RunFolder",
|
|
33
|
+
"RunFolderError",
|
|
34
|
+
"load_run_folder",
|
|
35
|
+
]
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Batch orchestration for continuing many timed-out runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from collections.abc import Awaitable, Callable
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from benchflow.continue_run.orchestrator import ContinueResult, continue_run
|
|
12
|
+
from benchflow.continue_run.run_folder import RunFolderError, load_run_folder
|
|
13
|
+
|
|
14
|
+
ContinueRunner = Callable[..., Awaitable[ContinueResult]]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class BatchContinueResult:
    """Immutable outcome of one source run folder processed by a batch."""

    # Source run folder this outcome belongs to.
    folder: Path
    # Whether the continuation counted as a success.
    ok: bool
    # Result object returned by the runner, when one was produced.
    continued: ContinueResult | None = None
    # Error text describing the failure, if any.
    error: str | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def discover_timeout_run_folders(
    root: str | Path, *, limit: int | None = None
) -> list[Path]:
    """Find OpenHands timeout run folders at or below ``root``.

    Discovery is intentionally artifact-based: every directory containing a
    ``config.json`` is a candidate, and a candidate is kept only when
    ``load_run_folder(folder, require_timeout=True)`` accepts it. Candidates
    that raise ``RunFolderError`` are silently skipped.

    Args:
        root: A run folder or a directory tree to search; ``~`` is expanded.
        limit: Maximum number of folders to return. ``None`` means no limit;
            a value of zero or less yields an empty list.

    Returns:
        Accepted folders in sorted path order, de-duplicated by resolved path.
    """
    # The per-folder limit check below runs only after an append, so without
    # this guard a limit of 0 (or a negative one) would still return a folder.
    if limit is not None and limit <= 0:
        return []

    root_path = Path(root).expanduser()
    candidates = [root_path] if (root_path / "config.json").is_file() else []
    candidates.extend(path.parent for path in root_path.rglob("config.json"))

    folders: list[Path] = []
    seen: set[Path] = set()
    for folder in sorted(candidates):
        # The same directory can be listed twice (root itself plus the rglob
        # hit, or via symlinks); compare on the resolved path.
        resolved = folder.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        try:
            load_run_folder(folder, require_timeout=True)
        except RunFolderError:
            continue
        folders.append(folder)
        if limit is not None and len(folders) >= limit:
            break
    return folders
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
async def continue_batch(
    folders: list[Path],
    *,
    concurrency: int,
    tasks_dir: str | Path | None,
    model: str | None,
    timeout: int | None,
    output_dir: str | Path | None,
    require_timeout: bool = True,
    strict_divergence: bool = False,
    proxy_mode: str = "auto",
    runner: ContinueRunner = continue_run,
) -> list[BatchContinueResult]:
    """Run ``benchflow continue`` over folders with rolling concurrency.

    Every folder gets its own task; a semaphore keeps at most ``concurrency``
    runner calls in flight. A failure in one folder never aborts the others:
    it is captured in that folder's ``BatchContinueResult``.

    Args:
        folders: Source run folders to continue.
        concurrency: Maximum number of simultaneous runner calls (>= 1).
        tasks_dir, model, timeout, output_dir, require_timeout,
        strict_divergence, proxy_mode: Forwarded unchanged to ``runner``.
        runner: Coroutine function invoked per folder; defaults to
            ``continue_run``.

    Returns:
        One result per input folder, in the same order as ``folders``.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    if concurrency < 1:
        raise ValueError("concurrency must be >= 1")
    semaphore = asyncio.Semaphore(concurrency)

    async def _one(folder: Path) -> BatchContinueResult:
        async with semaphore:
            try:
                result = await runner(
                    folder,
                    tasks_dir=tasks_dir,
                    model=model,
                    timeout=timeout,
                    output_dir=output_dir,
                    require_timeout=require_timeout,
                    strict_divergence=strict_divergence,
                    proxy_mode=proxy_mode,
                )
            except Exception as exc:
                # Batch boundary: record the failure instead of propagating.
                # Exceptions raised without arguments (e.g. TimeoutError())
                # stringify to "", so fall back to repr() to keep the
                # recorded error informative.
                return BatchContinueResult(
                    folder=folder, ok=False, error=str(exc) or repr(exc)
                )
            if result.error:
                # The runner finished but reported an error; keep its result
                # so the written output can still be located.
                return BatchContinueResult(
                    folder=folder,
                    ok=False,
                    continued=result,
                    error=result.error,
                )
            return BatchContinueResult(folder=folder, ok=True, continued=result)

    return list(await asyncio.gather(*(_one(folder) for folder in folders)))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def summarize_batch(results: list[BatchContinueResult]) -> dict[str, Any]:
    """Condense batch results into a JSON-serializable report.

    The returned dict carries the overall counts (``total``, ``succeeded``,
    ``failed``), the rollout directories of successful runs (``outputs``), and
    one ``folder``/``output``/``error`` entry per failed run (``errors``).
    """
    succeeded = 0
    outputs: list[str] = []
    errors: list[dict[str, Any]] = []

    for item in results:
        run = item.continued
        if item.ok:
            succeeded += 1
            # A success without a result object contributes no output path.
            if run is not None:
                outputs.append(str(run.rollout_dir))
            continue
        errors.append(
            {
                "folder": str(item.folder),
                "output": None if run is None else str(run.rollout_dir),
                "error": item.error,
            }
        )

    return {
        "total": len(results),
        "succeeded": succeeded,
        "failed": len(errors),
        "outputs": outputs,
        "errors": errors,
    }
|