benchflow 0.5.3.dev902__tar.gz → 0.5.3.dev906__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (399):
  1. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/PKG-INFO +1 -1
  2. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/pyproject.toml +1 -1
  3. benchflow-0.5.3.dev906/tests/test_skillsbench_publish_scrub.py +75 -0
  4. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/.gitignore +0 -0
  5. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/CHANGELOG.md +0 -0
  6. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/LICENSE +0 -0
  7. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/README.md +0 -0
  8. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/__init__.py +0 -0
  9. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_dotenv.py +0 -0
  10. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_paths.py +0 -0
  11. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_run.py +0 -0
  12. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_types.py +0 -0
  13. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/__init__.py +0 -0
  14. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/benchmark_repos.py +0 -0
  15. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/config.py +0 -0
  16. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/evaluation_results.py +0 -0
  17. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/json_safe.py +0 -0
  18. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/learner_memory.py +0 -0
  19. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/result_metadata.py +0 -0
  20. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/reward_events.py +0 -0
  21. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/scoring.py +0 -0
  22. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/source_provenance.py +0 -0
  23. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/task_authoring.py +0 -0
  24. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/_utils/yaml_loader.py +0 -0
  25. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/__init__.py +0 -0
  26. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/client.py +0 -0
  27. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/container_transport.py +0 -0
  28. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/runtime.py +0 -0
  29. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/session.py +0 -0
  30. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/transport.py +0 -0
  31. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/acp/types.py +0 -0
  32. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/__init__.py +0 -0
  33. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/harbor.py +0 -0
  34. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/inbound.py +0 -0
  35. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/inspect_ai.py +0 -0
  36. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/ors.py +0 -0
  37. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/adapters/terminal_bench.py +0 -0
  38. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/__init__.py +0 -0
  39. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/codex_config.py +0 -0
  40. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/credentials.py +0 -0
  41. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/env.py +0 -0
  42. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/errors.py +0 -0
  43. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
  44. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/install.py +0 -0
  45. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  46. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  47. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/protocol.py +0 -0
  48. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/providers.py +0 -0
  49. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/agents/registry.py +0 -0
  50. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/branch.py +0 -0
  51. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/cli/__init__.py +0 -0
  52. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/cli/main.py +0 -0
  53. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/cli/trace_import.py +0 -0
  54. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/compat/__init__.py +0 -0
  55. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/compat/harbor_registry.py +0 -0
  56. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/contracts/__init__.py +0 -0
  57. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/contracts/planes.py +0 -0
  58. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/contracts/user.py +0 -0
  59. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  60. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/instruction.md +0 -0
  61. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/task.toml +0 -0
  62. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/demo_task/tests/test.sh +0 -0
  63. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/diagnostics.py +0 -0
  64. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/environment/__init__.py +0 -0
  65. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/environment/manifest.py +0 -0
  66. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/environment/manifest_env.py +0 -0
  67. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/environment/protocol.py +0 -0
  68. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/environment/readiness.py +0 -0
  69. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/eval_sharding.py +0 -0
  70. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/eval_worker.py +0 -0
  71. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/evaluation.py +0 -0
  72. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/experimental/__init__.py +0 -0
  73. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/__init__.py +0 -0
  74. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/hooks.py +0 -0
  75. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
  76. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/hosted_env.py +0 -0
  77. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/learner_skills.py +0 -0
  78. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/learner_store.py +0 -0
  79. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/metrics.py +0 -0
  80. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/models.py +0 -0
  81. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/monitor.py +0 -0
  82. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/__init__.py +0 -0
  83. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
  84. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_config.py +0 -0
  85. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_logging.py +0 -0
  86. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/litellm_runtime.py +0 -0
  87. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/providers/runtime.py +0 -0
  88. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/py.typed +0 -0
  89. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/README.md +0 -0
  90. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/__init__.py +0 -0
  91. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/builtins.py +0 -0
  92. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/events.py +0 -0
  93. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/file_readers.py +0 -0
  94. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/llm.py +0 -0
  95. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/memory_scorer.py +0 -0
  96. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/node.py +0 -0
  97. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/protocol.py +0 -0
  98. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/rubric.py +0 -0
  99. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/rubric_config.py +0 -0
  100. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rewards/validation.py +0 -0
  101. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rollout.py +0 -0
  102. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rollout_branch.py +0 -0
  103. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/rollout_planes.py +0 -0
  104. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/runtime.py +0 -0
  105. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/__init__.py +0 -0
  106. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_base.py +0 -0
  107. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose.py +0 -0
  108. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
  109. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
  110. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
  111. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
  112. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/_sdk_ops.py +0 -0
  113. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/daytona.py +0 -0
  114. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/docker.py +0 -0
  115. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/lockdown.py +0 -0
  116. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/metadata.py +0 -0
  117. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/modal_impl.py +0 -0
  118. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/process.py +0 -0
  119. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/protocol.py +0 -0
  120. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/services.py +0 -0
  121. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/setup.py +0 -0
  122. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/snapshot.py +0 -0
  123. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sandbox/user.py +0 -0
  124. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/scenes.py +0 -0
  125. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/sdk.py +0 -0
  126. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/self_gen.py +0 -0
  127. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/__init__.py +0 -0
  128. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/_core.py +0 -0
  129. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/gepa_export.py +0 -0
  130. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skill_eval/schema.py +0 -0
  131. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skill_policy.py +0 -0
  132. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/skills.py +0 -0
  133. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/__init__.py +0 -0
  134. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/config.py +0 -0
  135. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/env.py +0 -0
  136. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/paths.py +0 -0
  137. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/task.py +0 -0
  138. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/task/verifier.py +0 -0
  139. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/templates/__init__.py +0 -0
  140. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/templates/judge.py.tmpl +0 -0
  141. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/templates/test.sh.tmpl +0 -0
  142. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/__init__.py +0 -0
  143. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/huggingface.py +0 -0
  144. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/local.py +0 -0
  145. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/models.py +0 -0
  146. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/parsers.py +0 -0
  147. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/traces/task_gen.py +0 -0
  148. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/__init__.py +0 -0
  149. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/_capture.py +0 -0
  150. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/export.py +0 -0
  151. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/metrics.py +0 -0
  152. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/otel.py +0 -0
  153. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/tree.py +0 -0
  154. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/types.py +0 -0
  155. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/trajectories/viewer.py +0 -0
  156. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/src/benchflow/usage_tracking.py +0 -0
  157. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/__init__.py +0 -0
  158. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/agents/__init__.py +0 -0
  159. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/agents/test_protocol.py +0 -0
  160. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/README.md +0 -0
  161. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  162. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
  163. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
  164. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/instruction.md +0 -0
  165. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  166. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/task.toml +0 -0
  167. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  168. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/conformance-results.json +0 -0
  169. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/proof_multi_agent.py +0 -0
  170. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/proof_snapshot.py +0 -0
  171. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/run_conformance.py +0 -0
  172. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
  173. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/conftest.py +0 -0
  174. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/__init__.py +0 -0
  175. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_chibench_manifest.py +0 -0
  176. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_clawsbench_manifest.py +0 -0
  177. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_manifest.py +0 -0
  178. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_manifest_env.py +0 -0
  179. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_protocol.py +0 -0
  180. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/environment/test_readiness.py +0 -0
  181. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  182. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/instruction.md +0 -0
  183. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  184. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/task.toml +0 -0
  185. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/hello-world-task/tests/test.sh +0 -0
  186. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
  187. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
  188. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
  189. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
  190. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
  191. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
  192. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/test_claude.sh +0 -0
  193. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/test_codex.sh +0 -0
  194. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/test_codex_custom_provider.sh +0 -0
  195. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/test_gemini.sh +0 -0
  196. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/test_openclaw.sh +0 -0
  197. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/traces/minimal-claude.jsonl +0 -0
  198. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
  199. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent.py +0 -0
  200. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  201. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
  202. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/fixtures/mock_openai_responses_server.py +0 -0
  203. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/check_adapter_evidence.py +0 -0
  204. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/check_hosted_env_evidence.py +0 -0
  205. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/check_results.py +0 -0
  206. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
  207. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/check_trace_to_task_evidence.py +0 -0
  208. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/claude-agent-acp.yaml +0 -0
  209. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/codex-acp.yaml +0 -0
  210. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/gemini.yaml +0 -0
  211. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
  212. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/openclaw.yaml +0 -0
  213. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/opencode.yaml +0 -0
  214. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/openhands.yaml +0 -0
  215. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/configs/pi-acp.yaml +0 -0
  216. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/run.sh +0 -0
  217. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/run_suite.py +0 -0
  218. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/integration/suites/release.yaml +0 -0
  219. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_acp.py +0 -0
  220. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_acp_capability_advertising.py +0 -0
  221. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_acp_model_config_dispatch.py +0 -0
  222. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_acp_pinned_protocol_guard.py +0 -0
  223. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_acp_setup_failure_propagation.py +0 -0
  224. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_adapter_scripts.py +0 -0
  225. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_adapters.py +0 -0
  226. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_cli.py +0 -0
  227. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_env_resolution.py +0 -0
  228. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_gemini_defaults.py +0 -0
  229. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_idle_timeout_cli.py +0 -0
  230. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_model_decouple.py +0 -0
  231. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_registry.py +0 -0
  232. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_setup.py +0 -0
  233. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_agent_spec.py +0 -0
  234. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_base_install_imports.py +0 -0
  235. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_bedrock_thinking.py +0 -0
  236. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_branch.py +0 -0
  237. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_capture_trajectory.py +0 -0
  238. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_clawsbench_slice.py +0 -0
  239. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_cli_daytona.py +0 -0
  240. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_cli_docs_drift.py +0 -0
  241. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_cli_misc.py +0 -0
  242. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_compat_harbor_registry.py +0 -0
  243. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_config_redaction.py +0 -0
  244. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_connect_as_env.py +0 -0
  245. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_continuallearningbench_adapter.py +0 -0
  246. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_credential_env_scrub.py +0 -0
  247. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_daytona_key.py +0 -0
  248. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_no_host_paths.py +0 -0
  249. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_release_evidence.py +0 -0
  250. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_roadmap.py +0 -0
  251. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_symlink_ingestion.py +0 -0
  252. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_dashboard_sync.py +0 -0
  253. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_daytona_command_polling.py +0 -0
  254. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_daytona_litellm_runtime.py +0 -0
  255. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_daytona_status.py +0 -0
  256. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_docker_prune_scoping.py +0 -0
  257. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_docker_uploads.py +0 -0
  258. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_docs_examples.py +0 -0
  259. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eng50_capabilities.py +0 -0
  260. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_env_setup.py +0 -0
  261. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_environment_manifest_controls.py +0 -0
  262. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_filters_applied.py +0 -0
  263. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_sharding.py +0 -0
  264. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_single_task_summary.py +0 -0
  265. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_source_provenance.py +0 -0
  266. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_worker_retry.py +0 -0
  267. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_eval_zero_task_guard.py +0 -0
  268. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_evaluation_environment_manifest.py +0 -0
  269. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_exclude_tasks.py +0 -0
  270. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_experiments_status.py +0 -0
  271. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_harvey_lab_shim.py +0 -0
  272. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_hf_scores.py +0 -0
  273. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_hilbench_adapter.py +0 -0
  274. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_hosted_env.py +0 -0
  275. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_hosted_env_rollout_contract.py +0 -0
  276. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_inbound_adapter_manifest.py +0 -0
  277. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_inbound_adapters.py +0 -0
  278. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_integration_check_results.py +0 -0
  279. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_integration_run_suite.py +0 -0
  280. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_internet_policy.py +0 -0
  281. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_job.py +0 -0
  282. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_job_sequential_shared.py +0 -0
  283. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_job_sequential_shared_resume.py +0 -0
  284. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_judge_symlink_ingestion.py +0 -0
  285. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_learner_skills.py +0 -0
  286. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_learner_skills_traversal.py +0 -0
  287. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_learner_store.py +0 -0
  288. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_learner_store_persistence.py +0 -0
  289. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_litellm_config.py +0 -0
  290. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_litellm_hardening.py +0 -0
  291. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_litellm_logging.py +0 -0
  292. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_litellm_runtime.py +0 -0
  293. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_litellm_smoke.py +0 -0
  294. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_llm_judge.py +0 -0
  295. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_llm_judge_event_tags.py +0 -0
  296. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_llm_judge_verifier.py +0 -0
  297. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_memory_scorer.py +0 -0
  298. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_metrics.py +0 -0
  299. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_mock_openai_responses_server.py +0 -0
  300. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_monitor_scaffold.py +0 -0
  301. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_native_acp_usage.py +0 -0
  302. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_no_cross_provider_fallback.py +0 -0
  303. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_notification_order_real.py +0 -0
  304. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_oracle.py +0 -0
  305. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_oracle_chokepoint.py +0 -0
  306. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_paths_safe.py +0 -0
  307. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_paths_symlink_helpers.py +0 -0
  308. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_pi_acp_launcher.py +0 -0
  309. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_process.py +0 -0
  310. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_provider_auth_detection.py +0 -0
  311. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_providers.py +0 -0
  312. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_reexport.py +0 -0
  313. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_registry_invariants.py +0 -0
  314. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_release_version.py +0 -0
  315. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_resolve_env_helpers.py +0 -0
  316. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_reward_node.py +0 -0
  317. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_reward_unified_contract.py +0 -0
  318. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rewards.py +0 -0
  319. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rewards_jsonl.py +0 -0
  320. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_architecture.py +0 -0
  321. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_branch.py +0 -0
  322. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_config_path_coercion.py +0 -0
  323. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_environment.py +0 -0
  324. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_import_no_side_effects.py +0 -0
  325. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_on_ask_user_wiring.py +0 -0
  326. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_probe_sandbox_health.py +0 -0
  327. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rollout_upload.py +0 -0
  328. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_rubric_config.py +0 -0
  329. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_runtime.py +0 -0
  330. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_runtime_config_wired.py +0 -0
  331. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_runtime_live_sandbox.py +0 -0
  332. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox.py +0 -0
  333. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_exec_secret_handling.py +0 -0
  334. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_hardening.py +0 -0
  335. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
  336. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_multi_service.py +0 -0
  337. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_protocol.py +0 -0
  338. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_setup.py +0 -0
  339. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_snapshot_contract.py +0 -0
  340. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_upload_symlink.py +0 -0
  341. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sandbox_verifier_workspace.py +0 -0
  342. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_scene.py +0 -0
  343. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_scene_outbox_trial.py +0 -0
  344. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_scene_parallel_group.py +0 -0
  345. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_scene_result_aggregation.py +0 -0
  346. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_scoring.py +0 -0
  347. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sdk_internals.py +0 -0
  348. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_sdk_lockdown.py +0 -0
  349. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_self_gen_cli.py +0 -0
  350. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_self_gen_export_error_channel.py +0 -0
  351. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_self_gen_export_failures.py +0 -0
  352. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_self_gen_orchestration.py +0 -0
  353. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_session_request_permission_dispatch.py +0 -0
  354. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_eval.py +0 -0
  355. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_eval_dryrun.py +0 -0
  356. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_eval_integration.py +0 -0
  357. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_eval_sweep.py +0 -0
  358. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_eval_traversal.py +0 -0
  359. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_invocation_artifacts.py +0 -0
  360. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skill_policy.py +0 -0
  361. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skills.py +0 -0
  362. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skills_dir_agent_home_link.py +0 -0
  363. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skillsbench_harbor_parity.py +0 -0
  364. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_skillsbench_harbor_run_suite.py +0 -0
  365. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_smoke.py +0 -0
  366. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_subscription_auth.py +0 -0
  367. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_task_check_eval_consistency.py +0 -0
  368. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_task_config.py +0 -0
  369. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_task_download.py +0 -0
  370. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_tasks.py +0 -0
  371. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_token_usage_normalization.py +0 -0
  372. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trace_import_cli.py +0 -0
  373. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trace_task_gen_traversal.py +0 -0
  374. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trace_to_task_evidence.py +0 -0
  375. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_traces_huggingface.py +0 -0
  376. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_traces_parsers.py +0 -0
  377. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_traces_task_gen.py +0 -0
  378. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_train_mode_artifact_emission.py +0 -0
  379. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trajectory_integration.py +0 -0
  380. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trajectory_streaming.py +0 -0
  381. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trial_agent_timeout_verify.py +0 -0
  382. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trial_install_agent_timeout.py +0 -0
  383. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_trial_litellm_runtime.py +0 -0
  384. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_usage_litellm.py +0 -0
  385. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_usage_required.py +0 -0
  386. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_usage_tracking.py +0 -0
  387. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_user.py +0 -0
  388. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_verifier_multi_container.py +0 -0
  389. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_verifier_output.py +0 -0
  390. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_verifier_output_freshness.py +0 -0
  391. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_verify.py +0 -0
  392. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_workflow_action_pinning.py +0 -0
  393. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/test_yaml_config.py +0 -0
  394. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/__init__.py +0 -0
  395. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/test_export.py +0 -0
  396. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/test_export_nan_handling.py +0 -0
  397. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/test_redaction.py +0 -0
  398. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/test_step_granularity.py +0 -0
  399. {benchflow-0.5.3.dev902 → benchflow-0.5.3.dev906}/tests/trajectories/test_tree.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.5.3.dev902
3
+ Version: 0.5.3.dev906
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.5.3.dev902"
3
+ version = "0.5.3.dev906"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -0,0 +1,75 @@
1
+ import importlib.util
2
+ import json
3
+ from pathlib import Path
4
+
5
+
6
def _load_publish_module():
    """Import the skillsbench-fill ``publish.py`` script straight from disk.

    The script path is built relative to the directory one level above the
    folder containing this test file. It is loaded by file location under the
    module name ``skillsbench_fill_publish`` (presumably because the
    ``skillsbench-fill`` directory name contains a hyphen and so cannot be
    imported as a regular package -- confirm against the repo layout).

    Returns:
        The executed module object.
    """
    base_dir = Path(__file__).resolve().parents[1]
    script_path = base_dir.joinpath("experiments", "skillsbench-fill", "publish.py")
    spec = importlib.util.spec_from_file_location(
        "skillsbench_fill_publish", script_path
    )
    # Fail loudly here rather than with an AttributeError further down.
    assert spec and spec.loader
    publish = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(publish)
    return publish
18
+
19
+
20
def test_safe_bytes_preserves_token_usage_counters(tmp_path):
    """Token-usage counters must pass through ``safe_bytes`` unredacted.

    Guards Hugging Face PR #4 token-usage recovery against scrub redaction:
    numeric fields whose names contain "token" must keep their values, while
    the real credential stored under ``HUGGING_FACE_TOKEN`` must still be
    replaced by ``[REDACTED]``.
    """
    publish = _load_publish_module()

    # Counters that are expected to come back from the scrubber untouched.
    agent_counters = {
        "n_input_tokens": 123,
        "n_output_tokens": 45,
        "n_cache_read_tokens": 678,
        "n_cache_creation_tokens": 90,
        "total_tokens": 936,
    }
    metric_counters = {
        "total_prompt_tokens": 123,
        "total_completion_tokens": 45,
        "total_cached_tokens": 678,
    }
    payload = {
        "agent_result": {**agent_counters, "usage_source": "provider_response"},
        "final_metrics": {**metric_counters, "total_cost_usd": 1.23},
        "HUGGING_FACE_TOKEN": "hf_abcdefghijklmnopqrstuvwxyz",
    }
    src = tmp_path / "result.json"
    src.write_text(json.dumps(payload))

    scrubbed = json.loads(publish.safe_bytes(src))

    for field, expected in agent_counters.items():
        assert scrubbed["agent_result"][field] == expected
    for field, expected in metric_counters.items():
        assert scrubbed["final_metrics"][field] == expected
    assert scrubbed["HUGGING_FACE_TOKEN"] == "[REDACTED]"
54
+
55
+
56
def test_safe_bytes_normalizes_config_without_leaking_secret_tokens(tmp_path):
    """Config scrubbing must normalize the skills flag and still hide secrets.

    Guards Hugging Face PR #4 token-usage recovery against credential leakage:
    with ``is_config=True`` and ``mode="with"`` the test expects
    ``include_task_skills`` to come back as ``True``, the credential-bearing
    environment entries to be ``[REDACTED]``, and the numeric
    ``BENCHFLOW_MODEL_MAX_TOKENS`` setting to be left intact.
    """
    publish = _load_publish_module()

    agent_env = {
        "AWS_BEARER_TOKEN_BEDROCK": "Bearer abcdefghijklmnop",
        "OPENAI_API_KEY": "sk-abcdefghijklmnopqrstuvwxyz",
        "BENCHFLOW_MODEL_MAX_TOKENS": 8192,
    }
    src = tmp_path / "config.json"
    src.write_text(
        json.dumps({"include_task_skills": False, "agent_env": agent_env})
    )

    raw = publish.safe_bytes(src, is_config=True, mode="with")
    scrubbed = json.loads(raw)

    assert scrubbed["include_task_skills"] is True
    scrubbed_env = scrubbed["agent_env"]
    for secret_name in ("AWS_BEARER_TOKEN_BEDROCK", "OPENAI_API_KEY"):
        assert scrubbed_env[secret_name] == "[REDACTED]"
    # A token *limit* is not a credential and must survive scrubbing.
    assert scrubbed_env["BENCHFLOW_MODEL_MAX_TOKENS"] == 8192