tolokaforge 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (693) hide show
  1. tolokaforge-0.2.0/.cursor/rules/cursor.mdc +19 -0
  2. tolokaforge-0.2.0/.dockerignore +63 -0
  3. tolokaforge-0.2.0/.env.example +24 -0
  4. tolokaforge-0.2.0/.gitattributes +6 -0
  5. tolokaforge-0.2.0/.github/workflows/ci.yml +309 -0
  6. tolokaforge-0.2.0/.github/workflows/claude-review.yml +100 -0
  7. tolokaforge-0.2.0/.github/workflows/publish-adapter-terminal-bench.yml +132 -0
  8. tolokaforge-0.2.0/.github/workflows/publish-tolokaforge.yml +132 -0
  9. tolokaforge-0.2.0/.github/workflows/release-gate.yml +97 -0
  10. tolokaforge-0.2.0/.gitignore +93 -0
  11. tolokaforge-0.2.0/.mcp.json +9 -0
  12. tolokaforge-0.2.0/.pre-commit-config.yaml +22 -0
  13. tolokaforge-0.2.0/.python-version +1 -0
  14. tolokaforge-0.2.0/.roo/mcp.json +21 -0
  15. tolokaforge-0.2.0/.roomodes +34 -0
  16. tolokaforge-0.2.0/.vscode/settings.json +30 -0
  17. tolokaforge-0.2.0/.vscode/tasks.json +72 -0
  18. tolokaforge-0.2.0/AGENTS.md +461 -0
  19. tolokaforge-0.2.0/CHANGELOG.md +79 -0
  20. tolokaforge-0.2.0/CITATION.bib +8 -0
  21. tolokaforge-0.2.0/CITATION.cff +11 -0
  22. tolokaforge-0.2.0/CLAUDE.md +11 -0
  23. tolokaforge-0.2.0/CONTRIBUTING.md +35 -0
  24. tolokaforge-0.2.0/CONTRIBUTORS.md +12 -0
  25. tolokaforge-0.2.0/LICENSE +13 -0
  26. tolokaforge-0.2.0/Makefile +143 -0
  27. tolokaforge-0.2.0/PKG-INFO +230 -0
  28. tolokaforge-0.2.0/README.md +152 -0
  29. tolokaforge-0.2.0/docs/ADAPTERS.md +154 -0
  30. tolokaforge-0.2.0/docs/ADAPTER_ARCHITECTURE.md +220 -0
  31. tolokaforge-0.2.0/docs/ADAPTER_INTERFACE.md +83 -0
  32. tolokaforge-0.2.0/docs/ADD_NEW_MODEL.md +290 -0
  33. tolokaforge-0.2.0/docs/ANALYTICS.md +110 -0
  34. tolokaforge-0.2.0/docs/API.md +71 -0
  35. tolokaforge-0.2.0/docs/BACKEND_STATUS_MATRIX.md +29 -0
  36. tolokaforge-0.2.0/docs/BENCHMARK_BACKEND_DESIGNS.md +68 -0
  37. tolokaforge-0.2.0/docs/BENCHMARK_TYPES.md +41 -0
  38. tolokaforge-0.2.0/docs/BROWSER_TOOLS.md +156 -0
  39. tolokaforge-0.2.0/docs/CONFIG.md +316 -0
  40. tolokaforge-0.2.0/docs/CONVERSION_LAYER.md +214 -0
  41. tolokaforge-0.2.0/docs/DB_SERVICE_API.md +921 -0
  42. tolokaforge-0.2.0/docs/DEEP_RESEARCH.md +45 -0
  43. tolokaforge-0.2.0/docs/FINAL_AUDIT.md +434 -0
  44. tolokaforge-0.2.0/docs/FUTURE_DEVELOPMENT.md +617 -0
  45. tolokaforge-0.2.0/docs/GEMINI_QUIRKS.md +467 -0
  46. tolokaforge-0.2.0/docs/GETTING_STARTED.md +180 -0
  47. tolokaforge-0.2.0/docs/GOLDEN_TRIALS.md +16 -0
  48. tolokaforge-0.2.0/docs/GRADING.md +187 -0
  49. tolokaforge-0.2.0/docs/GRADING_VERIFICATION.md +367 -0
  50. tolokaforge-0.2.0/docs/GRPC_PROTOCOL.md +699 -0
  51. tolokaforge-0.2.0/docs/KNOWLEDGE_REASONING.md +42 -0
  52. tolokaforge-0.2.0/docs/LLM_LAYER.md +733 -0
  53. tolokaforge-0.2.0/docs/LOGGING.md +667 -0
  54. tolokaforge-0.2.0/docs/MCP_INTEGRATION.md +22 -0
  55. tolokaforge-0.2.0/docs/NATIVE_ADAPTER.md +401 -0
  56. tolokaforge-0.2.0/docs/NOVA_INTEGRATION.md +155 -0
  57. tolokaforge-0.2.0/docs/OUTPUT_FORMAT.md +433 -0
  58. tolokaforge-0.2.0/docs/PERFORMANCE.md +94 -0
  59. tolokaforge-0.2.0/docs/PYTHON_PACKAGE.md +110 -0
  60. tolokaforge-0.2.0/docs/REFERENCE.md +428 -0
  61. tolokaforge-0.2.0/docs/RUNNER.md +168 -0
  62. tolokaforge-0.2.0/docs/SECURITY.md +140 -0
  63. tolokaforge-0.2.0/docs/TASKS.md +197 -0
  64. tolokaforge-0.2.0/docs/TASK_DESCRIPTION_SCHEMA.md +698 -0
  65. tolokaforge-0.2.0/docs/TASK_PACKS.md +104 -0
  66. tolokaforge-0.2.0/docs/TOOLS.md +57 -0
  67. tolokaforge-0.2.0/docs/TROUBLESHOOTING.md +54 -0
  68. tolokaforge-0.2.0/docs/TYPESENSE_INTEGRATION.md +421 -0
  69. tolokaforge-0.2.0/docs/custom_checks.md +18 -0
  70. tolokaforge-0.2.0/examples/README.md +35 -0
  71. tolokaforge-0.2.0/examples/native/browser_task/README.md +57 -0
  72. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/fixtures/policy_brief.txt +9 -0
  73. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/grading.yaml +36 -0
  74. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/index.html +12 -0
  75. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/order_7712.html +10 -0
  76. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/policy_cancellation.html +8 -0
  77. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/policy_enterprise_addendum.html +7 -0
  78. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/policy_refunds.html +8 -0
  79. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_01/task.yaml +30 -0
  80. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/fixtures/runbook_notes.txt +5 -0
  81. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/grading.yaml +36 -0
  82. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/incident_443_ticket.html +8 -0
  83. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/index.html +10 -0
  84. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/runbook_base.html +7 -0
  85. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/runbook_emergency_addendum.html +7 -0
  86. tolokaforge-0.2.0/examples/native/browser_task/dataset/tasks/browser/browser_public_example_02/task.yaml +30 -0
  87. tolokaforge-0.2.0/examples/native/browser_task/run_config.yaml +24 -0
  88. tolokaforge-0.2.0/examples/native/coding/README.md +22 -0
  89. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_01/fixtures/README.md +11 -0
  90. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_01/fixtures/buggy_math.py +7 -0
  91. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_01/grading.yaml +37 -0
  92. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_01/task.yaml +27 -0
  93. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_02/fixtures/README.md +17 -0
  94. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_02/fixtures/data_parser.py +3 -0
  95. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_02/grading.yaml +40 -0
  96. tolokaforge-0.2.0/examples/native/coding/dataset/tasks/coding/coding_public_example_02/task.yaml +27 -0
  97. tolokaforge-0.2.0/examples/native/coding/run_config.yaml +25 -0
  98. tolokaforge-0.2.0/examples/native/native_shared_domain/README.md +46 -0
  99. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/domain.yaml +18 -0
  100. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/mcp_server.py +18 -0
  101. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/models.py +17 -0
  102. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/system_prompt.md +20 -0
  103. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/tools/__init__.py +10 -0
  104. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/_shared/tools/notes.py +19 -0
  105. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/fixtures/tools.json +59 -0
  106. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/add_first_note/grading.yaml +22 -0
  107. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/add_first_note/initial_state.json +3 -0
  108. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/add_first_note/task.yaml +20 -0
  109. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/recall_existing_note/grading.yaml +18 -0
  110. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/recall_existing_note/initial_state.json +14 -0
  111. tolokaforge-0.2.0/examples/native/native_shared_domain/dataset/notes/testcases/recall_existing_note/task.yaml +21 -0
  112. tolokaforge-0.2.0/examples/native/native_shared_domain/run_config.yaml +25 -0
  113. tolokaforge-0.2.0/examples/native/tool_use/README.md +22 -0
  114. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_01/fixtures/customer_ticket.json +8 -0
  115. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_01/grading.yaml +42 -0
  116. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_01/initial_state.json +13 -0
  117. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_01/task.yaml +26 -0
  118. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_02/fixtures/account_policy.md +5 -0
  119. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_02/grading.yaml +41 -0
  120. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_02/initial_state.json +11 -0
  121. tolokaforge-0.2.0/examples/native/tool_use/dataset/tasks/tool_use/tool_use_public_example_02/task.yaml +26 -0
  122. tolokaforge-0.2.0/examples/native/tool_use/run_config.yaml +25 -0
  123. tolokaforge-0.2.0/examples/terminal_bench/README.md +84 -0
  124. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/README.md +78 -0
  125. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/docker-compose.yaml +20 -0
  126. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/Dockerfile +50 -0
  127. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/docs/pipeline_design.md +87 -0
  128. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/entrypoint.sh +13 -0
  129. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/generate_data.py +249 -0
  130. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/init.sql +87 -0
  131. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/cluster.py +46 -0
  132. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/config.py +16 -0
  133. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/extract.py +59 -0
  134. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/features.py +72 -0
  135. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/main.py +75 -0
  136. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/environment/pipeline/report.py +95 -0
  137. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/instruction.md +58 -0
  138. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/task.toml +17 -0
  139. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/task.yaml +73 -0
  140. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/tests/test.sh +53 -0
  141. tolokaforge-0.2.0/examples/terminal_bench/fix-airline-segmentation/tests/test_segmentation.py +304 -0
  142. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/README.md +75 -0
  143. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/docker-compose.yaml +20 -0
  144. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/Dockerfile +41 -0
  145. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/aggregator.py +119 -0
  146. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/config.py +9 -0
  147. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/database.py +15 -0
  148. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/fee_utils.py +91 -0
  149. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/legacy_fee_config.json +12 -0
  150. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/main.py +13 -0
  151. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/models.py +70 -0
  152. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/routers/__init__.py +0 -0
  153. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/routers/holds.py +146 -0
  154. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/app/routers/reports.py +28 -0
  155. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/entrypoint.sh +17 -0
  156. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/environment/init.sql +279 -0
  157. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/instruction.md +132 -0
  158. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/run-tests.sh +76 -0
  159. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/task.toml +17 -0
  160. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/task.yaml +121 -0
  161. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/tests/test.sh +76 -0
  162. tolokaforge-0.2.0/examples/terminal_bench/fix-billing-holds/tests/test_billing.py +557 -0
  163. tolokaforge-0.2.0/examples/terminal_bench/run_airline_segmentation.yaml +30 -0
  164. tolokaforge-0.2.0/examples/terminal_bench/run_billing_holds.yaml +30 -0
  165. tolokaforge-0.2.0/examples/terminal_bench/run_config.yaml +37 -0
  166. tolokaforge-0.2.0/pyproject.toml +315 -0
  167. tolokaforge-0.2.0/scripts/README.md +39 -0
  168. tolokaforge-0.2.0/scripts/common.sh +123 -0
  169. tolokaforge-0.2.0/scripts/generate_task_pack_compose_override.py +77 -0
  170. tolokaforge-0.2.0/scripts/setup/create_python_venv.sh +101 -0
  171. tolokaforge-0.2.0/scripts/setup/init_git_lfs.sh +32 -0
  172. tolokaforge-0.2.0/scripts/setup/setup_env.sh +171 -0
  173. tolokaforge-0.2.0/scripts/tests/smoke.sh +78 -0
  174. tolokaforge-0.2.0/scripts/tests/task_pack_docker_smoke.sh +124 -0
  175. tolokaforge-0.2.0/scripts/with_env.sh +44 -0
  176. tolokaforge-0.2.0/scripts/with_profile.sh +32 -0
  177. tolokaforge-0.2.0/tests/AGENTS.md +79 -0
  178. tolokaforge-0.2.0/tests/README.md +155 -0
  179. tolokaforge-0.2.0/tests/__init__.py +1 -0
  180. tolokaforge-0.2.0/tests/canonical/README.md +61 -0
  181. tolokaforge-0.2.0/tests/canonical/__init__.py +0 -0
  182. tolokaforge-0.2.0/tests/canonical/conftest.py +63 -0
  183. tolokaforge-0.2.0/tests/canonical/snapshots/.gitkeep +0 -0
  184. tolokaforge-0.2.0/tests/canonical/snapshots/golden_set_bug_reproduction/food_delivery_2_hash_comparison.json +6 -0
  185. tolokaforge-0.2.0/tests/canonical/snapshots/golden_set_execution/food_delivery_2_golden_execution.json +4 -0
  186. tolokaforge-0.2.0/tests/canonical/snapshots/grading_state_calc/fail_result.json +4 -0
  187. tolokaforge-0.2.0/tests/canonical/snapshots/grading_state_calc/pass_result.json +4 -0
  188. tolokaforge-0.2.0/tests/canonical/snapshots/grading_transcript_calc/fail_result.json +4 -0
  189. tolokaforge-0.2.0/tests/canonical/snapshots/grading_transcript_calc/pass_result.json +4 -0
  190. tolokaforge-0.2.0/tests/canonical/snapshots/native_browser_basic/grading_config.json +32 -0
  191. tolokaforge-0.2.0/tests/canonical/snapshots/native_browser_basic/task_config.json +52 -0
  192. tolokaforge-0.2.0/tests/canonical/snapshots/native_calc_basic/grading_config.json +31 -0
  193. tolokaforge-0.2.0/tests/canonical/snapshots/native_calc_basic/task_config.json +51 -0
  194. tolokaforge-0.2.0/tests/canonical/snapshots/native_example_domain_case_a/bundle_artifact_keys.json +10 -0
  195. tolokaforge-0.2.0/tests/canonical/snapshots/native_example_domain_case_a/grading_config.json +24 -0
  196. tolokaforge-0.2.0/tests/canonical/snapshots/native_example_domain_case_a/task_config.json +44 -0
  197. tolokaforge-0.2.0/tests/canonical/snapshots/native_shop_orders_02/grading_config.json +135 -0
  198. tolokaforge-0.2.0/tests/canonical/snapshots/native_shop_orders_02/initial_state_tables.json +46 -0
  199. tolokaforge-0.2.0/tests/canonical/snapshots/native_shop_orders_02/task_config.json +53 -0
  200. tolokaforge-0.2.0/tests/canonical/snapshots/native_shop_orders_02/tool_schemas.json +145 -0
  201. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/anthropic.json +373 -0
  202. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/aws_nova.json +373 -0
  203. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/default.json +373 -0
  204. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/openai_gpt5.json +379 -0
  205. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/qwen.json +373 -0
  206. tolokaforge-0.2.0/tests/canonical/snapshots/sanitizer_contract/xai_grok.json +379 -0
  207. tolokaforge-0.2.0/tests/canonical/snapshots/schema_policy_dict_map_hints_tau/dict_map_hints.json +3 -0
  208. tolokaforge-0.2.0/tests/canonical/snapshots/schema_policy_strict_tau/strict_transform.json +91 -0
  209. tolokaforge-0.2.0/tests/canonical/snapshots/tau_conversion/task_config.json +28 -0
  210. tolokaforge-0.2.0/tests/canonical/snapshots/tbench_echo_hello/grading_config.json +13 -0
  211. tolokaforge-0.2.0/tests/canonical/snapshots/tbench_echo_hello/task_config.json +51 -0
  212. tolokaforge-0.2.0/tests/canonical/snapshots/tbench_echo_hello/task_description.json +98 -0
  213. tolokaforge-0.2.0/tests/canonical/snapshots/tbench_echo_hello/tool_schemas.json +44 -0
  214. tolokaforge-0.2.0/tests/canonical/snapshots/trajectory_reasoning/trajectory.yaml +36 -0
  215. tolokaforge-0.2.0/tests/canonical/test_cache_policy_preset_routing.py +73 -0
  216. tolokaforge-0.2.0/tests/canonical/test_capability_registry.py +194 -0
  217. tolokaforge-0.2.0/tests/canonical/test_content_policy_filler_routing.py +89 -0
  218. tolokaforge-0.2.0/tests/canonical/test_cost_extraction_canon.py +314 -0
  219. tolokaforge-0.2.0/tests/canonical/test_custom_checks_canon.py +324 -0
  220. tolokaforge-0.2.0/tests/canonical/test_grading_canon.py +135 -0
  221. tolokaforge-0.2.0/tests/canonical/test_grading_pipeline_canon.py +114 -0
  222. tolokaforge-0.2.0/tests/canonical/test_native_adapter_canon.py +360 -0
  223. tolokaforge-0.2.0/tests/canonical/test_output_layout.py +351 -0
  224. tolokaforge-0.2.0/tests/canonical/test_sanitizer_contract.py +365 -0
  225. tolokaforge-0.2.0/tests/canonical/test_schema_policies.py +126 -0
  226. tolokaforge-0.2.0/tests/canonical/test_shop_orders_02_behavior_canon.py +984 -0
  227. tolokaforge-0.2.0/tests/canonical/test_terminal_bench_adapter_canon.py +147 -0
  228. tolokaforge-0.2.0/tests/canonical/test_trajectory_reasoning_snapshot.py +156 -0
  229. tolokaforge-0.2.0/tests/conftest.py +111 -0
  230. tolokaforge-0.2.0/tests/data/__init__.py +0 -0
  231. tolokaforge-0.2.0/tests/data/configs/tau_retail_mini.yaml +33 -0
  232. tolokaforge-0.2.0/tests/data/projects/README.md +230 -0
  233. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/check_helpers.py +190 -0
  234. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/cities.json +62 -0
  235. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/combined_initial_state.json +11267 -0
  236. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/menu_item_categories.json +92 -0
  237. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/menu_items.json +2891 -0
  238. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/menu_items_per_cuisine.json +462 -0
  239. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/money_back_requests.json +1 -0
  240. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/orders.json +4467 -0
  241. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/restaurant_names_desc.json +462 -0
  242. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/restaurant_rates.json +284 -0
  243. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/restaurants.json +1698 -0
  244. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/data/users.json +846 -0
  245. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/mcp_server.py +131 -0
  246. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/env.yaml +3 -0
  247. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/grade.yaml +3 -0
  248. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/logs.yaml +3 -0
  249. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/metrics.yaml +3 -0
  250. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/task.yaml +3 -0
  251. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/output/trials/051fa6cb-a29e-4a0d-9ccf-e0f95802eee5/0/trajectory.yaml +3 -0
  252. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tasks/order_modify_with_checks/checks.py +255 -0
  253. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tasks/order_modify_with_checks/grading.yaml +38 -0
  254. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tasks/order_modify_with_checks/task.yaml +84 -0
  255. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tasks/order_six_items_golden/grading.yaml +52 -0
  256. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tasks/order_six_items_golden/task.yaml +88 -0
  257. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/__init__.py +4 -0
  258. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/add_payment_method.py +106 -0
  259. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/add_restaurant_rating.py +99 -0
  260. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/calculate.py +36 -0
  261. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/cancel_order.py +76 -0
  262. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/change_primary_payment_method.py +81 -0
  263. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/create_money_back_request.py +103 -0
  264. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/create_order.py +274 -0
  265. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/data/__init__.py +36 -0
  266. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/data/constants.py +81 -0
  267. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/data/schemas.py +201 -0
  268. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/delete_money_back_request.py +69 -0
  269. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/delete_payment_method.py +84 -0
  270. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/delete_restaurant_rating.py +92 -0
  271. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_order_details.py +47 -0
  272. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_restaurant_details.py +87 -0
  273. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_restaurant_rating.py +86 -0
  274. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_restaurants_list.py +100 -0
  275. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_user_details.py +35 -0
  276. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_user_money_back_requests.py +100 -0
  277. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/get_user_payments_history.py +103 -0
  278. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/lookup_for_city_id.py +54 -0
  279. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/modify_order.py +219 -0
  280. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/think.py +31 -0
  281. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/tool_base.py +7 -0
  282. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/tools_helpers.py +40 -0
  283. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/transfer_to_human_agents.py +36 -0
  284. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/update_user_address.py +80 -0
  285. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tau_tools/update_user_details.py +111 -0
  286. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/tools_helpers.py +40 -0
  287. tolokaforge-0.2.0/tests/data/projects/food_delivery_2/wiki.md +153 -0
  288. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/data/__init__.py +66 -0
  289. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/env.py +5 -0
  290. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/aggregate.json +3 -0
  291. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/per_task_metrics.json +3 -0
  292. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/run_state.json +3 -0
  293. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/env.yaml +3 -0
  294. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/grade.yaml +3 -0
  295. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/logs.yaml +3 -0
  296. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/metrics.yaml +3 -0
  297. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/task.yaml +3 -0
  298. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/0f3b1ff7/0/trajectory.yaml +3 -0
  299. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/env.yaml +3 -0
  300. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/grade.yaml +3 -0
  301. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/logs.yaml +3 -0
  302. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/metrics.yaml +3 -0
  303. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/task.yaml +3 -0
  304. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_001/0/trajectory.yaml +3 -0
  305. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/env.yaml +3 -0
  306. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/grade.yaml +3 -0
  307. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/logs.yaml +3 -0
  308. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/metrics.yaml +3 -0
  309. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/task.yaml +3 -0
  310. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/output/trials/test_002/0/trajectory.yaml +3 -0
  311. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/tasks_test.py +60 -0
  312. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/tools/__init__.py +115 -0
  313. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/types_local.py +26 -0
  314. tolokaforge-0.2.0/tests/data/projects/tau_retail_mini/wiki.md +17 -0
  315. tolokaforge-0.2.0/tests/data/tasks/bad_mobile/grading.yaml +2 -0
  316. tolokaforge-0.2.0/tests/data/tasks/bad_mobile/task.yaml +16 -0
  317. tolokaforge-0.2.0/tests/data/tasks/browser_basic/grading.yaml +29 -0
  318. tolokaforge-0.2.0/tests/data/tasks/browser_basic/task.yaml +44 -0
  319. tolokaforge-0.2.0/tests/data/tasks/calc_basic/grading.yaml +28 -0
  320. tolokaforge-0.2.0/tests/data/tasks/calc_basic/task.yaml +29 -0
  321. tolokaforge-0.2.0/tests/data/tasks/calc_custom_checks/checks.py +84 -0
  322. tolokaforge-0.2.0/tests/data/tasks/calc_custom_checks/grading.yaml +26 -0
  323. tolokaforge-0.2.0/tests/data/tasks/calc_custom_checks/task.yaml +30 -0
  324. tolokaforge-0.2.0/tests/data/tasks/example_domain/_shared/domain.yaml +18 -0
  325. tolokaforge-0.2.0/tests/data/tasks/example_domain/_shared/mcp_server.py +7 -0
  326. tolokaforge-0.2.0/tests/data/tasks/example_domain/_shared/system_prompt.md +4 -0
  327. tolokaforge-0.2.0/tests/data/tasks/example_domain/testcases/case_a/grading.yaml +11 -0
  328. tolokaforge-0.2.0/tests/data/tasks/example_domain/testcases/case_a/initial_state.json +3 -0
  329. tolokaforge-0.2.0/tests/data/tasks/example_domain/testcases/case_a/task.yaml +11 -0
  330. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/fixtures/tools.json +101 -0
  331. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/grading.yaml +127 -0
  332. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/initial_state.json +41 -0
  333. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/mcp_server.py +180 -0
  334. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/system_prompt.md +44 -0
  335. tolokaforge-0.2.0/tests/data/tasks/shop_orders_02/task.yaml +46 -0
  336. tolokaforge-0.2.0/tests/data/tasks/synth_mobile_01/grading.yaml +2 -0
  337. tolokaforge-0.2.0/tests/data/tasks/synth_mobile_01/initial_state.json +3 -0
  338. tolokaforge-0.2.0/tests/data/tasks/synth_mobile_01/task.yaml +23 -0
  339. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/docker-compose.yaml +12 -0
  340. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/environment/Dockerfile +3 -0
  341. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/run-tests.sh +24 -0
  342. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/task.toml +16 -0
  343. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/task.yaml +2 -0
  344. tolokaforge-0.2.0/tests/data/terminal_bench_tasks/echo-hello/tests/test_outputs.py +11 -0
  345. tolokaforge-0.2.0/tests/integration/__init__.py +1 -0
  346. tolokaforge-0.2.0/tests/integration/docker/__init__.py +8 -0
  347. tolokaforge-0.2.0/tests/integration/docker/conftest.py +74 -0
  348. tolokaforge-0.2.0/tests/integration/docker/test_docker_build_context.py +103 -0
  349. tolokaforge-0.2.0/tests/integration/docker/test_docker_caching.py +254 -0
  350. tolokaforge-0.2.0/tests/integration/docker/test_docker_integration.py +547 -0
  351. tolokaforge-0.2.0/tests/integration/docker/test_docker_stack.py +278 -0
  352. tolokaforge-0.2.0/tests/integration/llm/__init__.py +14 -0
  353. tolokaforge-0.2.0/tests/integration/llm/_capability.py +388 -0
  354. tolokaforge-0.2.0/tests/integration/llm/conftest.py +122 -0
  355. tolokaforge-0.2.0/tests/integration/llm/registry.py +893 -0
  356. tolokaforge-0.2.0/tests/integration/llm/test_basic_completion.py +42 -0
  357. tolokaforge-0.2.0/tests/integration/llm/test_cost_populated.py +73 -0
  358. tolokaforge-0.2.0/tests/integration/llm/test_decimal_field_tool_call.py +108 -0
  359. tolokaforge-0.2.0/tests/integration/llm/test_dict_map_tool_call.py +123 -0
  360. tolokaforge-0.2.0/tests/integration/llm/test_discriminated_union_tool_call.py +327 -0
  361. tolokaforge-0.2.0/tests/integration/llm/test_enum_slash_tolerance.py +132 -0
  362. tolokaforge-0.2.0/tests/integration/llm/test_enum_slash_tolerance_unsupported_ratchet.py +127 -0
  363. tolokaforge-0.2.0/tests/integration/llm/test_gemini_placeholder_signature_replay.py +362 -0
  364. tolokaforge-0.2.0/tests/integration/llm/test_implicit_prompt_caching.py +168 -0
  365. tolokaforge-0.2.0/tests/integration/llm/test_implicit_prompt_caching_unsupported_ratchet.py +184 -0
  366. tolokaforge-0.2.0/tests/integration/llm/test_lexical_tool_invention.py +145 -0
  367. tolokaforge-0.2.0/tests/integration/llm/test_multi_turn_error_recovery.py +232 -0
  368. tolokaforge-0.2.0/tests/integration/llm/test_multi_turn_tool_use.py +101 -0
  369. tolokaforge-0.2.0/tests/integration/llm/test_nova_api.py +549 -0
  370. tolokaforge-0.2.0/tests/integration/llm/test_progress_after_success.py +268 -0
  371. tolokaforge-0.2.0/tests/integration/llm/test_prompt_caching.py +155 -0
  372. tolokaforge-0.2.0/tests/integration/llm/test_re2_pattern_tolerance.py +142 -0
  373. tolokaforge-0.2.0/tests/integration/llm/test_re2_pattern_tolerance_unsupported_ratchet.py +134 -0
  374. tolokaforge-0.2.0/tests/integration/llm/test_required_fields_complete.py +154 -0
  375. tolokaforge-0.2.0/tests/integration/llm/test_simple_tool_call.py +100 -0
  376. tolokaforge-0.2.0/tests/integration/llm/test_thinking_emits_blocks.py +88 -0
  377. tolokaforge-0.2.0/tests/integration/llm/test_thinking_replay_roundtrip.py +185 -0
  378. tolokaforge-0.2.0/tests/integration/llm/test_tool_name_discipline.py +133 -0
  379. tolokaforge-0.2.0/tests/integration/llm/test_unsigned_thinking_replay.py +179 -0
  380. tolokaforge-0.2.0/tests/integration/llm/test_usage_metrics_populated.py +51 -0
  381. tolokaforge-0.2.0/tests/integration/test_browser_tool.py +109 -0
  382. tolokaforge-0.2.0/tests/integration/test_docker_grading.py +39 -0
  383. tolokaforge-0.2.0/tests/integration/test_docker_services.py +182 -0
  384. tolokaforge-0.2.0/tests/integration/test_run_queue_postgres.py +78 -0
  385. tolokaforge-0.2.0/tests/integration/test_runner_cleanup_trial_grpc.py +109 -0
  386. tolokaforge-0.2.0/tests/integration/test_security.py +84 -0
  387. tolokaforge-0.2.0/tests/integration/test_typesense_lifecycle.py +50 -0
  388. tolokaforge-0.2.0/tests/unit/__init__.py +1 -0
  389. tolokaforge-0.2.0/tests/unit/adapters/__init__.py +1 -0
  390. tolokaforge-0.2.0/tests/unit/conftest.py +51 -0
  391. tolokaforge-0.2.0/tests/unit/grading/__init__.py +1 -0
  392. tolokaforge-0.2.0/tests/unit/grading/test_custom_checks.py +721 -0
  393. tolokaforge-0.2.0/tests/unit/grading/test_custom_checks_runner.py +281 -0
  394. tolokaforge-0.2.0/tests/unit/grading/test_evaluators.py +333 -0
  395. tolokaforge-0.2.0/tests/unit/grading/test_fuzzy_compare.py +200 -0
  396. tolokaforge-0.2.0/tests/unit/grading/test_grading_correctness.py +639 -0
  397. tolokaforge-0.2.0/tests/unit/grading/test_hash.py +144 -0
  398. tolokaforge-0.2.0/tests/unit/grading/test_judge.py +802 -0
  399. tolokaforge-0.2.0/tests/unit/grading/test_llm_judge_runner.py +241 -0
  400. tolokaforge-0.2.0/tests/unit/grading/test_state_checks.py +319 -0
  401. tolokaforge-0.2.0/tests/unit/grading/test_transcript.py +221 -0
  402. tolokaforge-0.2.0/tests/unit/llm/__init__.py +0 -0
  403. tolokaforge-0.2.0/tests/unit/llm/fixtures/__init__.py +4 -0
  404. tolokaforge-0.2.0/tests/unit/llm/fixtures/anthropic_display_omitted_response.json +19 -0
  405. tolokaforge-0.2.0/tests/unit/llm/fixtures/anthropic_thinking_response.json +27 -0
  406. tolokaforge-0.2.0/tests/unit/llm/fixtures/anthropic_usage_with_cache.json +15 -0
  407. tolokaforge-0.2.0/tests/unit/llm/fixtures/minimal_usage.json +7 -0
  408. tolokaforge-0.2.0/tests/unit/llm/fixtures/openai_gpt5_reasoning_response.json +17 -0
  409. tolokaforge-0.2.0/tests/unit/llm/fixtures/openai_gpt5_usage_with_reasoning.json +13 -0
  410. tolokaforge-0.2.0/tests/unit/llm/fixtures/openrouter_anthropic_reasoning_response.json +22 -0
  411. tolokaforge-0.2.0/tests/unit/llm/fixtures/openrouter_anthropic_usage.json +17 -0
  412. tolokaforge-0.2.0/tests/unit/llm/fixtures/openrouter_anthropic_usage_real.json +27 -0
  413. tolokaforge-0.2.0/tests/unit/llm/fixtures/openrouter_gemini_reasoning_encrypted_response.json +17 -0
  414. tolokaforge-0.2.0/tests/unit/llm/fixtures/openrouter_gemini_reasoning_text_response.json +21 -0
  415. tolokaforge-0.2.0/tests/unit/llm/test_anthropic_claude_47_preset.py +63 -0
  416. tolokaforge-0.2.0/tests/unit/llm/test_cache_policy_anthropic.py +303 -0
  417. tolokaforge-0.2.0/tests/unit/llm/test_capability_certificate.py +127 -0
  418. tolokaforge-0.2.0/tests/unit/llm/test_detect_dict_maps.py +181 -0
  419. tolokaforge-0.2.0/tests/unit/llm/test_json_coerce_response.py +247 -0
  420. tolokaforge-0.2.0/tests/unit/llm/test_message_replay.py +426 -0
  421. tolokaforge-0.2.0/tests/unit/llm/test_model_config_reasoning_rejects_string.py +68 -0
  422. tolokaforge-0.2.0/tests/unit/llm/test_openrouter_dict_stringify_recovery_preset.py +82 -0
  423. tolokaforge-0.2.0/tests/unit/llm/test_params_policy_budget.py +310 -0
  424. tolokaforge-0.2.0/tests/unit/llm/test_params_policy_unsupported_effort.py +176 -0
  425. tolokaforge-0.2.0/tests/unit/llm/test_preset_fingerprint.py +187 -0
  426. tolokaforge-0.2.0/tests/unit/llm/test_preset_overrides.py +69 -0
  427. tolokaforge-0.2.0/tests/unit/llm/test_qwen_dict_map_hints.py +225 -0
  428. tolokaforge-0.2.0/tests/unit/llm/test_qwen_preset.py +94 -0
  429. tolokaforge-0.2.0/tests/unit/llm/test_reasoning_codec.py +39 -0
  430. tolokaforge-0.2.0/tests/unit/llm/test_reasoning_codec_anthropic.py +469 -0
  431. tolokaforge-0.2.0/tests/unit/llm/test_reasoning_codec_gemini.py +609 -0
  432. tolokaforge-0.2.0/tests/unit/llm/test_reasoning_codec_openai.py +95 -0
  433. tolokaforge-0.2.0/tests/unit/llm/test_reasoning_dataclasses.py +124 -0
  434. tolokaforge-0.2.0/tests/unit/llm/test_response_policy_empty_container_coercion.py +157 -0
  435. tolokaforge-0.2.0/tests/unit/llm/test_response_policy_param_types_wiring.py +245 -0
  436. tolokaforge-0.2.0/tests/unit/llm/test_schema_sanitizer.py +104 -0
  437. tolokaforge-0.2.0/tests/unit/llm/test_schema_sanitizer_position_aware.py +415 -0
  438. tolokaforge-0.2.0/tests/unit/llm/test_schema_sanitizer_strict.py +325 -0
  439. tolokaforge-0.2.0/tests/unit/llm/test_synthetic_envelope_detection.py +358 -0
  440. tolokaforge-0.2.0/tests/unit/llm/test_usage.py +335 -0
  441. tolokaforge-0.2.0/tests/unit/llm/test_usage_pipeline_e2e.py +156 -0
  442. tolokaforge-0.2.0/tests/unit/secrets/__init__.py +0 -0
  443. tolokaforge-0.2.0/tests/unit/secrets/test_known_values.py +91 -0
  444. tolokaforge-0.2.0/tests/unit/secrets/test_log_filter.py +436 -0
  445. tolokaforge-0.2.0/tests/unit/secrets/test_no_raw_secret_access.py +103 -0
  446. tolokaforge-0.2.0/tests/unit/secrets/test_singleton.py +88 -0
  447. tolokaforge-0.2.0/tests/unit/test_adapters.py +225 -0
  448. tolokaforge-0.2.0/tests/unit/test_assemble_result_per_call_record.py +202 -0
  449. tolokaforge-0.2.0/tests/unit/test_browser_tool.py +159 -0
  450. tolokaforge-0.2.0/tests/unit/test_builtin_generic_wrapper.py +117 -0
  451. tolokaforge-0.2.0/tests/unit/test_builtin_registry.py +91 -0
  452. tolokaforge-0.2.0/tests/unit/test_builtin_tool_url_env_fallback.py +90 -0
  453. tolokaforge-0.2.0/tests/unit/test_builtin_tool_wrapper.py +212 -0
  454. tolokaforge-0.2.0/tests/unit/test_calculator_tool.py +165 -0
  455. tolokaforge-0.2.0/tests/unit/test_cli_commands.py +517 -0
  456. tolokaforge-0.2.0/tests/unit/test_cli_status.py +98 -0
  457. tolokaforge-0.2.0/tests/unit/test_config_validator.py +271 -0
  458. tolokaforge-0.2.0/tests/unit/test_conftest_docker_extra.py +32 -0
  459. tolokaforge-0.2.0/tests/unit/test_core_stack_playwright.py +40 -0
  460. tolokaforge-0.2.0/tests/unit/test_diff.py +147 -0
  461. tolokaforge-0.2.0/tests/unit/test_docker_adapter_cleanup_trial.py +37 -0
  462. tolokaforge-0.2.0/tests/unit/test_docker_build_context.py +280 -0
  463. tolokaforge-0.2.0/tests/unit/test_docker_runtime_grade_sentinel.py +31 -0
  464. tolokaforge-0.2.0/tests/unit/test_dockerfile_paths.py +73 -0
  465. tolokaforge-0.2.0/tests/unit/test_env_state_url_defaults.py +47 -0
  466. tolokaforge-0.2.0/tests/unit/test_executor_service_registry.py +135 -0
  467. tolokaforge-0.2.0/tests/unit/test_failure_attribution.py +138 -0
  468. tolokaforge-0.2.0/tests/unit/test_file_and_bash_tools.py +39 -0
  469. tolokaforge-0.2.0/tests/unit/test_full_stack_kwargs.py +92 -0
  470. tolokaforge-0.2.0/tests/unit/test_gemini_preset_routing.py +62 -0
  471. tolokaforge-0.2.0/tests/unit/test_golden_replay.py +176 -0
  472. tolokaforge-0.2.0/tests/unit/test_llm_providers.py +373 -0
  473. tolokaforge-0.2.0/tests/unit/test_logging.py +231 -0
  474. tolokaforge-0.2.0/tests/unit/test_metrics.py +157 -0
  475. tolokaforge-0.2.0/tests/unit/test_metrics_usage_accumulation.py +112 -0
  476. tolokaforge-0.2.0/tests/unit/test_mobile_tool.py +46 -0
  477. tolokaforge-0.2.0/tests/unit/test_mock_web_task_roots.py +88 -0
  478. tolokaforge-0.2.0/tests/unit/test_model_client.py +1854 -0
  479. tolokaforge-0.2.0/tests/unit/test_mounts.py +52 -0
  480. tolokaforge-0.2.0/tests/unit/test_native_adapter_builtin_dispatch.py +78 -0
  481. tolokaforge-0.2.0/tests/unit/test_native_adapter_domain.py +213 -0
  482. tolokaforge-0.2.0/tests/unit/test_network_409_race.py +61 -0
  483. tolokaforge-0.2.0/tests/unit/test_nova_logic.py +37 -0
  484. tolokaforge-0.2.0/tests/unit/test_orchestrator_full_stack_detection.py +77 -0
  485. tolokaforge-0.2.0/tests/unit/test_orchestrator_logic.py +975 -0
  486. tolokaforge-0.2.0/tests/unit/test_orchestrator_playwright_detection.py +57 -0
  487. tolokaforge-0.2.0/tests/unit/test_output_artifacts.py +300 -0
  488. tolokaforge-0.2.0/tests/unit/test_output_writer.py +264 -0
  489. tolokaforge-0.2.0/tests/unit/test_performance.py +76 -0
  490. tolokaforge-0.2.0/tests/unit/test_pricing.py +392 -0
  491. tolokaforge-0.2.0/tests/unit/test_rate_limiter.py +102 -0
  492. tolokaforge-0.2.0/tests/unit/test_resume.py +268 -0
  493. tolokaforge-0.2.0/tests/unit/test_run_queue.py +85 -0
  494. tolokaforge-0.2.0/tests/unit/test_runner_bootstrap_secrets.py +105 -0
  495. tolokaforge-0.2.0/tests/unit/test_runner_builtin_dispatch.py +113 -0
  496. tolokaforge-0.2.0/tests/unit/test_runner_cleanup_trial.py +109 -0
  497. tolokaforge-0.2.0/tests/unit/test_runner_filesystem_provisioning.py +97 -0
  498. tolokaforge-0.2.0/tests/unit/test_runner_jsonpath_grading.py +163 -0
  499. tolokaforge-0.2.0/tests/unit/test_runner_logic.py +630 -0
  500. tolokaforge-0.2.0/tests/unit/test_runner_per_trial_cost_accounting.py +197 -0
  501. tolokaforge-0.2.0/tests/unit/test_runner_pipeline.py +389 -0
  502. tolokaforge-0.2.0/tests/unit/test_service_definition.py +117 -0
  503. tolokaforge-0.2.0/tests/unit/test_stack_network_aliases.py +55 -0
  504. tolokaforge-0.2.0/tests/unit/test_stuck_detector.py +219 -0
  505. tolokaforge-0.2.0/tests/unit/test_task_loader.py +441 -0
  506. tolokaforge-0.2.0/tests/unit/test_task_packs.py +157 -0
  507. tolokaforge-0.2.0/tests/unit/test_terminal_bench.py +654 -0
  508. tolokaforge-0.2.0/tests/unit/test_tool_builtins.py +667 -0
  509. tolokaforge-0.2.0/tests/unit/test_tool_schema_tool_config.py +52 -0
  510. tolokaforge-0.2.0/tests/unit/test_tool_security.py +135 -0
  511. tolokaforge-0.2.0/tests/unit/test_tools_interface.py +532 -0
  512. tolokaforge-0.2.0/tests/unit/test_tools_registry.py +170 -0
  513. tolokaforge-0.2.0/tests/unit/test_trajectory_stage7_fields.py +83 -0
  514. tolokaforge-0.2.0/tests/unit/test_usage_calls_aggregation.py +123 -0
  515. tolokaforge-0.2.0/tests/unit/test_user_simulator_prompt_capture.py +77 -0
  516. tolokaforge-0.2.0/tests/unit/test_user_simulator_tools.py +48 -0
  517. tolokaforge-0.2.0/tests/unit/test_user_tools.py +138 -0
  518. tolokaforge-0.2.0/tests/unit/test_wheel_resolver.py +545 -0
  519. tolokaforge-0.2.0/tests/unit/test_workspace_editing_tools.py +179 -0
  520. tolokaforge-0.2.0/tests/utils/__init__.py +1 -0
  521. tolokaforge-0.2.0/tests/utils/containers.py +220 -0
  522. tolokaforge-0.2.0/tests/utils/docker_helpers.py +75 -0
  523. tolokaforge-0.2.0/tests/utils/fixtures.py +218 -0
  524. tolokaforge-0.2.0/tests/utils/mock_clients.py +79 -0
  525. tolokaforge-0.2.0/tests/utils/networks.py +86 -0
  526. tolokaforge-0.2.0/tests/utils/project_fixtures.py +408 -0
  527. tolokaforge-0.2.0/tests/utils/validators.py +243 -0
  528. tolokaforge-0.2.0/tolokaforge/__init__.py +54 -0
  529. tolokaforge-0.2.0/tolokaforge/adapters/__init__.py +136 -0
  530. tolokaforge-0.2.0/tolokaforge/adapters/_task_loader.py +270 -0
  531. tolokaforge-0.2.0/tolokaforge/adapters/base.py +495 -0
  532. tolokaforge-0.2.0/tolokaforge/adapters/native.py +939 -0
  533. tolokaforge-0.2.0/tolokaforge/agent/__init__.py +12 -0
  534. tolokaforge-0.2.0/tolokaforge/agent/__main__.py +6 -0
  535. tolokaforge-0.2.0/tolokaforge/agent/agent.proto +84 -0
  536. tolokaforge-0.2.0/tolokaforge/agent/agent_pb2.py +52 -0
  537. tolokaforge-0.2.0/tolokaforge/agent/agent_pb2_grpc.py +150 -0
  538. tolokaforge-0.2.0/tolokaforge/agent/service.py +193 -0
  539. tolokaforge-0.2.0/tolokaforge/cli/__init__.py +1 -0
  540. tolokaforge-0.2.0/tolokaforge/cli/adapter_commands.py +160 -0
  541. tolokaforge-0.2.0/tolokaforge/cli/config_commands.py +122 -0
  542. tolokaforge-0.2.0/tolokaforge/cli/docker_commands.py +186 -0
  543. tolokaforge-0.2.0/tolokaforge/cli/main.py +472 -0
  544. tolokaforge-0.2.0/tolokaforge/core/__init__.py +1 -0
  545. tolokaforge-0.2.0/tolokaforge/core/config_validator.py +361 -0
  546. tolokaforge-0.2.0/tolokaforge/core/data/model_presets.yaml +176 -0
  547. tolokaforge-0.2.0/tolokaforge/core/data/pricing.json +1841 -0
  548. tolokaforge-0.2.0/tolokaforge/core/docker_adapter.py +220 -0
  549. tolokaforge-0.2.0/tolokaforge/core/docker_runtime.py +597 -0
  550. tolokaforge-0.2.0/tolokaforge/core/env_state.py +268 -0
  551. tolokaforge-0.2.0/tolokaforge/core/evaluators/__init__.py +7 -0
  552. tolokaforge-0.2.0/tolokaforge/core/evaluators/action_evaluator.py +167 -0
  553. tolokaforge-0.2.0/tolokaforge/core/evaluators/communicate_evaluator.py +145 -0
  554. tolokaforge-0.2.0/tolokaforge/core/evaluators/environment_evaluator.py +238 -0
  555. tolokaforge-0.2.0/tolokaforge/core/failure_attribution.py +173 -0
  556. tolokaforge-0.2.0/tolokaforge/core/grading/__init__.py +87 -0
  557. tolokaforge-0.2.0/tolokaforge/core/grading/check_runner.py +626 -0
  558. tolokaforge-0.2.0/tolokaforge/core/grading/checks_helpers.py +471 -0
  559. tolokaforge-0.2.0/tolokaforge/core/grading/checks_interface.py +514 -0
  560. tolokaforge-0.2.0/tolokaforge/core/grading/combine.py +459 -0
  561. tolokaforge-0.2.0/tolokaforge/core/grading/fuzzy_compare.py +371 -0
  562. tolokaforge-0.2.0/tolokaforge/core/grading/judge.py +1215 -0
  563. tolokaforge-0.2.0/tolokaforge/core/grading/state_checks.py +468 -0
  564. tolokaforge-0.2.0/tolokaforge/core/grading/transcript.py +131 -0
  565. tolokaforge-0.2.0/tolokaforge/core/hash.py +183 -0
  566. tolokaforge-0.2.0/tolokaforge/core/llm/__init__.py +131 -0
  567. tolokaforge-0.2.0/tolokaforge/core/llm/_dict_maps.py +140 -0
  568. tolokaforge-0.2.0/tolokaforge/core/llm/cache_policy.py +127 -0
  569. tolokaforge-0.2.0/tolokaforge/core/llm/capabilities.py +71 -0
  570. tolokaforge-0.2.0/tolokaforge/core/llm/client.py +1563 -0
  571. tolokaforge-0.2.0/tolokaforge/core/llm/content_policy.py +117 -0
  572. tolokaforge-0.2.0/tolokaforge/core/llm/params_policy.py +263 -0
  573. tolokaforge-0.2.0/tolokaforge/core/llm/presets.py +431 -0
  574. tolokaforge-0.2.0/tolokaforge/core/llm/prompt_policy.py +99 -0
  575. tolokaforge-0.2.0/tolokaforge/core/llm/reasoning.py +134 -0
  576. tolokaforge-0.2.0/tolokaforge/core/llm/reasoning_codec.py +467 -0
  577. tolokaforge-0.2.0/tolokaforge/core/llm/response_policy.py +297 -0
  578. tolokaforge-0.2.0/tolokaforge/core/llm/schema_sanitizer.py +816 -0
  579. tolokaforge-0.2.0/tolokaforge/core/llm/usage.py +277 -0
  580. tolokaforge-0.2.0/tolokaforge/core/logging.py +224 -0
  581. tolokaforge-0.2.0/tolokaforge/core/metrics.py +320 -0
  582. tolokaforge-0.2.0/tolokaforge/core/models.py +562 -0
  583. tolokaforge-0.2.0/tolokaforge/core/mounts.py +73 -0
  584. tolokaforge-0.2.0/tolokaforge/core/orchestrator.py +1907 -0
  585. tolokaforge-0.2.0/tolokaforge/core/output/__init__.py +32 -0
  586. tolokaforge-0.2.0/tolokaforge/core/output/artifacts.py +313 -0
  587. tolokaforge-0.2.0/tolokaforge/core/output_writer.py +215 -0
  588. tolokaforge-0.2.0/tolokaforge/core/pricing.py +296 -0
  589. tolokaforge-0.2.0/tolokaforge/core/rate_limiter.py +34 -0
  590. tolokaforge-0.2.0/tolokaforge/core/resume.py +255 -0
  591. tolokaforge-0.2.0/tolokaforge/core/run_queue.py +614 -0
  592. tolokaforge-0.2.0/tolokaforge/core/runner.py +587 -0
  593. tolokaforge-0.2.0/tolokaforge/core/schema/grade.json +34 -0
  594. tolokaforge-0.2.0/tolokaforge/core/schema/metrics.json +28 -0
  595. tolokaforge-0.2.0/tolokaforge/core/schema/tool_call.json +15 -0
  596. tolokaforge-0.2.0/tolokaforge/core/schema/trajectory.json +79 -0
  597. tolokaforge-0.2.0/tolokaforge/core/search/__init__.py +18 -0
  598. tolokaforge-0.2.0/tolokaforge/core/search/domain_state.py +251 -0
  599. tolokaforge-0.2.0/tolokaforge/core/search/typesense.py +286 -0
  600. tolokaforge-0.2.0/tolokaforge/core/search/typesense_server.py +384 -0
  601. tolokaforge-0.2.0/tolokaforge/core/stuck.py +115 -0
  602. tolokaforge-0.2.0/tolokaforge/core/tools_interface.py +388 -0
  603. tolokaforge-0.2.0/tolokaforge/core/utils/__init__.py +1 -0
  604. tolokaforge-0.2.0/tolokaforge/core/utils/diff.py +125 -0
  605. tolokaforge-0.2.0/tolokaforge/docker/__init__.py +165 -0
  606. tolokaforge-0.2.0/tolokaforge/docker/builder.py +371 -0
  607. tolokaforge-0.2.0/tolokaforge/docker/config.py +197 -0
  608. tolokaforge-0.2.0/tolokaforge/docker/container.py +1083 -0
  609. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/__init__.py +40 -0
  610. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/agent.Dockerfile +18 -0
  611. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/db_service.Dockerfile +38 -0
  612. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/executor.Dockerfile +25 -0
  613. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/json_db.Dockerfile +17 -0
  614. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/mock_web.Dockerfile +17 -0
  615. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/orchestrator.Dockerfile +22 -0
  616. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/rag.Dockerfile +35 -0
  617. tolokaforge-0.2.0/tolokaforge/docker/dockerfiles/runner.Dockerfile +87 -0
  618. tolokaforge-0.2.0/tolokaforge/docker/health.py +812 -0
  619. tolokaforge-0.2.0/tolokaforge/docker/image.py +644 -0
  620. tolokaforge-0.2.0/tolokaforge/docker/logging.py +750 -0
  621. tolokaforge-0.2.0/tolokaforge/docker/mount.py +376 -0
  622. tolokaforge-0.2.0/tolokaforge/docker/network.py +546 -0
  623. tolokaforge-0.2.0/tolokaforge/docker/policy.py +296 -0
  624. tolokaforge-0.2.0/tolokaforge/docker/ports.py +240 -0
  625. tolokaforge-0.2.0/tolokaforge/docker/registry.py +347 -0
  626. tolokaforge-0.2.0/tolokaforge/docker/stack.py +932 -0
  627. tolokaforge-0.2.0/tolokaforge/docker/stacks/__init__.py +30 -0
  628. tolokaforge-0.2.0/tolokaforge/docker/stacks/core.py +196 -0
  629. tolokaforge-0.2.0/tolokaforge/docker/stacks/full.py +135 -0
  630. tolokaforge-0.2.0/tolokaforge/docker/stacks/test.py +70 -0
  631. tolokaforge-0.2.0/tolokaforge/docker/stacks/typesense.py +83 -0
  632. tolokaforge-0.2.0/tolokaforge/docker/wait_for_services.py +251 -0
  633. tolokaforge-0.2.0/tolokaforge/docker/wheel_resolver.py +656 -0
  634. tolokaforge-0.2.0/tolokaforge/env/__init__.py +1 -0
  635. tolokaforge-0.2.0/tolokaforge/env/json_db_service/app.py +1195 -0
  636. tolokaforge-0.2.0/tolokaforge/env/json_db_service/requirements.txt +4 -0
  637. tolokaforge-0.2.0/tolokaforge/env/mock_web_service/app.py +470 -0
  638. tolokaforge-0.2.0/tolokaforge/env/mock_web_service/requirements.txt +5 -0
  639. tolokaforge-0.2.0/tolokaforge/env/rag_service/app.py +527 -0
  640. tolokaforge-0.2.0/tolokaforge/env/rag_service/requirements.txt +12 -0
  641. tolokaforge-0.2.0/tolokaforge/executor/__init__.py +12 -0
  642. tolokaforge-0.2.0/tolokaforge/executor/__main__.py +6 -0
  643. tolokaforge-0.2.0/tolokaforge/executor/executor.proto +78 -0
  644. tolokaforge-0.2.0/tolokaforge/executor/executor_pb2.py +50 -0
  645. tolokaforge-0.2.0/tolokaforge/executor/executor_pb2_grpc.py +197 -0
  646. tolokaforge-0.2.0/tolokaforge/executor/service.py +288 -0
  647. tolokaforge-0.2.0/tolokaforge/runner/__init__.py +75 -0
  648. tolokaforge-0.2.0/tolokaforge/runner/__main__.py +335 -0
  649. tolokaforge-0.2.0/tolokaforge/runner/db_client.py +696 -0
  650. tolokaforge-0.2.0/tolokaforge/runner/db_proxy.py +727 -0
  651. tolokaforge-0.2.0/tolokaforge/runner/grading.py +841 -0
  652. tolokaforge-0.2.0/tolokaforge/runner/models.py +690 -0
  653. tolokaforge-0.2.0/tolokaforge/runner/rag_client.py +540 -0
  654. tolokaforge-0.2.0/tolokaforge/runner/runner.proto +333 -0
  655. tolokaforge-0.2.0/tolokaforge/runner/runner_pb2.py +73 -0
  656. tolokaforge-0.2.0/tolokaforge/runner/runner_pb2_grpc.py +407 -0
  657. tolokaforge-0.2.0/tolokaforge/runner/service.py +1872 -0
  658. tolokaforge-0.2.0/tolokaforge/runner/tool_factory.py +1656 -0
  659. tolokaforge-0.2.0/tolokaforge/secrets/__init__.py +50 -0
  660. tolokaforge-0.2.0/tolokaforge/secrets/config.py +154 -0
  661. tolokaforge-0.2.0/tolokaforge/secrets/log_filter.py +134 -0
  662. tolokaforge-0.2.0/tolokaforge/secrets/manager.py +400 -0
  663. tolokaforge-0.2.0/tolokaforge/secrets/providers.py +360 -0
  664. tolokaforge-0.2.0/tolokaforge/tools/__init__.py +1 -0
  665. tolokaforge-0.2.0/tolokaforge/tools/builtin/__init__.py +47 -0
  666. tolokaforge-0.2.0/tolokaforge/tools/builtin/bash.py +124 -0
  667. tolokaforge-0.2.0/tolokaforge/tools/builtin/browser.py +1273 -0
  668. tolokaforge-0.2.0/tolokaforge/tools/builtin/calculator.py +102 -0
  669. tolokaforge-0.2.0/tolokaforge/tools/builtin/db_json.py +286 -0
  670. tolokaforge-0.2.0/tolokaforge/tools/builtin/files.py +731 -0
  671. tolokaforge-0.2.0/tolokaforge/tools/builtin/http_request.py +163 -0
  672. tolokaforge-0.2.0/tolokaforge/tools/builtin/mobile.py +235 -0
  673. tolokaforge-0.2.0/tolokaforge/tools/builtin/rag_search.py +116 -0
  674. tolokaforge-0.2.0/tolokaforge/tools/builtin/registry.py +125 -0
  675. tolokaforge-0.2.0/tolokaforge/tools/registry.py +449 -0
  676. tolokaforge-0.2.0/tolokaforge/tools/user_tools.py +379 -0
  677. tolokaforge-0.2.0/tools/AGENTS.md +33 -0
  678. tolokaforge-0.2.0/tools/dev-mcp/README.md +47 -0
  679. tolokaforge-0.2.0/tools/dev-mcp/pyproject.toml +16 -0
  680. tolokaforge-0.2.0/tools/dev-mcp/src/dev_mcp/__init__.py +0 -0
  681. tolokaforge-0.2.0/tools/dev-mcp/src/dev_mcp/server.py +375 -0
  682. tolokaforge-0.2.0/tools/dev-mcp/src/dev_mcp/subprocess_utils.py +182 -0
  683. tolokaforge-0.2.0/tools/dev-mcp/tests/__init__.py +0 -0
  684. tolokaforge-0.2.0/tools/dev-mcp/tests/test_server.py +177 -0
  685. tolokaforge-0.2.0/tools/pricing-updater/README.md +33 -0
  686. tolokaforge-0.2.0/tools/pricing-updater/pyproject.toml +18 -0
  687. tolokaforge-0.2.0/tools/pricing-updater/src/pricing_updater/__init__.py +1 -0
  688. tolokaforge-0.2.0/tools/pricing-updater/src/pricing_updater/__main__.py +5 -0
  689. tolokaforge-0.2.0/tools/pricing-updater/src/pricing_updater/cli.py +145 -0
  690. tolokaforge-0.2.0/tools/pricing-updater/src/pricing_updater/fetcher.py +232 -0
  691. tolokaforge-0.2.0/tools/pricing-updater/tests/__init__.py +0 -0
  692. tolokaforge-0.2.0/tools/pricing-updater/tests/test_fetcher.py +636 -0
  693. tolokaforge-0.2.0/uv.lock +4365 -0
@@ -0,0 +1,19 @@
1
+ ---
2
+ description: Cursor settings — all project rules are in AGENTS.md
3
+ alwaysApply: true
4
+ ---
5
+ # Cursor Configuration
6
+
7
+ All project rules, conventions, and commands are defined in `AGENTS.md` at the repository root.
8
+ Read it first — it is the single source of truth for all AI agents.
9
+
10
+ ## Session Startup
11
+
12
+ 1. Read `README.md` and `.vscode/tasks.json` before writing any code
13
+ 2. Do not ask for permission to read them — both files are essential context
14
+
15
+ ## Cursor-Specific
16
+
17
+ - Use Context7 MCP (if available) to look up library/framework documentation before guessing at APIs
18
+ - Available VSCode tasks are in `.vscode/tasks.json`
19
+ - Plans and scratch files go in the `plans/` directory, which is gitignored
@@ -0,0 +1,63 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ *.egg
11
+
12
+ # Virtual environments
13
+ venv/
14
+ env/
15
+ ENV/
16
+ .venv
17
+
18
+ # IDEs
19
+ .vscode/
20
+ .idea/
21
+ *.swp
22
+ *.swo
23
+ *~
24
+
25
+ # OS
26
+ .DS_Store
27
+ Thumbs.db
28
+
29
+ # Git
30
+ .git/
31
+ .gitignore
32
+ .gitattributes
33
+
34
+ # Documentation
35
+ *.md
36
+ !README.md
37
+ docs/
38
+
39
+ # Testing
40
+ .pytest_cache/
41
+ .coverage
42
+ htmlcov/
43
+ .tox/
44
+
45
+ # Results and outputs
46
+ results/
47
+ *.log
48
+ *.json
49
+ *.csv
50
+ *.html
51
+
52
+ # CI/CD
53
+ .github/
54
+ .gitlab-ci.yml
55
+
56
+ # Docker
57
+ docker-compose.override.yml
58
+ .dockerignore
59
+
60
+ # Temporary files
61
+ tmp/
62
+ temp/
63
+ *.tmp
@@ -0,0 +1,24 @@
1
+ # Environment variables for Tolokaforge
2
+ # Copy this file to .env and fill in your API keys:
3
+ # cp .env.example .env
4
+
5
+ # At least one LLM provider key is required.
6
+ # Most examples use OpenRouter by default.
7
+ OPENROUTER_API_KEY=your-openrouter-api-key-here
8
+
9
+ # Optional: direct provider keys (used when provider is set to "anthropic", "openai", etc.)
10
+ # ANTHROPIC_API_KEY=your-anthropic-api-key-here
11
+ # OPENAI_API_KEY=your-openai-api-key-here
12
+ # GOOGLE_API_KEY=your-google-api-key-here
13
+
14
+ # Gemini direct (Google AI Studio): use when provider=gemini in ModelConfig.
15
+ # Create a key at https://aistudio.google.com (no GCP IAM required; the key is a bearer token).
16
+ # Bypasses OpenRouter — useful for distinguishing model behavior from provider-transport effects.
17
+ # GEMINI_API_KEY=your-gemini-studio-api-key-here
18
+
19
+ # Vertex AI (GCP): use when provider=vertex_ai in ModelConfig.
20
+ # The service account needs the roles/aiplatform.user role (or at least the aiplatform.endpoints.predict permission).
21
+ # Download a JSON key and point GOOGLE_APPLICATION_CREDENTIALS at it.
22
+ # GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
23
+ # VERTEXAI_PROJECT=your-gcp-project-id
24
+ # VERTEXAI_LOCATION=us-central1
@@ -0,0 +1,6 @@
1
+ # Trajectory output files (large JSON/YAML from benchmark runs)
2
+ tests/data/projects/*/output/**/*.json filter=lfs diff=lfs merge=lfs -text
3
+ tests/data/projects/*/output/**/*.yaml filter=lfs diff=lfs merge=lfs -text
4
+
5
+ # Mobile task screenshots (binary PNGs)
6
+ tasks/mobile/_assets/images/*.png filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,309 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - 'tolokaforge/**'
9
+ - 'tests/**'
10
+ - 'scripts/**'
11
+ - 'tools/**'
12
+ - 'pyproject.toml'
13
+ - 'uv.lock'
14
+ - '.github/workflows/ci.yml'
15
+ - '.pre-commit-config.yaml'
16
+ pull_request:
17
+ types: [opened, synchronize, reopened, labeled]
18
+ paths:
19
+ - 'tolokaforge/**'
20
+ - 'tests/**'
21
+ - 'scripts/**'
22
+ - 'tools/**'
23
+ - 'pyproject.toml'
24
+ - 'uv.lock'
25
+ - '.github/workflows/ci.yml'
26
+ - '.pre-commit-config.yaml'
27
+ schedule:
28
+ - cron: '0 6 * * *'
29
+
30
+ permissions:
31
+ contents: read
32
+
33
+ jobs:
34
+ # ============================================================================
35
+ # Lint job - Fast feedback on code quality
36
+ # ============================================================================
37
+ lint:
38
+ runs-on: ubuntu-latest
39
+ timeout-minutes: 20
40
+
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+
44
+ - name: Install uv
45
+ uses: astral-sh/setup-uv@v4
46
+ with:
47
+ version: "latest"
48
+ enable-cache: true
49
+ cache-dependency-glob: "uv.lock"
50
+
51
+ - name: Set up Python 3.12
52
+ run: uv python install 3.12
53
+
54
+
55
+ - name: Install dependencies
56
+ run: uv sync --dev
57
+
58
+ - name: Run pre-commit checks
59
+ run: uv run pre-commit run --all-files --show-diff-on-failure
60
+
61
+ # ============================================================================
62
+ # PR smoke tests - fast required gate
63
+ # ============================================================================
64
+ test-smoke:
65
+ runs-on: ubuntu-latest
66
+ if: github.event_name == 'pull_request'
67
+ needs: lint # Only run tests if linting passes
68
+ timeout-minutes: 45
69
+
70
+ steps:
71
+ - uses: actions/checkout@v4
72
+ with:
73
+ token: ${{ github.token }}
74
+ lfs: true
75
+
76
+
77
+ - name: Install uv
78
+ uses: astral-sh/setup-uv@v4
79
+ with:
80
+ version: "latest"
81
+ enable-cache: true
82
+ cache-dependency-glob: "uv.lock"
83
+
84
+ - name: Set up Python 3.12
85
+ run: uv python install 3.12
86
+
87
+
88
+ - name: Install dependencies
89
+ run: |
90
+ uv sync --dev
91
+
92
+ - name: Install Playwright browsers
93
+ run: |
94
+ uv run python -m playwright install chromium
95
+
96
+ - name: Create .env file
97
+ run: |
98
+ echo "OPENROUTER_API_KEY=${{ secrets.OPENROUTER_TESTING_TOKEN }}" > .env
99
+
100
+ - name: Run unit + canonical tests with coverage
101
+ run: |
102
+ uv run pytest tests/unit/ tests/canonical/ -v --tb=short \
103
+ --cov=tolokaforge --cov-report=term-missing --cov-fail-under=60
104
+
105
+ - name: Run tool tests
106
+ run: |
107
+ for tool_dir in tools/*/; do
108
+ if [ -d "${tool_dir}tests" ]; then
109
+ echo "=== Testing ${tool_dir} ==="
110
+ (cd "${tool_dir}" && uv run pytest tests/ -v --tb=short -p no:cacheprovider) || exit 1
111
+ fi
112
+ done
113
+
114
+ - name: Validate public example suites
115
+ run: |
116
+ uv run tolokaforge validate --tasks "examples/native/*/dataset/**/task.yaml"
117
+
118
+
119
+ - name: Docker task-pack mount smoke
120
+ run: |
121
+ scripts/tests/task_pack_docker_smoke.sh
122
+
123
+ # TODO: Re-enable once an in-process runtime or a Docker-less orchestrator is available.
124
+ # The smoke script requires `tolokaforge run` which now needs a Docker runtime.
125
+ # - name: Run public examples end-to-end (mock smoke)
126
+ # run: |
127
+ # scripts/tests/run_public_examples_smoke.sh
128
+ #
129
+ # - name: Summarize public examples
130
+ # run: |
131
+ # uv run python scripts/tests/summarize_public_examples.py --min-pass-rate 0.0 --min-completion-rate 1.0
132
+ #
133
+ # - name: Upload public example summary artifact
134
+ # uses: actions/upload-artifact@v4
135
+ # with:
136
+ # name: public-example-summary-pr
137
+ # path: |
138
+ # output/public_examples_summary.json
139
+ # output/public_examples_summary.md
140
+
141
# ============================================================================
# Full test suite - push + nightly confidence
# ============================================================================
test-full:
  runs-on: ubuntu-latest
  # Runs for every event except pull requests.
  if: github.event_name != 'pull_request'
  needs: [lint]
  timeout-minutes: 120

  steps:
    - uses: actions/checkout@v4
      with:
        token: ${{ github.token }}
        lfs: true

    - name: Install uv
      uses: astral-sh/setup-uv@v4
      with:
        version: "latest"
        enable-cache: true
        cache-dependency-glob: "uv.lock"

    - name: Set up Python 3.12
      run: uv python install 3.12

    - name: Install dependencies
      run: |
        uv sync --dev

    - name: Install Playwright browsers
      run: |
        uv run python -m playwright install chromium

    # The secret is handed to the shell through the environment instead of
    # being templated into the script text, so characters such as quotes,
    # `$` or backticks in the value cannot break or alter the command.
    - name: Create .env file
      env:
        OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_TESTING_TOKEN }}
      run: |
        printf 'OPENROUTER_API_KEY=%s\n' "$OPENROUTER_API_KEY" > .env

    - name: Validate public example suites
      run: |
        uv run tolokaforge validate --tasks "examples/native/*/dataset/**/task.yaml"

    - name: Docker task-pack mount smoke
      run: |
        scripts/tests/task_pack_docker_smoke.sh

    # TODO: Re-enable once in-process runtime or Docker-less orchestrator is available.
    # - name: Run public examples end-to-end (mock smoke)
    #   run: |
    #     scripts/tests/run_public_examples_smoke.sh
    #
    # - name: Summarize public examples
    #   run: |
    #     uv run python scripts/tests/summarize_public_examples.py --min-pass-rate 0.0 --min-completion-rate 1.0

    # Fails the job when total coverage drops below 60%.
    - name: Run unit + canonical tests with coverage
      run: |
        uv run pytest tests/unit/ tests/canonical/ -v --tb=short \
          --cov=tolokaforge --cov-report=term-missing --cov-fail-under=60

    # Each tools/<name>/ directory that has a tests/ folder is tested from
    # inside that directory; the first failure aborts the loop.
    - name: Run tool tests
      run: |
        for tool_dir in tools/*/; do
          if [ -d "${tool_dir}tests" ]; then
            echo "=== Testing ${tool_dir} ==="
            (cd "${tool_dir}" && uv run pytest tests/ -v --tb=short -p no:cacheprovider) || exit 1
          fi
        done

    - name: Build Docker images for integration tests
      if: github.event_name == 'schedule' || github.event_name == 'push'
      id: docker-build-full
      run: |
        uv run tolokaforge docker build --core

    # Only runs when the image build step above actually ran and succeeded.
    - name: Run integration tests
      if: (github.event_name == 'schedule' || github.event_name == 'push') && steps.docker-build-full.outcome == 'success'
      run: |
        uv run pytest tests/integration/ -v --tb=short

    - name: Generate coverage XML for Codecov
      if: github.event_name == 'schedule'
      run: |
        uv run coverage xml

    # Upload is best-effort: a Codecov failure does not fail the job.
    # NOTE(review): codecov-action@v3 is an older major version - confirm
    # whether an upgrade (and an upload token) is wanted.
    - name: Upload coverage to Codecov
      if: github.event_name == 'schedule'
      uses: codecov/codecov-action@v3
      with:
        files: ./coverage.xml
        flags: unittests
        name: codecov-umbrella
      continue-on-error: true
236
+
237
# ============================================================================
# Label-triggered gate - full suite + coverage (add 'ready-to-merge' label)
# ============================================================================
test-gate:
  runs-on: ubuntu-latest
  # Runs only for pull requests that carry the 'ready-to-merge' label.
  if: >-
    github.event_name == 'pull_request' &&
    contains(github.event.pull_request.labels.*.name, 'ready-to-merge')
  needs: [lint]
  timeout-minutes: 120

  steps:
    - uses: actions/checkout@v4
      with:
        token: ${{ github.token }}
        lfs: true

    - name: Install uv
      uses: astral-sh/setup-uv@v4
      with:
        version: "latest"
        enable-cache: true
        cache-dependency-glob: "uv.lock"

    - name: Set up Python 3.12
      run: uv python install 3.12

    - name: Install dependencies
      run: |
        uv sync --dev

    - name: Install Playwright browsers
      run: |
        uv run python -m playwright install chromium

    # The secret is handed to the shell through the environment instead of
    # being templated into the script text, so characters such as quotes,
    # `$` or backticks in the value cannot break or alter the command.
    - name: Create .env file
      env:
        OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_TESTING_TOKEN }}
      run: |
        printf 'OPENROUTER_API_KEY=%s\n' "$OPENROUTER_API_KEY" > .env

    - name: Validate public example suites
      run: |
        uv run tolokaforge validate --tasks "examples/native/*/dataset/**/task.yaml"

    - name: Docker task-pack mount smoke
      run: |
        scripts/tests/task_pack_docker_smoke.sh

    # Writes coverage.xml (consumed by the Codecov upload step) and fails the
    # job when total coverage drops below 60%.
    - name: Run unit + canonical tests with coverage
      run: |
        uv run pytest tests/unit/ tests/canonical/ -v --tb=short \
          --cov=tolokaforge --cov-report=xml --cov-report=term-missing \
          --cov-fail-under=60

    - name: Build Docker images for integration tests
      id: docker-build
      run: |
        uv run tolokaforge docker build --core

    - name: Run integration tests
      if: steps.docker-build.outcome == 'success'
      run: |
        uv run pytest tests/integration/ -v --tb=short

    # Upload is best-effort: a Codecov failure does not fail the job.
    # NOTE(review): codecov-action@v3 is an older major version - confirm
    # whether an upgrade (and an upload token) is wanted.
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v3
      with:
        files: ./coverage.xml
        flags: merge-gate
        name: codecov-merge-gate
      continue-on-error: true
@@ -0,0 +1,100 @@
1
name: Claude Code PR Review

# Runs on PR activity and on newly created issue / review comments.
on:
  pull_request:
    types: [opened, synchronize, reopened]
  issue_comment:
    types: [created]
  pull_request_review_comment:
    types: [created]

jobs:
  hygiene-review:
    runs-on: ubuntu-latest
    # Run for every PR event, and for comments that mention '@claude'.
    # `issue_comment` also fires for comments on plain issues, so that branch
    # additionally requires `github.event.issue.pull_request` to be set; the
    # prompt below only makes sense when there is a PR to review.
    if: >-
      github.event_name == 'pull_request' ||
      (github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '@claude')) ||
      (github.event_name == 'pull_request_review_comment' &&
      contains(github.event.comment.body, '@claude'))
    permissions:
      contents: read
      pull-requests: write
      issues: write
      id-token: write
    steps:
      - uses: actions/checkout@v4
        with:
          # 0 = fetch the complete history rather than a shallow clone.
          fetch-depth: 0

      - uses: anthropics/claude-code-action@beta
        with:
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          github_token: ${{ secrets.GITHUB_TOKEN }}
          direct_prompt: |
            Review this PR against the project's AGENTS.md rules. Flag any
            violation with file path, line number, and a concrete suggested
            fix. If everything checks out, say so explicitly.

            1. CODE QUALITY: Errors are surfaced explicitly. No silent
               fallbacks, no swallowed exceptions (`except Exception: pass`),
               no returning `None` instead of raising. Functions under 100
               lines, nesting depth < 3.

            2. SECRETS — single abstraction (HARD RULE): API keys, DB
               credentials, tokens, signing keys, and any other credential
               are accessed *only* via `SecretManager`
               (`tolokaforge.secrets`). Flag any new `os.environ.get` /
               `os.getenv` for credentials, any `load_dotenv()` call, any
               `from dotenv import` outside `tolokaforge.secrets`, any
               direct `.env` / `.netrc` / `.aws/credentials` file read,
               any one-off env-var helper, any secret baked into a Docker
               image / build-arg / mount / image-tag. New secret backends
               must ship as new `SecretProvider` subclasses, never as
               ad-hoc call sites. The
               `tests/unit/secrets/test_no_raw_secret_access.py`
               enforcement test must remain green.

            3. TESTING: Every new test file has a `pytestmark` marker
               (`pytest.mark.unit`, `.canonical`, or `.integration`).
               Zero `xfail`, zero bare `@skip` — use conditional markers
               (`requires_api`, `requires_docker`). MockAsyncClient comes
               from `tests.utils.mock_clients` only — no local copies.

            4. ARCHITECTURE: Harness logic stays generic. Task-specific
               logic lives in task packs only. Clean abstraction
               boundaries. Backward compatibility for task contracts.

            5. REPOSITORY HYGIENE: No scripts, data, temp files, or logs
               in the repo root. Documentation updated when user-facing
               behaviour changes.

            6. CODE STANDARDS: DRY — no duplicated logic. Self-describing
               names. Early returns to minimise nesting. No warnings
               suppressed.

            7. ROOT CLEANLINESS: New files in the repo root must be on
               this allow-list — README.md, LICENSE, CHANGELOG.md,
               CONTRIBUTING.md, CONTRIBUTORS.md, CITATION.*, CLAUDE.md,
               AGENTS.md, pyproject.toml, uv.lock, Makefile,
               docker-compose.yaml, and dotfiles (.gitignore,
               .pre-commit-config.yaml, etc.).

            8. NO TEMP ARTIFACTS: No temporary plans, log files, JSON
               data dumps, or build outputs.

            9. SCRIPT LOCATION: Bash scripts only in scripts/. Exceptions:
               tests/ for test helpers, tasks/ for benchmark data,
               .devcontainer/ for container setup, Docker entrypoints
               alongside Dockerfiles.

            10. SCRIPTS ORGANIZATION: New scripts placed in the correct
                scripts/ subdirectory (benchmark/, setup/, lint/, tests/,
                release/, analysis/).

            11. PYTHON TOOLS: Complex Python tools in tools/ as uv
                workspace members, linked in
                `pyproject.toml [tool.uv.workspace]`.

            12. NO PROJECT-SPECIFIC CONTENT ON MAIN: No domain-specific
                configs or runner scripts on the `main` branch.
@@ -0,0 +1,132 @@
1
name: Publish tolokaforge-adapter-terminal-bench to PyPI

# Two entry points: pushing an adapter release tag, or a manual dispatch
# where the operator chooses which package index to publish to.
on:
  push:
    tags: ['adapter-terminal-bench-v*']
  workflow_dispatch:
    inputs:
      target:
        type: choice
        required: true
        description: 'Publish target'
        options: [testpypi, pypi]

# Cancel in-progress runs for the same tag
concurrency:
  group: publish-adapter-terminal-bench-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Builds the distribution once; every later job downloads this artifact.
  build:
    name: Build distribution
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: astral-sh/setup-uv@v7
        name: Install uv

      - name: Install Python
        run: uv python install 3.12

      - name: Build package
        run: uv build --package tolokaforge-adapter-terminal-bench

      # Prints the built files and the member list of every wheel in dist/.
      - name: Verify package contents
        run: |
          echo "=== Built artifacts ==="
          ls -lh dist/
          echo ""
          echo "=== Wheel contents ==="
          uv run python -c "
          import zipfile, sys
          for f in __import__('pathlib').Path('dist').glob('*.whl'):
              print(f'--- {f.name} ---')
              with zipfile.ZipFile(f) as zf:
                  for name in sorted(zf.namelist()):
                      print(f' {name}')
          "

      - uses: actions/upload-artifact@v4
        name: Upload distribution artifacts
        with:
          name: adapter-terminal-bench-dist
          path: dist/
          if-no-files-found: error

  # Manual runs with target=testpypi only.
  publish-testpypi:
    name: Publish to TestPyPI
    runs-on: ubuntu-latest
    needs: build
    if: >-
      github.event_name == 'workflow_dispatch' &&
      github.event.inputs.target == 'testpypi'
    environment: testpypi
    permissions:
      # Goes with `--trusted-publishing always` in the publish step.
      id-token: write
    steps:
      - uses: astral-sh/setup-uv@v7
        name: Install uv

      - uses: actions/download-artifact@v4
        name: Download distribution artifacts
        with:
          name: adapter-terminal-bench-dist
          path: dist/

      - name: List downloaded files
        run: find dist/ -type f

      - name: Publish to TestPyPI
        run: uv publish dist/* --publish-url https://test.pypi.org/legacy/ --trusted-publishing always

  # Release-tag pushes, or manual runs with target=pypi.
  publish-pypi:
    name: Publish to PyPI
    runs-on: ubuntu-latest
    needs: build
    # Parentheses spell out the grouping that `&&` binding tighter than `||`
    # already gives this expression.
    if: >-
      (github.event_name == 'push' &&
      startsWith(github.ref, 'refs/tags/adapter-terminal-bench-v')) ||
      (github.event_name == 'workflow_dispatch' &&
      github.event.inputs.target == 'pypi')
    environment: release
    permissions:
      # Goes with `--trusted-publishing always` in the publish step.
      id-token: write
    steps:
      - uses: astral-sh/setup-uv@v7
        name: Install uv

      - uses: actions/download-artifact@v4
        name: Download distribution artifacts
        with:
          name: adapter-terminal-bench-dist
          path: dist/

      - name: List downloaded files
        run: find dist/ -type f

      - name: Publish to PyPI
        run: uv publish dist/* --trusted-publishing always

  # Tag pushes only, after the PyPI publish job: attaches dist/* to a release.
  github-release:
    name: Create GitHub Release
    runs-on: ubuntu-latest
    needs: publish-pypi
    if: >-
      github.event_name == 'push' &&
      startsWith(github.ref, 'refs/tags/adapter-terminal-bench-v')
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: actions/download-artifact@v4
        name: Download distribution artifacts
        with:
          name: adapter-terminal-bench-dist
          path: dist/

      - uses: softprops/action-gh-release@v2
        name: Create GitHub Release
        with:
          generate_release_notes: true
          files: dist/*