agent-belt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (517)
  1. agent_belt-0.1.0/.frogbot/frogbot-config.yml +11 -0
  2. agent_belt-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +39 -0
  3. agent_belt-0.1.0/.github/ISSUE_TEMPLATE/config.yml +1 -0
  4. agent_belt-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +23 -0
  5. agent_belt-0.1.0/.github/ISSUE_TEMPLATE/new_agent.yml +45 -0
  6. agent_belt-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +22 -0
  7. agent_belt-0.1.0/.github/release.yml +29 -0
  8. agent_belt-0.1.0/.github/workflows/cla.yml +29 -0
  9. agent_belt-0.1.0/.github/workflows/frogbot-scan-pull-request.yml +35 -0
  10. agent_belt-0.1.0/.github/workflows/release.yml +258 -0
  11. agent_belt-0.1.0/.github/workflows/removeLabel.yml +33 -0
  12. agent_belt-0.1.0/.github/workflows/test.yml +179 -0
  13. agent_belt-0.1.0/.gitignore +56 -0
  14. agent_belt-0.1.0/.markdownlint-cli2.yaml +19 -0
  15. agent_belt-0.1.0/.pre-commit-config.yaml +61 -0
  16. agent_belt-0.1.0/.python-version +1 -0
  17. agent_belt-0.1.0/AGENTS.md +184 -0
  18. agent_belt-0.1.0/CONTRIBUTING.md +221 -0
  19. agent_belt-0.1.0/LICENSE +202 -0
  20. agent_belt-0.1.0/Makefile +260 -0
  21. agent_belt-0.1.0/NOTICE +108 -0
  22. agent_belt-0.1.0/PKG-INFO +265 -0
  23. agent_belt-0.1.0/README.md +223 -0
  24. agent_belt-0.1.0/SECURITY.md +7 -0
  25. agent_belt-0.1.0/docs/glossary/AGENT-FEATURES.md +129 -0
  26. agent_belt-0.1.0/docs/glossary/ARCHITECTURE.md +532 -0
  27. agent_belt-0.1.0/docs/glossary/CI.md +278 -0
  28. agent_belt-0.1.0/docs/glossary/CLI.md +365 -0
  29. agent_belt-0.1.0/docs/glossary/CONFIGURATION.md +246 -0
  30. agent_belt-0.1.0/docs/glossary/OUTCOMES.md +410 -0
  31. agent_belt-0.1.0/docs/glossary/PLUGGABILITY.md +517 -0
  32. agent_belt-0.1.0/docs/glossary/SANDBOXING.md +55 -0
  33. agent_belt-0.1.0/docs/glossary/SCENARIOS.md +450 -0
  34. agent_belt-0.1.0/docs/glossary/SCORING.md +419 -0
  35. agent_belt-0.1.0/examples/README.md +270 -0
  36. agent_belt-0.1.0/examples/belt.yaml.example +27 -0
  37. agent_belt-0.1.0/examples/custom-agent/README.md +68 -0
  38. agent_belt-0.1.0/examples/custom-agent/echo_agent.py +69 -0
  39. agent_belt-0.1.0/examples/custom-agent/pyproject.toml +16 -0
  40. agent_belt-0.1.0/examples/custom-agent/scenarios/echo/_config.json +4 -0
  41. agent_belt-0.1.0/examples/custom-agent/scenarios/echo/hello.json +15 -0
  42. agent_belt-0.1.0/examples/custom-agent/scenarios/echo/multi_turn_with_expectations.json +32 -0
  43. agent_belt-0.1.0/examples/exporter-config/exporters.yaml +40 -0
  44. agent_belt-0.1.0/examples/fixtures/.gitignore +5 -0
  45. agent_belt-0.1.0/examples/fixtures/bookstore-api/README.md +48 -0
  46. agent_belt-0.1.0/examples/fixtures/bookstore-api/jest.config.js +7 -0
  47. agent_belt-0.1.0/examples/fixtures/bookstore-api/package.json +27 -0
  48. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/index.ts +26 -0
  49. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/middleware/auth.ts +53 -0
  50. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/middleware/validate.ts +20 -0
  51. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/models/author.ts +14 -0
  52. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/models/book.ts +18 -0
  53. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/routes/authors.ts +40 -0
  54. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/routes/books.ts +38 -0
  55. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/services/bookService.ts +47 -0
  56. agent_belt-0.1.0/examples/fixtures/bookstore-api/src/utils/pagination.ts +31 -0
  57. agent_belt-0.1.0/examples/fixtures/bookstore-api/tests/books.test.ts +34 -0
  58. agent_belt-0.1.0/examples/fixtures/bookstore-api/tests/setup.ts +7 -0
  59. agent_belt-0.1.0/examples/fixtures/bookstore-api/tsconfig.json +16 -0
  60. agent_belt-0.1.0/examples/fixtures/folio/.gitignore +6 -0
  61. agent_belt-0.1.0/examples/fixtures/folio/.mcp.json +8 -0
  62. agent_belt-0.1.0/examples/fixtures/folio/README.md +167 -0
  63. agent_belt-0.1.0/examples/fixtures/folio/data/orders_seed.json +79 -0
  64. agent_belt-0.1.0/examples/fixtures/folio/data/seed.sql +100 -0
  65. agent_belt-0.1.0/examples/fixtures/folio/folio/__init__.py +5 -0
  66. agent_belt-0.1.0/examples/fixtures/folio/folio/api.py +140 -0
  67. agent_belt-0.1.0/examples/fixtures/folio/folio/db.py +293 -0
  68. agent_belt-0.1.0/examples/fixtures/folio/folio/mcp_server.py +173 -0
  69. agent_belt-0.1.0/examples/fixtures/folio/folio/reset.py +44 -0
  70. agent_belt-0.1.0/examples/fixtures/folio/folio/server.py +68 -0
  71. agent_belt-0.1.0/examples/fixtures/folio/folio/tests/__init__.py +1 -0
  72. agent_belt-0.1.0/examples/fixtures/folio/folio/tests/smoke_mcp.py +96 -0
  73. agent_belt-0.1.0/examples/fixtures/folio/pyproject.toml +32 -0
  74. agent_belt-0.1.0/examples/fixtures/folio/tests/__init__.py +1 -0
  75. agent_belt-0.1.0/examples/fixtures/folio/tests/test_api.py +122 -0
  76. agent_belt-0.1.0/examples/fixtures/folio/tests/test_db.py +156 -0
  77. agent_belt-0.1.0/examples/fixtures/folio/tests/test_mcp.py +98 -0
  78. agent_belt-0.1.0/examples/fixtures/programmatic-setup/.gitignore +6 -0
  79. agent_belt-0.1.0/examples/fixtures/programmatic-setup/.mcp.json +9 -0
  80. agent_belt-0.1.0/examples/fixtures/programmatic-setup/mcp_servers/orders_db/README.md +20 -0
  81. agent_belt-0.1.0/examples/fixtures/programmatic-setup/mcp_servers/orders_db/server.py +183 -0
  82. agent_belt-0.1.0/examples/fixtures/sample-project/README.md +6 -0
  83. agent_belt-0.1.0/examples/fixtures/sample-project/pyproject.toml +7 -0
  84. agent_belt-0.1.0/examples/fixtures/sample-project/src/calculator.py +20 -0
  85. agent_belt-0.1.0/examples/fixtures/sample-resource.txt +9 -0
  86. agent_belt-0.1.0/examples/fixtures/tasktracker/README.md +48 -0
  87. agent_belt-0.1.0/examples/fixtures/tasktracker/pyproject.toml +11 -0
  88. agent_belt-0.1.0/examples/fixtures/tasktracker/src/tasktracker/__init__.py +5 -0
  89. agent_belt-0.1.0/examples/fixtures/tasktracker/src/tasktracker/cli.py +75 -0
  90. agent_belt-0.1.0/examples/fixtures/tasktracker/src/tasktracker/formatters.py +34 -0
  91. agent_belt-0.1.0/examples/fixtures/tasktracker/src/tasktracker/models.py +47 -0
  92. agent_belt-0.1.0/examples/fixtures/tasktracker/src/tasktracker/storage.py +32 -0
  93. agent_belt-0.1.0/examples/fixtures/tasktracker/tests/conftest.py +27 -0
  94. agent_belt-0.1.0/examples/fixtures/tasktracker/tests/test_models.py +44 -0
  95. agent_belt-0.1.0/examples/fixtures/tasktracker/tests/test_storage.py +31 -0
  96. agent_belt-0.1.0/examples/fixtures/urlshortener/Dockerfile +15 -0
  97. agent_belt-0.1.0/examples/fixtures/urlshortener/Makefile +12 -0
  98. agent_belt-0.1.0/examples/fixtures/urlshortener/README.md +44 -0
  99. agent_belt-0.1.0/examples/fixtures/urlshortener/cmd/server/main.go +35 -0
  100. agent_belt-0.1.0/examples/fixtures/urlshortener/go.mod +5 -0
  101. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/handler/handler.go +82 -0
  102. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/handler/handler_test.go +86 -0
  103. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/shortener/shortener.go +41 -0
  104. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/shortener/shortener_test.go +50 -0
  105. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/store/memory.go +34 -0
  106. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/store/postgres.go +45 -0
  107. agent_belt-0.1.0/examples/fixtures/urlshortener/internal/store/store.go +10 -0
  108. agent_belt-0.1.0/examples/fixtures/urlshortener/migrations/001_init.sql +10 -0
  109. agent_belt-0.1.0/examples/sandbox-images/Dockerfile.cursor.example +30 -0
  110. agent_belt-0.1.0/examples/sandbox-images/Dockerfile.generic +53 -0
  111. agent_belt-0.1.0/examples/sandbox-images/README.md +141 -0
  112. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/_config.json +7 -0
  113. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l1_explain_architecture.json +16 -0
  114. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l1_security_review.json +17 -0
  115. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l2_fix_pagination.json +21 -0
  116. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l2_fix_sql_injection.json +21 -0
  117. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l3_add_author_tests.json +21 -0
  118. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l3_add_author_validation.json +17 -0
  119. agent_belt-0.1.0/examples/scenarios/experience/bookstore-api-claude/l4_security_audit_and_fix.json +42 -0
  120. agent_belt-0.1.0/examples/scenarios/experience/folio/L10_plugin_skill_inventory.json +44 -0
  121. agent_belt-0.1.0/examples/scenarios/experience/folio/L1_catalog_search.json +42 -0
  122. agent_belt-0.1.0/examples/scenarios/experience/folio/L2_refund_within_policy.json +51 -0
  123. agent_belt-0.1.0/examples/scenarios/experience/folio/L3_store_credit_tier.json +46 -0
  124. agent_belt-0.1.0/examples/scenarios/experience/folio/L4_escalate_out_of_policy.json +49 -0
  125. agent_belt-0.1.0/examples/scenarios/experience/folio/L5_partial_stock_guard.json +41 -0
  126. agent_belt-0.1.0/examples/scenarios/experience/folio/L6_unknown_order_no_hallucination.json +43 -0
  127. agent_belt-0.1.0/examples/scenarios/experience/folio/L7_place_then_refund_multiturn.json +67 -0
  128. agent_belt-0.1.0/examples/scenarios/experience/folio/L8_slash_command_refund.json +42 -0
  129. agent_belt-0.1.0/examples/scenarios/experience/folio/L9_plugin_command_lookup.json +49 -0
  130. agent_belt-0.1.0/examples/scenarios/experience/folio/_config.json +6 -0
  131. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/_config.json +6 -0
  132. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/mcp_get_order.json +32 -0
  133. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/mcp_list_orders.json +33 -0
  134. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/plugin_command.json +41 -0
  135. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/plugin_skill.json +43 -0
  136. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/skill_invoked.json +40 -0
  137. agent_belt-0.1.0/examples/scenarios/experience/programmatic-setup-claude/slash_command.json +37 -0
  138. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/_config.json +7 -0
  139. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l1_explain_architecture.json +17 -0
  140. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l1_find_bug.json +17 -0
  141. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l2_add_completed_at.json +21 -0
  142. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l2_fix_formatter_bug.json +21 -0
  143. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l3_add_delete_command.json +23 -0
  144. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l3_add_json_format.json +22 -0
  145. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-claude/l4_investigate_and_fix.json +43 -0
  146. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-cursor/_config.json +7 -0
  147. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-cursor/l1_explain_architecture.json +17 -0
  148. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-cursor/l1_find_bug.json +17 -0
  149. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-cursor/l2_add_completed_at.json +20 -0
  150. agent_belt-0.1.0/examples/scenarios/experience/tasktracker-cursor/l2_fix_formatter_bug.json +20 -0
  151. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/_config.json +7 -0
  152. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l1_explain_architecture.json +17 -0
  153. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l1_find_concurrency_bug.json +16 -0
  154. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l2_fix_memory_store.json +21 -0
  155. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l2_validate_url.json +21 -0
  156. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l3_fix_dockerfile.json +21 -0
  157. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l3_graceful_shutdown.json +20 -0
  158. agent_belt-0.1.0/examples/scenarios/experience/urlshortener-claude/l4_full_review_and_fix.json +44 -0
  159. agent_belt-0.1.0/examples/scenarios/showcase/README.md +62 -0
  160. agent_belt-0.1.0/examples/scenarios/showcase/agent-capabilities/_config.json +4 -0
  161. agent_belt-0.1.0/examples/scenarios/showcase/agent-capabilities/has_thinking_dry_run.json +16 -0
  162. agent_belt-0.1.0/examples/scenarios/showcase/agent-capabilities/skills_invoked_dry_run.json +15 -0
  163. agent_belt-0.1.0/examples/scenarios/showcase/budgets-latency/_config.json +4 -0
  164. agent_belt-0.1.0/examples/scenarios/showcase/budgets-latency/budgets.json +17 -0
  165. agent_belt-0.1.0/examples/scenarios/showcase/budgets-latency/cost_budget_dry_run.json +15 -0
  166. agent_belt-0.1.0/examples/scenarios/showcase/budgets-latency/latency_slo.json +16 -0
  167. agent_belt-0.1.0/examples/scenarios/showcase/budgets-latency/latency_streaming_dry_run.json +17 -0
  168. agent_belt-0.1.0/examples/scenarios/showcase/correctness/_config.json +4 -0
  169. agent_belt-0.1.0/examples/scenarios/showcase/correctness/correctness_basic.json +16 -0
  170. agent_belt-0.1.0/examples/scenarios/showcase/correctness/llm_scorer_evidence_files.json +18 -0
  171. agent_belt-0.1.0/examples/scenarios/showcase/correctness/llm_scorer_evidence_files.md +15 -0
  172. agent_belt-0.1.0/examples/scenarios/showcase/correctness/llm_scorer_raw_transcript_optin.json +18 -0
  173. agent_belt-0.1.0/examples/scenarios/showcase/correctness/multi_turn_templating.json +31 -0
  174. agent_belt-0.1.0/examples/scenarios/showcase/correctness/multi_turn_with_judge.json +25 -0
  175. agent_belt-0.1.0/examples/scenarios/showcase/correctness/reply_pattern.json +21 -0
  176. agent_belt-0.1.0/examples/scenarios/showcase/correctness/reply_pattern_multiline.json +19 -0
  177. agent_belt-0.1.0/examples/scenarios/showcase/editing-workspace/_config.json +7 -0
  178. agent_belt-0.1.0/examples/scenarios/showcase/editing-workspace/files_modified.json +18 -0
  179. agent_belt-0.1.0/examples/scenarios/showcase/editing-workspace/multi_turn_templating_workspace.json +23 -0
  180. agent_belt-0.1.0/examples/scenarios/showcase/editing-workspace/state_files.json +24 -0
  181. agent_belt-0.1.0/examples/scenarios/showcase/error-types/_config.json +4 -0
  182. agent_belt-0.1.0/examples/scenarios/showcase/error-types/error_type_negative_dry_run.json +15 -0
  183. agent_belt-0.1.0/examples/scenarios/showcase/external-fixture/_config.json +16 -0
  184. agent_belt-0.1.0/examples/scenarios/showcase/external-fixture/external_fixture_demo.json +15 -0
  185. agent_belt-0.1.0/examples/scenarios/showcase/group-config-fields/_config.json +18 -0
  186. agent_belt-0.1.0/examples/scenarios/showcase/group-config-fields/group_config_demo.json +15 -0
  187. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed/_config.json +13 -0
  188. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed/sandboxed_hello.json +15 -0
  189. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed/sandboxed_multi_turn_state.json +24 -0
  190. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed/sandboxed_workspace_write.json +17 -0
  191. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed-offline/_config.json +12 -0
  192. agent_belt-0.1.0/examples/scenarios/showcase/sandboxed-offline/sandboxed_offline_dry_run.json +14 -0
  193. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/_config.json +4 -0
  194. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tool_args.json +18 -0
  195. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tool_result.json +21 -0
  196. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tools_basic.json +16 -0
  197. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tools_forbidden.json +15 -0
  198. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tools_in_order.json +15 -0
  199. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tools_invoked_strict_dry_run.json +13 -0
  200. agent_belt-0.1.0/examples/scenarios/showcase/tool-trajectory/tools_only_used.json +15 -0
  201. agent_belt-0.1.0/examples/scenarios/showcase/verdict-scales/_config.json +41 -0
  202. agent_belt-0.1.0/examples/scenarios/showcase/verdict-scales/verdict_scales_demo.json +15 -0
  203. agent_belt-0.1.0/examples/scorer-config/consensus.yaml +38 -0
  204. agent_belt-0.1.0/examples/scorer-config/custom-dimensions.yaml +47 -0
  205. agent_belt-0.1.0/examples/scorer-config/judges.yaml +58 -0
  206. agent_belt-0.1.0/pyproject.toml +219 -0
  207. agent_belt-0.1.0/requirements.txt +7 -0
  208. agent_belt-0.1.0/scripts/check_design.py +551 -0
  209. agent_belt-0.1.0/scripts/verify_judge_infra_failures.py +261 -0
  210. agent_belt-0.1.0/scripts/verify_judge_preflight.py +192 -0
  211. agent_belt-0.1.0/src/belt/.agents/skills/belt/SKILL.md +326 -0
  212. agent_belt-0.1.0/src/belt/__init__.py +53 -0
  213. agent_belt-0.1.0/src/belt/_bundled.py +104 -0
  214. agent_belt-0.1.0/src/belt/_git.py +120 -0
  215. agent_belt-0.1.0/src/belt/_internal_envvars.py +85 -0
  216. agent_belt-0.1.0/src/belt/_io.py +92 -0
  217. agent_belt-0.1.0/src/belt/_logging.py +74 -0
  218. agent_belt-0.1.0/src/belt/_public_api.py +82 -0
  219. agent_belt-0.1.0/src/belt/_redact.py +355 -0
  220. agent_belt-0.1.0/src/belt/_regex_policy.py +61 -0
  221. agent_belt-0.1.0/src/belt/_safe.py +168 -0
  222. agent_belt-0.1.0/src/belt/_sanitize.py +109 -0
  223. agent_belt-0.1.0/src/belt/_ui.py +52 -0
  224. agent_belt-0.1.0/src/belt/agent/__init__.py +25 -0
  225. agent_belt-0.1.0/src/belt/agent/base.py +863 -0
  226. agent_belt-0.1.0/src/belt/agent/claude_code.py +413 -0
  227. agent_belt-0.1.0/src/belt/agent/codex.py +476 -0
  228. agent_belt-0.1.0/src/belt/agent/copilot.py +503 -0
  229. agent_belt-0.1.0/src/belt/agent/cursor.py +455 -0
  230. agent_belt-0.1.0/src/belt/agent/error_types.py +229 -0
  231. agent_belt-0.1.0/src/belt/agent/gemini.py +329 -0
  232. agent_belt-0.1.0/src/belt/agent/goose.py +448 -0
  233. agent_belt-0.1.0/src/belt/agent/opencode.py +381 -0
  234. agent_belt-0.1.0/src/belt/agent/registry.py +123 -0
  235. agent_belt-0.1.0/src/belt/agent/scoring.py +262 -0
  236. agent_belt-0.1.0/src/belt/aggregator/__init__.py +25 -0
  237. agent_belt-0.1.0/src/belt/aggregator/render_markdown.py +125 -0
  238. agent_belt-0.1.0/src/belt/aggregator/render_terminal.py +466 -0
  239. agent_belt-0.1.0/src/belt/aggregator/stats.py +716 -0
  240. agent_belt-0.1.0/src/belt/aggregator/thresholds.py +209 -0
  241. agent_belt-0.1.0/src/belt/benchmark_card/__init__.py +87 -0
  242. agent_belt-0.1.0/src/belt/benchmark_card/build.py +248 -0
  243. agent_belt-0.1.0/src/belt/benchmark_card/collect.py +246 -0
  244. agent_belt-0.1.0/src/belt/benchmark_card/entities.py +292 -0
  245. agent_belt-0.1.0/src/belt/benchmark_card/io.py +103 -0
  246. agent_belt-0.1.0/src/belt/benchmark_card/render.py +319 -0
  247. agent_belt-0.1.0/src/belt/cli.py +261 -0
  248. agent_belt-0.1.0/src/belt/cli_utils.py +20 -0
  249. agent_belt-0.1.0/src/belt/commands/__init__.py +15 -0
  250. agent_belt-0.1.0/src/belt/commands/aggregate.py +469 -0
  251. agent_belt-0.1.0/src/belt/commands/compare.py +348 -0
  252. agent_belt-0.1.0/src/belt/commands/doctor.py +878 -0
  253. agent_belt-0.1.0/src/belt/commands/eval.py +578 -0
  254. agent_belt-0.1.0/src/belt/commands/export.py +335 -0
  255. agent_belt-0.1.0/src/belt/commands/gc.py +174 -0
  256. agent_belt-0.1.0/src/belt/commands/quickstart.py +173 -0
  257. agent_belt-0.1.0/src/belt/commands/run.py +726 -0
  258. agent_belt-0.1.0/src/belt/commands/score.py +421 -0
  259. agent_belt-0.1.0/src/belt/commands/view.py +654 -0
  260. agent_belt-0.1.0/src/belt/commands/watch.py +565 -0
  261. agent_belt-0.1.0/src/belt/config.py +184 -0
  262. agent_belt-0.1.0/src/belt/constants.py +143 -0
  263. agent_belt-0.1.0/src/belt/entities.py +345 -0
  264. agent_belt-0.1.0/src/belt/envvars.py +258 -0
  265. agent_belt-0.1.0/src/belt/errors.py +62 -0
  266. agent_belt-0.1.0/src/belt/exporter/__init__.py +46 -0
  267. agent_belt-0.1.0/src/belt/exporter/base.py +76 -0
  268. agent_belt-0.1.0/src/belt/exporter/csv.py +163 -0
  269. agent_belt-0.1.0/src/belt/exporter/entities.py +86 -0
  270. agent_belt-0.1.0/src/belt/exporter/helpers.py +93 -0
  271. agent_belt-0.1.0/src/belt/exporter/jsonl.py +66 -0
  272. agent_belt-0.1.0/src/belt/exporter/junit.py +264 -0
  273. agent_belt-0.1.0/src/belt/exporter/markdown.py +115 -0
  274. agent_belt-0.1.0/src/belt/exporter/registry.py +123 -0
  275. agent_belt-0.1.0/src/belt/filter.py +266 -0
  276. agent_belt-0.1.0/src/belt/manifest.py +236 -0
  277. agent_belt-0.1.0/src/belt/parser/__init__.py +5 -0
  278. agent_belt-0.1.0/src/belt/parser/ndjson.py +86 -0
  279. agent_belt-0.1.0/src/belt/parser/scenario.py +66 -0
  280. agent_belt-0.1.0/src/belt/parser/strict.py +258 -0
  281. agent_belt-0.1.0/src/belt/progress.py +780 -0
  282. agent_belt-0.1.0/src/belt/py.typed +0 -0
  283. agent_belt-0.1.0/src/belt/runner/__init__.py +3 -0
  284. agent_belt-0.1.0/src/belt/runner/context.py +221 -0
  285. agent_belt-0.1.0/src/belt/runner/entities.py +52 -0
  286. agent_belt-0.1.0/src/belt/runner/orchestrator.py +663 -0
  287. agent_belt-0.1.0/src/belt/runner/phases/__init__.py +14 -0
  288. agent_belt-0.1.0/src/belt/runner/phases/parse_filter.py +160 -0
  289. agent_belt-0.1.0/src/belt/runner/phases/run_scenarios.py +222 -0
  290. agent_belt-0.1.0/src/belt/runner/phases/setup_groups.py +431 -0
  291. agent_belt-0.1.0/src/belt/runner/phases/teardown.py +35 -0
  292. agent_belt-0.1.0/src/belt/runner/process/__init__.py +19 -0
  293. agent_belt-0.1.0/src/belt/runner/process/spawner.py +136 -0
  294. agent_belt-0.1.0/src/belt/runner/sandbox/__init__.py +24 -0
  295. agent_belt-0.1.0/src/belt/runner/sandbox/base.py +178 -0
  296. agent_belt-0.1.0/src/belt/runner/sandbox/docker.py +281 -0
  297. agent_belt-0.1.0/src/belt/runner/sandbox/host.py +77 -0
  298. agent_belt-0.1.0/src/belt/runner/sandbox/registry.py +85 -0
  299. agent_belt-0.1.0/src/belt/runner/workspace.py +651 -0
  300. agent_belt-0.1.0/src/belt/scenario.py +481 -0
  301. agent_belt-0.1.0/src/belt/schema.py +36 -0
  302. agent_belt-0.1.0/src/belt/scorer/__init__.py +48 -0
  303. agent_belt-0.1.0/src/belt/scorer/base.py +30 -0
  304. agent_belt-0.1.0/src/belt/scorer/display.py +64 -0
  305. agent_belt-0.1.0/src/belt/scorer/dotenv_safety.py +82 -0
  306. agent_belt-0.1.0/src/belt/scorer/dry_run.py +177 -0
  307. agent_belt-0.1.0/src/belt/scorer/entities.py +194 -0
  308. agent_belt-0.1.0/src/belt/scorer/event_sink.py +105 -0
  309. agent_belt-0.1.0/src/belt/scorer/llm/__init__.py +46 -0
  310. agent_belt-0.1.0/src/belt/scorer/llm/backend.py +859 -0
  311. agent_belt-0.1.0/src/belt/scorer/llm/cache.py +169 -0
  312. agent_belt-0.1.0/src/belt/scorer/llm/consensus.py +337 -0
  313. agent_belt-0.1.0/src/belt/scorer/llm/events.py +96 -0
  314. agent_belt-0.1.0/src/belt/scorer/llm/judge_hints.py +137 -0
  315. agent_belt-0.1.0/src/belt/scorer/llm/preflight.py +136 -0
  316. agent_belt-0.1.0/src/belt/scorer/llm/pricing.py +184 -0
  317. agent_belt-0.1.0/src/belt/scorer/llm/pricing.toml +252 -0
  318. agent_belt-0.1.0/src/belt/scorer/llm/scorer.py +818 -0
  319. agent_belt-0.1.0/src/belt/scorer/payloads.py +504 -0
  320. agent_belt-0.1.0/src/belt/scorer/pipeline.py +474 -0
  321. agent_belt-0.1.0/src/belt/scorer/registry.py +115 -0
  322. agent_belt-0.1.0/src/belt/scorer/rules/__init__.py +7 -0
  323. agent_belt-0.1.0/src/belt/scorer/rules/efficiency.py +57 -0
  324. agent_belt-0.1.0/src/belt/scorer/rules/execution.py +47 -0
  325. agent_belt-0.1.0/src/belt/scorer/rules/file_diff.py +84 -0
  326. agent_belt-0.1.0/src/belt/scorer/rules/helpers.py +170 -0
  327. agent_belt-0.1.0/src/belt/scorer/rules/performance.py +47 -0
  328. agent_belt-0.1.0/src/belt/scorer/rules/response.py +48 -0
  329. agent_belt-0.1.0/src/belt/scorer/rules/scorer.py +70 -0
  330. agent_belt-0.1.0/src/belt/scorer/rules/state.py +67 -0
  331. agent_belt-0.1.0/src/belt/scorer/rules/trajectory.py +195 -0
  332. agent_belt-0.1.0/src/belt/scorer/scenario_map.py +206 -0
  333. agent_belt-0.1.0/tests/agent/fixtures/README.md +62 -0
  334. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/claude-code.ndjson +2 -0
  335. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/codex.ndjson +2 -0
  336. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/copilot.ndjson +2 -0
  337. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/cursor.ndjson +2 -0
  338. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/gemini.ndjson +2 -0
  339. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/goose.ndjson +2 -0
  340. agent_belt-0.1.0/tests/agent/fixtures/auth_failure/opencode.ndjson +2 -0
  341. agent_belt-0.1.0/tests/agent/fixtures/codex/resume_state.jsonl +4 -0
  342. agent_belt-0.1.0/tests/agent/fixtures/codex/retry_error.jsonl +9 -0
  343. agent_belt-0.1.0/tests/agent/fixtures/codex/single_turn.jsonl +4 -0
  344. agent_belt-0.1.0/tests/agent/fixtures/codex/tool_use.jsonl +7 -0
  345. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/claude-code.ndjson +2 -0
  346. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/codex.ndjson +2 -0
  347. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/copilot.ndjson +2 -0
  348. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/cursor.ndjson +2 -0
  349. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/gemini.ndjson +2 -0
  350. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/goose.ndjson +2 -0
  351. agent_belt-0.1.0/tests/agent/fixtures/rate_limited/opencode.ndjson +2 -0
  352. agent_belt-0.1.0/tests/agent/fixtures/refused/claude-code.ndjson +2 -0
  353. agent_belt-0.1.0/tests/agent/fixtures/refused/codex.ndjson +2 -0
  354. agent_belt-0.1.0/tests/agent/fixtures/refused/copilot.ndjson +2 -0
  355. agent_belt-0.1.0/tests/agent/fixtures/refused/cursor.ndjson +2 -0
  356. agent_belt-0.1.0/tests/agent/fixtures/refused/gemini.ndjson +2 -0
  357. agent_belt-0.1.0/tests/agent/fixtures/refused/goose.ndjson +2 -0
  358. agent_belt-0.1.0/tests/agent/fixtures/refused/opencode.ndjson +2 -0
  359. agent_belt-0.1.0/tests/agent/fixtures/timeout/claude-code.ndjson +2 -0
  360. agent_belt-0.1.0/tests/agent/fixtures/timeout/codex.ndjson +2 -0
  361. agent_belt-0.1.0/tests/agent/fixtures/timeout/copilot.ndjson +2 -0
  362. agent_belt-0.1.0/tests/agent/fixtures/timeout/cursor.ndjson +2 -0
  363. agent_belt-0.1.0/tests/agent/fixtures/timeout/gemini.ndjson +2 -0
  364. agent_belt-0.1.0/tests/agent/fixtures/timeout/goose.ndjson +2 -0
  365. agent_belt-0.1.0/tests/agent/fixtures/timeout/opencode.ndjson +2 -0
  366. agent_belt-0.1.0/tests/agent/test_agent_parity.py +260 -0
  367. agent_belt-0.1.0/tests/agent/test_base.py +324 -0
  368. agent_belt-0.1.0/tests/agent/test_capabilities.py +92 -0
  369. agent_belt-0.1.0/tests/agent/test_claude_code.py +772 -0
  370. agent_belt-0.1.0/tests/agent/test_codex.py +551 -0
  371. agent_belt-0.1.0/tests/agent/test_copilot.py +731 -0
  372. agent_belt-0.1.0/tests/agent/test_cursor.py +810 -0
  373. agent_belt-0.1.0/tests/agent/test_error_types.py +183 -0
  374. agent_belt-0.1.0/tests/agent/test_gemini.py +439 -0
  375. agent_belt-0.1.0/tests/agent/test_goose.py +270 -0
  376. agent_belt-0.1.0/tests/agent/test_opencode.py +507 -0
  377. agent_belt-0.1.0/tests/agent/test_registry.py +49 -0
  378. agent_belt-0.1.0/tests/agent/test_scoring.py +231 -0
  379. agent_belt-0.1.0/tests/aggregator/__init__.py +1 -0
  380. agent_belt-0.1.0/tests/aggregator/test_agent_errors.py +275 -0
  381. agent_belt-0.1.0/tests/aggregator/test_dry_run_footnote.py +117 -0
  382. agent_belt-0.1.0/tests/aggregator/test_judge_infra_failures.py +284 -0
  383. agent_belt-0.1.0/tests/aggregator/test_render_diet.py +160 -0
  384. agent_belt-0.1.0/tests/aggregator/test_reply_pattern_render_safety.py +108 -0
  385. agent_belt-0.1.0/tests/aggregator/test_task_quality_split.py +346 -0
  386. agent_belt-0.1.0/tests/aggregator/test_verdict_scales_coverage.py +310 -0
  387. agent_belt-0.1.0/tests/benchmark_card/__init__.py +1 -0
  388. agent_belt-0.1.0/tests/benchmark_card/conftest.py +44 -0
  389. agent_belt-0.1.0/tests/benchmark_card/test_agent_errors_block.py +232 -0
  390. agent_belt-0.1.0/tests/benchmark_card/test_build.py +193 -0
  391. agent_belt-0.1.0/tests/benchmark_card/test_collect.py +158 -0
  392. agent_belt-0.1.0/tests/benchmark_card/test_entities.py +33 -0
  393. agent_belt-0.1.0/tests/benchmark_card/test_io.py +31 -0
  394. agent_belt-0.1.0/tests/benchmark_card/test_render.py +128 -0
  395. agent_belt-0.1.0/tests/commands/__init__.py +1 -0
  396. agent_belt-0.1.0/tests/commands/test_aggregate.py +766 -0
  397. agent_belt-0.1.0/tests/commands/test_aggregate_resolve_agent.py +90 -0
  398. agent_belt-0.1.0/tests/commands/test_aggregate_scenarios_skipped.py +59 -0
  399. agent_belt-0.1.0/tests/commands/test_compare.py +284 -0
  400. agent_belt-0.1.0/tests/commands/test_doctor.py +824 -0
  401. agent_belt-0.1.0/tests/commands/test_doctor_auth_hedge.py +92 -0
  402. agent_belt-0.1.0/tests/commands/test_eval.py +531 -0
  403. agent_belt-0.1.0/tests/commands/test_quickstart.py +254 -0
  404. agent_belt-0.1.0/tests/commands/test_run_meta_groups.py +139 -0
  405. agent_belt-0.1.0/tests/commands/test_score.py +294 -0
  406. agent_belt-0.1.0/tests/commands/test_view.py +429 -0
  407. agent_belt-0.1.0/tests/commands/test_watch.py +626 -0
  408. agent_belt-0.1.0/tests/conftest.py +35 -0
  409. agent_belt-0.1.0/tests/exporter/__init__.py +1 -0
  410. agent_belt-0.1.0/tests/exporter/conftest.py +194 -0
  411. agent_belt-0.1.0/tests/exporter/test_base.py +67 -0
  412. agent_belt-0.1.0/tests/exporter/test_command_export.py +204 -0
  413. agent_belt-0.1.0/tests/exporter/test_csv.py +149 -0
  414. agent_belt-0.1.0/tests/exporter/test_entities.py +74 -0
  415. agent_belt-0.1.0/tests/exporter/test_helpers.py +90 -0
  416. agent_belt-0.1.0/tests/exporter/test_jsonl.py +54 -0
  417. agent_belt-0.1.0/tests/exporter/test_junit.py +91 -0
  418. agent_belt-0.1.0/tests/exporter/test_markdown.py +47 -0
  419. agent_belt-0.1.0/tests/exporter/test_registry.py +56 -0
  420. agent_belt-0.1.0/tests/parser/test_scenario.py +514 -0
  421. agent_belt-0.1.0/tests/parser/test_strict.py +441 -0
  422. agent_belt-0.1.0/tests/runner/phases/__init__.py +1 -0
  423. agent_belt-0.1.0/tests/runner/phases/test_parse_filter.py +301 -0
  424. agent_belt-0.1.0/tests/runner/phases/test_run_scenarios.py +143 -0
  425. agent_belt-0.1.0/tests/runner/phases/test_setup_groups.py +515 -0
  426. agent_belt-0.1.0/tests/runner/process/test_spawner.py +74 -0
  427. agent_belt-0.1.0/tests/runner/sandbox/test_docker_e2e.py +645 -0
  428. agent_belt-0.1.0/tests/runner/sandbox/test_docker_provider.py +424 -0
  429. agent_belt-0.1.0/tests/runner/sandbox/test_host_provider.py +108 -0
  430. agent_belt-0.1.0/tests/runner/sandbox/test_registry.py +38 -0
  431. agent_belt-0.1.0/tests/runner/test_create_agent.py +199 -0
  432. agent_belt-0.1.0/tests/runner/test_orchestrator.py +564 -0
  433. agent_belt-0.1.0/tests/runner/test_orchestrator_workspace.py +186 -0
  434. agent_belt-0.1.0/tests/runner/test_render_turn_message.py +128 -0
  435. agent_belt-0.1.0/tests/runner/test_workspace.py +347 -0
  436. agent_belt-0.1.0/tests/runner/test_workspace_e2e.py +226 -0
  437. agent_belt-0.1.0/tests/scorer/llm/test_backend.py +455 -0
  438. agent_belt-0.1.0/tests/scorer/llm/test_cache.py +81 -0
  439. agent_belt-0.1.0/tests/scorer/llm/test_consensus.py +327 -0
  440. agent_belt-0.1.0/tests/scorer/llm/test_consensus_judge_errors.py +139 -0
  441. agent_belt-0.1.0/tests/scorer/llm/test_events.py +99 -0
  442. agent_belt-0.1.0/tests/scorer/llm/test_judge_hints.py +122 -0
  443. agent_belt-0.1.0/tests/scorer/llm/test_judge_infra_classification.py +222 -0
  444. agent_belt-0.1.0/tests/scorer/llm/test_preflight_backends.py +342 -0
  445. agent_belt-0.1.0/tests/scorer/llm/test_preflight_orchestration.py +202 -0
  446. agent_belt-0.1.0/tests/scorer/llm/test_pricing.py +163 -0
  447. agent_belt-0.1.0/tests/scorer/llm/test_scorer.py +1069 -0
  448. agent_belt-0.1.0/tests/scorer/llm/test_streaming.py +256 -0
  449. agent_belt-0.1.0/tests/scorer/rules/test_file_diff.py +164 -0
  450. agent_belt-0.1.0/tests/scorer/rules/test_helpers.py +149 -0
  451. agent_belt-0.1.0/tests/scorer/rules/test_rules.py +779 -0
  452. agent_belt-0.1.0/tests/scorer/rules/test_trajectory.py +152 -0
  453. agent_belt-0.1.0/tests/scorer/test_dimension_resolution.py +328 -0
  454. agent_belt-0.1.0/tests/scorer/test_dry_run.py +234 -0
  455. agent_belt-0.1.0/tests/scorer/test_payloads.py +190 -0
  456. agent_belt-0.1.0/tests/scorer/test_pipeline_judge_errored.py +197 -0
  457. agent_belt-0.1.0/tests/scorer/test_regex_policy.py +77 -0
  458. agent_belt-0.1.0/tests/scorer/test_registry.py +106 -0
  459. agent_belt-0.1.0/tests/scorer/test_validate_scorers_preflight.py +139 -0
  460. agent_belt-0.1.0/tests/test_agent_doc_parity.py +80 -0
  461. agent_belt-0.1.0/tests/test_bundled.py +148 -0
  462. agent_belt-0.1.0/tests/test_check_design.py +382 -0
  463. agent_belt-0.1.0/tests/test_cli.py +101 -0
  464. agent_belt-0.1.0/tests/test_cli_doc_parity.py +123 -0
  465. agent_belt-0.1.0/tests/test_cli_order.py +163 -0
  466. agent_belt-0.1.0/tests/test_cli_smoke.py +208 -0
  467. agent_belt-0.1.0/tests/test_cli_utils.py +46 -0
  468. agent_belt-0.1.0/tests/test_config.py +221 -0
  469. agent_belt-0.1.0/tests/test_dependency_groups_match.py +38 -0
  470. agent_belt-0.1.0/tests/test_doctor_ollama_trailing.py +58 -0
  471. agent_belt-0.1.0/tests/test_doctor_providers.py +113 -0
  472. agent_belt-0.1.0/tests/test_e2e_agent_errors.py +259 -0
  473. agent_belt-0.1.0/tests/test_e2e_scenarios_skipped.py +130 -0
  474. agent_belt-0.1.0/tests/test_entities.py +409 -0
  475. agent_belt-0.1.0/tests/test_envvars.py +418 -0
  476. agent_belt-0.1.0/tests/test_error_types_doc_parity.py +134 -0
  477. agent_belt-0.1.0/tests/test_errors.py +75 -0
  478. agent_belt-0.1.0/tests/test_example_model_centralization.py +120 -0
  479. agent_belt-0.1.0/tests/test_example_scenarios.py +32 -0
  480. agent_belt-0.1.0/tests/test_exporter_doc_parity.py +39 -0
  481. agent_belt-0.1.0/tests/test_filter.py +244 -0
  482. agent_belt-0.1.0/tests/test_fixture_resources.py +582 -0
  483. agent_belt-0.1.0/tests/test_git.py +122 -0
  484. agent_belt-0.1.0/tests/test_integration_flow.py +335 -0
  485. agent_belt-0.1.0/tests/test_io.py +80 -0
  486. agent_belt-0.1.0/tests/test_live_progress.py +207 -0
  487. agent_belt-0.1.0/tests/test_logging.py +81 -0
  488. agent_belt-0.1.0/tests/test_manifest.py +307 -0
  489. agent_belt-0.1.0/tests/test_no_inline_user_regex.py +159 -0
  490. agent_belt-0.1.0/tests/test_no_personal_paths.py +82 -0
  491. agent_belt-0.1.0/tests/test_oss_readiness_regressions.py +178 -0
  492. agent_belt-0.1.0/tests/test_outcomes_dir.py +300 -0
  493. agent_belt-0.1.0/tests/test_packaging.py +75 -0
  494. agent_belt-0.1.0/tests/test_preflight_judge_model.py +140 -0
  495. agent_belt-0.1.0/tests/test_progress.py +136 -0
  496. agent_belt-0.1.0/tests/test_provider_doc_parity.py +58 -0
  497. agent_belt-0.1.0/tests/test_public_api.py +85 -0
  498. agent_belt-0.1.0/tests/test_redact.py +290 -0
  499. agent_belt-0.1.0/tests/test_run_summary_agent_errors.py +75 -0
  500. agent_belt-0.1.0/tests/test_runner/__init__.py +1 -0
  501. agent_belt-0.1.0/tests/test_runner/test_filter.py +218 -0
  502. agent_belt-0.1.0/tests/test_sanitize.py +103 -0
  503. agent_belt-0.1.0/tests/test_scenario_round_trip.py +84 -0
  504. agent_belt-0.1.0/tests/test_scenarios_layout.py +187 -0
  505. agent_belt-0.1.0/tests/test_schema_version.py +256 -0
  506. agent_belt-0.1.0/tests/test_security.py +2773 -0
  507. agent_belt-0.1.0/tests/test_setup_errors_sidecar.py +122 -0
  508. agent_belt-0.1.0/tests/test_showcase_scoring_modes.py +92 -0
  509. agent_belt-0.1.0/tests/test_skill_md.py +138 -0
  510. agent_belt-0.1.0/tests/test_stream_event_matrix.py +474 -0
  511. agent_belt-0.1.0/tests/test_subcommand_prog.py +81 -0
  512. agent_belt-0.1.0/tests/test_templating_parity.py +83 -0
  513. agent_belt-0.1.0/tests/test_threshold_enforcement.py +228 -0
  514. agent_belt-0.1.0/tests/test_ui_pluralize.py +33 -0
  515. agent_belt-0.1.0/tests/test_ui_streams.py +109 -0
  516. agent_belt-0.1.0/tests/test_watch_system_init.py +50 -0
  517. agent_belt-0.1.0/uv.lock +713 -0
@@ -0,0 +1,11 @@
1
+ - params:
2
+ git:
3
+ repoName: agent-belt
4
+ branches:
5
+ - main
6
+ scan:
7
+ projects:
8
+ - workingDirs:
9
+ - "."
10
+ excludePatterns:
11
+ - "examples/**"
@@ -0,0 +1,39 @@
1
+ name: Bug Report
2
+ description: Report a bug in agent-belt
3
+ labels: ["bug"]
4
+ body:
5
+ - type: textarea
6
+ id: description
7
+ attributes:
8
+ label: Description
9
+ description: What happened? What did you expect?
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: reproduce
15
+ attributes:
16
+ label: Steps to Reproduce
17
+ description: Minimal command or code to trigger the bug
18
+ placeholder: |
19
+ ```bash
20
+ belt eval examples/scenarios/ --modes rules
21
+ ```
22
+
23
+ - type: textarea
24
+ id: environment
25
+ attributes:
26
+ label: Environment
27
+ description: OS, Python version, belt version, agent
28
+ placeholder: |
29
+ - OS: macOS 15
30
+ - Python: 3.13.0
31
+ - agent-belt: 0.1.0
32
+ - Agent: claude-code
33
+
34
+ - type: textarea
35
+ id: logs
36
+ attributes:
37
+ label: Relevant Output
38
+ description: Error messages, tracebacks, or score.json content
39
+ render: shell
@@ -0,0 +1 @@
1
+ blank_issues_enabled: false
@@ -0,0 +1,23 @@
1
+ name: Feature Request
2
+ description: Suggest an enhancement or new capability
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: Problem or Use Case
9
+ description: What are you trying to do that belt doesn't support?
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: proposal
15
+ attributes:
16
+ label: Proposed Solution
17
+ description: How would you like this to work?
18
+
19
+ - type: textarea
20
+ id: alternatives
21
+ attributes:
22
+ label: Alternatives Considered
23
+ description: Any workarounds you've tried or other approaches you've considered
@@ -0,0 +1,45 @@
1
+ name: New Agent Request
2
+ description: Request support for a new CLI agent
3
+ labels: ["new feature", "agent"]
4
+ body:
5
+ - type: input
6
+ id: agent_name
7
+ attributes:
8
+ label: Agent Name
9
+ description: Name of the CLI agent (e.g., "aider", "continue")
10
+ validations:
11
+ required: true
12
+
13
+ - type: input
14
+ id: cli_command
15
+ attributes:
16
+ label: CLI Command
17
+ description: How is the agent invoked? (e.g., "aider --message")
18
+ validations:
19
+ required: true
20
+
21
+ - type: dropdown
22
+ id: output_format
23
+ attributes:
24
+ label: Output Format
25
+ options:
26
+ - Plain text
27
+ - JSON
28
+ - NDJSON (streaming)
29
+ - Other
30
+
31
+ - type: checkboxes
32
+ id: capabilities
33
+ attributes:
34
+ label: Capabilities
35
+ options:
36
+ - label: Multi-turn sessions (resume/conversation ID)
37
+ - label: Structured tool call output
38
+ - label: Streaming events
39
+ - label: Non-interactive/headless mode
40
+
41
+ - type: textarea
42
+ id: notes
43
+ attributes:
44
+ label: Additional Context
45
+ description: Links to docs, CLI reference, or output examples
@@ -0,0 +1,22 @@
1
+ ## Summary
2
+
3
+ <!-- What does this PR do? 1-3 sentences. -->
4
+
5
+ ## Changes
6
+
7
+ <!-- Key changes, one bullet per logical change. -->
8
+
9
+ -
10
+
11
+ ## Testing
12
+
13
+ <!-- How was this tested? Include commands run, test output, or scenario results. -->
14
+
15
+ - [ ] `make check` passes (lint + test)
16
+ - [ ] New/changed code has tests
17
+ - [ ] Scenarios run successfully (if agent/scorer/scenario changes)
18
+
19
+ ## Label
20
+
21
+ <!-- Every PR needs exactly one release label. Pick one: -->
22
+ <!-- new feature | improvement | bug | breaking change | ignore for release -->
@@ -0,0 +1,29 @@
1
+ # GitHub Release notes configuration. Used by ``gh release create
2
+ # --generate-notes`` (see ``.github/workflows/release.yml``) to categorise
3
+ # merged PRs by label.
4
+ #
5
+ # Category titles and labels track the JFrog OSS reference set used by
6
+ # jfrog-cli, frogbot, artifactory-client-java, and jfrog-client-go. Keep
7
+ # the wording and emoji identical so the GitHub Releases page for this
8
+ # project reads in the same voice as the rest of the JFrog OSS family.
9
+
10
+ changelog:
11
+ exclude:
12
+ labels:
13
+ - "ignore for release"
14
+ categories:
15
+ - title: Breaking Changes 🚨
16
+ labels:
17
+ - "breaking change"
18
+ - title: Exciting New Features 🎉
19
+ labels:
20
+ - "new feature"
21
+ - title: Improvements 🌱
22
+ labels:
23
+ - "improvement"
24
+ - title: Bug Fixes 🛠
25
+ labels:
26
+ - "bug"
27
+ - title: Other Changes 📚
28
+ labels:
29
+ - "*"
@@ -0,0 +1,29 @@
1
+ name: "CLA Assistant"
2
+
3
+ on:
4
+ # issue_comment triggers this action on each comment on issues and pull requests
5
+ issue_comment:
6
+ types: [created]
7
+ pull_request_target:
8
+ types: [opened, synchronize]
9
+ branches:
10
+ - main
11
+
12
+ # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
13
+ permissions:
14
+ actions: write
15
+ contents: write # this can be 'read' if the signatures are in remote repository
16
+ pull-requests: write
17
+ statuses: write
18
+
19
+ jobs:
20
+ CLAssistant:
21
+ runs-on: ubuntu-latest
22
+ steps:
23
+ - name: Run CLA Check
24
+ uses: jfrog/.github/actions/cla@main
25
+ with:
26
+ event_comment_body: ${{ github.event.comment.body }}
27
+ event_name: ${{ github.event_name }}
28
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
29
+ CLA_SIGN_TOKEN: ${{ secrets.CLA_SIGN_TOKEN }}
@@ -0,0 +1,35 @@
1
+ name: "Frogbot Scan Pull Request"
2
+
3
+ on:
4
+ pull_request_target:
5
+ types: [opened, synchronize]
6
+ branches:
7
+ - "main"
8
+
9
+ permissions:
10
+ pull-requests: write
11
+ contents: read
12
+
13
+ jobs:
14
+ scan-pull-request:
15
+ if: ${{ github.actor != 'dependabot[bot]' }}
16
+ runs-on: ubuntu-latest
17
+ environment: frogbot
18
+ steps:
19
+ - name: Checkout the repository
20
+ uses: actions/checkout@v6
21
+ with:
22
+ ref: ${{ github.event.pull_request.head.sha || github.ref }}
23
+
24
+ - name: Set up Python
25
+ uses: actions/setup-python@v6
26
+ with:
27
+ python-version: "3.13"
28
+
29
+ - uses: jfrog/frogbot@v2
30
+ env:
31
+ JF_URL: ${{ secrets.FROGBOT_URL }}
32
+ JF_ACCESS_TOKEN: ${{ secrets.FROGBOT_ACCESS_TOKEN }}
33
+ JF_GIT_TOKEN: ${{ secrets.GITHUB_TOKEN }}
34
+ JF_FAIL: "TRUE"
35
+ JF_MIN_SEVERITY: "High"
@@ -0,0 +1,258 @@
1
+ name: Release
2
+
3
+ # Triggered ONLY on annotated/lightweight tags matching ``v*``. Manual,
4
+ # deliberate, never auto-publish-on-merge.
5
+ #
6
+ # Tag conventions:
7
+ # v0.1.0 -> production PyPI + GitHub Release
8
+ # v0.1.0-rc1 -> TestPyPI only (rehearsal)
9
+ # v0.1.0-alpha.1 -> TestPyPI only (rehearsal)
10
+ #
11
+ # The build job runs before any publish job, with NO secrets, and produces the
12
+ # wheel + sdist as a GitHub artifact. Each publish job downloads that artifact
13
+ # and never sees the source.
14
+
15
+ on:
16
+ push:
17
+ tags:
18
+ - "v*"
19
+
20
+ permissions:
21
+ contents: read
22
+
23
+ jobs:
24
+ classify-tag:
25
+ name: Classify tag (rehearsal vs. real release)
26
+ runs-on: ubuntu-latest
27
+ outputs:
28
+ version: ${{ steps.parse.outputs.version }}
29
+ is_prerelease: ${{ steps.parse.outputs.is_prerelease }}
30
+ steps:
31
+ - name: Parse the tag
32
+ id: parse
33
+ run: |
34
+ TAG="${GITHUB_REF#refs/tags/}"
35
+ VERSION="${TAG#v}"
36
+ echo "version=$VERSION" >> "$GITHUB_OUTPUT"
37
+ # PEP 440 pre-release / dev-release markers - any of these ship to
38
+ # TestPyPI only. We accept both spellings:
39
+ # dash-separated: v0.1.0-rc1, v0.1.0-alpha.1, v0.1.0-beta1, v0.1.0-dev0
40
+ # PEP 440 canonical: v0.1.0rc1, v0.1.0a1, v0.1.0b1, v0.1.0.dev0
41
+ # A naive ``*-*`` check would accept the dash-form only and silently
42
+ # publish ``v0.1.0rc1`` to production PyPI - real foot-gun.
43
+ if [[ "$TAG" =~ -|[0-9](rc|a|b|alpha|beta)[0-9]|\.dev[0-9] ]]; then
44
+ echo "is_prerelease=true" >> "$GITHUB_OUTPUT"
45
+ echo "Pre-release tag - rehearsal flow (TestPyPI only)"
46
+ else
47
+ echo "is_prerelease=false" >> "$GITHUB_OUTPUT"
48
+ echo "Release tag - full publish flow"
49
+ fi
50
+
51
+ build:
52
+ name: Build wheel + sdist (no secrets)
53
+ needs: classify-tag
54
+ runs-on: ubuntu-latest
55
+ permissions:
56
+ contents: read
57
+ # Required for ``actions/attest-build-provenance`` below.
58
+ id-token: write
59
+ attestations: write
60
+
61
+ steps:
62
+ - uses: actions/checkout@v4
63
+ with:
64
+ # ``hatch-vcs`` derives the package version from git metadata.
65
+ fetch-depth: 0
66
+
67
+ - name: Set up uv
68
+ uses: astral-sh/setup-uv@v6
69
+ with:
70
+ # Pinned for release reproducibility - the build that produces the
71
+ # wheel must not move underneath us between runs. Bump deliberately
72
+ # in a separate PR after verifying the new uv minor on a rehearsal.
73
+ version: "0.11"
74
+ enable-cache: true
75
+
76
+ - name: Set up Python
77
+ run: uv python install
78
+
79
+ - name: Sanity check - tag matches ``hatch-vcs`` resolution
80
+ # Catches the rare case where a tag was created on a non-main commit
81
+ # by hand and ``hatch-vcs`` resolves to something else (e.g. a stale
82
+ # cached version). Hard-fail before we publish anything.
83
+ env:
84
+ # Override hatch-vcs's ``git describe`` resolution with the version
85
+ # parsed from the tag that actually triggered this workflow. When
86
+ # multiple ``v*`` tags point at the same commit (typical: an rc tag
87
+ # rehearsed in §2.8 followed by a strict tag cut from the same SHA),
88
+ # ``git describe`` resolution is non-deterministic across git and
89
+ # hatch-vcs versions, and the build can pick the rc, producing a
90
+ # wheel whose version fails the assertion below.
91
+ #
92
+ # The global ``SETUPTOOLS_SCM_PRETEND_VERSION`` is used rather than
93
+ # the per-project ``SETUPTOOLS_SCM_PRETEND_VERSION_FOR_AGENT_BELT``
94
+ # because hatch-vcs invokes ``setuptools_scm.get_version()`` without
95
+ # passing ``dist_name``, so the per-project env-var lookup never
96
+ # fires. The global form is the canonical CI escape hatch documented
97
+ # by setuptools-scm and used by ruff, uv, and hatch's own release
98
+ # workflows.
99
+ #
100
+ # The wheel-version assertion below stays as defense in depth: it
101
+ # now validates the build did what we told it to, rather than
102
+ # arbitrating between competing tag resolutions.
103
+ SETUPTOOLS_SCM_PRETEND_VERSION: ${{ needs.classify-tag.outputs.version }}
104
+ run: |
105
+ uv sync --locked
106
+ uv build
107
+ # Parse wheel filename via ``packaging.utils`` rather than ``ls | sed``:
108
+ # robust to multiple wheels, locale-dependent ``ls`` ordering, and PEP
109
+ # 425 build tags. Asserts exactly one wheel landed in ``dist/``.
110
+ WHEEL_VERSION=$(uv run python -c "
111
+ from packaging.utils import parse_wheel_filename
112
+ from pathlib import Path
113
+ wheels = list(Path('dist').glob('*.whl'))
114
+ assert len(wheels) == 1, f'expected exactly one wheel, found: {wheels}'
115
+ print(parse_wheel_filename(wheels[0].name)[1])
116
+ ")
117
+ echo "Tag version : ${{ needs.classify-tag.outputs.version }}"
118
+ echo "Wheel version: $WHEEL_VERSION"
119
+ # Allow PEP 440 normalisation (e.g. tag ``v0.1.0-rc1`` -> wheel ``0.1.0rc1``,
120
+ # ``v0.1.0-alpha.1`` -> wheel ``0.1.0a1``) but block silent mismatches.
121
+ if [[ "$WHEEL_VERSION" != "${{ needs.classify-tag.outputs.version }}" \
122
+ && "$WHEEL_VERSION" != "$(uv run python -c "from packaging.version import Version; print(Version('${{ needs.classify-tag.outputs.version }}'))")" ]]; then
123
+ echo "::error::Wheel version $WHEEL_VERSION does not match tag ${{ needs.classify-tag.outputs.version }}"
124
+ exit 1
125
+ fi
126
+
127
+ - name: Verify wheel installs and smoke-tests pass
128
+ run: make verify-wheel
129
+
130
+ - name: Generate build provenance attestation
131
+ # Sigstore-signed SLSA build provenance attestation tying the artifact hashes
132
+ # to this workflow run, source SHA, and runner identity. Stored in GitHub's
133
+ # attestation store (on its own this reaches SLSA v1.0 Build Level 2, not L3).
134
+ uses: actions/attest-build-provenance@v2
135
+ with:
136
+ subject-path: dist/*
137
+
138
+ - name: Upload built distributions
139
+ uses: actions/upload-artifact@v4
140
+ with:
141
+ name: dist
142
+ path: dist/
143
+ retention-days: 14
144
+ if-no-files-found: error
145
+
146
+ publish-testpypi:
147
+ name: Publish to TestPyPI (rehearsal)
148
+ needs: [classify-tag, build]
149
+ # Mirrors the repo guard on ``publish-pypi``: a fork that pushes a
150
+ # pre-release tag should not even attempt to upload (the Trusted Publisher
151
+ # binding would reject it, but defense-in-depth lives in the workflow too).
152
+ # Pre-release (rc / alpha / beta / dev) tags from ``jfrog/agent-belt``
153
+ # publish to TestPyPI for rehearsal before a strict tag goes to PyPI.
154
+ if: needs.classify-tag.outputs.is_prerelease == 'true' && github.repository == 'jfrog/agent-belt'
155
+ runs-on: ubuntu-latest
156
+ environment:
157
+ name: testpypi
158
+ url: https://test.pypi.org/p/agent-belt
159
+ permissions:
160
+ # OIDC token for Trusted Publisher - no PyPI API token required.
161
+ id-token: write
162
+
163
+ steps:
164
+ - name: Download built distributions
165
+ uses: actions/download-artifact@v4
166
+ with:
167
+ name: dist
168
+ path: dist/
169
+
170
+ - name: Publish to TestPyPI
171
+ # v1.12+ ships a twine that understands Metadata-Version 2.4, which is
172
+ # what current ``hatchling`` emits (PEP 639 license metadata). v1.11
173
+ # rejects the wheel before upload with "Metadata is missing required
174
+ # fields: Name, Version" - misleading symptom, real cause is the
175
+ # metadata version. v1.12+ also keeps PEP 740 attestation support.
176
+ uses: pypa/gh-action-pypi-publish@v1.14.0
177
+ with:
178
+ repository-url: https://test.pypi.org/legacy/
179
+ # TestPyPI rejects identical version overwrites; ``skip-existing``
180
+ # makes a re-run of the same rc tag idempotent (e.g. after a manual
181
+ # workflow restart) instead of failing the whole release flow.
182
+ skip-existing: true
183
+
184
+ publish-pypi:
185
+ name: Publish to PyPI (Trusted Publisher)
186
+ needs: [classify-tag, build]
187
+ # Pinned to this repo so a fork running release.yml cannot publish under
188
+ # our PyPI name. The PyPI Trusted Publisher binding is the second layer.
189
+ if: needs.classify-tag.outputs.is_prerelease == 'false' && github.repository == 'jfrog/agent-belt'
190
+ runs-on: ubuntu-latest
191
+ environment:
192
+ name: pypi
193
+ url: https://pypi.org/p/agent-belt
194
+ permissions:
195
+ id-token: write
196
+
197
+ steps:
198
+ - name: Download built distributions
199
+ uses: actions/download-artifact@v4
200
+ with:
201
+ name: dist
202
+ path: dist/
203
+
204
+ - name: Publish to PyPI
205
+ uses: pypa/gh-action-pypi-publish@v1.14.0
206
+
207
+ github-release:
208
+ name: GitHub Release (notes + provenance)
209
+ needs: [classify-tag, build, publish-pypi]
210
+ if: needs.classify-tag.outputs.is_prerelease == 'false' && github.repository == 'jfrog/agent-belt'
211
+ runs-on: ubuntu-latest
212
+ permissions:
213
+ contents: write
214
+
215
+ steps:
216
+ - uses: actions/checkout@v4
217
+
218
+ - name: Download built distributions
219
+ uses: actions/download-artifact@v4
220
+ with:
221
+ name: dist
222
+ path: dist/
223
+
224
+ - name: Create GitHub Release with auto-generated notes
225
+ env:
226
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
227
+ run: |
228
+ gh release create "v${{ needs.classify-tag.outputs.version }}" \
229
+ --title "v${{ needs.classify-tag.outputs.version }}" \
230
+ --generate-notes \
231
+ dist/*
232
+
233
+ - name: Summary
234
+ if: always()
235
+ run: |
236
+ {
237
+ echo "## Release Summary"
238
+ echo "- **Version:** ${{ needs.classify-tag.outputs.version }}"
239
+ echo "- **Tag:** v${{ needs.classify-tag.outputs.version }}"
240
+ echo "- **PyPI:** https://pypi.org/project/agent-belt/${{ needs.classify-tag.outputs.version }}/"
241
+ echo "- **GitHub Release:** https://github.com/jfrog/agent-belt/releases/tag/v${{ needs.classify-tag.outputs.version }}"
242
+ echo "- **Provenance:** Sigstore-signed SLSA build provenance attached to the wheel"
243
+ } >> "$GITHUB_STEP_SUMMARY"
244
+
245
+ notice-build-only:
246
+ # Surfaces the situation when ``classify-tag`` + ``build`` succeed but
247
+ # every publish job is skipped by a repo guard. Without this notice the
248
+ # run shows as a green success with no surfaced explanation -
249
+ # operationally confusing. Triggers on any tag (strict or pre-release)
250
+ # pushed from a repo other than ``jfrog/agent-belt`` (typically a fork).
251
+ name: Notice (build-only on this repo)
252
+ needs: [classify-tag, build]
253
+ if: github.repository != 'jfrog/agent-belt'
254
+ runs-on: ubuntu-latest
255
+ steps:
256
+ - name: Print notice
257
+ run: |
258
+ echo "::notice::Tag v${{ needs.classify-tag.outputs.version }} built on ${{ github.repository }}, but this repo has no publish path configured. Strict release tags publish from jfrog/agent-belt to PyPI; pre-release tags (rc, alpha, beta, dev) publish from jfrog/agent-belt to TestPyPI."
@@ -0,0 +1,33 @@
1
+ # Strip the ``safe to test`` label as soon as it is applied. Pairs with any
2
+ # integration / live-credential workflow that gates on
3
+ # ``contains(...labels...'safe to test')`` - the next push will then need a
4
+ # fresh maintainer approval, so the label is single-use per review action.
5
+ # Reference impl: jfrog/jfrog-cli ``.github/workflows/removeLabel.yml``.
6
+ name: Remove Label
7
+
8
+ on:
9
+ pull_request_target:
10
+ types: [labeled]
11
+
12
+ # Cancel superseded runs (e.g., maintainer toggling the label twice quickly)
13
+ # so we never end up with two concurrent strip attempts.
14
+ concurrency:
15
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.ref }}
16
+ cancel-in-progress: true
17
+
18
+ permissions:
19
+ pull-requests: write
20
+
21
+ jobs:
22
+ Remove-Label:
23
+ if: contains(github.event.pull_request.labels.*.name, 'safe to test')
24
+ name: Remove 'safe to test'
25
+ runs-on: ubuntu-latest
26
+ steps:
27
+ - name: Remove 'safe to test'
28
+ # Pinned to the commit SHA of v1.3.0 (mutable tags are a known supply-chain
29
+ # risk on ``pull_request_target``, which runs in the base-repo context with
30
+ # write permissions). Update by re-resolving the desired tag to its commit.
31
+ uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0 # v1.3.0
32
+ with:
33
+ labels: "safe to test"