tokenjam-bench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. tokenjam_bench-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
  2. tokenjam_bench-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  3. tokenjam_bench-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +21 -0
  4. tokenjam_bench-0.1.0/.github/workflows/benchmark.yml +56 -0
  5. tokenjam_bench-0.1.0/.github/workflows/ci.yml +35 -0
  6. tokenjam_bench-0.1.0/.github/workflows/publish-pypi.yml +34 -0
  7. tokenjam_bench-0.1.0/.gitignore +32 -0
  8. tokenjam_bench-0.1.0/Assets/ChatGPT Image Jun 25, 2026, 05_19_19 PM.png +0 -0
  9. tokenjam_bench-0.1.0/Assets/ChatGPT Image Jun 25, 2026, 05_31_49 PM.png +0 -0
  10. tokenjam_bench-0.1.0/Assets/logo_mark.png +0 -0
  11. tokenjam_bench-0.1.0/CHANGELOG.md +81 -0
  12. tokenjam_bench-0.1.0/CONTRIBUTING.md +63 -0
  13. tokenjam_bench-0.1.0/LICENSE +22 -0
  14. tokenjam_bench-0.1.0/Makefile +37 -0
  15. tokenjam_bench-0.1.0/NOTICE +7 -0
  16. tokenjam_bench-0.1.0/PKG-INFO +410 -0
  17. tokenjam_bench-0.1.0/README.md +385 -0
  18. tokenjam_bench-0.1.0/datasets/customer_support/README.md +51 -0
  19. tokenjam_bench-0.1.0/datasets/customer_support/tickets.json +167 -0
  20. tokenjam_bench-0.1.0/datasets/email/tasks.json +115 -0
  21. tokenjam_bench-0.1.0/datasets/rag/qa.json +127 -0
  22. tokenjam_bench-0.1.0/datasets/research/tasks.json +148 -0
  23. tokenjam_bench-0.1.0/demo/seed_demo.py +350 -0
  24. tokenjam_bench-0.1.0/docs/README.md +46 -0
  25. tokenjam_bench-0.1.0/docs/agents.md +289 -0
  26. tokenjam_bench-0.1.0/docs/analytics.md +60 -0
  27. tokenjam_bench-0.1.0/docs/api-reference.md +468 -0
  28. tokenjam_bench-0.1.0/docs/architecture.md +221 -0
  29. tokenjam_bench-0.1.0/docs/benchmarks.md +222 -0
  30. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-banner.png +0 -0
  31. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-banner.svg +29 -0
  32. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-icon.png +0 -0
  33. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-icon.svg +22 -0
  34. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench.png +0 -0
  35. tokenjam_bench-0.1.0/docs/brand/tokenjam-bench.svg +24 -0
  36. tokenjam_bench-0.1.0/docs/ci.md +61 -0
  37. tokenjam_bench-0.1.0/docs/cli-reference.md +233 -0
  38. tokenjam_bench-0.1.0/docs/cost-pricing.md +93 -0
  39. tokenjam_bench-0.1.0/docs/development.md +97 -0
  40. tokenjam_bench-0.1.0/docs/evidence/2026-06-26-deepseek-live.md +97 -0
  41. tokenjam_bench-0.1.0/docs/evidence/2026-06-26-real-workflow-dashboard.md +48 -0
  42. tokenjam_bench-0.1.0/docs/evidence/README.md +67 -0
  43. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_customer-support_tj0.5.1_1782480389.html +57 -0
  44. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_customer-support_tj0.5.1_1782480389.json +195 -0
  45. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_email-assistant_tj0.5.1_1782480651.html +57 -0
  46. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_email-assistant_tj0.5.1_1782480651.json +195 -0
  47. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_enterprise-rag_tj0.5.1_1782480507.html +57 -0
  48. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_enterprise-rag_tj0.5.1_1782480507.json +195 -0
  49. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_gsm8k_tj0.5.1_1782480094.html +57 -0
  50. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_gsm8k_tj0.5.1_1782480094.json +195 -0
  51. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_humaneval_tj0.5.1_1782480162.html +57 -0
  52. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_humaneval_tj0.5.1_1782480162.json +195 -0
  53. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_judged_tj0.5.1_1782480215.html +57 -0
  54. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_judged_tj0.5.1_1782480215.json +111 -0
  55. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_research-assistant_tj0.5.1_1782480799.html +57 -0
  56. tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_research-assistant_tj0.5.1_1782480799.json +195 -0
  57. tokenjam_bench-0.1.0/docs/evidence/archive/README.md +21 -0
  58. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_gsm8k_tj0.5.1_1782443915.html +45 -0
  59. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_gsm8k_tj0.5.1_1782443915.json +195 -0
  60. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_humaneval_tj0.5.1_1782443963.html +45 -0
  61. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_humaneval_tj0.5.1_1782443963.json +171 -0
  62. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_judged_tj0.5.1_1782444012.html +45 -0
  63. tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_judged_tj0.5.1_1782444012.json +111 -0
  64. tokenjam_bench-0.1.0/docs/evidence/humaneval_deepseek_reasoner_to_chat.html +45 -0
  65. tokenjam_bench-0.1.0/docs/evidence/humaneval_deepseek_reasoner_to_chat.json +291 -0
  66. tokenjam_bench-0.1.0/docs/evidence/judged_deepseek_correctness.html +45 -0
  67. tokenjam_bench-0.1.0/docs/evidence/judged_deepseek_correctness.json +111 -0
  68. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/README.md +144 -0
  69. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506002.html +57 -0
  70. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506002.json +195 -0
  71. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506688.html +57 -0
  72. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506688.json +195 -0
  73. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506301.html +57 -0
  74. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506301.json +195 -0
  75. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506913.html +57 -0
  76. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506913.json +195 -0
  77. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506164.html +57 -0
  78. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506164.json +195 -0
  79. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506788.html +57 -0
  80. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506788.json +195 -0
  81. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782503878.html +57 -0
  82. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782503878.json +651 -0
  83. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504203.html +57 -0
  84. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504203.json +651 -0
  85. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504371.html +57 -0
  86. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504371.json +651 -0
  87. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504768.html +57 -0
  88. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504768.json +651 -0
  89. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504077.html +57 -0
  90. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504077.json +651 -0
  91. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504400.html +57 -0
  92. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504400.json +651 -0
  93. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504580.html +57 -0
  94. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504580.json +651 -0
  95. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782505333.html +57 -0
  96. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782505333.json +651 -0
  97. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782505847.html +57 -0
  98. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782505847.json +111 -0
  99. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782506545.html +57 -0
  100. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782506545.json +111 -0
  101. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782506498.html +57 -0
  102. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782506498.json +195 -0
  103. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782507084.html +57 -0
  104. tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782507084.json +195 -0
  105. tokenjam_bench-0.1.0/docs/faq.md +139 -0
  106. tokenjam_bench-0.1.0/docs/history.md +72 -0
  107. tokenjam_bench-0.1.0/docs/models.md +190 -0
  108. tokenjam_bench-0.1.0/docs/overview.md +70 -0
  109. tokenjam_bench-0.1.0/docs/pipelines.md +209 -0
  110. tokenjam_bench-0.1.0/docs/proof-runbook.md +138 -0
  111. tokenjam_bench-0.1.0/docs/quickstart.md +126 -0
  112. tokenjam_bench-0.1.0/docs/replay.md +97 -0
  113. tokenjam_bench-0.1.0/docs/scenarios.md +71 -0
  114. tokenjam_bench-0.1.0/docs/security.md +105 -0
  115. tokenjam_bench-0.1.0/docs/statistics.md +126 -0
  116. tokenjam_bench-0.1.0/docs/swe-bench-lite.md +73 -0
  117. tokenjam_bench-0.1.0/docs/tests.md +206 -0
  118. tokenjam_bench-0.1.0/docs/tokenjam-integration.md +148 -0
  119. tokenjam_bench-0.1.0/docs/workflows.md +109 -0
  120. tokenjam_bench-0.1.0/pyproject.toml +58 -0
  121. tokenjam_bench-0.1.0/results/.gitkeep +0 -0
  122. tokenjam_bench-0.1.0/run.py +15 -0
  123. tokenjam_bench-0.1.0/scripts/run_multipair_evidence.sh +58 -0
  124. tokenjam_bench-0.1.0/scripts/run_real_benchmarks.sh +58 -0
  125. tokenjam_bench-0.1.0/tests/test_agent_pipeline_offline.py +48 -0
  126. tokenjam_bench-0.1.0/tests/test_agent_runner.py +49 -0
  127. tokenjam_bench-0.1.0/tests/test_agent_validation.py +50 -0
  128. tokenjam_bench-0.1.0/tests/test_ci_benchmark.py +36 -0
  129. tokenjam_bench-0.1.0/tests/test_dashboard.py +82 -0
  130. tokenjam_bench-0.1.0/tests/test_deepseek.py +78 -0
  131. tokenjam_bench-0.1.0/tests/test_history.py +140 -0
  132. tokenjam_bench-0.1.0/tests/test_honesty_guard.py +227 -0
  133. tokenjam_bench-0.1.0/tests/test_judge.py +57 -0
  134. tokenjam_bench-0.1.0/tests/test_matrix.py +88 -0
  135. tokenjam_bench-0.1.0/tests/test_pipeline_offline.py +116 -0
  136. tokenjam_bench-0.1.0/tests/test_real_scenarios.py +69 -0
  137. tokenjam_bench-0.1.0/tests/test_replay.py +115 -0
  138. tokenjam_bench-0.1.0/tests/test_report.py +44 -0
  139. tokenjam_bench-0.1.0/tests/test_report_html.py +41 -0
  140. tokenjam_bench-0.1.0/tests/test_scenario_suites.py +61 -0
  141. tokenjam_bench-0.1.0/tests/test_scoring.py +42 -0
  142. tokenjam_bench-0.1.0/tests/test_stats.py +60 -0
  143. tokenjam_bench-0.1.0/tests/test_swe_bench_lite.py +325 -0
  144. tokenjam_bench-0.1.0/tests/test_version_stamp.py +13 -0
  145. tokenjam_bench-0.1.0/tests/test_workflows.py +101 -0
  146. tokenjam_bench-0.1.0/tjbench/__init__.py +6 -0
  147. tokenjam_bench-0.1.0/tjbench/agent_pipeline.py +117 -0
  148. tokenjam_bench-0.1.0/tjbench/agents/__init__.py +25 -0
  149. tokenjam_bench-0.1.0/tjbench/agents/runner.py +66 -0
  150. tokenjam_bench-0.1.0/tjbench/agents/swe_bench_tools.py +296 -0
  151. tokenjam_bench-0.1.0/tjbench/agents/tools.py +63 -0
  152. tokenjam_bench-0.1.0/tjbench/agents/trace.py +72 -0
  153. tokenjam_bench-0.1.0/tjbench/agents/validation.py +68 -0
  154. tokenjam_bench-0.1.0/tjbench/bench_meta.py +2 -0
  155. tokenjam_bench-0.1.0/tjbench/benchmarks/__init__.py +65 -0
  156. tokenjam_bench-0.1.0/tjbench/benchmarks/agent_base.py +37 -0
  157. tokenjam_bench-0.1.0/tjbench/benchmarks/base.py +37 -0
  158. tokenjam_bench-0.1.0/tjbench/benchmarks/gsm8k.py +48 -0
  159. tokenjam_bench-0.1.0/tjbench/benchmarks/humaneval.py +58 -0
  160. tokenjam_bench-0.1.0/tjbench/benchmarks/judged.py +74 -0
  161. tokenjam_bench-0.1.0/tjbench/benchmarks/real_scenarios.py +144 -0
  162. tokenjam_bench-0.1.0/tjbench/benchmarks/sample_agent.py +107 -0
  163. tokenjam_bench-0.1.0/tjbench/benchmarks/samples.py +73 -0
  164. tokenjam_bench-0.1.0/tjbench/benchmarks/scenario_lib.py +108 -0
  165. tokenjam_bench-0.1.0/tjbench/benchmarks/scenario_suites.py +153 -0
  166. tokenjam_bench-0.1.0/tjbench/benchmarks/scoring.py +63 -0
  167. tokenjam_bench-0.1.0/tjbench/benchmarks/swe_bench_lite.py +288 -0
  168. tokenjam_bench-0.1.0/tjbench/ci_benchmark.py +108 -0
  169. tokenjam_bench-0.1.0/tjbench/cli.py +647 -0
  170. tokenjam_bench-0.1.0/tjbench/cost.py +44 -0
  171. tokenjam_bench-0.1.0/tjbench/dashboard.py +980 -0
  172. tokenjam_bench-0.1.0/tjbench/deepeval_judge.py +137 -0
  173. tokenjam_bench-0.1.0/tjbench/exec_sandbox.py +54 -0
  174. tokenjam_bench-0.1.0/tjbench/history.py +290 -0
  175. tokenjam_bench-0.1.0/tjbench/judge.py +116 -0
  176. tokenjam_bench-0.1.0/tjbench/matrix.py +170 -0
  177. tokenjam_bench-0.1.0/tjbench/models/__init__.py +7 -0
  178. tokenjam_bench-0.1.0/tjbench/models/anthropic_agent_client.py +114 -0
  179. tokenjam_bench-0.1.0/tjbench/models/anthropic_client.py +52 -0
  180. tokenjam_bench-0.1.0/tjbench/models/base.py +30 -0
  181. tokenjam_bench-0.1.0/tjbench/models/google_client.py +41 -0
  182. tokenjam_bench-0.1.0/tjbench/models/mock_agent_client.py +129 -0
  183. tokenjam_bench-0.1.0/tjbench/models/mock_client.py +73 -0
  184. tokenjam_bench-0.1.0/tjbench/models/openai_client.py +42 -0
  185. tokenjam_bench-0.1.0/tjbench/models/openai_compatible.py +208 -0
  186. tokenjam_bench-0.1.0/tjbench/models/registry.py +50 -0
  187. tokenjam_bench-0.1.0/tjbench/models/tool_calling.py +51 -0
  188. tokenjam_bench-0.1.0/tjbench/pipeline.py +218 -0
  189. tokenjam_bench-0.1.0/tjbench/recommend.py +28 -0
  190. tokenjam_bench-0.1.0/tjbench/replay.py +139 -0
  191. tokenjam_bench-0.1.0/tjbench/replay_pipeline.py +151 -0
  192. tokenjam_bench-0.1.0/tjbench/report.py +172 -0
  193. tokenjam_bench-0.1.0/tjbench/report_html.py +322 -0
  194. tokenjam_bench-0.1.0/tjbench/stats.py +96 -0
  195. tokenjam_bench-0.1.0/tjbench/version.py +43 -0
  196. tokenjam_bench-0.1.0/tjbench/workflows/__init__.py +151 -0
  197. tokenjam_bench-0.1.0/tjbench/workflows/agentic.py +119 -0
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: Bug report
3
+ about: Something is wrong, broken, or a number looks off
4
+ title: ""
5
+ labels: bug
6
+ assignees: ""
7
+ ---
8
+
9
+ **What happened**
10
+ A clear description of the bug.
11
+
12
+ **Expected**
13
+ What you expected instead.
14
+
15
+ **Reproduce**
16
+ Steps or the exact command, e.g. `tjb run --benchmark humaneval --limit 50`.
17
+
18
+ **Evidence (for a wrong/surprising number)**
19
+ The artifact filename under `docs/evidence/` or `results/`, or paste the
20
+ headline line, so it's reproducible.
21
+
22
+ **Environment**
23
+ - OS:
24
+ - Python (`python --version`): 3.10 / 3.11 / 3.12
25
+ - `tjb version` (tokenjam build):
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Propose a new benchmark, provider, or capability
4
+ title: ""
5
+ labels: enhancement
6
+ assignees: ""
7
+ ---
8
+
9
+ **Problem**
10
+ What can't you measure or prove today?
11
+
12
+ **Proposal**
13
+ What you'd like added — a benchmark, model client, statistic, or dashboard view.
14
+
15
+ **Honesty check**
16
+ How would the result stay measured and hedged (real ground truth, CI + p-value,
17
+ no placeholder pricing, no overclaim)? See CONTRIBUTING.md.
18
+
19
+ **Alternatives**
20
+ Anything you considered or worked around.
@@ -0,0 +1,21 @@
1
+ <!-- Thanks for contributing to TokenJam Bench. -->
2
+
3
+ ## What & why
4
+
5
+ <!-- What does this change do, and why? Link any related issue. -->
6
+
7
+ ## Checklist
8
+
9
+ - [ ] `ruff check .` passes
10
+ - [ ] `pytest` passes locally (includes the honesty guard)
11
+ - [ ] Docs updated if behavior or CLI changed
12
+ - [ ] No placeholder-priced run is surfaced as headline/dashboard evidence
13
+ (`priced_with_defaults=true` stays under `docs/evidence/archive/`)
14
+ - [ ] No banned overclaim strings added (use Wilson CI + McNemar p + the hedged
15
+ verdicts, never "quality preserved" / a single `confidence = NN%` scalar /
16
+ ROI extrapolation)
17
+
18
+ ## Evidence (if numbers changed)
19
+
20
+ <!-- Paste the relevant artifact filename(s) under docs/evidence/ or results/,
21
+ or the headline line, so reviewers can reproduce. -->
@@ -0,0 +1,56 @@
1
+ name: benchmark
2
+
3
+ # Continuous benchmarking against the LATEST TokenJam release.
4
+ # - nightly cron picks up new TokenJam releases automatically (pip install -U)
5
+ # - manual dispatch for on-demand runs
6
+ on:
7
+ schedule:
8
+ - cron: "0 6 * * *" # 06:00 UTC nightly
9
+ workflow_dispatch:
10
+ inputs:
11
+ live:
12
+ description: "Run LIVE benchmarks (needs the DEEPSEEK_API_KEY secret)"
13
+ type: boolean
14
+ default: true
15
+ limit:
16
+ description: "Task limit for the live HumanEval run"
17
+ type: string
18
+ default: "10"
19
+
20
+ permissions:
21
+ contents: read
22
+
23
+ jobs:
24
+ benchmark:
25
+ runs-on: ubuntu-latest
26
+ env:
27
+ # Key-gated: live proofs only run if this secret is configured on the repo.
28
+ DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
29
+ TJBENCH_JUDGE: deepseek
30
+ TJBENCH_JUDGE_METRIC: correctness
31
+ HF_HUB_DISABLE_PROGRESS_BARS: "1"
32
+ DEEPEVAL_TELEMETRY_OPT_OUT: "YES"
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+ - uses: actions/setup-python@v5
36
+ with:
37
+ python-version: "3.11"
38
+
39
+ - name: Install latest TokenJam + bench extras
40
+ run: |
41
+ pip install -U tokenjam
42
+ pip install -e ".[dev,providers,datasets,judge]"
43
+
44
+ - name: Show the TokenJam version under test
45
+ run: python -c "import importlib.metadata as m; print('tokenjam', m.version('tokenjam'))"
46
+
47
+ - name: Run benchmarks (offline always; live if the key secret is set)
48
+ run: python -m tjbench.ci_benchmark
49
+
50
+ - name: Upload version-stamped artifacts
51
+ if: always()
52
+ uses: actions/upload-artifact@v4
53
+ with:
54
+ name: benchmark-results
55
+ path: results/
56
+ if-no-files-found: warn
@@ -0,0 +1,35 @@
1
+ name: ci
2
+
3
+ # Always-on gate: lint + tests + an offline proof, across supported Pythons.
4
+ # No keys, no spend. (Branch-protection / required-review rules are a GitHub-UI
5
+ # setting configured at the public flip — not expressible here.)
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ pull_request:
10
+ branches: [main]
11
+
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ python-version: ["3.10", "3.11", "3.12"]
19
+ name: test (py${{ matrix.python-version }})
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+ - name: Install (consumes tokenjam exactly like an external user)
26
+ run: pip install -e ".[dev]"
27
+ - name: Lint
28
+ run: ruff check .
29
+ # `pytest -q` runs the full offline suite, which INCLUDES the Brief C
30
+ # honesty guard (tests/test_honesty_guard.py): no placeholder-priced
31
+ # headlines, no banned overclaim strings in README/docs/dashboard.
32
+ - name: Tests (all offline / mock, incl. honesty guard)
33
+ run: pytest -q
34
+ - name: Offline proof smoke (pipeline produces a valid stamped proof)
35
+ run: python -m tjbench.ci_benchmark
@@ -0,0 +1,34 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ # Trusted Publishing (OIDC) — no API token. The PyPI project must have a
8
+ # matching publisher configured: owner Metabuilder-Labs, repo tokenjam-bench,
9
+ # workflow publish-pypi.yml, environment pypi.
10
+ jobs:
11
+ publish-pypi:
12
+ if: startsWith(github.ref_name, 'v')
13
+ runs-on: ubuntu-latest
14
+ permissions:
15
+ id-token: write # required for PyPI Trusted Publishing (OIDC)
+ contents: read # listing any scope sets the others to none; checkout needs read
16
+ environment: pypi
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+
24
+ - name: Install build + dev deps
25
+ run: pip install build ".[dev]"
26
+
27
+ - name: Run tests (offline)
28
+ run: pytest -q
29
+
30
+ - name: Build sdist + wheel
31
+ run: python -m build
32
+
33
+ - name: Publish to PyPI
34
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,32 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .pytest_cache/
4
+ .ruff_cache/
5
+ *.egg-info/
6
+ .venv/
7
+ # Proof artifacts are generated; keep the dir, not the runs.
8
+ results/*.json
9
+ results/*.html
10
+ results/*.duckdb
11
+ results/*.duckdb.wal
12
+ !results/.gitkeep
13
+
14
+ # `tjb serve` writes a local history index next to whatever dir it serves
15
+ # (e.g. docs/evidence/…); never commit that byproduct.
16
+ history.duckdb
17
+ history.duckdb.wal
18
+
19
+ # DeepEval local cache/config (created on a live judge run)
20
+ .deepeval/
21
+ .deepeval-cache.json
22
+
23
+ # Secrets — never commit. Live provider keys land here for real benchmark runs.
24
+ .env
25
+ .env.local
26
+ .env.*.local
27
+
28
+ # Local agent scratch (briefs, notes) — not part of the published repo.
29
+ .claude/
30
+
31
+ # macOS
32
+ .DS_Store
@@ -0,0 +1,81 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2025-06-24)
4
+
5
+ ### Added
6
+
7
+ #### Agent Evaluation Framework
8
+ - **AgentRunner** — Multi-turn agent execution loop with max-turns guard
9
+ - **ToolRegistry** — Register and execute tools with JSON-schema advertisement
10
+ - **AgentTrace** — Observable record of every turn, tool call, and result
11
+ - **Safety Gate** — `validate_tools()` catches forbidden tools even with correct answers
12
+ - **ToolValidation** — Reports expected tools, ordering, safety, and error rate
13
+
14
+ #### Agent Benchmarks
15
+ - **sample-agent** — 3 offline tool-use tasks with safety gate validation
16
+ - **swe-bench-lite** — experimental scaffold (dataset loader + developer tools); fix-verification is NOT implemented, so it does not produce a pass-rate
17
+
18
+ #### SWE-Bench Tools
19
+ - **view** — Read file contents with line numbers
20
+ - **view_range** — Read specific line range
21
+ - **str_replace** — Exact-match string replacement (must match exactly once)
22
+ - **create** — Create new file
23
+ - **insert** — Insert text after specific line
24
+ - **bash** — Run shell commands with timeout
25
+ - **Path traversal blocking** — Prevents escaping workspace
26
+ - **Exact-match enforcement** — Prevents accidental mass-replace
27
+
28
+ #### Model Clients
29
+ - **AnthropicAgentClient** — Live tool-calling client for Anthropic
30
+ - **MockAgentClient** — Deterministic offline tool-calling client
31
+ - **ToolCallingClient** protocol — Multi-turn chat with tool use
32
+
33
+ #### Pipelines
34
+ - **Agent Proof Pipeline** — `run_agent_proof()` for multi-turn agent evaluation
35
+ - **Token summation** — Aggregates token usage across all turns for pricing
36
+ - **Tool validation scoring** — Safety gate + ordering + expected tools
37
+
38
+ #### Documentation (16 files)
39
+ - docs/README.md — Master documentation index
40
+ - docs/overview.md — Project overview and design principles
41
+ - docs/architecture.md — System design, data flow, module relationships
42
+ - docs/quickstart.md — 5-minute quickstart guide
43
+ - docs/cli-reference.md — Complete `tjb` command reference
44
+ - docs/pipelines.md — Single-shot and agent proof pipeline deep dive
45
+ - docs/models.md — Model client adapters and protocols
46
+ - docs/benchmarks.md — Available benchmarks and scoring
47
+ - docs/agents.md — Multi-turn agent execution framework
48
+ - docs/statistics.md — Statistical methods used for proof
49
+ - docs/cost-pricing.md — How costs are computed
50
+ - docs/tokenjam-integration.md — How we consume TokenJam
51
+ - docs/development.md — Contributing, testing, extending
52
+ - docs/api-reference.md — Module-level API documentation
53
+ - docs/swe-bench-lite.md — SWE-Bench Lite integration guide
54
+ - docs/tests.md — Complete test suite inventory
55
+
56
+ #### Tests
57
+ - 55 total tests (20 new for SWE-Bench Lite)
58
+ - Mock scoring tests for SWE-Bench Lite
59
+ - Tool operation tests (view, replace, create, insert, bash)
60
+ - Path traversal safety tests
61
+ - Patch parsing tests
62
+
63
+ ### Design Principles
64
+
65
+ - **Black-box consumer** of TokenJam — imports as pip dependency, never vendored
66
+ - **Offline-first** — All tests run without API keys using mock clients
67
+ - **Objective ground truth** — Code execution and exact-match scoring, not LLM-as-judge
68
+ - **Statistical honesty** — Wilson CIs, McNemar exact tests, never claim significance on small samples
69
+ - **Safety-first** — Agent benchmarks include safety gate for dangerous tool calls
70
+
71
+ ### Integration Points
72
+
73
+ | Feature | TokenJam API | Module |
74
+ |---------|-------------|--------|
75
+ | Candidate recommendation | `tokenjam.core.optimize.DOWNGRADE_CANDIDATES` | `recommend.py` |
76
+ | Cost pricing | `tokenjam.core.pricing.get_rates` | `cost.py` |
77
+ | Version stamp | `importlib.metadata.version("tokenjam")` | `version.py` |
78
+
79
+ ---
80
+
81
+ *This changelog documents the initial release of tokenjam-bench as an agent evaluation framework.*
@@ -0,0 +1,63 @@
1
+ # Contributing to TokenJam Bench
2
+
3
+ Thanks for helping. TokenJam Bench is honesty-branded: its credibility is its
4
+ evidence. The bar for a change is that every number it shows traces to a real,
5
+ reproducible measurement.
6
+
7
+ ## Setup
8
+
9
+ ```bash
10
+ pip install -e ".[dev]"
11
+ pytest # full offline suite — no keys, no spend
12
+ ruff check . # lint
13
+ ```
14
+
15
+ Supported Python: 3.10, 3.11, 3.12. CI runs the suite on all three.
16
+
17
+ Run the app to see your change:
18
+
19
+ ```bash
20
+ tjb run # zero-flag offline proof
21
+ tjb serve # dashboard over the bundled real evidence
22
+ ```
23
+
24
+ ## The honesty rules (enforced in CI)
25
+
26
+ `tests/test_honesty_guard.py` fails CI if a change reintroduces a dishonest
27
+ surface. Before you open a PR:
28
+
29
+ - No headline/dashboard number may come from a placeholder-priced run
30
+ (`priced_with_defaults=true`). Re-run with real rates, or keep legacy runs
31
+ under `docs/evidence/archive/` (non-headline).
32
+ - No banned overclaim strings in README, docs, or the dashboard: no
33
+ "quality preserved", no "safe to replace", no single `confidence = NN%`
34
+ scalar, no ROI extrapolation ("at 10x", "annual savings"). The honest forms —
35
+ Wilson CI, McNemar p-value, and the three hedged verdicts
36
+ (`no_significant_regression` / `significant_regression` /
37
+ `insufficient_evidence`) — are what to use instead.
38
+ - Accuracy is the pass-rate on a named suite. It is never a general quality
39
+ claim.
40
+
41
+ ## How it's built
42
+
43
+ - **Offline-first.** Tests, lint, and the default `tjb run` work with no
44
+ provider keys and no network. Live providers are opt-in via a key in the env.
45
+ - **Flat layout.** Top-level modules and subpackages live under `tjbench/`.
46
+ - **TokenJam is a published dependency**, consumed like any external user — never
47
+ vendored. Every artifact is stamped with the exact `tokenjam_version`.
48
+
49
+ See [docs/development.md](docs/development.md) for adding a benchmark or model
50
+ client, and [docs/architecture.md](docs/architecture.md) for the data flow.
51
+
52
+ ## Pull requests
53
+
54
+ 1. Branch off `main`.
55
+ 2. Keep the change focused; update docs when behavior changes.
56
+ 3. Make sure `ruff check .` and `pytest` pass locally (the honesty guard runs
57
+ inside `pytest`).
58
+ 4. Fill out the PR template. CI must be green before review.
59
+
60
+ ## Reporting issues
61
+
62
+ Use the issue templates. For a wrong or surprising number, include the artifact
63
+ JSON (or its filename under `docs/evidence/` or `results/`) so it's reproducible.
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Metabuilder Labs
4
+ Copyright (c) 2026 Hooman Digital
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1,37 @@
1
+ # Every target in this file is a command, not a file, so all of them are phony.
+ .PHONY: install update-tokenjam test lint bench-smoke version serve dashboard ci-bench
2
+
3
+ # Use the python where the deps (tokenjam, click, rich) are installed.
4
+ PY ?= python3
5
+
6
+ # Install the bench (editable) + dev tooling. pip is invoked through $(PY) so
+ # the packages land in the same interpreter the other targets run with.
7
+ install:
8
+ $(PY) -m pip install -e ".[dev]"
9
+
10
+ # THE daily-pull command: upgrade to the latest published TokenJam and show the
11
+ # version every subsequent proof will be stamped with. pip is invoked through
+ # $(PY) so the upgrade lands in the same interpreter the version check reads.
12
+ update-tokenjam:
13
+ $(PY) -m pip install -U tokenjam
14
+ @$(PY) -c "import importlib.metadata as m; print('tokenjam now at', m.version('tokenjam'))"
15
+
16
+ # All commands go through run.py so they work without relying on the installed
17
+ # console script (flat layout → `cli` collides with the `cli` PyPI package).
18
+ version:
19
+ $(PY) run.py version
20
+
21
+ # Live proof dashboard (offline, auto-refreshing) at http://127.0.0.1:7392/
22
+ serve dashboard:
23
+ $(PY) run.py serve --open
24
+
25
+ # Offline end-to-end smoke (no keys, no spend).
26
+ bench-smoke:
27
+ $(PY) run.py run --benchmark samples --original anthropic:claude-opus-4-7 --mock
28
+
29
+ test:
30
+ pytest -q
31
+
32
+ lint:
33
+ ruff check .
34
+
35
+ # Continuous-benchmark set: offline always; live if a provider key is exported.
36
+ ci-bench:
37
+ $(PY) -m tjbench.ci_benchmark
@@ -0,0 +1,7 @@
1
+ tokenjam-bench
2
+ Copyright (c) 2026 Metabuilder Labs
3
+
4
+ This project was originally created by Piyush (Hooman Digital) and is maintained
5
+ by Metabuilder Labs as the official benchmark for TokenJam
6
+ (https://github.com/Metabuilder-Labs/tokenjam). Licensed under the MIT License;
7
+ the original Hooman Digital copyright is retained in LICENSE.