tokenjam-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokenjam_bench-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
- tokenjam_bench-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- tokenjam_bench-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +21 -0
- tokenjam_bench-0.1.0/.github/workflows/benchmark.yml +56 -0
- tokenjam_bench-0.1.0/.github/workflows/ci.yml +35 -0
- tokenjam_bench-0.1.0/.github/workflows/publish-pypi.yml +34 -0
- tokenjam_bench-0.1.0/.gitignore +32 -0
- tokenjam_bench-0.1.0/Assets/ChatGPT Image Jun 25, 2026, 05_19_19 PM.png +0 -0
- tokenjam_bench-0.1.0/Assets/ChatGPT Image Jun 25, 2026, 05_31_49 PM.png +0 -0
- tokenjam_bench-0.1.0/Assets/logo_mark.png +0 -0
- tokenjam_bench-0.1.0/CHANGELOG.md +81 -0
- tokenjam_bench-0.1.0/CONTRIBUTING.md +63 -0
- tokenjam_bench-0.1.0/LICENSE +22 -0
- tokenjam_bench-0.1.0/Makefile +37 -0
- tokenjam_bench-0.1.0/NOTICE +7 -0
- tokenjam_bench-0.1.0/PKG-INFO +410 -0
- tokenjam_bench-0.1.0/README.md +385 -0
- tokenjam_bench-0.1.0/datasets/customer_support/README.md +51 -0
- tokenjam_bench-0.1.0/datasets/customer_support/tickets.json +167 -0
- tokenjam_bench-0.1.0/datasets/email/tasks.json +115 -0
- tokenjam_bench-0.1.0/datasets/rag/qa.json +127 -0
- tokenjam_bench-0.1.0/datasets/research/tasks.json +148 -0
- tokenjam_bench-0.1.0/demo/seed_demo.py +350 -0
- tokenjam_bench-0.1.0/docs/README.md +46 -0
- tokenjam_bench-0.1.0/docs/agents.md +289 -0
- tokenjam_bench-0.1.0/docs/analytics.md +60 -0
- tokenjam_bench-0.1.0/docs/api-reference.md +468 -0
- tokenjam_bench-0.1.0/docs/architecture.md +221 -0
- tokenjam_bench-0.1.0/docs/benchmarks.md +222 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-banner.png +0 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-banner.svg +29 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-icon.png +0 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench-icon.svg +22 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench.png +0 -0
- tokenjam_bench-0.1.0/docs/brand/tokenjam-bench.svg +24 -0
- tokenjam_bench-0.1.0/docs/ci.md +61 -0
- tokenjam_bench-0.1.0/docs/cli-reference.md +233 -0
- tokenjam_bench-0.1.0/docs/cost-pricing.md +93 -0
- tokenjam_bench-0.1.0/docs/development.md +97 -0
- tokenjam_bench-0.1.0/docs/evidence/2026-06-26-deepseek-live.md +97 -0
- tokenjam_bench-0.1.0/docs/evidence/2026-06-26-real-workflow-dashboard.md +48 -0
- tokenjam_bench-0.1.0/docs/evidence/README.md +67 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_customer-support_tj0.5.1_1782480389.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_customer-support_tj0.5.1_1782480389.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_email-assistant_tj0.5.1_1782480651.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_email-assistant_tj0.5.1_1782480651.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_enterprise-rag_tj0.5.1_1782480507.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_enterprise-rag_tj0.5.1_1782480507.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_gsm8k_tj0.5.1_1782480094.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_gsm8k_tj0.5.1_1782480094.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_humaneval_tj0.5.1_1782480162.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_humaneval_tj0.5.1_1782480162.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_judged_tj0.5.1_1782480215.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_judged_tj0.5.1_1782480215.json +111 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_research-assistant_tj0.5.1_1782480799.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/2026-06-26-real-dashboard/tjbench_research-assistant_tj0.5.1_1782480799.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/README.md +21 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_gsm8k_tj0.5.1_1782443915.html +45 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_gsm8k_tj0.5.1_1782443915.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_humaneval_tj0.5.1_1782443963.html +45 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_humaneval_tj0.5.1_1782443963.json +171 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_judged_tj0.5.1_1782444012.html +45 -0
- tokenjam_bench-0.1.0/docs/evidence/archive/tjbench_judged_tj0.5.1_1782444012.json +111 -0
- tokenjam_bench-0.1.0/docs/evidence/humaneval_deepseek_reasoner_to_chat.html +45 -0
- tokenjam_bench-0.1.0/docs/evidence/humaneval_deepseek_reasoner_to_chat.json +291 -0
- tokenjam_bench-0.1.0/docs/evidence/judged_deepseek_correctness.html +45 -0
- tokenjam_bench-0.1.0/docs/evidence/judged_deepseek_correctness.json +111 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/README.md +144 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506002.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506002.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506688.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_customer-support_tj0.5.2_1782506688.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506301.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506301.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506913.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_email-assistant_tj0.5.2_1782506913.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506164.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506164.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506788.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_enterprise-rag_tj0.5.2_1782506788.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782503878.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782503878.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504203.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504203.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504371.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504371.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504768.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_gsm8k_tj0.5.2_1782504768.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504077.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504077.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504400.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504400.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504580.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782504580.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782505333.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_humaneval_tj0.5.2_1782505333.json +651 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782505847.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782505847.json +111 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782506545.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_judged_tj0.5.2_1782506545.json +111 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782506498.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782506498.json +195 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782507084.html +57 -0
- tokenjam_bench-0.1.0/docs/evidence/live/2026-06-26-multipair/tjbench_research-assistant_tj0.5.2_1782507084.json +195 -0
- tokenjam_bench-0.1.0/docs/faq.md +139 -0
- tokenjam_bench-0.1.0/docs/history.md +72 -0
- tokenjam_bench-0.1.0/docs/models.md +190 -0
- tokenjam_bench-0.1.0/docs/overview.md +70 -0
- tokenjam_bench-0.1.0/docs/pipelines.md +209 -0
- tokenjam_bench-0.1.0/docs/proof-runbook.md +138 -0
- tokenjam_bench-0.1.0/docs/quickstart.md +126 -0
- tokenjam_bench-0.1.0/docs/replay.md +97 -0
- tokenjam_bench-0.1.0/docs/scenarios.md +71 -0
- tokenjam_bench-0.1.0/docs/security.md +105 -0
- tokenjam_bench-0.1.0/docs/statistics.md +126 -0
- tokenjam_bench-0.1.0/docs/swe-bench-lite.md +73 -0
- tokenjam_bench-0.1.0/docs/tests.md +206 -0
- tokenjam_bench-0.1.0/docs/tokenjam-integration.md +148 -0
- tokenjam_bench-0.1.0/docs/workflows.md +109 -0
- tokenjam_bench-0.1.0/pyproject.toml +58 -0
- tokenjam_bench-0.1.0/results/.gitkeep +0 -0
- tokenjam_bench-0.1.0/run.py +15 -0
- tokenjam_bench-0.1.0/scripts/run_multipair_evidence.sh +58 -0
- tokenjam_bench-0.1.0/scripts/run_real_benchmarks.sh +58 -0
- tokenjam_bench-0.1.0/tests/test_agent_pipeline_offline.py +48 -0
- tokenjam_bench-0.1.0/tests/test_agent_runner.py +49 -0
- tokenjam_bench-0.1.0/tests/test_agent_validation.py +50 -0
- tokenjam_bench-0.1.0/tests/test_ci_benchmark.py +36 -0
- tokenjam_bench-0.1.0/tests/test_dashboard.py +82 -0
- tokenjam_bench-0.1.0/tests/test_deepseek.py +78 -0
- tokenjam_bench-0.1.0/tests/test_history.py +140 -0
- tokenjam_bench-0.1.0/tests/test_honesty_guard.py +227 -0
- tokenjam_bench-0.1.0/tests/test_judge.py +57 -0
- tokenjam_bench-0.1.0/tests/test_matrix.py +88 -0
- tokenjam_bench-0.1.0/tests/test_pipeline_offline.py +116 -0
- tokenjam_bench-0.1.0/tests/test_real_scenarios.py +69 -0
- tokenjam_bench-0.1.0/tests/test_replay.py +115 -0
- tokenjam_bench-0.1.0/tests/test_report.py +44 -0
- tokenjam_bench-0.1.0/tests/test_report_html.py +41 -0
- tokenjam_bench-0.1.0/tests/test_scenario_suites.py +61 -0
- tokenjam_bench-0.1.0/tests/test_scoring.py +42 -0
- tokenjam_bench-0.1.0/tests/test_stats.py +60 -0
- tokenjam_bench-0.1.0/tests/test_swe_bench_lite.py +325 -0
- tokenjam_bench-0.1.0/tests/test_version_stamp.py +13 -0
- tokenjam_bench-0.1.0/tests/test_workflows.py +101 -0
- tokenjam_bench-0.1.0/tjbench/__init__.py +6 -0
- tokenjam_bench-0.1.0/tjbench/agent_pipeline.py +117 -0
- tokenjam_bench-0.1.0/tjbench/agents/__init__.py +25 -0
- tokenjam_bench-0.1.0/tjbench/agents/runner.py +66 -0
- tokenjam_bench-0.1.0/tjbench/agents/swe_bench_tools.py +296 -0
- tokenjam_bench-0.1.0/tjbench/agents/tools.py +63 -0
- tokenjam_bench-0.1.0/tjbench/agents/trace.py +72 -0
- tokenjam_bench-0.1.0/tjbench/agents/validation.py +68 -0
- tokenjam_bench-0.1.0/tjbench/bench_meta.py +2 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/__init__.py +65 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/agent_base.py +37 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/base.py +37 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/gsm8k.py +48 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/humaneval.py +58 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/judged.py +74 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/real_scenarios.py +144 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/sample_agent.py +107 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/samples.py +73 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/scenario_lib.py +108 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/scenario_suites.py +153 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/scoring.py +63 -0
- tokenjam_bench-0.1.0/tjbench/benchmarks/swe_bench_lite.py +288 -0
- tokenjam_bench-0.1.0/tjbench/ci_benchmark.py +108 -0
- tokenjam_bench-0.1.0/tjbench/cli.py +647 -0
- tokenjam_bench-0.1.0/tjbench/cost.py +44 -0
- tokenjam_bench-0.1.0/tjbench/dashboard.py +980 -0
- tokenjam_bench-0.1.0/tjbench/deepeval_judge.py +137 -0
- tokenjam_bench-0.1.0/tjbench/exec_sandbox.py +54 -0
- tokenjam_bench-0.1.0/tjbench/history.py +290 -0
- tokenjam_bench-0.1.0/tjbench/judge.py +116 -0
- tokenjam_bench-0.1.0/tjbench/matrix.py +170 -0
- tokenjam_bench-0.1.0/tjbench/models/__init__.py +7 -0
- tokenjam_bench-0.1.0/tjbench/models/anthropic_agent_client.py +114 -0
- tokenjam_bench-0.1.0/tjbench/models/anthropic_client.py +52 -0
- tokenjam_bench-0.1.0/tjbench/models/base.py +30 -0
- tokenjam_bench-0.1.0/tjbench/models/google_client.py +41 -0
- tokenjam_bench-0.1.0/tjbench/models/mock_agent_client.py +129 -0
- tokenjam_bench-0.1.0/tjbench/models/mock_client.py +73 -0
- tokenjam_bench-0.1.0/tjbench/models/openai_client.py +42 -0
- tokenjam_bench-0.1.0/tjbench/models/openai_compatible.py +208 -0
- tokenjam_bench-0.1.0/tjbench/models/registry.py +50 -0
- tokenjam_bench-0.1.0/tjbench/models/tool_calling.py +51 -0
- tokenjam_bench-0.1.0/tjbench/pipeline.py +218 -0
- tokenjam_bench-0.1.0/tjbench/recommend.py +28 -0
- tokenjam_bench-0.1.0/tjbench/replay.py +139 -0
- tokenjam_bench-0.1.0/tjbench/replay_pipeline.py +151 -0
- tokenjam_bench-0.1.0/tjbench/report.py +172 -0
- tokenjam_bench-0.1.0/tjbench/report_html.py +322 -0
- tokenjam_bench-0.1.0/tjbench/stats.py +96 -0
- tokenjam_bench-0.1.0/tjbench/version.py +43 -0
- tokenjam_bench-0.1.0/tjbench/workflows/__init__.py +151 -0
- tokenjam_bench-0.1.0/tjbench/workflows/agentic.py +119 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Something is wrong, broken, or a number looks off
|
|
4
|
+
title: ""
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ""
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
**What happened**
|
|
10
|
+
A clear description of the bug.
|
|
11
|
+
|
|
12
|
+
**Expected**
|
|
13
|
+
What you expected instead.
|
|
14
|
+
|
|
15
|
+
**Reproduce**
|
|
16
|
+
Steps or the exact command, e.g. `tjb run --benchmark humaneval --limit 50`.
|
|
17
|
+
|
|
18
|
+
**Evidence (for a wrong/surprising number)**
|
|
19
|
+
The artifact filename under `docs/evidence/` or `results/`, or paste the
|
|
20
|
+
headline line, so it's reproducible.
|
|
21
|
+
|
|
22
|
+
**Environment**
|
|
23
|
+
- OS:
|
|
24
|
+
- Python (`python --version`): 3.10 / 3.11 / 3.12
|
|
25
|
+
- `tjb version` (tokenjam build):
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Propose a new benchmark, provider, or capability
|
|
4
|
+
title: ""
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ""
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
**Problem**
|
|
10
|
+
What can't you measure or prove today?
|
|
11
|
+
|
|
12
|
+
**Proposal**
|
|
13
|
+
What you'd like added — a benchmark, model client, statistic, or dashboard view.
|
|
14
|
+
|
|
15
|
+
**Honesty check**
|
|
16
|
+
How would the result stay measured and hedged (real ground truth, CI + p-value,
|
|
17
|
+
no placeholder pricing, no overclaim)? See CONTRIBUTING.md.
|
|
18
|
+
|
|
19
|
+
**Alternatives**
|
|
20
|
+
Anything you considered or worked around.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
<!-- Thanks for contributing to TokenJam Bench. -->
|
|
2
|
+
|
|
3
|
+
## What & why
|
|
4
|
+
|
|
5
|
+
<!-- What does this change do, and why? Link any related issue. -->
|
|
6
|
+
|
|
7
|
+
## Checklist
|
|
8
|
+
|
|
9
|
+
- [ ] `ruff check .` passes
|
|
10
|
+
- [ ] `pytest` passes locally (includes the honesty guard)
|
|
11
|
+
- [ ] Docs updated if behavior or CLI changed
|
|
12
|
+
- [ ] No placeholder-priced run is surfaced as headline/dashboard evidence
|
|
13
|
+
(`priced_with_defaults=true` stays under `docs/evidence/archive/`)
|
|
14
|
+
- [ ] No banned overclaim strings added (use Wilson CI + McNemar p + the hedged
|
|
15
|
+
verdicts, never "quality preserved" / a single `confidence = NN%` scalar /
|
|
16
|
+
ROI extrapolation)
|
|
17
|
+
|
|
18
|
+
## Evidence (if numbers changed)
|
|
19
|
+
|
|
20
|
+
<!-- Paste the relevant artifact filename(s) under docs/evidence/ or results/,
|
|
21
|
+
or the headline line, so reviewers can reproduce. -->
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
name: benchmark
|
|
2
|
+
|
|
3
|
+
# Continuous benchmarking against the LATEST TokenJam release.
|
|
4
|
+
# - nightly cron picks up new TokenJam releases automatically (pip install -U)
|
|
5
|
+
# - manual dispatch for on-demand runs
|
|
6
|
+
on:
|
|
7
|
+
schedule:
|
|
8
|
+
- cron: "0 6 * * *" # 06:00 UTC nightly
|
|
9
|
+
workflow_dispatch:
|
|
10
|
+
inputs:
|
|
11
|
+
live:
|
|
12
|
+
description: "Run LIVE benchmarks (needs the DEEPSEEK_API_KEY secret)"
|
|
13
|
+
type: boolean
|
|
14
|
+
default: true
|
|
15
|
+
limit:
|
|
16
|
+
description: "Task limit for the live HumanEval run"
|
|
17
|
+
type: string
|
|
18
|
+
default: "10"
|
|
19
|
+
|
|
20
|
+
permissions:
|
|
21
|
+
contents: read
|
|
22
|
+
|
|
23
|
+
jobs:
|
|
24
|
+
benchmark:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
env:
|
|
27
|
+
# Key-gated: live proofs only run if this secret is configured on the repo.
|
|
28
|
+
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
|
|
29
|
+
TJBENCH_JUDGE: deepseek
|
|
30
|
+
TJBENCH_JUDGE_METRIC: correctness
|
|
31
|
+
HF_HUB_DISABLE_PROGRESS_BARS: "1"
|
|
32
|
+
DEEPEVAL_TELEMETRY_OPT_OUT: "YES"
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/checkout@v4
|
|
35
|
+
- uses: actions/setup-python@v5
|
|
36
|
+
with:
|
|
37
|
+
python-version: "3.11"
|
|
38
|
+
|
|
39
|
+
- name: Install latest TokenJam + bench extras
|
|
40
|
+
run: |
|
|
41
|
+
pip install -U tokenjam
|
|
42
|
+
pip install -e ".[dev,providers,datasets,judge]"
|
|
43
|
+
|
|
44
|
+
- name: Show the TokenJam version under test
|
|
45
|
+
run: python -c "import importlib.metadata as m; print('tokenjam', m.version('tokenjam'))"
|
|
46
|
+
|
|
47
|
+
- name: Run benchmarks (offline always; live if the key secret is set)
|
|
48
|
+
run: python -m tjbench.ci_benchmark
|
|
49
|
+
|
|
50
|
+
- name: Upload version-stamped artifacts
|
|
51
|
+
if: always()
|
|
52
|
+
uses: actions/upload-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: benchmark-results
|
|
55
|
+
path: results/
|
|
56
|
+
if-no-files-found: warn
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
# Always-on gate: lint + tests + an offline proof, across supported Pythons.
|
|
4
|
+
# No keys, no spend. (Branch-protection / required-review rules are a GitHub-UI
|
|
5
|
+
# setting configured at the public flip — not expressible here.)
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
branches: [main]
|
|
9
|
+
pull_request:
|
|
10
|
+
branches: [main]
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
strategy:
|
|
16
|
+
fail-fast: false
|
|
17
|
+
matrix:
|
|
18
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
19
|
+
name: test (py${{ matrix.python-version }})
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: ${{ matrix.python-version }}
|
|
25
|
+
- name: Install (consumes tokenjam exactly like an external user)
|
|
26
|
+
run: pip install -e ".[dev]"
|
|
27
|
+
- name: Lint
|
|
28
|
+
run: ruff check .
|
|
29
|
+
# `pytest -q` runs the full offline suite, which INCLUDES the Brief C
|
|
30
|
+
# honesty guard (tests/test_honesty_guard.py): no placeholder-priced
|
|
31
|
+
# headlines, no banned overclaim strings in README/docs/dashboard.
|
|
32
|
+
- name: Tests (all offline / mock, incl. honesty guard)
|
|
33
|
+
run: pytest -q
|
|
34
|
+
- name: Offline proof smoke (pipeline produces a valid stamped proof)
|
|
35
|
+
run: python -m tjbench.ci_benchmark
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
# Trusted Publishing (OIDC) — no API token. The PyPI project must have a
|
|
8
|
+
# matching publisher configured: owner Metabuilder-Labs, repo tokenjam-bench,
|
|
9
|
+
# workflow publish-pypi.yml, environment pypi.
|
|
10
|
+
jobs:
|
|
11
|
+
publish-pypi:
|
|
12
|
+
if: startsWith(github.ref_name, 'v')
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
permissions:
|
|
15
|
+
id-token: write
|
|
16
|
+
environment: pypi
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
|
|
24
|
+
- name: Install build + dev deps
|
|
25
|
+
run: pip install build ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Run tests (offline)
|
|
28
|
+
run: pytest -q
|
|
29
|
+
|
|
30
|
+
- name: Build sdist + wheel
|
|
31
|
+
run: python -m build
|
|
32
|
+
|
|
33
|
+
- name: Publish to PyPI
|
|
34
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.pyc
|
|
3
|
+
.pytest_cache/
|
|
4
|
+
.ruff_cache/
|
|
5
|
+
*.egg-info/
|
|
6
|
+
.venv/
|
|
7
|
+
# Proof artifacts are generated; keep the dir, not the runs.
|
|
8
|
+
results/*.json
|
|
9
|
+
results/*.html
|
|
10
|
+
results/*.duckdb
|
|
11
|
+
results/*.duckdb.wal
|
|
12
|
+
!results/.gitkeep
|
|
13
|
+
|
|
14
|
+
# `tjb serve` writes a local history index next to whatever dir it serves
|
|
15
|
+
# (e.g. docs/evidence/…); never commit that byproduct.
|
|
16
|
+
history.duckdb
|
|
17
|
+
history.duckdb.wal
|
|
18
|
+
|
|
19
|
+
# DeepEval local cache/config (created on a live judge run)
|
|
20
|
+
.deepeval/
|
|
21
|
+
.deepeval-cache.json
|
|
22
|
+
|
|
23
|
+
# Secrets — never commit. Live provider keys land here for real benchmark runs.
|
|
24
|
+
.env
|
|
25
|
+
.env.local
|
|
26
|
+
.env.*.local
|
|
27
|
+
|
|
28
|
+
# Local agent scratch (briefs, notes) — not part of the published repo.
|
|
29
|
+
.claude/
|
|
30
|
+
|
|
31
|
+
# macOS
|
|
32
|
+
.DS_Store
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2025-06-24)
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
#### Agent Evaluation Framework
|
|
8
|
+
- **AgentRunner** — Multi-turn agent execution loop with max-turns guard
|
|
9
|
+
- **ToolRegistry** — Register and execute tools with JSON-schema advertisement
|
|
10
|
+
- **AgentTrace** — Observable record of every turn, tool call, and result
|
|
11
|
+
- **Safety Gate** — `validate_tools()` catches forbidden tools even with correct answers
|
|
12
|
+
- **ToolValidation** — Reports expected tools, ordering, safety, and error rate
|
|
13
|
+
|
|
14
|
+
#### Agent Benchmarks
|
|
15
|
+
- **sample-agent** — 3 offline tool-use tasks with safety gate validation
|
|
16
|
+
- **swe-bench-lite** — experimental scaffold (dataset loader + developer tools); fix-verification is NOT implemented, so it does not produce a pass-rate
|
|
17
|
+
|
|
18
|
+
#### SWE-Bench Tools
|
|
19
|
+
- **view** — Read file contents with line numbers
|
|
20
|
+
- **view_range** — Read specific line range
|
|
21
|
+
- **str_replace** — Exact-match string replacement (must match exactly once)
|
|
22
|
+
- **create** — Create new file
|
|
23
|
+
- **insert** — Insert text after specific line
|
|
24
|
+
- **bash** — Run shell commands with timeout
|
|
25
|
+
- **Path traversal blocking** — Prevents escaping workspace
|
|
26
|
+
- **Exact-match enforcement** — Prevents accidental mass-replace
|
|
27
|
+
|
|
28
|
+
#### Model Clients
|
|
29
|
+
- **AnthropicAgentClient** — Live tool-calling client for Anthropic
|
|
30
|
+
- **MockAgentClient** — Deterministic offline tool-calling client
|
|
31
|
+
- **ToolCallingClient** protocol — Multi-turn chat with tool use
|
|
32
|
+
|
|
33
|
+
#### Pipelines
|
|
34
|
+
- **Agent Proof Pipeline** — `run_agent_proof()` for multi-turn agent evaluation
|
|
35
|
+
- **Token summation** — Aggregates token usage across all turns for pricing
|
|
36
|
+
- **Tool validation scoring** — Safety gate + ordering + expected tools
|
|
37
|
+
|
|
38
|
+
#### Documentation (16 files)
|
|
39
|
+
- docs/README.md — Master documentation index
|
|
40
|
+
- docs/overview.md — Project overview and design principles
|
|
41
|
+
- docs/architecture.md — System design, data flow, module relationships
|
|
42
|
+
- docs/quickstart.md — 5-minute quickstart guide
|
|
43
|
+
- docs/cli-reference.md — Complete `tjb` command reference
|
|
44
|
+
- docs/pipelines.md — Single-shot and agent proof pipeline deep dive
|
|
45
|
+
- docs/models.md — Model client adapters and protocols
|
|
46
|
+
- docs/benchmarks.md — Available benchmarks and scoring
|
|
47
|
+
- docs/agents.md — Multi-turn agent execution framework
|
|
48
|
+
- docs/statistics.md — Statistical methods used for proof
|
|
49
|
+
- docs/cost-pricing.md — How costs are computed
|
|
50
|
+
- docs/tokenjam-integration.md — How we consume TokenJam
|
|
51
|
+
- docs/development.md — Contributing, testing, extending
|
|
52
|
+
- docs/api-reference.md — Module-level API documentation
|
|
53
|
+
- docs/swe-bench-lite.md — SWE-Bench Lite integration guide
|
|
54
|
+
- docs/tests.md — Complete test suite inventory
|
|
55
|
+
|
|
56
|
+
#### Tests
|
|
57
|
+
- 55 total tests (20 new for SWE-Bench Lite)
|
|
58
|
+
- Mock scoring tests for SWE-Bench Lite
|
|
59
|
+
- Tool operation tests (view, replace, create, insert, bash)
|
|
60
|
+
- Path traversal safety tests
|
|
61
|
+
- Patch parsing tests
|
|
62
|
+
|
|
63
|
+
### Design Principles
|
|
64
|
+
|
|
65
|
+
- **Black-box consumer** of TokenJam — imports as pip dependency, never vendored
|
|
66
|
+
- **Offline-first** — All tests run without API keys using mock clients
|
|
67
|
+
- **Objective ground truth** — Code execution and exact-match scoring, not LLM-as-judge
|
|
68
|
+
- **Statistical honesty** — Wilson CIs, McNemar exact tests, never claim significance on small samples
|
|
69
|
+
- **Safety-first** — Agent benchmarks include safety gate for dangerous tool calls
|
|
70
|
+
|
|
71
|
+
### Integration Points
|
|
72
|
+
|
|
73
|
+
| Feature | TokenJam API | Module |
|
|
74
|
+
|---------|-------------|--------|
|
|
75
|
+
| Candidate recommendation | `tokenjam.core.optimize.DOWNGRADE_CANDIDATES` | `recommend.py` |
|
|
76
|
+
| Cost pricing | `tokenjam.core.pricing.get_rates` | `cost.py` |
|
|
77
|
+
| Version stamp | `importlib.metadata.version("tokenjam")` | `version.py` |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
*This changelog documents the initial release of tokenjam-bench as an agent evaluation framework.*
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Contributing to TokenJam Bench
|
|
2
|
+
|
|
3
|
+
Thanks for helping. TokenJam Bench is honesty-branded: its credibility is its
|
|
4
|
+
evidence. The bar for a change is that every number it shows traces to a real,
|
|
5
|
+
reproducible measurement.
|
|
6
|
+
|
|
7
|
+
## Setup
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
pytest # full offline suite — no keys, no spend
|
|
12
|
+
ruff check . # lint
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Supported Python: 3.10, 3.11, 3.12. CI runs the suite on all three.
|
|
16
|
+
|
|
17
|
+
Run the app to see your change:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
tjb run # zero-flag offline proof
|
|
21
|
+
tjb serve # dashboard over the bundled real evidence
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## The honesty rules (enforced in CI)
|
|
25
|
+
|
|
26
|
+
`tests/test_honesty_guard.py` fails CI if a change reintroduces a dishonest
|
|
27
|
+
surface. Before you open a PR:
|
|
28
|
+
|
|
29
|
+
- No headline/dashboard number may come from a placeholder-priced run
|
|
30
|
+
(`priced_with_defaults=true`). Re-run with real rates, or keep legacy runs
|
|
31
|
+
under `docs/evidence/archive/` (non-headline).
|
|
32
|
+
- No banned overclaim strings in README, docs, or the dashboard: no
|
|
33
|
+
"quality preserved", no "safe to replace", no single `confidence = NN%`
|
|
34
|
+
scalar, no ROI extrapolation ("at 10x", "annual savings"). The honest forms —
|
|
35
|
+
Wilson CI, McNemar p-value, and the three hedged verdicts
|
|
36
|
+
(`no_significant_regression` / `significant_regression` /
|
|
37
|
+
`insufficient_evidence`) — are what to use instead.
|
|
38
|
+
- Accuracy is the pass-rate on a named suite. It is never a general quality
|
|
39
|
+
claim.
|
|
40
|
+
|
|
41
|
+
## How it's built
|
|
42
|
+
|
|
43
|
+
- **Offline-first.** Tests, lint, and the default `tjb run` work with no
|
|
44
|
+
provider keys and no network. Live providers are opt-in via a key in the env.
|
|
45
|
+
- **Flat layout.** Top-level modules and subpackages live under `tjbench/`.
|
|
46
|
+
- **TokenJam is a published dependency**, consumed as any external user would — never
|
|
47
|
+
vendored. Every artifact is stamped with the exact `tokenjam_version`.
|
|
48
|
+
|
|
49
|
+
See [docs/development.md](docs/development.md) for adding a benchmark or model
|
|
50
|
+
client, and [docs/architecture.md](docs/architecture.md) for the data flow.
|
|
51
|
+
|
|
52
|
+
## Pull requests
|
|
53
|
+
|
|
54
|
+
1. Branch off `main`.
|
|
55
|
+
2. Keep the change focused; update docs when behavior changes.
|
|
56
|
+
3. Make sure `ruff check .` and `pytest` pass locally (the honesty guard runs
|
|
57
|
+
inside `pytest`).
|
|
58
|
+
4. Fill out the PR template. CI must be green before review.
|
|
59
|
+
|
|
60
|
+
## Reporting issues
|
|
61
|
+
|
|
62
|
+
Use the issue templates. For a wrong or surprising number, include the artifact
|
|
63
|
+
JSON (or its filename under `docs/evidence/` or `results/`) so it's reproducible.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Metabuilder Labs
|
|
4
|
+
Copyright (c) 2026 Hooman Digital
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Every target in this file is a command, not a file it produces; declare them
# all phony so a stray file or directory with the same name can't mask one.
.PHONY: install update-tokenjam test lint bench-smoke version serve dashboard ci-bench
|
|
2
|
+
|
|
3
|
+
# Use the python where the deps (tokenjam, click, rich) are installed.
|
|
4
|
+
PY ?= python3
|
|
5
|
+
|
|
6
|
+
# Install the bench (editable) + dev tooling.
|
|
7
|
+
install:
|
|
8
|
+
# Install through $(PY) so the packages land in the interpreter the other targets run.
$(PY) -m pip install -e ".[dev]"
|
|
9
|
+
|
|
10
|
+
# THE daily-pull command: upgrade to the latest published TokenJam and show the
|
|
11
|
+
# version every subsequent proof will be stamped with.
|
|
12
|
+
update-tokenjam:
|
|
13
|
+
# Upgrade through $(PY) so the version printed on the next line is the one just installed.
$(PY) -m pip install -U tokenjam
|
|
14
|
+
@$(PY) -c "import importlib.metadata as m; print('tokenjam now at', m.version('tokenjam'))"
|
|
15
|
+
|
|
16
|
+
# The CLI targets (version, serve, bench-smoke) go through run.py so they work without relying on the installed
|
|
17
|
+
# console script (flat layout → `cli` collides with the `cli` PyPI package).
|
|
18
|
+
version:
|
|
19
|
+
$(PY) run.py version
|
|
20
|
+
|
|
21
|
+
# Live proof dashboard (offline, auto-refreshing) at http://127.0.0.1:7392/
|
|
22
|
+
serve dashboard:
|
|
23
|
+
$(PY) run.py serve --open
|
|
24
|
+
|
|
25
|
+
# Offline end-to-end smoke (no keys, no spend).
|
|
26
|
+
bench-smoke:
|
|
27
|
+
$(PY) run.py run --benchmark samples --original anthropic:claude-opus-4-7 --mock
|
|
28
|
+
|
|
29
|
+
test:
|
|
30
|
+
pytest -q
|
|
31
|
+
|
|
32
|
+
lint:
|
|
33
|
+
ruff check .
|
|
34
|
+
|
|
35
|
+
# Continuous-benchmark set: offline always; live if a provider key is exported.
|
|
36
|
+
ci-bench:
|
|
37
|
+
$(PY) -m tjbench.ci_benchmark
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
tokenjam-bench
|
|
2
|
+
Copyright (c) 2026 Metabuilder Labs
|
|
3
|
+
|
|
4
|
+
This project was originally created by Piyush (Hooman Digital) and is maintained
|
|
5
|
+
by Metabuilder Labs as the official benchmark for TokenJam
|
|
6
|
+
(https://github.com/Metabuilder-Labs/tokenjam). Licensed under the MIT License;
|
|
7
|
+
the original Hooman Digital copyright is retained in LICENSE.
|