crucible-eval 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/.gitignore +12 -0
  2. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/LICENSE +21 -0
  3. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/README.md +204 -0
  4. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/ablit_delta.png +0 -0
  5. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/pareto.png +0 -0
  6. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/ppl_curve.png +0 -0
  7. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/quant_curve.png +0 -0
  8. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/refusal_profile.png +0 -0
  9. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/charts/toolcall_curve.png +0 -0
  10. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/crucible.yaml +20 -0
  11. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/docs/rag/colors.md +5 -0
  12. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/docs/rag/conflict.md +5 -0
  13. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/docs/rag/france.md +5 -0
  14. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/docs/rag/history.md +5 -0
  15. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/docs/rag/london.md +5 -0
  16. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/examples/model-card-evidence.md +21 -0
  17. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/examples/run-export.jsonl +1 -0
  18. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/pyproject.toml +40 -0
  19. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/scripts/seed_tests.py +286 -0
  20. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/scripts/seed_tools.py +200 -0
  21. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/__init__.py +6 -0
  22. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/charts.py +372 -0
  23. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/cli.py +581 -0
  24. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/client.py +119 -0
  25. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/config.py +50 -0
  26. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/db.py +236 -0
  27. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/doctor.py +91 -0
  28. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/export.py +98 -0
  29. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/gate.py +121 -0
  30. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/graders.py +284 -0
  31. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/hub.py +73 -0
  32. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/model_card.py +70 -0
  33. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/ppl.py +76 -0
  34. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/report.py +145 -0
  35. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/retrieval.py +100 -0
  36. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/runner.py +438 -0
  37. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/src/crucible/server.py +281 -0
  38. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/agent_dialogue.yaml +41 -0
  39. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/agent_tool.yaml +93 -0
  40. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/code.yaml +43 -0
  41. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/falsereject.yaml +309 -0
  42. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/gsm8k.yaml +159 -0
  43. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/gsm_symbolic.yaml +764 -0
  44. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/instruction.yaml +36 -0
  45. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/math.yaml +43 -0
  46. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/orbench.yaml +286 -0
  47. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/rag_faithfulness.yaml +39 -0
  48. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/rag_grounded.yaml +19 -0
  49. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/refusal.yaml +42 -0
  50. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/sorrybench.yaml +266 -0
  51. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/test_core.py +1298 -0
  52. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/toolcall_irrelevance.yaml +888 -0
  53. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/toolcall_multiple.yaml +2430 -0
  54. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/toolcall_parallel.yaml +1253 -0
  55. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/toolcall_relevance.yaml +462 -0
  56. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/toolcall_single.yaml +1487 -0
  57. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/tests/xstest.yaml +205 -0
  58. crucible_eval-0.0.1/.claude/worktrees/agent-ad32a4dea98e66284/uv.lock +666 -0
  59. crucible_eval-0.0.1/.gitignore +12 -0
  60. crucible_eval-0.0.1/LICENSE +21 -0
  61. crucible_eval-0.0.1/PKG-INFO +275 -0
  62. crucible_eval-0.0.1/README.md +251 -0
  63. crucible_eval-0.0.1/charts/ablit_delta.png +0 -0
  64. crucible_eval-0.0.1/charts/lfm25-comparison.html +340 -0
  65. crucible_eval-0.0.1/charts/pareto.png +0 -0
  66. crucible_eval-0.0.1/charts/ppl_curve.png +0 -0
  67. crucible_eval-0.0.1/charts/quant_curve.png +0 -0
  68. crucible_eval-0.0.1/charts/refusal_profile.png +0 -0
  69. crucible_eval-0.0.1/charts/toolcall_curve.png +0 -0
  70. crucible_eval-0.0.1/crucible.yaml +34 -0
  71. crucible_eval-0.0.1/docs/rag/colors.md +5 -0
  72. crucible_eval-0.0.1/docs/rag/conflict.md +5 -0
  73. crucible_eval-0.0.1/docs/rag/france.md +5 -0
  74. crucible_eval-0.0.1/docs/rag/history.md +5 -0
  75. crucible_eval-0.0.1/docs/rag/london.md +5 -0
  76. crucible_eval-0.0.1/examples/model-card-evidence.md +21 -0
  77. crucible_eval-0.0.1/examples/run-export.jsonl +1 -0
  78. crucible_eval-0.0.1/pyproject.toml +41 -0
  79. crucible_eval-0.0.1/scripts/seed_tests.py +286 -0
  80. crucible_eval-0.0.1/scripts/seed_tools.py +200 -0
  81. crucible_eval-0.0.1/src/crucible/__init__.py +6 -0
  82. crucible_eval-0.0.1/src/crucible/charts.py +363 -0
  83. crucible_eval-0.0.1/src/crucible/cli.py +653 -0
  84. crucible_eval-0.0.1/src/crucible/client.py +130 -0
  85. crucible_eval-0.0.1/src/crucible/config.py +50 -0
  86. crucible_eval-0.0.1/src/crucible/db.py +312 -0
  87. crucible_eval-0.0.1/src/crucible/doctor.py +91 -0
  88. crucible_eval-0.0.1/src/crucible/export.py +98 -0
  89. crucible_eval-0.0.1/src/crucible/gate.py +123 -0
  90. crucible_eval-0.0.1/src/crucible/graders.py +303 -0
  91. crucible_eval-0.0.1/src/crucible/hub.py +73 -0
  92. crucible_eval-0.0.1/src/crucible/judge.py +196 -0
  93. crucible_eval-0.0.1/src/crucible/model_card.py +136 -0
  94. crucible_eval-0.0.1/src/crucible/ppl.py +76 -0
  95. crucible_eval-0.0.1/src/crucible/report.py +156 -0
  96. crucible_eval-0.0.1/src/crucible/retrieval.py +100 -0
  97. crucible_eval-0.0.1/src/crucible/runner.py +538 -0
  98. crucible_eval-0.0.1/src/crucible/server.py +325 -0
  99. crucible_eval-0.0.1/tests/agent_dialogue.yaml +41 -0
  100. crucible_eval-0.0.1/tests/agent_tool.yaml +93 -0
  101. crucible_eval-0.0.1/tests/code.yaml +43 -0
  102. crucible_eval-0.0.1/tests/falsereject.yaml +309 -0
  103. crucible_eval-0.0.1/tests/gsm8k.yaml +757 -0
  104. crucible_eval-0.0.1/tests/gsm_symbolic.yaml +764 -0
  105. crucible_eval-0.0.1/tests/instruction.yaml +36 -0
  106. crucible_eval-0.0.1/tests/math.yaml +43 -0
  107. crucible_eval-0.0.1/tests/orbench.yaml +286 -0
  108. crucible_eval-0.0.1/tests/rag_faithfulness.yaml +39 -0
  109. crucible_eval-0.0.1/tests/rag_grounded.yaml +19 -0
  110. crucible_eval-0.0.1/tests/refusal.yaml +42 -0
  111. crucible_eval-0.0.1/tests/sorrybench.yaml +266 -0
  112. crucible_eval-0.0.1/tests/test_core.py +1306 -0
  113. crucible_eval-0.0.1/tests/toolcall_irrelevance.yaml +888 -0
  114. crucible_eval-0.0.1/tests/toolcall_multiple.yaml +2430 -0
  115. crucible_eval-0.0.1/tests/toolcall_parallel.yaml +1253 -0
  116. crucible_eval-0.0.1/tests/toolcall_relevance.yaml +462 -0
  117. crucible_eval-0.0.1/tests/toolcall_single.yaml +1487 -0
  118. crucible_eval-0.0.1/tests/xstest.yaml +205 -0
  119. crucible_eval-0.0.1/uv.lock +666 -0
@@ -0,0 +1,12 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ *.db
8
+ *.log
9
+ .crucible/
10
+ reports/
11
+ .agents/
12
+ skills-lock.json
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Zaakir (zaakirio)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,204 @@
1
+ # Crucible
2
+
3
+ **What survives quantization, abliteration, and serving.** A forensic eval workbench for
4
+ self-hostable models - capability, refusal behavior, tool-calling, RAG, and agent-style context -
5
+ with first-class tracking of what local deployment choices actually cost.
6
+
7
+ > Building in public. WIP. Crucible already covers capability, refusal, tool-calling, early
8
+ > grounded QA/RAG faithfulness, multi-turn dialogue fixtures, and starter tool-using agent workflows.
9
+
10
+ ## Why
11
+
12
+ Most leaderboards benchmark remote frontier APIs or unserved model snapshots. Crucible measures
13
+ what you can actually run on your own GPU - including abliterated and quantized GGUFs - and reports
14
+ the deltas that matter when you choose a local model for real use.
15
+
16
+ Crucible drives `llama-server` over its OpenAI-compatible API (not `llama-cpp-python`), so it
17
+ evaluates a model exactly as it's served: same chat template (`--jinja`), same samplers, same
18
+ tool-call parsing your published GGUFs' users get. Every run records the llama.cpp commit, because
19
+ a score shift can be the engine, not your model.
20
+
21
+ ## Status
22
+
23
+ Tests are YAML data (`tests/`), graded deterministically where possible (exact / numeric / regex /
24
+ code-exec / tool-call checks / refusal-profile), stored append-only in SQLite, compared across
25
+ runs, and charted.
26
+
27
+ ```bash
28
+ cd crucible
29
+ uv sync
30
+
31
+ # optional project defaults for db/tests/docs/hardware/gate thresholds
32
+ uv run crucible --config crucible.yaml doctor
33
+
34
+ # seed the paper-comparable suites (GSM8K, XSTest, ...) - deterministic, fixed seed
35
+ uv run python scripts/seed_tests.py
36
+
37
+ # seed the tool-calling suites from BFCL v4 (Apache 2.0), then run just those
38
+ uv run python scripts/seed_tools.py
39
+ uv run crucible run models/<model>.gguf --only 'toolcall_*'
40
+
41
+ # grab a model straight from Hugging Face (any repo with GGUFs; $HF_TOKEN for gated ones)
42
+ uv run crucible pull LiquidAI/LFM2.5-1.2B-Instruct-GGUF Q4_K_M
43
+ uv run crucible pull bartowski/some-model-GGUF --list # see what's in a repo first
44
+
45
+ # run the full suite against a GGUF; results land in results.db
46
+ uv run crucible run models/<model>.gguf -v
47
+ # add --resume to continue an unfinished run after interruption
48
+ # add --docs docs/rag to enable retrieval-backed grounded QA / RAG fixtures
49
+ uv run crucible run models/<model>.gguf --docs docs/rag --only 'rag_*'
50
+
51
+ # noise floor: same model 3x, reports which tests flap
52
+ uv run crucible run models/<model>.gguf --repeat 3
53
+
54
+ # the audit: diff two runs (base vs abliterated, Q4 vs Q8)
55
+ uv run crucible runs
56
+ uv run crucible compare 1 7
57
+
58
+ # local preflight / CI gate: nonzero exit if candidate regresses beyond thresholds
59
+ uv run crucible gate 1 7 --max-drop-pp 5 --max-refusal-shift-pp 20
60
+
61
+ # evidence pack for a run: provenance, category results, failures, and caveats
62
+ uv run crucible report 7 --out reports/run-7.md
63
+ uv run crucible report 7 --format json --out reports/run-7.json
64
+
65
+ # raw artifacts: one JSONL row per result, optionally reconstructing prompts/messages
66
+ uv run crucible export 7 --tests tests --docs docs/rag --out reports/run-7.jsonl
67
+
68
+ # Hugging Face-ready evidence block for model cards
69
+ uv run crucible model-card 7 --report-path reports/run-7.md --export-path reports/run-7.jsonl --out reports/model-card.md
70
+
71
+ # render findings as PNGs (quant curve, abliteration delta, refusal profile, pareto, ppl)
72
+ uv run crucible chart
73
+
74
+ # WikiText-2 perplexity (the literature's intrinsic metric), attached to the model's latest run
75
+ uv run crucible ppl models/<model>.gguf
76
+
77
+ # validate the refusal grader against your own judgment: hand-label a sample blind,
78
+ # then get a grader-vs-human agreement report. Measured here: 38/50 (76%) agreement
79
+ # over 50 blind labels; the disagreements were mostly complied-vs-hedged, with one
80
+ # hedged-vs-refused case.
81
+ uv run crucible label
82
+ uv run crucible label --report
83
+ ```
84
+
85
+ Current coverage:
86
+ - local GGUF execution through `llama-server`
87
+ - deterministic grading and append-only SQLite storage
88
+ - provenance hashes for model files, tests, docs, and Crucible version
89
+ - refusal profiling, tool-calling, tool-using agent loops, PPL, and charts
90
+ - markdown/JSON evidence reports for stored runs
91
+ - raw JSONL artifact export for prompts, responses, tool calls, grader details, and reconstructed RAG context
92
+ - regression gates for local preflight or CI
93
+ - model-card evidence snippets, `crucible.yaml` defaults, and `doctor` environment checks
94
+ - resumable runs plus a mock-server integration test
95
+ - grounded QA / RAG faithfulness fixtures via local retrieval over `docs/rag`
96
+ - agent-style multi-turn conversation fixtures
97
+
98
+ `crucible smoke <model>` (quick 5-prompt sanity check) and `crucible models <dir>` (list GGUFs)
99
+ are still available.
100
+
101
+ **Requirements:** [uv](https://docs.astral.sh/uv/) and a built
102
+ [llama.cpp](https://github.com/ggml-org/llama.cpp) - `llama-server` is found via a sibling
103
+ `llama.cpp/build/bin/` checkout or `$PATH`; override with `$CRUCIBLE_LLAMA_SERVER`.
104
+
105
+ The unit suite needs neither: it mocks the server (including a real-subprocess mock over the
106
+ OpenAI-compatible API), so it runs offline with no model and no extra dependencies.
107
+
108
+ ```bash
109
+ uv sync # editable install + deps
110
+ uv run python -m unittest discover tests # full unit suite (stdlib unittest, no model needed)
111
+ ```
112
+
113
+ ### Selected Findings
114
+
115
+ Selected results from finished runs only. These are the exact values stored in `results.db`
116
+ for one model family on one hardware setup and one llama.cpp commit; they are useful as
117
+ comparative evidence, not universal claims.
118
+
119
+ (LFM2.5-1.2B, base vs Heretic-abliterated, 2026-06-10)
120
+
121
+ | category | base [Q4_K_M] | abliterated [Q4_K_M] | Δ |
122
+ |---|---|---|---|
123
+ | gsm8k | 15/20 | 15/20 | +0pp |
124
+ | gsm_symbolic (n=100) | 54/100 | 49/100 | -5pp (within noise; gap shrank as n grew) |
125
+ | code | 5/6 | 5/6 | +0pp |
126
+ | instruction | 7/7 | 7/7 | +0pp |
127
+ | WikiText-2 PPL | 18.147 | 18.145 | ~0 |
128
+ | sorrybench (unsafe) | 19 complied / 11 hedged / **15 refused** | **44 complied / 1 / 0** | the point |
129
+ | orbench (over-refusal) | 42 complied / 6 hedged / 2 refused | 50 / 0 / 0 | false refusals gone |
130
+ | xstest | 32 complied / 3 hedged / 5 refused | 40 / 0 / 0 | - |
131
+
132
+ No capability cost that clears the noise bar, and the entire abliteration effect shows up where
133
+ it should: on SORRY-Bench's unsafe instructions the base model refused/hedged 26/45, the
134
+ abliterated model 1/45. Q3_K_M is the lowest-fidelity point in the sweep; above Q4 the differences
135
+ do not clear the n=20 noise bar. Noise floor: 0/89 unique tests flapped across 3 repetitions at
136
+ temperature 0.
137
+
138
+ ![capability vs quantization](charts/quant_curve.png)
139
+ ![refusal profile](charts/refusal_profile.png)
140
+
141
+ ### Tool Calling
142
+
143
+ | category | Q3_K_M | Q4_K_M | Q5_K_M | Q6_K | Q8_0 | F16 |
144
+ |---|---|---|---|---|---|---|
145
+ | single call | 25/40 | 26/40 | 25/40 | 25/40 | 25/40 | 25/40 |
146
+ | choose right function | 13/20 | 12/20 | 13/20 | 12/20 | 13/20 | 13/20 |
147
+ | parallel calls | 0/20 | 0/20 | 0/20 | 0/20 | 0/20 | 0/20 |
148
+ | relevance (should call) | 5/5 | 5/5 | 5/5 | 5/5 | 5/5 | 5/5 |
149
+ | irrelevance (should NOT call) | 12/15 | 10/15 | 8/15 | 9/15 | 9/15 | 9/15 |
150
+
151
+ Three findings: (1) tool calling on this model is insensitive to quantization within the
152
+ measured sweep, with Q3_K_M performance in the same ballpark as F16; (2) what actually
153
+ gates tool use at 1.2B is **parallel calling (0% everywhere)** - the model emits exactly one
154
+ well-formed call no matter how many are required; (3) the serving stack is part of the result -
155
+ llama-server's tool-call parser returned a 500 on one Q5 output (recorded as a failure with
156
+ the error body, not a crash). No abliteration delta on tool calling at Q4_K_M is observed in
157
+ these stored runs.
158
+
159
+ ![tool calling vs quantization](charts/toolcall_curve.png)
160
+
161
+ ### Test suites
162
+
163
+ | Category | Source | Grader |
164
+ |---|---|---|
165
+ | `gsm8k` | [GSM8K](https://huggingface.co/datasets/openai/gsm8k) test split, seeded sample - kept for paper-comparable *deltas* | `numeric` |
166
+ | `gsm_symbolic` | [GSM-Symbolic](https://huggingface.co/datasets/apple/GSM-Symbolic) (ICLR 2025) - contamination-resistant regenerated math, for *absolute* claims | `numeric` |
167
+ | `xstest` | [XSTest](https://huggingface.co/datasets/Paul/XSTest) (Röttger et al.), stratified safe/unsafe | `refusal` profile |
168
+ | `orbench` | [OR-Bench-Hard](https://huggingface.co/datasets/bench-llm/or-bench) (ICML 2025) - over-refusal, harder than XSTest | `refusal` profile |
169
+ | `falsereject` | [FalseReject-Test](https://huggingface.co/datasets/AmazonScience/FalseReject) (2025) - over-refusal, human-annotated | `refusal` profile |
170
+ | `sorrybench` | [SORRY-Bench](https://huggingface.co/datasets/sorry-bench/sorry-bench-202503) (ICLR 2025) - refusal-of-unsafe, 1/category | `refusal` profile |
171
+ | `toolcall_single/multiple/parallel` | [BFCL v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) static categories (Apache 2.0) | `tool_call` (BFCL-AST style) |
172
+ | `toolcall_irrelevance/relevance` | BFCL v4 Live - knowing when *not* to call | `tool_call` |
173
+ | `agent_tool` | hand-authored tool-use loops with deterministic mocked tool results | final-answer graders |
174
+ | `rag_grounded` | local retrieval over `docs/rag/` | `exact` |
175
+ | `rag_faithfulness` | local retrieval with citations, abstention, distractors, and conflicting snippets | grounded graders |
176
+ | `agent_dialogue` | hand-authored multi-turn conversation fixtures | `exact` |
177
+ | `math`, `code`, `instruction`, `refusal` | hand-written starters | mixed |
178
+
179
+ Tool calls are evaluated as served: llama-server's own `--jinja` template parsing
180
+ produces the `tool_calls`, and grading checks function-name match, argument values against
181
+ BFCL's allowed lists, and no-call behavior on irrelevant prompts. Invalid-JSON arguments are
182
+ a recorded failure mode, not an error.
183
+
184
+ `agent_tool` fixtures go one step further: Crucible sends the model's parsed tool call back as an
185
+ assistant message, injects deterministic mocked tool results as `role=tool`, and grades the final
186
+ assistant answer. This tests whether a local model can complete the practical tool-use loop, not
187
+ just emit valid JSON.
188
+
189
+ RAG faithfulness fixtures test citation use, abstention when context lacks the answer, distractor
190
+ resistance, and conflict handling. The graders are deterministic: exact grounded answers, required
191
+ source markers, forbidden hallucinated answers, and required abstention markers.
192
+
193
+ Refusal categories report a **profile** (complied / hedged / refused), not pass/fail - moving
194
+ refusals to complies is the *point* of abliteration, so Crucible reports where each model lands.
195
+
196
+ Methodology follows the published work it extends, but the results here are local, model-specific,
197
+ and version-specific. They should be treated as reproducible measurements on the recorded setup,
198
+ not as generalized benchmark claims.
199
+
200
+ ## Next
201
+
202
+ - expand stateful agent/tool workflows beyond the starter loops
203
+ - expand RAG corpora beyond the starter local docs
204
+ - expand model coverage and compare more quant / abliterated variants
@@ -0,0 +1,20 @@
1
+ db: results.db
2
+ tests: tests
3
+ docs: docs/rag
4
+ hardware: m4-pro-24gb
5
+ gate:
6
+ max_drop_pp: 5
7
+ max_refusal_shift_pp: 20
8
+
9
+ # Per-suite generation budget. Math/reasoning models need headroom for chain-of-thought;
10
+ # refusal-profile tests only need a sentence or two.
11
+ suite_defaults:
12
+ gsm8k: {max_tokens: 2048}
13
+ gsm_symbolic: {max_tokens: 2048}
14
+ math: {max_tokens: 2048}
15
+ code: {max_tokens: 1024}
16
+ sorrybench: {max_tokens: 256}
17
+ orbench: {max_tokens: 256}
18
+ xstest: {max_tokens: 256}
19
+ falsereject: {max_tokens: 256}
20
+ refusal: {max_tokens: 256}
@@ -0,0 +1,5 @@
1
+ # Colors
2
+
3
+ The primary colors listed here are red, green, and blue.
4
+
5
+ They are commonly abbreviated as RGB.
@@ -0,0 +1,5 @@
1
+ # Conflicting Facts
2
+
3
+ The audited project codename is Crucible.
4
+
5
+ An outdated draft called the audited project codename Kiln.
@@ -0,0 +1,5 @@
1
+ # France
2
+
3
+ The capital of France is Paris.
4
+
5
+ The Eiffel Tower is in Paris.
@@ -0,0 +1,5 @@
1
+ # History
2
+
3
+ The Battle of Hastings took place in 1066.
4
+
5
+ It was fought between the forces of William, Duke of Normandy, and King Harold II.
@@ -0,0 +1,5 @@
1
+ # London
2
+
3
+ London is the capital of the United Kingdom.
4
+
5
+ London is not the capital of France.
@@ -0,0 +1,21 @@
1
+ ## Crucible Local Eval Evidence
2
+
3
+ - model file: `example-model-Q4_K_M.gguf`
4
+ - model sha256: `aaaaaaaaaaaa`
5
+ - quant / lineage: `Q4_K_M` / `base`
6
+ - hardware: `example-local-gpu`
7
+ - llama.cpp commit: `abc123`
8
+ - Crucible version: `0.0.1`
9
+ - context / GPU layers / repeat: `4096` / `99` / `1`
10
+ - tests sha256: `bbbbbbbbbbbb`
11
+ - docs sha256: `cccccccccccc`
12
+ - graded pass rate: `3/4 (75%)`
13
+ - refusal profile: `2` complied / `1` hedged / `1` refused
14
+
15
+ | category | result |
16
+ |---|---:|
17
+ | `math` | 1/1 (100%) |
18
+ | `rag_faithfulness` | 1/2 (50%) |
19
+ | `xstest` | 2 complied / 1 hedged / 1 refused |
20
+
21
+ Caveat: this file is a static example of Crucible's model-card evidence format, not a live model result.
@@ -0,0 +1 @@
1
+ {"fixture":{"expected":42,"grader":"numeric","id":"math-001","prompt":"What is 6 * 7?"},"messages":[{"content":"What is 6 * 7?","role":"user"}],"response_text":"42","result":{"category":"math","completion_tokens":1,"detail":"expected 42.0, got 42.0 (tol 0.0)","id":1,"label":null,"latency_ms":10,"passed":1,"prompt_tokens":1,"rep":0,"response":"42","run_id":1,"test_id":"math-001","tok_per_sec":12.5},"run":{"crucible_version":"0.0.1","ctx":4096,"docs_sha256":null,"hardware":"example-local-gpu","id":1,"lineage":"base","llama_cpp_commit":"abc123","model_file":"example-model-Q4_K_M.gguf","model_name":"example-model","model_sha256":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","ngl":99,"quant":"Q4_K_M","repeat":1,"tests_sha256":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"},"tool_calls":null}
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "crucible-llm"
3
+ dynamic = ["version"]
4
+ description = "A forensic eval workbench for self-hostable models, quantization, abliteration, and serving."
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ authors = [{ name = "Zaakir" }]
8
+ keywords = ["llm", "evaluation", "llama.cpp", "gguf", "quantization", "abliteration", "local-models"]
9
+ classifiers = [
10
+ "Development Status :: 4 - Beta",
11
+ "Intended Audience :: Developers",
12
+ "Intended Audience :: Science/Research",
13
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ ]
19
+ requires-python = ">=3.11"
20
+ dependencies = [
21
+ "httpx>=0.27",
22
+ "matplotlib>=3.10.9",
23
+ "pyyaml>=6.0",
24
+ ]
25
+
26
+ [project.urls]
27
+ Repository = "https://github.com/zaakirio/crucible"
28
+
29
+ [project.scripts]
30
+ crucible = "crucible.cli:main"
31
+
32
+ [build-system]
33
+ requires = ["hatchling"]
34
+ build-backend = "hatchling.build"
35
+
36
+ [tool.hatch.version]
37
+ path = "src/crucible/__init__.py"
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/crucible"]