fieldtest 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. fieldtest-0.1.0/.gitignore +18 -0
  2. fieldtest-0.1.0/LICENSE +21 -0
  3. fieldtest-0.1.0/PKG-INFO +662 -0
  4. fieldtest-0.1.0/README.md +632 -0
  5. fieldtest-0.1.0/examples/eval-patterns.md +314 -0
  6. fieldtest-0.1.0/examples/runner-patterns.md +174 -0
  7. fieldtest-0.1.0/examples/runner_anthropic.py +115 -0
  8. fieldtest-0.1.0/examples/runner_openai.py +70 -0
  9. fieldtest-0.1.0/examples/runner_subprocess.py +109 -0
  10. fieldtest-0.1.0/fieldtest/__init__.py +5 -0
  11. fieldtest-0.1.0/fieldtest/cli.py +476 -0
  12. fieldtest-0.1.0/fieldtest/config.py +288 -0
  13. fieldtest-0.1.0/fieldtest/errors.py +36 -0
  14. fieldtest-0.1.0/fieldtest/init_template.py +83 -0
  15. fieldtest-0.1.0/fieldtest/judges/__init__.py +1 -0
  16. fieldtest-0.1.0/fieldtest/judges/dispatch.py +80 -0
  17. fieldtest-0.1.0/fieldtest/judges/llm.py +185 -0
  18. fieldtest-0.1.0/fieldtest/judges/reference.py +41 -0
  19. fieldtest-0.1.0/fieldtest/judges/regex_.py +32 -0
  20. fieldtest-0.1.0/fieldtest/judges/registry.py +75 -0
  21. fieldtest-0.1.0/fieldtest/providers/__init__.py +25 -0
  22. fieldtest-0.1.0/fieldtest/providers/anthropic.py +42 -0
  23. fieldtest-0.1.0/fieldtest/providers/base.py +19 -0
  24. fieldtest-0.1.0/fieldtest/results/__init__.py +1 -0
  25. fieldtest-0.1.0/fieldtest/results/aggregator.py +211 -0
  26. fieldtest-0.1.0/fieldtest/results/report.py +418 -0
  27. fieldtest-0.1.0/fieldtest/results/writer.py +115 -0
  28. fieldtest-0.1.0/fieldtest/runner.py +154 -0
  29. fieldtest-0.1.0/pyproject.toml +49 -0
  30. fieldtest-0.1.0/tests/__init__.py +0 -0
  31. fieldtest-0.1.0/tests/evals/.gitignore +4 -0
  32. fieldtest-0.1.0/tests/evals/config.yaml +85 -0
  33. fieldtest-0.1.0/tests/evals/fixtures/score_basic.yaml +21 -0
  34. fieldtest-0.1.0/tests/evals/fixtures/score_with_judge_errors.yaml +11 -0
  35. fieldtest-0.1.0/tests/evals/outputs/score_basic/run-1.txt +154 -0
  36. fieldtest-0.1.0/tests/evals/outputs/score_basic/run-2.txt +154 -0
  37. fieldtest-0.1.0/tests/evals/outputs/score_basic/run-3.txt +154 -0
  38. fieldtest-0.1.0/tests/evals/outputs/score_with_judge_errors/run-1.txt +94 -0
  39. fieldtest-0.1.0/tests/evals/outputs/score_with_judge_errors/run-2.txt +94 -0
  40. fieldtest-0.1.0/tests/evals/outputs/score_with_judge_errors/run-3.txt +94 -0
  41. fieldtest-0.1.0/tests/evals/results/2026-03-24T09-58-54-f4b8.csv +31 -0
  42. fieldtest-0.1.0/tests/evals/results/2026-03-24T09-58-54-f4b8.json +504 -0
  43. fieldtest-0.1.0/tests/evals/results/2026-03-24T09-58-54-f4b8.md +25 -0
  44. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-14-67ec.csv +31 -0
  45. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-14-67ec.json +535 -0
  46. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-14-67ec.md +25 -0
  47. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-49-9a88.csv +31 -0
  48. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-49-9a88.json +516 -0
  49. fieldtest-0.1.0/tests/evals/results/2026-03-24T10-04-49-9a88.md +25 -0
  50. fieldtest-0.1.0/tests/evals/rules.py +252 -0
  51. fieldtest-0.1.0/tests/evals/runner.py +98 -0
  52. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/.gitignore +2 -0
  53. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/config.yaml +34 -0
  54. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/fixtures/fixture_a.yaml +11 -0
  55. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/fixtures/fixture_b.yaml +5 -0
  56. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/outputs/fixture_a/run-1.txt +1 -0
  57. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/outputs/fixture_a/run-2.txt +1 -0
  58. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/outputs/fixture_b/run-1.txt +1 -0
  59. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/outputs/fixture_b/run-2.txt +1 -0
  60. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-23T18-52-09-79fb.csv +5 -0
  61. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-23T18-52-09-79fb.json +94 -0
  62. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-23T18-52-09-79fb.md +17 -0
  63. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T09-54-10-5884.csv +9 -0
  64. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T09-54-10-5884.md +17 -0
  65. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-222c.csv +9 -0
  66. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-222c.md +17 -0
  67. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-4c50.csv +9 -0
  68. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-4c50.md +17 -0
  69. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-c1f6.csv +9 -0
  70. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-c1f6.json +154 -0
  71. fieldtest-0.1.0/tests/evals/test_projects/basic/evals/results/2026-03-24T10-04-10-c1f6.md +17 -0
  72. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/.gitignore +2 -0
  73. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/config.yaml +36 -0
  74. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/fixtures/fixture_c.yaml +7 -0
  75. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/outputs/fixture_c/run-1.txt +1 -0
  76. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/outputs/fixture_c/run-2.txt +1 -0
  77. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T09-55-15-caa5.csv +5 -0
  78. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T09-55-15-caa5.md +21 -0
  79. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-18e5.csv +5 -0
  80. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-18e5.md +21 -0
  81. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-8820.csv +5 -0
  82. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-8820.md +21 -0
  83. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-bb36.csv +5 -0
  84. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-bb36.json +94 -0
  85. fieldtest-0.1.0/tests/evals/test_projects/with_errors/evals/results/2026-03-24T10-04-11-bb36.md +21 -0
  86. fieldtest-0.1.0/tests/test_aggregator.py +226 -0
  87. fieldtest-0.1.0/tests/test_cli.py +425 -0
  88. fieldtest-0.1.0/tests/test_config.py +313 -0
  89. fieldtest-0.1.0/tests/test_judges.py +305 -0
@@ -0,0 +1,18 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ build/
7
+ dist/
8
+ *.egg-info/
9
+ .installed.cfg
10
+ *.egg
11
+ .env
12
+ .venv
13
+ env/
14
+ venv/
15
+ .pytest_cache/
16
+ .coverage
17
+ htmlcov/
18
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Galen Mittermann
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,662 @@
1
+ Metadata-Version: 2.4
2
+ Name: fieldtest
3
+ Version: 0.1.0
4
+ Summary: Structured AI eval practice for any project
5
+ Project-URL: Homepage, https://github.com/gmitt98/fieldtest
6
+ Project-URL: Repository, https://github.com/gmitt98/fieldtest
7
+ Project-URL: Issues, https://github.com/gmitt98/fieldtest
8
+ Author: Galen Mittermann
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,evals,evaluation,llm,testing
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: anthropic>=0.20.0
23
+ Requires-Dist: click>=8.0
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest-asyncio; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # fieldtest
32
+
33
+ The eval landscape is crowded at the execution layer and nearly empty at the practice layer.
34
+
35
+ Most eval tools assume you already know what to evaluate. You install a framework, run some metrics, see numbers. The numbers feel like quality, yet they're not: they are measurements without meaning, because nobody defined what the measurements are supposed to catch before running them.
36
+
37
+ **fieldtest is a tool for the layer that's missing: the reasoning that produces the evals.**
38
+
39
+ The config asks you — in order — to name your use cases, define what right, good, and safe mean for each, and specify how you'll test them. That sequence is the thing most teams skip, which is why they end up with evals that measure what's easy rather than what matters. The structure of the testing enforces the reasoning. With fieldtest, you cannot skip to measurement without first doing the definitional work. How well you do that is up to you, but we provide the scaffolding to reason about what you are actually trying to measure.
40
+
41
+ ---
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install fieldtest
47
+ export ANTHROPIC_API_KEY=sk-ant-... # for LLM judge calls
48
+ ```
49
+
50
+ ---
51
+
52
+ ## How it works
53
+
54
+ fieldtest expects your project to have an `evals/` directory with a `config.yaml` file. All commands default to `evals/config.yaml` relative to your working directory. Use `--config <path>` to override.
55
+
56
+ ```
57
+ your-project/
58
+ evals/
59
+ config.yaml ← fieldtest reads this
60
+ fixtures/ ← your test inputs
61
+ outputs/ ← your runner writes here
62
+ results/ ← fieldtest score writes here
63
+ ```
64
+
65
+ Run all fieldtest commands from your project root (the directory that contains `evals/`).
66
+
67
+ ---
68
+
69
+ ## Quickstart
70
+
71
+ ### 1. Scaffold your eval directory
72
+
73
+ ```bash
74
+ fieldtest init
75
+ ```
76
+
77
+ This creates:
78
+
79
+ ```
80
+ evals/
81
+ config.yaml ← fill this out first
82
+ fixtures/
83
+ golden/ ← fixtures with expected output (used for regression)
84
+ variations/ ← fixtures without expected output
85
+ outputs/ ← your runner writes here (git-ignored)
86
+ results/ ← fieldtest score writes here
87
+ .gitignore ← outputs/ excluded from git
88
+ ```
89
+
90
+ ### 2. Fill out config.yaml
91
+
92
+ The config walks you through the reasoning in order. Here's a complete example for a resume tailoring assistant:
93
+
94
+ ```yaml
95
+ # evals/config.yaml
96
+ schema_version: 1
97
+
98
+ system:
99
+ name: Resume tailoring assistant
100
+ domain: >
101
+ English-language resumes tailored to job descriptions.
102
+ Input: plain-text base resume + job description.
103
+ Output: Markdown resume tailored to the specific role.
104
+
105
+ use_cases:
106
+ - id: tailor_resume
107
+ description: >
108
+ User submits a base resume and job description.
109
+ System returns a Markdown resume tailored to the role.
110
+
111
+ evals:
112
+
113
+ # RIGHT — correctness evals
114
+ # Failure → grounding or reasoning problem in your system
115
+
116
+ - id: no_fabrication
117
+ tag: right
118
+ type: llm
119
+ description: Output does not invent facts not present in the source
120
+ pass_criteria: >
121
+ Every company name, date, metric, and credential in the output
122
+ can be traced to the source material. Minor rephrasing is fine.
123
+ fail_criteria: >
124
+ The output contains a company, date, metric, or credential that
125
+ does not appear in the source material.
126
+
127
+ - id: contact_preserved
128
+ tag: right
129
+ type: rule
130
+ description: Name and email in output match the base resume
131
+
132
+ # GOOD — quality evals
133
+ # Failure → prompt engineering or format problem; iterate instructions
134
+
135
+ - id: format_compliance
136
+ tag: good
137
+ type: rule
138
+ description: Output follows required Markdown structure
139
+
140
+ - id: bullet_quality
141
+ tag: good
142
+ type: llm
143
+ description: Bullets are specific, quantified, and free of filler language
144
+ pass_criteria: >
145
+ Bullets begin with action verbs, are specific, include quantified
146
+ results where the source provides data, and contain no filler phrases
147
+ (responsible for, helped with, worked on).
148
+ fail_criteria: >
149
+ Bullets are vague, omit available quantification, or use filler phrases.
150
+
151
+ # SAFE — guardrail evals
152
+ # Failure → architectural problem; structural fix, not prompt iteration
153
+
154
+ - id: no_preamble
155
+ tag: safe
156
+ type: regex
157
+ description: Output starts with the resume, not commentary
158
+ pattern: "^# "
159
+ match: true
160
+
161
+ - id: no_horizontal_rules
162
+ tag: safe
163
+ type: regex
164
+ description: No --- in output (forbidden by format spec)
165
+ pattern: "(?m)^---$"
166
+ match: false
167
+
168
+ fixtures:
169
+ directory: fixtures/
170
+ sets:
171
+ smoke:
172
+ # A few fixtures covering each eval type.
173
+ # Run after any prompt change for fast signal.
174
+ - experienced-swe__senior-swe
175
+ - recent-grad__data-scientist
176
+ - marketing-manager__product-manager
177
+ regression:
178
+ # Golden fixtures only — deterministic reference + rule + regex evals.
179
+ # No LLM judge cost. Use this in CI on every PR.
180
+ - experienced-swe__senior-swe
181
+ - recent-grad__senior-swe
182
+ full: all # everything — run before releases
183
+ runs: 3 # how many times to run each fixture
184
+
185
+ defaults:
186
+ provider: anthropic
187
+ model: claude-haiku-3-5-20251001 # judge model — NOT your system's model
188
+ runs: 3
189
+ ```
190
+
191
+ **Sets** are just named lists of fixture IDs you define. Use whatever names make sense. `all` is a special keyword meaning every fixture in the directory.
192
+
193
+ ### 3. Add fixtures
194
+
195
+ A fixture is a YAML file in `evals/fixtures/` describing one test case. The filename is the fixture ID.
196
+
197
+ **`evals/fixtures/experienced-swe__senior-swe.yaml`:**
198
+
199
+ ```yaml
200
+ id: experienced-swe__senior-swe
201
+ description: >
202
+ Experienced SWE applying to a senior SWE role — ideal match.
203
+ Baseline fixture; should score well across all evals.
204
+
205
+ inputs:
206
+ resume: fixtures/resumes/experienced-swe.txt
207
+ job: fixtures/jobs/senior-swe.txt
208
+ is_recent_grad: false
209
+ expected_name: "Alex Rivera"
210
+ expected_email: "alex.rivera@email.com"
211
+
212
+ # The expected block makes this a "golden" fixture.
213
+ # These are deterministic string checks — no API cost.
214
+ # Base them on actual outputs you've reviewed and accepted.
215
+ expected:
216
+ contains:
217
+ - "alex.rivera@email.com"
218
+ - "Stripe"
219
+ - "## EXPERIENCE"
220
+ - "## EDUCATION"
221
+ not_contains:
222
+ - "responsible for"
223
+ - "helped with"
224
+ - "---"
225
+ ```
226
+
227
+ A fixture without an `expected` block is a **variation fixture** — only rule, regex, and LLM evals run on it. Use variations when you don't have reviewed expected output yet. Add them to `golden/` once you've reviewed outputs and written the `expected` block.
228
+
229
+ The `inputs` block is yours to define. Whatever your runner needs — file paths, flags, metadata — put it here. Your runner reads `inputs` directly.
230
+
231
+ ### 4. Write your runner
232
+
233
+ The runner is a script you write (~30 lines). It calls your system and writes outputs to `evals/outputs/[fixture-id]/run-N.txt`. fieldtest only reads those files — it never calls your system directly.
234
+
235
+ **`evals/runner.py`:**
236
+
237
+ ```python
238
+ import os
239
+ import pathlib
240
+ import sys
241
+ import yaml
242
+ import anthropic
243
+
244
+ SYSTEM_PROMPT = "You are a resume tailoring assistant..."
245
+ MODEL = "claude-sonnet-4-20250514"
246
+
247
+ def tailor_resume(resume_text, job_text):
248
+ client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
249
+ message = client.messages.create(
250
+ model=MODEL,
251
+ max_tokens=4096,
252
+ system=SYSTEM_PROMPT,
253
+         messages=[{"role": "user", "content": f"Resume:\n{resume_text}\n\nJob description:\n{job_text}"}],
254
+ )
255
+ return message.content[0].text
256
+
257
+ def main():
258
+ config = yaml.safe_load(pathlib.Path("evals/config.yaml").read_text())
259
+ set_name = sys.argv[1] if len(sys.argv) > 1 else "full"
260
+ base_dir = pathlib.Path("evals")
261
+ runs = config["defaults"]["runs"]
262
+
263
+     fixture_ids = config["fixtures"]["sets"][set_name]
264
+ if fixture_ids == "all":
265
+ fixture_ids = [p.stem for p in sorted((base_dir / "fixtures").rglob("*.yaml"))]
266
+
267
+ for fixture_id in fixture_ids:
268
+ fixture = yaml.safe_load((base_dir / "fixtures" / f"{fixture_id}.yaml").read_text())
269
+ inputs = fixture["inputs"]
270
+
271
+ resume_text = (base_dir / inputs["resume"]).read_text()
272
+ job_text = (base_dir / inputs["job"]).read_text()
273
+
274
+ out_dir = base_dir / "outputs" / fixture_id
275
+ out_dir.mkdir(parents=True, exist_ok=True)
276
+
277
+ for run in range(1, runs + 1):
278
+ print(f" {fixture_id} run {run}/{runs}...", end=" ", flush=True)
279
+ output = tailor_resume(resume_text, job_text)
280
+ (out_dir / f"run-{run}.txt").write_text(output)
281
+ print("✓")
282
+
283
+ if __name__ == "__main__":
284
+ main()
285
+ ```
286
+
287
+ Run it for a specific set:
288
+
289
+ ```bash
290
+ python3 evals/runner.py smoke # run only the smoke set
291
+ python3 evals/runner.py full # run everything
292
+ ```
293
+
294
+ ### 5. Score
295
+
296
+ ```bash
297
+ fieldtest score
298
+ ```
299
+
300
+ Output:
301
+
302
+ ```
303
+ scoring tailor_resume: 3 fixtures × 3 runs = 9 evaluations per eval
304
+ ✓ results written to evals/results/2026-03-24T14-30-00-a3f9
305
+ ```
306
+
307
+ Four files are written to `evals/results/`:
308
+
309
+ ```
310
+ 2026-03-24T14-30-00-a3f9-data.json full result data, machine-readable
311
+ 2026-03-24T14-30-00-a3f9-data.csv flat rows, one per fixture × eval × run
312
+ 2026-03-24T14-30-00-a3f9-report.md human report
313
+ 2026-03-24T14-30-00-a3f9-report.csv spreadsheet report
314
+ ```
315
+
316
+ The `-report.md` looks like:
317
+
318
+ ```
319
+ # Eval Report
320
+ 2026-03-24 14:30 | set: full | 3 fixtures × 3 runs = 9 evaluations per eval
321
+
322
+ ---
323
+
324
+ ## tailor_resume
325
+
326
+ ### Tag Health
327
+ | tag | pass rate | passed / total |
328
+ |-------|-----------|----------------|
329
+ | RIGHT | 100% | 18 / 18 |
330
+ | GOOD  | 89%       | 16 / 18        |
331
+ | SAFE  | 100%      | 18 / 18        |
332
+
333
+ ### RIGHT
334
+ | eval | failure rate | errors | vs prior |
335
+ |-------------------|-------------|--------|---------|
336
+ | no_fabrication | 0% | 0 | ↔ |
337
+ | contact_preserved | 0% | 0 | ↔ |
338
+
339
+ ### GOOD
340
+ | eval | failure rate | errors | vs prior |
341
+ |-------------------|-------------|--------|---------|
342
+ | format_compliance | 0% | 0 | ↔ |
343
+ | bullet_quality    | 22%         | 0      | +3%     |
344
+
345
+ ### SAFE
346
+ | eval | failure rate | errors | vs prior |
347
+ |---------------------|-------------|--------|---------|
348
+ | no_preamble | 0% | 0 | ↔ |
349
+ | no_horizontal_rules | 0% | 0 | ↔ |
350
+
351
+ ### Fixture × Eval Matrix
352
+ | fixture | no_fabrication | contact_preserved | format_compliance | bullet_quality | no_preamble | no_horizontal_rules |
353
+ | --- | --- | --- | --- | --- | --- | --- |
354
+ | experienced-swe__senior-swe | 3/3 | 3/3 | 3/3 | 3/3 | 3/3 | 3/3 |
355
+ | recent-grad__data-scientist | 3/3 | 3/3 | 3/3 | 2/3 | 3/3 | 3/3 |
356
+ | marketing-manager__product-manager | 3/3 | 3/3 | 3/3 | 2/3 | 3/3 | 3/3 |
357
+
358
+ ### Failure Details
359
+
360
+ **bullet_quality**
361
+ - `recent-grad__data-scientist` run 2: Bullets omit available quantification from source
362
+ - `marketing-manager__product-manager` run 1: "Responsible for managing" — filler phrase present
363
+ ```
364
+
365
+ **The tool reports distributions. You decide what's a regression.** `bullet_quality` failing on 2 of 9 runs might be acceptable or might need a prompt fix — you know your system's risk tolerance; the tool doesn't.
366
+
367
+ ---
368
+
369
+ ## CLI Reference
370
+
371
+ ### `fieldtest validate`
372
+
373
+ Check that your config is valid before running anything.
374
+
375
+ ```bash
376
+ fieldtest validate
377
+ fieldtest validate --config path/to/config.yaml
378
+ ```
379
+
380
+ ```
381
+ ✓ config valid — 1 use case, 6 evals, 8 fixtures
382
+ ```
383
+
384
+ On error:
385
+
386
+ ```
387
+ Error: eval 'no_fabrication' (type: llm) missing required field: pass_criteria
388
+ ```
389
+
390
+ ---
391
+
392
+ ### `fieldtest score`
393
+
394
+ Score all fixtures in the `full` set (the default).
395
+
396
+ ```bash
397
+ fieldtest score
398
+ fieldtest score --set smoke # fast subset
399
+ fieldtest score --set regression # golden fixtures only
400
+ fieldtest score --config path/to/config.yaml
401
+ ```
402
+
403
+ **Sets** are defined in your config under `fixtures.sets`. There's nothing special about the names `smoke`, `regression`, or `full` — use whatever names fit your workflow. The only special value is `all`, which means every fixture in the directory.
404
+
405
+ ```yaml
406
+ fixtures:
407
+ sets:
408
+ smoke: [fixture-a, fixture-b] # named list of fixture IDs
409
+ regression: golden/* # all fixtures in a subdirectory
410
+ full: all # every fixture in fixtures/
411
+ ```
412
+
413
+ **Golden fixtures** are just fixtures with an `expected` block. The `regression` set conventionally contains these — but "golden" and "regression" are just conventions, not enforced by the tool. What makes a fixture golden is whether it has `expected.contains` or `expected.not_contains` entries, not which set it's in.
414
+
415
+ ---
416
+
417
+ ### `fieldtest score --allow-partial`
418
+
419
+ By default, `fieldtest score` exits with an error if any expected output file is missing. Use `--allow-partial` to skip missing outputs and continue scoring what exists.
420
+
421
+ ```bash
422
+ fieldtest score --allow-partial
423
+ ```
424
+
425
+ ```
426
+ ⚠ partial results: recent-grad__data-scientist run 2, recent-grad__data-scientist run 3 not found — excluded from rates
427
+ scoring tailor_resume: 2 fixtures × 3 runs (PARTIAL — 2 outputs missing, skipped)
428
+ ✓ results written to evals/results/2026-03-24T14-30-00-a3f9
429
+ ```
430
+
431
+ Skipped runs are excluded from failure rates — they don't count as passes or failures. The report header flags the run as partial so you know the rates are based on incomplete data. All available outputs are still scored normally.
432
+
433
+ Use this when you're iterating on evals and don't have a complete runner output yet, or when a runner run partially failed.
434
+
435
+ ---
436
+
437
+ ### `fieldtest score --concurrency 1`
438
+
439
+ By default fieldtest dispatches judge calls in parallel (5 threads) and prints the full report only at the end. `--concurrency 1` runs judges sequentially and prints each result as it completes — useful when debugging a judge error.
440
+
441
+ ```bash
442
+ fieldtest score --concurrency 1
443
+ ```
444
+
445
+ ```
446
+ no_fabrication experienced-swe__senior-swe run 1 ✓ pass
447
+ no_fabrication experienced-swe__senior-swe run 2 ✓ pass
448
+ no_fabrication experienced-swe__senior-swe run 3 ✓ pass
449
+ contact_preserved experienced-swe__senior-swe run 1 ✓ pass
450
+ bullet_quality recent-grad__data-scientist run 1 ✗ fail
451
+ bullet_quality recent-grad__data-scientist run 2 ✓ pass
452
+ no_fabrication marketing-manager__product-manager run 1 ⚠ error
453
+ ...
454
+ ```
455
+
456
+ When a judge is erroring (API failure, malformed response), `--concurrency 1` shows you exactly which fixture and run is triggering it. With parallel execution the errors surface only in the final report, mixed with everything else.
457
+
458
+ ---
459
+
460
+ ### `fieldtest history`
461
+
462
+ List all past runs, newest first, with tag-level failure rates.
463
+
464
+ ```bash
465
+ fieldtest history
466
+ ```
467
+
468
+ ```
469
+ RUN ID TIMESTAMP SET FIXTURES RIGHT GOOD SAFE
470
+ 2026-03-24T14-30-00-a3f9 2026-03-24 14:30 full 11 0% 9% 0%
471
+ 2026-03-24T11-31-00-da96 2026-03-24 11:31 full 11 0% 18% 0%
472
+ 2026-03-23T18-52-00-79fb 2026-03-23 18:52 smoke 6 0% 12% 0%
473
+ ```
474
+
475
+ The failure rates shown are averages across all evals with that tag. Use this to spot when a set of changes improved or hurt a whole category. Open the `-report.md` for the specific run to see which evals moved.
476
+
477
+ ---
478
+
479
+ ### `fieldtest diff`
480
+
481
+ Compare two runs. Default: most recent vs prior (same set).
482
+
483
+ ```bash
484
+ fieldtest diff # most recent vs prior
485
+ fieldtest diff 2026-03-24T14-30-00-a3f9 # specific run vs its prior
486
+ fieldtest diff 2026-03-24T14-30-00-a3f9 \
487
+ --baseline 2026-03-23T18-52-00-79fb # explicit comparison
488
+ ```
489
+
490
+ ```
491
+ Comparing: 2026-03-24T14-30-00-a3f9
492
+ Baseline: 2026-03-23T18-52-00-79fb
493
+
494
+ Increased:
495
+   bullet_quality: 0.090 → 0.180 (+0.090)
496
+
497
+ Decreased:
498
+ education_placement: 0.240 → 0.180 (-0.060)
499
+
500
+ Unchanged: no_fabrication, contact_preserved, format_compliance, no_preamble, no_horizontal_rules
501
+ ```
502
+
503
+ Deltas use neutral language — "increased" means the failure rate went up, "decreased" means it went down. You decide if a change is a regression. A decrease in `education_placement` failure rate after a prompt fix is expected. An increase in `no_fabrication` is always worth investigating.
504
+
505
+ ---
506
+
507
+ ### `fieldtest clean`
508
+
509
+ Remove accumulated run artifacts.
510
+
511
+ ```bash
512
+ # Interactive — shows what would be removed, asks to confirm
513
+ fieldtest clean
514
+
515
+ # Clear outputs/ (your runner's generated files)
516
+ fieldtest clean --outputs
517
+
518
+ # Prune old results, keeping the 10 most recent
519
+ fieldtest clean --results --keep 10
520
+
521
+ # Both
522
+ fieldtest clean --outputs --results --keep 5
523
+ ```
524
+
525
+ Interactive mode:
526
+
527
+ ```
528
+ Would remove:
529
+ outputs/: 33 run files
530
+ results/: 8 old result sets (keeping 20)
531
+ Proceed? [y/N]:
532
+ ```
533
+
534
+ Only what's listed in the prompt gets removed. If only results need pruning, outputs are untouched.
535
+
536
+ `--keep` defaults to 20. Each result set is 4 files (`-data.json`, `-data.csv`, `-report.md`, `-report.csv`); all four are removed together when pruning.
537
+
538
+ ---
539
+
540
+ ### `fieldtest init`
541
+
542
+ Scaffold the eval directory structure in your project. Safe to run in an existing project — won't overwrite files unless you pass `--force`.
543
+
544
+ ```bash
545
+ fieldtest init # creates evals/ in current directory
546
+ fieldtest init --dir ci/evals # custom location
547
+ fieldtest init --force # overwrite existing files
548
+ ```
549
+
550
+ ```
551
+ ✓ Scaffolded eval structure at evals/
552
+ evals/config.yaml — fill this out first
553
+ evals/fixtures/golden/ — fixtures with expected outputs
554
+ evals/fixtures/variations/ — fixtures without expected outputs
555
+ evals/.gitignore — outputs/ excluded from git
556
+
557
+ Next steps:
558
+ 1. Edit evals/config.yaml
559
+ 2. Add fixtures to evals/fixtures/
560
+ 3. Run your system → write outputs to evals/outputs/
561
+ 4. fieldtest score
562
+ ```
563
+
564
+ ---
565
+
566
+ ## Right / Good / Safe
567
+
568
+ Every eval requires a `tag`. The tag is the diagnostic path when something fails.
569
+
570
+ | tag | what it means | failure → |
571
+ |-----|--------------|-----------|
572
+ | `right` | correctness — did the system do the correct thing? | grounding, retrieval, or reasoning fix |
573
+ | `good` | quality — did the system do it well? | prompt engineering or format fix |
574
+ | `safe` | guardrails — did the system violate a hard constraint? | architectural fix, not prompt iteration |
575
+
576
+ A single quality score hides which category failed. `right` and `safe` failures have completely different fixes — one is a reasoning problem, one is a structural problem. Tagging forces you to classify before you measure.
577
+
578
+ ---
579
+
580
+ ## Eval types
581
+
582
+ | type | when to use | example |
583
+ |------|-------------|---------|
584
+ | `rule` | deterministic Python logic; can read fixture `inputs` | contact info check, section ordering |
585
+ | `regex` | pattern matching; `match: true` = must match, `match: false` = must not match | forbidden strings, required format |
586
+ | `llm` | semantic judgment that requires reading the output | fabrication, quality, keyword alignment |
587
+ | `reference` | compare against `expected` block in fixture file | golden output regression check |
588
+
589
+ Writing rules:
590
+
591
+ ```python
592
+ # evals/rules.py
593
+ from fieldtest import rule
594
+
595
+ @rule("contact_preserved")
596
+ def check_contact(output: str, inputs: dict) -> dict:
597
+ name = inputs.get("expected_name", "")
598
+ email = inputs.get("expected_email", "")
599
+ header = "\n".join(output.splitlines()[:3])
600
+ if name and name not in header:
601
+ return {"passed": False, "detail": f"'{name}' not in first 3 lines"}
602
+ if email and email not in header:
603
+ return {"passed": False, "detail": f"'{email}' not in first 3 lines"}
604
+ return {"passed": True, "detail": "name and email present"}
605
+ ```
606
+
607
+ ---
608
+
609
+ ## Two LLMs, two purposes
610
+
611
+ Your runner calls **your system**. `fieldtest score` calls its own **judge LLM**. Completely separate — different models, different credentials, different purposes.
612
+
613
+ ```
614
+ YOUR SYSTEM (runner) JUDGE (fieldtest score)
615
+ ────────────────────────────── ──────────────────────────────────
616
+ calls your model or pipeline calls a judge LLM to score outputs
617
+ configured by: your runner code configured by: defaults.model in config.yaml
618
+ auth: your credentials auth: ANTHROPIC_API_KEY in environment
619
+ ```
620
+
621
+ `defaults.model` in config is the judge model. Set it independently of whatever your system uses.
622
+
623
+ ---
624
+
625
+ ## Results files
626
+
627
+ Four files per run, named `[run-id]-data.*` or `[run-id]-report.*`:
628
+
629
+ | file | what it is |
630
+ |------|-----------|
631
+ | `[run-id]-data.json` | Full result data — rows, summary, delta. Machine-readable, CI-parseable. |
632
+ | `[run-id]-data.csv` | Flat rows, one per fixture × eval × run. Analyst-ready. |
633
+ | `[run-id]-report.md` | Human report — tag health, per-eval tables, fixture × eval matrix, failure details. |
634
+ | `[run-id]-report.csv` | Spreadsheet report — same three views, designed to open in Excel or Numbers. |
635
+
636
+ CI gating: `fieldtest score` exits 0 on success, 1 on error. It does not exit non-zero on high failure rates — the tool measures; you judge. To gate CI on specific failure rates, parse the `-data.json`:
637
+
638
+ ```bash
639
+ python3 -c "
640
+ import json, glob, sys
641
+ f = sorted(glob.glob('evals/results/*-data.json'))[-1]
642
+ rows = json.load(open(f))['rows']
643
+ failures = [r for r in rows if r['eval_id'] == 'no_fabrication' and r.get('passed') is False]
644
+ if failures:
645
+ print(f'no_fabrication failed on {len(failures)} runs')
646
+ sys.exit(1)
647
+ "
648
+ ```
649
+
650
+ ---
651
+
652
+ ## Examples and patterns
653
+
654
+ - `examples/runner_anthropic.py` — complete runner calling Claude directly
655
+ - `examples/runner_openai.py` — complete runner calling OpenAI
656
+ - `examples/runner_subprocess.py` — complete runner calling any CLI tool
657
+ - `examples/runner-patterns.md` — sets, CI integration, scheduling, multiple runners, production traffic sampling
658
+ - `examples/eval-patterns.md` — eval design cookbook: refusals, format compliance, forbidden content, conditional behavior, classification, and more
659
+
660
+ ---
661
+
662
+ *The practice is the point. The tool makes the practice tractable.*