fieldtest-0.1.0-py3-none-any.whl

This diff shows the contents of a publicly released package version as published to its registry. It is provided for informational purposes only.
fieldtest/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """fieldtest — structured AI eval practice for any project."""
+
+ from fieldtest.judges.registry import rule
+
+ __all__ = ["rule"]
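Note: the `rule` export above is the decorator projects use to register rule-based judges; the CLI's validate command looks for `@rule('<eval_id>')` functions in evals/rules.py. A minimal sketch of such a file, using a hypothetical eval id and an assumed string-in/bool-out signature (the decorator's exact contract is not shown in this diff):

    # evals/rules.py (hypothetical example)
    from fieldtest import rule

    @rule("no_empty_output")
    def no_empty_output(output: str) -> bool:
        # Assumed contract: return True when the output passes the check.
        return bool(output.strip())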
fieldtest/cli.py ADDED
@@ -0,0 +1,476 @@
+ """
+ fieldtest/cli.py
+
+ Click entry point. All CLI commands: validate, score, history, diff, clean, init.
+ """
+ from __future__ import annotations
+
+ import json
+ import math
+ import sys
+ import traceback
+ from pathlib import Path
+ from typing import Optional
+
+ import click
+
+ from fieldtest.errors import FieldtestError
+
+
+ def _handle_error(e: Exception) -> None:
+     """Print error to stderr and exit 1. Unexpected errors show traceback + bug URL."""
+     if isinstance(e, FieldtestError):
+         click.echo(str(e), err=True)
+         sys.exit(1)
+     else:
+         click.echo(traceback.format_exc(), err=True)
+         click.echo(
+             "Please file a bug at https://github.com/galenmittermann/fieldtest/issues",
+             err=True,
+         )
+         sys.exit(1)
+
+
+ def _load_config(config_path: Path):
+     """Load and validate config. Calls sys.exit(1) on error."""
+     from fieldtest.config import parse_and_validate
+     try:
+         return parse_and_validate(config_path)
+     except Exception as e:
+         _handle_error(e)
+
+
+ def _default_config_path() -> Path:
+     return Path("evals/config.yaml")
+
+
+ @click.group()
+ def main():
+     """fieldtest — structured AI eval practice for any project."""
+     pass
+
+
+ # ---------------------------------------------------------------------------
+ # validate
+ # ---------------------------------------------------------------------------
+
+ @main.command()
+ @click.option("--config", "config_path", default=None, type=click.Path(),
+               help="Path to config.yaml (default: evals/config.yaml)")
+ def validate(config_path: Optional[str]):
+     """Check config.yaml is valid. Does not run anything."""
+     path = Path(config_path) if config_path else _default_config_path()
+     config = _load_config(path)
+
+     base_dir = path.resolve().parent
+     fixture_dir = base_dir / "fixtures"
+
+     # Coverage summary
+     total_evals = sum(len(uc.evals) for uc in config.use_cases)
+     tag_counts = {"right": 0, "good": 0, "safe": 0}
+     fixture_count = 0
+     warnings = []
+
+     for uc in config.use_cases:
+         for ev in uc.evals:
+             tag_counts[ev.tag] = tag_counts.get(ev.tag, 0) + 1
+
+             # Warn: variation fixtures paired with reference evals
+             if ev.type == "reference":
+                 for set_val in uc.fixtures.sets.values():
+                     if isinstance(set_val, str) and "variations" in set_val:
+                         warnings.append(
+                             f" ⚠ use_case '{uc.id}', eval '{ev.id}': "
+                             f"reference eval paired with variations set — will always skip"
+                         )
+
+             # Warn: rule evals with no registered function
+             if ev.type == "rule":
+                 from fieldtest.judges.registry import get_rule
+                 rules_path = base_dir / "rules.py"
+                 if rules_path.exists():
+                     from fieldtest.judges.registry import load_rules
+                     # Best-effort load so get_rule can see functions
+                     # registered in evals/rules.py.
+                     try:
+                         load_rules(rules_path)
+                     except Exception:
+                         pass
+                 if get_rule(ev.id) is None:
+                     warnings.append(
+                         f" ⚠ use_case '{uc.id}', eval '{ev.id}': "
+                         f"type:rule but no @rule('{ev.id}') registered in evals/rules.py"
+                     )
+
+         # Count fixtures referenced in sets
+         for set_val in uc.fixtures.sets.values():
+             if isinstance(set_val, list):
+                 fixture_count += len(set_val)
+                 # Warn: fixtures referenced but not on disk
+                 for fid in set_val:
+                     fixture_file = base_dir / uc.fixtures.directory / f"{fid}.yaml"
+                     if not fixture_file.exists():
+                         warnings.append(
+                             f" ⚠ fixture '{fid}' referenced in '{uc.id}' "
+                             f"but not found at {fixture_file}"
+                         )
+
+     click.echo(f"✓ config valid: {path}")
+     click.echo(f" {len(config.use_cases)} use case(s), {total_evals} eval(s)")
+     click.echo(
+         f" by tag — right: {tag_counts['right']}, "
+         f"good: {tag_counts['good']}, safe: {tag_counts['safe']}"
+     )
+     click.echo(f" {fixture_count} explicitly listed fixture(s)")
+
+     if warnings:
+         click.echo("")
+         for w in warnings:
+             click.echo(w)
+
+
+ # ---------------------------------------------------------------------------
+ # score
+ # ---------------------------------------------------------------------------
+
+ @main.command()
+ @click.argument("set_name", default="full", metavar="[SET]")
+ @click.option("--set", "set_name_opt", default=None, help="Fixture set to score")
+ @click.option("--config", "config_path", default=None, type=click.Path(),
+               help="Path to config.yaml (default: evals/config.yaml)")
+ @click.option("--baseline", "baseline_path", default=None, type=click.Path(),
+               help="Path to baseline results JSON for delta")
+ @click.option("--allow-partial", is_flag=True, default=False,
+               help="Warn and skip missing outputs instead of failing")
+ @click.option("--concurrency", default=5, type=int,
+               help="Max parallel judge calls (default: 5; 1 = sequential with per-judge output)")
+ def score(
+     set_name: str,
+     set_name_opt: Optional[str],
+     config_path: Optional[str],
+     baseline_path: Optional[str],
+     allow_partial: bool,
+     concurrency: int,
+ ):
+     """Score outputs for a given fixture set."""
+     # --set flag wins over positional if both provided
+     effective_set = set_name_opt or set_name
+
+     path = Path(config_path) if config_path else _default_config_path()
+     config = _load_config(path)
+
+     # Load rules
+     from fieldtest.judges.registry import load_rules
+     rules_path = path.resolve().parent / "rules.py"
+     try:
+         load_rules(rules_path)
+     except Exception as e:
+         _handle_error(e)
+
+     baseline = Path(baseline_path) if baseline_path else None
+
+     # verbose = per-judge output; only useful when sequential (concurrency 1)
+     verbose = concurrency == 1
+
+     from fieldtest.runner import score as _score
+     try:
+         run_id, rows = _score(
+             config=config,
+             config_path=path,
+             set_name=effective_set,
+             baseline_path=baseline,
+             allow_partial=allow_partial,
+             concurrency=concurrency,
+             verbose=verbose,
+         )
+     except Exception as e:
+         _handle_error(e)
+
+     # Print report to terminal
+     results_dir = path.resolve().parent / "results"
+     md_path = results_dir / f"{run_id}-report.md"
+     if md_path.exists():
+         click.echo(md_path.read_text())
+     click.echo(f"\nResults written to: {results_dir / run_id}")
+
+
+ # ---------------------------------------------------------------------------
+ # history
+ # ---------------------------------------------------------------------------
+
+ @main.command()
+ @click.option("--config", "config_path", default=None, type=click.Path(),
+               help="Path to config.yaml (default: evals/config.yaml)")
+ def history(config_path: Optional[str]):
+     """List past result files, newest first."""
+     path = Path(config_path) if config_path else _default_config_path()
+     base_dir = path.resolve().parent
+     results_dir = base_dir / "results"
+
+     if not results_dir.exists():
+         click.echo(
+             f"No results found at {results_dir}.\n"
+             f" Run 'fieldtest score' to generate results, or\n"
+             f" 'fieldtest init' if you haven't set up a project yet."
+         )
+         return
+
+     result_files = sorted(results_dir.glob("*-data.json"), reverse=True)
+     if not result_files:
+         click.echo(
+             f"No results found at {results_dir}.\n"
+             f" Run 'fieldtest score' to generate results."
+         )
+         return
+
+     # Header
+     header = (
+         f"{'RUN ID':<26} {'TIMESTAMP':<18} {'SET':<12} "
+         f"{'FIXTURES':<10} {'RIGHT':<8} {'GOOD':<8} {'SAFE':<8}"
+     )
+     click.echo(header)
+
+     for p in result_files:
+         try:
+             data = json.loads(p.read_text())
+         except Exception:
+             continue
+
+         run_id = data.get("run_id", p.stem)
+         set_name = data.get("set", "—")
+         fixture_count = data.get("fixture_count", 0)
+         summary = data.get("summary", {})
+
+         # Parse timestamp from run_id: 2026-03-22T14-30-00-a3f9
+         try:
+             ts_part = run_id[:19].replace("T", " ").replace("-", ":")
+             # format: 2026:03:22 14:30:00 → fix date separators
+             date_part, time_part = ts_part.split(" ")
+             date_str = date_part.replace(":", "-")
+             ts_display = f"{date_str} {time_part[:5]}"
+         except Exception:
+             ts_display = "—"
+
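+         # Average failure rate across every use case for one tag
+         # ("right" / "good" / "safe"); math.ceil keeps small nonzero
+         # rates from rendering as 0%.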
+         def _tag_rate(tag: str) -> str:
+             rates = []
+             for uc_stats in summary.values():
+                 for ev_id, stats in uc_stats.get(tag, {}).items():
+                     fr = stats.get("failure_rate")
+                     if fr is not None:
+                         rates.append(fr)
+             if not rates:
+                 return "—"
+             avg = sum(rates) / len(rates)
+             return f"{math.ceil(avg * 100)}%"
+
+         right = _tag_rate("right")
+         good = _tag_rate("good")
+         safe = _tag_rate("safe")
+
+         click.echo(
+             f"{run_id:<26} {ts_display:<18} {set_name:<12} "
+             f"{fixture_count:<10} {right:<8} {good:<8} {safe:<8}"
+         )
+
+
+ # ---------------------------------------------------------------------------
+ # diff
+ # ---------------------------------------------------------------------------
+
+ @main.command()
+ @click.argument("run_id", default=None, required=False)
+ @click.option("--baseline", "baseline_id", default=None,
+               help="Baseline run ID to compare against")
+ @click.option("--config", "config_path", default=None, type=click.Path(),
+               help="Path to config.yaml (default: evals/config.yaml)")
+ def diff(run_id: Optional[str], baseline_id: Optional[str], config_path: Optional[str]):
+     """Compare two runs — default: most recent vs prior."""
+     path = Path(config_path) if config_path else _default_config_path()
+     base_dir = path.resolve().parent
+     results_dir = base_dir / "results"
+
+     if not results_dir.exists():
+         click.echo(
+             f"No results found at {results_dir}.\n"
+             f" Run 'fieldtest score' to generate results."
+         )
+         return
+
+     result_files = sorted(results_dir.glob("*-data.json"), reverse=True)
+     if not result_files:
+         click.echo(
+             f"No results found at {results_dir}.\n"
+             f" Run 'fieldtest score' to generate results."
+         )
+         return
+
+     # Resolve current and baseline
+     if run_id:
+         current_path = results_dir / f"{run_id}-data.json"
+     else:
+         current_path = result_files[0]
+
+     if baseline_id:
+         baseline_path = results_dir / f"{baseline_id}-data.json"
+     else:
+         # most recent that isn't current
+         others = [f for f in result_files if f != current_path]
+         baseline_path = others[0] if others else None
+
+     if not current_path.exists():
+         click.echo(f"Run not found: {current_path}", err=True)
+         sys.exit(1)
+
+     if baseline_id and not baseline_path.exists():
+         click.echo(f"Baseline run not found: {baseline_path}", err=True)
+         sys.exit(1)
+
+     current_data = json.loads(current_path.read_text())
+
+     if baseline_path is not None and baseline_path.exists():
+         # Recompute the delta against the resolved baseline by comparing
+         # per-eval failure rates (the same per-eval stat 'history' reads).
+         baseline_data = json.loads(baseline_path.read_text())
+
+         def _failure_rates(data: dict) -> dict:
+             rates: dict = {}
+             for uc_stats in data.get("summary", {}).values():
+                 for tag_stats in uc_stats.values():
+                     for ev_id, stats in tag_stats.items():
+                         fr = stats.get("failure_rate")
+                         if fr is not None:
+                             rates[ev_id] = fr
+             return rates
+
+         previous = _failure_rates(baseline_data)
+         current = _failure_rates(current_data)
+         delta = {"baseline_run_id": baseline_data.get("run_id", "—"),
+                  "increased": [], "decreased": [], "unchanged": []}
+         for ev_id in sorted(set(previous) & set(current)):
+             change = current[ev_id] - previous[ev_id]
+             if change == 0:
+                 delta["unchanged"].append(ev_id)
+                 continue
+             bucket = "increased" if change > 0 else "decreased"
+             delta[bucket].append({"eval_id": ev_id, "previous": previous[ev_id],
+                                   "current": current[ev_id], "delta": change})
+     else:
+         # No baseline on disk; fall back to the delta stored at score time.
+         delta = current_data.get("delta", {})
+
+     click.echo(f"Comparing: {current_path.stem.removesuffix('-data')}")
+     click.echo(f"Baseline: {delta.get('baseline_run_id', '—')}")
+     click.echo("")
+
+     increased = delta.get("increased", [])
+     decreased = delta.get("decreased", [])
+     unchanged = delta.get("unchanged", [])
+
+     if increased:
+         click.echo("Increased:")
+         for item in increased:
+             click.echo(
+                 f" {item['eval_id']}: {item['previous']:.3f} → {item['current']:.3f} "
+                 f"({item['delta']:+.3f})"
+             )
+
+     if decreased:
+         click.echo("Decreased:")
+         for item in decreased:
+             click.echo(
+                 f" {item['eval_id']}: {item['previous']:.3f} → {item['current']:.3f} "
+                 f"({item['delta']:+.3f})"
+             )
+
+     if unchanged:
+         click.echo(f"Unchanged: {', '.join(unchanged)}")
+
+     if not increased and not decreased and not unchanged:
+         click.echo("No comparable evals found between runs.")
+
+
+ # ---------------------------------------------------------------------------
+ # clean
+ # ---------------------------------------------------------------------------
+
+ @main.command()
+ @click.option("--outputs", is_flag=True, default=False, help="Clear outputs/ directory")
+ @click.option("--results", is_flag=True, default=False,
+               help="Remove old result files (keeps most recent N)")
+ @click.option("--keep", default=20, type=int, help="Number of results to keep (default: 20)")
+ @click.option("--config", "config_path", default=None, type=click.Path(),
+               help="Path to config.yaml (default: evals/config.yaml)")
+ def clean(outputs: bool, results: bool, keep: int, config_path: Optional[str]):
+     """Clean up accumulated run artifacts."""
+     path = Path(config_path) if config_path else _default_config_path()
+     base_dir = path.resolve().parent
+     outputs_dir = base_dir / "outputs"
+     results_dir = base_dir / "results"
+
+     if not outputs and not results:
+         # Interactive mode — show only what actually needs cleaning,
+         # then set flags based on what was shown (not unconditionally).
+         to_remove = []
+         output_files: list = []
+         old_results: list = []
+
+         if outputs_dir.exists():
+             output_files = list(outputs_dir.rglob("*.txt"))
+             if output_files:
+                 to_remove.append(f" outputs/: {len(output_files)} run files")
+
+         if results_dir.exists():
+             result_files = sorted(results_dir.glob("*-data.json"), reverse=True)
+             old_results = result_files[keep:]
+             if old_results:
+                 to_remove.append(
+                     f" results/: {len(old_results)} old result sets (keeping {keep})"
+                 )
+
+         if not to_remove:
+             click.echo("Nothing to clean.")
+             return
+
+         click.echo("Would remove:")
+         for line in to_remove:
+             click.echo(line)
+         if click.confirm("Proceed?"):
+             # Only act on what was described in the prompt above
+             outputs = bool(output_files)
+             results = bool(old_results)
+         else:
+             click.echo("Cancelled.")
+             return
+
+     if outputs and outputs_dir.exists():
+         import shutil
+         shutil.rmtree(outputs_dir)
+         outputs_dir.mkdir(parents=True, exist_ok=True)
+         click.echo("✓ outputs/ cleared")
+
+     if results and results_dir.exists():
+         result_files = sorted(results_dir.glob("*-data.json"), reverse=True)
+         removed = 0
+         for p in result_files[keep:]:
+             run_id = p.stem.removesuffix("-data")
+             # Remove every artifact for this run (data JSON, report, etc.)
+             for fp in results_dir.glob(f"{run_id}-*"):
+                 fp.unlink()
+             removed += 1
+         click.echo(f"✓ results/ pruned — kept {min(keep, len(result_files))}, removed {removed}")
+
+
425
+
426
+ # ---------------------------------------------------------------------------
427
+ # init
428
+ # ---------------------------------------------------------------------------
429
+
430
+ @main.command("init")
431
+ @click.option("--dir", "target_dir", default="evals", show_default=True,
432
+ help="Directory to scaffold (default: ./evals)")
433
+ @click.option("--force", is_flag=True, default=False,
434
+ help="Overwrite if directory already exists")
435
+ def init_cmd(target_dir: str, force: bool):
436
+ """Scaffold evals/ directory structure in current project."""
437
+ from fieldtest.init_template import GITIGNORE_CONTENT, STARTER_CONFIG
438
+
439
+ evals_dir = Path(target_dir)
440
+
441
+ if evals_dir.exists() and not force:
442
+ click.echo(
443
+ f"Error: '{evals_dir}' already exists. Use --force to overwrite.",
444
+ err=True,
445
+ )
446
+ sys.exit(1)
447
+
448
+ # Create structure
449
+ (evals_dir / "fixtures" / "golden").mkdir(parents=True, exist_ok=True)
450
+ (evals_dir / "fixtures" / "variations").mkdir(parents=True, exist_ok=True)
451
+ (evals_dir / "outputs").mkdir(parents=True, exist_ok=True)
452
+ (evals_dir / "results").mkdir(parents=True, exist_ok=True)
453
+
454
+ config_path = evals_dir / "config.yaml"
455
+ if not config_path.exists() or force:
456
+ config_path.write_text(STARTER_CONFIG)
457
+
458
+ gitignore_path = evals_dir / ".gitignore"
459
+ if not gitignore_path.exists() or force:
460
+ gitignore_path.write_text(GITIGNORE_CONTENT)
461
+
462
+ click.echo(f"✓ Scaffolded eval structure at {evals_dir}/")
463
+ click.echo(f" {evals_dir}/config.yaml — fill this out first")
464
+ click.echo(f" {evals_dir}/fixtures/golden/ — fixtures with expected outputs")
465
+ click.echo(f" {evals_dir}/fixtures/variations/ — fixtures without expected outputs")
466
+ click.echo(f" {evals_dir}/.gitignore — outputs/ excluded from git")
467
+ click.echo("")
468
+ click.echo("Next steps:")
469
+ click.echo(f" 1. Edit {evals_dir}/config.yaml")
470
+ click.echo(f" 2. Add fixtures to {evals_dir}/fixtures/")
471
+ click.echo(f" 3. Run your system → write outputs to {evals_dir}/outputs/")
472
+ click.echo(f" 4. fieldtest score")
473
+
474
+
475
+ if __name__ == "__main__":
476
+ main()
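These commands can be exercised in-process with click's standard test runner; a minimal sketch (whether validate passes on a fresh scaffold depends on STARTER_CONFIG, which this diff does not include):

    from click.testing import CliRunner
    from fieldtest.cli import main

    runner = CliRunner()
    with runner.isolated_filesystem():
        # Scaffold a fresh evals/ tree, then check the starter config parses.
        result = runner.invoke(main, ["init"])
        assert result.exit_code == 0
        result = runner.invoke(main, ["validate"])
        print(result.output)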