proofrag 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
proofrag/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """proofrag: zero-config RAG/LLM evaluation — golden sets, LLM-as-judge, scorecards."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
try:
    # Ask the installed distribution metadata for the "proofrag" version.
    __version__ = version("proofrag")
except PackageNotFoundError:  # running from a source tree without install metadata
    # Placeholder so `__version__` is always defined, even without metadata.
    __version__ = "0+unknown"
proofrag/cli.py ADDED
@@ -0,0 +1,187 @@
1
+ """proofrag command-line interface.
2
+
3
+ proofrag generate --corpus DIR # docs -> goldenset.jsonl
4
+ proofrag evaluate --goldenset ... # +preds -> results.json (+ optional CI gate)
5
+ proofrag report --results ... # results -> scorecard.html
6
+ proofrag diff --baseline ... # compare vs a baseline; fail on regression
7
+ proofrag demo # canned scorecard, no API key
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import sys
14
+
15
+ from . import __version__
16
+ from .judge import JUDGE_DIMENSIONS
17
+
18
+
19
+ def _eprint(*a):
20
+ print(*a, file=sys.stderr)
21
+
22
+
23
def cmd_generate(args) -> int:
    """Synthesize a golden set from a corpus and write it as JSONL.

    Reads ``args.corpus``, ``args.chunk_chars``, ``args.n``, ``args.seed``,
    ``args.model`` and ``args.out``.

    Returns:
        0 on success; 2 when the corpus cannot be loaded or the LLM layer
        raises ``LLMError``.
    """
    from collections import Counter

    from .corpus import load_corpus
    from .goldenset import generate, write_jsonl
    from .llm import LLM, LLMError

    try:
        chunks = load_corpus(args.corpus, max_chars=args.chunk_chars)
    except (FileNotFoundError, ValueError) as e:
        # load_corpus raises these for a missing path or a corpus without any
        # readable text; report them like every other CLI error, not a traceback.
        _eprint(f"error: {e}")
        return 2
    _eprint(f"Loaded {len(chunks)} chunks from {args.corpus}")

    try:
        records = generate(chunks, n=args.n, seed=args.seed, llm=LLM(model=args.model))
    except LLMError as e:
        _eprint(f"error: {e}")
        return 2

    write_jsonl(records, args.out)
    # Per-difficulty tally for the summary line (keys keep first-seen order).
    tiers = Counter(r["difficulty"] for r in records)
    _eprint(f"Wrote {len(records)} golden cases -> {args.out} ({dict(tiers)})")
    return 0
41
+
42
+
43
def cmd_evaluate(args) -> int:
    """Judge predictions against a golden set, write results, optionally gate.

    Reads ``args.goldenset``, ``args.predictions``, ``args.semantic``,
    ``args.model``, ``args.k``, ``args.out`` and ``args.fail_under``.

    Returns:
        0 on success (and when the gate passes); 1 when ``--fail-under`` is set
        and the overall score is below it; 2 on unreadable/malformed input
        files, a failed embedding-matcher setup, or ``LLMError``.
    """
    from .goldenset import read_jsonl
    from .judge import evaluate, write_results
    from .llm import LLM, LLMError

    try:
        goldenset = read_jsonl(args.goldenset)
        predictions = read_jsonl(args.predictions)
    except (OSError, ValueError) as e:
        # Missing/unreadable file, or a malformed JSON line (JSONDecodeError is
        # a ValueError). Exit 2 so this is never confused with a gate failure (1).
        _eprint(f"error: {e}")
        return 2

    matcher = None
    if args.semantic:
        from .embeddings import embedding_matcher

        try:
            matcher = embedding_matcher()
        except RuntimeError as e:
            # embedding_matcher raises RuntimeError when `openai` is not installed.
            _eprint(f"error: {e}")
            return 2

    try:
        results = evaluate(
            goldenset, predictions, llm=LLM(model=args.model), k=args.k, matcher=matcher
        )
    except LLMError as e:
        _eprint(f"error: {e}")
        return 2

    write_results(results, args.out)
    agg = results["aggregate"]
    _eprint(f"Judged {results['n']} cases with {results['judge_fingerprint']} -> {args.out}")
    for k, v in agg.items():
        _eprint(f" {k:>18}: {v:.3f}")

    if args.fail_under is not None:
        # Gate on the unweighted mean of the judge dimensions only.
        overall = sum(agg[d] for d in JUDGE_DIMENSIONS) / len(JUDGE_DIMENSIONS)
        if overall < args.fail_under:
            _eprint(f"GATE FAIL: overall {overall:.3f} < {args.fail_under:.3f}")
            return 1
        _eprint(f"GATE PASS: overall {overall:.3f} >= {args.fail_under:.3f}")
    return 0
75
+
76
+
77
def cmd_report(args) -> int:
    """Render the results file ``args.results`` to an HTML scorecard at ``args.out``.

    Returns:
        Always 0.
    """
    from .judge import read_results
    from .scorecard import write_html

    write_html(read_results(args.results), args.out)
    _eprint(f"Wrote scorecard -> {args.out}")
    return 0
85
+
86
+
87
def cmd_diff(args) -> int:
    """Compare a candidate results file with a baseline and report regressions.

    Returns:
        2 when the judge fingerprints differ and ``args.allow_judge_mismatch``
        is not set; 1 when at least one metric regressed; 0 otherwise.
    """
    from .diffing import diff, format_table
    from .judge import read_results

    base_run = read_results(args.baseline)
    cand_run = read_results(args.candidate)
    outcome = diff(base_run, cand_run, tolerance=args.tolerance)
    _eprint(format_table(outcome))

    if outcome["judge_mismatch"]:
        msg = (
            f"judge mismatch: baseline={outcome['baseline_judge']} vs "
            f"candidate={outcome['candidate_judge']} — scores are not comparable across judges"
        )
        if args.allow_judge_mismatch:
            # Explicit override: downgrade to a warning and keep comparing.
            _eprint(f"warning: {msg}")
        else:
            _eprint(
                f"error: {msg} (re-run both with the same judge, or pass --allow-judge-mismatch)"
            )
            return 2

    failing = outcome["regressed"]
    if not failing:
        _eprint(f"OK: no metric regressed beyond {args.tolerance}")
        return 0
    _eprint(f"REGRESSION: {', '.join(failing)} dropped more than {args.tolerance}")
    return 1
113
+
114
+
115
def cmd_demo(args) -> int:
    """Write the canned demo scorecard to ``args.out``.

    Returns:
        Always 0.
    """
    from .demo import DEMO_RESULTS
    from .scorecard import write_html

    destination = args.out
    write_html(DEMO_RESULTS, destination)
    _eprint(f"Wrote demo scorecard -> {destination} (open it in a browser)")
    return 0
122
+
123
+
124
def build_parser() -> argparse.ArgumentParser:
    """Build the ``proofrag`` argument parser.

    A sub-command is required. Each sub-parser stores its handler under
    ``func``, so the parsed namespace can be dispatched with ``args.func(args)``.
    """
    parser = argparse.ArgumentParser(
        prog="proofrag", description="Zero-config RAG/LLM evaluation."
    )
    parser.add_argument("--version", action="version", version=f"proofrag {__version__}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # generate: corpus -> golden set
    gen = commands.add_parser("generate", help="synthesize a golden set from a corpus")
    gen.add_argument("--corpus", required=True, help="file or directory of docs/code")
    gen.add_argument("--out", default="goldenset.jsonl")
    gen.add_argument("--n", type=int, default=20, help="number of cases")
    gen.add_argument("--seed", type=int, default=0)
    gen.add_argument("--chunk-chars", type=int, default=1200)
    gen.add_argument("--model", default=None, help="override judge/generator model")
    gen.set_defaults(func=cmd_generate)

    # evaluate: golden set + predictions -> results
    ev = commands.add_parser("evaluate", help="judge predictions against a golden set")
    ev.add_argument("--goldenset", required=True)
    ev.add_argument(
        "--predictions", required=True, help="jsonl of {id, answer, retrieved_contexts}"
    )
    ev.add_argument("--out", default="results.json")
    ev.add_argument("--model", default=None)
    ev.add_argument(
        "--k", type=int, default=5, help="cutoff for retrieval metrics (Recall@k, NDCG@k, ...)"
    )
    ev.add_argument(
        "--semantic",
        action="store_true",
        help="use embedding cosine for chunk relevance instead of token overlap (needs [openai])",
    )
    ev.add_argument(
        "--fail-under",
        type=float,
        default=None,
        help="CI gate: exit 1 if overall generation score < this (0-1)",
    )
    ev.set_defaults(func=cmd_evaluate)

    # report: results -> HTML
    rep = commands.add_parser("report", help="render results.json to an HTML scorecard")
    rep.add_argument("--results", required=True)
    rep.add_argument("--out", default="scorecard.html")
    rep.set_defaults(func=cmd_report)

    # diff: candidate vs baseline
    cmp_ = commands.add_parser(
        "diff", help="compare results against a baseline; fail on regression"
    )
    cmp_.add_argument(
        "--baseline", required=True, help="baseline results.json (a known-good run)"
    )
    cmp_.add_argument("--candidate", required=True, help="new results.json to compare")
    cmp_.add_argument(
        "--tolerance", type=float, default=0.02, help="allowed drop before flagging a regression"
    )
    cmp_.add_argument(
        "--allow-judge-mismatch", action="store_true", help="compare even if judge models differ"
    )
    cmp_.set_defaults(func=cmd_diff)

    # demo: canned scorecard
    demo = commands.add_parser("demo", help="render a sample scorecard (no API key needed)")
    demo.add_argument("--out", default="scorecard.html")
    demo.set_defaults(func=cmd_demo)

    return parser
179
+
180
+
181
def main(argv=None) -> int:
    """Parse ``argv`` and run the selected sub-command.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            fall back to the process arguments.

    Returns:
        The integer exit status returned by the sub-command handler.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func  # set by each sub-parser via set_defaults(func=...)
    return handler(namespace)


if __name__ == "__main__":
    raise SystemExit(main())
proofrag/corpus.py ADDED
@@ -0,0 +1,59 @@
1
+ """Load and chunk a corpus from a file or directory tree."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
# File suffixes (lower-case, with leading dot) that `load_corpus` accepts when it
# walks a directory. A corpus given as a single file is read regardless of suffix.
TEXT_EXT = {
    ".md",
    ".markdown",
    ".txt",
    ".rst",
    ".mdx",
    ".py",
    ".js",
    ".ts",
    ".tsx",
    ".java",
    ".go",
    ".rb",
    ".rs",
}
22
+
23
+
24
def load_corpus(path: str, max_chars: int = 1200) -> list[dict]:
    """Return a flat list of chunks: {source, chunk_id, text}.

    Args:
        path: A single file (read regardless of suffix) or a directory that is
            walked recursively for files whose suffix is in ``TEXT_EXT``.
        max_chars: Target chunk size handed to ``_split``.

    Returns:
        One dict per chunk. ``source`` is the file path as a string, ``text`` is
        the chunk body, and ``chunk_id`` is ``"<label>::<index>"``. The label is
        the file's path relative to the corpus directory (POSIX separators), or
        the bare file name for a single-file corpus.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: No chunk could be produced from ``path``.
    """
    root = Path(path)
    if not root.exists():
        raise FileNotFoundError(f"Corpus path not found: {path}")

    if root.is_file():
        files = [root]
    else:
        files = sorted(
            f for f in root.rglob("*") if f.is_file() and f.suffix.lower() in TEXT_EXT
        )

    chunks: list[dict] = []
    for f in files:
        try:
            text = f.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue  # unreadable file: skip it rather than abort the whole load
        # Label by relative path, not bare name, so a/README.md and b/README.md
        # do not end up with identical chunk ids. Top-level files keep `name::i`.
        label = f.name if f == root else f.relative_to(root).as_posix()
        for i, body in enumerate(_split(text, max_chars)):
            chunks.append({"source": str(f), "chunk_id": f"{label}::{i}", "text": body})

    if not chunks:
        raise ValueError(f"No readable text chunks found under {path}")
    return chunks
45
+
46
+
47
+ def _split(text: str, max_chars: int) -> list[str]:
48
+ """Greedy paragraph packing so chunks stay under max_chars where possible."""
49
+ paras = [p.strip() for p in text.split("\n\n") if p.strip()]
50
+ out: list[str] = []
51
+ buf = ""
52
+ for para in paras:
53
+ if buf and len(buf) + len(para) > max_chars:
54
+ out.append(buf.strip())
55
+ buf = ""
56
+ buf += para + "\n\n"
57
+ if buf.strip():
58
+ out.append(buf.strip())
59
+ return out
proofrag/demo.py ADDED
@@ -0,0 +1,143 @@
1
+ """Canned results so `proofrag demo` renders a real scorecard with no API key.
2
+
3
+ Used for the README screenshot, for trying the tool in 5 seconds, and for CI
4
+ smoke tests that must run without credentials.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
+ def _ret(recall, precision, ndcg, mrr):
11
+ return {"recall_at_k": recall, "precision_at_k": precision, "ndcg_at_k": ndcg, "mrr": mrr}
12
+
13
+
14
# Hand-written results payload rendered by `proofrag demo`: a judge fingerprint,
# aggregate scores, and eight per-question records.
# NOTE(review): the "aggregate" values are canned and are not the arithmetic mean
# of the records below (e.g. the groundedness scores average ~0.719, while the
# aggregate says 0.86). Confirm whether the demo is meant to be self-consistent.
DEMO_RESULTS = {
    "judge_fingerprint": "anthropic:claude-haiku-4-5-20251001",
    "created": "2026-05-31T00:00:00+00:00",
    "k": 5,
    "n": 8,
    "aggregate": {
        "groundedness": 0.86,
        "correctness": 0.79,
        "completeness": 0.71,
        "citation_quality": 0.68,
        "recall_at_k": 0.77,
        "precision_at_k": 0.55,
        "ndcg_at_k": 0.73,
        "mrr": 0.81,
    },
    "records": [
        {
            "id": "q000",
            "question": "How do I rotate an API key without downtime?",
            "difficulty": "single_doc",
            "answer": "Create a new key, deploy it, then revoke the old one.",
            "scores": {
                "groundedness": 0.95,
                "correctness": 0.92,
                "completeness": 0.88,
                "citation_quality": 0.85,
            },
            "retrieval": _ret(1.0, 0.6, 1.0, 1.0),
            "rationale": "Fully grounded and matches the reference.",
        },
        {
            "id": "q001",
            "question": "What regions support the EU data residency tier?",
            "difficulty": "single_doc",
            "answer": "Frankfurt and Dublin.",
            "scores": {
                "groundedness": 0.9,
                "correctness": 0.85,
                "completeness": 0.6,
                "citation_quality": 0.7,
            },
            "retrieval": _ret(1.0, 0.4, 0.92, 1.0),
            "rationale": "Correct but omits the Paris region the reference lists.",
        },
        {
            "id": "q002",
            "question": "Does the free plan include webhook retries and a dead-letter queue?",
            "difficulty": "multi_doc",
            "answer": "Yes, the free plan includes both.",
            "scores": {
                "groundedness": 0.3,
                "correctness": 0.2,
                "completeness": 0.4,
                "citation_quality": 0.25,
            },
            "retrieval": _ret(0.5, 0.2, 0.39, 0.33),
            "rationale": "Hallucinated: only retries are free; DLQ is paid. Retriever ranked the pricing doc low.",
        },
        {
            "id": "q003",
            "question": "What is the maximum payload size for the batch endpoint?",
            "difficulty": "single_doc",
            "answer": "10 MB per request.",
            "scores": {
                "groundedness": 0.88,
                "correctness": 0.9,
                "completeness": 0.8,
                "citation_quality": 0.75,
            },
            "retrieval": _ret(1.0, 0.8, 1.0, 1.0),
            "rationale": "Accurate and grounded.",
        },
        {
            "id": "q004",
            "question": "How does SSO group mapping interact with custom roles?",
            "difficulty": "multi_doc",
            "answer": "Groups map to roles automatically; custom roles override defaults.",
            "scores": {
                "groundedness": 0.6,
                "correctness": 0.55,
                "completeness": 0.5,
                "citation_quality": 0.45,
            },
            "retrieval": _ret(0.5, 0.4, 0.63, 0.5),
            "rationale": "Partially right; the precedence rule is stated backwards.",
        },
        {
            "id": "q005",
            "question": "What is the CEO's home address?",
            "difficulty": "unanswerable",
            "answer": "I don't have that information in the provided context.",
            "scores": {
                "groundedness": 1.0,
                "correctness": 1.0,
                "completeness": 1.0,
                "citation_quality": 0.9,
            },
            # The unanswerable record carries no retrieval metrics.
            "retrieval": None,
            "rationale": "Correctly refused an unanswerable question.",
        },
        {
            "id": "q006",
            "question": "How long are audit logs retained on the enterprise plan?",
            "difficulty": "single_doc",
            "answer": "Forever.",
            "scores": {
                "groundedness": 0.2,
                "correctness": 0.15,
                "completeness": 0.3,
                "citation_quality": 0.2,
            },
            "retrieval": _ret(0.0, 0.0, 0.0, 0.0),
            "rationale": "Wrong (retention is 2 years) and no relevant context was retrieved.",
        },
        {
            "id": "q007",
            "question": "Which auth methods does the CLI support?",
            "difficulty": "single_doc",
            "answer": "API key and OAuth device flow.",
            "scores": {
                "groundedness": 0.92,
                "correctness": 0.88,
                "completeness": 0.85,
                "citation_quality": 0.8,
            },
            "retrieval": _ret(1.0, 0.8, 1.0, 1.0),
            "rationale": "Grounded and complete.",
        },
    ],
}
proofrag/diffing.py ADDED
@@ -0,0 +1,57 @@
1
+ """Baseline diffing: compare two results.json runs and flag regressions.
2
+
3
+ A baseline is just a results.json from a known-good run (commit it to the repo).
4
+ On every change you re-evaluate and `diff` against it: any metric that drops by
5
+ more than the tolerance is a regression and fails the build. Because all metrics
6
+ here are higher-is-better, "regression" simply means delta < -tolerance.
7
+
8
+ Judge models are pinned for a reason — comparing scores produced by different
9
+ judges is meaningless, so a fingerprint mismatch is refused unless explicitly
10
+ overridden.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from .judge import JUDGE_DIMENSIONS
16
+ from .metrics import RETRIEVAL_METRICS
17
+
18
# Metrics compared by `diff`, in report order: judge dimensions, then retrieval metrics.
# NOTE(review): `+` assumes both constants are the same sequence type — confirm.
ALL_METRICS = JUDGE_DIMENSIONS + RETRIEVAL_METRICS
19
+
20
+
21
def diff(baseline: dict, candidate: dict, tolerance: float = 0.02) -> dict:
    """Compare the ``aggregate`` scores of two runs; higher is better for every metric.

    A metric missing from both runs is skipped. A metric missing from one run is
    listed with ``delta`` None and is never counted as a regression. Otherwise
    ``delta`` is ``candidate - baseline`` rounded to 3 decimals, and the metric
    regressed when ``delta < -tolerance``.

    Returns:
        Dict with ``rows``, ``regressed`` (metric names), ``tolerance``,
        ``judge_mismatch``, ``baseline_judge`` and ``candidate_judge``.
    """
    base_agg = baseline.get("aggregate", {})
    cand_agg = candidate.get("aggregate", {})

    rows = []
    for metric in ALL_METRICS:
        if not (metric in base_agg or metric in cand_agg):
            continue
        before = base_agg.get(metric)
        after = cand_agg.get(metric)
        if before is None or after is None:
            change = None
            dropped = False
        else:
            change = round(after - before, 3)
            dropped = change < -tolerance
        rows.append(
            {
                "metric": metric,
                "baseline": before,
                "candidate": after,
                "delta": change,
                "regressed": dropped,
            }
        )

    base_judge = baseline.get("judge_fingerprint")
    cand_judge = candidate.get("judge_fingerprint")
    return {
        "rows": rows,
        "regressed": [row["metric"] for row in rows if row["regressed"]],
        "tolerance": tolerance,
        "judge_mismatch": base_judge != cand_judge,
        "baseline_judge": base_judge,
        "candidate_judge": cand_judge,
    }
46
+
47
+
48
def format_table(result: dict) -> str:
    """Render the rows of a ``diff`` result as a plain-text table for terminals / CI logs.

    Missing values are shown as an em dash; regressed rows get a trailing marker.
    """

    def cell(value, spec: str) -> str:
        # None means the metric was absent from that run.
        return "—" if value is None else format(value, spec)

    lines = [f"{'metric':>16} {'base':>7} {'cand':>7} {'delta':>7}"]
    for row in result["rows"]:
        base = cell(row["baseline"], ".3f")
        cand = cell(row["candidate"], ".3f")
        delta = cell(row["delta"], "+.3f")
        marker = " << REGRESSION" if row["regressed"] else ""
        lines.append(f"{row['metric']:>16} {base:>7} {cand:>7} {delta:>7}{marker}")
    return "\n".join(lines)
proofrag/embeddings.py ADDED
@@ -0,0 +1,53 @@
1
+ """Optional semantic matcher for retrieval metrics.
2
+
3
+ Lexical (token-overlap) matching is the zero-dependency default. When chunks are
4
+ paraphrased rather than copied, swap in an embedding matcher: it marks a retrieved
5
+ chunk relevant to a gold context when their cosine similarity clears a threshold.
6
+
7
+ Uses the OpenAI-compatible embeddings API (also covers local servers via
8
+ OPENAI_BASE_URL). Requires the `openai` extra and OPENAI_API_KEY.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ import os
15
+
16
+ from .metrics import Matcher
17
+
18
+ DEFAULT_EMBED_MODEL = "text-embedding-3-small"
19
+
20
+
21
+ def _cosine(a: list[float], b: list[float]) -> float:
22
+ dot = sum(x * y for x, y in zip(a, b, strict=True))
23
+ na = math.sqrt(sum(x * x for x in a))
24
+ nb = math.sqrt(sum(y * y for y in b))
25
+ return dot / (na * nb) if na and nb else 0.0
26
+
27
+
28
def embedding_matcher(threshold: float = 0.75, model: str | None = None) -> Matcher:
    """Build a ``match(gold, chunk)`` predicate based on embedding cosine similarity.

    The embedding model is ``model`` if given, else the ``PROOFRAG_EMBED_MODEL``
    environment variable, else ``DEFAULT_EMBED_MODEL``. When ``OPENAI_BASE_URL``
    is set it is passed to the client as ``base_url``. Each text is truncated to
    its first 2000 characters, and embeddings are cached by that truncated text
    for the lifetime of the returned matcher.

    Raises:
        RuntimeError: The ``openai`` package cannot be imported.
    """
    try:
        import openai
    except ImportError as e:
        raise RuntimeError("embedding_matcher needs: pip install 'proofrag[openai]'") from e

    base_url = os.environ.get("OPENAI_BASE_URL")
    if base_url:
        client = openai.OpenAI(base_url=base_url)
    else:
        client = openai.OpenAI()
    embed_model = model or os.environ.get("PROOFRAG_EMBED_MODEL") or DEFAULT_EMBED_MODEL
    vectors: dict[str, list[float]] = {}

    def embed(text: str) -> list[float]:
        snippet = text[:2000]
        if snippet in vectors:
            return vectors[snippet]
        response = client.embeddings.create(model=embed_model, input=snippet)
        vectors[snippet] = response.data[0].embedding
        return vectors[snippet]

    def match(gold: str, chunk: str) -> bool:
        similarity = _cosine(embed(gold), embed(chunk))
        return similarity >= threshold

    return match
proofrag/goldenset.py ADDED
@@ -0,0 +1,128 @@
1
+ """Synthesize a golden evaluation set from a corpus.
2
+
3
+ The wedge: most teams never build evals because hand-writing a balanced,
4
+ non-contaminated test set is tedious. This generates one from your own docs,
5
+ with difficulty tiers that catch the failure modes papers flag as worst:
6
+ multi-document questions and unanswerable (refusal) cases.
7
+
8
+ Output is versioned JSONL. One record per line:
9
+ {id, question, gold_answer, gold_contexts[], difficulty, sources[]}
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import random
16
+
17
+ from .llm import LLM
18
+
19
# System prompt sent with every generation request (see `_try`).
SYS = "You write evaluation questions for a RAG system. Output strict JSON only, no prose."

# The three templates below are filled with `str.format`, which is why the
# literal braces of the JSON examples are doubled.

# Single-passage question + answer; placeholders: {source}, {text}.
_SINGLE = '''From this passage write ONE specific question fully answerable from it, and the ideal grounded answer.
Source: {source}
Passage:
"""{text}"""
Return JSON: {{"question": "...", "gold_answer": "..."}}'''

# Two-passage question + answer; placeholders: {src_a}, {text_a}, {src_b}, {text_b}.
_MULTI = '''From these TWO passages write ONE question that requires BOTH to answer, and the ideal answer that synthesizes them.
Passage A ({src_a}):
"""{text_a}"""
Passage B ({src_b}):
"""{text_b}"""
Return JSON: {{"question": "..."
, "gold_answer": "..."}}'''.replace('"..."\n, ', '"...", ')

# On-topic but unanswerable question; placeholder: {text}. Only a question is requested.
_UNANS = '''Here is a passage from a knowledge base. Write ONE realistic question that is on-topic but CANNOT be answered from this passage (the info is simply not present).
Passage:
"""{text}"""
Return JSON: {{"question": "..."}}'''

# Gold answer stored for every unanswerable case.
_REFUSAL = "I don't have enough information in the provided context to answer that."
40
+
41
+
42
def generate(chunks: list[dict], n: int = 20, seed: int = 0, llm: LLM | None = None) -> list[dict]:
    """Generate up to `n` golden records: ~70% single / 20% multi / 10% unanswerable.

    The chunks are copied and shuffled with ``random.Random(seed)``, then
    consumed in order: single-document cases first, then pairs of chunks for
    multi-document cases, then chunks for unanswerable cases. A generation that
    fails or returns no question is dropped, so fewer than `n` records may come
    back. Ids are assigned last as ``q000``, ``q001``, ...
    """
    llm = llm or LLM()
    pool = chunks[:]
    random.Random(seed).shuffle(pool)
    available = len(pool)

    n_single = max(1, round(n * 0.7))
    n_multi = round(n * 0.2) if available >= 2 else 0
    n_unans = max(0, n - n_single - n_multi)

    def has_question(reply) -> bool:
        return bool(reply and reply.get("question"))

    records: list[dict] = []

    # Single-document cases: one chunk each.
    for chunk in pool[:n_single]:
        reply = _try(llm, _SINGLE.format(source=chunk["source"], text=chunk["text"][:1500]))
        if not has_question(reply):
            continue
        records.append(
            _record(
                reply["question"],
                reply.get("gold_answer", ""),
                [chunk["text"]],
                "single_doc",
                [chunk["source"]],
            )
        )

    # Multi-document cases: consecutive pairs taken after the single-doc slice.
    cursor = n_single
    made = 0
    while made < n_multi and cursor + 1 < available:
        first, second = pool[cursor], pool[cursor + 1]
        cursor += 2
        made += 1
        prompt = _MULTI.format(
            src_a=first["source"],
            text_a=first["text"][:900],
            src_b=second["source"],
            text_b=second["text"][:900],
        )
        reply = _try(llm, prompt)
        if not has_question(reply):
            continue
        records.append(
            _record(
                reply["question"],
                reply.get("gold_answer", ""),
                [first["text"], second["text"]],
                "multi_doc",
                [first["source"], second["source"]],
            )
        )

    # Unanswerable cases: fixed refusal answer, no gold contexts or sources.
    for chunk in pool[cursor : cursor + n_unans]:
        reply = _try(llm, _UNANS.format(text=chunk["text"][:1500]))
        if has_question(reply):
            records.append(_record(reply["question"], _REFUSAL, [], "unanswerable", []))

    for index, rec in enumerate(records):
        rec["id"] = f"q{index:03d}"
    return records
100
+
101
+
102
def _try(llm: LLM, prompt: str) -> dict | None:
    """Ask the LLM for one JSON object, best-effort.

    Returns:
        The parsed reply when it is a dict; None when the call raises or the
        reply is any other JSON value.
    """
    import logging

    try:
        out = llm.complete_json(SYS, prompt)
    except Exception as e:  # noqa: BLE001 - one bad generation shouldn't abort the run
        # Still best-effort, but leave a trace: if every call fails the same way,
        # the caller would otherwise get an empty golden set with no explanation.
        logging.getLogger(__name__).warning("golden-set generation call failed: %s", e)
        return None
    # Callers use `.get(...)` on the reply, so a list/str/number counts as a failure.
    return out if isinstance(out, dict) else None
107
+
108
+
109
+ def _record(question, gold_answer, gold_contexts, difficulty, sources) -> dict:
110
+ return {
111
+ "id": "",
112
+ "question": question.strip(),
113
+ "gold_answer": gold_answer.strip(),
114
+ "gold_contexts": gold_contexts,
115
+ "difficulty": difficulty,
116
+ "sources": sources,
117
+ }
118
+
119
+
120
def write_jsonl(records: list[dict], path: str) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, "w", encoding="utf-8") as fh:
        fh.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
124
+
125
+
126
def read_jsonl(path: str) -> list[dict]:
    """Read a UTF-8 JSON Lines file and return one parsed value per non-blank line."""
    rows = []
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            if not raw.strip():
                continue  # blank or whitespace-only lines are ignored
            rows.append(json.loads(raw))
    return rows