passiveworkers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
council/fidelity.py ADDED
@@ -0,0 +1,197 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ council/fidelity.py — citation-grounding measurement (R15/D27, the honest eval core)
4
+ ====================================================================================
5
+ Pure, dependency-free logic for the question the whole product rests on:
6
+
7
+ when a report says X [S3], does source S3 actually support X?
8
+
9
+ This is a LEXICAL grounding *floor*, not a semantic judge. It reliably catches the
10
+ failure modes that matter and recur:
11
+ • a citation pointing at an off-topic source (the claim and the cited source barely
12
+ share vocabulary), and
13
+ • a number / date / code in the claim that is ABSENT from the cited source — the
14
+ classic fabricated statistic.
15
+ It CANNOT tell whether a source that shares the claim's vocabulary is being faithfully
16
+ represented; that needs an entailment model (an optional, off-by-default hook in
17
+ scripts/eval_citation_fidelity.py). So read a GROUNDED verdict as "not obviously
18
+ fabricated", never as "proven true". A high score is necessary, not sufficient.
19
+
20
+ No network, no Ollama, no new dependencies — it reuses council.retrieval.tokenize so
21
+ the scorer tokenizes EXACTLY like the retriever it is grading.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import re
26
+
27
+ from council.retrieval import tokenize
28
+
29
# Compact English stop-word set: content-overlap must not be inflated by glue words.
# (Kept small and obvious on purpose — this is a floor, not a linguistics project.)
# Built by whitespace-splitting the literal below, so its line layout is irrelevant.
STOPWORDS = frozenset("""
a an the and or but nor so yet if then else of to in on at by for with from into onto
over under above below up down out off about as is are was were be been being am do does
did has have had having will would shall should can could may might must not no than that
this these those it its their there here they them he she his her you your we our us i my
me what which who whom whose when where why how all any both each few more most other some
such only own same too very s t just also against between through during before after while
""".split())

# a single citation marker, e.g. S3 / L1
# NOTE(review): the pattern is unanchored, so inside a bracket it also matches the tail of
# a longer token (e.g. 'S2024' within '[NHS2024]') — confirm whether that is acceptable.
_MARKER = re.compile(r"[SL]\d+")
# contents of every [...] span (group 1 is the text between the brackets)
_BRACKET = re.compile(r"\[([^\]]*)\]")
# any single digit; used to detect tokens that contain a number
_DIGIT = re.compile(r"\d")
# naive sentence split: whitespace that follows '.', '!' or '?' (good enough for an eval)
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
# the headers researcher.py appends to separate a draft from its source listing
_SOURCE_HEADER = re.compile(r"(?m)^\s*(?:WEB SOURCES|YOUR DOCUMENTS)\b")
# a listing line: "[S1] Title (date) — https://host/x" / "[L1] title — /path/to.md"
# greedy up to the LAST em-dash/hyphen separator so an em-dash inside the title is tolerated;
# group 1 is the marker, group 2 the trailing whitespace-free URL/path
_SRC_LINE = re.compile(r"(?m)^\s*\[([SL]\d+)\]\s+.*[—–-]\s+(\S+)\s*$")
# a contribution heading in a compiled report ("### model" / "### Country — model")
_SECTION_SPLIT = re.compile(r"(?m)^###\s+")
51
+
52
+
53
+ # ----------------------------------------------------------------------------- tokens
54
def _strip_markers(text: str) -> str:
    """Replace every ``[...]`` span in *text* with a single space.

    Citation markers must not be counted as claim content: a sentence citing
    ``[S3]`` would otherwise always carry an 's3' token the source cannot match.
    A falsy *text* (``None`` or ``""``) yields an empty string.
    """
    raw = text if text else ""
    return _BRACKET.sub(" ", raw)
58
+
59
+
60
def content_tokens(text: str) -> set[str]:
    """Return the set of meaningful tokens in *text*.

    Stop-words are dropped, and so are single-character tokens unless they are a
    digit (a lone '5' is exactly the kind of fact worth verifying).
    """
    kept = set()
    for tok in tokenize(text):
        if tok in STOPWORDS:
            continue
        if len(tok) > 1 or tok.isdigit():
            kept.add(tok)
    return kept
65
+
66
+
67
def numeric_tokens(text: str) -> set[str]:
    """Return every token of *text* that contains at least one digit
    (numbers, years, codes — INV-7731 → '7731')."""
    # a regex match object is truthy and a miss is None, so the search doubles as the filter
    return set(filter(_DIGIT.search, tokenize(text)))
70
+
71
+
72
def significant_numbers(text: str) -> set[str]:
    """Return the numeric tokens that are treated as checkable FACTS.

    Only pure-digit tokens of two or more characters qualify ('18', '2026', '7731').
    Deliberately left out:
      • single digits ('5') — too common to be a reliable fabrication signal, and
      • decimals / alphanumeric codes ('4.2'→'4','2'; 'v1') — token overlap cannot
        match format variants ('4.2 million' vs '4,200,000'), so flagging them
        would only produce false positives.
    This is why num_cov is reported separately and never folded into the headline
    score (review: HONESTY-001, BUG-001).
    """
    found = set()
    for tok in tokenize(text):
        if len(tok) >= 2 and tok.isdigit():
            found.add(tok)
    return found
81
+
82
+
83
+ # ----------------------------------------------------------------------------- scoring
84
def grounding_score(claim: str, source: str) -> dict:
    """Measure how well *claim* is lexically grounded in the text of its cited source(s).

    Citation markers are stripped from the claim first. The returned dict holds:
        score            headline value, identical to content_cov (numbers are surfaced
                         separately because legitimate format drift — '4.2 million' vs
                         '4,200,000' — would otherwise read as fabrication)
        content_cov      fraction of the claim's content tokens found in the source [0..1]
        num_cov          fraction of the claim's significant numbers found in the source
                         (1.0 when the claim has none; 0.0 when the source is empty)
        missing_numbers  sorted significant numbers of the claim absent from the source
        n_content        number of content tokens in the claim (0 ⇒ nothing checkable)
        source_empty     True when the source text is empty or whitespace-only
    """
    stripped = _strip_markers(claim)
    claim_tokens = content_tokens(stripped)
    n_content = len(claim_tokens)

    # no source text at all: nothing can be verified, every number counts as missing
    if (source or "").strip() == "":
        return dict(score=0.0, content_cov=0.0, num_cov=0.0,
                    missing_numbers=sorted(significant_numbers(stripped)),
                    n_content=n_content, source_empty=True)

    source_tokens = set(tokenize(source))

    # a marker-bearing sentence without checkable content (e.g. a pure transition) is
    # not a fidelity failure; n_content == 0 lets the runner exclude it
    if n_content == 0:
        return dict(score=1.0, content_cov=1.0, num_cov=1.0,
                    missing_numbers=[], n_content=0, source_empty=False)

    coverage = round(len(claim_tokens & source_tokens) / n_content, 3)

    wanted = significant_numbers(stripped)
    absent = sorted(wanted - source_tokens)
    if wanted:
        number_cov = round((len(wanted) - len(absent)) / len(wanted), 3)
    else:
        number_cov = 1.0

    return dict(score=coverage, content_cov=coverage, num_cov=number_cov,
                missing_numbers=absent, n_content=n_content, source_empty=False)
120
+
121
+
122
def classify(g: dict, grounded: float = 0.5, weak: float = 0.3) -> str:
    """Turn a grounding result into a verdict label.

    Checked in order:
        UNVERIFIABLE  the source was empty — kept apart so an unreachable source is
                      never counted as a fabricated one
        NO_CONTENT    the claim had no content tokens to check
        GROUNDED      content_cov >= grounded
        WEAK          content_cov >= weak
        UNGROUNDED    anything lower
    """
    if g.get("source_empty"):
        return "UNVERIFIABLE"
    if g["n_content"] == 0:
        return "NO_CONTENT"
    coverage = g["content_cov"]
    # thresholds are tried from strictest to loosest; the first one met names the bucket
    for label, floor in (("GROUNDED", grounded), ("WEAK", weak)):
        if coverage >= floor:
            return label
    return "UNGROUNDED"
134
+
135
+
136
+ # ----------------------------------------------------------------------------- parsing
137
def split_draft(text: str) -> str:
    """Return the draft body of *text*: everything before the first WEB SOURCES /
    YOUR DOCUMENTS header line, with trailing whitespace removed. Text without such
    a header is returned whole (right-stripped); a falsy *text* yields ""."""
    body = text or ""
    header = _SOURCE_HEADER.search(body)
    if header is not None:
        body = body[:header.start()]
    return body.rstrip()
141
+
142
+
143
def markers_in(text: str) -> list[str]:
    """List the citation markers found inside any ``[...]`` span of *text*.

    Each marker appears once, in order of first appearance. Handles [S1], [S1, S2],
    [S1][S2] and [L1]; brackets without a marker, such as [2024], contribute nothing.
    """
    # a dict keeps first-insertion order, which gives order-preserving de-duplication
    ordered = {}
    for inner in _BRACKET.findall(text or ""):
        ordered.update(dict.fromkeys(_MARKER.findall(inner)))
    return list(ordered)
153
+
154
+
155
def parse_cited_claims(draft: str) -> list[tuple[str, list[str]]]:
    """Split the draft body into sentences and keep those that cite something.

    Returns ``(sentence, [markers])`` pairs for every non-empty sentence carrying at
    least one [S#]/[L#] marker; the appended source listing is ignored.
    """
    claims = []
    for raw in _SENT_SPLIT.split(split_draft(draft)):
        sentence = raw.strip()
        found = markers_in(sentence) if sentence else []
        if found:
            claims.append((sentence, found))
    return claims
166
+
167
+
168
def parse_source_map(text: str) -> dict[str, str]:
    """Build a marker → URL/path mapping from the source-listing lines of *text*
    ('[S1] Title — https://x'). When a marker is listed twice the later line wins."""
    mapping = {}
    for hit in _SRC_LINE.finditer(text or ""):
        marker, location = hit.group(1, 2)
        mapping[marker] = location
    return mapping
171
+
172
+
173
def split_sections(report: str) -> list[str]:
    """Return the per-analyst '### …' blocks of a compiled report.

    Each block is self-contained (draft plus its own source list). Whatever precedes
    the first heading (summary, agree/differ) is dropped — the executive summary
    carries no citations to grade. Blank blocks are skipped; a report without any
    '###' heading yields an empty list.
    """
    # the split always yields at least one piece: the text before the first heading
    _preamble, *blocks = _SECTION_SPLIT.split(report or "")
    return [block for block in blocks if block.strip()]
179
+
180
+
181
+ # ----------------------------------------------------------------------------- one claim
182
def score_claim(claim: str, markers: list[str], sources: dict[str, str]) -> dict:
    """Ground a claim against the UNION of the sources it cites.

    A sentence is fairly judged against every source it points at, because the fact
    may be split across them. The claim is UNVERIFIABLE only when ALL of its cited
    sources are empty or unreachable.

    Args:
        claim: the cited sentence (markers included; they are stripped during scoring).
        markers: the citation markers the sentence carries, e.g. ["S1", "L2"].
        sources: marker → fetched source text. A missing key, an empty string and a
            ``None`` value are all treated as "no text available".

    Returns:
        The ``grounding_score`` dict extended with:
            markers        a copy of *markers*
            cited_present  the markers whose source text is non-blank
            claim          the claim with surrounding whitespace removed

    Limitation (review: HONESTY-005): checking the union means a claim asserting a
    RELATIONSHIP across sources ("X founded Y [S1, S2]") can pass when S1 mentions X and
    S2 mentions Y, even if neither source states the relationship. Union grounding proves
    the terms are attributable to the cited set, not that the claim's logic is supported —
    another reason a GROUNDED verdict is a floor, not proof of truth.
    """
    # `or ""` (rather than a .get default) also covers keys stored with a None value,
    # e.g. a source whose fetch failed; str.join would raise TypeError on None
    texts = [sources.get(m) or "" for m in markers]
    g = grounding_score(claim, "\n".join(texts))
    g["markers"] = list(markers)
    g["cited_present"] = [m for m, body in zip(markers, texts) if body.strip()]
    g["claim"] = (claim or "").strip()
    return g
council/judge.py ADDED
@@ -0,0 +1,393 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ council/judge.py — Score, then MERGE
4
+ ====================================
5
+ The judge is what turns "many answers" into "better intelligence":
6
+
7
+ 1. SCORE — reads the candidate answers BLIND (anonymized, shuffled order so neither
8
+ identity nor position biases the result) and rates each 0-10. The scores feed the
9
+ ledger, so good answers earn more credit (ideas compete).
10
+ 2. MERGE — synthesizes the candidates into one answer that is better than any single
11
+ one: it keeps the consensus, ADDS the unique points only one perspective found, and
12
+ reconciles disagreements instead of hiding them. This is the diversity dividend.
13
+ 3. COMPARE — a blind A/B check (used for verification) of merged vs. best-single.
14
+
15
+ Use a STRONG model, ideally a different family from the workers, at temperature 0 so
16
+ judging is steady while the workers diverge.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import re
24
+ import time
25
+ from dataclasses import dataclass
26
+ from typing import Optional
27
+
28
+ import requests
29
+
30
+ from council.sanitize import strip_invisible
31
+
32
+ OLLAMA_BASE = "http://localhost:11434"
33
+
34
+
35
def _extract_json(text: str):
    """Tolerantly pull the first JSON value out of a model response.

    Strategy:
      1. If the response contains a ``` / ```json fence, only its contents are used.
      2. Try to parse the whole candidate as JSON.
      3. Otherwise try every '[' or '{' in TEXT ORDER as a possible start and let the
         JSON decoder itself find the matching end. Using the real decoder instead of
         bracket counting means brackets inside JSON strings cannot confuse the scan,
         and an object that merely contains a list is returned as the object rather
         than as its inner list.

    Args:
        text: raw model output; ``None`` is treated as an empty string.

    Returns:
        The decoded JSON value, or ``None`` when nothing decodable is found.
    """
    text = text or ""
    # Strip code fences.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
    candidate = (fenced.group(1) if fenced else text).strip()
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        pass
    # Scan for the first position from which a complete JSON array/object decodes.
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", candidate):
        try:
            value, _end = decoder.raw_decode(candidate, opener.start())
        except (ValueError, RecursionError):
            continue  # not valid JSON from here (e.g. a "[S1]" citation) — keep looking
        return value
    return None
61
+
62
+
63
# a bare citation token (S3 / L1) and a well-formed citation span ([S1] / [S1, L2])
_CITE_TOKEN = re.compile(r"[SL]\d+")
_CITE_SPAN = re.compile(r"\[[SL]\d+(?:\s*,\s*[SL]\d+)*\]")
# contents of any [...] span — a citation only counts when it is written inside brackets
_BRACKETED_TEXT = re.compile(r"\[([^\]]*)\]")
# a citation token standing on its own, i.e. not the tail of a longer word like "PS5"
_STANDALONE_CITE_TOKEN = re.compile(r"(?<![A-Za-z0-9])[SL]\d+(?![A-Za-z0-9])")


def _known_markers(answers) -> set[str]:
    """Collect every [S#]/[L#] marker cited in the source answers — the only citations
    a synthesis is allowed to keep.

    Only tokens inside a ``[...]`` span count. Scanning the raw prose would register
    ordinary words such as "PS5" or "iOS15" as the markers S5 / S15 and so let a merge
    keep an invented ``[S5]`` citation.

    Args:
        answers: iterable of objects with an optional ``text`` attribute; a missing
            or falsy ``text`` contributes nothing.

    Returns:
        The set of marker strings, e.g. {"S1", "L2"}.
    """
    known: set[str] = set()
    for answer in answers:
        text = getattr(answer, "text", "") or ""
        for inner in _BRACKETED_TEXT.findall(text):
            known.update(_STANDALONE_CITE_TOKEN.findall(inner))
    return known
71
+
72
+
73
def _drop_invented_markers(text: str, known: set[str]) -> str:
    """Strip from *text* every citation marker that is not in *known*.

    Each well-formed citation span is rewritten to hold only its known markers
    (joined by ", "); a span left without any marker is removed entirely. This keeps
    a merge from fabricating a citation even when it ignores the prompt rule (review
    CITATION_MERGE_001): it may still drop or renumber markers — which the prompt
    discourages — but it can never point at a source that was never cited.
    """
    if "[" in text:
        def _filter_span(span):
            survivors = ", ".join(
                tok for tok in _CITE_TOKEN.findall(span.group(0)) if tok in known)
            return f"[{survivors}]" if survivors else ""

        text = _CITE_SPAN.sub(_filter_span, text)
    return text
86
+
87
+
88
@dataclass
class ScoredCandidate:
    """The judge's verdict on one candidate answer, as produced by ``Judge.score``."""
    worker_id: str  # taken from the judged answer's ``worker_id``
    score: float    # clamped to the 0–10 range by Judge.score
    reason: str     # the judge's explanation, or a placeholder when the answer was unscored
93
+
94
+
95
@dataclass
class Judge:
    """LLM judge that scores, merges and compares candidate answers through Ollama.

    Every public method builds one prompt, sends it to ``{ollama_base}/api/generate``
    at temperature 0 and parses the reply tolerantly: malformed model output degrades
    to neutral defaults (score 5.0, empty lists, "tie") instead of raising. HTTP and
    network errors from the request itself are NOT swallowed.

    Attributes:
        model: Ollama model name used for every judging call.
        ollama_base: base URL of the Ollama server.
        num_predict: default generation budget when a call does not pass its own.
    """

    model: str
    ollama_base: str = OLLAMA_BASE
    num_predict: int = 900

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _blind_order(count: int, seed) -> list[int]:
        """Presentation order for *count* candidates: the identity order rotated by an
        offset derived from *seed* (the question / instruction text).

        The offset is deterministic for a given seed, so a run is reproducible, yet it
        differs between prompts so a worker is not always shown in the same slot.
        (Rotating by ``count % count`` is always 0 and would leave the order untouched.)
        """
        if count <= 0:
            return []
        offset = sum(map(ord, str(seed or ""))) % count
        indices = list(range(count))
        return indices[offset:] + indices[:offset]

    @staticmethod
    def _display_to_real(order: list, display_idx) -> Optional[int]:
        """Map a 1-based display index reported by the model to the real answer index.

        Returns None for anything that is not a valid position. The explicit range
        check matters: without it an index of 0 or below would silently wrap around
        through Python's negative indexing and credit the wrong worker.
        """
        try:
            pos = int(display_idx) - 1
        except (TypeError, ValueError, OverflowError):
            return None
        if 0 <= pos < len(order):
            return order[pos]
        return None

    @staticmethod
    def _clamp_score(value, default: Optional[float] = 5.0) -> Optional[float]:
        """Coerce a model-supplied score to a float in [0, 10].

        Returns *default* when the value is missing, unparsable or NaN (NaN would
        otherwise slip through min/max and come out as 10.0).
        """
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN is the only float that differs from itself
            return default
        return max(0.0, min(10.0, number))

    @staticmethod
    def _as_list(value) -> list:
        """Normalise a model-supplied field to a list: lists pass through, a non-blank
        string becomes a one-item list (instead of being iterated per character), and
        anything else becomes an empty list."""
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.strip():
            return [value]
        return []

    def _generate(self, prompt: str, num_predict: Optional[int] = None) -> str:
        """Send *prompt* to Ollama (non-streaming, temperature 0) and return the
        stripped ``response`` text, or "" when the reply carries none.

        Raises whatever ``requests`` raises for connection errors, timeouts or a
        non-2xx status.
        """
        resp = requests.post(
            f"{self.ollama_base}/api/generate",
            json={
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.0, "num_predict": num_predict or self.num_predict},
                "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m"),  # warm judge (R17)
            },
            # CPU-only/busy machines need headroom (measured: a 4B judge can exceed 400s
            # under contention); configurable like the worker/researcher timeouts.
            timeout=float(os.environ.get("PW_JUDGE_TIMEOUT",
                                         os.environ.get("PW_OLLAMA_TIMEOUT", "400"))),
        )
        resp.raise_for_status()
        return (resp.json().get("response") or "").strip()

    # ------------------------------------------------------------------ 1. SCORE
    def score(self, question: str, answers: list) -> list[ScoredCandidate]:
        """Score each answer 0-10 in one blind call.

        answers: list[council.worker.Answer]. Candidates are shown anonymized in a
        rotated order; results come back in that same presentation order, each tied
        to its real ``worker_id``. An answer the model did not score (or scored
        unparsably) gets 5.0 with the reason "(unscored — defaulted)".
        """
        order = self._blind_order(len(answers), question)

        blocks = [f"--- Answer {display_idx} ---\n{answers[real_idx].text}"
                  for display_idx, real_idx in enumerate(order, start=1)]
        joined = "\n\n".join(blocks)

        prompt = (
            "You are an impartial judge. Score each candidate answer to the question on a "
            "0-10 scale for correctness, depth, usefulness, and insight. Judge only on merit; "
            "you do not know who wrote them.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"CANDIDATES:\n{joined}\n\n"
            "Respond with ONLY a JSON array, one object per answer, like:\n"
            '[{"answer": 1, "score": 7.5, "reason": "..."}, ...]'
        )
        raw = self._generate(prompt, num_predict=600)
        parsed = _extract_json(raw) or []
        # some models wrap the array as {"scores": [...]} — accept that shape too
        if isinstance(parsed, dict) and isinstance(parsed.get("scores"), list):
            parsed = parsed["scores"]

        # Map display index -> (score, reason); entries that are malformed are skipped.
        by_display: dict[int, tuple[float, str]] = {}
        if isinstance(parsed, list):
            for obj in parsed:
                if not isinstance(obj, dict):
                    continue
                try:
                    display_idx = int(obj["answer"])
                except (KeyError, TypeError, ValueError, OverflowError):
                    continue
                value = self._clamp_score(obj.get("score"), default=None)
                if value is None:
                    continue
                by_display[display_idx] = (
                    value, strip_invisible(str(obj.get("reason", "")).strip()))

        results: list[ScoredCandidate] = []
        for display_idx, real_idx in enumerate(order, start=1):
            value, reason = by_display.get(display_idx, (5.0, "(unscored — defaulted)"))
            results.append(ScoredCandidate(answers[real_idx].worker_id, value, reason))
        return results

    # ------------------------------------------------------------------ 2. MERGE
    def merge(self, question: str, answers: list) -> str:
        """Synthesize the answers into one dense answer no longer than the longest
        perspective. Hidden characters are stripped from the output and any citation
        marker that no source answer contained is removed."""
        blocks = [f"--- Perspective {i + 1} ---\n{a.text}" for i, a in enumerate(answers)]
        joined = "\n\n".join(blocks)
        # word count of the longest perspective drives both the length target and the budget
        longest = max((len(a.text.split()) for a in answers), default=200)
        prompt = (
            "You are a synthesizer. Several independent perspectives answer the same question below. "
            "Write ONE answer that is strictly BETTER and NO LONGER than the best single perspective — "
            "win on DENSITY, not length.\n"
            "Rules:\n"
            " • Integrate the views — do NOT append them or describe each separately.\n"
            " • Include the strongest points and any correct insight only one perspective found.\n"
            " • If they disagree, state the resolution in ONE line.\n"
            " • Cut filler, repetition, hedging, and preamble. Lead with the answer.\n"
            " • Preserve any [S#]/[L#] citation markers EXACTLY as written next to the claims "
            "they support; never renumber, merge, or invent a marker (R17).\n"
            " • Write ONE direct answer to the asker — never mention 'Perspective N' or that "
            "multiple answers exist.\n"
            f" • Length target: {max(60, int(longest * 0.8))}–{longest} words — as substantive as the "
            "best perspective, never padded, never a stub. End with one line 'Diverse angles: …' "
            "(≤15 words) naming the distinct contributions.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"PERSPECTIVES:\n{joined}\n\n"
            "Write the tight merged answer now."
        )
        # the synthesized text is the last untrusted-output hop before the report → strip hidden
        # chars, then drop any citation marker the synthesis invented (keep merges honest, R17)
        out = strip_invisible(self._generate(prompt, num_predict=min(900, max(300, longest * 2))))
        return _drop_invented_markers(out, _known_markers(answers))

    # ------------------------------------------------------------------ DELIBERATE (one blind call)
    def deliberate(self, question: str, answers: list) -> dict:
        """
        One blind pass that powers the UI: per-answer scores, a terse merge (TL;DR), and the
        'council read' — where the perspectives AGREE, where they DIFFER, and each UNIQUE point.
        Returns {"scores": {worker_id: score}, "merged": str,
                 "council": {"consensus": [...], "disagreements": [{"point","sides"}],
                             "unique": [{"worker_id","point"}]}}.
        Answers are anonymized + rotated so identity/position can't bias the read.
        Every field of the model's JSON is validated; anything malformed is dropped or
        defaulted (unscored answers get 5.0, an empty merge falls back to merge()).
        """
        order = self._blind_order(len(answers), question)
        blocks = [f"--- Answer {disp} ---\n{answers[real].text}"
                  for disp, real in enumerate(order, start=1)]
        joined = "\n\n".join(blocks)
        longest = max((len(a.text.split()) for a in answers), default=200)
        prompt = (
            "You are an impartial council secretary. Read the candidate answers to the question and "
            "return STRICT JSON only, no prose, with this exact shape:\n"
            '{"scores":[{"answer":1,"score":0-10}],'
            '"consensus":["points all/most answers agree on"],'
            '"disagreements":[{"point":"what they differ on","sides":"who says what"}],'
            '"unique":[{"answer":N,"point":"a valuable point only answer N made"}],'
            f'"merge":"a TIGHT synthesis of {max(60, int(longest * 0.8))}-{longest} words — as substantive '
            'as the best candidate, never padded, never a stub; integrated not appended, '
            'leading with the answer. Preserve any [S#]/[L#] citation markers exactly as written; '
            'never renumber or invent one. Write it as ONE direct answer to the asker — never mention '
            'Answer 1/2/3, candidates, or that multiple answers exist"}\n'
            "Judge on merit only; you do not know who wrote them.\n\n"
            f"QUESTION:\n{question}\n\nCANDIDATES:\n{joined}\n\nJSON:"
        )
        raw = self._generate(prompt, num_predict=min(1100, max(500, longest * 3)))
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):  # models sometimes emit a bare list — never crash
            parsed = {}

        def _wid(disp_idx):
            """worker_id behind a display index, or None when the index is invalid."""
            real = self._display_to_real(order, disp_idx)
            return None if real is None else answers[real].worker_id

        scores: dict[str, float] = {}
        for obj in self._as_list(parsed.get("scores")):
            if not isinstance(obj, dict):
                continue
            wid = _wid(obj.get("answer"))
            if wid is not None:
                scores[wid] = self._clamp_score(obj.get("score", 5.0))
        for a in answers:  # default any unscored answer
            scores.setdefault(a.worker_id, 5.0)

        unique = []
        for u in self._as_list(parsed.get("unique")):
            if not isinstance(u, dict):
                continue
            wid = _wid(u.get("answer"))
            pt = strip_invisible(str(u.get("point", "")).strip())
            if wid and pt:
                unique.append({"worker_id": wid, "point": pt})

        def _sides(v) -> str:
            if isinstance(v, dict):  # models sometimes return {"Answer 1": "...", ...}
                return " · ".join(f"{k}: {x}" for k, x in v.items())
            if isinstance(v, list):  # …or ["Answer 1: ...", "Answer 2: ..."]
                return " · ".join(str(x) for x in v)
            return str(v or "").strip()

        # every council-read string is model output → strip smuggled hidden characters
        consensus = [strip_invisible(str(x).strip())
                     for x in self._as_list(parsed.get("consensus")) if str(x).strip()][:6]
        disagreements = [
            {"point": strip_invisible(str(d.get("point", "")).strip()),
             "sides": strip_invisible(_sides(d.get("sides")))}
            for d in self._as_list(parsed.get("disagreements"))
            if isinstance(d, dict) and d.get("point")
        ][:6]
        merged = _drop_invented_markers(strip_invisible(str(parsed.get("merge", "")).strip()),
                                        _known_markers(answers))
        if not merged:  # fall back to the dedicated merge prompt if JSON merge was empty (already guarded)
            merged = self.merge(question, answers)
        return {"scores": scores, "merged": merged,
                "council": {"consensus": consensus, "disagreements": disagreements, "unique": unique}}

    # ------------------------------------------------------------------ 2b. COMPILE REPORT (D13)
    def compile_report(self, question: str, contributions: list[dict], read: dict,
                       local: bool = False) -> str:
        """
        Editor pass for `research_report` jobs: one cited markdown report from the
        per-country contributions. The model writes ONLY the executive summary and the
        agreement/difference synthesis; the per-country findings (with their [S#]
        citations and source lists) are assembled verbatim by code — small models
        garble citation markers if asked to rewrite them.
          contributions: payload answers [{country, model, lens, text, research}].
          read:          the deliberate() result (used as fallback + disagreement bullets).
          local:         True words the report for several models on one computer
                         instead of computers in several countries.
        Returns the markdown report; empty sections are omitted.
        """
        read = read or {}
        drafts = "\n\n".join(
            f"--- Contribution from {c.get('country', '?')} ---\n{strip_invisible((c.get('text') or '')[:2500])}"
            for c in contributions)
        raw = self._generate(
            "You are the editor of a multi-country research report. Read the brief and the "
            "contributions, then reply STRICT JSON only, no prose:\n"
            '{"summary":"a 120-180 word executive summary leading with the most '
            'decision-relevant findings (concrete numbers/dates; mention which country a '
            'finding came from)",'
            '"agreements":"2-3 sentences: what the countries\' findings agree on",'
            '"differences":"2-3 sentences: where they genuinely differ and why it matters"}\n\n'
            f"BRIEF:\n{question}\n\nCONTRIBUTIONS:\n{drafts}\n\nJSON:",
            num_predict=700)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):  # model may emit a list/garbage — never crash the report
            parsed = {}
        summary = strip_invisible(str(parsed.get("summary", "")).strip()) or str(read.get("merged", "")).strip()
        agreements = strip_invisible(str(parsed.get("agreements", "")).strip())
        differences = strip_invisible(str(parsed.get("differences", "")).strip())

        # str() keeps sorted()/join from raising when a payload carries a None country/model
        countries = sorted({str(c.get("country", "?")) for c in contributions})
        if local:
            byline = (f"_Researched independently by {len(contributions)} local model(s) "
                      f"({', '.join(str(c.get('model', '?')) for c in contributions)}) on this "
                      "computer, live web, compiled by a blind editor._")
        else:
            byline = (f"_Researched independently by {len(contributions)} computer(s) in "
                      f"{len(countries)} countr{'y' if len(countries) == 1 else 'ies'} "
                      f"({', '.join(countries)}), live web, compiled by a blind editor._")
        parts = [f"# Research report\n**Brief:** {question}", byline,
                 "## Executive summary", summary]
        if agreements or differences:
            parts.append("## Where the analysts agree — and differ" if local
                         else "## Where the countries agree — and differ")
            if agreements:
                parts.append(f"**Agree:** {agreements}")
            if differences:
                parts.append(f"**Differ:** {differences}")
            dis = (read.get("council") or {}).get("disagreements") or []
            if dis:
                parts.append("\n".join(f"- {d.get('point', '')}"
                                       + (f" — _{d['sides']}_" if d.get("sides") else "")
                                       for d in dis if isinstance(d, dict) and d.get("point")))
        parts.append("## Findings by analyst" if local else "## Findings by country")
        for c in contributions:
            head = c.get("model", "") if local else f"{c.get('country', '?')} — {c.get('model', '')}"
            parts.append(f"### {head}")
            # contribution text is model output that may echo an injected source → strip hidden
            # chars but KEEP markdown layout/citations (don't collapse indentation)
            parts.append(strip_invisible(c.get("text") or "") or "_(no findings)_")
        return "\n\n".join(p for p in parts if p)

    # ------------------------------------------------------------------ 2c. SPOT-CHECK (shard_map QA)
    def spot_check(self, instruction: str, answers: list[dict]) -> dict:
        """
        QA sampler for batch jobs: each node's payload entry carries a small `sample` of
        its (item, output) pairs. One blind call scores instruction-compliance per node
        0–10 (anonymized as Worker N). Returns {"scores": {worker_id: float}} plus empty
        "merged"/"council" fields so the result has the same shape as deliberate().
        Nodes the model did not score (or scored invalidly) default to 5.0.
        """
        order = self._blind_order(len(answers), instruction)
        blocks = []
        for disp, real in enumerate(order, start=1):
            sample = answers[real].get("sample") or []
            pairs = "\n".join(
                f" ITEM: {str(s.get('item', ''))[:300]}\n OUTPUT: {str(s.get('output', ''))[:400]}"
                for s in sample) or " (no sample — node returned nothing)"
            blocks.append(f"--- Worker {disp} sample ---\n{pairs}")
        raw = self._generate(
            "You are a QA inspector for batch work. Each worker applied the INSTRUCTION to "
            "its items; you see a random sample per worker. Score each worker 0-10 on "
            "instruction-compliance and output quality (0 = empty/garbage, 10 = flawless). "
            'Reply STRICT JSON only: {"scores":[{"worker":1,"score":0-10}]}\n\n'
            f"INSTRUCTION:\n{instruction}\n\n" + "\n\n".join(blocks) + "\n\nJSON:",
            num_predict=200)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):
            parsed = {}
        scores: dict[str, float] = {}
        for obj in self._as_list(parsed.get("scores")):
            if not isinstance(obj, dict):
                continue
            real = self._display_to_real(order, obj.get("worker"))
            if real is None:
                continue
            value = self._clamp_score(obj.get("score", 5.0), default=None)
            if value is None:
                continue
            try:
                scores[answers[real]["worker_id"]] = value
            except (KeyError, TypeError):
                continue
        for a in answers:
            scores.setdefault(a["worker_id"], 5.0)
        return {"scores": scores, "merged": "",
                "council": {"consensus": [], "disagreements": [], "unique": []}}

    # ------------------------------------------------------------------ 3. COMPARE (verification)
    def compare(self, question: str, text_a: str, text_b: str) -> dict:
        """Blind A/B. Returns {'winner': 'A'|'B'|'tie', 'reason': str}; any reply that
        does not name A or B counts as a tie."""
        prompt = (
            "Two answers to the same question are below. Decide which is the better answer "
            "(more complete, accurate, and useful). Judge blind.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"=== Answer A ===\n{text_a}\n\n"
            f"=== Answer B ===\n{text_b}\n\n"
            'Respond with ONLY JSON: {"winner": "A" | "B" | "tie", "reason": "..."}'
        )
        raw = self._generate(prompt, num_predict=300)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):
            parsed = {}
        winner = str(parsed.get("winner", "tie")).strip().upper()
        winner = winner if winner in {"A", "B", "TIE"} else "TIE"
        return {"winner": "tie" if winner == "TIE" else winner,
                "reason": strip_invisible(str(parsed.get("reason", "")).strip())}