PyPI - passiveworkers - Versions diffs - 0.1.0__py3-none-any.whl - Mend

passiveworkers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

council/__init__.py +1 -0
council/artifacts.py +161 -0
council/batch.py +84 -0
council/cli.py +54 -0
council/coordinator.py +133 -0
council/crypto.py +133 -0
council/fidelity.py +197 -0
council/judge.py +393 -0
council/ledger.py +230 -0
council/library.py +431 -0
council/local.py +228 -0
council/mcp_server.py +87 -0
council/net/__init__.py +1 -0
council/net/agent.py +231 -0
council/net/app.py +390 -0
council/net/baseline.py +86 -0
council/net/config.py +79 -0
council/net/coordinator_app.py +370 -0
council/net/dashboard.py +111 -0
council/net/store.py +964 -0
council/net/submit.py +102 -0
council/operator.py +412 -0
council/research.py +520 -0
council/researcher.py +300 -0
council/retrieval.py +80 -0
council/run_demo.py +175 -0
council/sanitize.py +78 -0
council/serve.py +183 -0
council/trust.py +168 -0
council/worker.py +123 -0
passiveworkers-0.1.0.dist-info/METADATA +269 -0
passiveworkers-0.1.0.dist-info/RECORD +36 -0
passiveworkers-0.1.0.dist-info/WHEEL +5 -0
passiveworkers-0.1.0.dist-info/entry_points.txt +2 -0
passiveworkers-0.1.0.dist-info/licenses/LICENSE +21 -0
passiveworkers-0.1.0.dist-info/top_level.txt +1 -0

council/fidelity.py ADDED Viewed

@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+"""
+council/fidelity.py — citation-grounding measurement (R15/D27, the honest eval core)
+====================================================================================
+Pure, dependency-free logic for the question the whole product rests on:
+    when a report says X [S3], does source S3 actually support X?
+This is a LEXICAL grounding *floor*, not a semantic judge. It reliably catches the
+failure modes that matter and recur:
+  • a citation pointing at an off-topic source (the claim and the cited source barely
+    share vocabulary), and
+  • a number / date / code in the claim that is ABSENT from the cited source — the
+    classic fabricated statistic.
+It CANNOT tell whether a source that shares the claim's vocabulary is being faithfully
+represented; that needs an entailment model (an optional, off-by-default hook in
+scripts/eval_citation_fidelity.py). So read a GROUNDED verdict as "not obviously
+fabricated", never as "proven true". A high score is necessary, not sufficient.
+No network, no Ollama, no new dependencies — it reuses council.retrieval.tokenize so
+the scorer tokenizes EXACTLY like the retriever it is grading.
+"""
+from __future__ import annotations
+import re
+from council.retrieval import tokenize
# Compact English stop-word set: content-overlap must not be inflated by glue words.
# (Kept small and obvious on purpose — this is a floor, not a linguistics project.)
STOPWORDS = frozenset("""
a an the and or but nor so yet if then else of to in on at by for with from into onto
over under above below up down out off about as is are was were be been being am do does
did has have had having will would shall should can could may might must not no than that
this these those it its their there here they them he she his her you your we our us i my
me what which who whom whose when where why how all any both each few more most other some
such only own same too very s t just also against between through during before after while
""".split())
# A single citation marker, e.g. S3 / L1. The word boundaries matter: without them a
# bracketed token that merely ENDS in a marker-like suffix ("[TLS1.2]" → "S1",
# "[XL5]" → "L5") would be read as a citation of a source that was never cited.
_MARKER = re.compile(r"\b[SL]\d+\b")
_BRACKET = re.compile(r"\[([^\]]*)\]")           # contents of every [...] span
_DIGIT = re.compile(r"\d")
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")       # naive sentence split (good enough for an eval)
# the headers researcher.py appends to separate a draft from its source listing
_SOURCE_HEADER = re.compile(r"(?m)^\s*(?:WEB SOURCES|YOUR DOCUMENTS)\b")
# a listing line: "[S1] Title (date) — https://host/x"  /  "[L1] title — /path/to.md"
# greedy up to the LAST em-dash/hyphen separator so an em-dash inside the title is tolerated
_SRC_LINE = re.compile(r"(?m)^\s*\[([SL]\d+)\]\s+.*[—–-]\s+(\S+)\s*$")
# a contribution heading in a compiled report ("### model" / "### Country — model")
_SECTION_SPLIT = re.compile(r"(?m)^###\s+")
+# ----------------------------------------------------------------------------- tokens
def _strip_markers(text: str) -> str:
    """Blank out every [...] span in *text* (each becomes a single space).

    Citation markers must never be scored as claim content: left in place, each cited
    sentence would carry a token such as 's3' that no source can contain. A falsy
    *text* (None / empty) yields the empty string.
    """
    if not text:
        return ""
    return _BRACKET.sub(" ", text)
def content_tokens(text: str) -> set[str]:
    """Return the meaningful tokens of *text* as a set.

    Stop-words are dropped, and so are single-character tokens — except single digits,
    which are kept because a lone '5' is exactly the kind of fact worth verifying.
    Tokenization is delegated to council.retrieval.tokenize.
    """
    kept = set()
    for tok in tokenize(text):
        if tok in STOPWORDS:
            continue
        if len(tok) > 1 or tok.isdigit():
            kept.add(tok)
    return kept
def numeric_tokens(text: str) -> set[str]:
    """Return every token of *text* that contains at least one digit
    (numbers, years, codes — e.g. INV-7731 → '7731')."""
    with_digit = set()
    for tok in tokenize(text):
        if _DIGIT.search(tok) is not None:
            with_digit.add(tok)
    return with_digit
def significant_numbers(text: str) -> set[str]:
    """Return the numeric tokens of *text* that are worth checking as FACTS.

    Only all-digit tokens of two or more characters qualify ('18', '2026', '7731').
    Left out on purpose:
      • single digits ('5') — too common to signal fabrication reliably;
      • decimals / alphanumeric codes ('4.2' → '4','2'; 'v1') — token overlap cannot
        match format variants ('4.2 million' vs '4,200,000'), so flagging them would
        only produce false positives.
    That is why num_cov is reported on its own and never folded into the headline
    score (review: HONESTY-001, BUG-001).
    """
    facts = set()
    for tok in tokenize(text):
        if len(tok) >= 2 and tok.isdigit():
            facts.add(tok)
    return facts
+# ----------------------------------------------------------------------------- scoring
def grounding_score(claim: str, source: str) -> dict:
    """Measure how well *claim* is lexically grounded in the text of its cited source(s).

    Citation markers are stripped from the claim before anything is counted.

    Returns a dict with:
      score            headline value, equal to content_cov (numbers are surfaced
                       separately rather than folded in, because legitimate format drift —
                       '4.2 million' vs '4,200,000' — would otherwise read as fabrication)
      content_cov      share of the claim's content tokens found in the source [0..1]
      num_cov          share of the claim's significant numbers found in the source
                       (1.0 when the claim has none; 0.0 when the source is empty)
      missing_numbers  sorted significant numbers in the claim that the source lacks
      n_content        number of content tokens in the claim (0 ⇒ nothing checkable)
      source_empty     True when there was no source text to check against
    """
    bare_claim = _strip_markers(claim)
    claim_tokens = content_tokens(bare_claim)
    n_content = len(claim_tokens)

    # No source text at all: nothing can be verified, every number counts as missing.
    if not (source or "").strip():
        return {"score": 0.0, "content_cov": 0.0, "num_cov": 0.0,
                "missing_numbers": sorted(significant_numbers(bare_claim)),
                "n_content": n_content, "source_empty": True}

    source_tokens = set(tokenize(source))

    # A marker-bearing sentence with no checkable content (e.g. a pure transition) is
    # not a fidelity failure; n_content == 0 lets the runner exclude it.
    if n_content == 0:
        return {"score": 1.0, "content_cov": 1.0, "num_cov": 1.0,
                "missing_numbers": [], "n_content": 0, "source_empty": False}

    coverage = round(len(claim_tokens & source_tokens) / n_content, 3)

    numbers = significant_numbers(bare_claim)
    absent = sorted(numbers - source_tokens)
    if numbers:
        number_coverage = (len(numbers) - len(absent)) / len(numbers)
    else:
        number_coverage = 1.0

    return {"score": coverage, "content_cov": coverage,
            "num_cov": round(number_coverage, 3), "missing_numbers": absent,
            "n_content": n_content, "source_empty": False}
def classify(g: dict, grounded: float = 0.5, weak: float = 0.3) -> str:
    """Map a grounding result to a verdict label.

    Checked in this order:
      UNVERIFIABLE  the source was empty — kept apart so a fetch failure is never
                    counted as a fabrication
      NO_CONTENT    the claim had no content tokens to check
      GROUNDED      content_cov >= grounded
      WEAK          content_cov >= weak
      UNGROUNDED    everything else
    """
    if g.get("source_empty"):
        return "UNVERIFIABLE"
    if g["n_content"] == 0:
        return "NO_CONTENT"
    coverage = g["content_cov"]
    for label, floor in (("GROUNDED", grounded), ("WEAK", weak)):
        if coverage >= floor:
            return label
    return "UNGROUNDED"
+# ----------------------------------------------------------------------------- parsing
def split_draft(text: str) -> str:
    """Return the draft body: everything before the first WEB SOURCES / YOUR DOCUMENTS
    header line, with trailing whitespace removed. Without such a header the whole text
    is returned (a falsy *text* gives the empty string)."""
    body = text or ""
    header = _SOURCE_HEADER.search(body)
    if header is not None:
        body = body[:header.start()]
    return body.rstrip()
def markers_in(text: str) -> list[str]:
    """List the citation markers found inside any [...] span of *text*.

    Duplicates are removed and first-seen order is kept. Covers [S1], [S1, S2],
    [S1][S2] and [L1]; brackets holding no marker (e.g. [2024]) contribute nothing.
    """
    found = (marker
             for span in _BRACKET.findall(text or "")
             for marker in _MARKER.findall(span))
    # dict keys keep insertion order, which gives an order-preserving de-duplication
    return list(dict.fromkeys(found))
def parse_cited_claims(draft: str) -> list[tuple[str, list[str]]]:
    """Split the draft body into sentences and keep those that carry at least one
    [S#]/[L#] marker, as (sentence, [markers]) pairs in document order."""
    claims = []
    for piece in _SENT_SPLIT.split(split_draft(draft)):
        sentence = piece.strip()
        cited = markers_in(sentence) if sentence else []
        if cited:
            claims.append((sentence, cited))
    return claims
def parse_source_map(text: str) -> dict[str, str]:
    """Build a marker → URL/path mapping from the source-listing lines
    ('[S1] Title — https://x'). If a marker is listed twice, the last line wins."""
    mapping = {}
    for line in _SRC_LINE.finditer(text or ""):
        marker, target = line.group(1), line.group(2)
        mapping[marker] = target
    return mapping
def split_sections(report: str) -> list[str]:
    """Return the per-analyst '### …' blocks of a compiled report (each one is
    self-contained: a draft plus its own source list).

    Whatever precedes the first heading (summary, agree/differ) is discarded — the
    executive summary carries no citations to grade. Whitespace-only blocks are skipped;
    a report with no '### ' heading yields an empty list.
    """
    _preamble, *blocks = _SECTION_SPLIT.split(report or "")
    return [block_text for block_text in blocks if block_text.strip()]
+# ----------------------------------------------------------------------------- one claim
def score_claim(claim: str, markers: list[str], sources: dict[str, str]) -> dict:
    """Ground one claim against the UNION of the sources it cites.

    A sentence is judged against every source it points at, because the fact may be
    split across them. The claim is UNVERIFIABLE only when ALL of its cited sources are
    empty/unreachable.

    Parameters:
      claim    the sentence, citation markers included
      markers  the markers the sentence cites, e.g. ['S1', 'S2']
      sources  marker → fetched source text; a missing key or a None/empty value
               means the source could not be obtained

    Returns the grounding_score() dict extended with:
      markers        copy of *markers*
      cited_present  the markers whose source text is non-blank
      claim          the claim with surrounding whitespace stripped

    Limitation (review: HONESTY-005): checking the union means a claim asserting a
    RELATIONSHIP across sources ("X founded Y [S1, S2]") can pass when S1 mentions X and
    S2 mentions Y, even if neither source states the relationship. Union grounding proves
    the terms are attributable to the cited set, not that the claim's logic is supported —
    another reason a GROUNDED verdict is a floor, not proof of truth.
    """
    # Normalize once: a source recorded as None (failed fetch) must behave like a
    # missing one instead of breaking the join below.
    cited_texts = [sources.get(m) or "" for m in markers]
    g = grounding_score(claim, "\n".join(cited_texts))
    g["markers"] = list(markers)
    g["cited_present"] = [m for m, body in zip(markers, cited_texts) if body.strip()]
    g["claim"] = claim.strip()
    return g

council/judge.py ADDED Viewed

@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""
+council/judge.py — Score, then MERGE
+====================================
+The judge is what turns "many answers" into "better intelligence":
+  1. SCORE — reads the candidate answers BLIND (anonymized, shuffled order so neither
+     identity nor position biases the result) and rates each 0-10. The scores feed the
+     ledger, so good answers earn more credit (ideas compete).
+  2. MERGE — synthesizes the candidates into one answer that is better than any single
+     one: it keeps the consensus, ADDS the unique points only one perspective found, and
+     reconciles disagreements instead of hiding them. This is the diversity dividend.
+  3. COMPARE — a blind A/B check (used for verification) of merged vs. best-single.
+Use a STRONG model, ideally a different family from the workers, at temperature 0 so
+judging is steady while the workers diverge.
+"""
+from __future__ import annotations
+import json
+import os
+import re
+import time
+from dataclasses import dataclass
+from typing import Optional
+import requests
+from council.sanitize import strip_invisible
# Default base URL of the Ollama HTTP API; Judge appends "/api/generate" to it.
OLLAMA_BASE = "http://localhost:11434"
def _extract_json(text: str):
    """Tolerantly pull the first JSON value out of a model response.

    Strategy:
      1. If the response contains a ``` / ```json fence, only its contents are considered.
      2. Try to parse the whole candidate as JSON.
      3. Otherwise try to decode a value starting at each '[' or '{', in order of
         appearance, and return the first one that parses. Starting from the EARLIEST
         opener means an object wrapped in prose is returned whole rather than one of the
         arrays nested inside it, and using the real JSON decoder (instead of counting
         brackets) keeps '[' / ']' / '{' / '}' inside string values from confusing it.

    Returns the decoded value, or None when nothing parses.
    """
    text = text or ""
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
    candidate = (fenced.group(1) if fenced else text).strip()
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        pass
    decoder = json.JSONDecoder()
    for opener in re.finditer(r"[\[{]", candidate):
        try:
            value, _end = decoder.raw_decode(candidate, opener.start())
        except (ValueError, RecursionError):
            continue        # not valid JSON from this opener — try the next one
        return value
    return None
# One citation token (S3 / L1). Word boundaries stop ordinary words that end in a
# marker-like suffix ("TLS1", "XL5") from being mistaken for a citation.
_CITE_TOKEN = re.compile(r"\b[SL]\d+\b")
# A bracketed citation span holding one or more comma-separated tokens: [S1] / [S1, L2].
_CITE_SPAN = re.compile(r"\[[SL]\d+(?:\s*,\s*[SL]\d+)*\]")
def _known_markers(answers) -> set[str]:
    """Collect every S#/L# token that occurs in the text of the source answers — the
    only citations a synthesis is allowed to keep. Answers lacking a `text` attribute,
    or whose text is None/empty, contribute nothing."""
    texts = [(getattr(answer, "text", "") or "") for answer in answers]
    return set(_CITE_TOKEN.findall(" ".join(texts)))
def _drop_invented_markers(text: str, known: set[str]) -> str:
    """Strip citation markers that the synthesis INVENTED.

    Each bracketed citation span is rewritten to hold only the tokens present in *known*
    (the markers seen in the source answers); a span left with no token is removed
    entirely. So even a merge that ignores the prompt rule cannot fabricate a citation
    (review CITATION_MERGE_001). The merge may still drop or renumber markers — the
    prompt discourages that — but it can never point at a source that was never cited.
    """
    if "[" not in text:      # fast path: no bracket, nothing to rewrite
        return text

    def _keep_known(span: re.Match) -> str:
        survivors = ", ".join(tok for tok in _CITE_TOKEN.findall(span.group(0))
                              if tok in known)
        return f"[{survivors}]" if survivors else ""

    return _CITE_SPAN.sub(_keep_known, text)
@dataclass
class ScoredCandidate:
    """One judged answer, as returned by Judge.score()."""
    # identifier of the worker that produced the answer
    worker_id: str
    # merit score; Judge.score() clamps it to the 0-10 range
    score: float
    # the judge's stated justification (or a placeholder when the answer went unscored)
    reason: str
@dataclass
class Judge:
    """Blind LLM judge backed by an Ollama model.

    Offers scoring (score), synthesis (merge), a combined one-call read (deliberate),
    report compilation (compile_report), batch QA (spot_check) and a blind A/B check
    (compare). Every call goes through _generate() at temperature 0.

    All model output is treated as untrusted: JSON is parsed tolerantly, malformed
    entries are skipped rather than raised on, scores are clamped to 0-10, and text that
    reaches callers is passed through strip_invisible().
    """
    model: str                        # Ollama model name sent with every request
    ollama_base: str = OLLAMA_BASE    # base URL of the Ollama HTTP API
    num_predict: int = 900            # generation cap used when a call passes none

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _blind_order(n: int, seed_text: str) -> list[int]:
        """Return the indices 0..n-1 rotated by len(seed_text) % n.

        Deterministic for a given prompt, but the rotation differs between prompts, so
        no worker is systematically shown in the same position. (Rotating by
        len(answers) % len(answers), as before, is always a rotation by zero.)
        """
        if n <= 0:
            return []
        rot = len(seed_text or "") % n
        indices = list(range(n))
        return indices[rot:] + indices[:rot]

    @staticmethod
    def _real_index(order: list[int], display_idx) -> Optional[int]:
        """Translate a 1-based display number from the model into an index into the
        original answers list, or None when it is not an integer in 1..len(order).
        The explicit range check matters: 0 or a negative number must not wrap around
        to the last answer through Python's negative indexing."""
        try:
            pos = int(display_idx) - 1
        except (TypeError, ValueError, OverflowError):
            return None
        return order[pos] if 0 <= pos < len(order) else None

    @staticmethod
    def _clamp_score(value, default: float = 5.0) -> float:
        """Coerce a model-supplied score to a float in [0, 10]; anything unparseable,
        or NaN (which min/max would otherwise turn into 10.0), becomes *default*."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:        # NaN is the only value not equal to itself
            return default
        return max(0.0, min(10.0, number))

    @staticmethod
    def _as_list(value) -> list:
        """Return *value* if it is a list, else an empty list (models sometimes emit a
        string, number or object where an array was requested)."""
        return value if isinstance(value, list) else []

    @staticmethod
    def _length_target(answers: list) -> tuple[int, int, int]:
        """Return (longest, low, high): the word count of the longest answer and the
        word range requested from the synthesis. `high` never drops below `low`, so
        short answers cannot produce an inverted range such as '60–50 words'."""
        longest = max((len((getattr(a, "text", "") or "").split()) for a in answers),
                      default=200)
        low = max(60, int(longest * 0.8))
        return longest, low, max(low, longest)

    def _generate(self, prompt: str, num_predict: Optional[int] = None) -> str:
        """POST *prompt* to Ollama's /api/generate (non-streaming, temperature 0) and
        return the stripped `response` text ('' if absent).

        Raises whatever requests raises on connection failure/timeout, and
        requests.HTTPError on a non-2xx status (raise_for_status).
        """
        resp = requests.post(
            f"{self.ollama_base}/api/generate",
            json={
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.0, "num_predict": num_predict or self.num_predict},
                "keep_alive": os.environ.get("PW_OLLAMA_KEEP_ALIVE", "30m"),  # warm judge (R17)
            },
            # CPU-only/busy machines need headroom (measured: a 4B judge can exceed 400s
            # under contention); configurable like the worker/researcher timeouts.
            timeout=float(os.environ.get("PW_JUDGE_TIMEOUT",
                                         os.environ.get("PW_OLLAMA_TIMEOUT", "400"))),
        )
        resp.raise_for_status()
        return (resp.json().get("response") or "").strip()

    # ------------------------------------------------------------------ 1. SCORE
    def score(self, question: str, answers: list) -> list[ScoredCandidate]:
        """Score each answer 0-10, blind.

        answers: list[council.worker.Answer] (objects with .text and .worker_id).
        The answers are shown to the model anonymized and in a rotated order; the
        returned list is in the SAME order as *answers*. An answer the model did not
        score (or scored unparseably) gets 5.0 with a placeholder reason.
        """
        order = self._blind_order(len(answers), question)
        joined = "\n\n".join(
            f"--- Answer {display_idx} ---\n{answers[real_idx].text}"
            for display_idx, real_idx in enumerate(order, start=1))
        prompt = (
            "You are an impartial judge. Score each candidate answer to the question on a "
            "0-10 scale for correctness, depth, usefulness, and insight. Judge only on merit; "
            "you do not know who wrote them.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"CANDIDATES:\n{joined}\n\n"
            "Respond with ONLY a JSON array, one object per answer, like:\n"
            '[{"answer": 1, "score": 7.5, "reason": "..."}, ...]'
        )
        raw = self._generate(prompt, num_predict=600)
        parsed = _extract_json(raw) or []
        # display number -> (score, reason), keeping only well-formed entries
        by_display = {}
        if isinstance(parsed, list):
            for obj in parsed:
                try:
                    display_idx = int(obj["answer"])
                    value = float(obj["score"])
                except (KeyError, TypeError, ValueError, OverflowError):
                    continue
                if value != value:      # NaN — treat as unscored
                    continue
                by_display[display_idx] = (
                    value, strip_invisible(str(obj.get("reason", "")).strip()))
        # Map display numbers back to the real answers, then emit in input order.
        by_real = {}
        for display_idx, real_idx in enumerate(order, start=1):
            value, reason = by_display.get(display_idx, (5.0, "(unscored — defaulted)"))
            by_real[real_idx] = ScoredCandidate(
                answers[real_idx].worker_id, max(0.0, min(10.0, value)), reason)
        return [by_real[i] for i in range(len(answers))]

    # ------------------------------------------------------------------ 2. MERGE
    def merge(self, question: str, answers: list) -> str:
        """Synthesize the answers into one denser answer and return its text, with
        hidden characters stripped and any citation marker that appears in none of the
        source answers removed."""
        blocks = [f"--- Perspective {i + 1} ---\n{a.text}" for i, a in enumerate(answers)]
        joined = "\n\n".join(blocks)
        longest, low, high = self._length_target(answers)
        prompt = (
            "You are a synthesizer. Several independent perspectives answer the same question below. "
            "Write ONE answer that is strictly BETTER and NO LONGER than the best single perspective — "
            "win on DENSITY, not length.\n"
            "Rules:\n"
            "  • Integrate the views — do NOT append them or describe each separately.\n"
            "  • Include the strongest points and any correct insight only one perspective found.\n"
            "  • If they disagree, state the resolution in ONE line.\n"
            "  • Cut filler, repetition, hedging, and preamble. Lead with the answer.\n"
            "  • Preserve any [S#]/[L#] citation markers EXACTLY as written next to the claims "
            "they support; never renumber, merge, or invent a marker (R17).\n"
            "  • Write ONE direct answer to the asker — never mention 'Perspective N' or that "
            "multiple answers exist.\n"
            f"  • Length target: {low}–{high} words — as substantive as the "
            "best perspective, never padded, never a stub. End with one line 'Diverse angles: …' "
            "(≤15 words) naming the distinct contributions.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"PERSPECTIVES:\n{joined}\n\n"
            "Write the tight merged answer now."
        )
        # the synthesized text is the last untrusted-output hop before the report → strip hidden
        # chars, then drop any citation marker the synthesis invented (keep merges honest, R17)
        out = strip_invisible(self._generate(prompt, num_predict=min(900, max(300, longest * 2))))
        return _drop_invented_markers(out, _known_markers(answers))

    # ------------------------------------------------------------------ DELIBERATE (one blind call)
    def deliberate(self, question: str, answers: list) -> dict:
        """
        One blind pass that powers the UI: per-answer scores, a terse merge (TL;DR), and the
        'council read' — where the perspectives AGREE, where they DIFFER, and each UNIQUE point.
        Returns {"scores": {worker_id: score}, "merged": str,
                 "council": {"consensus": [...], "disagreements": [{"point","sides"}],
                             "unique": [{"worker_id","point"}]}}.
        Answers are anonymized + rotated so identity/position can't bias the read.
        Every answer gets a score (5.0 when the model gave none); consensus and
        disagreements are capped at 6 entries each; if the JSON carried no merge text,
        merge() is called as a fallback.
        """
        order = self._blind_order(len(answers), question)
        blocks = []
        for disp, real in enumerate(order, start=1):
            blocks.append(f"--- Answer {disp} ---\n{answers[real].text}")
        joined = "\n\n".join(blocks)
        longest, low, high = self._length_target(answers)
        prompt = (
            "You are an impartial council secretary. Read the candidate answers to the question and "
            "return STRICT JSON only, no prose, with this exact shape:\n"
            '{"scores":[{"answer":1,"score":0-10}],'
            '"consensus":["points all/most answers agree on"],'
            '"disagreements":[{"point":"what they differ on","sides":"who says what"}],'
            '"unique":[{"answer":N,"point":"a valuable point only answer N made"}],'
            f'"merge":"a TIGHT synthesis of {low}-{high} words — as substantive '
            'as the best candidate, never padded, never a stub; integrated not appended, '
            'leading with the answer. Preserve any [S#]/[L#] citation markers exactly as written; '
            'never renumber or invent one. Write it as ONE direct answer to the asker — never mention '
            'Answer 1/2/3, candidates, or that multiple answers exist"}\n'
            "Judge on merit only; you do not know who wrote them.\n\n"
            f"QUESTION:\n{question}\n\nCANDIDATES:\n{joined}\n\nJSON:"
        )
        raw = self._generate(prompt, num_predict=min(1100, max(500, longest * 3)))
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):   # models sometimes emit a bare list — never crash
            parsed = {}

        def _wid(disp_idx):
            """worker_id behind a display number, or None if the number is unusable."""
            real = self._real_index(order, disp_idx)
            return None if real is None else answers[real].worker_id

        scores: dict[str, float] = {}
        for obj in self._as_list(parsed.get("scores")):
            if not isinstance(obj, dict):
                continue
            wid = _wid(obj.get("answer"))
            if wid is not None:
                scores[wid] = self._clamp_score(obj.get("score", 5.0))
        for a in answers:                       # default any unscored answer
            scores.setdefault(a.worker_id, 5.0)

        unique = []
        for u in self._as_list(parsed.get("unique")):
            if not isinstance(u, dict):
                continue
            wid = _wid(u.get("answer"))
            pt = strip_invisible(str(u.get("point", "")).strip())
            if wid and pt:
                unique.append({"worker_id": wid, "point": pt})

        def _sides(v) -> str:
            if isinstance(v, dict):    # models sometimes return {"Answer 1": "...", ...}
                return " · ".join(f"{k}: {x}" for k, x in v.items())
            if isinstance(v, list):    # …or ["Answer 1: ...", "Answer 2: ..."]
                return " · ".join(str(x) for x in v)
            return str(v or "").strip()

        # every council-read string is model output → strip smuggled hidden characters
        raw_consensus = parsed.get("consensus")
        if isinstance(raw_consensus, str):      # a lone string is one point, not characters
            raw_consensus = [raw_consensus]
        consensus = [strip_invisible(str(x).strip())
                     for x in self._as_list(raw_consensus) if str(x).strip()][:6]
        disagreements = [
            {"point": strip_invisible(str(d.get("point", "")).strip()),
             "sides": strip_invisible(_sides(d.get("sides")))}
            for d in self._as_list(parsed.get("disagreements"))
            if isinstance(d, dict) and d.get("point")
        ][:6]
        merged = _drop_invented_markers(strip_invisible(str(parsed.get("merge", "")).strip()),
                                        _known_markers(answers))
        if not merged:   # fall back to the dedicated merge prompt if JSON merge was empty (already guarded)
            merged = self.merge(question, answers)
        return {"scores": scores, "merged": merged,
                "council": {"consensus": consensus, "disagreements": disagreements, "unique": unique}}

    # ------------------------------------------------------------------ 2b. COMPILE REPORT (D13)
    def compile_report(self, question: str, contributions: list[dict], read: dict,
                       local: bool = False) -> str:
        """
        Editor pass for `research_report` jobs: one cited markdown report from the
        per-country contributions. The model writes ONLY the executive summary and the
        agreement/difference synthesis; the per-country findings (with their [S#]
        citations and source lists) are assembled verbatim by code — small models
        garble citation markers if asked to rewrite them.
        contributions: payload answers [{country, model, lens, text, research}].
        read: the deliberate() result (used as fallback + disagreement bullets).
        local: True words the byline and headings for several models on one computer
               instead of computers in several countries.
        A missing/None country or model is rendered as '?' in the byline.
        """
        read = read or {}
        drafts = "\n\n".join(
            f"--- Contribution from {c.get('country', '?')} ---\n{strip_invisible((c.get('text') or '')[:2500])}"
            for c in contributions)
        raw = self._generate(
            "You are the editor of a multi-country research report. Read the brief and the "
            "contributions, then reply STRICT JSON only, no prose:\n"
            '{"summary":"a 120-180 word executive summary leading with the most '
            'decision-relevant findings (concrete numbers/dates; mention which country a '
            'finding came from)",'
            '"agreements":"2-3 sentences: what the countries\' findings agree on",'
            '"differences":"2-3 sentences: where they genuinely differ and why it matters"}\n\n'
            f"BRIEF:\n{question}\n\nCONTRIBUTIONS:\n{drafts}\n\nJSON:",
            num_predict=700)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):   # model may emit a list/garbage — never crash the report
            parsed = {}
        summary = strip_invisible(str(parsed.get("summary", "")).strip()) or str(read.get("merged", "")).strip()
        agreements = strip_invisible(str(parsed.get("agreements", "")).strip())
        differences = strip_invisible(str(parsed.get("differences", "")).strip())
        # str(... or "?") so a None value can neither break sorted() nor ", ".join()
        countries = sorted({str(c.get("country") or "?") for c in contributions})
        if local:
            models = ", ".join(str(c.get("model") or "?") for c in contributions)
            byline = (f"_Researched independently by {len(contributions)} local model(s) "
                      f"({models}) on this "
                      "computer, live web, compiled by a blind editor._")
        else:
            byline = (f"_Researched independently by {len(contributions)} computer(s) in "
                      f"{len(countries)} countr{'y' if len(countries) == 1 else 'ies'} "
                      f"({', '.join(countries)}), live web, compiled by a blind editor._")
        parts = [f"# Research report\n**Brief:** {question}", byline,
                 "## Executive summary", summary]
        if agreements or differences:
            parts.append("## Where the analysts agree — and differ" if local
                         else "## Where the countries agree — and differ")
            if agreements:
                parts.append(f"**Agree:** {agreements}")
            if differences:
                parts.append(f"**Differ:** {differences}")
        dis = (read.get("council") or {}).get("disagreements") or []
        if dis:
            parts.append("\n".join(f"- {d.get('point', '')}"
                                   + (f" — _{d['sides']}_" if d.get("sides") else "")
                                   for d in dis if isinstance(d, dict) and d.get("point")))
        parts.append("## Findings by analyst" if local else "## Findings by country")
        for c in contributions:
            head = c.get("model", "") if local else f"{c.get('country', '?')} — {c.get('model', '')}"
            parts.append(f"### {head}")
            # contribution text is model output that may echo an injected source → strip hidden
            # chars but KEEP markdown layout/citations (don't collapse indentation)
            parts.append(strip_invisible(c.get("text") or "") or "_(no findings)_")
        return "\n\n".join(p for p in parts if p)

    # ------------------------------------------------------------------ 2c. SPOT-CHECK (shard_map QA)
    def spot_check(self, instruction: str, answers: list[dict]) -> dict:
        """
        QA sampler for batch jobs: each node's payload entry carries a small `sample` of
        its (item, output) pairs. One blind call scores instruction-compliance per node
        0–10 (anonymized as Worker N, in a rotated order). Returns the same shape as
        deliberate() — {"scores": {worker_id: float}, "merged": "", "council": {...}} —
        with only `scores` populated; a node the model did not score gets 5.0.
        """
        order = self._blind_order(len(answers), instruction)
        blocks = []
        for disp, real in enumerate(order, start=1):
            sample = answers[real].get("sample") or []
            pairs = "\n".join(
                f"  ITEM: {str(s.get('item', ''))[:300]}\n  OUTPUT: {str(s.get('output', ''))[:400]}"
                for s in sample if isinstance(s, dict)) or "  (no sample — node returned nothing)"
            blocks.append(f"--- Worker {disp} sample ---\n{pairs}")
        raw = self._generate(
            "You are a QA inspector for batch work. Each worker applied the INSTRUCTION to "
            "its items; you see a random sample per worker. Score each worker 0-10 on "
            "instruction-compliance and output quality (0 = empty/garbage, 10 = flawless). "
            'Reply STRICT JSON only: {"scores":[{"worker":1,"score":0-10}]}\n\n'
            f"INSTRUCTION:\n{instruction}\n\n" + "\n\n".join(blocks) + "\n\nJSON:",
            num_predict=200)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):
            parsed = {}
        scores: dict[str, float] = {}
        for obj in self._as_list(parsed.get("scores")):
            if not isinstance(obj, dict):
                continue
            real = self._real_index(order, obj.get("worker"))
            if real is None:
                continue
            try:
                value = float(obj.get("score", 5.0))
                wid = answers[real]["worker_id"]
            except (TypeError, ValueError, KeyError):
                continue
            if value != value:          # NaN — leave the node to the 5.0 default
                continue
            scores[wid] = max(0.0, min(10.0, value))
        for a in answers:
            scores.setdefault(a["worker_id"], 5.0)
        return {"scores": scores, "merged": "",
                "council": {"consensus": [], "disagreements": [], "unique": []}}

    # ------------------------------------------------------------------ 3. COMPARE (verification)
    def compare(self, question: str, text_a: str, text_b: str) -> dict:
        """Blind A/B. Returns {'winner': 'A'|'B'|'tie', 'reason': str}; any verdict
        that is missing or not one of A/B/tie (case-insensitive) is reported as 'tie'."""
        prompt = (
            "Two answers to the same question are below. Decide which is the better answer "
            "(more complete, accurate, and useful). Judge blind.\n\n"
            f"QUESTION:\n{question}\n\n"
            f"=== Answer A ===\n{text_a}\n\n"
            f"=== Answer B ===\n{text_b}\n\n"
            'Respond with ONLY JSON: {"winner": "A" | "B" | "tie", "reason": "..."}'
        )
        raw = self._generate(prompt, num_predict=300)
        parsed = _extract_json(raw)
        if not isinstance(parsed, dict):
            parsed = {}
        winner = str(parsed.get("winner", "tie")).strip().upper()
        winner = winner if winner in {"A", "B", "TIE"} else "TIE"
        return {"winner": "tie" if winner == "TIE" else winner,
                "reason": strip_invisible(str(parsed.get("reason", "")).strip())}