scroot 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. scroot/__init__.py +109 -0
  2. scroot/agents.py +345 -0
  3. scroot/audit.py +131 -0
  4. scroot/cli/__init__.py +167 -0
  5. scroot/cli/download.py +49 -0
  6. scroot/cli/eval.py +230 -0
  7. scroot/cli/model_info.py +28 -0
  8. scroot/composite.py +170 -0
  9. scroot/config/__init__.py +0 -0
  10. scroot/config/corrector.py +92 -0
  11. scroot/connectors/__init__.py +5 -0
  12. scroot/connectors/database.py +357 -0
  13. scroot/context/__init__.py +9 -0
  14. scroot/context/adapters.py +86 -0
  15. scroot/context/builder.py +514 -0
  16. scroot/context/dedup.py +99 -0
  17. scroot/context/payload.py +66 -0
  18. scroot/context/pii.py +101 -0
  19. scroot/context/tokenizer.py +42 -0
  20. scroot/core.py +349 -0
  21. scroot/corrector/__init__.py +38 -0
  22. scroot/corrector/api.py +145 -0
  23. scroot/corrector/base.py +20 -0
  24. scroot/corrector/disabled.py +13 -0
  25. scroot/corrector/local.py +112 -0
  26. scroot/corrector/models.py +69 -0
  27. scroot/dashboard/__init__.py +0 -0
  28. scroot/dashboard/__main__.py +37 -0
  29. scroot/dashboard/routers/__init__.py +0 -0
  30. scroot/dashboard/routers/analytics.py +236 -0
  31. scroot/dashboard/routers/corrector.py +230 -0
  32. scroot/dashboard/routers/export.py +150 -0
  33. scroot/dashboard/routers/guardrails.py +41 -0
  34. scroot/dashboard/routers/pipeline.py +218 -0
  35. scroot/dashboard/routers/queue.py +188 -0
  36. scroot/dashboard/routers/records.py +252 -0
  37. scroot/dashboard/routers/settings.py +291 -0
  38. scroot/dashboard/security.py +135 -0
  39. scroot/dashboard/server.py +181 -0
  40. scroot/evidence.py +228 -0
  41. scroot/exceptions.py +62 -0
  42. scroot/feedback/__init__.py +6 -0
  43. scroot/feedback/injector.py +160 -0
  44. scroot/feedback/sanitizer.py +56 -0
  45. scroot/feedback/store.py +650 -0
  46. scroot/flags.py +42 -0
  47. scroot/metrics/__init__.py +15 -0
  48. scroot/metrics/_utils.py +9 -0
  49. scroot/metrics/completeness.py +139 -0
  50. scroot/metrics/confidence.py +83 -0
  51. scroot/metrics/consistency.py +125 -0
  52. scroot/metrics/groundedness.py +193 -0
  53. scroot/metrics/relevance.py +73 -0
  54. scroot/models.py +214 -0
  55. scroot/result.py +276 -0
  56. scroot/sampling.py +306 -0
  57. scroot/text_utils.py +136 -0
  58. scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
  59. scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
  60. scroot/ui/dist/favicon.svg +27 -0
  61. scroot/ui/dist/index.html +20 -0
  62. scroot-0.2.0.dist-info/METADATA +832 -0
  63. scroot-0.2.0.dist-info/RECORD +67 -0
  64. scroot-0.2.0.dist-info/WHEEL +5 -0
  65. scroot-0.2.0.dist-info/entry_points.txt +2 -0
  66. scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
  67. scroot-0.2.0.dist-info/top_level.txt +1 -0
scroot/context/pii.py ADDED
@@ -0,0 +1,101 @@
1
+ """PII detection and scrubbing for context content.
2
+
3
+ Regex-based, fully local - no external API call, consistent with
4
+ scroot's zero-external-dependency principle. Detected entities are
5
+ replaced with typed placeholders (e.g. ``[EMAIL]``); the scrub summary
6
+ records counts by entity type only, never the original values.
7
+
8
+ Detection is best-effort: regex catches structured PII (emails, phones,
9
+ SSNs, cards, IPs, secrets, dates, street addresses) reliably, and person
10
+ names via honorifics and a common-first-name heuristic. For regulated
11
+ workloads, layer a dedicated NER scrubber in front and pass pre-scrubbed
12
+ text in with ``pii_scrub=False``.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from dataclasses import dataclass
19
+
20
+
21
@dataclass
class ScrubResult:
    """Outcome of scrubbing a single piece of text.

    Attributes:
        scrubbed_text: The input with every detected entity swapped for a
            typed placeholder such as ``[EMAIL]``.
        summary: Number of replacements per entity type, plus a
            ``total_entities_scrubbed`` entry. Holds counts only - the
            matched values themselves are never stored.
        was_scrubbed: True when at least one replacement was made.
    """

    scrubbed_text: str
    summary: dict[str, int]
    was_scrubbed: bool


# Alternation of common first names, used for best-effort [PERSON]
# detection when no honorific is present. It is embedded in the PERSON
# pattern below to match "<FirstName> <Capitalized Surname>".
_COMMON_FIRST_NAMES = "|".join((
    "James", "John", "Robert", "Michael", "William", "David", "Richard",
    "Joseph", "Thomas", "Charles",
    "Christopher", "Daniel", "Matthew", "Anthony", "Mark", "Donald",
    "Steven", "Paul", "Andrew", "Joshua",
    "Kenneth", "Kevin", "Brian", "George", "Timothy", "Ronald", "Edward",
    "Jason", "Jeffrey", "Ryan",
    "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Barbara",
    "Susan", "Jessica", "Sarah", "Karen",
    "Lisa", "Nancy", "Betty", "Margaret", "Sandra", "Ashley", "Kimberly",
    "Emily", "Donna", "Michelle",
    "Carol", "Amanda", "Dorothy", "Melissa", "Deborah", "Stephanie",
    "Rebecca", "Sharon", "Laura",
    "Jane", "Emma", "Olivia", "Sophia", "Alice", "Anna", "Maria", "Rachel",
    "Hannah", "Grace",
))

# Insertion order is the priority order: ``scrub`` applies the patterns
# top to bottom, so e.g. an API key becomes [SECRET] before any of the
# more generic patterns gets a chance to match parts of it.
_PATTERNS: dict[str, re.Pattern] = {
    "SECRET": re.compile(
        r"\b(?:sk-ant-[a-zA-Z0-9-]{20,}|sk-[a-zA-Z0-9]{20,}|"
        r"AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36}|[a-f0-9]{32,})\b"
    ),
    "EMAIL": re.compile(r"\b[\w.+-]+@[\w-]+\.[a-zA-Z]{2,}\b"),
    "CARD": re.compile(r"\b(?:\d[ -]?){13,16}\b"),
    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    "IP": re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b"),
    "PHONE": re.compile(
        r"(?:\+?\d{1,2}[-.\s])?\(?\d{3}\)?[-.\s]\d{3,4}[-.\s]?\d{0,4}\b"
        r"|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b"
    ),
    "DOB": re.compile(
        r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|"
        r"Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|"
        r"Nov(?:ember)?|Dec(?:ember)?)\.?\s+\d{1,2},?\s+\d{4}\b"
        r"|\b\d{1,2}/\d{1,2}/\d{4}\b"
    ),
    "ADDRESS": re.compile(
        r"\b\d{1,5}\s+(?:[A-Z][a-zA-Z]+\s+){1,3}"
        r"(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard|Ln|Lane|"
        r"Dr|Drive|Ct|Court|Way|Pl|Place)\b\.?"
    ),
    "PERSON": re.compile(
        r"\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?"
        r"|\b(?:" + _COMMON_FIRST_NAMES + r")\s+[A-Z][a-z]+\b"
    ),
}


def scrub(text: str) -> ScrubResult:
    """Swap every detected PII entity in ``text`` for a typed placeholder.

    The patterns in ``_PATTERNS`` are applied one after another, each on the
    output of the previous one, so earlier (higher-priority) entity types
    win when patterns could overlap.

    Args:
        text: Raw text that may contain PII.

    Returns:
        A :class:`ScrubResult` holding the scrubbed text and a summary that
        contains replacement counts only; matched values are not kept.
    """
    counts: dict[str, int] = {}
    scrubbed = text
    for label, regex in _PATTERNS.items():
        scrubbed, hits = regex.subn(f"[{label}]", scrubbed)
        counts[label] = hits

    total = sum(counts.values())
    # Appended last so the per-type keys keep their priority order.
    counts["total_entities_scrubbed"] = total
    return ScrubResult(
        scrubbed_text=scrubbed,
        summary=counts,
        was_scrubbed=total > 0,
    )
@@ -0,0 +1,42 @@
1
+ """Token counting for context budgeting.
2
+
3
+ Uses tiktoken when installed (accurate for OpenAI-family tokenisers);
4
+ falls back to a character-based estimate (~4 chars per token) otherwise.
5
+ The fallback intentionally over-estimates slightly so the max_tokens
6
+ budget errs on the safe side.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
# Module-level cache: the tiktoken encoder (or None when unavailable) and a
# flag recording that the one-time lookup has already been attempted.
_encoder = None
_tiktoken_checked = False


def _get_encoder():
    """Return the cached tiktoken encoder, or ``None`` if it cannot be used.

    The import and ``cl100k_base`` lookup are attempted at most once per
    process; any failure (tiktoken missing, encoding unavailable, ...) is
    remembered as ``None`` so later calls go straight to the fallback.
    """
    global _encoder, _tiktoken_checked
    if _tiktoken_checked:
        return _encoder

    # Mark as checked before trying, so a failure is never retried.
    _tiktoken_checked = True
    try:
        import tiktoken
        _encoder = tiktoken.get_encoding("cl100k_base")
    except Exception:
        _encoder = None
    return _encoder


def count_tokens(text: str) -> int:
    """Count the tokens in ``text``.

    Args:
        text: Input string.

    Returns:
        ``0`` for empty input. Otherwise the tiktoken token count when an
        encoder is available, else the character-based estimate
        ``max(1, ceil(len(text) / 4))``.
    """
    if not text:
        return 0

    encoder = _get_encoder()
    if encoder is None:
        # Integer ceiling of len/4 without going through floats.
        estimate = (len(text) + 3) // 4
        return max(1, estimate)
    return len(encoder.encode(text))
scroot/core.py ADDED
@@ -0,0 +1,349 @@
1
+ """Auditor: main orchestrator class.
2
+
3
+ Loads models once, runs all metrics, computes IQS, returns result.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import warnings
10
+
11
+ from .result import EntailmentResult
12
+ from .context.payload import ContextPayload
13
+ from .evidence import build_evidence_map
14
+ from .exceptions import GroundednessComputationError, NoContextWarning
15
+ from .metrics.groundedness import score_groundedness
16
+ from .metrics.completeness import score_completeness
17
+ from .metrics.relevance import score_relevance
18
+ from .metrics.consistency import score_consistency
19
+ from .metrics.confidence import score_confidence
20
+ from .composite import DEFAULT_WEIGHTS, compute_iqs_detailed
21
+ from .flags import detect_flags
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class Auditor:
27
+ """LLM-free response quality scorer.
28
+
29
+ Scores LLM responses using NLI models and embedding similarity.
30
+ No LLM API calls required. Runs locally, deterministic, fast.
31
+
32
+ Args:
33
+ nli_model: NLI cross-encoder model name or pre-instantiated instance.
34
+ Upgrade to ``cross-encoder/nli-deberta-v3-large`` for ~4% better
35
+ accuracy at the cost of ~2x latency.
36
+ embedding_model: Sentence-transformers model name or instance.
37
+ device: Inference device. ``"cpu"`` or ``"cuda"``.
38
+ weights: Optional custom IQS component weights dict. Missing keys
39
+ default to the standard weights. Use ``scroot.RAG_WEIGHTS``
40
+ for RAG-optimised scoring (higher groundedness weight).
41
+ iqs_mode: IQS formula. ``"harmonic"`` (default) uses the weighted
42
+ harmonic mean: ``IQS = n / sum(w_i / s_i)``. Any metric near
43
+ zero drives IQS to zero. ``"geometric"`` uses weighted geometric
44
+ mean - does not collapse to zero on partial hallucination.
45
+ atomic_claims: If True (default), split compound sentences into
46
+ sub-claims before groundedness scoring. Prevents one wrong fact
47
+ from zeroing an entire multi-fact sentence.
48
+ similarity_fallback: If True (default), use bi-encoder cosine
49
+ similarity as a fallback when NLI confidence is uncertain (0.3-0.7).
50
+ Catches paraphrases that exact NLI entailment misses.
51
+ similarity_threshold: Cosine similarity threshold for paraphrase
52
+ credit in the similarity fallback. Default 0.82.
53
+ max_query_length: Truncate query to this many characters (H-3).
54
+ max_response_length: Truncate response to this many characters (H-3).
55
+ max_context_items: Maximum number of context chunks (H-3).
56
+ max_context_item_length: Truncate each context chunk to this length (H-3).
57
+ max_batch_size: ``score_batch()`` raises ValueError above this limit (H-3).
58
+ entailment_threshold: Minimum entailment probability for a claim to
59
+ be grounded. Default 0.5.
60
+ coverage_threshold: Minimum embedding similarity for a query segment
61
+ to be considered covered by the response. Default 0.45.
62
+ contradiction_threshold: Minimum contradiction probability to flag a
63
+ sentence pair as contradictory. Default 0.7.
64
+ max_sentences: Maximum sentences evaluated by consistency scorer. Default 25.
65
+ compute_evidence_map: If True (default) and context is provided,
66
+ attach a sentence-level :class:`~scroot.EvidenceMap` to
67
+ ``result.evidence_map`` showing which response sentences are
68
+ supported, contradicted, or ungrounded.
69
+ evidence_entailment_threshold: Minimum entailment probability for a
70
+ sentence to be marked "supported" in the evidence map. Default
71
+ 0.70 (stricter than the groundedness ``entailment_threshold``).
72
+ evidence_contradiction_threshold: Minimum contradiction probability
73
+ for a sentence to be marked "contradicted" in the evidence map.
74
+ Default 0.30.
75
+ """
76
+
77
+ def __init__(
78
+ self,
79
+ nli_model: str = "cross-encoder/nli-deberta-v3-base",
80
+ embedding_model: str = "all-MiniLM-L6-v2",
81
+ device: str = "cpu",
82
+ weights: dict | None = None,
83
+ iqs_mode: str = "harmonic",
84
+ atomic_claims: bool = True,
85
+ similarity_fallback: bool = True,
86
+ similarity_threshold: float = 0.82,
87
+ top_k_chunks: int = 3,
88
+ bidirectional_consistency: bool = True,
89
+ nli_completeness: bool = True,
90
+ max_query_length: int = 10_000,
91
+ max_response_length: int = 50_000,
92
+ max_context_items: int = 50,
93
+ max_context_item_length: int = 10_000,
94
+ max_batch_size: int = 1_000,
95
+ entailment_threshold: float = 0.5,
96
+ coverage_threshold: float = 0.45,
97
+ contradiction_threshold: float = 0.7,
98
+ max_sentences: int = 25,
99
+ compute_evidence_map: bool = True,
100
+ evidence_entailment_threshold: float = 0.70,
101
+ evidence_contradiction_threshold: float = 0.30,
102
+ ):
103
+ self.nli_model = nli_model
104
+ self.embedding_model = embedding_model
105
+ self.device = device
106
+ self.weights = weights
107
+ self.iqs_mode = iqs_mode
108
+ self.atomic_claims = atomic_claims
109
+ self.similarity_fallback = similarity_fallback
110
+ self.similarity_threshold = similarity_threshold
111
+ self.top_k_chunks = top_k_chunks
112
+ self.bidirectional_consistency = bidirectional_consistency
113
+ self.nli_completeness = nli_completeness
114
+ self.max_query_length = max_query_length
115
+ self.max_response_length = max_response_length
116
+ self.max_context_items = max_context_items
117
+ self.max_context_item_length = max_context_item_length
118
+ self.max_batch_size = max_batch_size
119
+ self.entailment_threshold = entailment_threshold
120
+ self.coverage_threshold = coverage_threshold
121
+ self.contradiction_threshold = contradiction_threshold
122
+ self.max_sentences = max_sentences
123
+ self.compute_evidence_map = compute_evidence_map
124
+ self.evidence_entailment_threshold = evidence_entailment_threshold
125
+ self.evidence_contradiction_threshold = evidence_contradiction_threshold
126
+
127
+ def score(
128
+ self,
129
+ query: str,
130
+ response: str,
131
+ context: "ContextPayload | str | list[str] | None" = None,
132
+ ) -> EntailmentResult:
133
+ """Score a single LLM response.
134
+
135
+ Inputs are silently truncated to the configured length limits before
136
+ processing. Non-string context items are coerced to ``str``; ``None``
137
+ items are dropped.
138
+
139
+ Args:
140
+ query: The user's query/question.
141
+ response: The LLM-generated response.
142
+ context: Grounding context. Accepts:
143
+
144
+ - :class:`~scroot.context.ContextPayload` - built by
145
+ :class:`~scroot.ContextBuilder`. Consumed here: the
146
+ assembled chunks feed the NLI scorer locally and the
147
+ payload is not retained. Only ``session_id`` and
148
+ ``checksum`` flow into ``details["context"]`` for the
149
+ audit trail.
150
+ - ``str`` - a single grounding string.
151
+ - ``list[str]`` - source context chunks.
152
+ - ``None`` - groundedness is skipped entirely.
153
+
154
+ If provided (even an empty list), groundedness is scored.
155
+
156
+ Returns:
157
+ :class:`EntailmentResult` with all metric scores and flags.
158
+ ``result.iqs`` is computed as ``IQS = n / sum(w_i / s_i)``
159
+ (the weighted harmonic mean of the five metrics, where
160
+ ``n = sum(w_i)``) by default; see ``iqs_mode``.
161
+ """
162
+ context_audit: dict | None = None
163
+ chunk_sources: list[str | None] | None = None
164
+ if isinstance(context, ContextPayload):
165
+ payload = context
166
+ # Per-source chunks preserve top-k retrieval behaviour in the
167
+ # groundedness scorer; the payload itself is consumed here.
168
+ context = [e.content for e in payload.sources] or None
169
+ chunk_sources = [e.source for e in payload.sources] or None
170
+ context_audit = {
171
+ "session_id": payload.session_id,
172
+ "checksum": payload.checksum,
173
+ "total_tokens": payload.total_tokens,
174
+ "was_truncated": payload.was_truncated,
175
+ "pii_scrubbed": payload.pii_scrubbed,
176
+ }
177
+ elif isinstance(context, str):
178
+ context = [context]
179
+
180
+ query = query[: self.max_query_length]
181
+ response = response[: self.max_response_length]
182
+ if context is not None:
183
+ context = context[: self.max_context_items]
184
+ context = [
185
+ str(c)[: self.max_context_item_length]
186
+ for c in context
187
+ if c is not None and str(c).strip() # drop empty/whitespace chunks
188
+ ]
189
+ if chunk_sources is not None:
190
+ chunk_sources = chunk_sources[: self.max_context_items]
191
+ # Empty / whitespace-only context is equivalent to no context:
192
+ # groundedness cannot be computed (spec: treat "", " ", [], None
193
+ # identically).
194
+ if not context:
195
+ context = None
196
+ chunk_sources = None
197
+
198
+ details = {}
199
+ if context_audit is not None:
200
+ details["context"] = context_audit
201
+
202
+ if context is not None:
203
+ try:
204
+ groundedness, g_details = score_groundedness(
205
+ response, context,
206
+ nli_model=self.nli_model,
207
+ embedding_model=self.embedding_model,
208
+ device=self.device,
209
+ entailment_threshold=self.entailment_threshold,
210
+ atomic_claims=self.atomic_claims,
211
+ similarity_fallback=self.similarity_fallback,
212
+ similarity_threshold=self.similarity_threshold,
213
+ top_k_chunks=self.top_k_chunks,
214
+ )
215
+ details["groundedness"] = g_details
216
+ except Exception as e:
217
+ # Context was provided but groundedness scoring failed
218
+ # unexpectedly. Degrade gracefully: exclude groundedness from
219
+ # IQS rather than failing the whole call.
220
+ logger.error("Groundedness computation failed: %s", e)
221
+ groundedness = None
222
+ warnings.warn(
223
+ f"Groundedness computation failed due to an unexpected "
224
+ f"error. IQS will be computed from the remaining metrics. "
225
+ f"Error: {e}",
226
+ GroundednessComputationError,
227
+ stacklevel=2,
228
+ )
229
+ else:
230
+ groundedness = None
231
+ # Encourage adding context - but stay silent if the caller has
232
+ # explicitly opted out by zeroing the groundedness weight.
233
+ ground_weight = (self.weights or DEFAULT_WEIGHTS).get(
234
+ "groundedness", DEFAULT_WEIGHTS["groundedness"]
235
+ )
236
+ if ground_weight > 0.0:
237
+ warnings.warn(
238
+ "auditor.score() called without context. groundedness "
239
+ "will be None and is excluded from IQS (the remaining "
240
+ "metrics' weights are redistributed). To score "
241
+ "groundedness, pass context= or use ContextBuilder.",
242
+ NoContextWarning,
243
+ stacklevel=2,
244
+ )
245
+
246
+ evidence_map = None
247
+ if context is not None and self.compute_evidence_map:
248
+ evidence_map = build_evidence_map(
249
+ response, context,
250
+ nli_model=self.nli_model,
251
+ embedding_model=self.embedding_model,
252
+ device=self.device,
253
+ entailment_threshold=self.evidence_entailment_threshold,
254
+ contradiction_threshold=self.evidence_contradiction_threshold,
255
+ top_k_chunks=self.top_k_chunks,
256
+ chunk_sources=chunk_sources,
257
+ atomic_claims=self.atomic_claims,
258
+ )
259
+
260
+ completeness, c_details = score_completeness(
261
+ query, response,
262
+ embedding_model=self.embedding_model,
263
+ nli_model=self.nli_model if self.nli_completeness else None,
264
+ device=self.device,
265
+ coverage_threshold=self.coverage_threshold,
266
+ )
267
+ details["completeness"] = c_details
268
+
269
+ relevance, r_details = score_relevance(
270
+ query, response,
271
+ embedding_model=self.embedding_model,
272
+ device=self.device,
273
+ )
274
+ details["relevance"] = r_details
275
+
276
+ consistency, cons_details = score_consistency(
277
+ response,
278
+ nli_model=self.nli_model,
279
+ device=self.device,
280
+ contradiction_threshold=self.contradiction_threshold,
281
+ max_sentences=self.max_sentences,
282
+ bidirectional=self.bidirectional_consistency,
283
+ )
284
+ details["consistency"] = cons_details
285
+
286
+ confidence, conf_details = score_confidence(response)
287
+ details["confidence"] = conf_details
288
+
289
+ iqs_scores: dict = {
290
+ "completeness": completeness,
291
+ "relevance": relevance,
292
+ "consistency": consistency,
293
+ "confidence": confidence,
294
+ }
295
+ if groundedness is not None:
296
+ iqs_scores["groundedness"] = groundedness
297
+ iqs, effective_weights = compute_iqs_detailed(
298
+ iqs_scores, weights=self.weights, mode=self.iqs_mode,
299
+ )
300
+
301
+ flags = detect_flags(
302
+ groundedness, completeness, relevance,
303
+ consistency, confidence,
304
+ )
305
+
306
+ return EntailmentResult(
307
+ groundedness=groundedness,
308
+ completeness=completeness,
309
+ relevance=relevance,
310
+ consistency=consistency,
311
+ confidence=confidence,
312
+ iqs=iqs,
313
+ flags=flags,
314
+ details=details,
315
+ evidence_map=evidence_map,
316
+ effective_weights=effective_weights,
317
+ context_used=(groundedness is not None),
318
+ iqs_metric_count=len(effective_weights),
319
+ )
320
+
321
+ def score_batch(
322
+ self,
323
+ items: list[dict],
324
+ ) -> list[EntailmentResult]:
325
+ """Score a batch of responses.
326
+
327
+ Args:
328
+ items: List of dicts with keys ``"query"``, ``"response"``,
329
+ and optionally ``"context"`` (list[str]).
330
+
331
+ Returns:
332
+ List of :class:`EntailmentResult`, one per item, in order.
333
+
334
+ Raises:
335
+ ValueError: If ``len(items)`` exceeds ``max_batch_size`` (H-3).
336
+ """
337
+ if len(items) > self.max_batch_size:
338
+ raise ValueError(
339
+ f"Batch size {len(items)} exceeds max_batch_size={self.max_batch_size}. "
340
+ f"Split into smaller batches or increase max_batch_size."
341
+ )
342
+ return [
343
+ self.score(
344
+ query=item["query"],
345
+ response=item["response"],
346
+ context=item.get("context"),
347
+ )
348
+ for item in items
349
+ ]
@@ -0,0 +1,38 @@
1
+ """Corrector provider factory."""
2
+ from __future__ import annotations
3
+
4
+ from scroot.corrector.base import BaseCorrector
5
+ from scroot.corrector.disabled import NullCorrector
6
+
7
+ _active_corrector: BaseCorrector | None = None
8
+
9
+
10
def get_corrector(config) -> BaseCorrector:
    """Build and return the corrector matching ``config.mode``.

    A fresh corrector is constructed on every call and remembered in the
    module-level ``_active_corrector``:

    - ``"disabled"`` or any unrecognised mode -> ``NullCorrector``
    - ``"local"`` -> ``LocalLLMCorrector(config.local)``
    - ``"api"`` -> ``APICorrector(config.api)``

    A previously active ``LocalLLMCorrector`` is unloaded from RAM when the
    mode switches away from local, and when local mode switches to a
    different ``config.local.model_id``.

    Args:
        config: Corrector configuration exposing ``mode`` and, depending on
            the mode, ``local`` (with ``model_id``) or ``api``.

    Returns:
        The newly active corrector.
    """
    global _active_corrector

    mode = config.mode
    previous = _active_corrector

    if mode == "local":
        from scroot.corrector.local import LocalLLMCorrector
        if (
            isinstance(previous, LocalLLMCorrector)
            and previous.model_spec.id != config.local.model_id
        ):
            previous.unload()
        _active_corrector = LocalLLMCorrector(config.local)
        return _active_corrector

    # Leaving local mode: release the local model instead of just dropping
    # the reference. The class is looked up via sys.modules so that the
    # local backend is never imported in non-local modes; if that module
    # was never loaded, ``previous`` cannot be a LocalLLMCorrector.
    import sys

    local_module = sys.modules.get("scroot.corrector.local")
    local_cls = getattr(local_module, "LocalLLMCorrector", None)
    if local_cls is not None and isinstance(previous, local_cls):
        previous.unload()

    if mode == "api":
        from scroot.corrector.api import APICorrector
        _active_corrector = APICorrector(config.api)
    else:
        # "disabled" and any unknown mode both fall back to the no-op corrector.
        _active_corrector = NullCorrector()

    return _active_corrector
@@ -0,0 +1,145 @@
1
+ """APICorrector - OpenAI-compatible endpoint, provider auto-detected from key.
2
+
3
+ Design rationale: why ``api_key`` alone is not enough
4
+ -----------------------------------------------------
5
+
6
+ A common assumption is that an API key fully identifies an LLM connection, so
7
+ ``model`` and ``base_url`` should be unnecessary. They are not. An LLM request is::
8
+
9
+ POST {base_url}/chat/completions
10
+ Headers: {auth_header}: {api_key}
11
+ Body: {"model": "<name>", "messages": [...]}
12
+
13
+ The key only fills the **auth header** - it proves *who you are*. It does not
14
+ carry the two other things every request needs:
15
+
16
+ 1. **Where to send it (``base_url``).** Each provider has a different endpoint
17
+ and even a different auth-header name (Anthropic uses ``x-api-key``; OpenAI
18
+ uses ``Authorization: Bearer``). This *is* derivable from the key, because the
19
+ key prefix is provider-specific - that is exactly what ``detect_provider``
20
+ does (``sk-ant-`` -> Anthropic, ``AIza`` -> Gemini, ``sk-`` -> OpenAI, else
21
+ OpenRouter). So ``base_url`` can stay optional/advanced: leave it blank for the
22
+ four known providers, set it only for Groq / OpenRouter / a custom gateway.
23
+
24
+ 2. **Which model to run (``model``).** This is a *mandatory* field in the
25
+ request body and it is **not derivable from anything**. The key is tied to an
26
+ *account*, not a model: the same Anthropic key calls Opus, Sonnet, and Haiku.
27
+ There is no way to infer "the user wants Haiku" from the key, the endpoint, or
28
+ the header. The specific model is a **decision** (a cost/quality trade-off),
29
+ not data - so it cannot be auto-detected the way ``base_url`` can. It can only
30
+ be (a) defaulted to an opinionated pick, or (b) chosen by the user.
31
+
32
+ Consequence for the architecture / UI:
33
+
34
+ * ``base_url`` - safe to hide behind "Advanced"; auto-detected from the key.
35
+ * ``model`` - must remain a real (optional) field. Today it defaults to
36
+ ``gpt-4o-mini`` (see ``draft_correction``), which is only correct for OpenAI;
37
+ a non-OpenAI key with a blank model will send ``gpt-4o-mini`` to the wrong
38
+ provider and fail. The intended improvement is a **per-provider default map**
39
+ (OpenAI -> ``gpt-4o-mini``, Anthropic -> ``claude-haiku-4-5``, Gemini ->
40
+ ``gemini-2.0-flash``), deliberately the cheap/fast tier since this is response
41
+ correction, not frontier reasoning - so "paste key, leave model blank" works
42
+ for every provider while power users can still override.
43
+
44
+ See also: ``validate_base_url`` in ``scroot.dashboard.security`` (M-2), which
45
+ restricts ``base_url`` to allowlisted provider hosts to prevent key exfiltration
46
+ and SSRF.
47
+ """
48
+ from __future__ import annotations
49
+
50
+ from scroot.corrector.base import BaseCorrector
51
+
52
# Providers recognisable from the API-key prefix. Checked in insertion order.
_KEY_PREFIX_MAP = {
    "sk-ant-": dict(
        base_url="https://api.anthropic.com/v1",
        auth_header="x-api-key",
        provider_name="Anthropic",
    ),
    "AIza": dict(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai",
        auth_header="Authorization",
        provider_name="Google Gemini",
    ),
}
_OPENROUTER_BASE = "https://openrouter.ai/api/v1"
_OPENAI_BASE = "https://api.openai.com/v1"


def detect_provider(api_key: str, base_url_override: str = "") -> tuple[str, str, str]:
    """Work out where and how to send a request for the given credentials.

    An explicit ``base_url_override`` always wins: it is returned unchanged
    with the ``Authorization`` header, and is labelled by the first of
    ``groq`` / ``openrouter`` / ``anthropic`` found in it (``"Custom"`` if
    none is). Without an override the provider is picked from the key
    prefix: the entries of ``_KEY_PREFIX_MAP`` first, then a plain ``sk-``
    prefix for OpenAI, and OpenRouter for everything else.

    Args:
        api_key: The provider API key.
        base_url_override: User-supplied endpoint; empty means auto-detect.

    Returns:
        ``(base_url, auth_header, provider_name)``.
    """
    if base_url_override:
        known_hosts = (
            ("groq", "Groq"),
            ("openrouter", "OpenRouter"),
            ("anthropic", "Anthropic"),
        )
        label = "Custom"
        for needle, provider in known_hosts:
            if needle in base_url_override:
                label = provider
                break
        return base_url_override, "Authorization", label

    for prefix in _KEY_PREFIX_MAP:
        if api_key.startswith(prefix):
            entry = _KEY_PREFIX_MAP[prefix]
            return entry["base_url"], entry["auth_header"], entry["provider_name"]

    # "sk-ant-" was handled above, so a remaining "sk-" key is treated as OpenAI.
    if not api_key.startswith("sk-"):
        return _OPENROUTER_BASE, "Authorization", "OpenRouter"
    return _OPENAI_BASE, "Authorization", "OpenAI"
85
+
86
+
87
class APICorrector(BaseCorrector):
    """Corrector that asks an OpenAI-compatible chat-completions endpoint.

    The endpoint and auth header come from ``detect_provider``; the config
    object supplies ``api_key``, ``base_url``, ``model`` and
    ``system_prompt``.
    """

    def __init__(self, config) -> None:
        # The config is only stored; no network access happens here.
        self._config = config

    @property
    def is_available(self) -> bool:
        """True when an API key has been configured."""
        return bool(self._config.api_key)

    def draft_correction(
        self,
        query: str,
        response: str,
        context: str | None,
    ) -> str:
        """Request a rewritten response from the configured provider.

        Args:
            query: The original user query.
            response: The response to be corrected.
            context: Optional grounding context included in the prompt.

        Returns:
            The content of the first returned choice, stripped of
            surrounding whitespace.

        Raises:
            RuntimeError: If ``httpx`` is not installed.

        Any error raised by ``validate_base_url`` or by the HTTP call
        (including ``raise_for_status`` on a non-success response)
        propagates unchanged.
        """
        try:
            import httpx
        except ImportError as exc:
            raise RuntimeError(
                "httpx is not installed. Run: pip install 'scroot[api]'"
            ) from exc

        base_url, auth_header, _ = detect_provider(
            self._config.api_key, self._config.base_url
        )
        # M-2: refuse to send the API key to an unvetted/internal endpoint.
        from scroot.dashboard.security import validate_base_url
        validate_base_url(base_url)

        # ``x-api-key`` carries the raw key; every other header uses the
        # Bearer scheme.
        if auth_header == "x-api-key":
            auth_value = self._config.api_key
        else:
            auth_value = f"Bearer {self._config.api_key}"
        headers = {
            "Content-Type": "application/json",
            auth_header: auth_value,
        }
        payload = {
            "model": self._config.model or "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": self._config.system_prompt},
                {"role": "user", "content": self._build_prompt(query, response, context)},
            ],
            "max_tokens": 512,
            "temperature": 0.3,
        }
        # A user-supplied base URL may end in "/"; strip it so the request
        # path does not become ".../v1//chat/completions".
        endpoint = f"{base_url.rstrip('/')}/chat/completions"
        resp = httpx.post(
            endpoint,
            json=payload,
            headers=headers,
            timeout=30.0,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"].strip()

    def _build_prompt(self, query: str, response: str, context: str | None) -> str:
        """Assemble the user message: query, original response, optional
        context, then the rewrite instruction."""
        parts = [f"Query:\n{query}", f"\nOriginal response:\n{response}"]
        if context:
            parts.append(f"\nContext:\n{context}")
        parts.append("\nRewrite the response to be more accurate and complete.")
        return "\n".join(parts)