scroot 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scroot/__init__.py +109 -0
- scroot/agents.py +345 -0
- scroot/audit.py +131 -0
- scroot/cli/__init__.py +167 -0
- scroot/cli/download.py +49 -0
- scroot/cli/eval.py +230 -0
- scroot/cli/model_info.py +28 -0
- scroot/composite.py +170 -0
- scroot/config/__init__.py +0 -0
- scroot/config/corrector.py +92 -0
- scroot/connectors/__init__.py +5 -0
- scroot/connectors/database.py +357 -0
- scroot/context/__init__.py +9 -0
- scroot/context/adapters.py +86 -0
- scroot/context/builder.py +514 -0
- scroot/context/dedup.py +99 -0
- scroot/context/payload.py +66 -0
- scroot/context/pii.py +101 -0
- scroot/context/tokenizer.py +42 -0
- scroot/core.py +349 -0
- scroot/corrector/__init__.py +38 -0
- scroot/corrector/api.py +145 -0
- scroot/corrector/base.py +20 -0
- scroot/corrector/disabled.py +13 -0
- scroot/corrector/local.py +112 -0
- scroot/corrector/models.py +69 -0
- scroot/dashboard/__init__.py +0 -0
- scroot/dashboard/__main__.py +37 -0
- scroot/dashboard/routers/__init__.py +0 -0
- scroot/dashboard/routers/analytics.py +236 -0
- scroot/dashboard/routers/corrector.py +230 -0
- scroot/dashboard/routers/export.py +150 -0
- scroot/dashboard/routers/guardrails.py +41 -0
- scroot/dashboard/routers/pipeline.py +218 -0
- scroot/dashboard/routers/queue.py +188 -0
- scroot/dashboard/routers/records.py +252 -0
- scroot/dashboard/routers/settings.py +291 -0
- scroot/dashboard/security.py +135 -0
- scroot/dashboard/server.py +181 -0
- scroot/evidence.py +228 -0
- scroot/exceptions.py +62 -0
- scroot/feedback/__init__.py +6 -0
- scroot/feedback/injector.py +160 -0
- scroot/feedback/sanitizer.py +56 -0
- scroot/feedback/store.py +650 -0
- scroot/flags.py +42 -0
- scroot/metrics/__init__.py +15 -0
- scroot/metrics/_utils.py +9 -0
- scroot/metrics/completeness.py +139 -0
- scroot/metrics/confidence.py +83 -0
- scroot/metrics/consistency.py +125 -0
- scroot/metrics/groundedness.py +193 -0
- scroot/metrics/relevance.py +73 -0
- scroot/models.py +214 -0
- scroot/result.py +276 -0
- scroot/sampling.py +306 -0
- scroot/text_utils.py +136 -0
- scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
- scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
- scroot/ui/dist/favicon.svg +27 -0
- scroot/ui/dist/index.html +20 -0
- scroot-0.2.0.dist-info/METADATA +832 -0
- scroot-0.2.0.dist-info/RECORD +67 -0
- scroot-0.2.0.dist-info/WHEEL +5 -0
- scroot-0.2.0.dist-info/entry_points.txt +2 -0
- scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
- scroot-0.2.0.dist-info/top_level.txt +1 -0
scroot/context/pii.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""PII detection and scrubbing for context content.
|
|
2
|
+
|
|
3
|
+
Regex-based, fully local - no external API call, consistent with
|
|
4
|
+
scroot's zero-external-dependency principle. Detected entities are
|
|
5
|
+
replaced with typed placeholders (e.g. ``[EMAIL]``); the scrub summary
|
|
6
|
+
records counts by entity type only, never the original values.
|
|
7
|
+
|
|
8
|
+
Detection is best-effort: regex catches structured PII (emails, phones,
|
|
9
|
+
SSNs, cards, IPs, secrets, dates, street addresses) reliably, and person
|
|
10
|
+
names via honorifics and a common-first-name heuristic. For regulated
|
|
11
|
+
workloads, layer a dedicated NER scrubber in front and pass pre-scrubbed
|
|
12
|
+
text in with ``pii_scrub=False``.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ScrubResult:
    """Outcome of a single PII-scrubbing pass over one string.

    Attributes:
        scrubbed_text: The input with every detected entity swapped for a
            typed placeholder such as ``[EMAIL]``.
        summary: Replacement counts keyed by entity type, plus a
            ``total_entities_scrubbed`` grand total. Counts only - the
            original values are never stored here.
        was_scrubbed: ``True`` when one or more replacements were made.
    """

    scrubbed_text: str
    summary: dict[str, int]
    was_scrubbed: bool
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Common first names used for best-effort [PERSON] detection when no
# honorific is present. Matches "<FirstName> <Capitalized Surname>".
# Kept as a single "|"-joined string because it is interpolated directly
# into the PERSON regex alternation below.
_COMMON_FIRST_NAMES = (
    "James|John|Robert|Michael|William|David|Richard|Joseph|Thomas|Charles|"
    "Christopher|Daniel|Matthew|Anthony|Mark|Donald|Steven|Paul|Andrew|Joshua|"
    "Kenneth|Kevin|Brian|George|Timothy|Ronald|Edward|Jason|Jeffrey|Ryan|"
    "Mary|Patricia|Jennifer|Linda|Elizabeth|Barbara|Susan|Jessica|Sarah|Karen|"
    "Lisa|Nancy|Betty|Margaret|Sandra|Ashley|Kimberly|Emily|Donna|Michelle|"
    "Carol|Amanda|Dorothy|Melissa|Deborah|Stephanie|Rebecca|Sharon|Laura|"
    "Jane|Emma|Olivia|Sophia|Alice|Anna|Maria|Rachel|Hannah|Grace"
)

# Ordered by priority - earlier patterns run first so that, e.g., an API
# key is redacted as [SECRET] before the generic patterns see it.
# Dict insertion order is the execution order; do not reorder casually.
_PATTERNS: dict[str, re.Pattern] = {
    # Provider API keys (Anthropic, OpenAI-style, AWS access key id, GitHub
    # personal token) and long lowercase-hex blobs.
    "SECRET": re.compile(
        r'\b(?:sk-ant-[a-zA-Z0-9-]{20,}|sk-[a-zA-Z0-9]{20,}|'
        r'AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36}|[a-f0-9]{32,})\b'
    ),
    # The domain accepts any number of dot-separated labels. With a single
    # label, "bob@mail.example.co.uk" was only matched up to
    # "bob@mail.example" and the remainder of the address leaked through.
    "EMAIL": re.compile(r'\b[\w.+-]+@(?:[\w-]+\.)+[a-zA-Z]{2,}\b'),
    # 13-16 digits, each optionally followed by a space or hyphen.
    "CARD": re.compile(r'\b(?:\d[ -]?){13,16}\b'),
    "SSN": re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
    # Dotted quad; octet ranges are not validated.
    "IP": re.compile(r'\b\d{1,3}(?:\.\d{1,3}){3}\b'),
    "PHONE": re.compile(
        r'(?:\+?\d{1,2}[-.\s])?\(?\d{3}\)?[-.\s]\d{3,4}[-.\s]?\d{0,4}\b'
        r'|\b\d{3}[-.\s]\d{3}[-.\s]\d{4}\b'
    ),
    # "Month D, YYYY" (full or abbreviated month) or numeric D/M/YYYY-style.
    "DOB": re.compile(
        r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|'
        r'Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|'
        r'Nov(?:ember)?|Dec(?:ember)?)\.?\s+\d{1,2},?\s+\d{4}\b'
        r'|\b\d{1,2}/\d{1,2}/\d{4}\b'
    ),
    # House number, one to three capitalised words, then a street suffix.
    "ADDRESS": re.compile(
        r'\b\d{1,5}\s+(?:[A-Z][a-zA-Z]+\s+){1,3}'
        r'(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard|Ln|Lane|'
        r'Dr|Drive|Ct|Court|Way|Pl|Place)\b\.?'
    ),
    # Honorific + one or two capitalised words, or a common first name
    # followed by a capitalised surname.
    "PERSON": re.compile(
        r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?'
        rf'|\b(?:{_COMMON_FIRST_NAMES})\s+[A-Z][a-z]+\b'
    ),
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def scrub(text: str) -> ScrubResult:
    """Swap every detected PII entity in *text* for a typed placeholder.

    The patterns in ``_PATTERNS`` are applied one after another, in
    dictionary order, each to the output of the previous one.

    Args:
        text: Raw text that may contain PII.

    Returns:
        A ``ScrubResult`` carrying the scrubbed text and a count-only
        summary (one count per entity type plus
        ``total_entities_scrubbed``). Original values are not kept.
    """
    counts: dict[str, int] = dict.fromkeys(_PATTERNS, 0)
    cleaned = text
    for label, regex in _PATTERNS.items():
        # subn returns (new_text, number_of_replacements).
        cleaned, counts[label] = regex.subn(f'[{label}]', cleaned)

    grand_total = sum(counts.values())
    counts_with_total = dict(counts)
    counts_with_total["total_entities_scrubbed"] = grand_total
    return ScrubResult(
        scrubbed_text=cleaned,
        summary=counts_with_total,
        was_scrubbed=grand_total > 0,
    )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Token counting for context budgeting.
|
|
2
|
+
|
|
3
|
+
Uses tiktoken when installed (accurate for OpenAI-family tokenisers);
|
|
4
|
+
falls back to a character-based estimate (~4 chars per token) otherwise.
|
|
5
|
+
The fallback intentionally over-estimates slightly so the max_tokens
|
|
6
|
+
budget errs on the safe side.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
# Process-wide cache: the encoder instance (or None when unavailable) and a
# flag recording that the lookup has already been attempted.
_encoder = None
_tiktoken_checked = False


def _get_encoder():
    """Return the cached ``cl100k_base`` tiktoken encoder, or ``None``.

    The import and encoding lookup are attempted at most once per process.
    Any failure (tiktoken missing, encoding unavailable) is remembered as
    ``None`` so callers can fall back to the character-based estimate.
    """
    global _encoder, _tiktoken_checked
    if _tiktoken_checked:
        return _encoder

    # Mark as checked before trying, so a failure is never retried.
    _tiktoken_checked = True
    try:
        import tiktoken
        _encoder = tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Best-effort by design: any problem means "no encoder".
        _encoder = None
    return _encoder
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def count_tokens(text: str) -> int:
    """Count tokens in text.

    Args:
        text: Input string.

    Returns:
        ``0`` for empty text. Otherwise the token count via tiktoken if
        available, else ``max(1, ceil(len(text) / 4))``.
    """
    if not text:
        return 0
    encoder = _get_encoder()
    if encoder is None:
        # Ceiling division without importing math: -(-a // b) == ceil(a / b).
        return max(1, -(-len(text) // 4))
    # By default tiktoken raises ValueError when the input contains a
    # special-token literal such as "<|endoftext|>". Context text is
    # arbitrary, so treat such sequences as ordinary text instead of
    # failing the whole budgeting step.
    return len(encoder.encode(text, disallowed_special=()))
|
scroot/core.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""Auditor: main orchestrator class.
|
|
2
|
+
|
|
3
|
+
Loads models once, runs all metrics, computes IQS, returns result.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from .result import EntailmentResult
|
|
12
|
+
from .context.payload import ContextPayload
|
|
13
|
+
from .evidence import build_evidence_map
|
|
14
|
+
from .exceptions import GroundednessComputationError, NoContextWarning
|
|
15
|
+
from .metrics.groundedness import score_groundedness
|
|
16
|
+
from .metrics.completeness import score_completeness
|
|
17
|
+
from .metrics.relevance import score_relevance
|
|
18
|
+
from .metrics.consistency import score_consistency
|
|
19
|
+
from .metrics.confidence import score_confidence
|
|
20
|
+
from .composite import DEFAULT_WEIGHTS, compute_iqs_detailed
|
|
21
|
+
from .flags import detect_flags
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Auditor:
    """LLM-free response quality scorer.

    Scores LLM responses using NLI models and embedding similarity.
    No LLM API calls required. Runs locally, deterministic, fast.

    Args:
        nli_model: NLI cross-encoder model name or pre-instantiated instance.
            Upgrade to ``cross-encoder/nli-deberta-v3-large`` for ~4% better
            accuracy at the cost of ~2x latency.
        embedding_model: Sentence-transformers model name or instance.
        device: Inference device. ``"cpu"`` or ``"cuda"``.
        weights: Optional custom IQS component weights dict. Missing keys
            default to the standard weights. Use ``scroot.RAG_WEIGHTS``
            for RAG-optimised scoring (higher groundedness weight).
        iqs_mode: IQS formula. ``"harmonic"`` (default) uses the weighted
            harmonic mean: ``IQS = n / sum(w_i / s_i)``. Any metric near
            zero drives IQS to zero. ``"geometric"`` uses weighted geometric
            mean - does not collapse to zero on partial hallucination.
        atomic_claims: If True (default), split compound sentences into
            sub-claims before groundedness scoring. Prevents one wrong fact
            from zeroing an entire multi-fact sentence.
        similarity_fallback: If True (default), use bi-encoder cosine
            similarity as a fallback when NLI confidence is uncertain (0.3-0.7).
            Catches paraphrases that exact NLI entailment misses.
        similarity_threshold: Cosine similarity threshold for paraphrase
            credit in the similarity fallback. Default 0.82.
        top_k_chunks: Forwarded as ``top_k_chunks`` to the groundedness
            scorer and the evidence-map builder. Default 3.
        bidirectional_consistency: Forwarded as ``bidirectional`` to the
            consistency scorer. Default True.
        nli_completeness: If True (default), the NLI model is also passed
            to the completeness scorer; if False it receives
            ``nli_model=None``.
        max_query_length: Truncate query to this many characters (H-3).
        max_response_length: Truncate response to this many characters (H-3).
        max_context_items: Maximum number of context chunks (H-3).
        max_context_item_length: Truncate each context chunk to this length (H-3).
        max_batch_size: ``score_batch()`` raises ValueError above this limit (H-3).
        entailment_threshold: Minimum entailment probability for a claim to
            be grounded. Default 0.5.
        coverage_threshold: Minimum embedding similarity for a query segment
            to be considered covered by the response. Default 0.45.
        contradiction_threshold: Minimum contradiction probability to flag a
            sentence pair as contradictory. Default 0.7.
        max_sentences: Maximum sentences evaluated by consistency scorer. Default 25.
        compute_evidence_map: If True (default) and context is provided,
            attach a sentence-level :class:`~scroot.EvidenceMap` to
            ``result.evidence_map`` showing which response sentences are
            supported, contradicted, or ungrounded.
        evidence_entailment_threshold: Minimum entailment probability for a
            sentence to be marked "supported" in the evidence map. Default
            0.70 (stricter than the groundedness ``entailment_threshold``).
        evidence_contradiction_threshold: Minimum contradiction probability
            for a sentence to be marked "contradicted" in the evidence map.
            Default 0.30.
    """

    def __init__(
        self,
        nli_model: str = "cross-encoder/nli-deberta-v3-base",
        embedding_model: str = "all-MiniLM-L6-v2",
        device: str = "cpu",
        weights: dict | None = None,
        iqs_mode: str = "harmonic",
        atomic_claims: bool = True,
        similarity_fallback: bool = True,
        similarity_threshold: float = 0.82,
        top_k_chunks: int = 3,
        bidirectional_consistency: bool = True,
        nli_completeness: bool = True,
        max_query_length: int = 10_000,
        max_response_length: int = 50_000,
        max_context_items: int = 50,
        max_context_item_length: int = 10_000,
        max_batch_size: int = 1_000,
        entailment_threshold: float = 0.5,
        coverage_threshold: float = 0.45,
        contradiction_threshold: float = 0.7,
        max_sentences: int = 25,
        compute_evidence_map: bool = True,
        evidence_entailment_threshold: float = 0.70,
        evidence_contradiction_threshold: float = 0.30,
    ):
        # Configuration is stored verbatim; nothing is loaded here.
        self.nli_model = nli_model
        self.embedding_model = embedding_model
        self.device = device
        self.weights = weights
        self.iqs_mode = iqs_mode
        self.atomic_claims = atomic_claims
        self.similarity_fallback = similarity_fallback
        self.similarity_threshold = similarity_threshold
        self.top_k_chunks = top_k_chunks
        self.bidirectional_consistency = bidirectional_consistency
        self.nli_completeness = nli_completeness
        self.max_query_length = max_query_length
        self.max_response_length = max_response_length
        self.max_context_items = max_context_items
        self.max_context_item_length = max_context_item_length
        self.max_batch_size = max_batch_size
        self.entailment_threshold = entailment_threshold
        self.coverage_threshold = coverage_threshold
        self.contradiction_threshold = contradiction_threshold
        self.max_sentences = max_sentences
        self.compute_evidence_map = compute_evidence_map
        self.evidence_entailment_threshold = evidence_entailment_threshold
        self.evidence_contradiction_threshold = evidence_contradiction_threshold

    def _normalise_context(self, context, chunk_sources):
        """Truncate and clean context chunks, keeping source labels aligned.

        Args:
            context: Sequence of context chunks, or ``None``.
            chunk_sources: Per-chunk source labels parallel to ``context``,
                or ``None`` when no labels are available.

        Returns:
            ``(chunks, sources)``. ``chunks`` is a list of non-blank strings
            limited to ``max_context_items`` items of at most
            ``max_context_item_length`` characters each. ``sources`` is the
            label list for exactly those chunks (``None`` if no labels were
            given). Both are ``None`` when no usable chunk remains.
        """
        if context is None:
            return None, None

        chunks = context[: self.max_context_items]
        if chunk_sources is None:
            labels = [None] * len(chunks)
        else:
            labels = list(chunk_sources[: self.max_context_items])
            # Pad defensively so zip() can never drop a chunk.
            labels.extend([None] * (len(chunks) - len(labels)))

        # Filter chunk and label together: dropping a blank chunk without
        # dropping its label would shift every later label onto the wrong
        # chunk in the evidence map.
        kept = [
            (str(chunk)[: self.max_context_item_length], label)
            for chunk, label in zip(chunks, labels)
            if chunk is not None and str(chunk).strip()
        ]

        # Empty / whitespace-only context is equivalent to no context:
        # groundedness cannot be computed (spec: treat "", " ", [], None
        # identically).
        if not kept:
            return None, None

        cleaned = [chunk for chunk, _ in kept]
        aligned = [label for _, label in kept] if chunk_sources is not None else None
        return cleaned, aligned

    def score(
        self,
        query: str,
        response: str,
        context: "ContextPayload | str | list[str] | None" = None,
    ) -> EntailmentResult:
        """Score a single LLM response.

        Inputs are silently truncated to the configured length limits before
        processing. Non-string context items are coerced to ``str``; ``None``
        and blank items are dropped.

        Args:
            query: The user's query/question.
            response: The LLM-generated response.
            context: Grounding context. Accepts:

                - :class:`~scroot.context.ContextPayload` - built by
                  :class:`~scroot.ContextBuilder`. Consumed here: the
                  assembled chunks feed the NLI scorer locally and the
                  payload is not retained. Only audit metadata
                  (``session_id``, ``checksum``, ``total_tokens``,
                  ``was_truncated``, ``pii_scrubbed``) flows into
                  ``details["context"]`` for the audit trail.
                - ``str`` - a single grounding string.
                - ``list[str]`` - source context chunks.
                - ``None`` - groundedness is skipped entirely.

                Context that is empty or contains only blank chunks is
                treated the same as ``None``.

        Returns:
            :class:`EntailmentResult` with all metric scores and flags.
            ``result.iqs`` is computed as ``IQS = n / sum(w_i / s_i)``
            (the weighted harmonic mean of the five metrics, where
            ``n = sum(w_i)``) by default; see ``iqs_mode``.
        """
        context_audit: dict | None = None
        chunk_sources: list[str | None] | None = None
        if isinstance(context, ContextPayload):
            payload = context
            # Per-source chunks preserve top-k retrieval behaviour in the
            # groundedness scorer; the payload itself is consumed here.
            context = [e.content for e in payload.sources] or None
            chunk_sources = [e.source for e in payload.sources] or None
            context_audit = {
                "session_id": payload.session_id,
                "checksum": payload.checksum,
                "total_tokens": payload.total_tokens,
                "was_truncated": payload.was_truncated,
                "pii_scrubbed": payload.pii_scrubbed,
            }
        elif isinstance(context, str):
            context = [context]

        query = query[: self.max_query_length]
        response = response[: self.max_response_length]
        context, chunk_sources = self._normalise_context(context, chunk_sources)

        details = {}
        if context_audit is not None:
            details["context"] = context_audit

        if context is not None:
            try:
                groundedness, g_details = score_groundedness(
                    response, context,
                    nli_model=self.nli_model,
                    embedding_model=self.embedding_model,
                    device=self.device,
                    entailment_threshold=self.entailment_threshold,
                    atomic_claims=self.atomic_claims,
                    similarity_fallback=self.similarity_fallback,
                    similarity_threshold=self.similarity_threshold,
                    top_k_chunks=self.top_k_chunks,
                )
                details["groundedness"] = g_details
            except Exception as e:
                # Context was provided but groundedness scoring failed
                # unexpectedly. Degrade gracefully: exclude groundedness from
                # IQS rather than failing the whole call.
                logger.error("Groundedness computation failed: %s", e)
                groundedness = None
                warnings.warn(
                    "Groundedness computation failed due to an unexpected "
                    "error. IQS will be computed from the remaining metrics. "
                    f"Error: {e}",
                    GroundednessComputationError,
                    stacklevel=2,
                )
        else:
            groundedness = None
            # Encourage adding context - but stay silent if the caller has
            # explicitly opted out by zeroing the groundedness weight.
            ground_weight = (self.weights or DEFAULT_WEIGHTS).get(
                "groundedness", DEFAULT_WEIGHTS["groundedness"]
            )
            if ground_weight > 0.0:
                warnings.warn(
                    "auditor.score() called without context. groundedness "
                    "will be None and is excluded from IQS (the remaining "
                    "metrics' weights are redistributed). To score "
                    "groundedness, pass context= or use ContextBuilder.",
                    NoContextWarning,
                    stacklevel=2,
                )

        evidence_map = None
        if context is not None and self.compute_evidence_map:
            evidence_map = build_evidence_map(
                response, context,
                nli_model=self.nli_model,
                embedding_model=self.embedding_model,
                device=self.device,
                entailment_threshold=self.evidence_entailment_threshold,
                contradiction_threshold=self.evidence_contradiction_threshold,
                top_k_chunks=self.top_k_chunks,
                chunk_sources=chunk_sources,
                atomic_claims=self.atomic_claims,
            )

        completeness, c_details = score_completeness(
            query, response,
            embedding_model=self.embedding_model,
            nli_model=self.nli_model if self.nli_completeness else None,
            device=self.device,
            coverage_threshold=self.coverage_threshold,
        )
        details["completeness"] = c_details

        relevance, r_details = score_relevance(
            query, response,
            embedding_model=self.embedding_model,
            device=self.device,
        )
        details["relevance"] = r_details

        consistency, cons_details = score_consistency(
            response,
            nli_model=self.nli_model,
            device=self.device,
            contradiction_threshold=self.contradiction_threshold,
            max_sentences=self.max_sentences,
            bidirectional=self.bidirectional_consistency,
        )
        details["consistency"] = cons_details

        confidence, conf_details = score_confidence(response)
        details["confidence"] = conf_details

        # Groundedness only takes part in IQS when it was actually computed.
        iqs_scores: dict = {
            "completeness": completeness,
            "relevance": relevance,
            "consistency": consistency,
            "confidence": confidence,
        }
        if groundedness is not None:
            iqs_scores["groundedness"] = groundedness
        iqs, effective_weights = compute_iqs_detailed(
            iqs_scores, weights=self.weights, mode=self.iqs_mode,
        )

        flags = detect_flags(
            groundedness, completeness, relevance,
            consistency, confidence,
        )

        return EntailmentResult(
            groundedness=groundedness,
            completeness=completeness,
            relevance=relevance,
            consistency=consistency,
            confidence=confidence,
            iqs=iqs,
            flags=flags,
            details=details,
            evidence_map=evidence_map,
            effective_weights=effective_weights,
            context_used=(groundedness is not None),
            iqs_metric_count=len(effective_weights),
        )

    def score_batch(
        self,
        items: list[dict],
    ) -> list[EntailmentResult]:
        """Score a batch of responses.

        Items are scored one after another by calling :meth:`score`.

        Args:
            items: List of dicts with keys ``"query"``, ``"response"``,
                and optionally ``"context"`` (anything :meth:`score`
                accepts; a missing key means no context).

        Returns:
            List of :class:`EntailmentResult`, one per item, in order.

        Raises:
            ValueError: If ``len(items)`` exceeds ``max_batch_size`` (H-3).
        """
        if len(items) > self.max_batch_size:
            raise ValueError(
                f"Batch size {len(items)} exceeds max_batch_size={self.max_batch_size}. "
                f"Split into smaller batches or increase max_batch_size."
            )
        return [
            self.score(
                query=item["query"],
                response=item["response"],
                context=item.get("context"),
            )
            for item in items
        ]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Corrector provider factory."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from scroot.corrector.base import BaseCorrector
|
|
5
|
+
from scroot.corrector.disabled import NullCorrector
|
|
6
|
+
|
|
7
|
+
# The corrector created by the most recent get_corrector() call; kept so
# that a loaded local model can be released when the configuration changes.
_active_corrector: BaseCorrector | None = None


def get_corrector(config) -> BaseCorrector:
    """
    Return the active corrector for the current config.

    A fresh corrector is built on every call according to ``config.mode``:
    ``"local"`` -> ``LocalLLMCorrector(config.local)``, ``"api"`` ->
    ``APICorrector(config.api)``, ``"disabled"`` or any unrecognised mode ->
    ``NullCorrector()``.

    Unloads LocalLLMCorrector from RAM when switching away from local mode,
    and when switching to a different local model id.
    """
    global _active_corrector

    mode = config.mode
    previous = _active_corrector

    if mode == "local":
        from scroot.corrector.local import LocalLLMCorrector
        # Only release the old model when a different one is requested.
        if (
            isinstance(previous, LocalLLMCorrector)
            and previous.model_spec.id != config.local.model_id
        ):
            previous.unload()
        _active_corrector = LocalLLMCorrector(config.local)
        return _active_corrector

    # Leaving local mode: release the loaded model. The local module is
    # looked up in sys.modules rather than imported, so a disabled/api
    # configuration never pulls in the local backend. If it was never
    # imported, the previous corrector cannot be a LocalLLMCorrector.
    import sys

    local_module = sys.modules.get("scroot.corrector.local")
    local_cls = getattr(local_module, "LocalLLMCorrector", None)
    if local_cls is not None and isinstance(previous, local_cls):
        previous.unload()

    if mode == "api":
        from scroot.corrector.api import APICorrector
        _active_corrector = APICorrector(config.api)
    else:
        # "disabled" and any unknown mode both fall back to the no-op corrector.
        _active_corrector = NullCorrector()

    return _active_corrector
|
scroot/corrector/api.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""APICorrector - OpenAI-compatible endpoint, provider auto-detected from key.
|
|
2
|
+
|
|
3
|
+
Design rationale: why ``api_key`` alone is not enough
|
|
4
|
+
-----------------------------------------------------
|
|
5
|
+
|
|
6
|
+
A common assumption is that an API key fully identifies an LLM connection, so
|
|
7
|
+
``model`` and ``base_url`` should be unnecessary. They are not. An LLM request is::
|
|
8
|
+
|
|
9
|
+
POST {base_url}/chat/completions
|
|
10
|
+
Headers: {auth_header}: {api_key}
|
|
11
|
+
Body: {"model": "<name>", "messages": [...]}
|
|
12
|
+
|
|
13
|
+
The key only fills the **auth header** - it proves *who you are*. It does not
|
|
14
|
+
carry the two other things every request needs:
|
|
15
|
+
|
|
16
|
+
1. **Where to send it (``base_url``).** Each provider has a different endpoint
|
|
17
|
+
and even a different auth-header name (Anthropic uses ``x-api-key``; OpenAI
|
|
18
|
+
uses ``Authorization: Bearer``). This *is* derivable from the key, because the
|
|
19
|
+
key prefix is provider-specific - that is exactly what ``detect_provider``
|
|
20
|
+
does (``sk-ant-`` -> Anthropic, ``AIza`` -> Gemini, ``sk-`` -> OpenAI, else
|
|
21
|
+
OpenRouter). So ``base_url`` can stay optional/advanced: leave it blank for the
|
|
22
|
+
four known providers, set it only for Groq / OpenRouter / a custom gateway.
|
|
23
|
+
|
|
24
|
+
2. **Which model to run (``model``).** This is a *mandatory* field in the
|
|
25
|
+
request body and it is **not derivable from anything**. The key is tied to an
|
|
26
|
+
*account*, not a model: the same Anthropic key calls Opus, Sonnet, and Haiku.
|
|
27
|
+
There is no way to infer "the user wants Haiku" from the key, the endpoint, or
|
|
28
|
+
the header. The specific model is a **decision** (a cost/quality trade-off),
|
|
29
|
+
not data - so it cannot be auto-detected the way ``base_url`` can. It can only
|
|
30
|
+
be (a) defaulted to an opinionated pick, or (b) chosen by the user.
|
|
31
|
+
|
|
32
|
+
Consequence for the architecture / UI:
|
|
33
|
+
|
|
34
|
+
* ``base_url`` - safe to hide behind "Advanced"; auto-detected from the key.
|
|
35
|
+
* ``model`` - must remain a real (optional) field. Today it defaults to
|
|
36
|
+
``gpt-4o-mini`` (see ``draft_correction``), which is only correct for OpenAI;
|
|
37
|
+
a non-OpenAI key with a blank model will send ``gpt-4o-mini`` to the wrong
|
|
38
|
+
provider and fail. The intended improvement is a **per-provider default map**
|
|
39
|
+
(OpenAI -> ``gpt-4o-mini``, Anthropic -> ``claude-haiku-4-5``, Gemini ->
|
|
40
|
+
``gemini-2.0-flash``), deliberately the cheap/fast tier since this is response
|
|
41
|
+
correction, not frontier reasoning - so "paste key, leave model blank" works
|
|
42
|
+
for every provider while power users can still override.
|
|
43
|
+
|
|
44
|
+
See also: ``validate_base_url`` in ``scroot.dashboard.security`` (M-2), which
|
|
45
|
+
restricts ``base_url`` to allowlisted provider hosts to prevent key exfiltration
|
|
46
|
+
and SSRF.
|
|
47
|
+
"""
|
|
48
|
+
from __future__ import annotations
|
|
49
|
+
|
|
50
|
+
from scroot.corrector.base import BaseCorrector
|
|
51
|
+
|
|
52
|
+
_OPENROUTER_BASE = "https://openrouter.ai/api/v1"
_OPENAI_BASE = "https://api.openai.com/v1"

# Provider-specific key prefixes, checked before the generic "sk-" rule.
_KEY_PREFIX_MAP = {
    "sk-ant-": {
        "base_url": "https://api.anthropic.com/v1",
        "auth_header": "x-api-key",
        "provider_name": "Anthropic",
    },
    # OpenRouter keys start with "sk-or-". Without this entry they match
    # the generic "sk-" rule and the key is sent to OpenAI's endpoint.
    "sk-or-": {
        "base_url": _OPENROUTER_BASE,
        "auth_header": "Authorization",
        "provider_name": "OpenRouter",
    },
    "AIza": {
        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai",
        "auth_header": "Authorization",
        "provider_name": "Google Gemini",
    },
}


def detect_provider(api_key: str, base_url_override: str = "") -> tuple[str, str, str]:
    """Returns (base_url, auth_header, provider_name).

    Args:
        api_key: Provider API key; only its prefix is inspected.
        base_url_override: Explicit endpoint. When non-empty it is returned
            unchanged with the ``Authorization`` header, and the key prefix
            is ignored. The provider name is then a display label guessed
            from a substring of the URL (``"Custom"`` if none matches).

    Returns:
        The endpoint base URL, the name of the HTTP header that carries the
        key, and a human-readable provider name. Keys with no recognised
        prefix fall back to OpenRouter.
    """
    if base_url_override:
        name = "Custom"
        if "groq" in base_url_override:
            name = "Groq"
        elif "openrouter" in base_url_override:
            name = "OpenRouter"
        elif "anthropic" in base_url_override:
            name = "Anthropic"
        return base_url_override, "Authorization", name
    for prefix, cfg in _KEY_PREFIX_MAP.items():
        if api_key.startswith(prefix):
            return cfg["base_url"], cfg["auth_header"], cfg["provider_name"]
    # Generic "sk-" comes last so the more specific prefixes above win.
    if api_key.startswith("sk-"):
        return _OPENAI_BASE, "Authorization", "OpenAI"
    return _OPENROUTER_BASE, "Authorization", "OpenRouter"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class APICorrector(BaseCorrector):
    """Corrector that drafts a rewrite via a chat-completions HTTP endpoint.

    Reads ``api_key``, ``base_url``, ``model`` and ``system_prompt`` from the
    config object it is given. The endpoint and auth header are resolved
    per call with ``detect_provider``.
    """

    def __init__(self, config) -> None:
        self._config = config

    @property
    def is_available(self) -> bool:
        """True when an API key is configured."""
        return bool(self._config.api_key)

    def draft_correction(
        self,
        query: str,
        response: str,
        context: str | None,
    ) -> str:
        """Ask the configured model to rewrite *response*.

        Args:
            query: The original user query.
            response: The response to be corrected.
            context: Optional grounding context included in the prompt.

        Returns:
            The first choice's message content, stripped of surrounding
            whitespace.

        Raises:
            RuntimeError: If ``httpx`` is not installed.

        The call to ``validate_base_url`` presumably rejects disallowed
        endpoints by raising - confirm against ``scroot.dashboard.security``.
        HTTP error statuses propagate from ``raise_for_status()``.
        """
        try:
            import httpx
        except ImportError as exc:
            # Chain explicitly so the missing-module cause stays visible.
            raise RuntimeError(
                "httpx is not installed. Run: pip install 'scroot[api]'"
            ) from exc

        base_url, auth_header, _ = detect_provider(
            self._config.api_key, self._config.base_url
        )
        # M-2: refuse to send the API key to an unvetted/internal endpoint.
        from scroot.dashboard.security import validate_base_url
        validate_base_url(base_url)
        headers = {
            "Content-Type": "application/json",
            # "x-api-key" carries the raw key; every other header uses the
            # Bearer scheme.
            auth_header: (
                self._config.api_key
                if auth_header == "x-api-key"
                else f"Bearer {self._config.api_key}"
            ),
        }
        payload = {
            "model": self._config.model or "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": self._config.system_prompt},
                {"role": "user", "content": self._build_prompt(query, response, context)},
            ],
            "max_tokens": 512,
            "temperature": 0.3,
        }
        # A user-supplied base_url may end with "/"; strip it so the request
        # path does not become ".../v1//chat/completions".
        endpoint = base_url.rstrip("/") + "/chat/completions"
        resp = httpx.post(
            endpoint,
            json=payload,
            headers=headers,
            timeout=30.0,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"].strip()

    def _build_prompt(self, query: str, response: str, context: str | None) -> str:
        """Assemble the user message: query, original response, optional context, instruction."""
        parts = [f"Query:\n{query}", f"\nOriginal response:\n{response}"]
        if context:
            parts.append(f"\nContext:\n{context}")
        parts.append("\nRewrite the response to be more accurate and complete.")
        return "\n".join(parts)
|