scroot 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scroot/__init__.py +109 -0
- scroot/agents.py +345 -0
- scroot/audit.py +131 -0
- scroot/cli/__init__.py +167 -0
- scroot/cli/download.py +49 -0
- scroot/cli/eval.py +230 -0
- scroot/cli/model_info.py +28 -0
- scroot/composite.py +170 -0
- scroot/config/__init__.py +0 -0
- scroot/config/corrector.py +92 -0
- scroot/connectors/__init__.py +5 -0
- scroot/connectors/database.py +357 -0
- scroot/context/__init__.py +9 -0
- scroot/context/adapters.py +86 -0
- scroot/context/builder.py +514 -0
- scroot/context/dedup.py +99 -0
- scroot/context/payload.py +66 -0
- scroot/context/pii.py +101 -0
- scroot/context/tokenizer.py +42 -0
- scroot/core.py +349 -0
- scroot/corrector/__init__.py +38 -0
- scroot/corrector/api.py +145 -0
- scroot/corrector/base.py +20 -0
- scroot/corrector/disabled.py +13 -0
- scroot/corrector/local.py +112 -0
- scroot/corrector/models.py +69 -0
- scroot/dashboard/__init__.py +0 -0
- scroot/dashboard/__main__.py +37 -0
- scroot/dashboard/routers/__init__.py +0 -0
- scroot/dashboard/routers/analytics.py +236 -0
- scroot/dashboard/routers/corrector.py +230 -0
- scroot/dashboard/routers/export.py +150 -0
- scroot/dashboard/routers/guardrails.py +41 -0
- scroot/dashboard/routers/pipeline.py +218 -0
- scroot/dashboard/routers/queue.py +188 -0
- scroot/dashboard/routers/records.py +252 -0
- scroot/dashboard/routers/settings.py +291 -0
- scroot/dashboard/security.py +135 -0
- scroot/dashboard/server.py +181 -0
- scroot/evidence.py +228 -0
- scroot/exceptions.py +62 -0
- scroot/feedback/__init__.py +6 -0
- scroot/feedback/injector.py +160 -0
- scroot/feedback/sanitizer.py +56 -0
- scroot/feedback/store.py +650 -0
- scroot/flags.py +42 -0
- scroot/metrics/__init__.py +15 -0
- scroot/metrics/_utils.py +9 -0
- scroot/metrics/completeness.py +139 -0
- scroot/metrics/confidence.py +83 -0
- scroot/metrics/consistency.py +125 -0
- scroot/metrics/groundedness.py +193 -0
- scroot/metrics/relevance.py +73 -0
- scroot/models.py +214 -0
- scroot/result.py +276 -0
- scroot/sampling.py +306 -0
- scroot/text_utils.py +136 -0
- scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
- scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
- scroot/ui/dist/favicon.svg +27 -0
- scroot/ui/dist/index.html +20 -0
- scroot-0.2.0.dist-info/METADATA +832 -0
- scroot-0.2.0.dist-info/RECORD +67 -0
- scroot-0.2.0.dist-info/WHEEL +5 -0
- scroot-0.2.0.dist-info/entry_points.txt +2 -0
- scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
- scroot-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
"""ContextBuilder - request-scoped context accumulator.
|
|
2
|
+
|
|
3
|
+
Carries grounding documents through a multi-step RAG or agentic pipeline
|
|
4
|
+
and delivers them to ``auditor.score()`` intact, without restructuring
|
|
5
|
+
the client's code.
|
|
6
|
+
|
|
7
|
+
SOC 2 posture: content is held in memory only, PII-scrubbed by default,
|
|
8
|
+
and never written to disk. Audit events are content-free. Only
|
|
9
|
+
floating-point scores cross tier boundaries.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
import os
|
|
16
|
+
import uuid
|
|
17
|
+
import warnings
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from .. import audit
|
|
22
|
+
from ..exceptions import (
|
|
23
|
+
ContextAssemblyWarning,
|
|
24
|
+
ContextEmptyWarning,
|
|
25
|
+
ContextSealedError,
|
|
26
|
+
ContextTooLargeWarning,
|
|
27
|
+
SecurityWarning,
|
|
28
|
+
)
|
|
29
|
+
from .adapters import extract_text
|
|
30
|
+
from .dedup import deduplicate
|
|
31
|
+
from .payload import ContextEntry, ContextPayload
|
|
32
|
+
from .pii import scrub
|
|
33
|
+
from .tokenizer import count_tokens
|
|
34
|
+
|
|
35
|
+
# Per-source authority weight assigned to each ContextEntry. build() sorts
# entries by this weight (descending), so higher-weighted sources are kept
# first when the token budget runs out. Labels not listed here fall back
# to 0.60 at the lookup site.
_SOURCE_WEIGHTS: dict[str, float] = {
    "reranker": 1.0,
    "retrieval": 0.85,
    "tool_output": 0.70,
    "system_prompt": 0.50,
    "query": 0.30,
    "custom": 0.60,
}

# Chunks longer than this are cut and suffixed with " [TRUNCATED]".
_MAX_CHUNK_CHARS = 50_000
# Chunks beyond this count in a single add_*() call are dropped with a warning.
_MAX_CHUNKS_PER_CALL = 500
# Longer session ids are rejected with ValueError by the constructor.
_MAX_SESSION_ID_LEN = 128
# Metadata dicts with more keys than this are rejected with ValueError.
_MAX_METADATA_KEYS = 20
# String metadata values longer than this are rejected with ValueError.
_MAX_METADATA_VALUE_CHARS = 1_000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ContextBuilder:
    """Accumulates grounding context across a multi-step LLM pipeline.

    Create one per request, add grounding material as it becomes
    available at each pipeline step, and pass ``ctx.build()`` to
    ``auditor.score(context=...)`` at the end. The client's LLM call is
    never touched.

    Example:
        >>> import scroot
        >>> ctx = scroot.ContextBuilder()
        >>> ctx.add_query(user_query)
        >>> ctx.add_retrieved(retriever.search(user_query))
        >>> result = auditor.score(query, response, context=ctx.build())

    Args:
        session_id: Ties this context to a trace; auto-generated
            (``cb-<uuid4>``) if omitted or empty. Max 128 chars.
        max_tokens: Hard ceiling on assembled context size. build()
            drops entries that do not fit (walking from the highest
            source weight down) and emits ContextTooLargeWarning when
            anything was dropped. Default 4096.
        pii_scrub: Run PII detection before storing each addition
            (default True). Detected entities are replaced with typed
            placeholders ([EMAIL], [PHONE], [SECRET], ...). The audit
            trail records counts only, never the original values.
            Disabling in production (SCROOT_ENV=production) emits a
            SecurityWarning.
        dedup: Deduplicate overlapping chunk content on build() using
            a similarity threshold of 0.92 (default True).
        encryption_key: Fernet key for encrypting context at rest if a
            session store is configured. With the default None, content
            is held in memory only - nothing is written to disk, so no
            encryption is needed.

    Raises:
        ValueError: If ``session_id`` is longer than 128 characters.
        ImportError: If ``encryption_key`` is given but the
            ``cryptography`` package is not installed.

    SOC 2: content is held in memory only, PII-scrubbed by default,
    never written to disk unless encryption_key is provided.
    """

    def __init__(
        self,
        session_id: str | None = None,
        max_tokens: int = 4096,
        pii_scrub: bool = True,
        dedup: bool = True,
        encryption_key: bytes | None = None,
    ) -> None:
        if session_id is not None and len(session_id) > _MAX_SESSION_ID_LEN:
            raise ValueError(
                f"session_id exceeds {_MAX_SESSION_ID_LEN} chars."
            )
        if not pii_scrub and os.environ.get("SCROOT_ENV") == "production":
            warnings.warn(
                "pii_scrub=False with SCROOT_ENV=production. "
                "PII in context content will not be redacted.",
                SecurityWarning,
                stacklevel=2,
            )
        if encryption_key is not None:
            try:
                from cryptography.fernet import Fernet
                Fernet(encryption_key)  # validates the key format
            except ImportError as exc:
                raise ImportError(
                    "encryption_key requires the cryptography package: "
                    "pip install 'scroot[security]'"
                ) from exc

        self._session_id = session_id or f"cb-{uuid.uuid4()}"
        self._max_tokens = max_tokens
        self._pii_scrub = pii_scrub
        self._dedup = dedup
        self._encryption_key = encryption_key
        self._entries: list[ContextEntry] = []
        self._sealed = False
        self._built_at: datetime | None = None

    @property
    def session_id(self) -> str:
        """The trace identifier for this builder."""
        return self._session_id

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _guard_sealed(self) -> None:
        """Raise ContextSealedError if build() has already sealed this builder."""
        if self._sealed:
            raise ContextSealedError(
                "ContextBuilder has been sealed by build(). "
                "Create a new ContextBuilder for each request."
            )

    @staticmethod
    def _validate_metadata(metadata: dict) -> None:
        """Enforce the metadata size limits.

        Only the number of keys and the length of ``str`` values are
        checked; non-string values are accepted as-is.

        Raises:
            ValueError: If there are more than 20 keys, or a string
                value is longer than 1,000 characters.
        """
        if len(metadata) > _MAX_METADATA_KEYS:
            raise ValueError(
                f"metadata exceeds {_MAX_METADATA_KEYS} keys."
            )
        for key, value in metadata.items():
            if isinstance(value, str) and len(value) > _MAX_METADATA_VALUE_CHARS:
                raise ValueError(
                    f"metadata value for {key!r} exceeds "
                    f"{_MAX_METADATA_VALUE_CHARS} chars."
                )

    def _process_text(
        self, text: str, source: str, metadata: dict
    ) -> ContextEntry | None:
        """Turn one extracted text chunk into a ContextEntry.

        Empty or whitespace-only text yields None. Over-long text is
        truncated, then PII-scrubbed when scrubbing is enabled.

        Args:
            text: Text extracted from a chunk.
            source: Source label the chunk was added under.
            metadata: Already-validated caller metadata.

        Returns:
            The new entry, or None if ``text`` had no content.
        """
        if not text or not text.strip():
            return None

        if len(text) > _MAX_CHUNK_CHARS:
            text = text[:_MAX_CHUNK_CHARS] + " [TRUNCATED]"
            # stacklevel=4: skip _process_text, _add_chunks and the public
            # add_*() method so the warning points at the caller's code.
            warnings.warn(
                f"Chunk from source '{source}' exceeded "
                f"{_MAX_CHUNK_CHARS:,} chars and was truncated.",
                ContextAssemblyWarning,
                stacklevel=4,
            )

        scrub_summary: dict = {}
        was_scrubbed = False
        if self._pii_scrub:
            try:
                result = scrub(text)
                text = result.scrubbed_text
                scrub_summary = result.summary
                was_scrubbed = result.was_scrubbed
            except Exception:
                # Deliberate fail-open: a scrubber failure must not break
                # the caller's pipeline, so the text is kept unscrubbed
                # and the caller is warned.
                warnings.warn(
                    "PII scrubber failed; content passed through unscrubbed.",
                    ContextAssemblyWarning,
                    stacklevel=4,
                )

        return ContextEntry(
            source=source,
            content=text,
            added_at=datetime.now(timezone.utc),
            # Shallow-copy so every entry owns its metadata: the caller's
            # dict (shared by all chunks of one call) can be mutated later
            # without rewriting stored entries or bypassing validation.
            metadata=dict(metadata),
            source_weight=_SOURCE_WEIGHTS.get(source, 0.60),
            token_count=count_tokens(text),
            was_scrubbed=was_scrubbed,
            scrub_summary=scrub_summary,
        )

    def _add_chunks(
        self, chunks: Any, source: str, metadata: dict
    ) -> "ContextBuilder":
        """Shared implementation behind every public add_*() method.

        Normalises ``chunks`` to a list, caps it at 500 items, extracts
        and processes text from each item, stores the resulting entries
        and emits one content-free ``context_entry_added`` audit event.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` breaks the size limits.
        """
        self._guard_sealed()
        self._validate_metadata(metadata)

        # A str, a dict, or any non-iterable is a single chunk; everything
        # else is treated as an iterable of chunks.
        if isinstance(chunks, str):
            chunks = [chunks]
        elif isinstance(chunks, dict) or not hasattr(chunks, '__iter__'):
            chunks = [chunks]

        chunks = list(chunks)
        if len(chunks) > _MAX_CHUNKS_PER_CALL:
            warnings.warn(
                f"Received {len(chunks)} chunks for source '{source}'; "
                f"only the first {_MAX_CHUNKS_PER_CALL} will be used.",
                ContextAssemblyWarning,
                stacklevel=3,
            )
            chunks = chunks[:_MAX_CHUNKS_PER_CALL]

        added: list[ContextEntry] = []
        for chunk in chunks:
            text = extract_text(chunk)
            if text is None:
                warnings.warn(
                    f"Could not extract text from chunk of type "
                    f"'{type(chunk).__name__}' in source '{source}'. Skipped.",
                    ContextAssemblyWarning,
                    stacklevel=3,
                )
                continue
            entry = self._process_text(text, source, metadata)
            if entry is not None:
                self._entries.append(entry)
                added.append(entry)

        if added:
            # Aggregate non-zero scrub counts for the audit event.
            scrub_totals: dict[str, int] = {}
            for entry in added:
                for k, v in entry.scrub_summary.items():
                    if v:
                        scrub_totals[k] = scrub_totals.get(k, 0) + v
            audit.emit(
                "context_entry_added",
                session_id=self._session_id,
                source=source,
                token_count=sum(e.token_count for e in added),
                chunk_count=len(added),
                pii_scrubbed=any(e.was_scrubbed for e in added),
                scrub_summary=scrub_totals,
            )
        return self

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def add_query(
        self, text: str, *, metadata: dict | None = None
    ) -> "ContextBuilder":
        """Record the user's query. Call first, before retrieval.

        Calling more than once appends another timestamped query entry -
        useful for multi-turn conversations where the query evolves.

        Args:
            text: The user's query. Plain string only.
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` breaks the size limits.
        """
        return self._add_chunks(text, "query", metadata or {})

    def add_retrieved(
        self,
        chunks: Any,
        *,
        source: str = "retrieval",
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record retrieved documents for groundedness scoring.

        Call this immediately after your retrieval step, before any
        reranking or LLM call. This is the most important method - it's
        what gives groundedness its signal.

        Args:
            chunks: Retrieved documents. See supported types below.
            source: Label for this retrieval source. Used in audit logs
                and dashboard provenance display. Defaults to
                "retrieval". Use descriptive names for multi-source
                pipelines: "pinecone", "web_search", "internal_db".
                Labels without a predefined weight are weighted 0.60.
            metadata: Optional dict for additional context. Stored in
                audit log only - not used in scoring. Max 20 keys.

        Returns:
            self, for method chaining.

        Supported chunk types:
            - str: treated as a single chunk
            - list[str]: each string is a chunk
            - list[Document]: LangChain Documents (page_content extracted)
            - list[dict]: dicts with 'text', 'content', or 'page_content' key
            - QueryResult: ChromaDB result objects
            - list[ScoredVector]: Pinecone results (metadata['text'] extracted)

        Warns:
            ContextAssemblyWarning: If a chunk type is unrecognised
                (skipped, not raised - pipeline continues), or if more
                than 500 chunks are passed (excess dropped).

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` breaks the size limits.

        Example:
            >>> ctx = ContextBuilder()
            >>> ctx.add_query("What is the refund policy?")
            >>> docs = retriever.get_relevant_documents(query)
            >>> ctx.add_retrieved(docs)
            >>> result = auditor.score(query, response, context=ctx.build())
        """
        return self._add_chunks(chunks, source, metadata or {})

    def add_reranked(
        self,
        chunks: Any,
        *,
        source: str = "reranker",
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record post-reranking documents. Higher weight than raw retrieved.

        Reranked chunks carry higher weight in groundedness scoring than
        raw retrieved chunks, because they represent what the LLM
        actually used. Same accepted types as :meth:`add_retrieved`.

        Args:
            chunks: Post-reranking documents.
            source: Source label, defaults to "reranker". The higher
                weight is tied to this label; a different label gets
                that label's weight instead.
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` breaks the size limits.
        """
        return self._add_chunks(chunks, source, metadata or {})

    def add_system_prompt(
        self, text: str, *, metadata: dict | None = None
    ) -> "ContextBuilder":
        """Record the system prompt used in the LLM call.

        Included in groundedness scoring with lower weight than retrieved
        chunks - it's instructions, not facts.

        Args:
            text: The system prompt text.
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` breaks the size limits.
        """
        return self._add_chunks(text, "system_prompt", metadata or {})

    def add_tool_output(
        self,
        output: str | list[str],
        *,
        tool_name: str,
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record a tool call output (DB query result, API response, etc.).

        Args:
            output: Tool output text, or a list of output strings.
            tool_name: Name of the tool that produced the output.
                Stored in the entry metadata under the ``"tool_name"``
                key, replacing any caller-supplied value for that key.
            metadata: Optional dict, audit-trail only. Max 20 keys
                *including* the added ``"tool_name"`` key.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If the merged metadata breaks the size limits.
        """
        meta = {**(metadata or {}), "tool_name": tool_name}
        return self._add_chunks(output, "tool_output", meta)

    def snapshot(self) -> dict:
        """Return current state without building. For debugging/logging.

        Returns:
            Dict with session_id, sealed flag, source labels, entry and
            token counts, and whether PII scrubbing is enabled. Contains
            no content text.
        """
        return {
            "session_id": self._session_id,
            "sealed": self._sealed,
            "sources": [e.source for e in self._entries],
            "total_entries": len(self._entries),
            "total_tokens": sum(e.token_count for e in self._entries),
            "pii_scrub_enabled": self._pii_scrub,
        }

    def reset(self) -> "ContextBuilder":
        """Clear all entries and unseal. Prefer a new instance per request.

        The session id and configuration are kept.

        Returns:
            self, for method chaining.
        """
        self._entries.clear()
        self._sealed = False
        self._built_at = None
        return self

    def build(self) -> ContextPayload | None:
        """Assemble all context into a ContextPayload for auditor.score().

        Seals the builder - no further additions after this call.

        Assembly steps: sort entries by source weight
        (reranked > retrieved > tool_output > system_prompt > query),
        deduplicate near-identical chunks if dedup=True, then fit the
        entries into max_tokens in that order. An entry that does not
        fit in the remaining budget is dropped; smaller entries after
        it may still be kept.

        Returns:
            ContextPayload, or None if nothing was added (groundedness
            will score as None with a warning - not a crash).

        Warns:
            ContextEmptyWarning: If no content was added.
            ContextTooLargeWarning: If max_tokens forced truncation.
        """
        self._sealed = True
        self._built_at = datetime.now(timezone.utc)

        if not self._entries:
            warnings.warn(
                "ContextBuilder.build() called with no content. "
                "Groundedness will be None. "
                "Call add_retrieved() before build() for full scoring.",
                ContextEmptyWarning,
                stacklevel=2,
            )
            return None

        # sorted() is stable, so entries with equal weight keep their
        # insertion order.
        sorted_entries = sorted(
            self._entries, key=lambda e: e.source_weight, reverse=True
        )

        if self._dedup:
            sorted_entries = deduplicate(sorted_entries, threshold=0.92)

        kept: list[ContextEntry] = []
        budget = self._max_tokens
        was_truncated = False
        for entry in sorted_entries:
            if entry.token_count <= budget:
                kept.append(entry)
                budget -= entry.token_count
            else:
                was_truncated = True

        if was_truncated:
            warnings.warn(
                f"Context exceeded max_tokens={self._max_tokens}. "
                "Lower-priority sources were dropped. "
                "Increase max_tokens if groundedness scores seem low.",
                ContextTooLargeWarning,
                stacklevel=2,
            )

        assembled = "\n\n---\n\n".join(e.content for e in kept)

        # Checksum of the final (scrubbed) text; it is also sent in the
        # audit event so the payload can be matched to it.
        checksum = "sha256:" + hashlib.sha256(
            assembled.encode("utf-8")
        ).hexdigest()

        scrub_summary: dict[str, int] = {}
        pii_scrubbed = False
        for entry in kept:
            if entry.was_scrubbed:
                pii_scrubbed = True
            for k, v in entry.scrub_summary.items():
                scrub_summary[k] = scrub_summary.get(k, 0) + v

        payload = ContextPayload(
            session_id=self._session_id,
            sources=kept,
            assembled_text=assembled,
            total_tokens=sum(e.token_count for e in kept),
            was_truncated=was_truncated,
            pii_scrubbed=pii_scrubbed,
            scrub_summary=scrub_summary,
            built_at=self._built_at,
            checksum=checksum,
        )

        # The audit event carries counts, labels and the checksum only.
        audit.emit(
            "context_built",
            session_id=self._session_id,
            total_tokens=payload.total_tokens,
            was_truncated=was_truncated,
            sources_used=sorted({e.source for e in kept}),
            checksum=checksum,
        )
        return payload
|
scroot/context/dedup.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Chunk deduplication for context assembly.
|
|
2
|
+
|
|
3
|
+
Near-identical chunks frequently appear when the same document is picked
|
|
4
|
+
up by multiple retrieval steps (raw retrieval + reranking, or two vector
|
|
5
|
+
stores indexing the same corpus). Scoring duplicate text wastes the
|
|
6
|
+
token budget and skews groundedness weighting, so build() merges them.
|
|
7
|
+
|
|
8
|
+
Similarity backend: cosine similarity over sentence-transformers
|
|
9
|
+
embeddings when available (the model is shared with the Auditor's cache),
|
|
10
|
+
falling back to ``difflib.SequenceMatcher`` ratio when
|
|
11
|
+
sentence-transformers is not installed. Both use the same threshold.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from .payload import ContextEntry
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _exact_key(text: str) -> str:
    """Build the comparison key used for exact-duplicate detection.

    The text is lower-cased and every run of whitespace collapses to a
    single space, with leading and trailing whitespace removed.
    """
    lowered = text.lower()
    words = lowered.split()
    return " ".join(words)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _similarity_matrix(texts: list[str], embedding_model: str, device: str):
    """Compute the pairwise cosine-similarity matrix for ``texts``.

    Texts are embedded with the shared embedding model, each vector is
    scaled to unit length (a small epsilon guards against zero norms),
    and the product of the scaled matrix with its transpose is returned.

    Returns:
        A square matrix of similarities, or None if anything fails
        (missing dependency, model load error, encoding error), which
        tells the caller to use the string-based fallback.
    """
    try:
        import numpy as np
        from ..models import get_embedding_model

        encoder = get_embedding_model(embedding_model, device=device)
        vectors = encoder.encode(texts, convert_to_numpy=True)
        lengths = np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-8
        unit_vectors = vectors / lengths
        return unit_vectors @ unit_vectors.T
    except Exception:
        # Best-effort backend: any failure means "embeddings unavailable".
        return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _fallback_similarity(a: str, b: str) -> float:
    """Return the ``difflib`` similarity ratio of two strings.

    Used when the embedding backend is unavailable.

    Args:
        a: First text.
        b: Second text.

    Returns:
        A ratio in [0.0, 1.0]; 1.0 means the strings are identical.
    """
    from difflib import SequenceMatcher

    # autojunk must be off: with the default heuristic, once ``b`` has
    # 200+ characters every frequently occurring character is ignored
    # when matching, so two long chunks that differ only by a short
    # prefix can score 0.0 and near-duplicates are never detected.
    # Exact matching is slower on very long inputs.
    return SequenceMatcher(None, a, b, autojunk=False).ratio()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def deduplicate(
    entries: list[ContextEntry],
    threshold: float = 0.92,
    embedding_model: str = "all-MiniLM-L6-v2",
    device: str = "cpu",
) -> list[ContextEntry]:
    """Drop near-duplicate entries; the earliest copy always survives.

    Callers should pass entries sorted by source weight (highest first)
    so that the surviving copy is the most authoritative one.

    Args:
        entries: Context entries to deduplicate.
        threshold: Similarity score (embedding cosine, or the difflib
            ratio when embeddings are unavailable) at or above which
            an entry counts as a duplicate of an earlier survivor.
            Default 0.92.
        embedding_model: Sentence-transformers model name used by the
            embedding backend.
        device: "cpu" or "cuda".

    Returns:
        A new list with duplicates removed, in the original order.
    """
    if len(entries) <= 1:
        return list(entries)

    # Stage 1 - exact matches once case and whitespace are normalised.
    # dict insertion order keeps the first entry seen for each key.
    first_by_key: dict[str, ContextEntry] = {}
    for candidate in entries:
        first_by_key.setdefault(_exact_key(candidate.content), candidate)
    distinct = list(first_by_key.values())

    if len(distinct) <= 1:
        return distinct

    # Stage 2 - near matches, compared only against earlier survivors.
    contents = [item.content for item in distinct]
    scores = _similarity_matrix(contents, embedding_model, device)

    def _score(left: int, right: int) -> float:
        if scores is not None:
            return float(scores[left][right])
        return _fallback_similarity(contents[left], contents[right])

    survivor_positions: list[int] = []
    for position, _ in enumerate(distinct):
        already_covered = any(
            _score(position, earlier) >= threshold
            for earlier in survivor_positions
        )
        if not already_covered:
            survivor_positions.append(position)
    return [distinct[position] for position in survivor_positions]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""ContextEntry and ContextPayload dataclasses.
|
|
2
|
+
|
|
3
|
+
ContextPayload is what auditor.score() receives. It stores the assembled
|
|
4
|
+
(scrubbed) text and the audit trail - never the raw pre-scrub additions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ContextEntry:
    """One unit of grounding material held by a ContextBuilder.

    Attributes:
        source: Label of where the content came from - 'retrieval',
            'reranker', 'system_prompt', 'tool_output', 'query', or a
            caller-chosen label.
        content: The stored text; PII is already replaced when the
            builder runs with pii_scrub=True.
        added_at: UTC time at which the entry was added.
        metadata: Metadata supplied by the caller. Kept for the audit
            trail only; it plays no part in scoring.
        source_weight: Authority of the source in the range 0.0-1.0;
            larger values count for more in groundedness.
        token_count: Number of tokens in ``content``.
        was_scrubbed: Whether PII was found and replaced in this entry.
        scrub_summary: Counts per entity type; never the original values.
    """

    # Required fields.
    source: str
    content: str
    added_at: datetime
    # Optional fields; mutable defaults use a factory so each instance
    # gets its own dict.
    metadata: dict = field(default_factory=dict)
    source_weight: float = 0.6
    token_count: int = 0
    was_scrubbed: bool = False
    scrub_summary: dict = field(default_factory=dict)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ContextPayload:
    """The assembled context produced by ContextBuilder.build().

    Hand this to ``auditor.score(context=...)``. Scoring consumes the
    payload: the assembled text is fed to the NLI scorer locally and
    then discarded. Only ``session_id`` and ``checksum`` are carried
    into downstream records, for audit-trail purposes.

    Attributes:
        session_id: Trace identifier of the ContextBuilder that built it.
        sources: The ContextEntry items that were kept, ordered from
            highest to lowest source weight.
        assembled_text: The final concatenated (scrubbed) grounding text.
        total_tokens: Sum of the token counts of the kept entries.
        was_truncated: Whether the max_tokens budget caused entries to
            be dropped.
        pii_scrubbed: Whether any kept entry had PII replaced.
        scrub_summary: Entity-type counts summed over the kept entries;
            contains no original text.
        built_at: UTC time at which build() was called.
        checksum: ``sha256:<hex>`` digest of ``assembled_text``, for
            integrity checks.
    """

    # All fields are required and positional order is part of the API.
    session_id: str
    sources: list[ContextEntry]
    assembled_text: str
    total_tokens: int
    was_truncated: bool
    pii_scrubbed: bool
    scrub_summary: dict
    built_at: datetime
    checksum: str
|