scroot 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. scroot/__init__.py +109 -0
  2. scroot/agents.py +345 -0
  3. scroot/audit.py +131 -0
  4. scroot/cli/__init__.py +167 -0
  5. scroot/cli/download.py +49 -0
  6. scroot/cli/eval.py +230 -0
  7. scroot/cli/model_info.py +28 -0
  8. scroot/composite.py +170 -0
  9. scroot/config/__init__.py +0 -0
  10. scroot/config/corrector.py +92 -0
  11. scroot/connectors/__init__.py +5 -0
  12. scroot/connectors/database.py +357 -0
  13. scroot/context/__init__.py +9 -0
  14. scroot/context/adapters.py +86 -0
  15. scroot/context/builder.py +514 -0
  16. scroot/context/dedup.py +99 -0
  17. scroot/context/payload.py +66 -0
  18. scroot/context/pii.py +101 -0
  19. scroot/context/tokenizer.py +42 -0
  20. scroot/core.py +349 -0
  21. scroot/corrector/__init__.py +38 -0
  22. scroot/corrector/api.py +145 -0
  23. scroot/corrector/base.py +20 -0
  24. scroot/corrector/disabled.py +13 -0
  25. scroot/corrector/local.py +112 -0
  26. scroot/corrector/models.py +69 -0
  27. scroot/dashboard/__init__.py +0 -0
  28. scroot/dashboard/__main__.py +37 -0
  29. scroot/dashboard/routers/__init__.py +0 -0
  30. scroot/dashboard/routers/analytics.py +236 -0
  31. scroot/dashboard/routers/corrector.py +230 -0
  32. scroot/dashboard/routers/export.py +150 -0
  33. scroot/dashboard/routers/guardrails.py +41 -0
  34. scroot/dashboard/routers/pipeline.py +218 -0
  35. scroot/dashboard/routers/queue.py +188 -0
  36. scroot/dashboard/routers/records.py +252 -0
  37. scroot/dashboard/routers/settings.py +291 -0
  38. scroot/dashboard/security.py +135 -0
  39. scroot/dashboard/server.py +181 -0
  40. scroot/evidence.py +228 -0
  41. scroot/exceptions.py +62 -0
  42. scroot/feedback/__init__.py +6 -0
  43. scroot/feedback/injector.py +160 -0
  44. scroot/feedback/sanitizer.py +56 -0
  45. scroot/feedback/store.py +650 -0
  46. scroot/flags.py +42 -0
  47. scroot/metrics/__init__.py +15 -0
  48. scroot/metrics/_utils.py +9 -0
  49. scroot/metrics/completeness.py +139 -0
  50. scroot/metrics/confidence.py +83 -0
  51. scroot/metrics/consistency.py +125 -0
  52. scroot/metrics/groundedness.py +193 -0
  53. scroot/metrics/relevance.py +73 -0
  54. scroot/models.py +214 -0
  55. scroot/result.py +276 -0
  56. scroot/sampling.py +306 -0
  57. scroot/text_utils.py +136 -0
  58. scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
  59. scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
  60. scroot/ui/dist/favicon.svg +27 -0
  61. scroot/ui/dist/index.html +20 -0
  62. scroot-0.2.0.dist-info/METADATA +832 -0
  63. scroot-0.2.0.dist-info/RECORD +67 -0
  64. scroot-0.2.0.dist-info/WHEEL +5 -0
  65. scroot-0.2.0.dist-info/entry_points.txt +2 -0
  66. scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
  67. scroot-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,514 @@
1
+ """ContextBuilder - request-scoped context accumulator.
2
+
3
+ Carries grounding documents through a multi-step RAG or agentic pipeline
4
+ and delivers them to ``auditor.score()`` intact, without restructuring
5
+ the client's code.
6
+
7
+ SOC II posture: content is held in memory only, PII-scrubbed by default,
8
+ and never written to disk. Audit events are content-free. Only
9
+ floating-point scores cross tier boundaries.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import os
16
+ import uuid
17
+ import warnings
18
+ from datetime import datetime, timezone
19
+ from typing import Any
20
+
21
+ from .. import audit
22
+ from ..exceptions import (
23
+ ContextAssemblyWarning,
24
+ ContextEmptyWarning,
25
+ ContextSealedError,
26
+ ContextTooLargeWarning,
27
+ SecurityWarning,
28
+ )
29
+ from .adapters import extract_text
30
+ from .dedup import deduplicate
31
+ from .payload import ContextEntry, ContextPayload
32
+ from .pii import scrub
33
+ from .tokenizer import count_tokens
34
+
35
# Groundedness weight for each built-in source label. Higher values sort
# first when the context is assembled, so they are the first to claim
# the token budget.
_SOURCE_WEIGHTS: dict[str, float] = dict(
    reranker=1.0,
    retrieval=0.85,
    tool_output=0.70,
    system_prompt=0.50,
    query=0.30,
    custom=0.60,
)

# Input-size guards applied while context is being accumulated.
_MAX_CHUNK_CHARS = 50_000          # longest chunk kept before truncation
_MAX_CHUNKS_PER_CALL = 500         # chunks accepted from a single add_* call
_MAX_SESSION_ID_LEN = 128          # longest caller-supplied session_id
_MAX_METADATA_KEYS = 20            # keys allowed in one metadata dict
_MAX_METADATA_VALUE_CHARS = 1_000  # longest string value in metadata
49
+
50
+
51
class ContextBuilder:
    """Accumulates grounding context across a multi-step LLM pipeline.

    Create one per request, add grounding material as it becomes
    available at each pipeline step, and pass ``ctx.build()`` to
    ``auditor.score(context=...)`` at the end. The client's LLM call is
    never touched.

    Example:
        >>> import scroot
        >>> ctx = scroot.ContextBuilder()
        >>> ctx.add_query(user_query)
        >>> ctx.add_retrieved(retriever.search(user_query))
        >>> result = auditor.score(query, response, context=ctx.build())

    Args:
        session_id: Ties this context to a trace; an id of the form
            ``cb-<uuid4>`` is generated if omitted or empty.
            Max 128 chars.
        max_tokens: Token budget for the assembled context. build()
            skips entries that do not fit and emits
            ContextTooLargeWarning when that happens. Default 4096.
        pii_scrub: Run PII scrubbing on each addition before it is
            stored (default True). Disabling it while
            ``SCROOT_ENV=production`` emits a SecurityWarning.
        dedup: Deduplicate overlapping chunk content on build() with a
            0.92 similarity threshold (default True).
        encryption_key: Optional Fernet key. It is validated at
            construction time (requires the ``cryptography`` package)
            and stored on the instance; this class itself only holds
            content in memory and never writes it anywhere.

    Raises:
        ValueError: If ``session_id`` is longer than 128 chars.
        ImportError: If ``encryption_key`` is given but ``cryptography``
            is not installed.
    """

    def __init__(
        self,
        session_id: str | None = None,
        max_tokens: int = 4096,
        pii_scrub: bool = True,
        dedup: bool = True,
        encryption_key: bytes | None = None,
    ) -> None:
        if session_id is not None and len(session_id) > _MAX_SESSION_ID_LEN:
            raise ValueError(
                f"session_id exceeds {_MAX_SESSION_ID_LEN} chars."
            )
        if not pii_scrub and os.environ.get("SCROOT_ENV") == "production":
            warnings.warn(
                "pii_scrub=False with SCROOT_ENV=production. "
                "PII in context content will not be redacted.",
                SecurityWarning,
                stacklevel=2,
            )
        if encryption_key is not None:
            # cryptography is an optional extra, so import it lazily and
            # translate a missing install into an actionable message.
            try:
                from cryptography.fernet import Fernet
            except ImportError as exc:
                raise ImportError(
                    "encryption_key requires the cryptography package: "
                    "pip install 'scroot[security]'"
                ) from exc
            Fernet(encryption_key)  # validates the key format

        self._session_id = session_id or f"cb-{uuid.uuid4()}"
        self._max_tokens = max_tokens
        self._pii_scrub = pii_scrub
        self._dedup = dedup
        self._encryption_key = encryption_key
        self._entries: list[ContextEntry] = []
        self._sealed = False
        self._built_at: datetime | None = None

    @property
    def session_id(self) -> str:
        """The trace identifier for this builder."""
        return self._session_id

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _guard_sealed(self) -> None:
        """Raise ContextSealedError if build() has already sealed this builder."""
        if self._sealed:
            raise ContextSealedError(
                "ContextBuilder has been sealed by build(). "
                "Create a new ContextBuilder for each request."
            )

    @staticmethod
    def _validate_metadata(metadata: dict) -> None:
        """Enforce the metadata size limits.

        Only string values are length-checked; values of other types are
        accepted as-is.

        Raises:
            ValueError: If there are more than 20 keys, or a string value
                is longer than 1,000 chars.
        """
        if len(metadata) > _MAX_METADATA_KEYS:
            raise ValueError(
                f"metadata exceeds {_MAX_METADATA_KEYS} keys."
            )
        for key, value in metadata.items():
            if isinstance(value, str) and len(value) > _MAX_METADATA_VALUE_CHARS:
                raise ValueError(
                    f"metadata value for {key!r} exceeds "
                    f"{_MAX_METADATA_VALUE_CHARS} chars."
                )

    def _process_text(
        self, text: str, source: str, metadata: dict
    ) -> ContextEntry | None:
        """Turn one extracted text into a ContextEntry.

        Empty or whitespace-only text yields None. Over-long text is cut
        to 50,000 chars (with a ``[TRUNCATED]`` marker) before scrubbing.
        If scrubbing is enabled and the scrubber raises, the text is kept
        unscrubbed and a ContextAssemblyWarning is emitted.

        Args:
            text: Text extracted from a chunk.
            source: Source label; also selects the entry's weight
                (unknown labels get 0.60).
            metadata: Already-validated metadata for the entry.

        Returns:
            The new entry, or None when there is nothing to store.
        """
        if not text or not text.strip():
            return None

        if len(text) > _MAX_CHUNK_CHARS:
            text = text[:_MAX_CHUNK_CHARS] + " [TRUNCATED]"
            # stacklevel=4 points past _add_chunks and the public add_*
            # method at the caller's line.
            warnings.warn(
                f"Chunk from source '{source}' exceeded "
                f"{_MAX_CHUNK_CHARS:,} chars and was truncated.",
                ContextAssemblyWarning,
                stacklevel=4,
            )

        scrub_summary: dict = {}
        was_scrubbed = False
        if self._pii_scrub:
            try:
                result = scrub(text)
                text = result.scrubbed_text
                scrub_summary = result.summary
                was_scrubbed = result.was_scrubbed
            except Exception:
                # Deliberate best-effort: a scrubber failure must not
                # break the client's pipeline, so the text is kept and
                # the failure is surfaced as a warning.
                warnings.warn(
                    "PII scrubber failed; content passed through unscrubbed.",
                    ContextAssemblyWarning,
                    stacklevel=4,
                )

        return ContextEntry(
            source=source,
            content=text,
            added_at=datetime.now(timezone.utc),
            # Copy so entries never alias the caller's dict (or each
            # other): a later mutation by the caller would otherwise
            # change stored entries and bypass _validate_metadata().
            metadata=dict(metadata),
            source_weight=_SOURCE_WEIGHTS.get(source, 0.60),
            token_count=count_tokens(text),
            was_scrubbed=was_scrubbed,
            scrub_summary=scrub_summary,
        )

    def _add_chunks(
        self, chunks: Any, source: str, metadata: dict
    ) -> "ContextBuilder":
        """Shared implementation behind every public ``add_*`` method.

        Normalises ``chunks`` to a list, caps it at 500 items, extracts
        and processes the text of each item, stores the resulting
        entries, and emits one ``context_entry_added`` audit event when
        at least one entry was stored.

        Args:
            chunks: A single chunk or an iterable of chunks.
            source: Source label applied to every stored entry.
            metadata: Metadata applied to every stored entry.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
            ValueError: If ``metadata`` violates the size limits.
        """
        self._guard_sealed()
        self._validate_metadata(metadata)

        # Strings, byte strings and dicts are iterable, but each is one
        # chunk: iterating them would yield characters, ints or keys.
        if isinstance(chunks, (str, bytes, bytearray, dict)) or not hasattr(
            chunks, "__iter__"
        ):
            chunks = [chunks]

        chunks = list(chunks)
        if len(chunks) > _MAX_CHUNKS_PER_CALL:
            warnings.warn(
                f"Received {len(chunks)} chunks for source '{source}'; "
                f"only the first {_MAX_CHUNKS_PER_CALL} will be used.",
                ContextAssemblyWarning,
                stacklevel=3,
            )
            chunks = chunks[:_MAX_CHUNKS_PER_CALL]

        added: list[ContextEntry] = []
        for chunk in chunks:
            text = extract_text(chunk)
            if text is None:
                warnings.warn(
                    f"Could not extract text from chunk of type "
                    f"'{type(chunk).__name__}' in source '{source}'. Skipped.",
                    ContextAssemblyWarning,
                    stacklevel=3,
                )
                continue
            entry = self._process_text(text, source, metadata)
            if entry:
                self._entries.append(entry)
                added.append(entry)

        if added:
            # The audit event carries counts only, never content.
            scrub_totals: dict[str, int] = {}
            for entry in added:
                for k, v in entry.scrub_summary.items():
                    if v:
                        scrub_totals[k] = scrub_totals.get(k, 0) + v
            audit.emit(
                "context_entry_added",
                session_id=self._session_id,
                source=source,
                token_count=sum(e.token_count for e in added),
                chunk_count=len(added),
                pii_scrubbed=any(e.was_scrubbed for e in added),
                scrub_summary=scrub_totals,
            )
        return self

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def add_query(
        self, text: str, *, metadata: dict | None = None
    ) -> "ContextBuilder":
        """Record the user's query. Call first, before retrieval.

        Each call stores another timestamped ``"query"`` entry, which is
        useful for multi-turn conversations where the query evolves.

        Args:
            text: The user's query. Plain string only.
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
        """
        return self._add_chunks(text, "query", metadata or {})

    def add_retrieved(
        self,
        chunks: Any,
        *,
        source: str = "retrieval",
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record retrieved documents for groundedness scoring.

        Call this immediately after your retrieval step, before any
        reranking or LLM call. This is the most important method - it's
        what gives groundedness its signal.

        Args:
            chunks: Retrieved documents. See supported types below.
            source: Label for this retrieval source. Defaults to
                "retrieval". Use descriptive names for multi-source
                pipelines: "pinecone", "web_search", "internal_db".
                Labels without a built-in weight are weighted 0.60.
            metadata: Optional dict for additional context. Not used in
                scoring. Max 20 keys.

        Returns:
            self, for method chaining.

        Supported chunk types:
            - str: treated as a single chunk
            - list[str]: each string is a chunk
            - list[Document]: LangChain Documents (page_content extracted)
            - list[dict]: dicts with 'text', 'content', or 'page_content' key
            - QueryResult: ChromaDB result objects
            - list[ScoredVector]: Pinecone results (metadata['text'] extracted)

        Warns:
            ContextAssemblyWarning: If a chunk type is unrecognised
                (skipped, not raised - pipeline continues), or if more
                than 500 chunks are passed (excess dropped).

        Raises:
            ContextSealedError: If called after build().

        Example:
            >>> ctx = ContextBuilder()
            >>> ctx.add_query("What is the refund policy?")
            >>> docs = retriever.get_relevant_documents(query)
            >>> ctx.add_retrieved(docs)
            >>> result = auditor.score(query, response, context=ctx.build())
        """
        return self._add_chunks(chunks, source, metadata or {})

    def add_reranked(
        self,
        chunks: Any,
        *,
        source: str = "reranker",
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record post-reranking documents. Higher weight than raw retrieved.

        With the default ``source="reranker"`` these entries carry the
        highest source weight (1.0 versus 0.85 for "retrieval"). Same
        accepted types as :meth:`add_retrieved`.

        Args:
            chunks: Post-reranking documents.
            source: Source label, defaults to "reranker".
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
        """
        return self._add_chunks(chunks, source, metadata or {})

    def add_system_prompt(
        self, text: str, *, metadata: dict | None = None
    ) -> "ContextBuilder":
        """Record the system prompt used in the LLM call.

        Stored under the "system_prompt" label, which is weighted lower
        than retrieved chunks - it's instructions, not facts.

        Args:
            text: The system prompt text.
            metadata: Optional dict, audit-trail only. Max 20 keys.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
        """
        return self._add_chunks(text, "system_prompt", metadata or {})

    def add_tool_output(
        self,
        output: str | list[str],
        *,
        tool_name: str,
        metadata: dict | None = None,
    ) -> "ContextBuilder":
        """Record a tool call output (DB query result, API response, etc.).

        Args:
            output: Tool output text, or a list of output strings.
            tool_name: Name of the tool that produced the output. Stored
                in entry metadata under ``"tool_name"`` (overriding any
                caller-supplied key of that name).
            metadata: Optional dict, audit-trail only. Max 20 keys,
                counted after ``tool_name`` has been merged in.

        Returns:
            self, for method chaining.

        Raises:
            ContextSealedError: If called after build().
        """
        meta = {**(metadata or {}), "tool_name": tool_name}
        return self._add_chunks(output, "tool_output", meta)

    def snapshot(self) -> dict:
        """Return current state without building. For debugging/logging.

        Returns:
            Dict with session_id, sealed flag, source labels, entry and
            token counts, and whether PII scrubbing is enabled. Contains
            no content text.
        """
        return {
            "session_id": self._session_id,
            "sealed": self._sealed,
            "sources": [e.source for e in self._entries],
            "total_entries": len(self._entries),
            "total_tokens": sum(e.token_count for e in self._entries),
            "pii_scrub_enabled": self._pii_scrub,
        }

    def reset(self) -> "ContextBuilder":
        """Clear all entries and unseal. Prefer a new instance per request.

        Returns:
            self, for method chaining.
        """
        self._entries.clear()
        self._sealed = False
        self._built_at = None
        return self

    def build(self) -> ContextPayload | None:
        """Assemble all context into a ContextPayload for auditor.score().

        Seals the builder - no further additions after this call.

        Assembly steps: sort entries by source weight, highest first
        (the sort is stable, so equal weights keep insertion order);
        deduplicate near-identical chunks if dedup=True; then walk the
        entries in that order and keep each one that still fits in the
        remaining max_tokens budget. An entry that does not fit is
        skipped, and smaller entries after it may still be kept.
        ``total_tokens`` is the sum of the kept entries' token counts;
        the separators between entries are not counted.

        Returns:
            ContextPayload, or None if nothing was added.

        Warns:
            ContextEmptyWarning: If no content was added.
            ContextTooLargeWarning: If max_tokens forced truncation.
        """
        self._sealed = True
        self._built_at = datetime.now(timezone.utc)

        if not self._entries:
            warnings.warn(
                "ContextBuilder.build() called with no content. "
                "Groundedness will be None. "
                "Call add_retrieved() before build() for full scoring.",
                ContextEmptyWarning,
                stacklevel=2,
            )
            return None

        sorted_entries = sorted(
            self._entries, key=lambda e: e.source_weight, reverse=True
        )

        if self._dedup:
            sorted_entries = deduplicate(sorted_entries, threshold=0.92)

        kept: list[ContextEntry] = []
        budget = self._max_tokens
        was_truncated = False
        for entry in sorted_entries:
            if entry.token_count <= budget:
                kept.append(entry)
                budget -= entry.token_count
            else:
                was_truncated = True

        if was_truncated:
            warnings.warn(
                f"Context exceeded max_tokens={self._max_tokens}. "
                "Lower-priority sources were dropped. "
                "Increase max_tokens if groundedness scores seem low.",
                ContextTooLargeWarning,
                stacklevel=2,
            )

        assembled = "\n\n---\n\n".join(e.content for e in kept)

        checksum = "sha256:" + hashlib.sha256(
            assembled.encode("utf-8")
        ).hexdigest()

        scrub_summary: dict[str, int] = {}
        pii_scrubbed = False
        for entry in kept:
            if entry.was_scrubbed:
                pii_scrubbed = True
            for k, v in entry.scrub_summary.items():
                scrub_summary[k] = scrub_summary.get(k, 0) + v

        payload = ContextPayload(
            session_id=self._session_id,
            sources=kept,
            assembled_text=assembled,
            total_tokens=sum(e.token_count for e in kept),
            was_truncated=was_truncated,
            pii_scrubbed=pii_scrubbed,
            scrub_summary=scrub_summary,
            built_at=self._built_at,
            checksum=checksum,
        )

        # The audit event carries sizes, labels and the checksum only.
        audit.emit(
            "context_built",
            session_id=self._session_id,
            total_tokens=payload.total_tokens,
            was_truncated=was_truncated,
            sources_used=sorted({e.source for e in kept}),
            checksum=checksum,
        )
        return payload
@@ -0,0 +1,99 @@
1
+ """Chunk deduplication for context assembly.
2
+
3
+ Near-identical chunks frequently appear when the same document is picked
4
+ up by multiple retrieval steps (raw retrieval + reranking, or two vector
5
+ stores indexing the same corpus). Scoring duplicate text wastes the
6
+ token budget and skews groundedness weighting, so build() merges them.
7
+
8
+ Similarity backend: cosine similarity over sentence-transformers
9
+ embeddings when available (the model is shared with the Auditor's cache),
10
+ falling back to ``difflib.SequenceMatcher`` ratio when
11
+ sentence-transformers is not installed. Both use the same threshold.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from .payload import ContextEntry
17
+
18
+
19
def _exact_key(text: str) -> str:
    """Return ``text`` lower-cased with every whitespace run collapsed to one space.

    Two texts with the same key differ only in letter case and
    whitespace, so the key is used to detect exact duplicates.
    """
    lowered = text.lower()
    words = lowered.split()  # also drops leading/trailing whitespace
    return " ".join(words)
21
+
22
+
23
def _similarity_matrix(texts: list[str], embedding_model: str, device: str):
    """Pairwise cosine similarity via embeddings, or None if unavailable.

    Args:
        texts: Texts to embed; row/column ``i`` of the result refers to
            ``texts[i]``.
        embedding_model: Model name passed to ``get_embedding_model``.
        device: Device string passed to ``get_embedding_model``.

    Returns:
        The ``len(texts) x len(texts)`` similarity matrix, or None if
        anything in the embedding path raises (missing dependency, model
        load failure, encode failure, ...), so the caller can fall back
        to another similarity measure.
    """
    try:
        import numpy as np

        from ..models import get_embedding_model

        encoder = get_embedding_model(embedding_model, device=device)
        vectors = encoder.encode(texts, convert_to_numpy=True)
        # The epsilon keeps an all-zero embedding from dividing by zero.
        lengths = np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-8
        unit_vectors = vectors / lengths
        similarities = unit_vectors @ unit_vectors.T
    except Exception:
        return None
    return similarities
35
+
36
+
37
def _fallback_similarity(a: str, b: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of ``a`` and ``b``.

    Used when no embedding-based similarity matrix is available. The
    ratio lies in [0.0, 1.0]; 1.0 means the strings are identical.
    """
    import difflib

    matcher = difflib.SequenceMatcher(None, a, b)
    return matcher.ratio()
40
+
41
+
42
def deduplicate(
    entries: list[ContextEntry],
    threshold: float = 0.92,
    embedding_model: str = "all-MiniLM-L6-v2",
    device: str = "cpu",
) -> list[ContextEntry]:
    """Drop duplicate and near-duplicate entries, first occurrence wins.

    Pass entries sorted by source weight, highest first, so that the
    most authoritative copy of any repeated content is the one kept.

    The work is done in two passes. First, entries whose content is
    identical after case/whitespace normalisation are collapsed. Then
    each remaining entry is compared against the entries already kept,
    using the embedding similarity matrix when it is available and the
    ``difflib`` ratio otherwise, and is dropped if any comparison
    reaches ``threshold``.

    Args:
        entries: Context entries to deduplicate.
        threshold: Similarity at or above which two entries count as
            duplicates. Default 0.92.
        embedding_model: Model name for the embedding backend.
        device: Device for the embedding backend, e.g. "cpu" or "cuda".

    Returns:
        A new list of the surviving entries in their original order.
    """
    if len(entries) < 2:
        return list(entries)

    # Pass 1: exact duplicates. A dict keeps insertion order and
    # setdefault() never overwrites, so the first entry per key survives.
    first_by_key: dict[str, ContextEntry] = {}
    for candidate in entries:
        first_by_key.setdefault(_exact_key(candidate.content), candidate)
    survivors = list(first_by_key.values())

    if len(survivors) < 2:
        return survivors

    # Pass 2: near-duplicates by similarity.
    contents = [item.content for item in survivors]
    similarities = _similarity_matrix(contents, embedding_model, device)

    def _score(row: int, col: int) -> float:
        if similarities is None:
            return _fallback_similarity(contents[row], contents[col])
        return float(similarities[row][col])

    retained: list[int] = []
    for position in range(len(survivors)):
        # any() stops at the first match, so no further pairs are scored
        # once a duplicate has been found.
        if any(_score(position, earlier) >= threshold for earlier in retained):
            continue
        retained.append(position)
    return [survivors[position] for position in retained]
@@ -0,0 +1,66 @@
1
+ """ContextEntry and ContextPayload dataclasses.
2
+
3
+ ContextPayload is what auditor.score() receives. It stores the assembled
4
+ (scrubbed) text and the audit trail - never the raw pre-scrub additions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+
12
+
13
@dataclass
class ContextEntry:
    """One piece of grounding material held by a ContextBuilder.

    Only ``source``, ``content`` and ``added_at`` are required; every
    other field has a default.
    """

    # Source label: 'retrieval', 'reranker', 'system_prompt',
    # 'tool_output', 'query', or a custom label.
    source: str
    # Stored text (PII already replaced when pii_scrub=True).
    content: str
    # UTC timestamp of the addition.
    added_at: datetime
    # Caller-supplied metadata. Audit-trail only, not scored.
    metadata: dict = field(default_factory=dict)
    # 0.0-1.0; higher means more authoritative for groundedness.
    source_weight: float = 0.6
    # Token count of ``content``.
    token_count: int = 0
    # True if PII was detected and replaced in this entry.
    was_scrubbed: bool = False
    # Entity-type counts only - no original values.
    scrub_summary: dict = field(default_factory=dict)
36
+
37
+
38
@dataclass
class ContextPayload:
    """The assembled context produced by ``ContextBuilder.build()``.

    Pass it to ``auditor.score(context=...)``. All fields are required;
    the builder fills every one of them when it assembles the payload.
    """

    # Trace identifier from the originating ContextBuilder.
    session_id: str
    # The kept ContextEntry items, highest-weight first.
    sources: list[ContextEntry]
    # Final concatenated grounding string (scrubbed).
    assembled_text: str
    # Token count of the kept entries.
    total_tokens: int
    # True if the max_tokens budget dropped entries.
    was_truncated: bool
    # True if any kept entry had PII replaced.
    pii_scrubbed: bool
    # Aggregated entity-type counts (no original text).
    scrub_summary: dict
    # UTC timestamp when build() was called.
    built_at: datetime
    # ``sha256:<hex>`` of assembled_text, for integrity checks.
    checksum: str