scroot 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67):
  1. scroot/__init__.py +109 -0
  2. scroot/agents.py +345 -0
  3. scroot/audit.py +131 -0
  4. scroot/cli/__init__.py +167 -0
  5. scroot/cli/download.py +49 -0
  6. scroot/cli/eval.py +230 -0
  7. scroot/cli/model_info.py +28 -0
  8. scroot/composite.py +170 -0
  9. scroot/config/__init__.py +0 -0
  10. scroot/config/corrector.py +92 -0
  11. scroot/connectors/__init__.py +5 -0
  12. scroot/connectors/database.py +357 -0
  13. scroot/context/__init__.py +9 -0
  14. scroot/context/adapters.py +86 -0
  15. scroot/context/builder.py +514 -0
  16. scroot/context/dedup.py +99 -0
  17. scroot/context/payload.py +66 -0
  18. scroot/context/pii.py +101 -0
  19. scroot/context/tokenizer.py +42 -0
  20. scroot/core.py +349 -0
  21. scroot/corrector/__init__.py +38 -0
  22. scroot/corrector/api.py +145 -0
  23. scroot/corrector/base.py +20 -0
  24. scroot/corrector/disabled.py +13 -0
  25. scroot/corrector/local.py +112 -0
  26. scroot/corrector/models.py +69 -0
  27. scroot/dashboard/__init__.py +0 -0
  28. scroot/dashboard/__main__.py +37 -0
  29. scroot/dashboard/routers/__init__.py +0 -0
  30. scroot/dashboard/routers/analytics.py +236 -0
  31. scroot/dashboard/routers/corrector.py +230 -0
  32. scroot/dashboard/routers/export.py +150 -0
  33. scroot/dashboard/routers/guardrails.py +41 -0
  34. scroot/dashboard/routers/pipeline.py +218 -0
  35. scroot/dashboard/routers/queue.py +188 -0
  36. scroot/dashboard/routers/records.py +252 -0
  37. scroot/dashboard/routers/settings.py +291 -0
  38. scroot/dashboard/security.py +135 -0
  39. scroot/dashboard/server.py +181 -0
  40. scroot/evidence.py +228 -0
  41. scroot/exceptions.py +62 -0
  42. scroot/feedback/__init__.py +6 -0
  43. scroot/feedback/injector.py +160 -0
  44. scroot/feedback/sanitizer.py +56 -0
  45. scroot/feedback/store.py +650 -0
  46. scroot/flags.py +42 -0
  47. scroot/metrics/__init__.py +15 -0
  48. scroot/metrics/_utils.py +9 -0
  49. scroot/metrics/completeness.py +139 -0
  50. scroot/metrics/confidence.py +83 -0
  51. scroot/metrics/consistency.py +125 -0
  52. scroot/metrics/groundedness.py +193 -0
  53. scroot/metrics/relevance.py +73 -0
  54. scroot/models.py +214 -0
  55. scroot/result.py +276 -0
  56. scroot/sampling.py +306 -0
  57. scroot/text_utils.py +136 -0
  58. scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
  59. scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
  60. scroot/ui/dist/favicon.svg +27 -0
  61. scroot/ui/dist/index.html +20 -0
  62. scroot-0.2.0.dist-info/METADATA +832 -0
  63. scroot-0.2.0.dist-info/RECORD +67 -0
  64. scroot-0.2.0.dist-info/WHEEL +5 -0
  65. scroot-0.2.0.dist-info/entry_points.txt +2 -0
  66. scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
  67. scroot-0.2.0.dist-info/top_level.txt +1 -0
scroot/__init__.py ADDED
@@ -0,0 +1,109 @@
1
+ """scroot - LLM-free response quality scoring."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .core import Auditor
6
+ from .result import EntailmentResult
7
+ from .agents import AgentRegistry, AgentConfig
8
+ from .sampling import SamplingResult, SamplingStrategy, sample_and_score
9
+ from .composite import RAG_WEIGHTS
10
+ from .context import ContextBuilder, ContextEntry, ContextPayload
11
+ from .evidence import EvidenceEntry, EvidenceMap, build_evidence_map
12
+ from .audit import configure_audit_log
13
+ from .exceptions import (
14
+ GroundednessComputationError,
15
+ GroundednessUnavailableWarning,
16
+ NoContextWarning,
17
+ )
18
+
19
# Package version string; audit.emit() imports it to stamp each audit record.
__version__ = "0.2.0"

# Public API of the package. "setup_nltk", "score" and "verify" are defined
# later in this module; every other name is imported at the top of the file.
__all__ = [
    "Auditor",
    "ContextBuilder",
    "ContextPayload",
    "ContextEntry",
    "EntailmentResult",
    "EvidenceEntry",
    "EvidenceMap",
    "build_evidence_map",
    "AgentRegistry",
    "AgentConfig",
    "SamplingResult",
    "SamplingStrategy",
    "sample_and_score",
    "RAG_WEIGHTS",
    "configure_audit_log",
    "setup_nltk",
    "score",
    "verify",
    "NoContextWarning",
    "GroundednessUnavailableWarning",
    "GroundednessComputationError",
]
43
+
44
+
45
def setup_nltk() -> None:
    """Fetch the NLTK ``punkt_tab`` tokenizer data used for sentence splitting.

    This is meant to be run once after installation as a deployment step;
    it is not invoked at runtime. With the data present, sentence splitting
    can be NLTK-backed, which is more accurate than the built-in regex
    fallback.

    Example:
        python -c "import scroot; scroot.setup_nltk()"
    """
    # Imported inside the function so that importing scroot does not by
    # itself import nltk.
    from nltk import download as fetch_resource

    fetch_resource("punkt_tab", quiet=False)
57
+
58
+
59
def score(
    query: str,
    response: str,
    context: "ContextPayload | str | list[str] | None" = None,
    **kwargs,
) -> EntailmentResult:
    """Score one LLM response with a throwaway, default-configured Auditor.

    A new ``Auditor`` is built on every call. When scoring repeatedly,
    create a single ``Auditor`` yourself and reuse it so models are not
    reloaded each time.

    Args:
        query: The user's query/question.
        response: The LLM-generated response.
        context: Grounding context - a ContextPayload from
            ContextBuilder.build(), a plain string, a list of source
            context strings, or None.
        **kwargs: Forwarded unchanged to the ``Auditor`` constructor.

    Returns:
        EntailmentResult with all metric scores and flags.
    """
    return Auditor(**kwargs).score(
        query=query,
        response=response,
        context=context,
    )
84
+
85
+
86
def verify(
    query: str,
    response: str,
    context: "ContextPayload | str | list[str] | None" = None,
    threshold: float = 0.7,
    **kwargs,
) -> bool:
    """Report whether a response's IQS reaches a minimum threshold.

    Delegates to :func:`score` and compares the resulting ``iqs`` value
    against ``threshold``.

    Args:
        query: The user's query/question.
        response: The LLM-generated response.
        context: Grounding context - ContextPayload, str, list[str], or None.
        threshold: Minimum IQS score to pass. Default 0.7.
        **kwargs: Forwarded through score() to the Auditor constructor.

    Returns:
        True if IQS >= threshold, False otherwise.
    """
    outcome = score(query=query, response=response, context=context, **kwargs)
    achieved_iqs = outcome.iqs
    return achieved_iqs >= threshold
scroot/agents.py ADDED
@@ -0,0 +1,345 @@
1
+ """AgentRegistry: per-agent configuration and statistics tracking.
2
+
3
+ Routes scoring calls through agent-specific configs (custom weights,
4
+ thresholds, context requirements) while sharing a single Auditor
5
+ instance and its loaded models. One process, one model load, per-agent
6
+ metrics.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import threading
12
+ import warnings
13
+ from dataclasses import dataclass, field
14
+
15
+ from .composite import DEFAULT_WEIGHTS, compute_iqs
16
+ from .result import EntailmentResult
17
+
18
+
19
@dataclass
class AgentConfig:
    """Per-agent scoring settings held by an AgentRegistry.

    Args:
        name: Unique agent identifier.
        weights: Custom IQS weights; keys that are absent are filled in
            from the defaults when the agent is scored.
        iqs_threshold: An IQS below this value is counted as
            below-threshold in the agent's statistics.
        context_required: When True, scoring without context produces a
            warning.
        metadata: Optional free-form dict (model name, team, description).
    """

    # Required: the agent's unique identifier.
    name: str
    # None means "no custom weights".
    weights: dict | None = None
    iqs_threshold: float = 0.7
    context_required: bool = False
    # default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
36
+
37
+
38
@dataclass
class AgentStats:
    """Running totals of scoring outcomes for one agent."""

    count: int = 0
    iqs_sum: float = 0.0
    # min starts at 1.0 and max at 0.0; to_dict() reports None for both
    # until at least one result has been recorded.
    iqs_min: float = 1.0
    iqs_max: float = 0.0
    flagged_count: int = 0
    below_threshold_count: int = 0
    flag_counts: dict = field(default_factory=dict)

    def record(self, result: EntailmentResult, threshold: float) -> None:
        """Fold one scoring result into the running totals.

        Args:
            result: EntailmentResult from a scoring call; its ``iqs`` and
                ``flags`` attributes are read.
            threshold: IQS value below which the result is counted as
                below-threshold.
        """
        iqs = result.iqs
        self.count += 1
        self.iqs_sum += iqs

        if iqs < self.iqs_min:
            self.iqs_min = iqs
        if iqs > self.iqs_max:
            self.iqs_max = iqs

        flags = result.flags
        if flags:
            # One flagged result, however many flags it carries.
            self.flagged_count += 1
            tally = self.flag_counts
            for flag in flags:
                tally[flag] = tally.get(flag, 0) + 1

        if iqs < threshold:
            self.below_threshold_count += 1

    def to_dict(self) -> dict:
        """Return the statistics as a plain dict for API or logging use."""
        has_data = self.count > 0
        # Divide by 1 when nothing has been recorded so rates come out as 0.0.
        denominator = self.count if has_data else 1

        summary = {"count": self.count}
        summary["mean_iqs"] = round(self.iqs_sum / denominator, 4)
        summary["min_iqs"] = self.iqs_min if has_data else None
        summary["max_iqs"] = self.iqs_max if has_data else None
        summary["flagged_count"] = self.flagged_count
        summary["flag_rate"] = round(self.flagged_count / denominator, 4)
        summary["below_threshold_count"] = self.below_threshold_count
        summary["below_threshold_rate"] = round(
            self.below_threshold_count / denominator, 4
        )
        # Copy so callers cannot mutate the live counters.
        summary["flag_counts"] = dict(self.flag_counts)
        return summary
82
+
83
+
84
+ class AgentRegistry:
85
+ """Per-agent configuration and statistics routing layer.
86
+
87
+ Wraps an Auditor instance. Each registered agent can have custom
88
+ IQS weights, thresholds, and metadata. Unregistered agents use
89
+ default config unless strict=True.
90
+
91
+ The registry is duck-type compatible with Auditor: ``score()`` can be
92
+ called with only ``query``, ``response``, and ``context`` kwargs (the
93
+ ``agent`` parameter defaults to ``"_default"``), so it works as a
94
+ drop-in for ``sample_and_score`` and ``DatabaseConnector``.
95
+
96
+ Args:
97
+ auditor: Auditor instance (shared across all agents).
98
+ strict: If True, scoring an unregistered agent raises ValueError.
99
+ default_iqs_threshold: Threshold for unregistered / default agents.
100
+ """
101
+
102
+ def __init__(
103
+ self,
104
+ auditor,
105
+ strict: bool = False,
106
+ default_iqs_threshold: float = 0.7,
107
+ ):
108
+ self._auditor = auditor
109
+ self._strict = strict
110
+ self._default_threshold = default_iqs_threshold
111
+ self._configs: dict[str, AgentConfig] = {}
112
+ self._stats: dict[str, AgentStats] = {}
113
+ self._lock = threading.Lock()
114
+
115
+ def register(
116
+ self,
117
+ name: str,
118
+ weights: dict | None = None,
119
+ iqs_threshold: float | None = None,
120
+ context_required: bool = False,
121
+ metadata: dict | None = None,
122
+ ) -> None:
123
+ """Register an agent with optional custom configuration.
124
+
125
+ Args:
126
+ name: Unique agent identifier string.
127
+ weights: Custom IQS weights dict. Partial dicts OK -
128
+ missing keys filled from DEFAULT_WEIGHTS.
129
+ iqs_threshold: Custom IQS threshold. Defaults to registry default.
130
+ context_required: If True, warn when score() is called without context.
131
+ metadata: Optional dict (model, team, description, etc).
132
+
133
+ Raises:
134
+ ValueError: If an agent with this name is already registered.
135
+ """
136
+ with self._lock:
137
+ if name in self._configs:
138
+ raise ValueError(
139
+ f"Agent {name!r} already registered. "
140
+ "Use update() to modify or unregister() first."
141
+ )
142
+ self._configs[name] = AgentConfig(
143
+ name=name,
144
+ weights=weights,
145
+ iqs_threshold=iqs_threshold if iqs_threshold is not None else self._default_threshold,
146
+ context_required=context_required,
147
+ metadata=metadata or {},
148
+ )
149
+ self._stats[name] = AgentStats()
150
+
151
+ def update(self, name: str, **kwargs) -> None:
152
+ """Update a registered agent's configuration.
153
+
154
+ Args:
155
+ name: Agent identifier.
156
+ **kwargs: AgentConfig fields to update (weights, iqs_threshold,
157
+ context_required, metadata).
158
+
159
+ Raises:
160
+ ValueError: If agent is not registered or field name is invalid.
161
+ """
162
+ with self._lock:
163
+ if name not in self._configs:
164
+ raise ValueError(f"Agent {name!r} not registered.")
165
+ config = self._configs[name]
166
+ for key, value in kwargs.items():
167
+ if hasattr(config, key):
168
+ setattr(config, key, value)
169
+ else:
170
+ raise ValueError(f"Unknown config field: {key!r}")
171
+
172
+ def unregister(self, name: str) -> None:
173
+ """Remove an agent and its accumulated stats.
174
+
175
+ Args:
176
+ name: Agent identifier to remove.
177
+
178
+ Raises:
179
+ ValueError: If agent is not registered.
180
+ """
181
+ with self._lock:
182
+ if name not in self._configs:
183
+ raise ValueError(f"Agent {name!r} not registered.")
184
+ del self._configs[name]
185
+ self._stats.pop(name, None)
186
+
187
+ def list_agents(self) -> list[str]:
188
+ """Return names of all registered agents."""
189
+ with self._lock:
190
+ return list(self._configs.keys())
191
+
192
+ def get_config(self, name: str) -> AgentConfig:
193
+ """Get an agent's configuration.
194
+
195
+ Args:
196
+ name: Agent identifier.
197
+
198
+ Returns:
199
+ AgentConfig for the named agent.
200
+
201
+ Raises:
202
+ ValueError: If agent is not registered.
203
+ """
204
+ with self._lock:
205
+ if name not in self._configs:
206
+ raise ValueError(f"Agent {name!r} not registered.")
207
+ return self._configs[name]
208
+
209
+ def score(
210
+ self,
211
+ agent: str = "_default",
212
+ *,
213
+ query: str,
214
+ response: str,
215
+ context: list[str] | None = None,
216
+ ) -> EntailmentResult:
217
+ """Score a response using agent-specific configuration.
218
+
219
+ The ``agent`` parameter defaults to ``"_default"``, making this
220
+ method duck-type compatible with ``Auditor.score()`` so that the
221
+ registry can be passed to ``sample_and_score()`` or
222
+ ``DatabaseConnector`` directly.
223
+
224
+ IQS is recomputed from the raw metric scores using the agent's
225
+ custom weights. The auditor's own weights attribute is never
226
+ mutated, so concurrent calls for different agents are safe.
227
+
228
+ Args:
229
+ agent: Agent identifier. Defaults to "_default".
230
+ query: User query.
231
+ response: LLM-generated response.
232
+ context: Optional source context list.
233
+
234
+ Returns:
235
+ EntailmentResult with agent-specific IQS and details["agent"] set.
236
+
237
+ Raises:
238
+ ValueError: If strict=True and agent is not registered.
239
+ """
240
+ with self._lock:
241
+ config = self._configs.get(agent)
242
+
243
+ if config is None:
244
+ if self._strict:
245
+ raise ValueError(
246
+ f"Agent {agent!r} not registered. "
247
+ "Call registry.register() first."
248
+ )
249
+ config = AgentConfig(name=agent, iqs_threshold=self._default_threshold)
250
+
251
+ if config.context_required and context is None:
252
+ warnings.warn(
253
+ f"Agent {agent!r} requires context but none was provided. "
254
+ "Groundedness will be skipped.",
255
+ stacklevel=2,
256
+ )
257
+
258
+ raw = self._auditor.score(query=query, response=response, context=context)
259
+
260
+ # Recompute IQS with agent-specific weights (no mutation of auditor state).
261
+ effective_weights = dict(DEFAULT_WEIGHTS)
262
+ if config.weights:
263
+ effective_weights.update(config.weights)
264
+
265
+ iqs = compute_iqs(
266
+ raw.groundedness, raw.completeness, raw.relevance,
267
+ raw.consistency, raw.confidence,
268
+ weights=effective_weights,
269
+ mode=self._auditor.iqs_mode,
270
+ )
271
+
272
+ result = EntailmentResult(
273
+ groundedness=raw.groundedness,
274
+ completeness=raw.completeness,
275
+ relevance=raw.relevance,
276
+ consistency=raw.consistency,
277
+ confidence=raw.confidence,
278
+ iqs=iqs,
279
+ flags=list(raw.flags),
280
+ details={
281
+ **raw.details,
282
+ "agent": agent,
283
+ "agent_config": {
284
+ "weights": effective_weights,
285
+ "iqs_threshold": config.iqs_threshold,
286
+ },
287
+ },
288
+ )
289
+
290
+ with self._lock:
291
+ if agent not in self._stats:
292
+ self._stats[agent] = AgentStats()
293
+ self._stats[agent].record(result, config.iqs_threshold)
294
+
295
+ return result
296
+
297
+ def score_batch(self, items: list[dict]) -> list[EntailmentResult]:
298
+ """Score a batch of responses, each routed to its agent config.
299
+
300
+ Items without an "agent" key are scored under "_default".
301
+
302
+ Args:
303
+ items: List of dicts with "agent", "query", "response",
304
+ and optionally "context".
305
+
306
+ Returns:
307
+ List of EntailmentResult, one per item, in order.
308
+ """
309
+ return [
310
+ self.score(
311
+ agent=item.get("agent", "_default"),
312
+ query=item["query"],
313
+ response=item["response"],
314
+ context=item.get("context"),
315
+ )
316
+ for item in items
317
+ ]
318
+
319
+ def get_stats(self, agent: str | None = None) -> dict:
320
+ """Get accumulated scoring statistics.
321
+
322
+ Args:
323
+ agent: If provided, return stats for this agent only.
324
+ Returns ``{}`` if agent has no stats yet.
325
+ If None, return stats for all agents.
326
+ """
327
+ with self._lock:
328
+ if agent is not None:
329
+ stats = self._stats.get(agent)
330
+ return stats.to_dict() if stats is not None else {}
331
+ return {name: s.to_dict() for name, s in self._stats.items()}
332
+
333
+ def reset_stats(self, agent: str | None = None) -> None:
334
+ """Reset accumulated statistics.
335
+
336
+ Args:
337
+ agent: If provided, reset only this agent. If None, reset all.
338
+ """
339
+ with self._lock:
340
+ if agent is not None:
341
+ if agent in self._stats:
342
+ self._stats[agent] = AgentStats()
343
+ else:
344
+ for name in self._stats:
345
+ self._stats[name] = AgentStats()
scroot/audit.py ADDED
@@ -0,0 +1,131 @@
1
+ """Structured, content-free audit logging (SOC II CC7).
2
+
3
+ Every ContextBuilder operation that touches content emits a structured
4
+ audit event - metadata only (entity-type counts, token counts, sources,
5
+ checksums), never the content itself.
6
+
7
+ Default destination is structured stderr (no file write in the OSS tier).
8
+ Enterprise deployments can route to a JSONL file with retention-based
9
+ rotation via :func:`configure_audit_log`.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import sys
17
+ import threading
18
+ from datetime import datetime, timezone, timedelta
19
+
20
# Module-wide lock. configure_audit_log(), _rotate() and emit() each acquire
# it; it is a plain (non-reentrant) threading.Lock.
_lock = threading.Lock()

# Current audit settings. Written by configure_audit_log(); emit() reads
# "destination" and "path" from it.
_config: dict = {
    "destination": "stderr",  # "stderr" | "file" | "disabled"
    "path": None,
    "retention_days": 90,
}
27
+
28
+
29
def configure_audit_log(
    destination: str = "stderr",
    path: str | None = None,
    retention_days: int = 90,
) -> None:
    """Choose the destination for scroot audit events.

    Args:
        destination: "stderr" (default - structured JSON lines on stderr),
            "file" (append to a JSONL file), or "disabled".
        path: JSONL file path, required when destination="file".
            ``~`` is expanded. Example: ``~/.scroot/audit.jsonl``.
        retention_days: For the file destination, events older than this
            many days are pruned when this function is called. Default 90.

    Raises:
        ValueError: If destination is unknown, or destination="file"
            without a path.
    """
    allowed = ("stderr", "file", "disabled")
    if destination not in allowed:
        raise ValueError(
            f"Unknown audit destination {destination!r}. "
            "Use 'stderr', 'file', or 'disabled'."
        )

    to_file = destination == "file"
    if to_file and not path:
        raise ValueError("destination='file' requires a path.")

    resolved = os.path.expanduser(path) if path else None

    with _lock:
        _config.update(
            destination=destination,
            path=resolved,
            retention_days=retention_days,
        )

    # _rotate() acquires _lock itself, so it runs after the lock is released.
    if to_file:
        _rotate(resolved, retention_days)
63
+
64
+
65
def _rotate(path: str, retention_days: int) -> None:
    """Drop events older than ``retention_days`` from a JSONL audit log.

    Each non-blank line is parsed as JSON and kept only if it is an object
    whose "timestamp" is a string that compares >= the cutoff. Timestamps
    are compared as ISO-8601 strings, matching the UTC ``isoformat()``
    values that emit() writes. Lines that are not valid JSON, are not JSON
    objects, or lack a string timestamp are dropped. The file is left
    untouched when there is nothing to drop.

    File-system and decoding failures are swallowed; this function is
    best-effort and is not meant to raise.

    Args:
        path: Path of the JSONL file. A falsy or non-existent path is a no-op.
        retention_days: Age limit in days, measured back from the current
            UTC time.
    """
    try:
        if not path or not os.path.exists(path):
            return
        cutoff = (
            datetime.now(timezone.utc) - timedelta(days=retention_days)
        ).isoformat()

        # Hold the lock across both the read and the rewrite. Reading
        # unlocked and truncating afterwards would discard any event that
        # emit() appended in between.
        with _lock:
            kept = []
            dropped = 0
            with open(path, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        dropped += 1
                        continue
                    # Guard the types: a non-object line or a non-string
                    # timestamp must be dropped, not raise.
                    stamp = event.get("timestamp") if isinstance(event, dict) else None
                    if isinstance(stamp, str) and stamp >= cutoff:
                        kept.append(line)
                    else:
                        dropped += 1

            if not dropped:
                # Nothing to prune: avoid truncating and rewriting the log.
                return

            with open(path, "w", encoding="utf-8") as f:
                f.writelines(f"{line}\n" for line in kept)
    except (OSError, UnicodeDecodeError, OverflowError):
        # OSError: file-system failure. UnicodeDecodeError: log is not valid
        # UTF-8. OverflowError: retention_days too large for date arithmetic.
        pass
91
+
92
+
93
def emit(event: str, **fields) -> None:
    """Write one audit event. Metadata only - callers must never pass content.

    The record is a single JSON line holding the event name, a UTC
    timestamp, the supplied fields and the scroot version. It is appended
    to the configured file when the destination is "file" and a path is
    set; otherwise it is printed to stderr. Nothing is written when the
    destination is "disabled".

    Failures are swallowed: audit logging must never crash the client's
    pipeline.

    Args:
        event: Event name, e.g. "context_entry_added", "context_built".
        **fields: JSON-serialisable metadata (counts, flags, checksums).
            Values json cannot encode natively are converted with ``str``.
    """
    # Read the settings under the lock, then release it before doing I/O.
    with _lock:
        destination = _config["destination"]
        path = _config["path"]
    if destination == "disabled":
        return

    from . import __version__
    record = {
        "event": event,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        **fields,
        "scroot_version": __version__,
    }
    try:
        line = json.dumps(record, ensure_ascii=False, default=str)
    except (TypeError, ValueError):
        return

    try:
        if destination == "file" and path:
            dir_name = os.path.dirname(os.path.abspath(path)) or "."
            os.makedirs(dir_name, exist_ok=True)
            # Serialise appends with _rotate()'s rewrite of the same file.
            with _lock:
                with open(path, "a", encoding="utf-8") as f:
                    f.write(line + "\n")
        else:
            print(line, file=sys.stderr)
    except (OSError, ValueError):
        # ValueError covers writing to a closed stream and encoding errors
        # (UnicodeEncodeError) on a stream that cannot represent the line;
        # both must be swallowed like OSError to honour the contract above.
        pass