scroot 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. scroot/__init__.py +109 -0
  2. scroot/agents.py +345 -0
  3. scroot/audit.py +131 -0
  4. scroot/cli/__init__.py +167 -0
  5. scroot/cli/download.py +49 -0
  6. scroot/cli/eval.py +230 -0
  7. scroot/cli/model_info.py +28 -0
  8. scroot/composite.py +170 -0
  9. scroot/config/__init__.py +0 -0
  10. scroot/config/corrector.py +92 -0
  11. scroot/connectors/__init__.py +5 -0
  12. scroot/connectors/database.py +357 -0
  13. scroot/context/__init__.py +9 -0
  14. scroot/context/adapters.py +86 -0
  15. scroot/context/builder.py +514 -0
  16. scroot/context/dedup.py +99 -0
  17. scroot/context/payload.py +66 -0
  18. scroot/context/pii.py +101 -0
  19. scroot/context/tokenizer.py +42 -0
  20. scroot/core.py +349 -0
  21. scroot/corrector/__init__.py +38 -0
  22. scroot/corrector/api.py +145 -0
  23. scroot/corrector/base.py +20 -0
  24. scroot/corrector/disabled.py +13 -0
  25. scroot/corrector/local.py +112 -0
  26. scroot/corrector/models.py +69 -0
  27. scroot/dashboard/__init__.py +0 -0
  28. scroot/dashboard/__main__.py +37 -0
  29. scroot/dashboard/routers/__init__.py +0 -0
  30. scroot/dashboard/routers/analytics.py +236 -0
  31. scroot/dashboard/routers/corrector.py +230 -0
  32. scroot/dashboard/routers/export.py +150 -0
  33. scroot/dashboard/routers/guardrails.py +41 -0
  34. scroot/dashboard/routers/pipeline.py +218 -0
  35. scroot/dashboard/routers/queue.py +188 -0
  36. scroot/dashboard/routers/records.py +252 -0
  37. scroot/dashboard/routers/settings.py +291 -0
  38. scroot/dashboard/security.py +135 -0
  39. scroot/dashboard/server.py +181 -0
  40. scroot/evidence.py +228 -0
  41. scroot/exceptions.py +62 -0
  42. scroot/feedback/__init__.py +6 -0
  43. scroot/feedback/injector.py +160 -0
  44. scroot/feedback/sanitizer.py +56 -0
  45. scroot/feedback/store.py +650 -0
  46. scroot/flags.py +42 -0
  47. scroot/metrics/__init__.py +15 -0
  48. scroot/metrics/_utils.py +9 -0
  49. scroot/metrics/completeness.py +139 -0
  50. scroot/metrics/confidence.py +83 -0
  51. scroot/metrics/consistency.py +125 -0
  52. scroot/metrics/groundedness.py +193 -0
  53. scroot/metrics/relevance.py +73 -0
  54. scroot/models.py +214 -0
  55. scroot/result.py +276 -0
  56. scroot/sampling.py +306 -0
  57. scroot/text_utils.py +136 -0
  58. scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
  59. scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
  60. scroot/ui/dist/favicon.svg +27 -0
  61. scroot/ui/dist/index.html +20 -0
  62. scroot-0.2.0.dist-info/METADATA +832 -0
  63. scroot-0.2.0.dist-info/RECORD +67 -0
  64. scroot-0.2.0.dist-info/WHEEL +5 -0
  65. scroot-0.2.0.dist-info/entry_points.txt +2 -0
  66. scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
  67. scroot-0.2.0.dist-info/top_level.txt +1 -0
scroot/cli/__init__.py ADDED
@@ -0,0 +1,167 @@
1
+ """Scroot CLI - `scroot serve` starts the review dashboard."""
2
+ from __future__ import annotations
3
+
4
# Only the typer import is guarded. Keeping the command definitions out of the
# ``try`` body means an unrelated ImportError is never misreported as "typer is
# not installed".
try:
    import typer
except ImportError:
    # typer not installed - provide a minimal fallback
    import sys

    class _FakeCLI:
        """Placeholder for ``typer.Typer`` used when typer is unavailable.

        ``command()`` returns a pass-through decorator, and calling the
        object prints an install hint and exits with status 1.
        """

        def command(self, *a, **kw):
            def dec(fn):
                return fn
            return dec

        def __call__(self):
            print("scroot: install typer for CLI support: pip install typer")
            sys.exit(1)

    app = _FakeCLI()
else:
    app = typer.Typer(name="scroot", help="Scroot LLM response quality tools.")

    # NOTE: the command docstrings below are kept verbatim because they double
    # as user-facing help text; implementation notes go in ``#`` comments.

    @app.command("download-model")
    def download_model_cmd(
        model: str = typer.Option("phi4-mini", help="Model ID to download (phi4-mini, smollm3)"),
    ):
        """Download a local LLM model for offline correction."""
        # Imported lazily so the CLI module itself stays cheap to import.
        from scroot.cli.download import download_model
        try:
            download_model(model)
        except Exception as e:
            # Errors go to stderr so stdout stays clean for piping.
            typer.echo(f"ERROR: {e}", err=True)
            raise typer.Exit(1)

    @app.command("model-info")
    def model_info_cmd():
        """List available local LLM models and their download status."""
        from scroot.cli.model_info import print_model_info
        print_model_info()

    @app.command()
    def score(
        query: str = typer.Option(..., "--query", "-q", help="The user's query/question."),
        response: str = typer.Option(..., "--response", "-r", help="The LLM-generated response."),
        context: list[str] = typer.Option(
            None, "--context", "-c", help="Grounding context chunk. Repeat for multiple chunks."
        ),
        json_output: bool = typer.Option(
            False, "--json", help="Print the full result as JSON instead of a summary."
        ),
    ):
        """Score a single query/response pair and print the result."""
        import json as json_module
        from scroot import score as score_fn

        # An empty/omitted --context is passed as None (no grounding context).
        result = score_fn(query=query, response=response, context=context or None)

        if json_output:
            typer.echo(json_module.dumps(result.to_dict(), indent=2, default=str))
            return

        typer.echo(f"IQS: {result.iqs:.2f}")
        typer.echo(
            f"Groundedness: {result.groundedness:.2f}"
            if result.groundedness is not None
            else "Groundedness: n/a (no context provided)"
        )
        typer.echo(f"Completeness: {result.completeness:.2f}")
        typer.echo(f"Relevance: {result.relevance:.2f}")
        typer.echo(f"Consistency: {result.consistency:.2f}")
        typer.echo(f"Confidence: {result.confidence:.2f}")
        typer.echo(f"Flags: {result.flags}")

    @app.command("eval")
    def eval_cmd(
        suite: str = typer.Option(..., "--suite", "-s", help="Path to a YAML eval suite."),
        fail_below: float = typer.Option(
            None, "--fail-below", help="Override the suite's fail_below_iqs threshold."
        ),
        json_output: bool = typer.Option(
            False, "--json", help="Print a machine-readable JSON summary instead of text."
        ),
        output: str = typer.Option(
            None, "--output", help="Write a JUnit XML report to this path (for CI)."
        ),
    ):
        """Run a YAML-defined quality regression suite (CI/CD quality gate)."""
        import json as json_module
        from scroot.cli.eval import format_junit_xml, format_report, load_suite, run_suite

        try:
            suite_obj = load_suite(suite)
        except Exception as e:
            # stderr, so a failed load never corrupts --json output on stdout.
            typer.echo(f"ERROR: {e}", err=True)
            raise typer.Exit(1)

        result = run_suite(suite_obj, fail_below=fail_below)

        # The JUnit report is written in addition to (not instead of) the
        # stdout report below.
        if output:
            with open(output, "w", encoding="utf-8") as f:
                f.write(format_junit_xml(suite_obj, result))

        if json_output:
            typer.echo(json_module.dumps({
                "name": suite_obj.name,
                "passed": result.passed_count,
                "failed": result.failed_count,
                "avg_iqs": result.avg_iqs,
                "results": [
                    {
                        "query": r.example.query,
                        "iqs": r.iqs,
                        "passed": r.passed,
                        "gate_reason": r.gate_reason,
                        "tags": r.example.tags,
                    }
                    for r in result.results
                ],
            }, indent=2))
        else:
            typer.echo(format_report(suite_obj, result))

        # Non-zero exit status is what makes this usable as a CI gate.
        if result.failed_count:
            raise typer.Exit(1)

    @app.command()
    def serve(
        port: int = typer.Option(7432, help="Port to listen on"),
        store: str = typer.Option("./scroot_store.jsonl", help="JSONL feedback store path"),
        host: str = typer.Option("127.0.0.1", help="Host to bind to"),
        token: str = typer.Option(
            None, "--token",
            help="Require this shared token on all /api routes (for network "
            "binds). Falls back to SCROOT_DASHBOARD_TOKEN.",
        ),
        hosted: bool = typer.Option(False, hidden=True),
    ):
        """Start the Scroot Review Console at http://localhost:7432

        The dashboard has no per-user login. The default 127.0.0.1 bind is
        single-user safe. If you bind to a routable host (e.g. --host 0.0.0.0),
        set --token (or SCROOT_DASHBOARD_TOKEN) and/or front it with an
        authenticating reverse proxy - otherwise the correction store and the
        stored LLM API key are reachable by anyone on the network.
        """
        try:
            import uvicorn
        except ImportError:
            typer.echo("ERROR: Install dashboard deps: pip install 'scroot[dashboard]'", err=True)
            raise typer.Exit(1)

        from scroot.dashboard.security import is_loopback_host, resolve_dashboard_token
        from scroot.dashboard.server import create_app

        fa_app = create_app(store_path=store, hosted=hosted, host=host, auth_token=token)

        typer.echo("\n * SCROOT Review Console")
        typer.echo(f" Store : {store}")
        typer.echo(f" URL : http://{host}:{port}")
        if resolve_dashboard_token(token) is not None:
            typer.echo(" Auth : token required (Authorization: Bearer / X-Scroot-Token)")
        elif not is_loopback_host(host):
            typer.echo(
                " WARNING: non-loopback bind with NO auth - the store and "
                "stored API key are exposed to the network. Set --token."
            )
        typer.echo("")
        uvicorn.run(fa_app, host=host, port=port, log_level="info")
scroot/cli/download.py ADDED
@@ -0,0 +1,49 @@
1
+ """scroot download-model command."""
2
+ from __future__ import annotations
3
+
4
+ from scroot.corrector.models import (
5
+ DEFAULT_MODEL_ID,
6
+ MODEL_REGISTRY,
7
+ get_model_dir,
8
+ get_model_path,
9
+ is_model_downloaded,
10
+ )
11
+
12
+
13
def download_model(model_id: str = DEFAULT_MODEL_ID) -> None:
    """Download the model file for ``model_id`` from the Hugging Face Hub.

    Progress and the final location are printed to stdout. If the model is
    already present, a message is printed and nothing is downloaded.

    Args:
        model_id: Key into ``MODEL_REGISTRY``.

    Raises:
        ValueError: If ``model_id`` is not a registered model.
        RuntimeError: If ``huggingface-hub`` is not installed.
    """
    if model_id not in MODEL_REGISTRY:
        ids = ", ".join(MODEL_REGISTRY)
        raise ValueError(f"Unknown model '{model_id}'. Available: {ids}")

    spec = MODEL_REGISTRY[model_id]

    if is_model_downloaded(model_id):
        print(f"{spec.name} is already downloaded at {get_model_path(model_id)}")
        return

    try:
        from huggingface_hub import hf_hub_download
    except ImportError as exc:
        # Chain explicitly so the original import failure stays visible.
        raise RuntimeError(
            "huggingface-hub is not installed. "
            "Run: pip install 'scroot[local]'"
        ) from exc

    dest = get_model_dir() / model_id
    dest.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {spec.name} ({spec.size_gb} GB)...")
    print(f"Source : {spec.hf_repo}")
    print(f"Destination : {dest}")
    print()

    # ``resume_download`` is intentionally not passed: it is deprecated in
    # huggingface_hub (interrupted downloads resume automatically), and
    # passing it either warns or fails depending on the installed release.
    # ``token=False`` keeps the request anonymous.
    hf_hub_download(
        repo_id=spec.hf_repo,
        filename=spec.hf_filename,
        local_dir=str(dest),
        token=False,
    )

    print(f"\nOK {spec.name} ready at {dest / spec.hf_filename}")
    print("Run `scroot serve` to start the dashboard.")
scroot/cli/eval.py ADDED
@@ -0,0 +1,230 @@
1
+ """`scroot eval` - run a YAML-defined quality regression suite.
2
+
3
+ Loads a suite of (query, response, context) examples with expected IQS/
4
+ groundedness floors, scores each with scroot, and reports pass/fail using
5
+ EntailmentResult.passes_gate() / gate_reason(). Intended as a CI/CD quality
6
+ gate - exits non-zero if any example fails its gate.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from xml.etree import ElementTree as ET
13
+
14
+ from scroot import score
15
+
16
+
17
+ @dataclass
18
+ class EvalExample:
19
+ """A single (query, response, context) case in an eval suite."""
20
+
21
+ query: str
22
+ response: str
23
+ context: "str | list[str] | None" = None
24
+ expected_iqs_min: "float | None" = None
25
+ tags: list[str] = field(default_factory=list)
26
+
27
+
28
+ @dataclass
29
+ class EvalSuite:
30
+ """A YAML-defined collection of EvalExamples with default gate thresholds."""
31
+
32
+ name: str
33
+ examples: list[EvalExample]
34
+ fail_below_iqs: "float | None" = None
35
+ fail_below_groundedness: "float | None" = None
36
+
37
+
38
+ @dataclass
39
+ class ExampleResult:
40
+ """Outcome of scoring a single EvalExample."""
41
+
42
+ example: EvalExample
43
+ iqs: float
44
+ passed: bool
45
+ gate_reason: "str | None"
46
+
47
+
48
+ @dataclass
49
+ class EvalRunResult:
50
+ """Aggregate outcome of running an EvalSuite."""
51
+
52
+ results: list[ExampleResult]
53
+
54
+ @property
55
+ def passed_count(self) -> int:
56
+ return sum(1 for r in self.results if r.passed)
57
+
58
+ @property
59
+ def failed_count(self) -> int:
60
+ return sum(1 for r in self.results if not r.passed)
61
+
62
+ @property
63
+ def avg_iqs(self) -> float:
64
+ if not self.results:
65
+ return 0.0
66
+ return sum(r.iqs for r in self.results) / len(self.results)
67
+
68
+
69
+ def _import_yaml():
70
+ try:
71
+ import yaml
72
+ except ImportError as exc:
73
+ raise RuntimeError(
74
+ "pyyaml is required for `scroot eval`. "
75
+ "Install it with: pip install pyyaml"
76
+ ) from exc
77
+ return yaml
78
+
79
+
80
+ def load_suite(path: str) -> EvalSuite:
81
+ """Load an EvalSuite from a YAML file.
82
+
83
+ Expected shape:
84
+
85
+ name: Support regression suite
86
+ fail_below_iqs: 0.70
87
+ fail_below_groundedness: 0.80
88
+ examples:
89
+ - query: "..."
90
+ response: "..."
91
+ context: "..." # or a list of strings
92
+ expected_iqs_min: 0.75 # optional, overrides fail_below_iqs
93
+ tags: [billing]
94
+
95
+ Args:
96
+ path: Path to the YAML suite file.
97
+
98
+ Returns:
99
+ Parsed EvalSuite.
100
+
101
+ Raises:
102
+ RuntimeError: If pyyaml is not installed.
103
+ OSError: If the file cannot be read.
104
+ ValueError: If the YAML is malformed or missing required fields.
105
+ """
106
+ yaml = _import_yaml()
107
+
108
+ with open(path, encoding="utf-8") as f:
109
+ try:
110
+ data = yaml.safe_load(f)
111
+ except yaml.YAMLError as exc:
112
+ raise ValueError(f"Invalid YAML in {path}: {exc}") from exc
113
+
114
+ if not isinstance(data, dict):
115
+ raise ValueError(f"{path}: top-level YAML must be a mapping")
116
+
117
+ raw_examples = data.get("examples") or []
118
+ if not isinstance(raw_examples, list):
119
+ raise ValueError(f"{path}: 'examples' must be a list")
120
+
121
+ examples = []
122
+ for i, raw in enumerate(raw_examples):
123
+ if "query" not in raw or "response" not in raw:
124
+ raise ValueError(f"{path}: examples[{i}] is missing 'query' or 'response'")
125
+ examples.append(EvalExample(
126
+ query=raw["query"],
127
+ response=raw["response"],
128
+ context=raw.get("context"),
129
+ expected_iqs_min=raw.get("expected_iqs_min"),
130
+ tags=raw.get("tags") or [],
131
+ ))
132
+
133
+ return EvalSuite(
134
+ name=data.get("name", path),
135
+ examples=examples,
136
+ fail_below_iqs=data.get("fail_below_iqs"),
137
+ fail_below_groundedness=data.get("fail_below_groundedness"),
138
+ )
139
+
140
+
141
+ def run_suite(suite: EvalSuite, fail_below: "float | None" = None) -> EvalRunResult:
142
+ """Score every example in a suite and evaluate its quality gate.
143
+
144
+ Args:
145
+ suite: The eval suite to run.
146
+ fail_below: Optional CLI override for the IQS gate threshold,
147
+ applied to examples that don't set their own
148
+ ``expected_iqs_min``.
149
+
150
+ Returns:
151
+ EvalRunResult with per-example outcomes and aggregate stats.
152
+ """
153
+ results = []
154
+ for example in suite.examples:
155
+ result = score(query=example.query, response=example.response, context=example.context)
156
+ threshold = (
157
+ example.expected_iqs_min
158
+ if example.expected_iqs_min is not None
159
+ else fail_below
160
+ if fail_below is not None
161
+ else suite.fail_below_iqs
162
+ if suite.fail_below_iqs is not None
163
+ else 0.70
164
+ )
165
+ reason = result.gate_reason(
166
+ threshold=threshold,
167
+ require_groundedness=suite.fail_below_groundedness,
168
+ )
169
+ results.append(ExampleResult(
170
+ example=example,
171
+ iqs=result.iqs,
172
+ passed=reason is None,
173
+ gate_reason=reason,
174
+ ))
175
+
176
+ return EvalRunResult(results=results)
177
+
178
+
179
+ def format_report(suite: EvalSuite, run_result: EvalRunResult) -> str:
180
+ """Format a plain-text report of an eval run for CLI output."""
181
+ lines = [f"Eval suite: {suite.name}", ""]
182
+
183
+ for i, result in enumerate(run_result.results, start=1):
184
+ if result.passed:
185
+ continue
186
+ tags = f" [{', '.join(result.example.tags)}]" if result.example.tags else ""
187
+ lines.append(f"FAIL #{i}{tags}")
188
+ lines.append(f" Query: {result.example.query}")
189
+ lines.append(f" IQS: {result.iqs:.2f}")
190
+ lines.append(f" Reason: {result.gate_reason}")
191
+ lines.append("")
192
+
193
+ lines.append(
194
+ f"Summary: {run_result.passed_count}/{len(run_result.results)} passed "
195
+ f"- avg IQS {run_result.avg_iqs:.2f}"
196
+ )
197
+ return "\n".join(lines)
198
+
199
+
200
+ def format_junit_xml(suite: EvalSuite, run_result: EvalRunResult) -> str:
201
+ """Format an eval run as a JUnit XML report for CI integration.
202
+
203
+ Each example becomes a ``<testcase>``; failing examples (per
204
+ ``passes_gate()``/``gate_reason()``) get a ``<failure>`` child with the
205
+ gate reason as the failure message.
206
+ """
207
+ testsuite = ET.Element("testsuite", {
208
+ "name": suite.name,
209
+ "tests": str(len(run_result.results)),
210
+ "failures": str(run_result.failed_count),
211
+ })
212
+
213
+ for i, result in enumerate(run_result.results, start=1):
214
+ tags = ", ".join(result.example.tags) if result.example.tags else ""
215
+ case_name = f"#{i} {tags}".strip() if tags else f"#{i} {result.example.query[:40]}"
216
+ testcase = ET.SubElement(testsuite, "testcase", {
217
+ "classname": suite.name,
218
+ "name": case_name,
219
+ })
220
+ if not result.passed:
221
+ failure = ET.SubElement(testcase, "failure", {
222
+ "message": result.gate_reason or "gate failed",
223
+ })
224
+ failure.text = (
225
+ f"Query: {result.example.query}\n"
226
+ f"IQS: {result.iqs:.2f}\n"
227
+ f"Reason: {result.gate_reason}"
228
+ )
229
+
230
+ return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(testsuite, encoding="unicode")
scroot/cli/model_info.py ADDED
@@ -0,0 +1,28 @@
1
+ """scroot model-info command."""
2
+ from __future__ import annotations
3
+
4
+ from scroot.corrector.models import (
5
+ DEFAULT_MODEL_ID,
6
+ MODEL_REGISTRY,
7
+ get_model_dir,
8
+ is_model_downloaded,
9
+ )
10
+
11
+
12
+ def print_model_info() -> None:
13
+ col = "{:<16} {:<16} {:<8} {:<8} {:<12}"
14
+ print()
15
+ print(" scroot models")
16
+ print()
17
+ print(" " + col.format("Model", "Status", "Size", "RAM", "License"))
18
+ print(" " + "-" * 60)
19
+ for model_id, spec in MODEL_REGISTRY.items():
20
+ default_tag = " <- default" if model_id == DEFAULT_MODEL_ID else ""
21
+ status = "ready" if is_model_downloaded(model_id) else "not downloaded"
22
+ size = f"{spec.size_gb} GB"
23
+ ram = f"{spec.rec_ram_gb} GB"
24
+ print(f" {spec.name:<20} {status:<16} {size:<8} {ram:<8} {spec.license}{default_tag}")
25
+ print()
26
+ print(f" Models stored at: {get_model_dir()}")
27
+ print(" To download: scroot download-model [--model smollm3]")
28
+ print()
scroot/composite.py ADDED
@@ -0,0 +1,170 @@
1
+ """Information Quality Score (IQS) - composite metric.
2
+
3
+ IQS = n / sum(w_i / s_i) -- the weighted harmonic mean of the five
4
+ metric scores, where n = sum(w_i).
5
+
6
+ Two scoring modes:
7
+ harmonic (default): weighted harmonic mean. Any metric near zero drives
8
+ IQS to zero. A response with groundedness=0.1 and all others at 0.9
9
+ scores ~0.24. Zero tolerance: a single quality failure dominates the
10
+ score, which matches the goal of flagging unreliable responses.
11
+
12
+ geometric: weighted geometric mean. Penalizes low scores but does not
13
+ collapse to zero unless a metric is literally zero. Reflects partial
14
+ quality more gently: 9 correct claims + 1 wrong claim -> ~0.8 IQS
15
+ (not near 0).
16
+
17
+ Harmonic is the default and the formula documented in the README: any
18
+ metric near zero (e.g. a hallucinated claim) should drive the composite
19
+ score down hard rather than being averaged away.
20
+
21
+ Default weights:
22
+ groundedness 0.35 (most important: is it faithful to the source?)
23
+ completeness 0.25 (did it answer the full question?)
24
+ relevance 0.20 (is it on topic?)
25
+ consistency 0.15 (does it contradict itself?)
26
+ confidence 0.05 (calibration signal, low weight)
27
+
28
+ When context is not provided, groundedness weight is redistributed
29
+ proportionally across the remaining metrics.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import math
35
+
36
+
37
+ DEFAULT_WEIGHTS = {
38
+ "groundedness": 0.35,
39
+ "completeness": 0.25,
40
+ "relevance": 0.20,
41
+ "consistency": 0.15,
42
+ "confidence": 0.05,
43
+ }
44
+
45
+ # RAG-optimised preset: boost groundedness, reduce completeness weight.
46
+ # Use when the source context IS the ground truth and faithfulness is
47
+ # the primary concern.
48
+ RAG_WEIGHTS = {
49
+ "groundedness": 0.50,
50
+ "completeness": 0.15,
51
+ "relevance": 0.20,
52
+ "consistency": 0.10,
53
+ "confidence": 0.05,
54
+ }
55
+
56
+
57
def compute_iqs(
    groundedness: float | None,
    completeness: float,
    relevance: float,
    consistency: float,
    confidence: float,
    weights: dict | None = None,
    mode: str = "harmonic",
) -> float:
    """Return the Information Quality Score for five individual metric scores.

    Positional-argument front end to :func:`compute_iqs_detailed`; only the
    score is returned and the effective weights are discarded.

    Args:
        groundedness: 0-1 or None if no context was provided.
        completeness: 0-1.
        relevance: 0-1.
        consistency: 0-1.
        confidence: 0-1.
        weights: Optional custom weight dict. Missing keys default to
            DEFAULT_WEIGHTS.
        mode: "harmonic" (default) for the weighted harmonic mean, or
            "geometric" for the weighted geometric mean.

    Returns:
        IQS score in [0, 1].
    """
    metric_scores: dict[str, float] = {}
    metric_scores["completeness"] = completeness
    metric_scores["relevance"] = relevance
    metric_scores["consistency"] = consistency
    metric_scores["confidence"] = confidence

    # Groundedness is only added when it was actually measured. Leaving it
    # out (rather than passing 0) is what lets its weight be redistributed
    # over the other metrics.
    if groundedness is not None:
        metric_scores["groundedness"] = groundedness

    return compute_iqs_detailed(metric_scores, weights=weights, mode=mode)[0]
102
+
103
+
104
def compute_iqs_detailed(
    scores: "dict[str, float | None]",
    weights: "dict | None" = None,
    mode: str = "harmonic",
) -> "tuple[float, dict[str, float]]":
    """Compute IQS from a dict of scores and return the weights that were used.

    A metric takes part only if its score is not ``None`` and its weight is
    positive. The weights of those metrics are rescaled to sum to 1.0, so an
    inapplicable metric (for example ``groundedness`` without context) is
    dropped instead of being counted as a zero.

    A score of exactly ``0.0`` is treated as a real measurement and makes the
    result ``0.0`` in both modes.

    Args:
        scores: Metric name -> score (float) or ``None`` (inapplicable).
        weights: Metric name -> weight. Missing keys fall back to
            ``DEFAULT_WEIGHTS``. Need not sum to 1.0.
        mode: ``"geometric"`` selects the weighted geometric mean; any other
            value uses the weighted harmonic mean (the default).

    Returns:
        ``(iqs, effective_weights)`` - the IQS clamped to ``[0.0, 1.0]`` and
        rounded to 4 decimals (or exactly ``0.0`` on a zero score), and the
        rescaled weights of the metrics that took part.

    Raises:
        ValueError: if no metric is active (all ``None`` or zero-weighted).
    """
    merged = dict(DEFAULT_WEIGHTS)
    merged.update(weights or {})

    # Keep only metrics that are both scored and positively weighted; a
    # caller can opt a metric out by giving it a weight of 0.0.
    usable = {}
    for metric, value in scores.items():
        if value is None:
            continue
        if merged.get(metric, 0.0) > 0.0:
            usable[metric] = value

    if not usable:
        raise ValueError(
            "All metrics are None (or zero-weighted) - nothing to compute IQS "
            "from. At least one metric must have a non-None score and a "
            "positive weight."
        )

    weight_sum = sum(merged[metric] for metric in usable)
    effective_weights = {metric: merged[metric] / weight_sum for metric in usable}

    # An exact zero short-circuits to 0.0 so both means behave the same way.
    for value in usable.values():
        if value == 0.0:
            return 0.0, effective_weights

    floor = 1e-6  # lower bound on scores, keeps log() and division defined
    if mode != "geometric":
        # Weighted harmonic mean: 1 / sum(w_i / s_i), weights summing to 1.
        denominator = sum(
            effective_weights[metric] / max(usable[metric], floor) for metric in usable
        )
        iqs = 1.0 / denominator
    else:
        # Weighted geometric mean, evaluated as exp(sum(w_i * log(s_i))).
        exponent = sum(
            effective_weights[metric] * math.log(max(usable[metric], floor))
            for metric in usable
        )
        iqs = math.exp(exponent)

    clamped = min(max(iqs, 0.0), 1.0)
    return round(clamped, 4), effective_weights
File without changes