scroot 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scroot/__init__.py +109 -0
- scroot/agents.py +345 -0
- scroot/audit.py +131 -0
- scroot/cli/__init__.py +167 -0
- scroot/cli/download.py +49 -0
- scroot/cli/eval.py +230 -0
- scroot/cli/model_info.py +28 -0
- scroot/composite.py +170 -0
- scroot/config/__init__.py +0 -0
- scroot/config/corrector.py +92 -0
- scroot/connectors/__init__.py +5 -0
- scroot/connectors/database.py +357 -0
- scroot/context/__init__.py +9 -0
- scroot/context/adapters.py +86 -0
- scroot/context/builder.py +514 -0
- scroot/context/dedup.py +99 -0
- scroot/context/payload.py +66 -0
- scroot/context/pii.py +101 -0
- scroot/context/tokenizer.py +42 -0
- scroot/core.py +349 -0
- scroot/corrector/__init__.py +38 -0
- scroot/corrector/api.py +145 -0
- scroot/corrector/base.py +20 -0
- scroot/corrector/disabled.py +13 -0
- scroot/corrector/local.py +112 -0
- scroot/corrector/models.py +69 -0
- scroot/dashboard/__init__.py +0 -0
- scroot/dashboard/__main__.py +37 -0
- scroot/dashboard/routers/__init__.py +0 -0
- scroot/dashboard/routers/analytics.py +236 -0
- scroot/dashboard/routers/corrector.py +230 -0
- scroot/dashboard/routers/export.py +150 -0
- scroot/dashboard/routers/guardrails.py +41 -0
- scroot/dashboard/routers/pipeline.py +218 -0
- scroot/dashboard/routers/queue.py +188 -0
- scroot/dashboard/routers/records.py +252 -0
- scroot/dashboard/routers/settings.py +291 -0
- scroot/dashboard/security.py +135 -0
- scroot/dashboard/server.py +181 -0
- scroot/evidence.py +228 -0
- scroot/exceptions.py +62 -0
- scroot/feedback/__init__.py +6 -0
- scroot/feedback/injector.py +160 -0
- scroot/feedback/sanitizer.py +56 -0
- scroot/feedback/store.py +650 -0
- scroot/flags.py +42 -0
- scroot/metrics/__init__.py +15 -0
- scroot/metrics/_utils.py +9 -0
- scroot/metrics/completeness.py +139 -0
- scroot/metrics/confidence.py +83 -0
- scroot/metrics/consistency.py +125 -0
- scroot/metrics/groundedness.py +193 -0
- scroot/metrics/relevance.py +73 -0
- scroot/models.py +214 -0
- scroot/result.py +276 -0
- scroot/sampling.py +306 -0
- scroot/text_utils.py +136 -0
- scroot/ui/dist/assets/index-DW1dLzDl.js +101 -0
- scroot/ui/dist/assets/index-WOhrVVSM.css +2 -0
- scroot/ui/dist/favicon.svg +27 -0
- scroot/ui/dist/index.html +20 -0
- scroot-0.2.0.dist-info/METADATA +832 -0
- scroot-0.2.0.dist-info/RECORD +67 -0
- scroot-0.2.0.dist-info/WHEEL +5 -0
- scroot-0.2.0.dist-info/entry_points.txt +2 -0
- scroot-0.2.0.dist-info/licenses/LICENSE +201 -0
- scroot-0.2.0.dist-info/top_level.txt +1 -0
scroot/cli/__init__.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Scroot CLI - `scroot serve` starts the review dashboard."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
import typer
|
|
6
|
+
app = typer.Typer(name="scroot", help="Scroot LLM response quality tools.")
|
|
7
|
+
|
|
8
|
+
@app.command("download-model")
def download_model_cmd(
    model: str = typer.Option("phi4-mini", help="Model ID to download (phi4-mini, smollm3)"),
):
    """Download a local LLM model for offline correction."""
    # NOTE: the docstring above doubles as the command's --help text, so it
    # is kept short; implementation notes live in comments instead.
    from scroot.cli.download import download_model

    try:
        download_model(model)
    except Exception as e:
        # CLI boundary: report a one-line message instead of a traceback.
        # err=True sends it to stderr so stdout only carries normal output,
        # and `from e` keeps the original failure attached for debugging.
        typer.echo(f"ERROR: {e}", err=True)
        raise typer.Exit(1) from e
|
|
19
|
+
|
|
20
|
+
@app.command("model-info")
def model_info_cmd():
    """List available local LLM models and their download status."""
    # Thin wrapper: the table is rendered by scroot.cli.model_info, which is
    # only imported when this command actually runs.
    from scroot.cli import model_info as _model_info_module

    _model_info_module.print_model_info()
|
|
25
|
+
|
|
26
|
+
@app.command()
def score(
    query: str = typer.Option(..., "--query", "-q", help="The user's query/question."),
    response: str = typer.Option(..., "--response", "-r", help="The LLM-generated response."),
    context: list[str] = typer.Option(
        None, "--context", "-c", help="Grounding context chunk. Repeat for multiple chunks."
    ),
    json_output: bool = typer.Option(
        False, "--json", help="Print the full result as JSON instead of a summary."
    ),
):
    """Score a single query/response pair and print the result."""
    import json as json_module
    from scroot import score as score_fn

    # An omitted or empty --context is forwarded as None rather than [].
    outcome = score_fn(query=query, response=response, context=context or None)

    if json_output:
        # default=str stringifies anything json cannot serialise natively.
        serialized = json_module.dumps(outcome.to_dict(), indent=2, default=str)
        typer.echo(serialized)
        return

    def _summary_lines():
        # Lazily yields one summary line at a time so each value is
        # formatted immediately before it is echoed.
        yield f"IQS: {outcome.iqs:.2f}"
        if outcome.groundedness is None:
            yield "Groundedness: n/a (no context provided)"
        else:
            yield f"Groundedness: {outcome.groundedness:.2f}"
        yield f"Completeness: {outcome.completeness:.2f}"
        yield f"Relevance: {outcome.relevance:.2f}"
        yield f"Consistency: {outcome.consistency:.2f}"
        yield f"Confidence: {outcome.confidence:.2f}"
        yield f"Flags: {outcome.flags}"

    for line in _summary_lines():
        typer.echo(line)
|
|
58
|
+
|
|
59
|
+
@app.command("eval")
def eval_cmd(
    suite: str = typer.Option(..., "--suite", "-s", help="Path to a YAML eval suite."),
    fail_below: float = typer.Option(
        None, "--fail-below", help="Override the suite's fail_below_iqs threshold."
    ),
    json_output: bool = typer.Option(
        False, "--json", help="Print a machine-readable JSON summary instead of text."
    ),
    output: str = typer.Option(
        None, "--output", help="Write a JUnit XML report to this path (for CI)."
    ),
):
    """Run a YAML-defined quality regression suite (CI/CD quality gate)."""
    # NOTE: the docstring above is the command's --help text; keep it short.
    import json as json_module
    from scroot.cli.eval import format_junit_xml, format_report, load_suite, run_suite

    try:
        suite_obj = load_suite(suite)
    except Exception as e:
        # Report load failures on stderr: with --json, stdout is expected to
        # contain nothing but the JSON document, so an error line there would
        # break any consumer parsing it.
        typer.echo(f"ERROR: {e}", err=True)
        raise typer.Exit(1) from e

    result = run_suite(suite_obj, fail_below=fail_below)

    # The JUnit report is written regardless of the chosen stdout format.
    if output:
        with open(output, "w", encoding="utf-8") as f:
            f.write(format_junit_xml(suite_obj, result))

    if json_output:
        summary = {
            "name": suite_obj.name,
            "passed": result.passed_count,
            "failed": result.failed_count,
            "avg_iqs": result.avg_iqs,
            "results": [
                {
                    "query": r.example.query,
                    "iqs": r.iqs,
                    "passed": r.passed,
                    "gate_reason": r.gate_reason,
                    "tags": r.example.tags,
                }
                for r in result.results
            ],
        }
        typer.echo(json_module.dumps(summary, indent=2))
    else:
        typer.echo(format_report(suite_obj, result))

    # Non-zero exit status is what makes this usable as a CI quality gate.
    if result.failed_count:
        raise typer.Exit(1)
|
|
110
|
+
|
|
111
|
+
@app.command()
def serve(
    port: int = typer.Option(7432, help="Port to listen on"),
    store: str = typer.Option("./scroot_store.jsonl", help="JSONL feedback store path"),
    host: str = typer.Option("127.0.0.1", help="Host to bind to"),
    token: str = typer.Option(
        None, "--token",
        help="Require this shared token on all /api routes (for network "
        "binds). Falls back to SCROOT_DASHBOARD_TOKEN.",
    ),
    hosted: bool = typer.Option(False, hidden=True),
):
    """Start the Scroot Review Console at http://localhost:7432

    The dashboard has no per-user login. The default 127.0.0.1 bind is
    single-user safe. If you bind to a routable host (e.g. --host 0.0.0.0),
    set --token (or SCROOT_DASHBOARD_TOKEN) and/or front it with an
    authenticating reverse proxy - otherwise the correction store and the
    stored LLM API key are reachable by anyone on the network.
    """
    # uvicorn is an optional dependency; fail with an install hint if absent.
    try:
        import uvicorn
    except ImportError:
        typer.echo("ERROR: Install dashboard deps: pip install 'scroot[dashboard]'")
        raise typer.Exit(1)

    from scroot.dashboard.security import is_loopback_host, resolve_dashboard_token
    from scroot.dashboard.server import create_app

    fa_app = create_app(store_path=store, hosted=hosted, host=host, auth_token=token)

    # Startup banner.
    for banner_line in (
        "\n * SCROOT Review Console",
        f" Store : {store}",
        f" URL : http://{host}:{port}",
    ):
        typer.echo(banner_line)

    # Either confirm that auth is on, or warn when a non-loopback bind has
    # no token; a loopback bind without a token prints nothing here.
    auth_notice = None
    if resolve_dashboard_token(token) is not None:
        auth_notice = " Auth : token required (Authorization: Bearer / X-Scroot-Token)"
    elif not is_loopback_host(host):
        auth_notice = (
            " WARNING: non-loopback bind with NO auth - the store and "
            "stored API key are exposed to the network. Set --token."
        )
    if auth_notice is not None:
        typer.echo(auth_notice)

    typer.echo("")
    uvicorn.run(fa_app, host=host, port=port, log_level="info")
|
|
154
|
+
|
|
155
|
+
except ImportError:
|
|
156
|
+
# typer not installed - provide a minimal fallback
|
|
157
|
+
import sys
|
|
158
|
+
|
|
159
|
+
class _FakeCLI:
    """Stand-in for the Typer app used when typer is not installed.

    Decorating with ``command(...)`` is a no-op, and invoking the object
    prints an install hint and exits with status 1.
    """

    def command(self, *a, **kw):
        # Accept and ignore any arguments; hand back an identity decorator
        # so decorated functions are left untouched.
        return lambda fn: fn

    def __call__(self):
        hint = "scroot: install typer for CLI support: pip install typer"
        print(hint)
        sys.exit(1)
|
|
166
|
+
|
|
167
|
+
app = _FakeCLI()
|
scroot/cli/download.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""scroot download-model command."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from scroot.corrector.models import (
|
|
5
|
+
DEFAULT_MODEL_ID,
|
|
6
|
+
MODEL_REGISTRY,
|
|
7
|
+
get_model_dir,
|
|
8
|
+
get_model_path,
|
|
9
|
+
is_model_downloaded,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def download_model(model_id: str = DEFAULT_MODEL_ID) -> None:
    """Download the model registered under ``model_id`` from the Hugging Face Hub.

    Prints progress to stdout. Returns early, without downloading, when
    ``is_model_downloaded`` reports the model as already present.

    Args:
        model_id: Key into ``MODEL_REGISTRY``.

    Raises:
        ValueError: If ``model_id`` is not a key of ``MODEL_REGISTRY``.
        RuntimeError: If ``huggingface_hub`` cannot be imported (the original
            ``ImportError`` is attached as ``__cause__``).
    """
    if model_id not in MODEL_REGISTRY:
        ids = ", ".join(MODEL_REGISTRY.keys())
        raise ValueError(f"Unknown model '{model_id}'. Available: {ids}")

    spec = MODEL_REGISTRY[model_id]

    if is_model_downloaded(model_id):
        print(f"{spec.name} is already downloaded at {get_model_path(model_id)}")
        return

    # Optional dependency: only needed when a download actually happens.
    try:
        from huggingface_hub import hf_hub_download
    except ImportError as exc:
        # Chain the cause so the underlying import failure stays visible.
        raise RuntimeError(
            "huggingface-hub is not installed. "
            "Run: pip install 'scroot[local]'"
        ) from exc

    dest = get_model_dir() / model_id
    dest.mkdir(parents=True, exist_ok=True)

    print(f"Downloading {spec.name} ({spec.size_gb} GB)...")
    print(f"Source : {spec.hf_repo}")
    print(f"Destination : {dest}")
    print()

    # NOTE(review): recent huggingface_hub releases document
    # `resume_download` as deprecated - confirm it is still accepted by the
    # version this package pins before upgrading.
    hf_hub_download(
        repo_id=spec.hf_repo,
        filename=spec.hf_filename,
        local_dir=str(dest),
        resume_download=True,
        token=False,  # explicitly pass no token
    )

    print(f"\nOK {spec.name} ready at {dest / spec.hf_filename}")
    print("Run `scroot serve` to start the dashboard.")
|
scroot/cli/eval.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""`scroot eval` - run a YAML-defined quality regression suite.
|
|
2
|
+
|
|
3
|
+
Loads a suite of (query, response, context) examples with expected IQS/
|
|
4
|
+
groundedness floors, scores each with scroot, and reports pass/fail using
|
|
5
|
+
EntailmentResult.passes_gate() / gate_reason(). Intended as a CI/CD quality
|
|
6
|
+
gate - exits non-zero if any example fails its gate.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from xml.etree import ElementTree as ET
|
|
13
|
+
|
|
14
|
+
from scroot import score
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class EvalExample:
    """One eval-suite case: a query, the response to score, and optional extras."""

    # Required inputs handed to the scorer.
    query: str
    response: str
    # Grounding context: a single string, a list of chunks, or None.
    context: "str | list[str] | None" = None
    # Per-example IQS floor; None means "no example-specific floor".
    expected_iqs_min: "float | None" = None
    # Free-form labels; each instance gets its own fresh list.
    tags: list[str] = field(default_factory=list)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class EvalSuite:
    """A named list of EvalExamples plus optional suite-wide gate thresholds."""

    # Suite label used in reports.
    name: str
    # Cases to score.
    examples: list[EvalExample]
    # Suite-level IQS floor; None means unset.
    fail_below_iqs: "float | None" = None
    # Suite-level groundedness floor; None means unset.
    fail_below_groundedness: "float | None" = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ExampleResult:
    """What happened when one EvalExample was scored and gated."""

    # The case that was scored.
    example: EvalExample
    # IQS reported by the scorer for this case.
    iqs: float
    # True when the gate produced no failure reason.
    passed: bool
    # Failure explanation, or None when the case passed.
    gate_reason: "str | None"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class EvalRunResult:
    """Collected per-example results of one suite run, with summary counters."""

    results: list[ExampleResult]

    @property
    def passed_count(self) -> int:
        """Number of results whose ``passed`` flag is truthy."""
        return sum(bool(item.passed) for item in self.results)

    @property
    def failed_count(self) -> int:
        """Number of results whose ``passed`` flag is falsy."""
        return sum(not item.passed for item in self.results)

    @property
    def avg_iqs(self) -> float:
        """Mean IQS over all results, or 0.0 when there are none."""
        count = len(self.results)
        if count == 0:
            return 0.0
        return sum(item.iqs for item in self.results) / count
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _import_yaml():
    """Return the ``yaml`` module, or raise RuntimeError with an install hint.

    The import happens on each call rather than at module import time, and
    the original ImportError is chained as the cause.
    """
    try:
        import yaml as yaml_module
    except ImportError as exc:
        hint = (
            "pyyaml is required for `scroot eval`. "
            "Install it with: pip install pyyaml"
        )
        raise RuntimeError(hint) from exc
    else:
        return yaml_module
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def load_suite(path: str) -> EvalSuite:
    """Load an EvalSuite from a YAML file.

    Expected shape:

        name: Support regression suite
        fail_below_iqs: 0.70
        fail_below_groundedness: 0.80
        examples:
          - query: "..."
            response: "..."
            context: "..."          # or a list of strings
            expected_iqs_min: 0.75  # optional, overrides fail_below_iqs
            tags: [billing]         # a single bare string is also accepted

    Args:
        path: Path to the YAML suite file.

    Returns:
        Parsed EvalSuite. ``name`` falls back to ``path`` when it is absent
        or null, and is always a string; tags are always a list of strings.

    Raises:
        RuntimeError: If pyyaml is not installed.
        OSError: If the file cannot be read.
        ValueError: If the YAML is malformed, an example is not a mapping,
            an example lacks ``query``/``response``, or ``tags`` has an
            unsupported type.
    """
    yaml = _import_yaml()

    with open(path, encoding="utf-8") as f:
        try:
            data = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            raise ValueError(f"Invalid YAML in {path}: {exc}") from exc

    if not isinstance(data, dict):
        raise ValueError(f"{path}: top-level YAML must be a mapping")

    raw_examples = data.get("examples") or []
    if not isinstance(raw_examples, list):
        raise ValueError(f"{path}: 'examples' must be a list")

    examples = []
    for i, raw in enumerate(raw_examples):
        # Guard first: on a string entry `"query" in raw` is a substring
        # test, and on None/int it raises TypeError rather than the
        # documented ValueError.
        if not isinstance(raw, dict):
            raise ValueError(f"{path}: examples[{i}] must be a mapping")
        if "query" not in raw or "response" not in raw:
            raise ValueError(f"{path}: examples[{i}] is missing 'query' or 'response'")

        tags = raw.get("tags") or []
        if isinstance(tags, str):
            # `tags: billing` means one tag, not a sequence of characters.
            tags = [tags]
        elif not isinstance(tags, list):
            raise ValueError(f"{path}: examples[{i}] 'tags' must be a string or a list")

        examples.append(EvalExample(
            query=raw["query"],
            response=raw["response"],
            context=raw.get("context"),
            expected_iqs_min=raw.get("expected_iqs_min"),
            # YAML scalars such as 2024 parse as non-strings; reports join
            # tags with ", ", which requires strings.
            tags=[str(tag) for tag in tags],
        ))

    # `name: null` (or a numeric name) must not leak a non-string into the
    # report formatters.
    name = data.get("name")
    return EvalSuite(
        name=path if name is None else str(name),
        examples=examples,
        fail_below_iqs=data.get("fail_below_iqs"),
        fail_below_groundedness=data.get("fail_below_groundedness"),
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def run_suite(suite: EvalSuite, fail_below: "float | None" = None) -> EvalRunResult:
    """Score each example of ``suite`` and decide whether it passes its gate.

    The IQS threshold for an example is the first value that is not None
    among: the example's ``expected_iqs_min``, ``fail_below``, the suite's
    ``fail_below_iqs``, and finally the built-in default of 0.70.

    Args:
        suite: The eval suite to run.
        fail_below: Optional IQS threshold override; it does not replace an
            example's own ``expected_iqs_min``.

    Returns:
        EvalRunResult holding one ExampleResult per example, in suite order.
    """
    outcomes = []
    for example in suite.examples:
        scored = score(
            query=example.query,
            response=example.response,
            context=example.context,
        )

        # Precedence order, most specific first; 0.70 is the last resort.
        candidates = (example.expected_iqs_min, fail_below, suite.fail_below_iqs)
        threshold = next((value for value in candidates if value is not None), 0.70)

        verdict = scored.gate_reason(
            threshold=threshold,
            require_groundedness=suite.fail_below_groundedness,
        )
        # A None verdict means the gate raised no objection.
        outcomes.append(ExampleResult(
            example=example,
            iqs=scored.iqs,
            passed=verdict is None,
            gate_reason=verdict,
        ))

    return EvalRunResult(results=outcomes)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def format_report(suite: EvalSuite, run_result: EvalRunResult) -> str:
    """Render an eval run as plain text: failing cases first, then a summary.

    Passing examples are not listed individually; failures keep their
    1-based position in the suite.
    """
    report = [f"Eval suite: {suite.name}", ""]

    for position, item in enumerate(run_result.results, start=1):
        if not item.passed:
            example = item.example
            label = f" [{', '.join(example.tags)}]" if example.tags else ""
            report.extend([
                f"FAIL #{position}{label}",
                f" Query: {example.query}",
                f" IQS: {item.iqs:.2f}",
                f" Reason: {item.gate_reason}",
                "",
            ])

    total = len(run_result.results)
    report.append(
        f"Summary: {run_result.passed_count}/{total} passed "
        f"- avg IQS {run_result.avg_iqs:.2f}"
    )
    return "\n".join(report)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def format_junit_xml(suite: EvalSuite, run_result: EvalRunResult) -> str:
    """Format an eval run as a JUnit XML report for CI integration.

    Each example becomes a ``<testcase>`` named after its 1-based position
    plus its tags (or, when untagged, the first 40 characters of the query).
    Failing examples get a ``<failure>`` child whose ``message`` is the gate
    reason and whose text repeats the query, IQS and reason.

    Suite name, tags, query and gate reason are converted with ``str()``
    before serialisation, because ElementTree refuses non-string attribute
    values and YAML scalars such as ``name: 2024`` parse as numbers.

    Args:
        suite: Suite that was run; only ``name`` is read.
        run_result: Run outcome; ``results`` and ``failed_count`` are read.

    Returns:
        The XML document as a string, including the XML declaration.
    """
    suite_name = str(suite.name)
    testsuite = ET.Element("testsuite", {
        "name": suite_name,
        "tests": str(len(run_result.results)),
        "failures": str(run_result.failed_count),
    })

    for i, result in enumerate(run_result.results, start=1):
        example = result.example
        query = str(example.query)
        tags = ", ".join(str(tag) for tag in example.tags) if example.tags else ""
        # Tagged cases are named by their tags; untagged ones by a query prefix.
        case_name = f"#{i} {tags}".strip() if tags else f"#{i} {query[:40]}"
        testcase = ET.SubElement(testsuite, "testcase", {
            "classname": suite_name,
            "name": case_name,
        })
        if not result.passed:
            failure = ET.SubElement(testcase, "failure", {
                "message": str(result.gate_reason) if result.gate_reason else "gate failed",
            })
            failure.text = (
                f"Query: {query}\n"
                f"IQS: {result.iqs:.2f}\n"
                f"Reason: {result.gate_reason}"
            )

    # ET.tostring(encoding="unicode") omits the declaration, so add it here.
    return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(testsuite, encoding="unicode")
|
scroot/cli/model_info.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""scroot model-info command."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from scroot.corrector.models import (
|
|
5
|
+
DEFAULT_MODEL_ID,
|
|
6
|
+
MODEL_REGISTRY,
|
|
7
|
+
get_model_dir,
|
|
8
|
+
is_model_downloaded,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def print_model_info() -> None:
    """Print a table of every model in MODEL_REGISTRY and its download status.

    Each row shows the model's display name, whether it is downloaded, its
    size, its RAM figure and its license; the default model is marked. The
    model directory and a download hint follow the table.
    """
    header = "{:<16} {:<16} {:<8} {:<8} {:<12}".format(
        "Model", "Status", "Size", "RAM", "License"
    )
    print()
    print(" scroot models")
    print()
    print(" " + header)
    print(" " + "-" * 60)

    for model_id, spec in MODEL_REGISTRY.items():
        marker = " <- default" if model_id == DEFAULT_MODEL_ID else ""
        if is_model_downloaded(model_id):
            status = "ready"
        else:
            status = "not downloaded"
        size = f"{spec.size_gb} GB"
        ram = f"{spec.rec_ram_gb} GB"
        cells = (
            f"{spec.name:<20}",
            f"{status:<16}",
            f"{size:<8}",
            f"{ram:<8}",
            f"{spec.license}{marker}",
        )
        print(" " + " ".join(cells))

    print()
    print(f" Models stored at: {get_model_dir()}")
    print(" To download: scroot download-model [--model smollm3]")
    print()
|
scroot/composite.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Information Quality Score (IQS) - composite metric.
|
|
2
|
+
|
|
3
|
+
IQS = n / sum(w_i / s_i) -- the weighted harmonic mean of the five
|
|
4
|
+
metric scores, where n = sum(w_i).
|
|
5
|
+
|
|
6
|
+
Two scoring modes:
|
|
7
|
+
harmonic (default): weighted harmonic mean. Any metric near zero drives
|
|
8
|
+
IQS to zero. A response with groundedness=0.1 and all others at 0.9
|
|
9
|
+
scores ~0.24 with the default weights. Zero tolerance: a single quality failure dominates the
|
|
10
|
+
score, which matches the goal of flagging unreliable responses.
|
|
11
|
+
|
|
12
|
+
geometric: weighted geometric mean. Penalizes low scores but does not
|
|
13
|
+
collapse to zero unless a metric is literally zero. Reflects partial
|
|
14
|
+
quality more gently: 9 correct claims + 1 wrong claim -> ~0.8 IQS
|
|
15
|
+
(not near 0).
|
|
16
|
+
|
|
17
|
+
Harmonic is the default and the formula documented in the README: any
|
|
18
|
+
metric near zero (e.g. a hallucinated claim) should drive the composite
|
|
19
|
+
score down hard rather than being averaged away.
|
|
20
|
+
|
|
21
|
+
Default weights:
|
|
22
|
+
groundedness 0.35 (most important: is it faithful to the source?)
|
|
23
|
+
completeness 0.25 (did it answer the full question?)
|
|
24
|
+
relevance 0.20 (is it on topic?)
|
|
25
|
+
consistency 0.15 (does it contradict itself?)
|
|
26
|
+
confidence 0.05 (calibration signal, low weight)
|
|
27
|
+
|
|
28
|
+
When context is not provided, groundedness weight is redistributed
|
|
29
|
+
proportionally across the remaining metrics.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import math
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Default per-metric weights for the composite score (they sum to 1.0).
DEFAULT_WEIGHTS = dict(
    groundedness=0.35,
    completeness=0.25,
    relevance=0.20,
    consistency=0.15,
    confidence=0.05,
)

# RAG-optimised preset: more weight on groundedness, less on completeness
# and consistency. Intended for setups where the source context is the
# ground truth and faithfulness to it matters most.
RAG_WEIGHTS = dict(
    groundedness=0.50,
    completeness=0.15,
    relevance=0.20,
    consistency=0.10,
    confidence=0.05,
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def compute_iqs(
    groundedness: float | None,
    completeness: float,
    relevance: float,
    consistency: float,
    confidence: float,
    weights: dict | None = None,
    mode: str = "harmonic",
) -> float:
    """Return the Information Quality Score for five individual metric scores.

    Keyword-style front end to ``compute_iqs_detailed``: it packs the
    metrics into a dict, delegates, and returns only the score.

    Args:
        groundedness: Score in 0-1, or None when no context was provided.
        completeness: Score in 0-1.
        relevance: Score in 0-1.
        consistency: Score in 0-1.
        confidence: Score in 0-1.
        weights: Optional weight overrides, forwarded unchanged.
        mode: "harmonic" (default) or "geometric", forwarded unchanged.

    Returns:
        The IQS value produced by ``compute_iqs_detailed``.
    """
    # A None groundedness is left out of the dict entirely, so it is never
    # scored as zero; when present it is added last.
    optional_part = {} if groundedness is None else {"groundedness": groundedness}
    metric_scores: dict[str, float] = {
        "completeness": completeness,
        "relevance": relevance,
        "consistency": consistency,
        "confidence": confidence,
        **optional_part,
    }
    return compute_iqs_detailed(metric_scores, weights=weights, mode=mode)[0]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def compute_iqs_detailed(
    scores: "dict[str, float | None]",
    weights: "dict | None" = None,
    mode: str = "harmonic",
) -> "tuple[float, dict[str, float]]":
    """Compute IQS from a score dict and return the weights that were applied.

    Weights start from ``DEFAULT_WEIGHTS`` and are overridden by ``weights``.
    A metric takes part only if its score is not ``None`` and its weight is
    greater than zero; the weights of the participating metrics are rescaled
    to sum to 1.0. A participating score equal to ``0.0`` short-circuits the
    result to ``0.0`` in either mode.

    Args:
        scores: Metric name -> score, or ``None`` for "not applicable".
        weights: Metric name -> weight overrides; need not sum to 1.0.
        mode: ``"geometric"`` selects the weighted geometric mean; any other
            value uses the weighted harmonic mean.

    Returns:
        ``(iqs, effective_weights)``: the score clamped to [0.0, 1.0] and
        rounded to 4 decimals (or exactly ``0.0`` on the zero short-circuit),
        and the rescaled weights of the participating metrics.

    Raises:
        ValueError: If no metric participates.
    """
    merged = dict(DEFAULT_WEIGHTS)
    if weights:
        merged.update(weights)

    # Keep only metrics that are both scored and positively weighted; a
    # weight of 0.0 is therefore an explicit opt-out for that metric.
    active = {}
    for metric, value in scores.items():
        if value is None:
            continue
        if merged.get(metric, 0.0) > 0.0:
            active[metric] = value

    if not active:
        raise ValueError(
            "All metrics are None (or zero-weighted) - nothing to compute IQS "
            "from. At least one metric must have a non-None score and a "
            "positive weight."
        )

    weight_total = sum(merged[metric] for metric in active)
    effective_weights = {metric: merged[metric] / weight_total for metric in active}

    # An exact zero is a measured failure, not missing data: both modes
    # report 0.0 for it.
    for value in active.values():
        if value == 0.0:
            return 0.0, effective_weights

    # Floor tiny or negative scores so log() and division stay defined.
    eps = 1e-6
    floored = {metric: max(value, eps) for metric, value in active.items()}

    if mode == "geometric":
        # prod(s_i ** w_i), evaluated in log space.
        log_sum = sum(effective_weights[metric] * math.log(floored[metric]) for metric in active)
        iqs = math.exp(log_sum)
    else:
        # 1 / sum(w_i / s_i) with the weights already normalised to 1.
        iqs = 1.0 / sum(effective_weights[metric] / floored[metric] for metric in active)

    clamped = min(max(iqs, 0.0), 1.0)
    return round(clamped, 4), effective_weights
|
|
File without changes
|