vrty 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vrty/__init__.py +16 -0
- vrty/__main__.py +8 -0
- vrty/_version.py +8 -0
- vrty/cli.py +123 -0
- vrty/composite.py +164 -0
- vrty/data/__init__.py +0 -0
- vrty/data/idf.json.gz +0 -0
- vrty/data_loader.py +107 -0
- vrty/explanations.py +101 -0
- vrty/scoring.py +220 -0
- vrty-1.0.0.dist-info/METADATA +383 -0
- vrty-1.0.0.dist-info/RECORD +16 -0
- vrty-1.0.0.dist-info/WHEEL +5 -0
- vrty-1.0.0.dist-info/entry_points.txt +2 -0
- vrty-1.0.0.dist-info/licenses/LICENSE +21 -0
- vrty-1.0.0.dist-info/top_level.txt +1 -0
vrty/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""VRTY v1.0 - deterministic LLM output quality scoring.
|
|
2
|
+
|
|
3
|
+
Public surface:
|
|
4
|
+
|
|
5
|
+
>>> from vrty import score
|
|
6
|
+
>>> result = score("What is the capital of France?", "Paris is the capital.")
|
|
7
|
+
>>> result.composite
|
|
8
|
+
0.79...
|
|
9
|
+
>>> result.to_dict()["weights"]
|
|
10
|
+
{'relevance': 0.35, 'coherence': 0.2, 'completeness': 0.3, 'conciseness': 0.15}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from vrty._version import __version__
|
|
14
|
+
from vrty.composite import VrtyScore, score
|
|
15
|
+
|
|
16
|
+
__all__ = ["__version__", "VrtyScore", "score"]
|
vrty/__main__.py
ADDED
vrty/_version.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Single source of truth for the VRTY runtime version.
|
|
2
|
+
|
|
3
|
+
Kept in a leaf module with no other imports so ``vrty/__init__.py`` can
|
|
4
|
+
re-export it and other modules (``composite.py``, ``data_loader.py``) can
|
|
5
|
+
import it without circular-import risk.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "1.0.0"
|
vrty/cli.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Command-line interface for VRTY v1.0.
|
|
2
|
+
|
|
3
|
+
A single command that scores a (prompt, response) pair and prints the locked
|
|
4
|
+
9-key result as JSON. Pure stdlib (argparse + json + pathlib); no third-party
|
|
5
|
+
CLI library, no subcommands, no output-shape changes from the library API.
|
|
6
|
+
|
|
7
|
+
Default output is one compact JSON line with ``sort_keys=True`` so the bytes
|
|
8
|
+
are stable enough to diff or pipe into ``jq`` or another tool. ``--pretty``
|
|
9
|
+
indents the JSON for human reading.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from vrty import __version__, score
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``vrty`` argument parser.

    The prompt and the response are each supplied through their own required,
    mutually exclusive pair of options: a literal string (``--prompt`` /
    ``--response``) or a UTF-8 file path (``--prompt-file`` /
    ``--response-file``). ``--pretty`` selects indented JSON output and
    ``--version`` prints ``vrty <version>`` and exits.
    """
    p = argparse.ArgumentParser(
        prog="vrty",
        description=(
            "VRTY v1.0 - deterministic LLM output quality scoring. "
            "Returns a 0.0-1.0 composite plus per-dimension breakdown "
            "(relevance, coherence, completeness, conciseness) for a "
            "(prompt, response) pair. No LLM call; pure deterministic math."
        ),
        epilog=(
            "Examples:\n"
            " vrty --prompt 'What is the capital of France?' "
            "--response 'Paris is the capital of France.'\n"
            " vrty --prompt-file p.txt --response-file r.txt --pretty\n"
            " vrty --prompt 'hello' --response-file /dev/stdin < response.txt"
        ),
        # Keep the hand-formatted epilog line breaks intact in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--version",
        action="version",
        version=f"vrty {__version__}",
    )

    # The prompt and response options are symmetric: one required group each,
    # holding a literal-text option and a read-from-file option.
    for kind in ("prompt", "response"):
        group = p.add_mutually_exclusive_group(required=True)
        group.add_argument(
            f"--{kind}",
            metavar="TEXT",
            help=f"{kind} text (literal string)",
        )
        group.add_argument(
            f"--{kind}-file",
            metavar="PATH",
            help=f"read {kind} text from this UTF-8 file (use /dev/stdin for stdin)",
        )

    p.add_argument(
        "--pretty",
        action="store_true",
        help="indent JSON output for readability (default: one compact line)",
    )
    return p
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _read_text_arg(literal: str | None, file_path: str | None) -> str:
    """Resolve a literal-or-file pair to a single string.

    Args:
        literal: Text given directly on the command line, or ``None``.
        file_path: Path of a UTF-8 file to read when ``literal`` is ``None``.

    Returns:
        ``literal`` unchanged when it is not ``None`` (an empty string is a
        valid literal); otherwise the decoded contents of ``file_path``.

    Raises:
        ValueError: If both arguments are ``None``. The CLI's mutually
            exclusive groups prevent this, but the check is explicit rather
            than an ``assert`` so it still runs under ``python -O``.
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if literal is not None:
        return literal
    if file_path is None:
        raise ValueError("either literal text or a file path is required")
    return Path(file_path).read_text(encoding="utf-8")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point. Returns the exit code; does not call sys.exit itself.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Exit codes:
        0 - success, score printed to stdout
        1 - input error (e.g., --prompt-file does not exist, is unreadable,
            or is not valid UTF-8)
        2 - argparse error (e.g., missing required argument). argparse prints
            its own usage message and calls sys.exit(2) before main() returns.
    """
    args = _build_parser().parse_args(argv)

    try:
        prompt = _read_text_arg(args.prompt, args.prompt_file)
        response = _read_text_arg(args.response, args.response_file)
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError: without it a
        # non-UTF-8 input file would escape as a traceback instead of the
        # documented exit code 1.
        print(f"vrty: {e}", file=sys.stderr)
        return 1

    payload = score(prompt, response).to_dict()

    # sort_keys keeps the serialized bytes stable in both output modes.
    if args.pretty:
        output = json.dumps(payload, sort_keys=True, indent=2, ensure_ascii=False)
    else:
        output = json.dumps(
            payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False,
        )

    sys.stdout.write(output)
    sys.stdout.write("\n")
    return 0
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Allow ``python vrty/cli.py``: propagate main()'s return value as the exit code.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|
vrty/composite.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Composite VRTY score: weighted sum of the four pure dimension scores.
|
|
2
|
+
|
|
3
|
+
The composite layer is the public entry point: ``score(prompt, response)``
|
|
4
|
+
returns a frozen ``VrtyScore`` carrying the composite, the per-dimension
|
|
5
|
+
breakdown, and the provenance fields (``vrty_version``, ``idf_sha256``,
|
|
6
|
+
``weights``) that make any score traceable to the exact scoring logic that
|
|
7
|
+
produced it.
|
|
8
|
+
|
|
9
|
+
Weights are fixed v1.0 constants; they are not configurable. The IDF table
|
|
10
|
+
and ``idf_oov`` are loaded from the bundled frozen file via
|
|
11
|
+
``data_loader.load_idf()`` and cannot be supplied by the caller — this is
|
|
12
|
+
how the v1.0 "version-locked" guarantee is enforced in code.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from types import MappingProxyType
|
|
19
|
+
from typing import Mapping
|
|
20
|
+
|
|
21
|
+
from vrty._version import __version__
|
|
22
|
+
from vrty.data_loader import load_idf
|
|
23
|
+
from vrty.explanations import build_explanations, empty_response_explanations
|
|
24
|
+
from vrty.scoring import (
|
|
25
|
+
_tokenize,
|
|
26
|
+
coherence,
|
|
27
|
+
completeness,
|
|
28
|
+
conciseness,
|
|
29
|
+
relevance,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Fixed weights for v1.0. Pinned constants — not configurable.
# Configurability is explicitly out of scope per the v1.0 scope lock;
# revisiting requires a v1.1 conversation. Weights sum to 1.0 so the
# composite stays in [0, 1] without further normalization.
WEIGHT_RELEVANCE: float = 0.35
WEIGHT_COHERENCE: float = 0.20
WEIGHT_COMPLETENESS: float = 0.30
WEIGHT_CONCISENESS: float = 0.15

# Read-only view so the published weights cannot be mutated at runtime.
_WEIGHTS: Mapping[str, float] = MappingProxyType({
    "relevance": WEIGHT_RELEVANCE,
    "coherence": WEIGHT_COHERENCE,
    "completeness": WEIGHT_COMPLETENESS,
    "conciseness": WEIGHT_CONCISENESS,
})

# Fail-fast at import time: the v1.0 invariant is that weights sum to 1.0.
# None of 0.35, 0.20, 0.30 and 0.15 is exactly representable in binary
# floating point; the exact comparison holds only because the rounding errors
# happen to cancel for these specific values summed in this insertion order.
# Re-verify the check whenever a weight or the ordering changes.
# An explicit raise (same exception type as the former ``assert``) is used so
# the invariant is still enforced under ``python -O``.
if sum(_WEIGHTS.values()) != 1.0:
    raise AssertionError(
        f"VRTY weights must sum to 1.0; got {sum(_WEIGHTS.values())!r}"
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass(frozen=True)
class VrtyScore:
    """Result of scoring a (prompt, response) pair.

    The dataclass is frozen, so the fields of a score object cannot be
    rebound after creation (a downstream consumer cannot accidentally change
    a score it received). The class itself does not validate ranges; the
    ``score`` function in this module is what produces values in
    ``[0.0, 1.0]``.

    ``vrty_version`` records the scoring-logic version. ``idf_sha256`` is
    the SHA-256 of the bundled IDF data file used at scoring time; together
    they make any single score reproducible. ``weights`` defaults to the
    module's pinned weight mapping.
    """

    composite: float
    relevance: float
    coherence: float
    completeness: float
    conciseness: float
    explanations: Mapping[str, str]
    vrty_version: str
    idf_sha256: str
    weights: Mapping[str, float] = field(default_factory=lambda: _WEIGHTS)

    def to_dict(self) -> dict[str, object]:
        """Return a plain JSON-serializable dict with nine fixed keys.

        The two mapping fields are copied into plain ``dict`` objects so the
        result can be passed straight to ``json.dumps``.
        ``json.dumps(score.to_dict(), sort_keys=True)`` produces a byte-stable
        serialization suitable for caching, hashing, or diffing scores.
        """
        return {
            "composite": self.composite,
            "relevance": self.relevance,
            "coherence": self.coherence,
            "completeness": self.completeness,
            "conciseness": self.conciseness,
            "explanations": dict(self.explanations),
            "vrty_version": self.vrty_version,
            "idf_sha256": self.idf_sha256,
            "weights": dict(self.weights),
        }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def score(prompt: str, response: str) -> VrtyScore:
    """Score a (prompt, response) pair across four dimensions and a composite.

    Per the v1.0 input contract, a response with no tokens (empty string,
    whitespace only, punctuation only, or non-ASCII only after NFKD strip)
    returns 0.0 on every dimension and the composite. Other degenerate
    inputs (empty prompt, single word, oversized) are handled by the
    underlying scoring functions per their documented behavior.

    The function takes no IDF or weight arguments. The IDF table,
    ``idf_oov`` and the stopword set come from ``load_idf()``; the weights
    are the pinned module-level constants. There is no API surface through
    which a caller can silently drift the score.

    Args:
        prompt: The prompt text.
        response: The response text to evaluate against the prompt.

    Returns:
        A frozen ``VrtyScore`` carrying the composite, the four dimension
        scores, their explanations, and the provenance fields.
    """
    snap = load_idf()

    # Empty-response wrapper: enforce the input-contract 0.0 across the board
    # before any per-dimension function runs. "No tokens" is detected with the
    # tokenizer itself so punctuation-only / non-ASCII-only inputs are treated
    # as empty, consistent with what the scoring functions would see.
    if not _tokenize(response):
        return VrtyScore(
            composite=0.0,
            relevance=0.0,
            coherence=0.0,
            completeness=0.0,
            conciseness=0.0,
            explanations=empty_response_explanations(),
            vrty_version=__version__,
            idf_sha256=snap.sha256,
        )

    r = relevance(prompt, response, idf=snap.idf, idf_oov=snap.idf_oov)
    co = coherence(response, idf=snap.idf, idf_oov=snap.idf_oov)
    cm = completeness(
        prompt, response,
        idf=snap.idf, idf_oov=snap.idf_oov, stopwords=snap.stopwords,
    )
    cn = conciseness(response, stopwords=snap.stopwords)

    # Evaluation order is part of the result: do not reorder these terms, or
    # floating-point rounding may change the composite.
    composite = (
        WEIGHT_RELEVANCE * r
        + WEIGHT_COHERENCE * co
        + WEIGHT_COMPLETENESS * cm
        + WEIGHT_CONCISENESS * cn
    )
    # With components in [0, 1] and weights summing to 1.0 the composite is in
    # [0, 1] mathematically; the clamp absorbs IEEE-754 rounding noise.
    if composite > 1.0:
        composite = 1.0
    elif composite < 0.0:
        composite = 0.0

    return VrtyScore(
        composite=composite,
        relevance=r,
        coherence=co,
        completeness=cm,
        conciseness=cn,
        explanations=build_explanations(
            relevance=r, coherence=co, completeness=cm, conciseness=cn,
        ),
        vrty_version=__version__,
        idf_sha256=snap.sha256,
    )
|
vrty/data/__init__.py
ADDED
|
File without changes
|
vrty/data/idf.json.gz
ADDED
|
Binary file
|
vrty/data_loader.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Load and integrity-check the bundled frozen IDF data file.
|
|
2
|
+
|
|
3
|
+
The IDF file ships at ``vrty/data/idf.json.gz``. Its SHA-256 is pinned in
|
|
4
|
+
``IDF_DATA_SHA256`` below. On first load, the bytes are checksummed; a
|
|
5
|
+
mismatch raises ``VrtyDataError``. This guarantees the runtime score
|
|
6
|
+
provenance: the only way to change scores is to ship a new IDF file with a
|
|
7
|
+
new pinned digest, which is then reflected in ``vrty_version``.
|
|
8
|
+
|
|
9
|
+
The loaded snapshot is cached in module state; subsequent calls return the
|
|
10
|
+
same object. The snapshot is a frozen ``dataclass`` so callers cannot
|
|
11
|
+
accidentally mutate the IDF dict at runtime.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import gzip
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from importlib.resources import files
|
|
21
|
+
from types import MappingProxyType
|
|
22
|
+
from typing import Mapping
|
|
23
|
+
|
|
24
|
+
# SHA-256 of the gzipped bytes of ``vrty/data/idf.json.gz`` as built by
|
|
25
|
+
# ``tools/build_idf.py`` from the pinned corpus. Updating the corpus or the
|
|
26
|
+
# build script requires updating this constant in the same commit.
|
|
27
|
+
IDF_DATA_SHA256: str = "0e475bcaa5524d1e26cbb166bb5c138e37f87e1e47b75e6506c6460a94259fd2"
|
|
28
|
+
|
|
29
|
+
IDF_DATA_RESOURCE: str = "idf.json.gz"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class VrtyDataError(RuntimeError):
    """Raised when the bundled IDF data file fails its integrity check."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class IDFSnapshot:
    """Immutable view over the frozen IDF data file.

    The dataclass is frozen, so fields cannot be rebound. The class does not
    itself wrap its arguments: immutability of the contents relies on the
    caller passing read-only containers (the loader in this module passes a
    ``MappingProxyType`` for ``idf`` and a ``frozenset`` for ``stopwords``).

    Raises:
        VrtyDataError: On construction, if ``len(idf)`` does not equal
            ``vocab_size``.
    """

    schema_version: int
    vrty_version: str
    doc_count: int
    vocab_size: int
    idf_oov: float
    idf: Mapping[str, float]
    stopwords: frozenset[str]
    sha256: str

    def __post_init__(self) -> None:
        # Sanity: the declared vocabulary size must agree with the table that
        # was actually supplied alongside it.
        if len(self.idf) != self.vocab_size:
            raise VrtyDataError(
                f"vocab_size mismatch: header says {self.vocab_size}, "
                f"idf dict has {len(self.idf)}"
            )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Process-wide cache for the parsed snapshot; filled by the first load_idf().
_cached: IDFSnapshot | None = None


def load_idf() -> IDFSnapshot:
    """Return the cached frozen IDF snapshot.

    On the first call, reads the bundled gzipped JSON via
    ``importlib.resources``, verifies its SHA-256 against ``IDF_DATA_SHA256``,
    parses it, and freezes the result. On subsequent calls, returns the
    cached snapshot (the same object).

    Raises:
        VrtyDataError: If the bundled file's digest does not match the
            pinned digest, or if the snapshot's own consistency check fails.
    """
    global _cached
    if _cached is not None:
        return _cached

    raw = files("vrty.data").joinpath(IDF_DATA_RESOURCE).read_bytes()

    # The digest is taken over the compressed bytes, before any parsing, so a
    # tampered file is rejected without being decompressed.
    actual = hashlib.sha256(raw).hexdigest()
    if actual != IDF_DATA_SHA256:
        raise VrtyDataError(
            "IDF data file integrity check failed.\n"
            f" expected SHA-256: {IDF_DATA_SHA256}\n"
            f" actual SHA-256: {actual}\n"
            f"The bundled {IDF_DATA_RESOURCE} has been modified after build. "
            "Rebuild via tools/build_idf.py and update IDF_DATA_SHA256."
        )

    payload = json.loads(gzip.decompress(raw).decode("ascii"))

    _cached = IDFSnapshot(
        schema_version=payload["schema_version"],
        vrty_version=payload["vrty_version"],
        doc_count=payload["doc_count"],
        vocab_size=payload["vocab_size"],
        idf_oov=float(payload["idf_oov"]),
        # Read-only wrappers: callers share this one cached snapshot.
        idf=MappingProxyType(dict(payload["idf"])),
        stopwords=frozenset(payload["stopwords"]),
        sha256=actual,
    )
    return _cached
|
vrty/explanations.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Templated one-line per-dimension explanations.
|
|
2
|
+
|
|
3
|
+
Fixed string templates with one interpolated float value per template. No
|
|
4
|
+
prose generation, no LLM call, no branching beyond the documented threshold
|
|
5
|
+
bands. Each dimension has three score-band templates (low / medium / high)
|
|
6
|
+
plus one empty-response template selected only when the input-contract
|
|
7
|
+
wrapper has zeroed every dimension.
|
|
8
|
+
|
|
9
|
+
Thresholds and templates are pinned v1.0 constants. Changing the template
|
|
10
|
+
text or the band cutoffs is a public API change.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from types import MappingProxyType
|
|
16
|
+
from typing import Mapping
|
|
17
|
+
|
|
18
|
+
# Score-band cutoffs. Deliberate v1.0 choice: below 0.40 a dimension signals
# a clear deficiency; at-or-above 0.70 it signals a clear strength; the
# middle is "partial / mixed". Boundaries are inclusive on the low end of
# each higher band: 0.40 -> medium, 0.70 -> high.
# Bands are chosen from the unrounded score while templates print two
# decimals, so e.g. 0.399 reads "0.40" but still uses the "low" wording.
THRESHOLD_LOW: float = 0.40
THRESHOLD_HIGH: float = 0.70

# Sixteen fixed templates: 4 dimensions x (3 score bands + 1 empty-response
# band). Every template is one line, plain English, two-decimal interpolation.
_TEMPLATES: Mapping[str, Mapping[str, str]] = MappingProxyType({
    "relevance": MappingProxyType({
        "low": "Relevance: {value:.2f} - response shares few terms with the prompt.",
        "medium": "Relevance: {value:.2f} - response shares some terms with the prompt.",
        "high": "Relevance: {value:.2f} - response strongly overlaps with the prompt's key terms.",
        "empty": "Relevance: 0.00 - response contained no scorable tokens.",
    }),
    "coherence": MappingProxyType({
        "low": "Coherence: {value:.2f} - adjacent sentences share little content.",
        "medium": "Coherence: {value:.2f} - sentence-to-sentence flow is moderate.",
        "high": "Coherence: {value:.2f} - adjacent sentences carry consistent topic.",
        "empty": "Coherence: 0.00 - response contained no scorable tokens.",
    }),
    "completeness": MappingProxyType({
        "low": "Completeness: {value:.2f} - few of the prompt's key terms appear in the response.",
        "medium": "Completeness: {value:.2f} - some of the prompt's key terms are covered.",
        "high": "Completeness: {value:.2f} - most of the prompt's key terms appear in the response.",
        "empty": "Completeness: 0.00 - response contained no scorable tokens.",
    }),
    "conciseness": MappingProxyType({
        "low": "Conciseness: {value:.2f} - response is padded with repetition or filler.",
        "medium": "Conciseness: {value:.2f} - response has moderate information density.",
        "high": "Conciseness: {value:.2f} - response is information-dense with little padding.",
        "empty": "Conciseness: 0.00 - response contained no scorable tokens.",
    }),
})

# Canonical dimension order; the explanation builders iterate in this order.
DIMENSIONS: tuple[str, ...] = ("relevance", "coherence", "completeness", "conciseness")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _band(value: float) -> str:
    """Map a dimension score to one of the three score-band names.

    Returns ``"low"`` below ``THRESHOLD_LOW``, ``"medium"`` from
    ``THRESHOLD_LOW`` up to (but excluding) ``THRESHOLD_HIGH``, and
    ``"high"`` otherwise.
    """
    if value < THRESHOLD_LOW:
        return "low"
    if value < THRESHOLD_HIGH:
        return "medium"
    return "high"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def build_explanations(
    *,
    relevance: float,
    coherence: float,
    completeness: float,
    conciseness: float,
) -> Mapping[str, str]:
    """Return a read-only mapping of dimension name -> templated explanation.

    Each dimension's template is picked by the band its score falls in and
    formatted with that score. Entries are inserted in ``DIMENSIONS`` order
    (relevance, coherence, completeness, conciseness) so iteration order is
    stable across processes (Python dicts preserve insertion order).
    """
    values = {
        "relevance": relevance,
        "coherence": coherence,
        "completeness": completeness,
        "conciseness": conciseness,
    }
    return MappingProxyType({
        dim: _TEMPLATES[dim][_band(values[dim])].format(value=values[dim])
        for dim in DIMENSIONS
    })
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def empty_response_explanations() -> Mapping[str, str]:
    """Return the fixed empty-response explanation set.

    Selects each dimension's ``"empty"`` template directly instead of
    feeding four zeros through the band logic — the "no scorable tokens"
    phrasing is accurate and avoids saying "adjacent sentences share little
    content" when there are no sentences at all.
    """
    return MappingProxyType({dim: _TEMPLATES[dim]["empty"] for dim in DIMENSIONS})
|
vrty/scoring.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Pure deterministic scoring functions for VRTY v1.0.
|
|
2
|
+
|
|
3
|
+
Each function takes the required data (IDF table, stopwords) as arguments and
|
|
4
|
+
performs no I/O. There is no module-level mutable state. All set/dict iteration
|
|
5
|
+
is sorted before reduction so floating-point accumulation order is identical
|
|
6
|
+
regardless of ``PYTHONHASHSEED``.
|
|
7
|
+
|
|
8
|
+
Token cap, OOV smoothing, and the empty/degenerate behavior of every function
|
|
9
|
+
are fixed by the v1.0 input contract; see the README "Known properties and
|
|
10
|
+
limitations" section for the deliberate-choice rationales.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import math
|
|
16
|
+
import re
|
|
17
|
+
import unicodedata
|
|
18
|
+
from collections.abc import Iterable, Mapping
|
|
19
|
+
from typing import Final
|
|
20
|
+
|
|
21
|
+
MAX_TOKENS: Final[int] = 2048
|
|
22
|
+
|
|
23
|
+
_TOKEN_RE: Final[re.Pattern[str]] = re.compile(r"[a-z]+")
|
|
24
|
+
_SENT_SPLIT_RE: Final[re.Pattern[str]] = re.compile(r"(?<=[.!?])\s+")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _tokenize(text: str) -> list[str]:
    """Return the lowercase ASCII-letter tokens of ``text``, in order.

    The text is NFKD-normalized and then encoded to ASCII with undecodable
    characters ignored, so accented Latin characters fold to their base
    letters and characters with no ASCII decomposition are dropped. Tokens
    are the runs matched by ``_TOKEN_RE`` in the lowercased result. This is
    documented as a v1.0 caveat: quality outside English is not claimed.
    """
    folded = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
        .lower()
    )
    return _TOKEN_RE.findall(folded)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _tokens_capped(text: str) -> list[str]:
    """Tokenize ``text`` and keep at most the first ``MAX_TOKENS`` tokens."""
    # Slicing past the end is a no-op, so no length check is needed.
    return _tokenize(text)[:MAX_TOKENS]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _split_sentences_capped(text: str) -> list[list[str]]:
    """Split into sentences and tokenize each, accumulating until the token cap.

    Sentences that yield no tokens are skipped. Processing stops at the first
    sentence that would push the running total over ``MAX_TOKENS``: that
    sentence and every sentence after it are dropped whole rather than
    included partially, so sentence boundaries remain exact.
    """
    out: list[list[str]] = []
    total = 0
    for sent in _SENT_SPLIT_RE.split(text):
        toks = _tokenize(sent)
        if not toks:
            # Also covers blank / whitespace-only fragments from the split.
            continue
        if total + len(toks) > MAX_TOKENS:
            break
        out.append(toks)
        total += len(toks)
    return out
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _tfidf_vector(
    tokens: Iterable[str],
    idf: Mapping[str, float],
    idf_oov: float,
) -> dict[str, float]:
    """Build a TF*IDF weighted vector as a ``{token: weight}`` dict.

    Each distinct token's weight is its raw count in ``tokens`` multiplied by
    its IDF. OOV tokens (not found in ``idf``) receive the ``idf_oov``
    weight. Keys appear in order of first occurrence.

    Per the original author's note, the bundled data file pins ``idf_oov``
    at the maximum IDF observed in the corpus, deliberately treating unseen
    tokens as maximally informative; nothing in this function depends on it.
    """
    tf: dict[str, int] = {}
    for t in tokens:
        tf[t] = tf.get(t, 0) + 1
    return {t: c * idf.get(t, idf_oov) for t, c in tf.items()}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _cosine(a: Mapping[str, float], b: Mapping[str, float]) -> float:
    """Cosine similarity of two non-negative weight dicts.

    Uses the zero-vector convention ``cos(0, .) := 0`` to avoid a 0/0 NaN
    when either side has no weighted tokens. Keys are sorted before every
    reduction so the floating-point accumulation order is identical across
    processes regardless of hash randomization. The final value is clamped
    to ``[0.0, 1.0]`` to absorb the rounding artifact that lets identical
    vectors produce a dot-over-norm ratio slightly above 1.0.
    """
    a_keys = sorted(a)
    b_keys = sorted(b)
    norm_a_sq = sum(a[k] * a[k] for k in a_keys)
    norm_b_sq = sum(b[k] * b[k] for k in b_keys)
    if norm_a_sq == 0.0 or norm_b_sq == 0.0:
        return 0.0
    norm_a = math.sqrt(norm_a_sq)
    norm_b = math.sqrt(norm_b_sq)

    # Walk the smaller key list and probe the other mapping. Filtering a
    # sorted list keeps it sorted, so the shared keys need no second sort.
    if len(a_keys) <= len(b_keys):
        shared = [k for k in a_keys if k in b]
    else:
        shared = [k for k in b_keys if k in a]
    dot = sum(a[k] * b[k] for k in shared)

    cos = dot / (norm_a * norm_b)
    if cos > 1.0:
        return 1.0
    if cos < 0.0:
        return 0.0
    return cos
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def relevance(
    prompt: str,
    response: str,
    *,
    idf: Mapping[str, float],
    idf_oov: float,
) -> float:
    """TF*IDF weighted cosine similarity between prompt and response.

    Both texts are tokenized with the token cap applied, turned into TF*IDF
    vectors, and compared with ``_cosine``.

    Returns 0.0 when either side has no tokens (zero-vector convention).
    Output is bounded to [0.0, 1.0]: ``_cosine`` clamps its result to that
    range.
    """
    prompt_vec = _tfidf_vector(_tokens_capped(prompt), idf, idf_oov)
    response_vec = _tfidf_vector(_tokens_capped(response), idf, idf_oov)
    return _cosine(prompt_vec, response_vec)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def coherence(
    response: str,
    *,
    idf: Mapping[str, float],
    idf_oov: float,
) -> float:
    """Mean cosine similarity of adjacent-sentence TF*IDF vectors.

    Deliberate choice (v1.0): a response with fewer than two sentences
    returns 1.0, because there is no adjacent pair that can disagree;
    penalizing short responses on coherence would double-count what
    ``completeness`` already measures via prompt-term coverage.

    Output is bounded to [0.0, 1.0] because each pairwise cosine is in
    [0, 1] and the arithmetic mean of values in [0, 1] is in [0, 1].
    """
    sents = _split_sentences_capped(response)
    n = len(sents)
    if n < 2:
        return 1.0
    vectors = [_tfidf_vector(s, idf, idf_oov) for s in sents]
    # Pair each sentence vector with its successor, in document order.
    pair_sims = [_cosine(prev, nxt) for prev, nxt in zip(vectors, vectors[1:])]
    return sum(pair_sims) / (n - 1)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def completeness(
    prompt: str,
    response: str,
    *,
    idf: Mapping[str, float],
    idf_oov: float,
    stopwords: frozenset[str],
) -> float:
    """IDF-weighted fraction of prompt content terms that appear in the response.

    Content terms are the distinct non-stopword prompt tokens; each is
    weighted by its IDF (``idf_oov`` when absent from ``idf``).

    Returns 0.0 when the prompt has no content terms (empty prompt or
    all-stopwords prompt) or when their weights sum to zero, since the
    natural ratio is 0/0.

    Output is bounded to [0.0, 1.0] because ``covered`` is a subset sum of
    the same non-negative weights that form ``total``.
    """
    p_toks = _tokens_capped(prompt)
    r_toks = _tokens_capped(response)

    # Sorted so both sums below accumulate in a hash-seed-independent order.
    key_terms = sorted({t for t in p_toks if t not in stopwords})
    if not key_terms:
        return 0.0

    weights = [idf.get(t, idf_oov) for t in key_terms]
    total = sum(weights)
    if total == 0.0:
        return 0.0

    r_set = set(r_toks)
    covered = sum(w for t, w in zip(key_terms, weights) if t in r_set)
    return covered / total
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def conciseness(
    response: str,
    *,
    stopwords: frozenset[str],
) -> float:
    """Return the content-word type-token ratio of the response.

    The ratio is ``|distinct non-stopword tokens| / |all tokens|``.
    Repeated words and stopword-heavy filler both grow the denominator
    without growing the numerator, so padded text scores lower and dense
    text scores higher.

    Returns 0.0 for a response with no tokens. Otherwise the result lies
    in [0.0, 1.0], since the distinct content tokens are drawn from the
    full token list and cannot outnumber it.
    """
    tokens = _tokens_capped(response)
    token_count = len(tokens)
    if not token_count:
        return 0.0
    distinct_content = set(tokens).difference(stopwords)
    return len(distinct_content) / token_count
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vrty
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Deterministic LLM-output quality scoring in milliseconds. No AI judge in the loop.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/sundeyp/vrty
|
|
7
|
+
Project-URL: Repository, https://github.com/sundeyp/vrty
|
|
8
|
+
Project-URL: Issues, https://github.com/sundeyp/vrty/issues
|
|
9
|
+
Keywords: llm,evaluation,scoring,tf-idf,deterministic
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Requires-Python: <3.12,>=3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest==8.3.3; extra == "dev"
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# VRTY
|
|
24
|
+
|
|
25
|
+
[CI status](https://github.com/sundeyp/vrty/actions/workflows/vrty.yml)
|
|
26
|
+
[PyPI](https://pypi.org/project/vrty/)
|
|
27
|
+
[Python 3.11.9](https://www.python.org/downloads/release/python-3119/)
|
|
28
|
+
[License: MIT](LICENSE)
|
|
29
|
+
[pyproject.toml](pyproject.toml)
|
|
30
|
+
|
|
31
|
+
**The deterministic, zero-dependency LLM evaluator. Sub-millisecond for typical responses, no API key, byte-identical across runs.**
|
|
32
|
+
|
|
33
|
+
*A stdlib alternative to ROUGE for no-reference scoring, and a sanity layer
|
|
34
|
+
in front of GPT-as-judge when reproducibility matters.*
|
|
35
|
+
|
|
36
|
+
VRTY scores a `(prompt, response)` pair on four standard, auditable
|
|
37
|
+
dimensions and returns a single composite plus a per-dimension breakdown.
|
|
38
|
+
Every formula is a textbook formula you can verify against a reference in
|
|
39
|
+
five minutes. There is no LLM call anywhere in the scoring path.
|
|
40
|
+
|
|
41
|
+
> **What VRTY does not do.** VRTY measures *surface text properties* —
|
|
42
|
+
> vocabulary overlap, sentence flow, term coverage, information density.
|
|
43
|
+
> **It does not check whether the answer is true.** A confident wrong answer
|
|
44
|
+
> that echoes the prompt's vocabulary will score *higher* than a correct
|
|
45
|
+
> one-word answer (see [Known properties and limitations](#known-properties-and-limitations):
|
|
46
|
+
> `"London is the capital of France."` scores 0.879; `"Paris."` scores 0.350).
|
|
47
|
+
> Use VRTY to catch malformed, off-topic, or padded output; pair it with a
|
|
48
|
+
> fact-check or human review when correctness matters.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from vrty import score
|
|
52
|
+
result = score("What is the capital of France?", "Paris is the capital of France.")
|
|
53
|
+
print(result.composite) # 0.8653358523094898
|
|
54
|
+
print(result.explanations["relevance"]) # Relevance: 0.83 - response strongly overlaps with the prompt's key terms.
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
That is the entire 60-second example. Four lines, runs as-is, returns a
|
|
58
|
+
score. No configuration, no API key.
|
|
59
|
+
|
|
60
|
+
> **About that 0.865.** That number is what *factoid* prompts look like —
|
|
61
|
+
> short prompt, short answer, heavy vocabulary overlap. Open-ended prompts
|
|
62
|
+
> (customer support, instruction-following, prose drafts) typically score
|
|
63
|
+
> **0.20 – 0.40** because the response is *expected* not to echo prompt
|
|
64
|
+
> vocabulary. VRTY is calibrated *relative to a fixed prompt*, not as an
|
|
65
|
+
> absolute quality threshold. See [Calibration bands](#calibration-bands)
|
|
66
|
+
> below before setting CI gates.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Install
|
|
71
|
+
|
|
72
|
+
```sh
|
|
73
|
+
pip install vrty
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or from source:
|
|
77
|
+
|
|
78
|
+
```sh
|
|
79
|
+
git clone https://github.com/sundeyp/vrty
|
|
80
|
+
cd vrty
|
|
81
|
+
pip install -e .
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Determinism is guaranteed only on the pinned interpreter (Python 3.11.9)
|
|
85
|
+
and pinned dependency set. The scoring path has **zero third-party runtime
|
|
86
|
+
dependencies** — everything is Python stdlib. See [Determinism](#determinism)
|
|
87
|
+
below.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## The four dimensions
|
|
92
|
+
|
|
93
|
+
| Dimension | Formula | What it measures |
|
|
94
|
+
|---|---|---|
|
|
95
|
+
| **Relevance** | TF·IDF weighted cosine similarity between prompt and response | How much the response's content overlaps the prompt's content |
|
|
96
|
+
| **Coherence** | Mean cosine similarity of adjacent-sentence TF·IDF vectors | How much each sentence shares with the next (topical flow) |
|
|
97
|
+
| **Completeness** | IDF-weighted fraction of prompt content terms that appear in the response | How many of the prompt's key terms are addressed |
|
|
98
|
+
| **Conciseness** | `|unique content tokens| / |total tokens|` (content-word type–token ratio) | Information density vs padding |
|
|
99
|
+
|
|
100
|
+
Each dimension returns a value in `[0.0, 1.0]`. The composite is a fixed,
|
|
101
|
+
version-locked weighted sum:
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
composite = 0.35 * relevance
|
|
105
|
+
+ 0.20 * coherence
|
|
106
|
+
+ 0.30 * completeness
|
|
107
|
+
+ 0.15 * conciseness
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The weights are pinned constants, not configurable. Configurability is
|
|
111
|
+
explicitly post-v1.0.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## What you get back
|
|
116
|
+
|
|
117
|
+
`score()` returns a frozen `VrtyScore` object with a 9-key `to_dict()`:
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
{
|
|
121
|
+
"composite": 0.8653358523094898,
|
|
122
|
+
"relevance": 0.8295310065985426,
|
|
123
|
+
"coherence": 1.0,
|
|
124
|
+
"completeness": 1.0,
|
|
125
|
+
"conciseness": 0.5,
|
|
126
|
+
"explanations": {
|
|
127
|
+
"relevance": "Relevance: 0.83 - response strongly overlaps with the prompt's key terms.",
|
|
128
|
+
"coherence": "Coherence: 1.00 - adjacent sentences carry consistent topic.",
|
|
129
|
+
"completeness": "Completeness: 1.00 - most of the prompt's key terms appear in the response.",
|
|
130
|
+
"conciseness": "Conciseness: 0.50 - response has moderate information density."
|
|
131
|
+
},
|
|
132
|
+
"vrty_version": "1.0.0",
|
|
133
|
+
"idf_sha256": "0e475bcaa5524d1e26cbb166bb5c138e37f87e1e47b75e6506c6460a94259fd2",
|
|
134
|
+
"weights": {"relevance": 0.35, "coherence": 0.20, "completeness": 0.30, "conciseness": 0.15}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
`vrty_version` and `idf_sha256` make every score reproducible — together
|
|
139
|
+
they pin the scoring logic and the exact IDF data used.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## CLI
|
|
144
|
+
|
|
145
|
+
```sh
|
|
146
|
+
vrty --prompt "What is the capital of France?" \
|
|
147
|
+
--response "Paris is the capital of France."
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Equivalent module invocation:
|
|
151
|
+
|
|
152
|
+
```sh
|
|
153
|
+
python -m vrty --prompt "..." --response "..."
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Accepts `--prompt-file PATH` / `--response-file PATH` for long inputs;
|
|
157
|
+
`/dev/stdin` works as a file path. `--pretty` indents the JSON.
|
|
158
|
+
Exit codes: `0` success, `1` I/O error, `2` argparse error.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Benchmarks
|
|
163
|
+
|
|
164
|
+
VRTY is not an embedding-based scorer; if you need semantic similarity that
|
|
165
|
+
survives paraphrase, use **BERTScore** or **MoverScore**. VRTY is not n-gram
|
|
166
|
+
precision against a reference; if you have reference answers, use **BLEU**
|
|
167
|
+
or **ROUGE**. VRTY's niche is *no-reference, no-model, deterministic*
|
|
168
|
+
scoring — the gap ROUGE leaves when you don't have a gold reference, and
|
|
169
|
+
the gap GPT-as-judge leaves when you need reproducibility.
|
|
170
|
+
|
|
171
|
+
Reproducibility, cost, and latency vs ROUGE and LLM-as-judge. VRTY and
|
|
172
|
+
ROUGE were measured on the same machine with the same 1000 synthetic
|
|
173
|
+
(prompt, response) pairs per response-size bucket; reproduce via
|
|
174
|
+
`python tools/benchmark.py`. LLM-as-judge cost and latency are intentionally
|
|
175
|
+
not measured here — they depend on model choice and provider pricing, both
|
|
176
|
+
of which drift; fill them in for your own model before relying on the
|
|
177
|
+
comparison.
|
|
178
|
+
|
|
179
|
+
| | VRTY | ROUGE (rouge-score 0.1.2) | LLM-as-judge |
|
|
180
|
+
|------------------------|------|---------------------------|--------------|
|
|
181
|
+
| **Reproducibility** | Byte-identical across processes (pinned Python 3.11.9, asserted in CI on three subprocesses with adversarial `PYTHONHASHSEED` values) | Deterministic for a fixed tokenizer | Non-deterministic; varies with temperature, sampling, model version |
|
|
182
|
+
| **Cost per score** | $0 (no API call) | $0 (local) | $ per call × tokens; measure with your chosen model |
|
|
183
|
+
| **Latency p99 — 100 tokens** | **0.16 ms** | 1.66 ms | typically 500–2000 ms (network + inference) |
|
|
184
|
+
| **Latency p99 — 500 tokens** | **0.52 ms** | 6.66 ms | typically 500–2000 ms |
|
|
185
|
+
| **Latency p99 — 2000 tokens** | **2.94 ms** | 25.96 ms | typically 1000–5000 ms |
|
|
186
|
+
| **Network required** | No | No | Yes |
|
|
187
|
+
| **Reference hardware** | AMD Ryzen 7 8745HS, 8 cores / 16 threads, 27 GiB RAM, Ubuntu 24.04, Python 3.11.9 | (same) | (varies by provider) |
|
|
188
|
+
|
|
189
|
+
**Latency claim (v1.0)**: `< 3 ms p99 for responses under 2000 tokens on
|
|
190
|
+
AMD Ryzen 7 8745HS`. Reproduce: `python tools/benchmark.py` from a clean
|
|
191
|
+
venv with `vrty` and `rouge-score==0.1.2` installed.
|
|
192
|
+
|
|
193
|
+
VRTY is roughly **9–13× faster than ROUGE** at every input size in this
|
|
194
|
+
table because the scoring path is pure stdlib with no regex-based stemmer
|
|
195
|
+
and no sentence-pair grid construction.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Calibration bands
|
|
200
|
+
|
|
201
|
+
Expected composite ranges by prompt type, observed across realistic input.
|
|
202
|
+
Use these to set CI gates and user-facing displays — do not assume a single
|
|
203
|
+
threshold works across prompt types.
|
|
204
|
+
|
|
205
|
+
| Prompt type | Typical composite | Use the score as |
|
|
206
|
+
|---|---|---|
|
|
207
|
+
| Factoid Q&A where the answer echoes prompt vocabulary (`"capital of France?"` → `"Paris is the capital of France."`) | 0.70 – 0.90 | Absolute threshold viable |
|
|
208
|
+
| Customer-support / instruction-following | 0.20 – 0.40 | Relative delta from a baseline answer on the *same* prompt |
|
|
209
|
+
| Open-ended prose (email drafts, summaries) | 0.15 – 0.35 | Relative delta only |
|
|
210
|
+
| Repetition / padding spam with OOV technical terms | can score 0.60+ | Catch by pairing with a length / repetition sanity check |
|
|
211
|
+
|
|
212
|
+
**Practical rule.** Compute a baseline composite on a known-good response
|
|
213
|
+
to your prompt, then gate on `score >= baseline * k` for some
|
|
214
|
+
`k ∈ [0.7, 0.9]`. Do not gate on `composite > 0.8` as an absolute — that
|
|
215
|
+
will wrongly reject obviously-fine open-ended responses.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Determinism
|
|
220
|
+
|
|
221
|
+
Identical input returns byte-identical output. This guarantee holds under
|
|
222
|
+
the following conditions, all of which are documented and enforced:
|
|
223
|
+
|
|
224
|
+
- **Pinned interpreter**: Python 3.11.9 (CPython, official build or
|
|
225
|
+
python-build-standalone). The CI matrix runs on this version. Other 3.x
|
|
226
|
+
versions are likely to produce identical output but are not asserted.
|
|
227
|
+
- **Pinned IDF data**: `vrty/data/idf.json.gz` ships with the package
|
|
228
|
+
and is SHA-256-verified at import. A modified data file fails fast with
|
|
229
|
+
`VrtyDataError` before any score is computed.
|
|
230
|
+
- **Zero third-party runtime dependencies**: the scoring path uses only
|
|
231
|
+
CPython stdlib (`re`, `math`, `collections`, `json`, `gzip`,
|
|
232
|
+
`hashlib`, `importlib.resources`, `unicodedata`). No `numpy`, no
|
|
233
|
+
`scikit-learn`, no BLAS-backed FP variance.
|
|
234
|
+
- **Sort-before-reduction**: every set and dict is sorted before any
|
|
235
|
+
floating-point accumulation, so dict-iteration order under
|
|
236
|
+
`PYTHONHASHSEED` randomization cannot change the result.
|
|
237
|
+
|
|
238
|
+
The test suite asserts byte-identity on `json.dumps(result.to_dict(),
|
|
239
|
+
sort_keys=True)` across three fresh OS subprocesses with `PYTHONHASHSEED`
|
|
240
|
+
set to `0`, `12345`, and the CPython default (`random`).
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Self-host
|
|
245
|
+
|
|
246
|
+
A one-command Docker self-host is shipped alongside the library. See the
|
|
247
|
+
[Dockerfile](Dockerfile) for the pinned image and the
|
|
248
|
+
[GitHub Actions snippet](.github/workflows/vrty.yml) for CI/CD
|
|
249
|
+
integration.
|
|
250
|
+
|
|
251
|
+
```sh
|
|
252
|
+
docker build -t vrty:1.0.0 .
|
|
253
|
+
docker run --rm vrty:1.0.0 \
|
|
254
|
+
--prompt "What is the capital of France?" \
|
|
255
|
+
--response "Paris is the capital of France."
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Known properties and limitations
|
|
261
|
+
|
|
262
|
+
**Read this section before integrating VRTY into anything load-bearing.**
|
|
263
|
+
Seven honest limitations of the v1.0 design.
|
|
264
|
+
|
|
265
|
+
### 1. VRTY scores surface properties, not factual correctness
|
|
266
|
+
|
|
267
|
+
The four dimensions measure **term overlap, sentence flow, key-term
|
|
268
|
+
coverage, and information density**. They do *not* verify that the response
|
|
269
|
+
is factually true. A correct answer that does not echo prompt vocabulary
|
|
270
|
+
scores low on relevance and completeness; a confident wrong answer that
|
|
271
|
+
echoes prompt vocabulary scores high.
|
|
272
|
+
|
|
273
|
+
Worked example, prompt = `"What is the capital of France?"`:
|
|
274
|
+
|
|
275
|
+
| Response | Correct? | Composite | Relevance | Completeness | Conciseness |
|
|
276
|
+
|-------------------------------------------|----------|-----------|-----------|--------------|-------------|
|
|
277
|
+
| `"Paris is the capital of France."` | yes | 0.865 | 0.830 | 1.000 | 0.500 |
|
|
278
|
+
| `"London is the capital of France."` | **no** | 0.879 | 0.867 | 1.000 | 0.500 |
|
|
279
|
+
| `"Paris."` | yes | 0.350 | 0.000 | 0.000 | 1.000 |
|
|
280
|
+
| `"London."` | **no** | 0.350 | 0.000 | 0.000 | 1.000 |
|
|
281
|
+
| `"Banana."` | **no** | 0.350 | 0.000 | 0.000 | 1.000 |
|
|
282
|
+
|
|
283
|
+
The verbose incorrect answer scores *higher* than the verbose correct one
|
|
284
|
+
(slight IDF asymmetry between `"london"` and `"paris"` in the bundled
|
|
285
|
+
corpus); the three terse responses — one correct, two wrong — receive
|
|
286
|
+
identical 0.350 scores. **VRTY cannot distinguish them; an external
|
|
287
|
+
fact-check must.** Use VRTY to detect malformed, off-topic, or padded
|
|
288
|
+
outputs; use a separate fact-check or human review to verify truth.
|
|
289
|
+
|
|
290
|
+
### 2. Conciseness and completeness intentionally pull against each other
|
|
291
|
+
|
|
292
|
+
A response that covers every prompt term tends to be longer (lower
|
|
293
|
+
conciseness); a terse response tends to omit prompt terms (lower
|
|
294
|
+
completeness). This tension is correct behavior, not a bug. Always read
|
|
295
|
+
the per-dimension breakdown — a single composite hides the trade-off.
|
|
296
|
+
|
|
297
|
+
### 3. Single-sentence coherence returns 1.0 by deliberate choice
|
|
298
|
+
|
|
299
|
+
When the response is one sentence (or zero — see the empty-response
|
|
300
|
+
wrapper), there is no adjacent-sentence pair that can disagree, so
|
|
301
|
+
coherence is set to 1.0. This is a deliberate v1.0 convention: penalizing
|
|
302
|
+
short responses on coherence would double-count what completeness already
|
|
303
|
+
measures via prompt-term coverage.
|
|
304
|
+
|
|
305
|
+
### 4. OOV tokens receive maximum IDF weight by deliberate choice
|
|
306
|
+
|
|
307
|
+
Tokens not present in the bundled IDF corpus are assigned `idf_oov =
|
|
308
|
+
log(N+1) + 1`, the value the smoothed IDF formula assigns to a token that
|
|
309
|
+
appears in zero documents. This treats unseen words as maximally
|
|
310
|
+
informative — the standard add-one (Laplace) smoothing choice — so
|
|
311
|
+
technical jargon and proper nouns are not silently dropped to zero weight.
|
|
312
|
+
|
|
313
|
+
### 5. Conciseness is a type–token ratio, which is mildly length-sensitive
|
|
314
|
+
|
|
315
|
+
The conciseness measure (`|unique content tokens| / |total tokens|`) tends
|
|
316
|
+
to decline for longer responses because the vocabulary saturates while the
|
|
317
|
+
length keeps growing. This is a known property of the type–token ratio
|
|
318
|
+
(Hess et al. 1986). Two responses of very different lengths are not
|
|
319
|
+
directly comparable on conciseness alone; interpret the conciseness score
|
|
320
|
+
together with the other dimensions and the response length.
|
|
321
|
+
|
|
322
|
+
### 6. Repetition can score high when prompt terms are out-of-corpus
|
|
323
|
+
|
|
324
|
+
Because OOV tokens receive maximum IDF weight (limitation 4 above) and
|
|
325
|
+
conciseness is a type–token ratio (limitation 5), a response that *repeats*
|
|
326
|
+
OOV technical terms (e.g. `"multi-head multi-head attention attention
|
|
327
|
+
attention transformer transformer transformer."` against a transformer-
|
|
328
|
+
architecture prompt) can score *higher* than a substantive paragraph on the
|
|
329
|
+
same prompt. Mitigation: combine the VRTY composite with a basic length /
|
|
330
|
+
repetition sanity check, or treat the composite as one signal among
|
|
331
|
+
several. This is a known property of TF·IDF-family scorers, not unique to
|
|
332
|
+
VRTY.
|
|
333
|
+
|
|
334
|
+
### 7. The bundled IDF corpus is 19th-century English literature
|
|
335
|
+
|
|
336
|
+
IDF weights are computed from ten US-public-domain Project Gutenberg books
|
|
337
|
+
(Austen, Melville, Shelley, Doyle, Stoker, Carroll, Wilde, Dickens, Wells,
|
|
338
|
+
Thoreau) — about 5,400 200-token pseudo-documents, 32,000-word vocabulary.
|
|
339
|
+
Modern technical vocabulary like `"API"`, `"endpoint"`, `"deploy"`,
|
|
340
|
+
`"kubernetes"`, `"async"` is not in the corpus and falls into the OOV
|
|
341
|
+
bucket, where it receives the maximum IDF weight (see limitation 4).
|
|
342
|
+
|
|
343
|
+
This generally *helps* technical text (rare jargon is correctly treated as
|
|
344
|
+
informative) but can cause uneven weighting when one technical term is
|
|
345
|
+
in-corpus by coincidence and a similar one is not. **A domain-matched IDF
|
|
346
|
+
corpus is explicitly post-v1.0**; v1.0 disclaims this rather than fixes it.
|
|
347
|
+
Non-English text scores as-is with no special handling and is similarly
|
|
348
|
+
disclaimed.
|
|
349
|
+
|
|
350
|
+
---
|
|
351
|
+
|
|
352
|
+
## Input contract
|
|
353
|
+
|
|
354
|
+
Behavior on degenerate inputs is part of the v1.0 spec, not an afterthought:
|
|
355
|
+
|
|
356
|
+
| Input | Behavior |
|
|
357
|
+
|--------------------------------------|----------------------------------------------------------------|
|
|
358
|
+
| Empty response | Every dimension and the composite return `0.0`; explanations say "response contained no scorable tokens." |
|
|
359
|
+
| Empty prompt | Relevance and completeness return `0.0`; coherence and conciseness depend only on the response and score normally |
|
|
360
|
+
| Inputs above 2,048 tokens | Truncated at 2,048 tokens (the `MAX_TOKENS` constant) before scoring; truncation is deterministic |
|
|
361
|
+
| Non-English text | NFKD-normalized then ASCII-stripped; accented Latin folds to base letters; non-Latin scripts (CJK, Cyrillic, Arabic, ...) drop entirely. Quality outside English is not claimed |
|
|
362
|
+
| Response identical to prompt | Scored normally; no special case |
|
|
363
|
+
| Single word | Scored normally; no special case |
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## License
|
|
368
|
+
|
|
369
|
+
MIT — see [LICENSE](LICENSE).
|
|
370
|
+
|
|
371
|
+
---
|
|
372
|
+
|
|
373
|
+
## Versioning
|
|
374
|
+
|
|
375
|
+
`vrty_version` is included with every score so any historical score is
|
|
376
|
+
traceable to the exact scoring logic that produced it. The bundled IDF
|
|
377
|
+
data file's SHA-256 (`idf_sha256`) is also returned with every score so
|
|
378
|
+
two scores from different builds can be compared at the data-pinning
|
|
379
|
+
level, not just the code level. Changing the logic or the data invalidates byte-equality
|
|
380
|
+
guarantees and requires a version bump.
|
|
381
|
+
|
|
382
|
+
A score from `vrty_version="1.0.0"` will be reproducible on any future
|
|
383
|
+
machine that installs `vrty==1.0.0` on Python 3.11.9.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
vrty/__init__.py,sha256=FuO6yoQu5IJlc52VTLvsRttDXf_1fUD0CjhIQfqfQ_g,481
|
|
2
|
+
vrty/__main__.py,sha256=rjEjw22ZO5ZfIEbogmMPgccN-9VT4jUvslVGWSfNung,152
|
|
3
|
+
vrty/_version.py,sha256=iedBM6zdqUuotYZ25tysjwduWBcIlTWp315wwuTyX48,270
|
|
4
|
+
vrty/cli.py,sha256=RPtlpjnBqzqCGY61tOxMDxV9wSFjqFscivxET2tAjsY,4004
|
|
5
|
+
vrty/composite.py,sha256=3kphBJHQSD2A5lBfTtIsWJr_Z_peJQ9SGLTs1fCOtO0,6056
|
|
6
|
+
vrty/data_loader.py,sha256=btIKFpt-m1ZU6J-Th4g4I2cN-mNye175ANG-YXFDW3c,3677
|
|
7
|
+
vrty/explanations.py,sha256=HV4GiKT5Ai0VBz-huzC082D8VA59WXjKoqGtjymCcdo,4368
|
|
8
|
+
vrty/scoring.py,sha256=jXvrVHlP7eDMUiKrpxRwJC5R4CUz1Ek_st0RJKtl5I0,7310
|
|
9
|
+
vrty/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
vrty/data/idf.json.gz,sha256=DkdbyqVSTR4my7Fmu1wTjjf4fh5Ht15lBsZGCpQln9I,156772
|
|
11
|
+
vrty-1.0.0.dist-info/licenses/LICENSE,sha256=b5p2_hBoBQO3yov45_wNuciWEOWm3KER1ZeMVQUi_1Q,1070
|
|
12
|
+
vrty-1.0.0.dist-info/METADATA,sha256=KNWP-jfjsTyUnObYf8yJuLCwsLDadPZW-yvc5FtZ_EY,17924
|
|
13
|
+
vrty-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
vrty-1.0.0.dist-info/entry_points.txt,sha256=pZ1sNJwW3NlNPzuhoZlPpy2yzfclj_oJU941t7EFlaA,39
|
|
15
|
+
vrty-1.0.0.dist-info/top_level.txt,sha256=YnCtW1sB6RTMnnvz_AVLu6OFv01PCx1i2oEVMPIntsI,5
|
|
16
|
+
vrty-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sundeyp Singh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
vrty
|