genassert 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genassert/__init__.py +43 -0
- genassert/_embed.py +113 -0
- genassert/assertions/__init__.py +1 -0
- genassert/assertions/budget.py +62 -0
- genassert/assertions/hallucination.py +89 -0
- genassert/assertions/intent.py +62 -0
- genassert/assertions/language.py +145 -0
- genassert/assertions/pii.py +107 -0
- genassert/assertions/readability.py +135 -0
- genassert/assertions/schema.py +100 -0
- genassert/assertions/sentiment.py +160 -0
- genassert/assertions/similarity.py +57 -0
- genassert/assertions/tone.py +93 -0
- genassert/baseline.py +142 -0
- genassert/judge.py +185 -0
- genassert/plugin.py +74 -0
- genassert-0.2.0.dist-info/METADATA +452 -0
- genassert-0.2.0.dist-info/RECORD +21 -0
- genassert-0.2.0.dist-info/WHEEL +4 -0
- genassert-0.2.0.dist-info/entry_points.txt +2 -0
- genassert-0.2.0.dist-info/licenses/LICENSE +21 -0
genassert/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
genassert — pytest-native semantic testing for generative AI applications.
|
|
3
|
+
|
|
4
|
+
Drop-in pytest plugin. No servers. No SaaS. No config.
|
|
5
|
+
Works with OpenAI, Anthropic, LiteLLM, or any LLM client.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from genassert.assertions.intent import assert_intent
|
|
9
|
+
from genassert.assertions.tone import assert_tone
|
|
10
|
+
from genassert.assertions.hallucination import assert_no_hallucination
|
|
11
|
+
from genassert.assertions.budget import assert_token_budget
|
|
12
|
+
from genassert.assertions.schema import assert_schema
|
|
13
|
+
from genassert.assertions.similarity import assert_similar_to
|
|
14
|
+
from genassert.assertions.language import assert_language
|
|
15
|
+
from genassert.assertions.pii import assert_no_pii, PIIMatch
|
|
16
|
+
from genassert.assertions.readability import assert_readability
|
|
17
|
+
from genassert.assertions.sentiment import assert_sentiment
|
|
18
|
+
from genassert.baseline import record_baseline, compare_baseline
|
|
19
|
+
from genassert.judge import LocalJudge
|
|
20
|
+
|
|
21
|
+
__version__ = "0.2.0"
# Explicit public API — `from genassert import *` exports exactly these names.
__all__ = [
    # Core semantic assertions
    "assert_intent",
    "assert_tone",
    "assert_no_hallucination",
    "assert_similar_to",
    # Structural assertions
    "assert_token_budget",
    "assert_schema",
    # Advanced assertions (new in 0.2.0)
    "assert_language",
    "assert_no_pii",
    "assert_readability",
    "assert_sentiment",
    # Types
    "PIIMatch",
    # Baseline regression
    "record_baseline",
    "compare_baseline",
    # Local judge
    "LocalJudge",
]
|
genassert/_embed.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding backend for genassert.
|
|
3
|
+
|
|
4
|
+
Priority order (auto-detected):
|
|
5
|
+
1. sentence-transformers (local, free, fast) — recommended
|
|
6
|
+
2. openai embeddings (requires OPENAI_API_KEY)
|
|
7
|
+
3. numpy hash-based fallback (no deps, for smoke tests only)
|
|
8
|
+
|
|
9
|
+
Set GENASSERT_EMBED_BACKEND=openai|local|fallback to force a backend.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
import os
|
|
14
|
+
import hashlib
|
|
15
|
+
import math
|
|
16
|
+
from functools import lru_cache
|
|
17
|
+
|
|
18
|
+
_BACKEND_ENV = "GENASSERT_EMBED_BACKEND"
|
|
19
|
+
_MODEL_ENV = "GENASSERT_EMBED_MODEL"
|
|
20
|
+
|
|
21
|
+
_DEFAULT_LOCAL_MODEL = "all-MiniLM-L6-v2"
|
|
22
|
+
_DEFAULT_OPENAI_MODEL = "text-embedding-3-small"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@lru_cache(maxsize=512)
def embed_text(text: str) -> tuple[float, ...]:
    """
    Embed `text` into a float vector using the best available backend.
    Results are cached in-process for performance.
    """
    # NOTE(review): the cache key is only `text`, so changing
    # GENASSERT_EMBED_BACKEND mid-process can return vectors from the
    # previously selected backend — confirm this is acceptable.
    choice = os.environ.get(_BACKEND_ENV, "auto").lower()
    dispatch = {
        "openai": _embed_openai,
        "local": _embed_local,
        "sentence-transformers": _embed_local,
        "fallback": _embed_fallback,
    }
    # Anything unrecognised (including the default "auto") auto-detects.
    return dispatch.get(choice, _embed_auto)(text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _embed_auto(text: str) -> tuple[float, ...]:
    """Try backends in priority order, falling through on expected failures."""
    attempts = (
        (_embed_local, (ImportError,)),
        (_embed_openai, (ImportError, RuntimeError)),
    )
    for backend, recoverable in attempts:
        try:
            return backend(text)
        except recoverable:
            continue
    # Last resort: dependency-free hash embedding (smoke tests only).
    return _embed_fallback(text)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _embed_local(text: str) -> tuple[float, ...]:
    """
    Embed `text` with sentence-transformers (local, no API cost).

    The model name comes from GENASSERT_EMBED_MODEL (default all-MiniLM-L6-v2)
    and is loaded once per name via `_get_local_model`.

    Raises
    ------
    ImportError
        If sentence-transformers is not installed.
    """
    try:
        from sentence_transformers import SentenceTransformer  # noqa: F401  availability probe
    except ImportError:
        # `from None` suppresses the noisy chained "During handling of the
        # above exception..." traceback; the actionable message is ours.
        raise ImportError(
            "sentence-transformers not installed.\n"
            " Install: pip install sentence-transformers\n"
            " Or set GENASSERT_EMBED_BACKEND=openai"
        ) from None
    model_name = os.environ.get(_MODEL_ENV, _DEFAULT_LOCAL_MODEL)
    model = _get_local_model(model_name)
    # normalize_embeddings=True yields unit vectors, so downstream cosine
    # similarity reduces to a plain dot product.
    vec = model.encode(text, normalize_embeddings=True)
    return tuple(float(v) for v in vec)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@lru_cache(maxsize=4)
def _get_local_model(model_name: str):
    # Loading a SentenceTransformer is expensive (reads/downloads weights),
    # so keep a small per-name cache for the life of the process.
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer(model_name)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _embed_openai(text: str) -> tuple[float, ...]:
    """
    Embed `text` via the OpenAI embeddings API.

    Model name comes from GENASSERT_EMBED_MODEL (default
    text-embedding-3-small); requires OPENAI_API_KEY.

    Raises
    ------
    ImportError
        If the `openai` package is not installed.
    RuntimeError
        If OPENAI_API_KEY is not set.
    """
    try:
        import openai
    except ImportError:
        # `from None` drops the redundant chained-ImportError traceback.
        raise ImportError("openai not installed. pip install openai") from None

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY not set. "
            "Set it or use GENASSERT_EMBED_BACKEND=local"
        )

    model = os.environ.get(_MODEL_ENV, _DEFAULT_OPENAI_MODEL)
    # A fresh client per call is fine here: `embed_text` lru-caches results,
    # so repeated texts never reach this function twice.
    client = openai.OpenAI(api_key=api_key)
    result = client.embeddings.create(input=[text], model=model)
    vec = result.data[0].embedding
    return tuple(float(v) for v in vec)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _embed_fallback(text: str) -> tuple[float, ...]:
|
|
101
|
+
"""
|
|
102
|
+
Hash-based pseudo-embedding. No dependencies, but ONLY suitable for
|
|
103
|
+
smoke tests. Will NOT produce meaningful semantic similarity scores.
|
|
104
|
+
"""
|
|
105
|
+
dim = 384
|
|
106
|
+
vec = []
|
|
107
|
+
for i in range(dim):
|
|
108
|
+
seed = hashlib.sha256(f"{i}:{text}".encode()).digest()
|
|
109
|
+
val = int.from_bytes(seed[:4], "big") / (2**32)
|
|
110
|
+
vec.append(val - 0.5)
|
|
111
|
+
# normalize
|
|
112
|
+
mag = math.sqrt(sum(v**2 for v in vec))
|
|
113
|
+
return tuple(v / mag for v in vec)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# assertions package
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
assert_token_budget: Verify LLM response stays within a token limit.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def assert_token_budget(
    response: str,
    max_tokens: int,
    tokenizer: str = "approx",
) -> None:
    """
    Assert that `response` does not exceed `max_tokens`.

    Parameters
    ----------
    response:
        The LLM output to evaluate.
    max_tokens:
        Maximum allowed token count.
    tokenizer:
        "approx" uses a fast word-based approximation (~1.3 tokens/word).
        "tiktoken" uses OpenAI's tiktoken (requires `pip install tiktoken`).
        "chars" uses character count divided by 4.

    Raises
    ------
    AssertionError
        If the response exceeds the token budget.

    Examples
    --------
    >>> assert_token_budget(response, max_tokens=200)
    >>> assert_token_budget(response, max_tokens=500, tokenizer="tiktoken")
    """
    used = _count_tokens(response, tokenizer)
    if used <= max_tokens:
        return
    raise AssertionError(
        f"Token budget exceeded.\n"
        f" Max allowed : {max_tokens} tokens\n"
        f" Actual count: {used} tokens (method: {tokenizer!r})\n"
        f" Response preview: {response[:200]!r}"
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _count_tokens(text: str, method: str) -> int:
|
|
48
|
+
if method == "tiktoken":
|
|
49
|
+
try:
|
|
50
|
+
import tiktoken
|
|
51
|
+
enc = tiktoken.get_encoding("cl100k_base")
|
|
52
|
+
return len(enc.encode(text))
|
|
53
|
+
except ImportError:
|
|
54
|
+
raise ImportError(
|
|
55
|
+
"tokenizer='tiktoken' requires: pip install tiktoken"
|
|
56
|
+
)
|
|
57
|
+
elif method == "chars":
|
|
58
|
+
return len(text) // 4
|
|
59
|
+
else:
|
|
60
|
+
# approx: ~1.3 tokens per word is a reasonable average
|
|
61
|
+
words = len(text.split())
|
|
62
|
+
return int(words * 1.3)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
assert_no_hallucination: Verify LLM output doesn't contradict known facts.
|
|
3
|
+
|
|
4
|
+
Checks that factual claims in the response are consistent with a provided
|
|
5
|
+
list of ground-truth facts, using semantic similarity and contradiction detection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def assert_no_hallucination(
    response: str,
    known_facts: list[str],
    contradiction_threshold: float = 0.85,
) -> None:
    """
    Assert that `response` does not contradict any of the `known_facts`.

    This does NOT verify that every fact is mentioned — it checks that
    the response does not make claims that semantically contradict the facts.

    Parameters
    ----------
    response:
        The LLM output to evaluate.
    known_facts:
        A list of factual statements that must NOT be contradicted.
        E.g. ["The capital of France is Paris", "The product costs $49/month"]
    contradiction_threshold:
        If the negation of a fact is highly similar to the response, flag it.
        Default 0.85.

    Raises
    ------
    AssertionError
        If the response appears to contradict one or more known facts.

    Examples
    --------
    >>> assert_no_hallucination(
    ...     response,
    ...     known_facts=["Python was created by Guido van Rossum in 1991"]
    ... )
    """
    # Local imports keep module import cheap and avoid a circular import.
    from genassert._embed import embed_text
    from genassert.assertions.intent import _cosine_similarity

    resp_vec = embed_text(response)

    # A fact is flagged when the response is highly similar to the fact's
    # NEGATION — i.e. the response appears to assert the opposite.
    flagged: list[tuple[str, float]] = []
    for fact in known_facts:
        negated_vec = embed_text(_negate_fact(fact))
        score = _cosine_similarity(resp_vec, negated_vec)
        if score > contradiction_threshold:
            flagged.append((fact, score))

    if not flagged:
        return

    details = "\n".join(
        f" - Contradicts: {fact!r} (score: {score:.3f})"
        for fact, score in flagged
    )
    raise AssertionError(
        f"Hallucination detected — response contradicts known facts:\n{details}\n"
        f" Response preview: {response[:200]!r}"
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _negate_fact(fact: str) -> str:
|
|
72
|
+
"""Produce a simple semantic negation of a fact for contradiction detection."""
|
|
73
|
+
fact = fact.strip()
|
|
74
|
+
negations = [
|
|
75
|
+
("is not", "is"),
|
|
76
|
+
("are not", "are"),
|
|
77
|
+
("was not", "was"),
|
|
78
|
+
("were not", "were"),
|
|
79
|
+
("does not", "does"),
|
|
80
|
+
("did not", "did"),
|
|
81
|
+
("cannot", "can"),
|
|
82
|
+
("will not", "will"),
|
|
83
|
+
]
|
|
84
|
+
lower = fact.lower()
|
|
85
|
+
for neg, pos in negations:
|
|
86
|
+
if pos in lower:
|
|
87
|
+
return fact.replace(pos, neg, 1)
|
|
88
|
+
# Fallback: prepend "It is false that"
|
|
89
|
+
return f"It is false that {fact}"
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
assert_intent: Check that an LLM response addresses the expected intent.
|
|
3
|
+
|
|
4
|
+
Uses embedding cosine similarity to compare the response against a
|
|
5
|
+
natural-language description of what the response should convey.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import os
|
|
10
|
+
from genassert._embed import embed_text
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def assert_intent(
    response: str,
    expected_intent: str,
    threshold: float = 0.72,
    model: str | None = None,
) -> None:
    """
    Assert that `response` semantically matches the `expected_intent`.

    Parameters
    ----------
    response:
        The LLM output to evaluate.
    expected_intent:
        A plain-English description of what the response should convey.
        E.g. "a concise summary of the article".
    threshold:
        Cosine similarity threshold (0–1). Default 0.72 works well for
        most English text pairs. Lower = more lenient, higher = stricter.
    model:
        Optional embedding model override.

    Raises
    ------
    AssertionError
        If the cosine similarity between `response` and `expected_intent`
        is below `threshold`.

    Examples
    --------
    >>> assert_intent(response, "a polite refusal to the user's request")
    >>> assert_intent(response, "Python code that reads a CSV file", threshold=0.80)
    """
    # NOTE(review): `model` is accepted but never used — embed_text() takes
    # no model override; confirm whether it should plumb through
    # GENASSERT_EMBED_MODEL or be deprecated.
    resp_vec = embed_text(response)
    intent_vec = embed_text(expected_intent)
    score = _cosine_similarity(resp_vec, intent_vec)
    if score >= threshold:
        return
    raise AssertionError(
        f"Intent assertion failed.\n"
        f" Expected intent : {expected_intent!r}\n"
        f" Cosine similarity: {score:.3f} (threshold: {threshold})\n"
        f" Response preview : {response[:200]!r}"
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
57
|
+
dot = sum(x * y for x, y in zip(a, b))
|
|
58
|
+
mag_a = sum(x**2 for x in a) ** 0.5
|
|
59
|
+
mag_b = sum(x**2 for x in b) ** 0.5
|
|
60
|
+
if mag_a == 0 or mag_b == 0:
|
|
61
|
+
return 0.0
|
|
62
|
+
return dot / (mag_a * mag_b)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
assert_language: Verify LLM response is written in the expected language.
|
|
3
|
+
|
|
4
|
+
Uses character n-gram frequency profiles to detect language without
|
|
5
|
+
any external dependencies. Supports 20+ common languages.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import re
|
|
10
|
+
import unicodedata
|
|
11
|
+
|
|
12
|
+
# Script/character-range based fast detection
|
|
13
|
+
# Each entry maps a script name to an inclusive Unicode codepoint range.
# _detect_language checks these in list order.
# NOTE(review): the CJK-ideograph ("chinese") range is checked before the
# kana ("japanese") range, so kanji-heavy Japanese text may be detected as
# "chinese" — confirm the ordering is intended.
_SCRIPT_HINTS: list[tuple[str, tuple[int, int]]] = [
    ("arabic", (0x0600, 0x06FF)),
    ("hebrew", (0x0590, 0x05FF)),
    ("chinese", (0x4E00, 0x9FFF)),
    ("japanese", (0x3040, 0x30FF)),
    ("korean", (0xAC00, 0xD7AF)),
    ("thai", (0x0E00, 0x0E7F)),
    ("greek", (0x0370, 0x03FF)),
    ("cyrillic", (0x0400, 0x04FF)),
    ("devanagari", (0x0900, 0x097F)),
]

# Common high-frequency words per language (stop-word fingerprint)
# NOTE(review): the script hint for Cyrillic returns "cyrillic", which can
# never equal the stopword/alias name "russian" — assert_language(response,
# "russian") appears unable to pass for Cyrillic text; verify.
_STOPWORDS: dict[str, list[str]] = {
    "english": ["the", "and", "is", "in", "to", "of", "a", "that", "it", "was"],
    "spanish": ["el", "la", "de", "que", "en", "los", "se", "las", "un", "una"],
    "french": ["le", "la", "les", "de", "et", "en", "un", "une", "du", "que"],
    "german": ["der", "die", "das", "und", "in", "den", "von", "zu", "ist", "mit"],
    "portuguese": ["de", "da", "do", "que", "em", "os", "as", "um", "uma", "para"],
    "italian": ["il", "la", "di", "che", "e", "un", "una", "in", "del", "per"],
    "dutch": ["de", "het", "een", "van", "en", "in", "is", "dat", "op", "te"],
    "russian": ["и", "в", "не", "на", "я", "что", "с", "он", "как", "это"],
    "polish": ["i", "w", "nie", "to", "się", "na", "jest", "z", "że", "do"],
    "turkish": ["bir", "bu", "ve", "da", "de", "için", "ile", "var", "ne", "mi"],
    "swedish": ["och", "i", "en", "att", "det", "av", "på", "är", "för", "med"],
    "norwegian": ["og", "i", "en", "er", "til", "av", "på", "et", "som", "for"],
    "danish": ["og", "i", "en", "er", "til", "af", "på", "et", "som", "for"],
    "finnish": ["ja", "on", "ei", "se", "että", "hän", "oli", "en", "niin", "jo"],
    "czech": ["a", "je", "to", "v", "se", "na", "že", "z", "do", "s"],
    "hungarian": ["a", "az", "és", "hogy", "nem", "van", "egy", "is", "de", "meg"],
    "romanian": ["și", "în", "de", "că", "este", "la", "cu", "se", "nu", "din"],
    "indonesian": ["yang", "dan", "di", "ke", "dari", "ini", "itu", "untuk", "dengan", "ada"],
}

# ISO 639-1 codes → internal language/script names used above.
_LANG_ALIASES: dict[str, str] = {
    "en": "english", "es": "spanish", "fr": "french", "de": "german",
    "pt": "portuguese", "it": "italian", "nl": "dutch", "ru": "russian",
    "pl": "polish", "tr": "turkish", "sv": "swedish", "no": "norwegian",
    "da": "danish", "fi": "finnish", "cs": "czech", "hu": "hungarian",
    "ro": "romanian", "id": "indonesian",
    "ar": "arabic", "he": "hebrew", "zh": "chinese", "ja": "japanese",
    "ko": "korean", "th": "thai", "el": "greek", "hi": "devanagari",
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def assert_language(
    response: str,
    expected_language: str,
    min_confidence: float = 0.3,
) -> str:
    """
    Assert that `response` is written in the expected language.

    Parameters
    ----------
    response:
        The LLM output to evaluate.
    expected_language:
        Language name (e.g. "english", "spanish", "french") or ISO 639-1
        code (e.g. "en", "es", "fr").
    min_confidence:
        Minimum confidence score (0–1) to accept the detection. Default 0.3.

    Returns
    -------
    str
        The detected language name.

    Raises
    ------
    AssertionError
        If the detected language doesn't match expected.
    ValueError
        If expected_language is not recognised.

    Examples
    --------
    >>> assert_language(response, "english")
    >>> assert_language(response, "fr")  # ISO code
    >>> assert_language(response, "spanish")
    """
    normalized = expected_language.lower().strip()
    # Resolve ISO codes to internal names; unknown strings pass through as-is.
    lang = _LANG_ALIASES.get(normalized, normalized)

    detected, confidence = _detect_language(response)

    if detected != lang:
        raise AssertionError(
            f"Language assertion failed.\n"
            f" Expected language : {lang!r}\n"
            f" Detected language : {detected!r} (confidence: {confidence:.2f})\n"
            f" Response preview : {response[:150]!r}"
        )
    # Matching name alone is not enough — detection must also be confident.
    if confidence < min_confidence:
        raise AssertionError(
            f"Language detected as {detected!r} but confidence too low.\n"
            f" Confidence: {confidence:.2f} (min: {min_confidence})\n"
            f" Response preview: {response[:150]!r}"
        )
    return detected
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _detect_language(text: str) -> tuple[str, float]:
    """Return (language_name, confidence) for `text`."""
    if not text.strip():
        return "unknown", 0.0

    # Fast path: non-Latin scripts are identified by codepoint range alone.
    total = max(len(text), 1)
    for script, (lo, hi) in _SCRIPT_HINTS:
        in_script = sum(1 for ch in text if lo <= ord(ch) <= hi)
        share = in_script / total
        if share > 0.2:
            return script, min(1.0, share * 3)

    # Latin-script languages: score by stop-word fingerprint overlap.
    tokens = set(re.findall(r"\b[a-zA-ZÀ-ÿ]+\b", text.lower()))
    scores = {
        language: sum(1 for w in stopwords if w in tokens) / len(stopwords)
        for language, stopwords in _STOPWORDS.items()
    }

    if not scores:
        return "unknown", 0.0

    # sorted() is stable, so ties resolve to the earliest dict entry —
    # the same winner max() would pick.
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best, best_score = ranked[0]
    runner_up = ranked[1][1] if len(ranked) > 1 else 0.0

    # Confidence is the margin over the runner-up, not the absolute score.
    margin = best_score - runner_up if best_score > 0 else 0.0
    return best, min(1.0, margin * 5)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""
|
|
2
|
+
assert_no_pii: Detect PII (Personally Identifiable Information) leakage in LLM responses.
|
|
3
|
+
|
|
4
|
+
Checks for emails, phone numbers, SSNs, credit card numbers, IP addresses,
|
|
5
|
+
and common name/address patterns — all via regex, zero dependencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import re
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class PIIMatch:
    """A single PII occurrence found in a response by `assert_no_pii`."""

    type: str   # PII category key, e.g. "email" or "ssn" (see _PATTERNS)
    value: str  # partially redacted matched text (via _redact), never the raw value
    start: int  # match start offset within the response
    end: int    # match end offset (exclusive)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ── PII Patterns ──────────────────────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
_PATTERNS: dict[str, str] = {
    "email": r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b",
    # NOTE(review): also matches any bare 10-digit run — confirm the
    # false-positive rate is acceptable.
    "phone_us": r"\b(\+1[\s\-.]?)?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}\b",
    "phone_intl": r"\+\d{1,3}[\s\-.]?\(?\d{1,4}\)?[\s\-.]?\d{1,4}[\s\-.]?\d{1,9}",
    "ssn": r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b",
    # NOTE(review): any 16 digits in groups of 4 — no Luhn validation.
    "credit_card": r"\b(?:\d{4}[\s\-]?){3}\d{4}\b",
    # NOTE(review): octets are not range-checked, so 999.999.999.999 matches.
    "ip_address": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
    "date_of_birth": r"\b(?:DOB|date of birth|born)[:\s]+\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b",
    # NOTE(review): compiled with IGNORECASE below, so lowercase strings
    # like "ab1234567" also match this passport pattern — confirm intended.
    "passport": r"\b[A-Z]{1,2}\d{6,9}\b",
    "iban": r"\b[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z0-9]?){0,16}\b",
}

# Compiled once at import; IGNORECASE applies to every pattern above.
_COMPILED = {name: re.compile(pattern, re.IGNORECASE) for name, pattern in _PATTERNS.items()}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def assert_no_pii(
    response: str,
    checks: list[str] | None = None,
    allow: list[str] | None = None,
) -> list[PIIMatch]:
    """
    Assert that `response` contains no PII.

    Parameters
    ----------
    response:
        The LLM output to evaluate.
    checks:
        List of PII types to check. Default: all types.
        Options: "email", "phone_us", "phone_intl", "ssn", "credit_card",
        "ip_address", "date_of_birth", "passport", "iban"
    allow:
        List of PII types to explicitly allow (skip checking).
        Useful when IP addresses or emails are expected in output.

    Returns
    -------
    list[PIIMatch]
        Empty list if no PII found (assertion passes).

    Raises
    ------
    AssertionError
        If any PII is detected.
    ValueError
        If `checks` names an unknown PII type.

    Examples
    --------
    >>> assert_no_pii(response)
    >>> assert_no_pii(response, checks=["email", "ssn"])
    >>> assert_no_pii(response, allow=["ip_address"])  # allow IPs
    """
    active = set(checks or _COMPILED.keys()) - set(allow or [])

    # Validate up front so a typo in `checks` fails fast, before any scanning.
    unknown = active - set(_COMPILED)
    if unknown:
        pii_type = sorted(unknown)[0]
        raise ValueError(f"Unknown PII type: {pii_type!r}. Valid: {list(_COMPILED)}")

    # Iterate in sorted order: `active` is a set, and set iteration order
    # varies across interpreter runs (string hash randomization), which made
    # the returned list — and the failure message — nondeterministic.
    found: list[PIIMatch] = []
    for pii_type in sorted(active):
        for m in _COMPILED[pii_type].finditer(response):
            found.append(PIIMatch(
                type=pii_type,
                value=_redact(m.group()),  # never echo raw PII in test output
                start=m.start(),
                end=m.end(),
            ))

    # Report matches in document order for stable, readable failures.
    found.sort(key=lambda p: (p.start, p.end, p.type))

    if found:
        details = "\n".join(
            f" [{p.type}] {p.value!r} at position {p.start}"
            for p in found
        )
        raise AssertionError(
            f"PII detected in LLM response ({len(found)} match{'es' if len(found) > 1 else ''}):\n"
            f"{details}\n"
            f" Response preview: {response[:200]!r}"
        )

    return found
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _redact(value: str) -> str:
|
|
104
|
+
"""Partially redact a PII value for safe display in error messages."""
|
|
105
|
+
if len(value) <= 4:
|
|
106
|
+
return "****"
|
|
107
|
+
return value[:2] + "*" * (len(value) - 4) + value[-2:]
|