faultmap 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faultmap/__init__.py +36 -0
- faultmap/analyzer.py +537 -0
- faultmap/coverage/__init__.py +3 -0
- faultmap/coverage/detector.py +127 -0
- faultmap/embeddings.py +163 -0
- faultmap/exceptions.py +25 -0
- faultmap/labeling.py +105 -0
- faultmap/llm.py +123 -0
- faultmap/models.py +243 -0
- faultmap/py.typed +0 -0
- faultmap/report.py +204 -0
- faultmap/scoring/__init__.py +6 -0
- faultmap/scoring/base.py +22 -0
- faultmap/scoring/entropy.py +194 -0
- faultmap/scoring/precomputed.py +14 -0
- faultmap/scoring/reference.py +46 -0
- faultmap/slicing/__init__.py +14 -0
- faultmap/slicing/clustering.py +201 -0
- faultmap/slicing/statistics.py +198 -0
- faultmap/utils.py +103 -0
- faultmap-0.2.0.dist-info/METADATA +412 -0
- faultmap-0.2.0.dist-info/RECORD +24 -0
- faultmap-0.2.0.dist-info/WHEEL +4 -0
- faultmap-0.2.0.dist-info/licenses/LICENSE +201 -0
faultmap/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""faultmap: Automatically discover where and why your LLM is failing."""
|
|
2
|
+
|
|
3
|
+
from .analyzer import SliceAnalyzer
|
|
4
|
+
from .exceptions import (
|
|
5
|
+
ClusteringError,
|
|
6
|
+
ConfigurationError,
|
|
7
|
+
EmbeddingError,
|
|
8
|
+
FaultmapError,
|
|
9
|
+
LLMError,
|
|
10
|
+
ScoringError,
|
|
11
|
+
)
|
|
12
|
+
from .models import (
|
|
13
|
+
AnalysisReport,
|
|
14
|
+
CoverageGap,
|
|
15
|
+
CoverageReport,
|
|
16
|
+
FailureSlice,
|
|
17
|
+
ScoringResult,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__version__ = "0.2.0"
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"SliceAnalyzer",
|
|
24
|
+
"AnalysisReport",
|
|
25
|
+
"FailureSlice",
|
|
26
|
+
"CoverageReport",
|
|
27
|
+
"CoverageGap",
|
|
28
|
+
"ScoringResult",
|
|
29
|
+
"FaultmapError",
|
|
30
|
+
"EmbeddingError",
|
|
31
|
+
"ScoringError",
|
|
32
|
+
"LLMError",
|
|
33
|
+
"ClusteringError",
|
|
34
|
+
"ConfigurationError",
|
|
35
|
+
"__version__",
|
|
36
|
+
]
|
faultmap/analyzer.py
ADDED
|
@@ -0,0 +1,537 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .embeddings import Embedder, get_embedder
|
|
9
|
+
from .exceptions import ConfigurationError
|
|
10
|
+
from .labeling import label_clusters
|
|
11
|
+
from .llm import AsyncLLMClient
|
|
12
|
+
from .models import (
|
|
13
|
+
AnalysisReport,
|
|
14
|
+
CoverageGap,
|
|
15
|
+
CoverageReport,
|
|
16
|
+
FailureSlice,
|
|
17
|
+
ScoringResult,
|
|
18
|
+
)
|
|
19
|
+
from .utils import run_sync, validate_inputs
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SliceAnalyzer:
    """Discover failure slices and coverage gaps in LLM evaluations.

    ``SliceAnalyzer`` is the main entry point for faultmap. It provides two
    methods:

    - :meth:`analyze` — find input slices with statistically elevated failure rates
    - :meth:`audit_coverage` — find semantic blind spots in a test suite

    Example::

        from faultmap import SliceAnalyzer

        analyzer = SliceAnalyzer(model="gpt-4o-mini")
        report = analyzer.analyze(prompts, responses, scores=scores)
        print(report)

        coverage = analyzer.audit_coverage(test_prompts, prod_prompts)
        print(coverage)
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        embedding_model: str = "text-embedding-3-small",
        significance_level: float = 0.05,
        min_slice_size: int = 10,
        failure_threshold: float = 0.5,
        n_samples: int = 8,
        clustering_method: str = "hdbscan",
        max_concurrent_requests: int = 50,
        temperature: float = 1.0,
        consistency_threshold: float = 0.8,
    ) -> None:
        """Initialize a SliceAnalyzer.

        Args:
            model: litellm model string used for LLM calls (cluster naming and
                Mode 3 response sampling). Supports 100+ providers, e.g.
                ``"gpt-4o-mini"``, ``"anthropic/claude-3-haiku-20240307"``,
                ``"ollama/mistral"``.
            embedding_model: Embedding model name. Local sentence-transformers models
                are auto-detected by prefix (``"all-MiniLM-"``, ``"all-mpnet-"``,
                ``"paraphrase-"``); all others route to ``APIEmbedder`` via litellm.
                The default uses an API-backed embedding model so
                ``pip install faultmap`` works without optional extras. Local models
                require ``pip install faultmap[local]``.
            significance_level: FDR alpha for Benjamini-Hochberg correction. Slices
                with ``adjusted_p_value < significance_level`` are reported.
                Default ``0.05``.
            min_slice_size: Minimum number of prompts a cluster must contain to be
                tested. Smaller clusters are silently discarded. Default ``10``.
            failure_threshold: Score cutoff for binary pass/fail. A prompt with
                ``score < failure_threshold`` is counted as a failure. Default ``0.5``.
            n_samples: Mode 3 only. Number of additional LLM responses sampled per
                prompt for entropy estimation. Must be >= 2. Higher values give more
                accurate entropy estimates at greater API cost. Default ``8``.
            clustering_method: Clustering algorithm.
                - ``"hdbscan"`` (default): automatically discovers the number of
                  clusters using density-based clustering (sklearn >= 1.3 built-in).
                - ``"agglomerative"``: Ward linkage with silhouette-based k-selection
                  over ``[5, 10, 15, 20, 25, 30]``. More predictable cluster count.
            max_concurrent_requests: Maximum number of parallel LLM API calls.
                Controlled via asyncio semaphore. Reduce if hitting rate limits.
                Default ``50``.
            temperature: Mode 3 only. Sampling temperature for response diversity.
                Higher values increase entropy estimation accuracy. Default ``1.0``.
            consistency_threshold: Mode 3 only. Cosine similarity threshold above
                which a sampled response is considered "consistent" with the original.
                Default ``0.8``.

        Raises:
            ConfigurationError: If ``clustering_method`` is not ``"hdbscan"`` or
                ``"agglomerative"``, ``significance_level`` is not in (0, 1),
                ``failure_threshold`` is not in [0, 1], ``n_samples < 2``,
                ``min_slice_size <= 0``, ``max_concurrent_requests <= 0``,
                ``temperature < 0``, or ``consistency_threshold`` is not in [0, 1].
        """
        # Validate
        if clustering_method not in ("hdbscan", "agglomerative"):
            raise ConfigurationError(
                f"clustering_method must be 'hdbscan' or 'agglomerative', "
                f"got {clustering_method!r}"
            )
        if not 0 < significance_level < 1:
            raise ConfigurationError("significance_level must be in (0, 1)")
        if min_slice_size <= 0:
            raise ConfigurationError("min_slice_size must be > 0")
        if not 0 <= failure_threshold <= 1:
            raise ConfigurationError("failure_threshold must be in [0, 1]")
        if n_samples < 2:
            raise ConfigurationError("n_samples must be >= 2 for entropy scoring")
        if max_concurrent_requests <= 0:
            raise ConfigurationError("max_concurrent_requests must be > 0")
        if temperature < 0:
            raise ConfigurationError("temperature must be >= 0")
        if not 0 <= consistency_threshold <= 1:
            raise ConfigurationError("consistency_threshold must be in [0, 1]")

        self.model = model
        self.embedding_model = embedding_model
        self.significance_level = significance_level
        self.min_slice_size = min_slice_size
        self.failure_threshold = failure_threshold
        self.n_samples = n_samples
        self.clustering_method = clustering_method
        self.max_concurrent_requests = max_concurrent_requests
        self.temperature = temperature
        self.consistency_threshold = consistency_threshold

        self._embedder: Embedder = get_embedder(embedding_model)
        self._llm_client = AsyncLLMClient(
            model=model,
            max_concurrent_requests=max_concurrent_requests,
        )

    # ── analyze() ──────────────────────────────────────────

    def analyze(
        self,
        prompts: list[str],
        responses: list[str],
        scores: list[float] | None = None,
        references: list[str] | None = None,
    ) -> AnalysisReport:
        """Discover failure slices — input regions where your LLM fails significantly more.

        Embeds prompts, clusters them, runs statistical tests on each cluster, applies
        Benjamini-Hochberg FDR correction, and names significant clusters via LLM.

        Scoring mode is auto-detected from the arguments:

        - **Mode 1** (``scores`` provided): use pre-computed scores directly.
        - **Mode 2** (``references`` provided): score by cosine similarity between
          response and reference embeddings.
        - **Mode 3** (neither): autonomous scoring via semantic entropy and
          self-consistency (makes additional LLM API calls).
        - Both provided: Mode 1 wins, Mode 2 is ignored (``UserWarning`` raised).

        Args:
            prompts: Input prompts to analyze. Must be non-empty and the same length
                as ``responses``.
            responses: Model responses corresponding to each prompt.
            scores: Mode 1. Pre-computed quality scores in [0, 1] where higher is
                better. Must be the same length as ``prompts``.
            references: Mode 2. Ground-truth reference answers. Must be the same
                length as ``prompts``.

        Returns:
            :class:`AnalysisReport` with ``slices`` sorted by adjusted p-value
            ascending (most significant first). ``print(report)`` produces formatted
            output. ``report.to_dict()`` returns a JSON-serializable dict.

        Raises:
            ConfigurationError: If inputs are empty or length-mismatched.
        """
        return run_sync(
            self._analyze_async(prompts, responses, scores, references)
        )

    def _empty_analysis_report(
        self,
        *,
        total_prompts: int,
        total_failures: int,
        baseline: float,
        scoring_mode: str,
        num_clusters_tested: int,
        scoring_metadata: object,
    ) -> AnalysisReport:
        """Build an :class:`AnalysisReport` with no slices.

        Shared by the two early-return paths in :meth:`_analyze_async`
        (zero failures, and no statistically significant clusters) so the
        11-field report literal is written once.
        """
        return AnalysisReport(
            slices=[], total_prompts=total_prompts,
            total_failures=total_failures, baseline_failure_rate=baseline,
            significance_level=self.significance_level,
            failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
            num_clusters_tested=num_clusters_tested, num_significant=0,
            clustering_method=self.clustering_method,
            embedding_model=self.embedding_model,
            metadata={"scoring_metadata": scoring_metadata},
        )

    async def _analyze_async(
        self,
        prompts: list[str],
        responses: list[str],
        scores: list[float] | None,
        references: list[str] | None,
    ) -> AnalysisReport:
        """
        Async orchestration pipeline:

        1. VALIDATE inputs
        2. DETECT scoring mode
        3. SCORE → ScoringResult
        4. BINARIZE → failures = score < threshold
        5. Early return if 0 failures
        6. EMBED prompts (NOT responses)
        7. CLUSTER prompt embeddings
        8. TEST each cluster (chi2 or Fisher)
        9. BH CORRECT p-values
        10. FILTER → keep adjusted_p < alpha
        11. NAME significant clusters via LLM
        12. ASSEMBLE FailureSlice objects
        13. Return AnalysisReport
        """
        # Deferred imports keep heavy scoring/clustering deps off the import
        # path of the package root.
        from .scoring import EntropyScorer, PrecomputedScorer, ReferenceScorer
        from .slicing import (
            benjamini_hochberg,
            cluster_embeddings,
            get_representative_prompts,
            test_cluster_failure_rate,
        )

        # 1. Validate
        validate_inputs(prompts, responses, scores, references)

        # 2. Mode detection
        if scores is not None and references is not None:
            warnings.warn(
                "Both scores and references provided. Using scores (Mode 1).",
                UserWarning, stacklevel=3,
            )
            references = None

        if scores is not None:
            scoring_mode = "precomputed"
            scorer = PrecomputedScorer(scores)
        elif references is not None:
            scoring_mode = "reference"
            scorer = ReferenceScorer(self._embedder, references)
        else:
            scoring_mode = "entropy"
            scorer = EntropyScorer(
                client=self._llm_client, embedder=self._embedder,
                n_samples=self.n_samples, temperature=self.temperature,
                consistency_threshold=self.consistency_threshold,
            )

        # 3. Score
        # NOTE: lazy %-style args throughout — the message is only formatted
        # when INFO is actually enabled.
        logger.info("Scoring mode: %s", scoring_mode)
        scoring_result: ScoringResult = await scorer.score(prompts, responses)

        # 4. Binarize
        score_array = np.array(scoring_result.scores)
        failures = score_array < self.failure_threshold
        total_failures = int(np.sum(failures))
        total_prompts = len(prompts)
        baseline = total_failures / total_prompts if total_prompts > 0 else 0.0

        logger.info(
            "Failures: %d/%d (%.1f%%) at threshold=%s",
            total_failures, total_prompts, baseline * 100, self.failure_threshold,
        )

        # 5. Early return
        if total_failures == 0:
            return self._empty_analysis_report(
                total_prompts=total_prompts, total_failures=0,
                baseline=0.0, scoring_mode=scoring_mode,
                num_clusters_tested=0,
                scoring_metadata=scoring_result.metadata,
            )

        # 6. Embed prompts
        logger.info("Embedding prompts...")
        prompt_embeddings = self._embedder.embed(prompts)

        # 7. Cluster
        logger.info("Clustering (%s)...", self.clustering_method)
        labels = cluster_embeddings(
            prompt_embeddings, method=self.clustering_method,
            min_cluster_size=self.min_slice_size,
        )

        # Label -1 marks clustering noise; it is never tested as a slice.
        unique_labels = sorted(set(labels))
        if -1 in unique_labels:
            unique_labels.remove(-1)

        # 8. Statistical testing
        logger.info("Testing %d clusters...", len(unique_labels))
        test_results = []
        for cid in unique_labels:
            mask = labels == cid
            result = test_cluster_failure_rate(
                cluster_failures=int(np.sum(failures[mask])),
                cluster_size=int(np.sum(mask)),
                total_failures=total_failures,
                total_size=total_prompts,
                cluster_id=cid,
            )
            test_results.append(result)

        # 9. BH correction
        corrected = benjamini_hochberg(test_results, alpha=self.significance_level)

        # 10. Filter
        significant = [
            r for r in corrected
            if r.adjusted_p_value < self.significance_level
        ]
        logger.info(
            "%d/%d clusters significant at alpha=%s",
            len(significant), len(corrected), self.significance_level,
        )

        if not significant:
            return self._empty_analysis_report(
                total_prompts=total_prompts, total_failures=total_failures,
                baseline=baseline, scoring_mode=scoring_mode,
                num_clusters_tested=len(corrected),
                scoring_metadata=scoring_result.metadata,
            )

        # 11. Name
        clusters_texts = []
        clusters_all_indices = []
        for r in significant:
            rep_prompts, _ = get_representative_prompts(
                prompt_embeddings, labels, r.cluster_id, prompts, top_k=10
            )
            clusters_texts.append(rep_prompts)
            # ALL indices in this cluster
            all_idx = np.where(labels == r.cluster_id)[0].tolist()
            clusters_all_indices.append(all_idx)

        logger.info("Naming %d clusters...", len(significant))
        cluster_labels = await label_clusters(
            self._llm_client, clusters_texts, context="failure slice"
        )

        # 12. Assemble
        slices: list[FailureSlice] = []
        for r, label, rep_texts, all_idx in zip(
            significant, cluster_labels, clusters_texts, clusters_all_indices
        ):
            # Build examples: top-5 with prompt, response, score
            examples = []
            for idx in all_idx[:5]:
                examples.append({
                    "prompt": prompts[idx],
                    "response": responses[idx],
                    "score": float(score_array[idx]),
                })

            # baseline > 0 is guaranteed here (total_failures > 0), but keep
            # the guard so effect_size degrades gracefully rather than raising.
            effect = r.failure_rate / baseline if baseline > 0 else float('inf')

            slices.append(FailureSlice(
                name=label.name,
                description=label.description,
                size=r.size,
                failure_rate=r.failure_rate,
                baseline_rate=baseline,
                effect_size=round(effect, 2),
                p_value=r.p_value,
                adjusted_p_value=r.adjusted_p_value,
                test_used=r.test_used,
                sample_indices=all_idx,
                examples=examples,
                representative_prompts=rep_texts[:5],
                cluster_id=r.cluster_id,
            ))

        report = AnalysisReport(
            slices=slices, total_prompts=total_prompts,
            total_failures=total_failures, baseline_failure_rate=baseline,
            significance_level=self.significance_level,
            failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
            num_clusters_tested=len(corrected), num_significant=len(slices),
            clustering_method=self.clustering_method,
            embedding_model=self.embedding_model,
            metadata={"scoring_metadata": scoring_result.metadata},
        )

        logger.info(report.summary())
        return report

    # ── audit_coverage() ───────────────────────────────────

    def audit_coverage(
        self,
        test_prompts: list[str],
        production_prompts: list[str],
        distance_threshold: float | None = None,
        min_gap_size: int = 5,
    ) -> CoverageReport:
        """Find semantic blind spots in a test suite by comparing against production traffic.

        Embeds both test and production prompts, uses k-nearest-neighbors to find
        production prompts that have no semantically similar test prompt, clusters those
        uncovered prompts into gap clusters, and names each gap via LLM.

        Args:
            test_prompts: Prompts from your evaluation / test suite.
            production_prompts: Prompts from real production traffic (e.g. application logs).
            distance_threshold: L2 distance cutoff in embedding space. Production prompts
                farther than this from any test prompt are considered "uncovered".
                If ``None`` (default), auto-computed as ``mean(distances) + 1.5 * std(distances)``.
                Set explicitly if the auto-threshold behaves unexpectedly on your data.
            min_gap_size: Minimum number of production prompts required to form a
                reportable gap cluster. Smaller clusters are discarded. Default ``5``.

        Returns:
            :class:`CoverageReport` with ``gaps`` sorted by ``mean_distance`` descending
            (most severe gaps first). ``print(coverage)`` produces formatted output.
            ``coverage.to_dict()`` returns a JSON-serializable dict.

        Raises:
            ConfigurationError: If ``test_prompts`` or ``production_prompts`` is
                empty, ``min_gap_size <= 0``, or ``distance_threshold < 0``.
        """
        return run_sync(self._audit_coverage_async(
            test_prompts, production_prompts, distance_threshold, min_gap_size
        ))

    async def _audit_coverage_async(
        self,
        test_prompts: list[str],
        production_prompts: list[str],
        distance_threshold: float | None,
        min_gap_size: int,
    ) -> CoverageReport:
        """
        Pipeline:
        1. Validate
        2. Embed test + production prompts
        3. Detect gaps (NN distances + clustering)
        4. Get representatives per gap
        5. Name gaps via LLM
        6. Assemble CoverageReport
        """
        from .coverage import detect_coverage_gaps
        from .slicing.clustering import get_representative_prompts

        if not test_prompts:
            raise ConfigurationError("test_prompts must be non-empty")
        if not production_prompts:
            raise ConfigurationError("production_prompts must be non-empty")
        if min_gap_size <= 0:
            raise ConfigurationError("min_gap_size must be > 0")
        if distance_threshold is not None and distance_threshold < 0:
            raise ConfigurationError("distance_threshold must be >= 0")

        # Embed
        logger.info("Embedding test prompts...")
        test_emb = self._embedder.embed(test_prompts)
        logger.info("Embedding production prompts...")
        prod_emb = self._embedder.embed(production_prompts)

        # Detect gaps
        gap_labels, nn_distances, used_threshold = detect_coverage_gaps(
            test_embeddings=test_emb, prod_embeddings=prod_emb,
            prod_prompts=production_prompts,
            distance_threshold=distance_threshold,
            min_gap_size=min_gap_size,
            clustering_method=self.clustering_method,
        )

        # Gap label semantics: -1 = covered, -2 = uncovered but unclustered,
        # >= 0 = member of a reportable gap cluster.
        total = len(production_prompts)
        total_uncovered = int(np.sum(gap_labels != -1))
        unclustered_prompt_indices = np.where(gap_labels == -2)[0].tolist()
        coverage_metadata = {
            "num_uncovered_total": total_uncovered,
            "num_clustered_uncovered": int(np.sum(gap_labels >= 0)),
            "num_unclustered_uncovered": len(unclustered_prompt_indices),
            "unclustered_prompt_indices": unclustered_prompt_indices,
        }

        unique_gaps = sorted(set(gap_labels))
        if -1 in unique_gaps:
            unique_gaps.remove(-1)
        if -2 in unique_gaps:
            unique_gaps.remove(-2)

        if not unique_gaps:
            return CoverageReport(
                gaps=[], num_test_prompts=len(test_prompts),
                num_production_prompts=total, num_gaps=0,
                overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
                distance_threshold=used_threshold,
                embedding_model=self.embedding_model,
                metadata=coverage_metadata,
            )

        # Representatives + metadata per gap
        clusters_texts = []
        gaps_meta = []
        for cid in unique_gaps:
            mask = gap_labels == cid
            size = int(np.sum(mask))
            mean_dist = float(np.mean(nn_distances[mask]))
            rep_prompts, _ = get_representative_prompts(
                prod_emb, gap_labels, cid, production_prompts, top_k=10
            )
            clusters_texts.append(rep_prompts)
            all_idx = np.where(mask)[0].tolist()
            gaps_meta.append((cid, size, mean_dist, rep_prompts[:5], all_idx))

        # Name gaps
        logger.info("Naming %d coverage gaps...", len(unique_gaps))
        cluster_labels = await label_clusters(
            self._llm_client, clusters_texts, context="coverage gap"
        )

        # Assemble
        gaps: list[CoverageGap] = []
        for label, (cid, size, mean_dist, rep_prompts, all_idx) in zip(
            cluster_labels, gaps_meta
        ):
            gaps.append(CoverageGap(
                name=label.name, description=label.description,
                size=size, mean_distance=mean_dist,
                representative_prompts=rep_prompts,
                prompt_indices=all_idx, cluster_id=cid,
            ))

        gaps.sort(key=lambda g: g.mean_distance, reverse=True)

        report = CoverageReport(
            gaps=gaps, num_test_prompts=len(test_prompts),
            num_production_prompts=total, num_gaps=len(gaps),
            overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
            distance_threshold=used_threshold,
            embedding_model=self.embedding_model,
            metadata=coverage_metadata,
        )

        logger.info(report.summary())
        return report
|