faultmap-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faultmap/__init__.py ADDED
@@ -0,0 +1,36 @@
1
"""faultmap: Automatically discover where and why your LLM is failing."""

# Public entry point: SliceAnalyzer drives both failure-slice analysis and
# coverage auditing (see faultmap/analyzer.py).
from .analyzer import SliceAnalyzer

# Exception hierarchy rooted at FaultmapError; callers can catch the base
# class or a specific subclass.
from .exceptions import (
    ClusteringError,
    ConfigurationError,
    EmbeddingError,
    FaultmapError,
    LLMError,
    ScoringError,
)

# Result/report dataclasses returned by the analyzer's public methods.
from .models import (
    AnalysisReport,
    CoverageGap,
    CoverageReport,
    FailureSlice,
    ScoringResult,
)

__version__ = "0.2.0"

# Explicit public API for `from faultmap import *` and documentation tools.
__all__ = [
    "SliceAnalyzer",
    "AnalysisReport",
    "FailureSlice",
    "CoverageReport",
    "CoverageGap",
    "ScoringResult",
    "FaultmapError",
    "EmbeddingError",
    "ScoringError",
    "LLMError",
    "ClusteringError",
    "ConfigurationError",
    "__version__",
]
faultmap/analyzer.py ADDED
@@ -0,0 +1,537 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import warnings
5
+
6
+ import numpy as np
7
+
8
+ from .embeddings import Embedder, get_embedder
9
+ from .exceptions import ConfigurationError
10
+ from .labeling import label_clusters
11
+ from .llm import AsyncLLMClient
12
+ from .models import (
13
+ AnalysisReport,
14
+ CoverageGap,
15
+ CoverageReport,
16
+ FailureSlice,
17
+ ScoringResult,
18
+ )
19
+ from .utils import run_sync, validate_inputs
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class SliceAnalyzer:
    """Discover failure slices and coverage gaps in LLM evaluations.

    ``SliceAnalyzer`` is the main entry point for faultmap. It provides two
    methods:

    - :meth:`analyze` — find input slices with statistically elevated failure rates
    - :meth:`audit_coverage` — find semantic blind spots in a test suite

    Example::

        from faultmap import SliceAnalyzer

        analyzer = SliceAnalyzer(model="gpt-4o-mini")
        report = analyzer.analyze(prompts, responses, scores=scores)
        print(report)

        coverage = analyzer.audit_coverage(test_prompts, prod_prompts)
        print(coverage)
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        embedding_model: str = "text-embedding-3-small",
        significance_level: float = 0.05,
        min_slice_size: int = 10,
        failure_threshold: float = 0.5,
        n_samples: int = 8,
        clustering_method: str = "hdbscan",
        max_concurrent_requests: int = 50,
        temperature: float = 1.0,
        consistency_threshold: float = 0.8,
    ) -> None:
        """Initialize a SliceAnalyzer.

        Args:
            model: litellm model string used for LLM calls (cluster naming and
                Mode 3 response sampling). Supports 100+ providers, e.g.
                ``"gpt-4o-mini"``, ``"anthropic/claude-3-haiku-20240307"``,
                ``"ollama/mistral"``.
            embedding_model: Embedding model name. Local sentence-transformers models
                are auto-detected by prefix (``"all-MiniLM-"``, ``"all-mpnet-"``,
                ``"paraphrase-"``); all others route to ``APIEmbedder`` via litellm.
                The default uses an API-backed embedding model so
                ``pip install faultmap`` works without optional extras. Local models
                require ``pip install faultmap[local]``.
            significance_level: FDR alpha for Benjamini-Hochberg correction. Slices
                with ``adjusted_p_value < significance_level`` are reported.
                Default ``0.05``.
            min_slice_size: Minimum number of prompts a cluster must contain to be
                tested. Smaller clusters are silently discarded. Default ``10``.
            failure_threshold: Score cutoff for binary pass/fail. A prompt with
                ``score < failure_threshold`` is counted as a failure. Default ``0.5``.
            n_samples: Mode 3 only. Number of additional LLM responses sampled per
                prompt for entropy estimation. Must be >= 2. Higher values give more
                accurate entropy estimates at greater API cost. Default ``8``.
            clustering_method: Clustering algorithm.
                - ``"hdbscan"`` (default): automatically discovers the number of
                  clusters using density-based clustering (sklearn >= 1.3 built-in).
                - ``"agglomerative"``: Ward linkage with silhouette-based k-selection
                  over ``[5, 10, 15, 20, 25, 30]``. More predictable cluster count.
            max_concurrent_requests: Maximum number of parallel LLM API calls.
                Controlled via asyncio semaphore. Reduce if hitting rate limits.
                Default ``50``.
            temperature: Mode 3 only. Sampling temperature for response diversity.
                Higher values increase entropy estimation accuracy. Default ``1.0``.
            consistency_threshold: Mode 3 only. Cosine similarity threshold above
                which a sampled response is considered "consistent" with the original.
                Default ``0.8``.

        Raises:
            ConfigurationError: If ``clustering_method`` is not ``"hdbscan"`` or
                ``"agglomerative"``, ``significance_level`` is not in (0, 1),
                ``failure_threshold`` is not in [0, 1], ``n_samples < 2``,
                ``min_slice_size <= 0``, ``max_concurrent_requests <= 0``,
                ``temperature < 0``, or ``consistency_threshold`` is not in [0, 1].
        """
        # Validate everything up front so misconfiguration fails fast, before
        # any embedder or LLM client is constructed.
        if clustering_method not in ("hdbscan", "agglomerative"):
            raise ConfigurationError(
                f"clustering_method must be 'hdbscan' or 'agglomerative', "
                f"got {clustering_method!r}"
            )
        if not 0 < significance_level < 1:
            raise ConfigurationError("significance_level must be in (0, 1)")
        if min_slice_size <= 0:
            raise ConfigurationError("min_slice_size must be > 0")
        if not 0 <= failure_threshold <= 1:
            raise ConfigurationError("failure_threshold must be in [0, 1]")
        if n_samples < 2:
            raise ConfigurationError("n_samples must be >= 2 for entropy scoring")
        if max_concurrent_requests <= 0:
            raise ConfigurationError("max_concurrent_requests must be > 0")
        if temperature < 0:
            raise ConfigurationError("temperature must be >= 0")
        if not 0 <= consistency_threshold <= 1:
            raise ConfigurationError("consistency_threshold must be in [0, 1]")

        self.model = model
        self.embedding_model = embedding_model
        self.significance_level = significance_level
        self.min_slice_size = min_slice_size
        self.failure_threshold = failure_threshold
        self.n_samples = n_samples
        self.clustering_method = clustering_method
        self.max_concurrent_requests = max_concurrent_requests
        self.temperature = temperature
        self.consistency_threshold = consistency_threshold

        # Backend (local vs API) is selected from the embedding model name;
        # see the `embedding_model` docstring above for the prefix rules.
        self._embedder: Embedder = get_embedder(embedding_model)
        self._llm_client = AsyncLLMClient(
            model=model,
            max_concurrent_requests=max_concurrent_requests,
        )

    # ── analyze() ──────────────────────────────────────────

    def analyze(
        self,
        prompts: list[str],
        responses: list[str],
        scores: list[float] | None = None,
        references: list[str] | None = None,
    ) -> AnalysisReport:
        """Discover failure slices — input regions where your LLM fails significantly more.

        Embeds prompts, clusters them, runs statistical tests on each cluster, applies
        Benjamini-Hochberg FDR correction, and names significant clusters via LLM.

        Scoring mode is auto-detected from the arguments:

        - **Mode 1** (``scores`` provided): use pre-computed scores directly.
        - **Mode 2** (``references`` provided): score by cosine similarity between
          response and reference embeddings.
        - **Mode 3** (neither): autonomous scoring via semantic entropy and
          self-consistency (makes additional LLM API calls).
        - Both provided: Mode 1 wins, Mode 2 is ignored (``UserWarning`` raised).

        Args:
            prompts: Input prompts to analyze. Must be non-empty and the same length
                as ``responses``.
            responses: Model responses corresponding to each prompt.
            scores: Mode 1. Pre-computed quality scores in [0, 1] where higher is
                better. Must be the same length as ``prompts``.
            references: Mode 2. Ground-truth reference answers. Must be the same
                length as ``prompts``.

        Returns:
            :class:`AnalysisReport` with ``slices`` sorted by adjusted p-value
            ascending (most significant first). ``print(report)`` produces formatted
            output. ``report.to_dict()`` returns a JSON-serializable dict.

        Raises:
            ConfigurationError: If inputs are empty or length-mismatched.
        """
        # Sync facade: the real pipeline is async (parallel LLM calls);
        # run_sync bridges it for callers without an event loop.
        return run_sync(
            self._analyze_async(prompts, responses, scores, references)
        )

    async def _analyze_async(
        self,
        prompts: list[str],
        responses: list[str],
        scores: list[float] | None,
        references: list[str] | None,
    ) -> AnalysisReport:
        """
        Async orchestration pipeline:

        1. VALIDATE inputs
        2. DETECT scoring mode
        3. SCORE → ScoringResult
        4. BINARIZE → failures = score < threshold
        5. Early return if 0 failures
        6. EMBED prompts (NOT responses)
        7. CLUSTER prompt embeddings
        8. TEST each cluster (chi2 or Fisher)
        9. BH CORRECT p-values
        10. FILTER → keep adjusted_p < alpha
        11. NAME significant clusters via LLM
        12. ASSEMBLE FailureSlice objects
        13. Return AnalysisReport
        """
        # Deferred imports keep heavy scoring/clustering dependencies off the
        # package-import path until an analysis is actually run.
        from .scoring import EntropyScorer, PrecomputedScorer, ReferenceScorer
        from .slicing import (
            benjamini_hochberg,
            cluster_embeddings,
            get_representative_prompts,
            test_cluster_failure_rate,
        )

        # 1. Validate
        validate_inputs(prompts, responses, scores, references)

        # 2. Mode detection — scores take precedence over references.
        if scores is not None and references is not None:
            warnings.warn(
                "Both scores and references provided. Using scores (Mode 1).",
                UserWarning, stacklevel=3,
            )
            references = None

        if scores is not None:
            scoring_mode = "precomputed"
            scorer = PrecomputedScorer(scores)
        elif references is not None:
            scoring_mode = "reference"
            scorer = ReferenceScorer(self._embedder, references)
        else:
            scoring_mode = "entropy"
            scorer = EntropyScorer(
                client=self._llm_client, embedder=self._embedder,
                n_samples=self.n_samples, temperature=self.temperature,
                consistency_threshold=self.consistency_threshold,
            )

        # 3. Score
        logger.info(f"Scoring mode: {scoring_mode}")
        scoring_result: ScoringResult = await scorer.score(prompts, responses)

        # 4. Binarize: a prompt "fails" when its score is strictly below the
        # configured threshold; baseline is the corpus-wide failure rate.
        score_array = np.array(scoring_result.scores)
        failures = score_array < self.failure_threshold
        total_failures = int(np.sum(failures))
        total_prompts = len(prompts)
        baseline = total_failures / total_prompts if total_prompts > 0 else 0.0

        logger.info(
            f"Failures: {total_failures}/{total_prompts} ({baseline:.1%}) "
            f"at threshold={self.failure_threshold}"
        )

        # 5. Early return — no failures means there is nothing to slice, so
        # skip embedding/clustering entirely.
        if total_failures == 0:
            return AnalysisReport(
                slices=[], total_prompts=total_prompts, total_failures=0,
                baseline_failure_rate=0.0, significance_level=self.significance_level,
                failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
                num_clusters_tested=0, num_significant=0,
                clustering_method=self.clustering_method,
                embedding_model=self.embedding_model,
                metadata={"scoring_metadata": scoring_result.metadata},
            )

        # 6. Embed prompts (responses are deliberately NOT embedded here —
        # slices are defined over the input space).
        logger.info("Embedding prompts...")
        prompt_embeddings = self._embedder.embed(prompts)

        # 7. Cluster
        logger.info(f"Clustering ({self.clustering_method})...")
        labels = cluster_embeddings(
            prompt_embeddings, method=self.clustering_method,
            min_cluster_size=self.min_slice_size,
        )

        # Label -1 marks unclustered points; they are never tested.
        unique_labels = sorted(set(labels))
        if -1 in unique_labels:
            unique_labels.remove(-1)

        # 8. Statistical testing: compare each cluster's failure count
        # against the corpus total.
        logger.info(f"Testing {len(unique_labels)} clusters...")
        test_results = []
        for cid in unique_labels:
            mask = labels == cid
            result = test_cluster_failure_rate(
                cluster_failures=int(np.sum(failures[mask])),
                cluster_size=int(np.sum(mask)),
                total_failures=total_failures,
                total_size=total_prompts,
                cluster_id=cid,
            )
            test_results.append(result)

        # 9. BH correction — controls false discovery rate across the many
        # per-cluster tests performed above.
        corrected = benjamini_hochberg(test_results, alpha=self.significance_level)

        # 10. Filter on the *adjusted* p-value.
        significant = [
            r for r in corrected
            if r.adjusted_p_value < self.significance_level
        ]
        logger.info(
            f"{len(significant)}/{len(corrected)} clusters significant "
            f"at alpha={self.significance_level}"
        )

        if not significant:
            return AnalysisReport(
                slices=[], total_prompts=total_prompts,
                total_failures=total_failures, baseline_failure_rate=baseline,
                significance_level=self.significance_level,
                failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
                num_clusters_tested=len(corrected), num_significant=0,
                clustering_method=self.clustering_method,
                embedding_model=self.embedding_model,
                metadata={"scoring_metadata": scoring_result.metadata},
            )

        # 11. Name — only the top-k representative prompts are sent to the
        # LLM for naming; the full index list is kept for the report.
        clusters_texts = []
        clusters_all_indices = []
        for r in significant:
            rep_prompts, _ = get_representative_prompts(
                prompt_embeddings, labels, r.cluster_id, prompts, top_k=10
            )
            clusters_texts.append(rep_prompts)
            # ALL indices in this cluster
            all_idx = np.where(labels == r.cluster_id)[0].tolist()
            clusters_all_indices.append(all_idx)

        logger.info(f"Naming {len(significant)} clusters...")
        cluster_labels = await label_clusters(
            self._llm_client, clusters_texts, context="failure slice"
        )

        # 12. Assemble
        slices: list[FailureSlice] = []
        for r, label, rep_texts, all_idx in zip(
            significant, cluster_labels, clusters_texts, clusters_all_indices
        ):
            # Build examples: top-5 with prompt, response, score
            examples = []
            for idx in all_idx[:5]:
                examples.append({
                    "prompt": prompts[idx],
                    "response": responses[idx],
                    "score": float(score_array[idx]),
                })

            # Effect size = rate ratio vs baseline; inf when baseline is 0
            # (cannot happen past the step-5 early return, but kept defensive).
            effect = r.failure_rate / baseline if baseline > 0 else float('inf')

            slices.append(FailureSlice(
                name=label.name,
                description=label.description,
                size=r.size,
                failure_rate=r.failure_rate,
                baseline_rate=baseline,
                effect_size=round(effect, 2),
                p_value=r.p_value,
                adjusted_p_value=r.adjusted_p_value,
                test_used=r.test_used,
                sample_indices=all_idx,
                examples=examples,
                representative_prompts=rep_texts[:5],
                cluster_id=r.cluster_id,
            ))

        report = AnalysisReport(
            slices=slices, total_prompts=total_prompts,
            total_failures=total_failures, baseline_failure_rate=baseline,
            significance_level=self.significance_level,
            failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
            num_clusters_tested=len(corrected), num_significant=len(slices),
            clustering_method=self.clustering_method,
            embedding_model=self.embedding_model,
            metadata={"scoring_metadata": scoring_result.metadata},
        )

        logger.info(report.summary())
        return report

    # ── audit_coverage() ───────────────────────────────────

    def audit_coverage(
        self,
        test_prompts: list[str],
        production_prompts: list[str],
        distance_threshold: float | None = None,
        min_gap_size: int = 5,
    ) -> CoverageReport:
        """Find semantic blind spots in a test suite by comparing against production traffic.

        Embeds both test and production prompts, uses k-nearest-neighbors to find
        production prompts that have no semantically similar test prompt, clusters those
        uncovered prompts into gap clusters, and names each gap via LLM.

        Args:
            test_prompts: Prompts from your evaluation / test suite.
            production_prompts: Prompts from real production traffic (e.g. application logs).
            distance_threshold: L2 distance cutoff in embedding space. Production prompts
                farther than this from any test prompt are considered "uncovered".
                If ``None`` (default), auto-computed as ``mean(distances) + 1.5 * std(distances)``.
                Set explicitly if the auto-threshold behaves unexpectedly on your data.
            min_gap_size: Minimum number of production prompts required to form a
                reportable gap cluster. Smaller clusters are discarded. Default ``5``.

        Returns:
            :class:`CoverageReport` with ``gaps`` sorted by ``mean_distance`` descending
            (most severe gaps first). ``print(coverage)`` produces formatted output.
            ``coverage.to_dict()`` returns a JSON-serializable dict.

        Raises:
            ConfigurationError: If ``test_prompts`` or ``production_prompts`` is
                empty, ``min_gap_size <= 0``, or ``distance_threshold < 0``.
        """
        # Sync facade over the async implementation, mirroring analyze().
        return run_sync(self._audit_coverage_async(
            test_prompts, production_prompts, distance_threshold, min_gap_size
        ))

    async def _audit_coverage_async(
        self,
        test_prompts: list[str],
        production_prompts: list[str],
        distance_threshold: float | None,
        min_gap_size: int,
    ) -> CoverageReport:
        """
        Pipeline:
        1. Validate
        2. Embed test + production prompts
        3. Detect gaps (NN distances + clustering)
        4. Get representatives per gap
        5. Name gaps via LLM
        6. Assemble CoverageReport
        """
        from .coverage import detect_coverage_gaps
        from .slicing.clustering import get_representative_prompts

        if not test_prompts:
            raise ConfigurationError("test_prompts must be non-empty")
        if not production_prompts:
            raise ConfigurationError("production_prompts must be non-empty")
        if min_gap_size <= 0:
            raise ConfigurationError("min_gap_size must be > 0")
        if distance_threshold is not None and distance_threshold < 0:
            raise ConfigurationError("distance_threshold must be >= 0")

        # Embed
        logger.info("Embedding test prompts...")
        test_emb = self._embedder.embed(test_prompts)
        logger.info("Embedding production prompts...")
        prod_emb = self._embedder.embed(production_prompts)

        # Detect gaps. Label convention (per the bookkeeping below):
        # -1 = covered, -2 = uncovered but unclustered, >= 0 = gap cluster id.
        gap_labels, nn_distances, used_threshold = detect_coverage_gaps(
            test_embeddings=test_emb, prod_embeddings=prod_emb,
            prod_prompts=production_prompts,
            distance_threshold=distance_threshold,
            min_gap_size=min_gap_size,
            clustering_method=self.clustering_method,
        )

        total = len(production_prompts)
        total_uncovered = int(np.sum(gap_labels != -1))
        unclustered_prompt_indices = np.where(gap_labels == -2)[0].tolist()
        coverage_metadata = {
            "num_uncovered_total": total_uncovered,
            "num_clustered_uncovered": int(np.sum(gap_labels >= 0)),
            "num_unclustered_uncovered": len(unclustered_prompt_indices),
            "unclustered_prompt_indices": unclustered_prompt_indices,
        }

        # Keep only real gap-cluster ids for reporting.
        unique_gaps = sorted(set(gap_labels))
        if -1 in unique_gaps:
            unique_gaps.remove(-1)
        if -2 in unique_gaps:
            unique_gaps.remove(-2)

        if not unique_gaps:
            return CoverageReport(
                gaps=[], num_test_prompts=len(test_prompts),
                num_production_prompts=total, num_gaps=0,
                overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
                distance_threshold=used_threshold,
                embedding_model=self.embedding_model,
                metadata=coverage_metadata,
            )

        # Representatives + metadata per gap
        clusters_texts = []
        gaps_meta = []
        for cid in unique_gaps:
            mask = gap_labels == cid
            size = int(np.sum(mask))
            mean_dist = float(np.mean(nn_distances[mask]))
            rep_prompts, _ = get_representative_prompts(
                prod_emb, gap_labels, cid, production_prompts, top_k=10
            )
            clusters_texts.append(rep_prompts)
            all_idx = np.where(mask)[0].tolist()
            gaps_meta.append((cid, size, mean_dist, rep_prompts[:5], all_idx))

        # Name gaps
        logger.info(f"Naming {len(unique_gaps)} coverage gaps...")
        cluster_labels = await label_clusters(
            self._llm_client, clusters_texts, context="coverage gap"
        )

        # Assemble — labels come back in the same order as clusters_texts,
        # which matches gaps_meta by construction.
        gaps: list[CoverageGap] = []
        for label, (cid, size, mean_dist, rep_prompts, all_idx) in zip(
            cluster_labels, gaps_meta
        ):
            gaps.append(CoverageGap(
                name=label.name, description=label.description,
                size=size, mean_distance=mean_dist,
                representative_prompts=rep_prompts,
                prompt_indices=all_idx, cluster_id=cid,
            ))

        # Most severe gaps (largest mean NN distance) first.
        gaps.sort(key=lambda g: g.mean_distance, reverse=True)

        report = CoverageReport(
            gaps=gaps, num_test_prompts=len(test_prompts),
            num_production_prompts=total, num_gaps=len(gaps),
            overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
            distance_threshold=used_threshold,
            embedding_model=self.embedding_model,
            metadata=coverage_metadata,
        )

        logger.info(report.summary())
        return report
@@ -0,0 +1,3 @@
1
"""Coverage-gap detection subpackage: re-exports the public detector entry point."""

from .detector import detect_coverage_gaps

__all__ = ["detect_coverage_gaps"]