faultmap 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faultmap/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """faultmap: Automatically discover where and why your LLM is failing."""
2
+
3
+ from .analyzer import SliceAnalyzer
4
+ from .exceptions import (
5
+ ClusteringError,
6
+ ConfigurationError,
7
+ EmbeddingError,
8
+ FaultmapError,
9
+ LLMError,
10
+ ScoringError,
11
+ )
12
+ from .models import (
13
+ AnalysisReport,
14
+ CoverageGap,
15
+ CoverageReport,
16
+ FailureSlice,
17
+ ScoringResult,
18
+ )
19
+
20
+ __version__ = "0.3.0"
21
+
22
+ __all__ = [
23
+ "SliceAnalyzer",
24
+ "AnalysisReport",
25
+ "FailureSlice",
26
+ "CoverageReport",
27
+ "CoverageGap",
28
+ "ScoringResult",
29
+ "FaultmapError",
30
+ "EmbeddingError",
31
+ "ScoringError",
32
+ "LLMError",
33
+ "ClusteringError",
34
+ "ConfigurationError",
35
+ "__version__",
36
+ ]
faultmap/analyzer.py ADDED
@@ -0,0 +1,557 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import warnings
5
+
6
+ import numpy as np
7
+
8
+ from .embeddings import Embedder, get_embedder
9
+ from .exceptions import ConfigurationError
10
+ from .labeling import label_clusters
11
+ from .llm import AsyncLLMClient
12
+ from .models import (
13
+ AnalysisReport,
14
+ CoverageGap,
15
+ CoverageReport,
16
+ FailureSlice,
17
+ ScoringResult,
18
+ )
19
+ from .utils import run_sync, validate_inputs
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class SliceAnalyzer:
    """Discover failure slices and coverage gaps in LLM evaluations.

    ``SliceAnalyzer`` is faultmap's main entry point and exposes two
    operations:

    - :meth:`analyze` — surface input slices whose failure rate is
      statistically elevated relative to the overall baseline
    - :meth:`audit_coverage` — surface semantic blind spots in a test suite
      compared against production traffic

    Example::

        from faultmap import SliceAnalyzer

        analyzer = SliceAnalyzer(model="gpt-4o-mini")
        report = analyzer.analyze(prompts, responses, scores=scores)
        print(report)

        coverage = analyzer.audit_coverage(test_prompts, prod_prompts)
        print(coverage)
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        embedding_model: str = "text-embedding-3-small",
        embedding_max_text_chars: int | None = 2000,
        embedding_request_kwargs: dict[str, object] | None = None,
        embedding_usage_kwargs: dict[str, dict[str, object]] | None = None,
        significance_level: float = 0.05,
        min_slice_size: int = 10,
        failure_threshold: float = 0.5,
        n_samples: int = 8,
        clustering_method: str = "hdbscan",
        max_concurrent_requests: int = 50,
        temperature: float = 1.0,
        consistency_threshold: float = 0.8,
    ) -> None:
        """Create an analyzer, validating every configuration value up front.

        Args:
            model: litellm model string used for all LLM calls (cluster naming
                and Mode 3 response sampling). Supports 100+ providers, e.g.
                ``"gpt-4o-mini"``, ``"anthropic/claude-3-haiku-20240307"``,
                ``"ollama/mistral"``.
            embedding_model: Embedding model name. Local sentence-transformers
                models are auto-detected by prefix (``"all-MiniLM-"``,
                ``"all-mpnet-"``, ``"paraphrase-"``); anything else routes to
                the API embedder via litellm. The default is API-backed so a
                plain ``pip install faultmap`` works; local models need
                ``pip install faultmap[local]``.
            embedding_max_text_chars: Character-level truncation applied to API
                embedding inputs (avoids provider token limits). ``None``
                disables truncation. Ignored for local models.
            embedding_request_kwargs: Extra kwargs forwarded to every API
                embedding call. Ignored for local models.
            embedding_usage_kwargs: Per-usage API kwargs keyed by ``"query"``
                and/or ``"document"`` for asymmetric embedding models. Ignored
                for local models.
            significance_level: Benjamini-Hochberg FDR alpha; slices with
                ``adjusted_p_value < significance_level`` are reported. Must be
                in (0, 1).
            min_slice_size: Minimum cluster size eligible for testing; smaller
                clusters are silently discarded. Must be > 0.
            failure_threshold: Binary pass/fail cutoff — a prompt whose score
                is below this counts as a failure. Must be in [0, 1].
            n_samples: Mode 3 only. Extra LLM responses sampled per prompt for
                entropy estimation; more samples cost more but estimate better.
                Must be >= 2.
            clustering_method: ``"hdbscan"`` (density-based, auto cluster
                count) or ``"agglomerative"`` (Ward linkage with
                silhouette-based k-selection over [5, 10, 15, 20, 25, 30]).
            max_concurrent_requests: Ceiling on parallel LLM API calls
                (asyncio semaphore). Lower it when hitting rate limits. Must
                be > 0.
            temperature: Mode 3 only. Sampling temperature for response
                diversity. Must be >= 0.
            consistency_threshold: Mode 3 only. Cosine-similarity cutoff above
                which a sampled response counts as "consistent" with the
                original. Must be in [0, 1].

        Raises:
            ConfigurationError: If any argument falls outside the ranges
                documented above.
        """
        # Fail fast on bad configuration. Checks run in a fixed order, so the
        # first invalid argument is the one reported. Chained comparisons are
        # kept (rather than split inequalities) so NaN inputs also raise.
        if clustering_method not in ("hdbscan", "agglomerative"):
            raise ConfigurationError(
                f"clustering_method must be 'hdbscan' or 'agglomerative', "
                f"got {clustering_method!r}"
            )
        if not (0 < significance_level < 1):
            raise ConfigurationError("significance_level must be in (0, 1)")
        if min_slice_size <= 0:
            raise ConfigurationError("min_slice_size must be > 0")
        if not (0 <= failure_threshold <= 1):
            raise ConfigurationError("failure_threshold must be in [0, 1]")
        if n_samples < 2:
            raise ConfigurationError("n_samples must be >= 2 for entropy scoring")
        if max_concurrent_requests <= 0:
            raise ConfigurationError("max_concurrent_requests must be > 0")
        if temperature < 0:
            raise ConfigurationError("temperature must be >= 0")
        if not (0 <= consistency_threshold <= 1):
            raise ConfigurationError("consistency_threshold must be in [0, 1]")

        # Plain attribute copies of the validated configuration.
        self.model = model
        self.embedding_model = embedding_model
        self.embedding_max_text_chars = embedding_max_text_chars
        self.embedding_request_kwargs = embedding_request_kwargs
        self.embedding_usage_kwargs = embedding_usage_kwargs
        self.significance_level = significance_level
        self.min_slice_size = min_slice_size
        self.failure_threshold = failure_threshold
        self.n_samples = n_samples
        self.clustering_method = clustering_method
        self.max_concurrent_requests = max_concurrent_requests
        self.temperature = temperature
        self.consistency_threshold = consistency_threshold

        # Shared collaborators: one embedder and one async LLM client reused
        # across analyze() / audit_coverage() calls.
        self._embedder: Embedder = get_embedder(
            embedding_model,
            api_max_text_chars=embedding_max_text_chars,
            api_request_kwargs=embedding_request_kwargs,
            api_usage_request_kwargs=embedding_usage_kwargs,
        )
        self._llm_client = AsyncLLMClient(
            model=model,
            max_concurrent_requests=max_concurrent_requests,
        )
159
+
160
+ # ── analyze() ──────────────────────────────────────────
161
+
162
+ def analyze(
163
+ self,
164
+ prompts: list[str],
165
+ responses: list[str],
166
+ scores: list[float] | None = None,
167
+ references: list[str] | None = None,
168
+ ) -> AnalysisReport:
169
+ """Discover failure slices — input regions where your LLM fails significantly more.
170
+
171
+ Embeds prompts, clusters them, runs statistical tests on each cluster, applies
172
+ Benjamini-Hochberg FDR correction, and names significant clusters via LLM.
173
+
174
+ Scoring mode is auto-detected from the arguments:
175
+
176
+ - **Mode 1** (``scores`` provided): use pre-computed scores directly.
177
+ - **Mode 2** (``references`` provided): score by cosine similarity between
178
+ response and reference embeddings.
179
+ - **Mode 3** (neither): autonomous scoring via semantic entropy and
180
+ self-consistency (makes additional LLM API calls).
181
+ - Both provided: Mode 1 wins, Mode 2 is ignored (``UserWarning`` raised).
182
+
183
+ Args:
184
+ prompts: Input prompts to analyze. Must be non-empty and the same length
185
+ as ``responses``.
186
+ responses: Model responses corresponding to each prompt.
187
+ scores: Mode 1. Pre-computed quality scores in [0, 1] where higher is
188
+ better. Must be the same length as ``prompts``.
189
+ references: Mode 2. Ground-truth reference answers. Must be the same
190
+ length as ``prompts``.
191
+
192
+ Returns:
193
+ :class:`AnalysisReport` with ``slices`` sorted by adjusted p-value
194
+ ascending (most significant first). ``print(report)`` produces formatted
195
+ output. ``report.to_dict()`` returns a JSON-serializable dict.
196
+
197
+ Raises:
198
+ ConfigurationError: If inputs are empty or length-mismatched.
199
+ """
200
+ return run_sync(
201
+ self._analyze_async(prompts, responses, scores, references)
202
+ )
203
+
204
+ async def _analyze_async(
205
+ self,
206
+ prompts: list[str],
207
+ responses: list[str],
208
+ scores: list[float] | None,
209
+ references: list[str] | None,
210
+ ) -> AnalysisReport:
211
+ """
212
+ Async orchestration pipeline:
213
+
214
+ 1. VALIDATE inputs
215
+ 2. DETECT scoring mode
216
+ 3. SCORE → ScoringResult
217
+ 4. BINARIZE → failures = score < threshold
218
+ 5. Early return if 0 failures
219
+ 6. EMBED prompts (NOT responses)
220
+ 7. CLUSTER prompt embeddings
221
+ 8. TEST each cluster (chi2 or Fisher)
222
+ 9. BH CORRECT p-values
223
+ 10. FILTER → keep adjusted_p < alpha
224
+ 11. NAME significant clusters via LLM
225
+ 12. ASSEMBLE FailureSlice objects
226
+ 13. Return AnalysisReport
227
+ """
228
+ from .scoring import EntropyScorer, PrecomputedScorer, ReferenceScorer
229
+ from .slicing import (
230
+ benjamini_hochberg,
231
+ cluster_embeddings,
232
+ get_representative_prompts,
233
+ test_cluster_failure_rate,
234
+ )
235
+
236
+ # 1. Validate
237
+ validate_inputs(prompts, responses, scores, references)
238
+
239
+ # 2. Mode detection
240
+ if scores is not None and references is not None:
241
+ warnings.warn(
242
+ "Both scores and references provided. Using scores (Mode 1).",
243
+ UserWarning, stacklevel=3,
244
+ )
245
+ references = None
246
+
247
+ if scores is not None:
248
+ scoring_mode = "precomputed"
249
+ scorer = PrecomputedScorer(scores)
250
+ elif references is not None:
251
+ scoring_mode = "reference"
252
+ scorer = ReferenceScorer(self._embedder, references)
253
+ else:
254
+ scoring_mode = "entropy"
255
+ scorer = EntropyScorer(
256
+ client=self._llm_client, embedder=self._embedder,
257
+ n_samples=self.n_samples, temperature=self.temperature,
258
+ consistency_threshold=self.consistency_threshold,
259
+ )
260
+
261
+ # 3. Score
262
+ logger.info(f"Scoring mode: {scoring_mode}")
263
+ scoring_result: ScoringResult = await scorer.score(prompts, responses)
264
+
265
+ # 4. Binarize
266
+ score_array = np.array(scoring_result.scores)
267
+ failures = score_array < self.failure_threshold
268
+ total_failures = int(np.sum(failures))
269
+ total_prompts = len(prompts)
270
+ baseline = total_failures / total_prompts if total_prompts > 0 else 0.0
271
+
272
+ logger.info(
273
+ f"Failures: {total_failures}/{total_prompts} ({baseline:.1%}) "
274
+ f"at threshold={self.failure_threshold}"
275
+ )
276
+
277
+ # 5. Early return
278
+ if total_failures == 0:
279
+ return AnalysisReport(
280
+ slices=[], total_prompts=total_prompts, total_failures=0,
281
+ baseline_failure_rate=0.0, significance_level=self.significance_level,
282
+ failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
283
+ num_clusters_tested=0, num_significant=0,
284
+ clustering_method=self.clustering_method,
285
+ embedding_model=self.embedding_model,
286
+ metadata={"scoring_metadata": scoring_result.metadata},
287
+ )
288
+
289
+ # 6. Embed prompts
290
+ logger.info("Embedding prompts...")
291
+ prompt_embeddings = self._embedder.embed_queries(prompts)
292
+
293
+ # 7. Cluster
294
+ logger.info(f"Clustering ({self.clustering_method})...")
295
+ labels = cluster_embeddings(
296
+ prompt_embeddings, method=self.clustering_method,
297
+ min_cluster_size=self.min_slice_size,
298
+ )
299
+
300
+ unique_labels = sorted(set(labels))
301
+ if -1 in unique_labels:
302
+ unique_labels.remove(-1)
303
+
304
+ # 8. Statistical testing
305
+ logger.info(f"Testing {len(unique_labels)} clusters...")
306
+ test_results = []
307
+ for cid in unique_labels:
308
+ mask = labels == cid
309
+ result = test_cluster_failure_rate(
310
+ cluster_failures=int(np.sum(failures[mask])),
311
+ cluster_size=int(np.sum(mask)),
312
+ total_failures=total_failures,
313
+ total_size=total_prompts,
314
+ cluster_id=cid,
315
+ )
316
+ test_results.append(result)
317
+
318
+ # 9. BH correction
319
+ corrected = benjamini_hochberg(test_results, alpha=self.significance_level)
320
+
321
+ # 10. Filter
322
+ significant = [
323
+ r for r in corrected
324
+ if r.adjusted_p_value < self.significance_level
325
+ ]
326
+ logger.info(
327
+ f"{len(significant)}/{len(corrected)} clusters significant "
328
+ f"at alpha={self.significance_level}"
329
+ )
330
+
331
+ if not significant:
332
+ return AnalysisReport(
333
+ slices=[], total_prompts=total_prompts,
334
+ total_failures=total_failures, baseline_failure_rate=baseline,
335
+ significance_level=self.significance_level,
336
+ failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
337
+ num_clusters_tested=len(corrected), num_significant=0,
338
+ clustering_method=self.clustering_method,
339
+ embedding_model=self.embedding_model,
340
+ metadata={"scoring_metadata": scoring_result.metadata},
341
+ )
342
+
343
+ # 11. Name
344
+ clusters_texts = []
345
+ clusters_all_indices = []
346
+ for r in significant:
347
+ rep_prompts, _ = get_representative_prompts(
348
+ prompt_embeddings, labels, r.cluster_id, prompts, top_k=10
349
+ )
350
+ clusters_texts.append(rep_prompts)
351
+ # ALL indices in this cluster
352
+ all_idx = np.where(labels == r.cluster_id)[0].tolist()
353
+ clusters_all_indices.append(all_idx)
354
+
355
+ logger.info(f"Naming {len(significant)} clusters...")
356
+ cluster_labels = await label_clusters(
357
+ self._llm_client, clusters_texts, context="failure slice"
358
+ )
359
+
360
+ # 12. Assemble
361
+ slices: list[FailureSlice] = []
362
+ for r, label, rep_texts, all_idx in zip(
363
+ significant, cluster_labels, clusters_texts, clusters_all_indices
364
+ ):
365
+ # Build examples: top-5 with prompt, response, score
366
+ examples = []
367
+ for idx in all_idx[:5]:
368
+ examples.append({
369
+ "prompt": prompts[idx],
370
+ "response": responses[idx],
371
+ "score": float(score_array[idx]),
372
+ })
373
+
374
+ effect = r.failure_rate / baseline if baseline > 0 else float('inf')
375
+
376
+ slices.append(FailureSlice(
377
+ name=label.name,
378
+ description=label.description,
379
+ size=r.size,
380
+ failure_rate=r.failure_rate,
381
+ baseline_rate=baseline,
382
+ effect_size=round(effect, 2),
383
+ p_value=r.p_value,
384
+ adjusted_p_value=r.adjusted_p_value,
385
+ test_used=r.test_used,
386
+ sample_indices=all_idx,
387
+ examples=examples,
388
+ representative_prompts=rep_texts[:5],
389
+ cluster_id=r.cluster_id,
390
+ ))
391
+
392
+ report = AnalysisReport(
393
+ slices=slices, total_prompts=total_prompts,
394
+ total_failures=total_failures, baseline_failure_rate=baseline,
395
+ significance_level=self.significance_level,
396
+ failure_threshold=self.failure_threshold, scoring_mode=scoring_mode,
397
+ num_clusters_tested=len(corrected), num_significant=len(slices),
398
+ clustering_method=self.clustering_method,
399
+ embedding_model=self.embedding_model,
400
+ metadata={"scoring_metadata": scoring_result.metadata},
401
+ )
402
+
403
+ logger.info(report.summary())
404
+ return report
405
+
406
+ # ── audit_coverage() ───────────────────────────────────
407
+
408
+ def audit_coverage(
409
+ self,
410
+ test_prompts: list[str],
411
+ production_prompts: list[str],
412
+ distance_threshold: float | None = None,
413
+ min_gap_size: int = 5,
414
+ ) -> CoverageReport:
415
+ """Find semantic blind spots in a test suite by comparing against production traffic.
416
+
417
+ Embeds both test and production prompts, uses k-nearest-neighbors to find
418
+ production prompts that have no semantically similar test prompt, clusters those
419
+ uncovered prompts into gap clusters, and names each gap via LLM.
420
+
421
+ Args:
422
+ test_prompts: Prompts from your evaluation / test suite.
423
+ production_prompts: Prompts from real production traffic (e.g. application logs).
424
+ distance_threshold: L2 distance cutoff in embedding space. Production prompts
425
+ farther than this from any test prompt are considered "uncovered".
426
+ If ``None`` (default), auto-computed as ``mean(distances) + 1.5 * std(distances)``.
427
+ Set explicitly if the auto-threshold behaves unexpectedly on your data.
428
+ min_gap_size: Minimum number of production prompts required to form a
429
+ reportable gap cluster. Smaller clusters are discarded. Default ``5``.
430
+
431
+ Returns:
432
+ :class:`CoverageReport` with ``gaps`` sorted by ``mean_distance`` descending
433
+ (most severe gaps first). ``print(coverage)`` produces formatted output.
434
+ ``coverage.to_dict()`` returns a JSON-serializable dict.
435
+
436
+ Raises:
437
+ ConfigurationError: If ``test_prompts`` or ``production_prompts`` is
438
+ empty, ``min_gap_size <= 0``, or ``distance_threshold < 0``.
439
+ """
440
+ return run_sync(self._audit_coverage_async(
441
+ test_prompts, production_prompts, distance_threshold, min_gap_size
442
+ ))
443
+
444
+ async def _audit_coverage_async(
445
+ self,
446
+ test_prompts: list[str],
447
+ production_prompts: list[str],
448
+ distance_threshold: float | None,
449
+ min_gap_size: int,
450
+ ) -> CoverageReport:
451
+ """
452
+ Pipeline:
453
+ 1. Validate
454
+ 2. Embed test + production prompts
455
+ 3. Detect gaps (NN distances + clustering)
456
+ 4. Get representatives per gap
457
+ 5. Name gaps via LLM
458
+ 6. Assemble CoverageReport
459
+ """
460
+ from .coverage import detect_coverage_gaps
461
+ from .slicing.clustering import get_representative_prompts
462
+
463
+ if not test_prompts:
464
+ raise ConfigurationError("test_prompts must be non-empty")
465
+ if not production_prompts:
466
+ raise ConfigurationError("production_prompts must be non-empty")
467
+ if min_gap_size <= 0:
468
+ raise ConfigurationError("min_gap_size must be > 0")
469
+ if distance_threshold is not None and distance_threshold < 0:
470
+ raise ConfigurationError("distance_threshold must be >= 0")
471
+
472
+ # Embed
473
+ logger.info("Embedding test prompts...")
474
+ test_emb = self._embedder.embed_queries(test_prompts)
475
+ logger.info("Embedding production prompts...")
476
+ prod_emb = self._embedder.embed_queries(production_prompts)
477
+
478
+ # Detect gaps
479
+ gap_labels, nn_distances, used_threshold = detect_coverage_gaps(
480
+ test_embeddings=test_emb, prod_embeddings=prod_emb,
481
+ prod_prompts=production_prompts,
482
+ distance_threshold=distance_threshold,
483
+ min_gap_size=min_gap_size,
484
+ clustering_method=self.clustering_method,
485
+ )
486
+
487
+ total = len(production_prompts)
488
+ total_uncovered = int(np.sum(gap_labels != -1))
489
+ unclustered_prompt_indices = np.where(gap_labels == -2)[0].tolist()
490
+ coverage_metadata = {
491
+ "num_uncovered_total": total_uncovered,
492
+ "num_clustered_uncovered": int(np.sum(gap_labels >= 0)),
493
+ "num_unclustered_uncovered": len(unclustered_prompt_indices),
494
+ "unclustered_prompt_indices": unclustered_prompt_indices,
495
+ }
496
+
497
+ unique_gaps = sorted(set(gap_labels))
498
+ if -1 in unique_gaps:
499
+ unique_gaps.remove(-1)
500
+ if -2 in unique_gaps:
501
+ unique_gaps.remove(-2)
502
+
503
+ if not unique_gaps:
504
+ return CoverageReport(
505
+ gaps=[], num_test_prompts=len(test_prompts),
506
+ num_production_prompts=total, num_gaps=0,
507
+ overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
508
+ distance_threshold=used_threshold,
509
+ embedding_model=self.embedding_model,
510
+ metadata=coverage_metadata,
511
+ )
512
+
513
+ # Representatives + metadata per gap
514
+ clusters_texts = []
515
+ gaps_meta = []
516
+ for cid in unique_gaps:
517
+ mask = gap_labels == cid
518
+ size = int(np.sum(mask))
519
+ mean_dist = float(np.mean(nn_distances[mask]))
520
+ rep_prompts, _ = get_representative_prompts(
521
+ prod_emb, gap_labels, cid, production_prompts, top_k=10
522
+ )
523
+ clusters_texts.append(rep_prompts)
524
+ all_idx = np.where(mask)[0].tolist()
525
+ gaps_meta.append((cid, size, mean_dist, rep_prompts[:5], all_idx))
526
+
527
+ # Name gaps
528
+ logger.info(f"Naming {len(unique_gaps)} coverage gaps...")
529
+ cluster_labels = await label_clusters(
530
+ self._llm_client, clusters_texts, context="coverage gap"
531
+ )
532
+
533
+ # Assemble
534
+ gaps: list[CoverageGap] = []
535
+ for label, (cid, size, mean_dist, rep_prompts, all_idx) in zip(
536
+ cluster_labels, gaps_meta
537
+ ):
538
+ gaps.append(CoverageGap(
539
+ name=label.name, description=label.description,
540
+ size=size, mean_distance=mean_dist,
541
+ representative_prompts=rep_prompts,
542
+ prompt_indices=all_idx, cluster_id=cid,
543
+ ))
544
+
545
+ gaps.sort(key=lambda g: g.mean_distance, reverse=True)
546
+
547
+ report = CoverageReport(
548
+ gaps=gaps, num_test_prompts=len(test_prompts),
549
+ num_production_prompts=total, num_gaps=len(gaps),
550
+ overall_coverage_score=1.0 - (total_uncovered / total if total else 0),
551
+ distance_threshold=used_threshold,
552
+ embedding_model=self.embedding_model,
553
+ metadata=coverage_metadata,
554
+ )
555
+
556
+ logger.info(report.summary())
557
+ return report
@@ -0,0 +1,3 @@
1
"""Coverage subpackage: re-exports the gap-detection entry point."""

from .detector import detect_coverage_gaps

# Explicit public API of the subpackage.
__all__ = ["detect_coverage_gaps"]