codebase-stats 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
1
+ """Codebase Statistics Analysis Library
2
+
3
+ A comprehensive library for analyzing Python codebase metrics including coverage,
4
+ test duration, code complexity, maintainability, and more.
5
+
6
+ Main Modules:
7
+ - core: Data loading and preprocessing
8
+ - coverage: Coverage analysis and reporting
9
+ - metrics: Code quality metrics (complexity, maintainability, etc.)
10
+ - duration: Test duration analysis
11
+ - sizes: File size distribution analysis
12
+ - tree: Directory structure analysis
13
+ - lowcov: Low-coverage file identification and prioritization
14
+ - utils: Utility functions for formatting and analysis
15
+
16
+ Example Usage:
17
+ from codebase_stats import load_coverage, precompute_coverage_stats
18
+ from codebase_stats import show_coverage_histogram
19
+
20
+ # Load and analyze coverage data
21
+ data = load_coverage("coverage.json")
22
+ stats = precompute_coverage_stats(data)
23
+
24
+ # Display coverage report
25
+ show_coverage_histogram(stats, bins=10, show_blame=True)
26
+ """
27
+
28
+ # Core data functions
29
+ from .core import (
30
+ load_coverage,
31
+ load_report,
32
+ precompute_coverage_stats,
33
+ extract_layer,
34
+ build_suffix_index,
35
+ suffix_lookup,
36
+ LAYER_MAP,
37
+ LAYER_ORDER,
38
+ )
39
+
40
+ # Coverage analysis
41
+ from .coverage import (
42
+ show_coverage_histogram,
43
+ scan_pragma_counts,
44
+ scan_pragma_intervals,
45
+ show_pragma_histogram,
46
+ )
47
+
48
+ # Code quality metrics
49
+ from .metrics import (
50
+ run_radon,
51
+ run_radon_mi,
52
+ run_radon_raw,
53
+ run_radon_hal,
54
+ show_complexity_histogram,
55
+ show_mi_histogram,
56
+ show_raw_histogram,
57
+ show_hal_histogram,
58
+ cc_rank,
59
+ mi_rank,
60
+ )
61
+
62
+ # Test duration
63
+ from .duration import (
64
+ show_duration_histogram,
65
+ test_duration,
66
+ )
67
+
68
+ # File sizes
69
+ from .sizes import (
70
+ show_file_size_distribution,
71
+ )
72
+
73
+ # Tree analysis
74
+ from .tree import (
75
+ analyze_tree,
76
+ )
77
+
78
+ # Low coverage reporting
79
+ from .lowcov import (
80
+ show_low_coverage,
81
+ parse_sorts,
82
+ priority_score,
83
+ VALID_SORT_FIELDS,
84
+ )
85
+
86
+ # Utilities
87
+ from .utils import (
88
+ percentile,
89
+ format_line_ranges,
90
+ ascii_histogram,
91
+ blame_header,
92
+ fmt_seconds,
93
+ )
94
+
95
+ # Reporter
96
+ from .reporter import (
97
+ CodebaseStatsReporter,
98
+ )
99
+
100
# Package version string (PEP 440).
# NOTE(review): the published wheel filename says 0.0.1 while __version__
# says "0.1.0" — confirm which is authoritative before the next release.
__version__ = "0.1.0"

# Explicit public API: the canonical list of names this package re-exports
# (and what `from codebase_stats import *` would bring in).
__all__ = [
    # Core
    "load_coverage",
    "load_report",
    "precompute_coverage_stats",
    "extract_layer",
    "build_suffix_index",
    "suffix_lookup",
    "LAYER_MAP",
    "LAYER_ORDER",
    # Coverage
    "show_coverage_histogram",
    "scan_pragma_counts",
    "scan_pragma_intervals",
    "show_pragma_histogram",
    # Metrics
    "run_radon",
    "run_radon_mi",
    "run_radon_raw",
    "run_radon_hal",
    "show_complexity_histogram",
    "show_mi_histogram",
    "show_raw_histogram",
    "show_hal_histogram",
    "cc_rank",
    "mi_rank",
    # Duration
    "show_duration_histogram",
    "test_duration",
    # Sizes
    "show_file_size_distribution",
    # Tree
    "analyze_tree",
    # Low coverage
    "show_low_coverage",
    "parse_sorts",
    "priority_score",
    "VALID_SORT_FIELDS",
    # Utils
    "percentile",
    "format_line_ranges",
    "ascii_histogram",
    "blame_header",
    "fmt_seconds",
    # Reporter
    "CodebaseStatsReporter",
]
codebase_stats/core.py ADDED
@@ -0,0 +1,240 @@
1
+ """Core data loading and preprocessing functions."""
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
def load_coverage(path: str) -> dict:
    """Load and parse a coverage.json file.

    Args:
        path: Path to coverage.json file

    Returns:
        Parsed coverage data dictionary

    Raises:
        SystemExit: On file not found, invalid JSON, or missing 'files' key
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        # Exit with a friendly message instead of raising a bare KeyError:
        # the docstring promises SystemExit, and this mirrors load_report(),
        # which sys.exit()s on a missing top-level key. Previously the
        # KeyError escaped the try (no matching except) as a raw traceback.
        if "files" not in data:
            sys.exit(f"❌ '{path}' has no 'files' key — is this a coverage.json file?")
        return data
    except FileNotFoundError:
        sys.exit(f"❌ File not found: {path}")
    except json.JSONDecodeError:
        sys.exit(f"❌ Invalid JSON: {path}")
30
+
31
+
32
def load_report(path: str) -> dict:
    """Load and parse a pytest-json-report file.

    Args:
        path: Path to pytest JSON report file

    Returns:
        Parsed report data dictionary

    Raises:
        SystemExit: On file not found or invalid JSON
    """
    # Read, parse, and validate in three explicit steps so each failure mode
    # maps to exactly one friendly exit message.
    try:
        raw = Path(path).read_text(encoding="utf-8")
    except FileNotFoundError:
        sys.exit(f"❌ File not found: {path}")
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        sys.exit(f"❌ Invalid JSON: {path}")
    # Sanity check: a pytest-json-report always carries a 'tests' array.
    if "tests" not in data:
        sys.exit(f"❌ '{path}' has no 'tests' key — is this a pytest-json-report file?")
    return data
54
+
55
+
56
# Layer mapping and ordering constants.
# Keys are lowercased path segments; values are display names.
LAYER_MAP = {
    "domain": "Domain",
    "application": "Application",
    "infrastructure": "Infrastructure",
    "services": "Services",
    "repositories": "Repositories",
    "use_cases": "Use Cases",
    "api": "API",
}

# Sort/display order for layers; lower number sorts first, "Other" last.
LAYER_ORDER = {
    "Domain": 1,
    "Application": 2,
    "Services": 3,
    "Use Cases": 4,
    "Repositories": 5,
    "Infrastructure": 6,
    "API": 7,
    "Other": 8,
}


def extract_layer(path: str) -> str:
    """Extract the architectural layer from a file path.

    Args:
        path: File path to analyze

    Returns:
        Layer name (e.g., "Domain", "Application", "Other")
    """
    segments = {segment.lower() for segment in Path(path).parts}
    # First LAYER_MAP entry (declaration order) whose key appears as a path
    # segment wins; paths matching no known layer fall back to "Other".
    return next(
        (name for key, name in LAYER_MAP.items() if key in segments),
        "Other",
    )
93
+
94
+
95
def build_suffix_index(radon_map: dict) -> dict:
    """Build a suffix-keyed index for path matching.

    This allows matching coverage.json paths (which may use different root
    prefixes or be absolute vs relative) to radon output.

    Strategy: for each radon key, store it under every possible suffix:
    '/home/user/proj/app/foo/bar.py' → keys: 'bar.py', 'foo/bar.py',
    'app/foo/bar.py', ...
    Then look up a coverage path by trying progressively longer suffixes
    until one hits. The longest match wins to avoid false collisions.

    Args:
        radon_map: Dictionary mapping file paths to radon analysis data

    Returns:
        Dictionary mapping suffixes to radon values
    """
    index: dict[str, object] = {}  # suffix_str → radon value
    for path_key, value in radon_map.items():
        segments = Path(path_key).parts
        # setdefault keeps the FIRST writer for each suffix key, matching
        # the original first-writer-wins behaviour.
        for start in range(len(segments)):
            index.setdefault("/".join(segments[start:]), value)
    return index
122
+
123
+
124
def suffix_lookup(index: dict, coverage_path: str):
    """Look up a coverage.json path in a suffix index.

    Try suffixes from most-specific (full path) down to basename.

    Args:
        index: Index built by build_suffix_index()
        coverage_path: Path from coverage.json to look up

    Returns:
        The radon value if found, None otherwise
    """
    segments = Path(coverage_path).parts
    # Generate candidates longest-first: full path, then progressively
    # shorter suffixes, ending at the bare basename.
    candidates = ("/".join(segments[i:]) for i in range(len(segments)))
    for candidate in candidates:
        if candidate in index:
            return index[candidate]
    return None
142
+
143
+
144
def precompute_coverage_stats(
    data: dict,
    complexity_map: dict | None = None,
    mi_map: dict | None = None,
    raw_map: dict | None = None,
    hal_map: dict | None = None,
) -> dict:
    """Precompute statistics from coverage.json and radon data.

    One pass over data['files']. Returns everything both histogram and
    list functions need so neither has to re-iterate the JSON.

    Args:
        data: Parsed coverage.json data
        complexity_map: Radon cyclomatic complexity data (optional)
        mi_map: Radon maintainability index data (optional)
        raw_map: Radon raw metrics data (optional)
        hal_map: Radon Halstead metrics data (optional)

    Returns:
        Dictionary with precomputed statistics including file_stats, project totals,
        and coverage percentiles
    """
    t = data.get("totals", {})
    proj_total = t.get("num_statements", 0)
    proj_covered = t.get("covered_lines", 0)
    # When the 'totals' block is absent (or reports 0 statements), fall back
    # to summing the per-file summaries. This must be decided ONCE, before
    # the loop: the previous per-iteration `if proj_total == 0` guard went
    # false as soon as the first non-empty file was added, so the "project
    # total" silently counted only one file.
    accumulate_totals = proj_total == 0

    # Build suffix indexes once — O(n·depth) — so the per-file lookup is O(depth)
    _cc_idx = build_suffix_index(complexity_map) if complexity_map else {}
    _mi_idx = build_suffix_index(mi_map) if mi_map else {}
    _raw_idx = build_suffix_index(raw_map) if raw_map else {}
    _hal_idx = build_suffix_index(hal_map) if hal_map else {}

    file_stats = []  # one dict per file with non-zero statements
    for path, info in data["files"].items():
        s = info.get("summary", {})
        total = s.get("num_statements", 0)
        if total == 0:
            # Empty files carry no signal; skip them entirely.
            continue
        covered = s.get("covered_lines", 0)
        pct = s.get("percent_covered", 0.0)
        missing = info.get("missing_lines") or []
        layer = extract_layer(path)

        if accumulate_totals:  # totals block missing — accumulate across ALL files
            proj_total += total
            proj_covered += covered

        cc = suffix_lookup(_cc_idx, path) if _cc_idx else None
        mi = suffix_lookup(_mi_idx, path) if _mi_idx else None
        raw = suffix_lookup(_raw_idx, path) if _raw_idx else None
        hal = suffix_lookup(_hal_idx, path) if _hal_idx else None
        file_stats.append(
            {
                "path": path,
                "pct": pct,
                "total": total,
                "covered": covered,
                "missing_count": len(missing),
                "missing_lines": missing,
                "layer": layer,
                "layer_order": LAYER_ORDER.get(layer, 8),
                # radon cc
                "cc_avg": cc["avg"] if cc else None,
                "cc_max": cc["max"] if cc else None,
                "cc_n_blocks": cc["n_blocks"] if cc else None,
                "cc_scores": cc["scores"] if cc else None,  # individual function scores
                # radon mi
                "mi": mi["mi"] if mi else None,
                "mi_rank": mi["rank"] if mi else None,
                # radon raw
                "comment_ratio": raw["comment_ratio"] if raw else None,
                "sloc": raw["sloc"] if raw else None,
                # radon hal
                "hal_bugs": hal["bugs"] if hal else None,
                "hal_difficulty": hal["difficulty"] if hal else None,
            }
        )

    proj_pct = proj_covered / proj_total * 100 if proj_total else 0.0
    coverages_sorted = sorted(f["pct"] for f in file_stats)

    # Flat list of (cc_score, filepath) for all functions across all files
    # used by the CC histogram to plot true function distribution.
    cc_scores = sorted(
        (score, f["path"]) for f in file_stats if f.get("cc_scores") for score in f["cc_scores"]
    )

    return {
        "file_stats": file_stats,
        "proj_total": proj_total,
        "proj_covered": proj_covered,
        "proj_pct": proj_pct,
        "coverages_sorted": coverages_sorted,
        "cc_scores": cc_scores,  # (cc_value, filepath) per function
    }
@@ -0,0 +1,295 @@
1
+ """Coverage analysis functions."""
2
+
3
+ from pathlib import Path
4
+ from .utils import percentile, format_line_ranges, ascii_histogram, blame_header
5
+
6
+
7
def scan_pragma_intervals(root: str) -> dict[str, list]:
    """Find sequential # pragma: no cover blocks per .py file under root.

    Args:
        root: Root directory to scan

    Returns:
        Dictionary mapping file paths to lists of (start, end) tuples
    """
    found: dict[str, list] = {}
    for py_file in Path(root).rglob("*.py"):
        # Skip hidden directories/files and bytecode caches.
        if any(seg == "__pycache__" or seg.startswith(".") for seg in py_file.parts):
            continue
        try:
            text = py_file.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable file — best-effort scan, same as the original.
            continue
        rows = [
            lineno
            for lineno, line in enumerate(text.splitlines(), start=1)
            if "# pragma: no cover" in line
        ]
        if not rows:
            continue
        # Collapse consecutive line numbers into (start, end) runs.
        blocks = []
        run_start = prev = rows[0]
        for row in rows[1:]:
            if row != prev + 1:
                blocks.append((run_start, prev))
                run_start = row
            prev = row
        blocks.append((run_start, prev))
        found[str(py_file)] = blocks
    return found
39
+
40
+
41
def scan_pragma_counts(root: str) -> dict[str, int]:
    """Count '# pragma: no cover' occurrences per .py file under root.

    Args:
        root: Root directory to scan

    Returns:
        Dictionary mapping file paths to pragma counts
    """
    counts: dict[str, int] = {}
    for py_file in Path(root).rglob("*.py"):
        # Skip hidden directories/files and bytecode caches.
        if any(seg == "__pycache__" or seg.startswith(".") for seg in py_file.parts):
            continue
        try:
            occurrences = py_file.read_text(encoding="utf-8", errors="ignore").count(
                "# pragma: no cover"
            )
        except OSError:
            # Unreadable file — best-effort scan, same as the original.
            continue
        # Only files with at least one pragma appear in the result.
        if occurrences:
            counts[str(py_file)] = occurrences
    return counts
61
+
62
+
63
def show_pragma_histogram(pragma_counts: dict, bins: int = 10, blame_limit: int = 20, width: int = 80) -> None:
    """Histogram of '# pragma: no cover' count per file + all-files listing.

    Also prints a secondary histogram and table for sequential blocks of
    pragmas (intervals) so that you can spot long runs of skipped code.

    Args:
        pragma_counts: Dictionary mapping files to pragma counts
        bins: Number of histogram bins
        blame_limit: Maximum blamed items to display
        width: Line width for output
    """
    # --- Sequential pragma interval histogram ---
    # NOTE(review): intervals are always re-scanned from the current working
    # directory ("."), not from wherever pragma_counts was built — confirm
    # this is intended when the caller scanned a different root.
    pragma_intervals = scan_pragma_intervals(".")
    # Flatten every (start, end) interval into its length in rows.
    interval_lengths = [
        end - start + 1 for intervals in pragma_intervals.values() for (start, end) in intervals
    ]
    if interval_lengths:
        n_int = len(interval_lengths)
        total_int = sum(interval_lengths)
        int_max = max(interval_lengths)
        # +0.1 pushes the max value strictly inside the last edge; floor of 1
        # keeps the bin width sane when all intervals are tiny.
        int_step = max((int_max + 0.1) / bins, 1)
        edges = [i * int_step for i in range(bins + 1)]
        int_buckets = [0] * bins
        for v in interval_lengths:
            placed = False
            # Only the first bins-1 buckets are range-checked; anything not
            # placed (including edge/rounding stragglers) lands in the last.
            for bi in range(bins - 1):
                if edges[bi] <= v < edges[bi + 1]:
                    int_buckets[bi] += 1
                    placed = True
                    break
            if not placed:
                int_buckets[-1] += 1
        # \u2013 is an en dash in the range labels.
        int_labels = [f"{int(edges[i]):>4}\u2013{int(edges[i + 1]):<4} rows" for i in range(bins)]
        print(f"\n{'═' * width}")
        print(f" PRAGMA INTERVAL LENGTH HISTOGRAM (sequential blocks, {n_int} intervals)")
        print(f" [linear scale, {bins} bins]")
        print()
        ascii_histogram(int_buckets, int_labels, width=width)
        print()
        print(
            f" p50: {int(percentile(interval_lengths, 50)):<6} "
            f"p75: {int(percentile(interval_lengths, 75)):<6} "
            f"p90: {int(percentile(interval_lengths, 90)):<6} "
            f"p99: {int(percentile(interval_lengths, 99)):<6}"
        )
        print(f" avg: {total_int / n_int:<6.1f} max: {int_max:<6} total rows: {total_int}")
        print(f"\n{'═' * width}")
        print(f" PRAGMA INTERVAL ROWS {n_int}")
        print(f"\n{'═' * width}")
        # Full interval table — every run of pragma lines, per file.
        for path, intervals in pragma_intervals.items():
            for start, end in intervals:
                print(f" {path} rows {start}-{end} ({end - start + 1} rows)")
    # --- End interval histogram ---
    if not pragma_counts:
        return
    # Files sorted by pragma count, heaviest first, for the blame listing.
    file_list = sorted(pragma_counts.items(), key=lambda x: x[1], reverse=True)
    counts_sorted = sorted(pragma_counts.values())
    n = len(counts_sorted)
    total = sum(counts_sorted)
    m_max = counts_sorted[-1]

    # Same linear bucketing scheme as the interval histogram above.
    m_step = max((m_max + 0.1) / bins, 1)
    edges = [i * m_step for i in range(bins + 1)]
    p_buckets = [0] * bins
    for v in counts_sorted:
        placed = False
        for bi in range(bins - 1):
            if edges[bi] <= v < edges[bi + 1]:
                p_buckets[bi] += 1
                placed = True
                break
        if not placed:
            p_buckets[-1] += 1
    p_labels = [f"{int(edges[i]):>4}\u2013{int(edges[i + 1]):<4} pragmas" for i in range(bins)]

    print(f"\n{'═' * width}")
    # \u2265 is the ≥ sign.
    print(f" PRAGMA: NO COVER DISTRIBUTION (files with \u22651 pragma: {n}, total: {total})")
    print(f" [linear scale, {bins} bins]")
    print()
    ascii_histogram(p_buckets, p_labels, width=width)
    print()
    print(
        f" p50: {int(percentile(counts_sorted, 50)):<6} "
        f"p75: {int(percentile(counts_sorted, 75)):<6} "
        f"p90: {int(percentile(counts_sorted, 90)):<6} "
        f"p99: {int(percentile(counts_sorted, 99)):<6}"
    )
    print(f" avg: {total / n:<6.1f} max: {m_max:<6} total pragmas: {total}")

    # blame_limit == 0 means "show everything".
    shown = file_list if not blame_limit else file_list[:blame_limit]
    tail = (
        f" (showing top {blame_limit}, --blame-limit 0 for all)"
        if blame_limit and len(file_list) > blame_limit
        else ""
    )
    print(f"\n{'═' * width}")
    print(f" PRAGMA FILES {len(file_list)}{tail}")
    print(f"\n{'═' * width}")
    for path, count in shown:
        print(f" {count:>4} pragmas {path}")
+
165
+
166
def show_coverage_histogram(
    stats: dict,
    bins: int = 10,
    blame_limit: int = 20,
    show_blame: bool = True,
    threshold: float = 90.0,
    pragma_counts: dict | None = None,
    width: int = 80,
):
    """Display coverage distribution histogram and low-coverage blame.

    Args:
        stats: Precomputed statistics from precompute_coverage_stats()
        bins: Number of histogram bins
        blame_limit: Maximum blamed files to display
        show_blame: Whether to show quality blame sections
        threshold: Blame ceiling (show files below Q1 or this threshold, whichever is lower)
        pragma_counts: Optional pragma count data to display
        width: Line width for output
    """
    cs = stats["coverages_sorted"]
    if not cs:
        print("No files found.")
        return

    n = len(cs)
    avg = sum(cs) / n
    below_80 = sum(1 for c in cs if c < 80)
    below_60 = sum(1 for c in cs if c < 60)

    # Fixed 0–100% bucketing; the last bucket also absorbs exactly-100% files
    # (the half-open [lo, hi) test would otherwise drop them).
    step = 100 / bins
    buckets = [0] * bins
    labels = []
    for i in range(bins):
        lo, hi = i * step, (i + 1) * step
        labels.append(f"{lo:>5.1f}–{hi:<5.1f}%")
        for c in cs:
            if lo <= c < hi or (i == bins - 1 and c == 100.0):
                buckets[i] += 1

    print(f"\n{'═' * width}")
    print(" COVERAGE DISTRIBUTION HISTOGRAM")
    print(f"{'═' * width}")
    print(f" Files: {n} Project coverage: {stats['proj_pct']:.1f}%")
    print()
    ascii_histogram(buckets, labels, width=width)
    print()
    print(f"{'─' * width}")
    print(" PERCENTILES")
    print(f"{'─' * width}")
    # Quartiles get friendly names; the rest print as pNN.
    for pct in (25, 50, 75, 90, 95, 99):
        label = {25: "Q1", 50: "Q2/med", 75: "Q3"}.get(pct, f"p{pct}")
        print(f" {label:<8} {percentile(cs, pct):>6.1f}%")
    print(f" {'avg':<8} {avg:>6.1f}%")
    print(f" {'min':<8} {cs[0]:>6.1f}%")
    print(f" {'max':<8} {cs[-1]:>6.1f}%")
    not_100 = sum(1 for c in cs if c < 100)
    print(f" {'not-100%':<8} {not_100} files ({not_100 / n * 100:.1f}%)")
    print(f" <80% {below_80} files ({below_80 / n * 100:.1f}%)")
    print(f" <60% {below_60} files ({below_60 / n * 100:.1f}%)")

    # ── missing lines histogram (non-100% files only) ──────────────────────
    missing_counts = sorted(f["missing_count"] for f in stats["file_stats"] if f["pct"] < 100.0)
    if missing_counts:
        import math
        nm = len(missing_counts)
        total_missing = sum(missing_counts)
        m_max = missing_counts[-1]
        # Switch to log-scale edges only when the spread is ≥ two orders of
        # magnitude (and no zero min, which would break log10).
        use_log = m_max > 0 and missing_counts[0] > 0 and (m_max / missing_counts[0]) >= 100
        if use_log:
            log_min = math.log10(max(missing_counts[0], 1))
            log_max = math.log10(m_max + 1)
            log_step = (log_max - log_min) / bins
            edges = [10 ** (log_min + i * log_step) for i in range(bins + 1)]
            scale_note = "log scale"
        else:
            m_step = max((m_max + 0.1) / bins, 1)
            edges = [i * m_step for i in range(bins + 1)]
            scale_note = "linear scale"
        m_buckets = [0] * bins
        for v in missing_counts:
            # Half-open buckets; the max value is explicitly pinned to the
            # last bucket, and the for-else is a belt-and-braces fallback.
            for bi in range(bins):
                if edges[bi] <= v < edges[bi + 1] or (bi == bins - 1 and v == m_max):
                    m_buckets[bi] += 1
                    break
            else: # pragma: no cover
                m_buckets[-1] += 1
        m_labels = [f"{int(edges[i]):>5}–{int(edges[i + 1]):<5} lines" for i in range(bins)]
        print(f"\n{'─' * width}")
        print(f" MISSING LINES DISTRIBUTION (non-100% files: {nm})")
        print(f" [{scale_note}, {bins} bins]")
        print()
        ascii_histogram(m_buckets, m_labels, width=width)
        print()
        m_avg = total_missing / nm
        print(
            f" p50: {int(percentile(missing_counts, 50)):<6} p75: {int(percentile(missing_counts, 75)):<6} "
            f"p90: {int(percentile(missing_counts, 90)):<6} p99: {int(percentile(missing_counts, 99)):<6}"
        )
        print(f" avg: {m_avg:<6.1f} max: {m_max:<6} total missing: {total_missing:,} lines")

    if pragma_counts:
        show_pragma_histogram(pragma_counts, bins=bins, blame_limit=blame_limit, width=width)

    if show_blame:
        # quality blame — cap ceiling at `threshold` so near-perfect codebases don't sweep 25% of files
        q1 = percentile(cs, 25)
        blame_ceiling = min(q1, threshold)
        boundary_note = (
            f"Q1={q1:.1f}% capped at {threshold:.0f}%" if q1 > threshold else f"Q1={q1:.1f}%"
        )
        # Worst files first (ascending coverage percentage).
        blamed = sorted(
            [
                (f["pct"], f["path"], f["missing_count"], f["missing_lines"])
                for f in stats["file_stats"]
                if f["pct"] < blame_ceiling
            ],
            key=lambda x: x[0],
        )
        blame_header(f"below {blame_ceiling:.1f}% ({boundary_note})", len(blamed), blame_limit, width)
        # blame_limit == 0 means "show everything".
        display = blamed if not blame_limit else blamed[:blame_limit]
        if display:
            for pct, path, missing_count, missing_lines in display:
                print(f" {pct:>5.1f}% {missing_count:>4} lines missing {path}")
                ranges = format_line_ranges(missing_lines)
                if ranges:
                    print(f" 📍 {ranges}")
        else:
            print(" ✅ No files below quality boundary.")
    print(f"\n{'═' * width}\n")