codebase-stats 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_stats/__init__.py +148 -0
- codebase_stats/core.py +240 -0
- codebase_stats/coverage.py +295 -0
- codebase_stats/duration.py +245 -0
- codebase_stats/lowcov.py +204 -0
- codebase_stats/metrics.py +393 -0
- codebase_stats/radon.py +264 -0
- codebase_stats/reporter.py +283 -0
- codebase_stats/sizes.py +100 -0
- codebase_stats/tree.py +144 -0
- codebase_stats/utils.py +86 -0
- codebase_stats-0.0.1.dist-info/METADATA +376 -0
- codebase_stats-0.0.1.dist-info/RECORD +16 -0
- codebase_stats-0.0.1.dist-info/WHEEL +5 -0
- codebase_stats-0.0.1.dist-info/entry_points.txt +2 -0
- codebase_stats-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""CodebaseStatsReporter — Programmatic report generation for code metrics."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from io import StringIO
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from .core import (
|
|
10
|
+
load_coverage,
|
|
11
|
+
load_report,
|
|
12
|
+
precompute_coverage_stats,
|
|
13
|
+
)
|
|
14
|
+
from .coverage import scan_pragma_counts
|
|
15
|
+
from .metrics import (
|
|
16
|
+
run_radon,
|
|
17
|
+
run_radon_mi,
|
|
18
|
+
run_radon_raw,
|
|
19
|
+
run_radon_hal,
|
|
20
|
+
)
|
|
21
|
+
from .lowcov import parse_sorts
|
|
22
|
+
from .coverage import show_coverage_histogram
|
|
23
|
+
from .duration import show_duration_histogram
|
|
24
|
+
from .sizes import show_file_size_distribution
|
|
25
|
+
from .metrics import (
|
|
26
|
+
show_complexity_histogram,
|
|
27
|
+
show_mi_histogram,
|
|
28
|
+
show_raw_histogram,
|
|
29
|
+
show_hal_histogram,
|
|
30
|
+
)
|
|
31
|
+
from .tree import analyze_tree
|
|
32
|
+
from .lowcov import show_low_coverage
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CodebaseStatsReporter:
    """Generate comprehensive code quality reports.

    Collects coverage, test-duration, file-size, radon (CC/MI/raw/Halstead)
    and tree metrics, renders each section by capturing the print output of
    the corresponding show_* helper, and returns the whole report as a string.
    """

    def __init__(
        self,
        coverage_file: str = "coverage.json",
        report_file: Optional[str] = None,
        radon_root: Optional[str] = None,
        fs_root: str = ".",
        tree_root: Optional[str] = None,
    ):
        """
        Initialize the reporter.

        Args:
            coverage_file: Path to coverage.json file
            report_file: Path to pytest-json-report file
            radon_root: Root directory for radon analysis (CC, MI, etc.)
            fs_root: Root directory for file size analysis
            tree_root: Root directory for tree analysis
        """
        self.coverage_file = coverage_file
        self.report_file = report_file
        self.radon_root = radon_root
        self.fs_root = fs_root
        self.tree_root = tree_root
        # NOTE(review): created once per instance — a second call to
        # generate_full_report() appends to the same buffer, so the returned
        # string then contains both reports. Confirm whether that is intended.
        self.output = StringIO()

    def _capture_output(self, func, *args, **kwargs):
        """Capture stdout from a function call.

        Redirects sys.stdout into self.output for the duration of the call.
        NOTE(review): swaps the process-global sys.stdout, so this is not
        safe if report sections ever run concurrently.
        """
        old_stdout = sys.stdout
        sys.stdout = self.output
        try:
            func(*args, **kwargs)
        finally:
            # Always restore, even if the wrapped function raises.
            sys.stdout = old_stdout

    def _run_radon_analyses(self, debug: bool = False) -> tuple[dict, dict, dict, dict]:
        """Run all radon analyses in parallel.

        Args:
            debug: Forwarded to each radon runner as its debug flag.

        Returns:
            (cc, mi, raw, hal) result dicts; a failed runner leaves its
            placeholder empty dict in place.
        """
        runners = {
            "cc": lambda: run_radon(self.radon_root, debug=debug),
            "mi": lambda: run_radon_mi(self.radon_root, debug=debug),
            "raw": lambda: run_radon_raw(self.radon_root, debug=debug),
            "hal": lambda: run_radon_hal(self.radon_root, debug=debug),
        }
        # Pre-seed with empty dicts so a failed future still yields a value.
        results = {key: {} for key in runners}
        with ThreadPoolExecutor(max_workers=4) as pool:
            futures = {pool.submit(fn): key for key, fn in runners.items()}
            for future in as_completed(futures):
                key = futures[future]
                try:
                    results[key] = future.result()
                except Exception as e:
                    # NOTE(review): this print goes to the real stdout, not
                    # into self.output — the warning never appears in the
                    # generated report text.
                    print(f"⚠️ radon {key} failed: {e}")
        return (
            results["cc"],
            results["mi"],
            results["raw"],
            results["hal"],
        )

    def generate_full_report(
        self,
        bins: int = 10,
        blame_limit: int = 20,
        include_coverage: bool = True,
        include_duration: bool = True,
        include_sizes: bool = True,
        include_complexity: bool = True,
        include_mi: bool = True,
        include_raw: bool = True,
        include_hal: bool = True,
        include_tree: bool = True,
        include_lowcov: bool = True,
        show_blame: bool = True,
        threshold: float = 90.0,
        max_threshold: Optional[float] = None,
        top_n: int = 20,
        extension: str = "py",
        fs_percentiles: Optional[list[int]] = None,
        sort_specs: Optional[list[str]] = None,
        show_lines: bool = False,
        slow_threshold: float = 1.0,
        fs_above: Optional[int] = None,
        debug_radon: bool = False,
    ) -> str:
        """
        Generate a comprehensive code quality report.

        Args:
            bins: Number of histogram bins
            blame_limit: Maximum entries in quality blame sections
            include_*: Sections to include in report
            show_blame: Whether to show quality blame sections
            threshold: Show files below this coverage % (default: 90.0)
            max_threshold: Upper bound % (range filter)
            top_n: Max rows in file listing (0 = all)
            extension: File extension to measure
            fs_percentiles: List of percentiles for file-size histogram
            sort_specs: Sort field specifications
            show_lines: Print missing line numbers
            slow_threshold: Slow-test threshold in seconds
            fs_above: List files above this percentile
            debug_radon: Print debug info from radon

        Returns:
            Report as string
        """
        # Mutable defaults are created per call, never shared.
        if fs_percentiles is None:
            fs_percentiles = [25, 50, 75, 90, 95, 99]
        if sort_specs is None:
            sort_specs = ["priority:desc"]

        # Load radon data if needed
        # NOTE(review): chained assignment binds all four names to the SAME
        # dict object. Harmless as long as the dicts are only read or
        # replaced wholesale (as below), but mutating one would mutate all.
        complexity_map = mi_map = raw_map = hal_map = {}
        if include_complexity or include_mi or include_raw or include_hal:
            if self.radon_root:
                complexity_map, mi_map, raw_map, hal_map = self._run_radon_analyses(debug_radon)

        # Scan pragmas
        pragma_counts = {}
        # Prefer the radon root; fall back to fs_root only when it was
        # explicitly set to something other than the default ".".
        pragma_root = self.radon_root if self.radon_root else (self.fs_root if self.fs_root != "." else None)
        if pragma_root:
            pragma_counts = scan_pragma_counts(pragma_root)

        # Load coverage data
        cov_data = load_coverage(self.coverage_file)
        stats = precompute_coverage_stats(cov_data, complexity_map, mi_map, raw_map, hal_map)

        # Coverage histogram
        if include_coverage:
            self._capture_output(
                show_coverage_histogram,
                stats,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
                threshold=threshold,
                pragma_counts=pragma_counts,
            )

        # Duration histogram
        if include_duration and self.report_file:
            try:
                report_data = load_report(self.report_file)
                self._capture_output(
                    show_duration_histogram,
                    report_data,
                    bins=bins,
                    slow_threshold=slow_threshold,
                    blame_limit=blame_limit,
                    show_blame=show_blame,
                )
            except Exception as e:
                # Best-effort section: a missing/corrupt report file must
                # not abort the rest of the report.
                self.output.write(f"⚠️ Could not load report file: {e}\n")

        # File size distribution
        if include_sizes:
            self._capture_output(
                show_file_size_distribution,
                root=self.fs_root,
                extension=extension,
                percentiles=fs_percentiles,
                show_above_pct=fs_above,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
            )

        # Low coverage listing
        if include_lowcov:
            sorts = parse_sorts(sort_specs, "desc")
            self._capture_output(
                show_low_coverage,
                stats,
                threshold=threshold,
                max_threshold=max_threshold,
                # 0 is the documented "all rows" sentinel; translate to None.
                top_n=top_n if top_n > 0 else None,
                sorts=sorts,
                show_lines=show_lines,
            )

        # Complexity histogram
        if include_complexity and self.radon_root:
            self._capture_output(
                show_complexity_histogram,
                stats,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
            )

        # Maintainability index histogram
        if include_mi and self.radon_root:
            self._capture_output(
                show_mi_histogram,
                stats,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
            )

        # Raw metrics histogram
        if include_raw and self.radon_root:
            self._capture_output(
                show_raw_histogram,
                stats,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
            )

        # Halstead metrics histogram
        if include_hal and self.radon_root:
            self._capture_output(
                show_hal_histogram,
                stats,
                bins=bins,
                blame_limit=blame_limit,
                show_blame=show_blame,
            )

        # File tree analysis
        if include_tree and self.tree_root:
            self._capture_output(
                analyze_tree,
                self.tree_root,
                bins=bins,
                blame_limit=blame_limit,
            )

        return self.output.getvalue()

    def save_report(self, output_file: str, **kwargs) -> Path:
        """
        Generate and save report to file.

        Args:
            output_file: Path to save report to
            **kwargs: Arguments passed to generate_full_report()

        Returns:
            Path to saved file
        """
        report = self.generate_full_report(**kwargs)
        output_path = Path(output_file)
        # Create missing parent directories so any output path works.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(report)
        return output_path
codebase_stats/sizes.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""File size distribution analysis."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from .utils import percentile, ascii_histogram, blame_header
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def show_file_size_distribution(
    root: str = ".",
    extension: str = "py",
    percentiles=None,
    show_above_pct=None,
    bins: int = 10,
    blame_limit: int = 20,
    show_blame: bool = True,
    width: int = 80,
):
    """Display file size distribution histogram.

    Args:
        root: Root directory to scan
        extension: File extension to analyze
        percentiles: List of percentiles to display
        show_above_pct: Show files at/above this percentile
        bins: Number of histogram bins
        blame_limit: Maximum blamed files to display
        show_blame: Whether to show outlier blame
        width: Line width for output
    """
    if percentiles is None:
        percentiles = [25, 50, 75, 90, 95, 99]

    files = []
    for path in Path(root).rglob(f"*.{extension}"):
        # Skip anything under a hidden directory (.git, .venv, ...).
        if any(part.startswith(".") for part in path.parts):
            continue
        try:
            files.append((path.read_text(encoding="utf-8", errors="ignore").count("\n"), str(path)))
        except OSError:
            # Unreadable file: skip it rather than abort the whole scan.
            pass

    if not files:
        print(f"No .{extension} files found under '{root}'")
        return

    files.sort()
    counts = [f[0] for f in files]
    n = len(counts)
    minimum = counts[0]
    maximum = counts[-1]
    avg = sum(counts) / n

    # Bin edges span [min, max]; the +1 guarantees max falls inside a bin.
    step = max((maximum - minimum + 1) / bins, 1)
    edges = [minimum + i * step for i in range(bins + 1)]
    buckets = [0] * bins
    for c in counts:
        for i in range(bins):
            if edges[i] <= c < edges[i + 1]:
                buckets[i] += 1
                break
        else:  # pragma: no cover
            buckets[-1] += 1

    labels = [f"{int(edges[i]):>5}–{int(edges[i + 1]):<5} lines" for i in range(bins)]

    print(f"\n{'═' * width}")
    print(f" FILE SIZE DISTRIBUTION (*.{extension} under '{root}')")
    print(f"{'═' * width}")
    print(f" Files: {n} Total lines: {sum(counts):,} Mean: {avg:.0f} lines")
    print()
    ascii_histogram(buckets, labels, width=width)
    print()

    print(f"{'─' * width}")
    print(" PERCENTILES")
    print(f"{'─' * width}")
    print(f" {'min':<8} {minimum:>6} lines")
    for pct in percentiles:
        label = {25: "Q1", 50: "Q2/med", 75: "Q3"}.get(pct, f"p{pct}")
        print(f" {label:<8} {int(percentile(counts, pct)):>6} lines")
    print(f" {'max':<8} {maximum:>6} lines")
    print(f" {'mean':<8} {avg:>6.0f} lines")

    # FIX: show_above_pct was previously accepted (and documented) but never
    # used anywhere in the body. It now lists files at or above the requested
    # percentile, largest first.
    if show_above_pct is not None:
        cutoff = int(percentile(counts, show_above_pct))
        large = sorted(((lc, fp) for lc, fp in files if lc >= cutoff), reverse=True)
        print(f"\n{'─' * width}")
        print(f" FILES AT/ABOVE p{show_above_pct} (≥ {cutoff} lines)")
        print(f"{'─' * width}")
        for lc, fp in large:
            print(f" {lc:>6} lines {fp}")

    if show_blame:
        # Classic Tukey fence: anything above Q3 + 1.5*IQR is an outlier.
        q1_lines = int(percentile(counts, 25))
        q3_lines = int(percentile(counts, 75))
        iqr_lines = q3_lines + int(1.5 * (q3_lines - q1_lines))
        blamed_files = sorted(
            [(lc, fp) for lc, fp in files if lc > iqr_lines], key=lambda x: x[0], reverse=True
        )
        blame_header(
            f"size outliers Q3 + 1.5×IQR > {iqr_lines} lines", len(blamed_files), blame_limit, width
        )
        # blame_limit == 0 means "show everything".
        display = blamed_files if not blame_limit else blamed_files[:blame_limit]
        if display:
            for lc, fp in display:
                print(f" {lc:>6} lines {fp}")
        else:
            print(" ✅ No file size outliers.")

    print(f"\n{'═' * width}")
|
codebase_stats/tree.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""File tree structure analysis."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from .utils import percentile, ascii_histogram, blame_header
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _bucketize(sorted_counts: list, bins: int, unit: str):
    """Build (buckets, labels) for a histogram over sorted non-negative counts."""
    top = sorted_counts[-1]
    # +0.1 keeps the maximum strictly below the last edge; step >= 1 avoids
    # zero-width bins when the range is smaller than the bin count.
    step = max((top + 0.1) / bins, 1)
    buckets = [0] * bins
    labels = []
    for i in range(bins):
        lo, hi = i * step, (i + 1) * step
        labels.append(f"{int(lo):>4}–{int(hi):<4} {unit}")
        for count in sorted_counts:
            # The last bin also absorbs anything at or above its upper edge.
            if lo <= count < hi or (i == bins - 1 and count >= hi):
                buckets[i] += 1
    return buckets, labels


def _show_outliers(dir_stats: list, counts: list, key: str, noun: str, unit: str, blame_limit: int, width: int) -> None:
    """Print Tukey-fence (Q3 + 1.5×IQR) outliers for one per-directory metric."""
    q1 = percentile(counts, 25)
    q3 = percentile(counts, 75)
    boundary = q3 + 1.5 * (q3 - q1)

    blamed = sorted(
        [d for d in dir_stats if d[key] > boundary],
        key=lambda x: x[key],
        reverse=True,
    )

    blame_header(
        f"{noun} outliers (Q3+1.5×IQR > {boundary:g})", len(blamed), blame_limit, width
    )
    # blame_limit == 0 means "show everything".
    display = blamed if not blame_limit else blamed[:blame_limit]
    if display:
        for d in display:
            print(f" {d[key]:>5} {unit} {d['path']}")
    else:
        print(f" ✅ No {noun} outliers.")


def analyze_tree(root_path: str, bins: int = 10, blame_limit: int = 20, width: int = 80):
    """Analyze file tree structure, collecting metrics per directory.

    Displays statistics and histograms about file and folder distribution.
    (Refactored: the files-per-folder and subfolders-per-folder sections were
    duplicated code; they now share _bucketize and _show_outliers.)

    Args:
        root_path: Root directory to analyze
        bins: Number of histogram bins
        blame_limit: Maximum blamed directories to display
        width: Line width for output
    """
    root = Path(root_path).resolve()

    dir_stats = []  # list of {"path", "files", "folders"} per directory

    for dirpath, dirnames, filenames in os.walk(root):
        # Filter out __pycache__ and hidden directories from traversal
        # (in-place so os.walk does not descend into them).
        dirnames[:] = [d for d in dirnames if d != "__pycache__" and not d.startswith(".")]

        rel_path = os.path.relpath(dirpath, root)
        if rel_path == ".":
            rel_path = "./"

        dir_stats.append({"path": rel_path, "files": len(filenames), "folders": len(dirnames)})

    if not dir_stats:
        print(f"⚠️ No directories found under {root}")
        return

    n_dirs = len(dir_stats)
    file_counts = sorted(d["files"] for d in dir_stats)
    folder_counts = sorted(d["folders"] for d in dir_stats)

    total_files = sum(file_counts)
    total_folders = sum(folder_counts)

    print(f"\n{'═' * width}")
    print(f" FILE TREE ANALYSIS: {root}")
    print(f"{'═' * width}")
    print(
        f" Total Nodes: {n_dirs:<8} Total Files: {total_files:<8} Total Folders: {total_folders}"
    )

    # --- Files per Folder Histogram ---
    print("\n DISTRIBUTION: FILES PER FOLDER")
    print(f"{'─' * width}")
    f_buckets, f_labels = _bucketize(file_counts, bins, "files")
    ascii_histogram(f_buckets, f_labels, width=width)

    print("\n STATS (Files per Folder):")
    print(
        f" min: {file_counts[0]:<6} max: {file_counts[-1]:<6} mean: {total_files / n_dirs:>5.1f}"
    )
    print(
        f" p50: {int(percentile(file_counts, 50)):<6} p75: {int(percentile(file_counts, 75)):<6} p90: {int(percentile(file_counts, 90)):<6}"
    )

    # --- Subfolders per Folder Histogram ---
    print("\n DISTRIBUTION: SUBFOLDERS PER FOLDER")
    print(f"{'─' * width}")
    d_buckets, d_labels = _bucketize(folder_counts, bins, "dirs ")
    ascii_histogram(d_buckets, d_labels, width=width)

    print("\n STATS (Subfolders per Folder):")
    print(
        f" min: {folder_counts[0]:<6} max: {folder_counts[-1]:<6} mean: {total_folders / n_dirs:>5.1f}"
    )
    print(
        f" p50: {int(percentile(folder_counts, 50)):<6} p75: {int(percentile(folder_counts, 75)):<6} p90: {int(percentile(folder_counts, 90)):<6}"
    )

    if blame_limit > 0:
        _show_outliers(dir_stats, file_counts, "files", "file count", "files", blame_limit, width)
        _show_outliers(dir_stats, folder_counts, "folders", "subfolder count", "dirs", blame_limit, width)

    print(f"\n{'═' * width}\n")
|
codebase_stats/utils.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Shared utility functions for codebase analysis."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def percentile(sorted_vals: list, pct: int):
    """Return the pth-percentile element of an ascending-sorted list.

    Uses the nearest-rank (floor) method: the index is (n-1)*pct/100
    truncated, clamped into the valid range.
    """
    count = len(sorted_vals)
    rank = int((count - 1) * pct / 100)
    # Clamp into [0, count - 1] so out-of-range percentiles stay safe.
    rank = min(rank, count - 1)
    rank = max(rank, 0)
    return sorted_vals[rank]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def format_line_ranges(lines: list) -> str:
    """Convert a list of line numbers into a human-readable ranges string.

    Args:
        lines: List of line numbers

    Returns:
        String like "1-5, 10, 15-18"
    """
    if not lines:
        return ""

    def render(lo: int, hi: int) -> str:
        # A run of one line prints bare; longer runs print as "lo-hi".
        return str(lo) if lo == hi else f"{lo}-{hi}"

    unique = sorted(set(lines))
    pieces = []
    run_start = prev = unique[0]
    for num in unique[1:]:
        if num != prev + 1:
            # Gap found: close out the current consecutive run.
            pieces.append(render(run_start, prev))
            run_start = num
        prev = num
    pieces.append(render(run_start, prev))
    return ", ".join(pieces)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def ascii_histogram(
    counts: list, labels: list, bar_width: int = 36, suffixes: list = None, width: int = 80
) -> None:
    """Print a fixed-width ASCII histogram.

    Args:
        counts: List of counts for each bar
        labels: List of labels for each bar
        bar_width: Width of the bar in characters
        suffixes: Optional per-row annotation
        width: Total line width

    Prints nothing for empty input instead of raising.
    """
    # FIX: max() on an empty sequence raises ValueError; treat empty input
    # as "nothing to draw".
    if not counts or not labels:
        return
    max_count = max(counts) or 1  # avoid division by zero when all bars are 0
    total = sum(counts)
    label_w = max(len(label) for label in labels)
    for i, (label, count) in enumerate(zip(labels, counts)):
        bar = "█" * round(count / max_count * bar_width)
        pct = count / total * 100 if total else 0
        suffix = f" {suffixes[i]}" if suffixes and i < len(suffixes) and suffixes[i] else ""
        print(f" {label:<{label_w}} │{bar:<{bar_width}} {count:>4} ({pct:4.1f}%){suffix}")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def blame_header(label: str, total: int, limit: int, width: int = 80) -> None:
    """Print a blame section header.

    Args:
        label: Description of what's being blamed
        total: Total number of blamed items
        limit: Current display limit for blamed items
        width: Total line width
    """
    rule = "─" * width
    print(f"\n{rule}")
    # Only mention truncation when a limit is set and actually kicks in.
    if limit and total > limit:
        hint = f" (showing top {limit}, --blame-limit 0 for all)"
    else:
        hint = ""
    print(f" QUALITY BLAME — {label} {total}{hint}")
    print(rule)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def fmt_seconds(seconds: float) -> str:
    """Format a duration in seconds as a compact human-readable string.

    Examples:
        0.5 -> "500ms"
        1.5 -> "1.50s"
        125 -> "2m 05.0s"
    """
    if seconds < 1:
        millis = seconds * 1000
        return f"{millis:.0f}ms"
    if seconds < 60:
        return f"{seconds:.2f}s"
    minutes, remainder = divmod(seconds, 60)
    # Zero-pad seconds to one decimal so columns line up (e.g. "05.0").
    return f"{int(minutes)}m {remainder:04.1f}s"
|