deadpush 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deadpush/deadness.py ADDED
@@ -0,0 +1,477 @@
1
+ """Multi-factor deadness scoring for dead code candidates.
2
+
3
+ Combines 8 independent signals into a single alive_score (0.0 = dead, 1.0 = alive).
4
+ False positives are weighted as worse than false negatives — when evidence is
5
+ ambiguous, the scorer abstains (tier = "uncertain").
6
+
7
+ New in Phase 3:
8
+ - Call-chain-aware deadness: propagates penalty through the call graph
9
+ - Test-aware deadness: symbols unreferenced in tests get lower confidence
10
+ - Composite signal: merges all 8 factors into one score
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import ast
16
+ import re
17
+ import subprocess
18
+ import time
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+ from typing import Any, Literal
22
+
23
+ from .config import Config
24
+ from .graph import CallGraph, Symbol
25
+ from .registration import RegistrationDetector
26
+ from .importgraph import ImportAnalyzer
27
+
28
# Module-level blame cache shared across scorer instances (TTL: 60s).
# Each value is a (time the entry was stored, {line_number: age_days}) pair;
# readers compare the stored time against the TTL before reusing the data.
_GLOBAL_BLAME_CACHE: dict[str, tuple[float, dict[int, float]]] = {}
30
+
31
+ import time as _time
32
+
33
+
34
@dataclass
class DeadnessResult:
    """Outcome of scoring one symbol across all deadness factors.

    Attributes:
        alive_score: Weighted combination of the factor values
            (0.0 = dead, 1.0 = alive).
        tier: Confidence bucket derived from ``alive_score``.
        factors: Individual factor values keyed by factor name.
        reasons: Human-readable explanations collected while scoring.
        uncertainty: Notes about missing or ambiguous evidence; empty when
            there is nothing to report.
    """

    alive_score: float
    tier: Literal["high", "medium", "low", "uncertain"]
    # Mutable defaults go through default_factory so instances never share them.
    factors: dict[str, float] = field(default_factory=dict)
    reasons: list[str] = field(default_factory=list)
    uncertainty: str = ""
42
+
43
+
44
# Special ("dunder") method names for which the scorer abstains: a symbol with
# one of these names is never flagged as dead.
ABSTAIN_NAMES: set[str] = {
    # construction, destruction and representation
    "__new__", "__init__", "__del__", "__repr__", "__str__", "__format__",
    "__sizeof__", "__reduce__", "__reduce_ex__",
    # attribute access
    "__getattr__", "__getattribute__", "__setattr__", "__delattr__",
    # callables, containers and iteration
    "__call__", "__len__", "__bool__", "__contains__", "__iter__", "__next__",
    "__getitem__", "__setitem__", "__delitem__",
    # hashing and comparison
    "__hash__", "__eq__", "__ne__", "__lt__", "__le__", "__gt__", "__ge__",
    # arithmetic
    "__add__", "__sub__", "__mul__", "__truediv__", "__floordiv__",
    # class machinery
    "__subclasshook__", "__init_subclass__", "__class_getitem__",
    "__instancecheck__", "__subclasscheck__",
    # context managers (sync and async) and awaitables
    "__enter__", "__exit__", "__aenter__", "__aexit__",
    "__aiter__", "__anext__", "__await__",
}

# Names of well-known handler/base classes.
# NOTE(review): not referenced in the visible part of this module; presumably
# consulted elsewhere in the package — confirm before relying on it.
KNOWN_HANDLER_CLASSES: set[str] = {
    "ABC",
    "Protocol",
    "threading.Thread",
    "FileSystemEventHandler",
    "BaseHTTPRequestHandler",
    "SimpleHTTPRequestHandler",
    "StreamRequestHandler",
}
67
+
68
+
69
def _should_abstain(sym: Symbol, reg: RegistrationDetector) -> bool:
    """Decide whether ``sym`` must be excluded from dead-code flagging.

    A symbol is excluded when its name is listed in ``ABSTAIN_NAMES``, when
    its name both starts and ends with a double underscore, or when the
    registration detector reports its id as registered.
    """
    symbol_name = sym.name
    is_dunder = symbol_name.startswith("__") and symbol_name.endswith("__")
    if symbol_name in ABSTAIN_NAMES or is_dunder:
        return True
    # Only consult the detector when the name-based checks did not decide.
    return bool(reg.is_registered(sym.id))
79
+
80
+
81
class MultiFactorDeadnessScorer:
    """Score a single symbol using 8 independent factors.

    ``score`` computes one value per factor and combines them into a weighted
    sum using ``WEIGHTS``; ``classify`` maps that sum to a confidence tier.

    Factors (weight):
        in_degree     (0.30) — how many callers in the call graph
        registration  (0.20) — framework registration patterns
        string_ref    (0.10) — name appears as string literal elsewhere
        import_count  (0.10) — imported by other modules
        entry_point   (0.05) — reachable from entry points
        git_freshness (0.05) — recently modified (git blame)
        call_chain    (0.10) — callers are live (propagated from call graph)
        test_coverage (0.10) — referenced in test files
    """

    WEIGHTS = {
        "in_degree": 0.30,
        "registration": 0.20,
        "string_ref": 0.10,
        "import_count": 0.10,
        "entry_point": 0.05,
        "git_freshness": 0.05,
        "call_chain": 0.10,
        "test_coverage": 0.10,
    }

    # Seconds an entry in the module-level blame cache stays valid.
    BLAME_CACHE_TTL_SECONDS = 60

    # Import targets and quoted single-word strings inside test files.
    _TEST_REF_PATTERN = re.compile(r"(?:import\s+(\w+)|from\s+(\w+)|['\"](\w+?)['\"])")
    # Entry header of `git blame --porcelain`: "<sha> <orig> <final> [<count>]".
    _BLAME_HEADER_PATTERN = re.compile(r"^([0-9a-f]{40,64}) \d+ (\d+)(?: \d+)?$")

    def __init__(
        self,
        config: Config,
        repo_root: Path,
        graph: CallGraph,
        registration: RegistrationDetector,
        imports: ImportAnalyzer,
        roots: set[str],
        all_file_paths: list[Path],
        test_file_paths: list[Path] | None = None,
    ):
        """Store the collaborators and pre-scan the test files.

        Args:
            config: Project configuration (stored, not read by this class).
            repo_root: Directory used as ``cwd`` for git and as the base for
                repo-relative paths.
            graph: Call graph queried through ``incoming(symbol_id)``.
            registration: Detector for framework registration patterns.
            imports: Analyzer providing string-reference and import counts.
            roots: Ids of symbols treated as entry-point roots.
            all_file_paths: Source files eligible for blame prefetching.
            test_file_paths: Test files scanned for name references; ``None``
                is treated as an empty list.
        """
        self.config = config
        self.repo_root = repo_root
        self.graph = graph
        self.registration = registration
        self.imports = imports
        self.roots = roots
        self.all_file_paths = all_file_paths
        # repo-relative path -> {line_number: age_days}
        self._blame_cache: dict[str, dict[int, float]] = {}
        # (symbol name, repo-relative path) -> (score, 0.0)
        self._log_cache: dict[tuple[str, str], tuple[float, float]] = {}
        # Phase 3: call-chain propagation + test coverage
        self._call_chain_scores: dict[str, float] = {}
        self._test_file_refs: set[str] = self._build_test_refs(test_file_paths or [])
        # Lowercased once here so per-symbol lookups stay O(1).
        self._test_file_refs_lower: set[str] = {ref.lower() for ref in self._test_file_refs}
        self._git_history_checked = False
        self._git_history_has_commits = False

    def score(self, sym: Symbol) -> DeadnessResult | None:
        """Score a single symbol. Returns None if abstention applies.

        The ``call_chain`` factor reads values stored by
        ``compute_call_chain_scores``; until that has run for ``sym`` the
        factor contributes 0.0.
        """
        if _should_abstain(sym, self.registration):
            return None

        factors: dict[str, float] = {}
        reasons: list[str] = []

        in_degree = self._factor_in_degree(sym)
        factors["in_degree"] = in_degree
        if in_degree >= 0.8:
            reasons.append("Has multiple callers in the call graph")
        elif in_degree <= 0.2:
            reasons.append("No callers in the call graph")

        registration = self.registration.score(sym.id)
        factors["registration"] = registration
        if registration >= 0.8:
            reasons.append("Registered via decorator or framework pattern")
        elif registration >= 0.4:
            reasons.append("Appears in a registration context (dict/list/decorator)")

        string_ref = self._factor_string_ref(sym)
        factors["string_ref"] = string_ref
        if string_ref >= 0.5:
            reasons.append("Name referenced as string literal elsewhere")

        import_count = self._factor_import_count(sym)
        factors["import_count"] = import_count
        if import_count >= 0.7:
            reasons.append("Imported by other modules")
        elif import_count <= 0.2:
            reasons.append("Not imported by any other module")

        entry_point = self._factor_entry_point(sym)
        factors["entry_point"] = entry_point
        if entry_point >= 0.8:
            reasons.append("Reachable from a detected entry point")

        git_freshness = self._factor_git_freshness(sym)
        factors["git_freshness"] = git_freshness
        if git_freshness >= 0.7:
            reasons.append("Recently modified")
        elif git_freshness <= 0.2:
            reasons.append("Not modified recently or never")

        call_chain = self._factor_call_chain(sym)
        factors["call_chain"] = call_chain
        if call_chain >= 0.8:
            reasons.append("Called by live symbols in call graph")
        elif call_chain <= 0.2:
            reasons.append("All callers appear to be dead code")

        test_coverage = self._factor_test_coverage(sym)
        factors["test_coverage"] = test_coverage
        if test_coverage >= 0.7:
            reasons.append("Referenced in test files")
        elif test_coverage <= 0.2:
            reasons.append("Not referenced in any test file")

        alive_score = sum(weight * factors[name] for name, weight in self.WEIGHTS.items())
        tier = self.classify(alive_score)

        uncertainty_parts: list[str] = []
        if tier == "uncertain":
            uncertainty_parts.append(f"alive_score {alive_score:.3f} in uncertain range (>0.7)")
        if self._rel_path(sym.path) not in self._blame_cache:
            uncertainty_parts.append("git blame data not available")
        if not self._test_file_refs:
            uncertainty_parts.append("no test files found for coverage analysis")

        return DeadnessResult(
            alive_score=round(alive_score, 3),
            tier=tier,
            factors=factors,
            reasons=reasons,
            uncertainty="; ".join(uncertainty_parts),
        )

    def _build_test_refs(self, test_file_paths: list[Path]) -> set[str]:
        """Pre-scan test files for symbol name references.

        Collects import targets and quoted single-word strings via a regex,
        plus every name or attribute that is read in the parsed source.
        Unreadable files are skipped; files that cannot be parsed still
        contribute their regex matches.
        """
        refs: set[str] = set()
        for fp in test_file_paths:
            try:
                text = fp.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                continue
            # Match import-like and string references
            for match in self._TEST_REF_PATTERN.finditer(text):
                for group in match.groups():
                    if group and len(group) > 1 and group.isidentifier():
                        refs.add(group)
            # Match function calls and attribute access
            try:
                tree = ast.parse(text)
            except (SyntaxError, ValueError, RecursionError):
                # ValueError: null bytes on older Pythons; RecursionError:
                # pathologically nested source. Neither should abort the scan.
                continue
            for node in ast.walk(tree):
                if isinstance(node, ast.Name):
                    ident = node.id
                elif isinstance(node, ast.Attribute):
                    ident = node.attr
                else:
                    continue
                # Only reads count as references; single-letter names are noise.
                if isinstance(node.ctx, ast.Load) and len(ident) > 1:
                    refs.add(ident)
        return refs

    def _factor_test_coverage(self, sym: Symbol) -> float:
        """Score based on whether the symbol is referenced in test files.

        Returns 0.5 when no test references were collected, 0.9 on a
        case-insensitive name match, 0.7 when the registration detector gives
        the symbol a positive score, and 0.2 otherwise.
        """
        if not self._test_file_refs:
            return 0.5  # neutral when no test files exist
        if sym.name.lower() in self._test_file_refs_lower:
            return 0.9
        # Check registration detector string refs that came from test files
        if self.registration.score(sym.id) > 0:
            return 0.7
        return 0.2

    def compute_call_chain_scores(self, alive_scores: dict[str, float]) -> None:
        """Pass 2: propagate deadness through the call graph.

        For each scored symbol, stores the fraction of its distinct callers
        that are alive (alive_score > 0.2). Callers missing from
        ``alive_scores`` count as alive; symbols without callers get 0.0.
        """
        for sym_id in alive_scores:
            callers = {edge.src for edge in self.graph.incoming(sym_id)}
            if not callers:
                self._call_chain_scores[sym_id] = 0.0
                continue
            live_callers = 0
            for caller in callers:
                caller_score = alive_scores.get(caller)
                # Caller that wasn't scored (abstained) — treat as alive
                if caller_score is None or caller_score > 0.2:
                    live_callers += 1
            self._call_chain_scores[sym_id] = live_callers / len(callers)

    def _factor_call_chain(self, sym: Symbol) -> float:
        """Score based on whether callers are live (post-propagation)."""
        return self._call_chain_scores.get(sym.id, 0.0)

    def _factor_in_degree(self, sym: Symbol) -> float:
        """Score based on how many callers this symbol has in the call graph."""
        count = len(self.graph.incoming(sym.id))
        if count == 0:
            return 0.0
        if count == 1:
            return 0.3
        if count <= 3:
            return 0.6
        return 0.9

    def _factor_string_ref(self, sym: Symbol) -> float:
        """Score based on whether the symbol's name appears as a string literal."""
        count = self.imports.count_string_references(sym.name, sym.path)
        if count == 0:
            return 0.0
        if count <= 2:
            return 0.3
        if count <= 5:
            return 0.6
        return 0.8

    def _factor_import_count(self, sym: Symbol) -> float:
        """Score based on how many files import this symbol."""
        count = self.imports.count_external_imports(sym.name, sym.path)
        if count == 0:
            return 0.0
        if count == 1:
            return 0.4
        if count <= 3:
            return 0.7
        return 1.0

    def _factor_entry_point(self, sym: Symbol) -> float:
        """Score based on entry point reachability."""
        if sym.id in self.roots:
            return 1.0
        if sym.is_entry_point:
            return 0.9
        if self.registration.is_entry_point_file(sym.path):
            return 0.5
        return 0.0

    def _has_git_history(self) -> bool:
        """Check if the repo has any commits (avoids FP for new repos).

        The answer is computed once per instance and then reused.
        """
        if self._git_history_checked:
            return self._git_history_has_commits
        self._git_history_checked = True
        try:
            result = subprocess.run(
                ["git", "rev-list", "--count", "HEAD"],
                capture_output=True, text=True, check=False, timeout=5,
                cwd=self.repo_root,
            )
            self._git_history_has_commits = (
                result.returncode == 0 and int(result.stdout.strip()) > 0
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # git missing, timeout, or unparsable output: treat as no history.
            self._git_history_has_commits = False
        return self._git_history_has_commits

    def _factor_git_freshness(self, sym: Symbol) -> float:
        """Score based on git blame (when was the symbol last modified).

        Returns 0.5 when the repo has no commits and 0.0 when the file does
        not exist or blame has no entry for ``sym.line``. Falls back to
        ``_factor_git_log_fallback`` when blame yields nothing or anything in
        the lookup fails.
        """
        if not self._has_git_history():
            return 0.5  # neutral — no git history to judge freshness
        rel = self._rel_path(sym.path)
        try:
            file_path = self.repo_root / rel
            if not file_path.exists():
                return 0.0

            if rel not in self._blame_cache:
                # Check global cache before blaming
                cache_key = self._global_cache_key(rel)
                cached = _GLOBAL_BLAME_CACHE.get(cache_key)
                if cached is not None:
                    cached_at, data = cached
                    if time.time() - cached_at < self.BLAME_CACHE_TTL_SECONDS:
                        self._blame_cache[rel] = data
                    else:
                        _GLOBAL_BLAME_CACHE.pop(cache_key, None)
                if rel not in self._blame_cache:
                    self._blame_cache[rel] = self._blame_file(file_path)

            line_ages = self._blame_cache[rel]
            if not line_ages:
                return self._factor_git_log_fallback(sym.name, rel)

            age_days = line_ages.get(sym.line)
            if age_days is None:
                return 0.0
            if age_days < 7:
                return 0.9
            if age_days < 30:
                return 0.7
            if age_days < 90:
                return 0.5
            if age_days < 365:
                return 0.3
            return 0.0
        except Exception:
            # Deliberately broad: freshness is best-effort, so any failure
            # degrades to the git-log heuristic instead of aborting scoring.
            return self._factor_git_log_fallback(sym.name, rel)

    def prefetch_blame_data(self, max_workers: int = 10) -> None:
        """Pre-fetch git blame data for all source files in parallel.

        Files already present in the instance blame cache are skipped. A
        failing blame job is ignored and leaves its file uncached.
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed

        pending = [
            f for f in self.all_file_paths
            if self._rel_path(str(f)) not in self._blame_cache
        ]
        if not pending:
            return
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            future_map = {pool.submit(self._blame_file, f): f for f in pending}
            for future in as_completed(future_map):
                try:
                    line_ages = future.result()
                except Exception:
                    continue  # best-effort: this file is blamed lazily later
                self._blame_cache[self._rel_path(str(future_map[future]))] = line_ages

    def _blame_file(self, file_path: Path) -> dict[int, float]:
        """Run git blame on a file and return {line_number: age_days}.

        Returns an empty dict when git cannot be run, times out, exits
        non-zero, or produces undecodable output. Successful results are also
        stored in the module-level cache.
        """
        try:
            result = subprocess.run(
                ["git", "blame", "--porcelain", str(file_path)],
                capture_output=True, text=True, check=False, timeout=10,
                cwd=self.repo_root,
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            return {}
        if result.returncode != 0:
            return {}
        line_ages = self._parse_blame_porcelain(result.stdout, time.time())
        # Seed global cache
        cache_key = self._global_cache_key(self._rel_path(str(file_path)))
        _GLOBAL_BLAME_CACHE[cache_key] = (time.time(), line_ages)
        return line_ages

    @staticmethod
    def _parse_blame_porcelain(output: str, now: float) -> dict[int, float]:
        """Convert ``git blame --porcelain`` output to {line_number: age_days}.

        Porcelain output prints a commit's metadata (including
        ``author-time``) only the first time that commit appears, so the
        timestamp is remembered per commit and reused for later lines. Line
        numbers are read from the entry headers.
        """
        commit_times: dict[str, int] = {}
        line_ages: dict[int, float] = {}
        sha: str | None = None
        final_line = 0
        header_pattern = MultiFactorDeadnessScorer._BLAME_HEADER_PATTERN
        # Split on "\n" only: str.splitlines() would also break on characters
        # such as form feeds that can occur inside source lines.
        for raw in output.split("\n"):
            if raw.startswith("\t"):
                # The tab-prefixed content line closes the current entry.
                if sha is not None and sha in commit_times:
                    line_ages[final_line] = (now - commit_times[sha]) / 86400
                sha = None
                continue
            header = header_pattern.match(raw)
            if header:
                sha = header.group(1)
                final_line = int(header.group(2))
            elif sha is not None and raw.startswith("author-time "):
                try:
                    commit_times[sha] = int(raw.split()[1])
                except (IndexError, ValueError):
                    pass  # malformed timestamp: leave this commit undated
        return line_ages

    def _global_cache_key(self, rel: str) -> str:
        """Return the module-level cache key for a repo-relative path.

        The key is the absolute path, so scorers working on different
        repositories never read each other's entries.
        """
        return str((Path(self.repo_root) / rel).absolute())

    def _factor_git_log_fallback(self, name: str, rel: str) -> float:
        """Fallback: use git log -S to count recent mentions.

        Returns a score capped at 0.7 that grows with the number of matching
        commits (at most 20 are requested), or 0.0 when there are none or git
        fails. Results are cached per (name, path).
        """
        key = (name, rel)
        if key in self._log_cache:
            return self._log_cache[key][0]
        try:
            result = subprocess.run(
                ["git", "log", "-S", name, "--oneline", "-20", "--", rel],
                capture_output=True, text=True, check=False, timeout=10,
                cwd=self.repo_root,
            )
            if result.returncode == 0 and result.stdout.strip():
                count = len(result.stdout.splitlines())
                score = min(0.7, 0.1 + count * 0.03)
                self._log_cache[key] = (score, 0.0)
                return score
        except (OSError, subprocess.SubprocessError, ValueError):
            pass  # best-effort: fall through to the cached 0.0 below
        self._log_cache[key] = (0.0, 0.0)
        return 0.0

    def _rel_path(self, abs_path: str) -> str:
        """Return ``abs_path`` relative to the repo root, or unchanged if outside it."""
        try:
            return str(Path(abs_path).relative_to(self.repo_root))
        except ValueError:
            return abs_path

    def classify(self, score: float) -> Literal["high", "medium", "low", "uncertain"]:
        """Map an alive score to a tier.

        Scores up to 0.2 are "high", up to 0.4 "medium", up to 0.7 "low", and
        anything above 0.7 is "uncertain".
        """
        if score <= 0.2:
            return "high"
        if score <= 0.4:
            return "medium"
        if score <= 0.7:
            return "low"
        return "uncertain"