devarch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1559 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+ import sys
6
+ from collections import Counter, defaultdict
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ import subprocess
11
+ from typing import Iterable
12
+
13
+ from ..analyzers.ancient import find_ancient_files
14
+ from ..analyzers.dead_code import find_dead_code
15
+ from ..analyzers.duplicates import find_duplicates, similarity_report
16
+ from ..analyzers.health import calculate_health
17
+ from ..analyzers.monsters import find_monsters
18
+ from ..analyzers.ruins import find_empty_directories, find_unused_assets
19
+ from ..analyzers.suspicious import find_suspicious
20
+ from ..analyzers.todos import find_todos, todos_to_artifacts
21
+ from ..models import Artifact, ScanSummary
22
+ from ..utils.fs import RepoView, collect_repository, path_kind
23
+ from ..utils.git_info import GitSummary, collect_git_summary
24
+ from .discovery import build_reference_map, build_text_index
25
+
26
+
27
# Line-anchored matcher for Python import statements: group 1 captures the
# module of a ``from X import`` line, group 2 the module of an ``import X`` line.
PY_IMPORT_RE = re.compile(
    r"^\s*from\s+([\w.]+)(?:\s+import|\s*$)|^\s*import\s+([\w.]+)",
    flags=re.MULTILINE,
)
# Matcher for JavaScript specifiers: group 1 captures the quoted target of an
# ``import ... from '<spec>'`` line, group 2 the argument of ``require('<spec>')``.
JS_IMPORT_RE = re.compile(
    r"""(?m)^\s*import\s+.*?\s+from\s+['"]([^'"]+)['"]|require\(['"]([^'"]+)['"]\)"""
)
# Splits a target that starts with dots into (leading dots, everything after).
RELATIVE_PREFIX_RE = re.compile(r"^(\.+)(.*)$")
30
+
31
+
32
+ @dataclass(slots=True)
33
+ class DependencyHub:
34
+ path: Path
35
+ referenced_by: int
36
+ depends_on: int
37
+ external_packages: list[str] = field(default_factory=list)
38
+ dependency_risk: str = "Low"
39
+ failure_impact: str = "Moderate"
40
+ confidence: float = 0.0
41
+
42
+
43
+ @dataclass(slots=True)
44
+ class FamilyTree:
45
+ name: str
46
+ root: Path
47
+ children: list[Path] = field(default_factory=list)
48
+ parent_modules: list[Path] = field(default_factory=list)
49
+ inherited_classes: list[str] = field(default_factory=list)
50
+ major_chains: list[list[Path]] = field(default_factory=list)
51
+
52
+
53
+ @dataclass(slots=True)
54
+ class CivilizationCluster:
55
+ name: str
56
+ files: list[Path]
57
+ referenced: int
58
+ last_active_days: int
59
+ status: str
60
+ confidence: float
61
+
62
+
63
+ @dataclass(slots=True)
64
+ class HeatmapBucket:
65
+ bucket: str
66
+ score: float
67
+ label: str
68
+ files: int
69
+
70
+
71
+ @dataclass(slots=True)
72
+ class PersonalityProfile:
73
+ type: str
74
+ traits: list[str]
75
+ risk: str
76
+
77
+
78
+ @dataclass(slots=True)
79
+ class ForecastProfile:
80
+ current_health: int
81
+ projected_6_months: int
82
+ projected_12_months: int
83
+ reason: str
84
+
85
+
86
+ @dataclass(slots=True)
87
+ class DNAProfile:
88
+ signature: list[str]
89
+ confidence: float
90
+
91
+
92
+ @dataclass(slots=True)
93
+ class TimelineEra:
94
+ year: int
95
+ title: str
96
+ activity: int
97
+
98
+
99
+ @dataclass(slots=True)
100
+ class InvestigationIncident:
101
+ incident: str
102
+ date: str
103
+ impact: str
104
+ outcome: str
105
+ risk: str
106
+ evidence: list[str] = field(default_factory=list)
107
+
108
+
109
+ @dataclass(slots=True)
110
+ class StructuralWeakness:
111
+ path: Path
112
+ referenced_by: int
113
+ failure_impact: str
114
+ recovery_difficulty: str
115
+ confidence: float
116
+
117
+
118
+ @dataclass(slots=True)
119
+ class EarthquakeSimulation:
120
+ target: Path
121
+ projected_damage: int
122
+ subsystems_lost: int
123
+ severity: str
124
+ affected_files: list[Path] = field(default_factory=list)
125
+
126
+
127
+ @dataclass(slots=True)
128
+ class ArchitectureClassification:
129
+ primary: str
130
+ secondary: str
131
+ confidence: float
132
+
133
+
134
+ @dataclass(slots=True)
135
+ class ContributorOwnership:
136
+ area: str
137
+ owner: str
138
+ maintenance_owner: str
139
+ abandoned_owner: str
140
+
141
+
142
+ @dataclass(slots=True)
143
+ class MutationEvent:
144
+ project_type: str
145
+ became: str
146
+ date: str
147
+ impact: str
148
+
149
+
150
+ @dataclass(slots=True)
151
+ class KnowledgeMap:
152
+ core: list[str] = field(default_factory=list)
153
+ dependency_graph: list[str] = field(default_factory=list)
154
+ route_graph: list[str] = field(default_factory=list)
155
+ service_graph: list[str] = field(default_factory=list)
156
+ architecture_graph: list[str] = field(default_factory=list)
157
+
158
+
159
+ @dataclass(slots=True)
160
+ class ContainmentZone:
161
+ location: str
162
+ complexity: int
163
+ spread_rate: str
164
+ recommendation: str
165
+
166
+
167
+ @dataclass(slots=True)
168
+ class SurvivalProfile:
169
+ score: int
170
+ risk: str
171
+ single_point_failure: str
172
+ maintainability: int
173
+ recoverability: int
174
+ onboarding_difficulty: int
175
+ bus_factor: int
176
+
177
+
178
+ @dataclass(slots=True)
179
+ class ForensicObservation:
180
+ observation: str
181
+ evidence: list[str] = field(default_factory=list)
182
+
183
+
184
@dataclass(slots=True)
class RepositoryIntelligence:
    """Slotted aggregate of everything the analysis derives about one repository.

    The first group of fields is required; the trailing group has defaults so
    it can be filled in after construction.

    NOTE(review): the code that assembles this object is not visible here --
    the grouping comments below follow the field names and types only.
    """

    # --- repository inputs ---
    root: Path
    view: RepoView
    text_cache: dict[Path, str]

    # --- dependency graph ---
    references: dict[Path, set[Path]]
    dependencies: dict[Path, set[Path]]
    reverse_dependencies: dict[Path, set[Path]]
    external_packages: Counter[str]
    dependency_hubs: list[DependencyHub]
    dependency_cycles: list[list[Path]]
    dependency_chains: list[list[Path]]

    # --- derived profiles ---
    genealogy: list[FamilyTree]
    civilizations: list[CivilizationCluster]
    debt_heatmap: list[HeatmapBucket]
    personality: PersonalityProfile
    forecast: ForecastProfile
    dna: DNAProfile
    timeline_eras: list[TimelineEra]

    # --- per-file / per-artifact lookups and graph totals ---
    ownership: dict[Path, str]
    file_last_active_days: dict[Path, int]
    artifact_confidence: dict[str, float]
    graph_node_count: int
    graph_edge_count: int

    # --- optional sections (each list/map default is a fresh object) ---
    incidents: list[InvestigationIncident] = field(default_factory=list)
    weaknesses: list[StructuralWeakness] = field(default_factory=list)
    quake_simulation: EarthquakeSimulation | None = None
    architecture: ArchitectureClassification | None = None
    contributors: list[ContributorOwnership] = field(default_factory=list)
    mutations: list[MutationEvent] = field(default_factory=list)
    knowledge_map: KnowledgeMap = field(default_factory=KnowledgeMap)
    containment_zones: list[ContainmentZone] = field(default_factory=list)
    survival: SurvivalProfile | None = None
    observations: list[ForensicObservation] = field(default_factory=list)
218
+
219
+
220
+ @dataclass(slots=True)
221
+ class RepositoryAnalysis:
222
+ summary: ScanSummary
223
+ intelligence: RepositoryIntelligence
224
+
225
+
226
+ def _git_run(root: Path, *args: str) -> str | None:
227
+ try:
228
+ result = subprocess.run(["git", "-C", str(root), *args], capture_output=True, text=True, check=True)
229
+ except (OSError, subprocess.CalledProcessError):
230
+ return None
231
+ return result.stdout.strip() or None
232
+
233
+
234
+ def _python_module_key(root: Path, path: Path) -> str:
235
+ return ".".join(path.relative_to(root).with_suffix("").parts)
236
+
237
+
238
def _package_keys(root: Path, path: Path) -> set[str]:
    """Return every non-empty key under which *path* may be referenced.

    The set holds the file name, the stem, the dotted module key, each proper
    dotted prefix of that key, and the slash-separated form of every dotted entry.
    """
    dotted = _python_module_key(root, path)
    segments = dotted.split(".")
    # Full key first, then each proper prefix (one segment, two segments, ...).
    dotted_forms = [dotted]
    dotted_forms.extend(".".join(segments[:end]) for end in range(1, len(segments)))

    keys = {path.name, path.stem}
    for form in dotted_forms:
        keys.add(form)
        keys.add(form.replace(".", "/"))
    keys.discard("")
    return keys
247
+
248
+
249
+ def _resolve_relative_module(source: Path, target: str, root: Path) -> str | None:
250
+ match = RELATIVE_PREFIX_RE.match(target)
251
+ if not match:
252
+ return None
253
+ dots, remainder = match.groups()
254
+ package_parts = list(source.relative_to(root).parts[:-1])
255
+ for _ in range(max(len(dots) - 1, 0)):
256
+ if package_parts:
257
+ package_parts.pop()
258
+ if remainder:
259
+ package_parts.extend([part for part in remainder.split(".") if part])
260
+ return ".".join(package_parts)
261
+
262
+
263
+ def _extract_python_dependencies(path: Path, content: str, root: Path) -> tuple[set[str], set[str], dict[str, list[str]]]:
264
+ internal: set[str] = set()
265
+ external: set[str] = set()
266
+ class_bases: dict[str, list[str]] = defaultdict(list)
267
+ try:
268
+ tree = ast.parse(content)
269
+ except SyntaxError:
270
+ return internal, external, class_bases
271
+
272
+ for node in ast.walk(tree):
273
+ if isinstance(node, ast.Import):
274
+ for alias in node.names:
275
+ name = alias.name
276
+ if name:
277
+ internal.add(name)
278
+ elif isinstance(node, ast.ImportFrom):
279
+ if node.module:
280
+ module_name = "." * node.level + node.module if node.level else node.module
281
+ else:
282
+ module_name = "." * node.level
283
+ if module_name.startswith("."):
284
+ resolved = _resolve_relative_module(path, module_name, root)
285
+ if resolved:
286
+ internal.add(resolved)
287
+ elif module_name:
288
+ internal.add(module_name)
289
+ elif isinstance(node, ast.ClassDef):
290
+ base_names: list[str] = []
291
+ for base in node.bases:
292
+ if isinstance(base, ast.Name):
293
+ base_names.append(base.id)
294
+ elif isinstance(base, ast.Attribute):
295
+ parts = []
296
+ current = base
297
+ while isinstance(current, ast.Attribute):
298
+ parts.append(current.attr)
299
+ current = current.value
300
+ if isinstance(current, ast.Name):
301
+ parts.append(current.id)
302
+ base_names.append(".".join(reversed(parts)))
303
+ if base_names:
304
+ class_bases[node.name].extend(base_names)
305
+
306
+ imported_modules = set()
307
+ for item in internal:
308
+ imported_modules.add(item.split(".")[0])
309
+ for node in ast.walk(tree):
310
+ if isinstance(node, ast.Import):
311
+ for alias in node.names:
312
+ name = alias.name.split(".")[0]
313
+ if name and name not in sys.stdlib_module_names:
314
+ external.add(name)
315
+ elif isinstance(node, ast.ImportFrom):
316
+ if node.module:
317
+ top = node.module.split(".")[0]
318
+ if top and top not in sys.stdlib_module_names and top not in imported_modules:
319
+ external.add(top)
320
+ return internal, external, class_bases
321
+
322
+
323
def _extract_js_dependencies(content: str) -> tuple[set[str], set[str]]:
    """Collect import specifiers from JavaScript/TypeScript source text.

    Args:
        content: Source text to scan with ``JS_IMPORT_RE``.

    Returns:
        An ``(internal, external)`` tuple.  Specifiers starting with ``.`` or
        ``/`` are returned verbatim in ``internal``.  Every other specifier is
        reduced to its package name and placed in ``external``: the first path
        segment, or the first two segments for scoped packages (``@scope/name``),
        which previously collapsed to just ``@scope``.
    """
    internal: set[str] = set()
    external: set[str] = set()
    for match in JS_IMPORT_RE.finditer(content):
        target = match.group(1) or match.group(2)
        if not target:
            continue
        if target.startswith((".", "/")):
            internal.add(target)
            continue
        segments = target.split("/")
        if target.startswith("@") and len(segments) >= 2:
            # Scoped package: the scope alone does not identify the package.
            external.add("/".join(segments[:2]))
        else:
            external.add(segments[0])
    return internal, external
335
+
336
+
337
def _build_module_index(view: RepoView) -> dict[str, Path]:
    """Map lookup keys to files for every file whose ``path_kind`` is ``"text"``.

    Each file is registered under its dotted relative path (suffix removed),
    the same path with ``/`` separators, and its bare stem.  ``__init__.py``
    files are additionally registered under their directory in both forms.
    When two files share a key, the one later in ``view.files`` wins.
    """
    lookup: dict[str, Path] = {}
    text_files = (candidate for candidate in view.files if path_kind(candidate) == "text")
    for file_path in text_files:
        relative = file_path.relative_to(view.root).with_suffix("")
        lookup.update(
            {
                ".".join(relative.parts): file_path,
                relative.as_posix(): file_path,
                file_path.stem: file_path,
            }
        )
        if file_path.name == "__init__.py":
            package_dir = relative.parent
            lookup[".".join(package_dir.parts)] = file_path
            lookup[package_dir.as_posix()] = file_path
    return lookup
353
+
354
+
355
+ def _resolve_target(source: Path, target: str, root: Path, module_index: dict[str, Path]) -> Path | None:
356
+ raw = target.strip()
357
+ if not raw:
358
+ return None
359
+ if raw.startswith("."):
360
+ relative = _resolve_relative_module(source, raw, root)
361
+ if relative:
362
+ candidate = module_index.get(relative) or module_index.get(relative.replace(".", "/"))
363
+ if candidate:
364
+ return candidate
365
+ for suffix in ("", ".py", ".js", ".ts", ".tsx", ".jsx", ".md"):
366
+ possible = root / relative.replace(".", "/")
367
+ if suffix and not str(possible).endswith(suffix):
368
+ possible = possible.with_suffix(suffix)
369
+ if possible.exists():
370
+ return possible.resolve()
371
+ return None
372
+ cleaned = raw.split(" as ", 1)[0].strip()
373
+ cleaned = cleaned.replace("/", ".")
374
+ if cleaned in module_index:
375
+ return module_index[cleaned]
376
+ parts = cleaned.split(".")
377
+ for index in range(len(parts), 0, -1):
378
+ prefix = ".".join(parts[:index])
379
+ if prefix in module_index:
380
+ return module_index[prefix]
381
+ for suffix in (".py", ".js", ".ts", ".tsx", ".jsx", ".md", "/__init__.py"):
382
+ candidate = root / cleaned.replace(".", "/")
383
+ if suffix == "/__init__.py":
384
+ possible = candidate / "__init__.py"
385
+ else:
386
+ possible = candidate.with_suffix(suffix) if candidate.suffix == "" else candidate
387
+ if possible.exists():
388
+ return possible.resolve()
389
+ return None
390
+
391
+
392
+ def _git_last_active_days(root: Path, path: Path, use_git: bool = True) -> int:
393
+ raw = _git_run(root, "log", "-1", "--format=%ct", "--", str(path.relative_to(root))) if use_git else None
394
+ if not raw:
395
+ try:
396
+ modified = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)
397
+ except OSError:
398
+ return 0
399
+ return max((datetime.now(timezone.utc) - modified).days, 0)
400
+ timestamp = int(raw.splitlines()[0])
401
+ return max((datetime.now(timezone.utc) - datetime.fromtimestamp(timestamp, tz=timezone.utc)).days, 0)
402
+
403
+
404
def _git_last_author(root: Path, path: Path) -> str:
    """Return the author name of the last commit touching *path*, or ``"unknown"``."""
    relative = str(path.relative_to(root))
    author = _git_run(root, "log", "-1", "--format=%an", "--", relative)
    return author if author else "unknown"
407
+
408
+
409
+ def _cycle_dfs(start: Path, graph: dict[Path, set[Path]]) -> list[list[Path]]:
410
+ cycles: list[list[Path]] = []
411
+ stack: list[Path] = []
412
+ seen: set[Path] = set()
413
+
414
+ def visit(node: Path) -> None:
415
+ if node in stack:
416
+ index = stack.index(node)
417
+ cycle = stack[index:] + [node]
418
+ if len(cycle) > 2:
419
+ cycles.append(cycle)
420
+ return
421
+ if node in seen:
422
+ return
423
+ seen.add(node)
424
+ stack.append(node)
425
+ for child in graph.get(node, set()):
426
+ visit(child)
427
+ stack.pop()
428
+
429
+ visit(start)
430
+ return cycles
431
+
432
+
433
+ def _dependency_cycles(graph: dict[Path, set[Path]]) -> list[list[Path]]:
434
+ cycles: list[list[Path]] = []
435
+ for node in graph:
436
+ cycles.extend(_cycle_dfs(node, graph))
437
+ deduped: list[list[Path]] = []
438
+ seen: set[tuple[str, ...]] = set()
439
+ for cycle in cycles:
440
+ names = tuple(str(path) for path in cycle)
441
+ signature = tuple(sorted(names))
442
+ if signature not in seen:
443
+ seen.add(signature)
444
+ deduped.append(cycle)
445
+ return deduped
446
+
447
+
448
+ def _reachable_count(start: Path, graph: dict[Path, set[Path]]) -> int:
449
+ visited: set[Path] = set()
450
+ stack = [start]
451
+ while stack:
452
+ node = stack.pop()
453
+ for child in graph.get(node, set()):
454
+ if child not in visited and child != start:
455
+ visited.add(child)
456
+ stack.append(child)
457
+ return len(visited)
458
+
459
+
460
+ def _longest_chain(start: Path, graph: dict[Path, set[Path]], limit: int = 8) -> list[Path]:
461
+ best: list[Path] = [start]
462
+
463
+ def visit(node: Path, chain: list[Path], seen: set[Path]) -> None:
464
+ nonlocal best
465
+ if len(chain) > len(best):
466
+ best = chain[:]
467
+ if len(chain) >= limit:
468
+ return
469
+ for child in graph.get(node, set()):
470
+ if child in seen:
471
+ continue
472
+ visit(child, chain + [child], seen | {child})
473
+
474
+ visit(start, [start], {start})
475
+ return best
476
+
477
+
478
+ def _cluster_name(paths: list[Path]) -> str:
479
+ lowered = " ".join(path.name.lower() for path in paths)
480
+ if "auth" in lowered:
481
+ return "Legacy Authentication System"
482
+ if "payment" in lowered or "billing" in lowered:
483
+ return "Abandoned Payment Flow"
484
+ if "admin" in lowered:
485
+ return "Forgotten Admin Panel"
486
+ if "api" in lowered or "v1" in lowered or "v2" in lowered:
487
+ return "Legacy API Version"
488
+ return "Lost Subsystem"
489
+
490
+
491
+ def _group_by_top_level(paths: Iterable[Path], root: Path) -> dict[str, list[Path]]:
492
+ groups: dict[str, list[Path]] = defaultdict(list)
493
+ for path in paths:
494
+ rel = path.relative_to(root)
495
+ group = rel.parts[0] if rel.parts else rel.parent.name
496
+ groups[group].append(path)
497
+ return groups
498
+
499
+
500
def _repository_dna(view: RepoView, text_cache: dict[Path, str], health_score: int) -> DNAProfile:
    """Derive an ordered list of trait tags for the repository.

    Tags come from keyword hits in the cached file text and file names, from
    file suffixes and directory names, from the file count, and from
    *health_score* tiers.  Confidence is ``0.65 + health_score / 300`` capped
    at ``0.99`` and rounded to two decimals.
    """
    corpus = "\n".join(text_cache.values()).lower()
    file_names = " ".join(entry.name.lower() for entry in view.files)
    suffixes = [entry.suffix.lower() for entry in view.files]

    tags: list[str] = []

    # Framework tags: a hit in either the text or the file names counts.
    for framework in ("fastapi", "django", "flask"):
        if framework in corpus or framework in file_names:
            tags.append(framework.upper())
    if any(token in corpus for token in ("postgres", "psycopg", "sqlalchemy")):
        tags.append("POSTGRES")

    # Language / layout tags.
    if any(suffix in {".ts", ".tsx"} for suffix in suffixes):
        tags.append("TYPESCRIPT")
    if ".py" in suffixes:
        tags.append("PYTHON")
    if any("test" in entry.parts for entry in view.files):
        tags.append("TEST_HEAVY")
    if any(suffix in {".md", ".rst"} for suffix in suffixes):
        tags.append("DOCUMENTED")

    # Size / shape tag.
    if len(view.files) < 20:
        tags.append("COMPACT")
    elif len({entry.parent for entry in view.files}) > 6:
        tags.append("MODULAR")
    else:
        tags.append("MONOLITHIC")

    # Health-derived tiers.
    if health_score >= 80:
        tags.append("HIGH_MAINTAINABILITY")
    elif health_score >= 60:
        tags.append("MEDIUM_MAINTAINABILITY")
    else:
        tags.append("LOW_MAINTAINABILITY")

    if health_score >= 85:
        tags.append("LOW_COMPLEXITY")
    elif health_score >= 60:
        tags.append("MEDIUM_COMPLEXITY")
    else:
        tags.append("HIGH_COMPLEXITY")

    confidence = min(0.99, 0.65 + (health_score / 300))
    return DNAProfile(signature=tags, confidence=round(confidence, 2))
528
+
529
+
530
def _architecture_classification(view: RepoView, intelligence: "RepositoryIntelligence | None" = None) -> ArchitectureClassification:
    """Guess a primary and secondary architecture label for the repository.

    The primary label comes from keywords in the top-level entry names and
    from repository size.  The secondary label comes from dependency cycles
    (when *intelligence* is given), ``test_`` file names, or service-like
    top-level names.  Confidence is ``0.68`` for "Prototype", else ``0.82``.
    """
    top_level: set[str] = set()
    for entry in view.files:
        segments = entry.relative_to(view.root).parts
        if segments:
            top_level.add(segments[0])
    names = " ".join(sorted(top_level)).lower()

    def mentions(*tokens: str) -> bool:
        return any(token in names for token in tokens)

    if mentions("service", "worker", "queue"):
        primary = "Service Oriented"
    elif mentions("controller", "view", "model", "route", "api"):
        primary = "Layered Monolith"
    elif len(top_level) >= 8 and sum(1 for entry in view.files if entry.suffix.lower() in {".py", ".ts", ".tsx"}) > 20:
        primary = "Modular Monolith"
    else:
        primary = "Prototype"

    if intelligence and intelligence.dependency_cycles:
        secondary = "Event Driven"
    elif any(entry.name.startswith("test_") for entry in view.files):
        secondary = "Domain Driven Design"
    elif any("service" in part.lower() for part in top_level):
        secondary = "Service Oriented"
    else:
        secondary = "Layered"

    confidence = 0.68 if primary == "Prototype" else 0.82
    return ArchitectureClassification(primary=primary, secondary=secondary, confidence=confidence)
552
+
553
+
554
def _contributors(root: Path, ownership: dict[Path, str], dependencies: dict[Path, set[Path]]) -> list[ContributorOwnership]:
    """Summarise, per top-level area, which author owns the most files.

    Areas are returned in sorted order.  ``maintenance_owner`` always equals
    the leading owner; ``abandoned_owner`` is ``"unknown"`` whenever any file
    in the area has an unknown owner.  *dependencies* is accepted but not used.
    """
    tallies: dict[str, Counter[str]] = defaultdict(Counter)
    for file_path, author in ownership.items():
        segments = file_path.relative_to(root).parts
        area = segments[0] if segments else file_path.stem
        tallies[area][author] += 1

    def describe(area: str, counts: Counter[str]) -> ContributorOwnership:
        lead = counts.most_common(1)[0][0]
        return ContributorOwnership(
            area=area,
            owner=lead,
            maintenance_owner=lead,
            abandoned_owner="unknown" if counts.get("unknown", 0) else lead,
        )

    return [describe(area, counts) for area, counts in sorted(tallies.items())]
573
+
574
+
575
def _mutations(root: Path, view: RepoView, git_summary: GitSummary) -> list[MutationEvent]:
    """Report at most one "mutation" event for the repository.

    Without git data, an event dated to the current month is emitted only
    when a file name contains "legacy" or "migration".  With git data, the
    month with the most commit-date lines in the log becomes the event; 20 or
    more such lines mark it as high impact.
    """
    if not git_summary.available:
        file_names = " ".join(entry.name.lower() for entry in view.files)
        if "legacy" not in file_names and "migration" not in file_names:
            return []
        return [
            MutationEvent(
                project_type="CLI",
                became="Hybrid / Transitional System",
                date=datetime.now(timezone.utc).strftime("%Y-%m"),
                impact="Medium",
            )
        ]

    # NOTE(review): ``--name-only`` also prints file names; only lines shaped
    # like ``YYYY-MM`` are counted below -- confirm the flag is intended.
    raw = _git_run(root, "log", "--reverse", "--format=%ad", "--date=format:%Y-%m", "--name-only")
    if not raw:
        return []

    month_line = re.compile(r"^\d{4}-\d{2}$")
    stripped_lines = (line.strip() for line in raw.splitlines())
    wave_counts: Counter[str] = Counter(text for text in stripped_lines if month_line.match(text))
    if not wave_counts:
        return []

    year_month, count = wave_counts.most_common(1)[0]
    is_large_wave = count >= 20
    return [
        MutationEvent(
            project_type="CLI",
            became="Web Platform" if is_large_wave else "Growing Platform",
            date=year_month,
            impact="High" if is_large_wave else "Medium",
        )
    ]
609
+
610
+
611
def _knowledge_map(view: RepoView, dependency_hubs: list[DependencyHub], architecture: ArchitectureClassification) -> KnowledgeMap:
    """Assemble the string lists that make up the repository knowledge map.

    ``core`` holds up to eight sorted top-level entry names,
    ``dependency_graph`` one line for each of the first ten hubs, and the
    route/service lists the names of files whose name contains the matching
    keywords.
    """
    top_level: set[str] = set()
    route_files: list[str] = []
    service_files: list[str] = []
    for entry in view.files:
        segments = entry.relative_to(view.root).parts
        if segments:
            top_level.add(segments[0])
        lowered = entry.name.lower()
        if "route" in lowered or "api" in lowered:
            route_files.append(f"{entry.name}")
        if "service" in lowered or "worker" in lowered:
            service_files.append(f"{entry.name}")

    hub_lines = [f"{hub.path.name} -> {hub.depends_on} deps" for hub in dependency_hubs[:10]]
    return KnowledgeMap(
        core=sorted(top_level)[:8],
        dependency_graph=hub_lines,
        route_graph=route_files,
        service_graph=service_files,
        architecture_graph=[f"{architecture.primary} -> {architecture.secondary}"],
    )
624
+
625
+
626
def _containment_zones(view: RepoView, text_cache: dict[Path, str], dependencies: dict[Path, set[Path]], reverse_dependencies: dict[Path, set[Path]], artifacts: list[Artifact]) -> list[ContainmentZone]:
    """Score every top-level group for complexity and suggest an action.

    A group's raw score adds up, over its files: occurrences of ``"if "``,
    ``"for "`` and ``"while "`` in the cached text, the number of outgoing
    and incoming dependencies, 2 for a monster-file artifact and 1 for a
    duplicate-block artifact.  Groups scoring 0 are omitted; reported scores
    are capped at 100 and the result is sorted from highest to lowest.
    """
    monsters = {artifact.path for artifact in artifacts if artifact.kind == "monster_file"}
    duplicates = {artifact.path for artifact in artifacts if artifact.kind == "duplicate_block"}

    zones: list[ContainmentZone] = []
    for location, members in _group_by_top_level(view.files, view.root).items():
        total = 0
        for member in members:
            text = text_cache.get(member, "")
            total += text.count("if ") + text.count("for ") + text.count("while ")
            total += len(dependencies.get(member, set())) + len(reverse_dependencies.get(member, set()))
            if member in monsters:
                total += 2
            if member in duplicates:
                total += 1
        if total == 0:
            continue

        # Thresholds apply to the uncapped score.
        if total >= 40:
            spread, advice = "Increasing", "Immediate Refactor"
        elif total >= 20:
            spread, advice = "Moderate", "Contain and simplify"
        else:
            spread, advice = "Stable", "Monitor"
        zones.append(
            ContainmentZone(
                location=location,
                complexity=min(100, total),
                spread_rate=spread,
                recommendation=advice,
            )
        )

    zones.sort(key=lambda zone: zone.complexity, reverse=True)
    return zones
657
+
658
+
659
def _survival_score(summary: ScanSummary, intelligence: "RepositoryIntelligence", contributors: list[ContributorOwnership]) -> SurvivalProfile:
    """Combine health, recoverability, onboarding and bus-factor into one score.

    Component scores are clamped to 0..100 (bus factor to 1..5).  The overall
    score is a weighted sum rounded to an integer and mapped to a risk label:
    >= 80 "Low", >= 60 "Moderate", >= 40 "High", otherwise "Critical".
    """

    def clamp(value: int) -> int:
        return max(0, min(100, value))

    cycle_count = len(intelligence.dependency_cycles)
    weakness_count = len(intelligence.weaknesses)

    maintainability = clamp(summary.health_score)
    recoverability = clamp(100 - cycle_count * 8 - len(intelligence.civilizations) * 6 - weakness_count * 4)
    onboarding = clamp(weakness_count * 10 + len(intelligence.dependency_hubs) * 2 + (100 - summary.health_score) // 2)
    known_owners = {item.owner for item in contributors if item.owner != "unknown"}
    bus_factor = max(1, min(5, len(known_owners)))

    # NOTE(review): the bus-factor term tops out at 5 * 5 * 0.15 = 3.75, so the
    # total cannot reach 100 -- confirm the scaling is intended.
    score = round(maintainability * 0.4 + recoverability * 0.25 + (100 - onboarding) * 0.2 + bus_factor * 5 * 0.15)

    if score >= 80:
        risk = "Low"
    elif score >= 60:
        risk = "Moderate"
    elif score >= 40:
        risk = "High"
    else:
        risk = "Critical"

    # Prefer the first weakness, then the first hub, as the single point of failure.
    if intelligence.weaknesses:
        single_point = intelligence.weaknesses[0].path.name
    elif intelligence.dependency_hubs:
        single_point = intelligence.dependency_hubs[0].path.name
    else:
        single_point = "Unknown"

    return SurvivalProfile(
        score=score,
        risk=risk,
        single_point_failure=single_point,
        maintainability=maintainability,
        recoverability=recoverability,
        onboarding_difficulty=onboarding,
        bus_factor=bus_factor,
    )
683
+
684
+
685
def _observations(
    intelligence: "RepositoryIntelligence",
    architecture: ArchitectureClassification,
    investigations: list[InvestigationIncident],
    weaknesses: list[StructuralWeakness],
    civilizations: list[CivilizationCluster],
) -> list[ForensicObservation]:
    """Build the narrative findings for the report, in a fixed order.

    Findings about abandoned clusters, bottlenecks, incidents and cyclic
    dependencies are included only when the matching input is non-empty; the
    architecture finding is always present.
    """
    findings: list[ForensicObservation] = []

    def record(text: str, evidence: list[str]) -> None:
        findings.append(ForensicObservation(observation=text, evidence=evidence))

    if civilizations:
        record(
            f"The repository contains {len(civilizations)} partially abandoned system cluster(s).",
            [civ.name for civ in civilizations[:3]],
        )
    if weaknesses:
        record(
            f"{len(weaknesses)} structural bottleneck(s) concentrate failure risk in a few modules.",
            [str(item.path) for item in weaknesses[:3]],
        )
    if investigations:
        record(
            "Evidence suggests a migration or refactor has occurred without fully retiring the old path.",
            [incident.incident for incident in investigations[:2]],
        )
    record(
        f"The repository most closely resembles a {architecture.primary.lower()}.",
        [architecture.secondary, f"confidence={architecture.confidence:.0%}"],
    )
    if intelligence.dependency_cycles:
        record(
            "Cyclic dependencies indicate architectural pressure and constrained change paths.",
            [f"cycles={len(intelligence.dependency_cycles)}"],
        )
    return findings
728
+
729
+
730
def _classify_personality(
    *,
    health_score: int,
    commit_count: int,
    file_count: int,
    monster_count: int,
    duplicate_count: int,
    ancient_count: int,
    dependency_cycles: int,
    external_packages: int,
) -> PersonalityProfile:
    """Pick the first repository archetype whose rule matches the given metrics.

    Rules are evaluated top to bottom; "Research Lab" is the fallback when
    none applies.  All arguments are keyword-only.
    """
    rules = (
        (
            file_count <= 15 and commit_count < 20,
            "Prototype",
            ["Small surface area", "Fast-moving changes", "Minimal bureaucracy"],
            "Volatile",
        ),
        (
            bool(ancient_count and dependency_cycles and monster_count),
            "Fortress",
            ["Defensive layers", "Legacy defenses", "High inertia"],
            "Accumulated complexity",
        ),
        (
            external_packages > 20 and file_count > 50 and health_score >= 70,
            "Enterprise",
            ["Structured layering", "Many integrations", "Policy driven"],
            "Integration drag",
        ),
        (
            monster_count > 0 and duplicate_count > 0 and health_score < 75,
            "Startup",
            ["Rapid experimentation", "High feature growth", "Moderate organization"],
            "Accumulating technical debt",
        ),
        (
            commit_count > 100 and file_count > 80 and health_score >= 75,
            "Scientist",
            ["Iterative exploration", "Measured evolution", "Strong evidence trail"],
            "Analysis overhead",
        ),
        (
            file_count > 50 and dependency_cycles == 0 and external_packages < 12,
            "Architect",
            ["Clear boundaries", "Intentional structure", "Stable modules"],
            "Rigid change paths",
        ),
        (
            commit_count > 60 and file_count > 40,
            "Explorer",
            ["Rapid experimentation", "High feature growth", "Moderate organization"],
            "Moderate technical debt",
        ),
    )
    for applies, archetype, traits, risk in rules:
        if applies:
            return PersonalityProfile(type=archetype, traits=traits, risk=risk)
    return PersonalityProfile(
        type="Research Lab",
        traits=["Experimental paths", "Multiple branches of thought", "Evolving structure"],
        risk="Discovery overhead",
    )
756
+
757
+
758
def _forecast(health_score: int, dependency_cycles: int, monster_count: int, duplicate_count: int, ancient_count: int) -> ForecastProfile:
    """Project the health score 6 and 12 months ahead from current debt signals.

    ``drift`` weights cycles by 3, monster files by 4, duplicates by 2 and
    ancient files by 1.  Projections subtract the drift plus a fixed 4 (six
    months) or 10 (twelve months) and are clamped to 0..100.
    """
    drift = dependency_cycles * 3 + monster_count * 4 + duplicate_count * 2 + ancient_count
    after_drift = health_score - drift

    def clamp(value: int) -> int:
        return max(0, min(100, value))

    reason = (
        "Increasing dependency growth and structural debt"
        if drift
        else "Stable graph and limited debt signals"
    )
    return ForecastProfile(
        current_health=health_score,
        projected_6_months=clamp(after_drift - 4),
        projected_12_months=clamp(after_drift - 10),
        reason=reason,
    )
772
+
773
+
774
def _timeline_eras(root: Path) -> list[TimelineEra]:
    """Turn the per-year commit counts from ``git log`` into labelled eras.

    Returns an empty list when git yields no output or no year lines.  Years
    are returned in ascending order.  When there are at least two eras and
    the latest has an activity of 6 or less, it is relabelled "Maintenance Era".
    """
    raw = _git_run(root, "log", "--reverse", "--format=%ad", "--date=format:%Y")
    if not raw:
        return []

    stripped_lines = (line.strip() for line in raw.splitlines())
    per_year: Counter[int] = Counter(int(text) for text in stripped_lines if text.isdigit())
    if not per_year:
        return []

    def title_for(activity: int) -> str:
        if activity <= 4:
            return "Foundation Era"
        if activity <= 15:
            return "Expansion Era"
        if activity <= 30:
            return "Growth Era"
        return "Feature Explosion Era"

    eras = [
        TimelineEra(year=year, title=title_for(activity), activity=activity)
        for year, activity in sorted(per_year.items())
    ]
    if len(eras) >= 2 and eras[-1].activity <= 6:
        eras[-1].title = "Maintenance Era"
    return eras
798
+
799
+
800
def _dependency_heatmap(
    view: RepoView,
    text_cache: dict[Path, str],
    dependencies: dict[Path, set[Path]],
    reverse_dependencies: dict[Path, set[Path]],
    artifacts: list[Artifact],
) -> list[HeatmapBucket]:
    """Score each top-level group for technical debt, highest score first.

    The score weights TODO/FIXME occurrences (0.7), monster-file artifacts
    (3), ancient-file artifacts (2), duplicate-block artifacts (2), incoming
    dependencies (0.4) and outgoing dependencies (0.25).  Labels: >= 20
    "Severe", >= 12 "High", >= 6 "Moderate", otherwise "Light".
    """
    # Per-file tally of artifact kinds, so each group only sums its own files.
    kinds_by_path: dict = defaultdict(Counter)
    for artifact in artifacts:
        kinds_by_path[artifact.path][artifact.kind] += 1

    def kind_total(members: list[Path], kind: str) -> int:
        return sum(kinds_by_path[member][kind] for member in members if member in kinds_by_path)

    def label_for(score: float) -> str:
        if score >= 20:
            return "Severe"
        if score >= 12:
            return "High"
        if score >= 6:
            return "Moderate"
        return "Light"

    buckets: list[HeatmapBucket] = []
    for group, members in _group_by_top_level(view.files, view.root).items():
        todo_density = 0
        incoming = 0
        outgoing = 0
        for member in members:
            text = text_cache.get(member, "")
            todo_density += text.count("TODO") + text.count("FIXME")
            incoming += len(reverse_dependencies.get(member, set()))
            outgoing += len(dependencies.get(member, set()))
        monster_weight = kind_total(members, "monster_file")
        ancient_weight = kind_total(members, "ancient_file")
        duplicate_weight = kind_total(members, "duplicate_block")

        score = todo_density * 0.7 + monster_weight * 3 + ancient_weight * 2 + duplicate_weight * 2 + incoming * 0.4 + outgoing * 0.25
        buckets.append(HeatmapBucket(bucket=group, score=round(score, 1), label=label_for(score), files=len(members)))

    buckets.sort(key=lambda bucket: bucket.score, reverse=True)
    return buckets
831
+
832
+
833
def _family_trees(
    view: RepoView,
    root: Path,
    reverse_dependencies: dict[Path, set[Path]],
    dependencies: dict[Path, set[Path]],
    class_bases: dict[Path, dict[str, list[str]]],
) -> list[FamilyTree]:
    """Build one ``FamilyTree`` per top-level group that has at least two files.

    Within a group the file with the most dependents (ties broken by the
    number of its own dependencies) becomes the tree root; the next four
    files become its children.

    Args:
        view: Repository view; ``view.files`` is grouped by top-level entry.
        root: Repository root passed to ``_group_by_top_level``.
        reverse_dependencies: Incoming dependency edges per file.
        dependencies: Outgoing dependency edges per file.
        class_bases: Per file, a mapping of class name to its base-class names.

    Returns:
        The family trees in group iteration order.
    """
    trees: list[FamilyTree] = []
    for group, paths in _group_by_top_level(view.files, root).items():
        if len(paths) < 2:
            continue

        ordered = sorted(
            paths,
            key=lambda path: (
                len(reverse_dependencies.get(path, set())),
                len(dependencies.get(path, set())),
            ),
            reverse=True,
        )
        main = ordered[0]

        # Only report inheritance edges declared inside this family's own
        # files. Iterating every file in the repository here would give each
        # family the same list. Walking ``ordered`` lists the root's classes first.
        inherited: list[str] = []
        for path in ordered:
            for class_name, bases in class_bases.get(path, {}).items():
                inherited.extend(f"{class_name} -> {base}" for base in bases)

        trees.append(
            FamilyTree(
                name=f"{group.title()} Family",
                root=main,
                children=ordered[1:5],
                parent_modules=sorted(reverse_dependencies.get(main, set()), key=str)[:5],
                inherited_classes=inherited[:10],
                major_chains=[],
            )
        )
    return trees
863
+
864
+
865
def _civilizations(
    view: RepoView,
    root: Path,
    reverse_dependencies: dict[Path, set[Path]],
    file_last_active_days: dict[Path, int],
) -> list[CivilizationCluster]:
    """Find top-level groups that look dormant or extinct.

    A group qualifies only when it holds at least three files and its
    staleness figure is at least 120 days. It is "Extinct" when none of its
    files is referenced and the figure is at least 365 days, and "Dormant"
    when at most ``max(1, len(files) // 4)`` files are referenced and the
    figure is at least 180 days.

    Returns:
        Matching clusters, ordered by staleness and then by referenced-file
        count, both descending.
    """
    found: list[CivilizationCluster] = []
    for members in _group_by_top_level(view.files, root).values():
        referenced = sum(1 for member in members if reverse_dependencies.get(member))
        # NOTE(review): this takes the largest per-file value, so one stale
        # file is enough to make the whole group look stale. Confirm that is
        # intended rather than the most recently touched file.
        staleness = max((file_last_active_days.get(member, 0) for member in members), default=0)
        if len(members) < 3 or staleness < 120:
            continue

        is_extinct = referenced == 0 and staleness >= 365
        is_dormant = referenced <= max(1, len(members) // 4) and staleness >= 180
        if not is_extinct and not is_dormant:
            continue

        if is_extinct:
            status, confidence = "Extinct", 0.88
        else:
            status, confidence = "Dormant", 0.76

        found.append(
            CivilizationCluster(
                name=_cluster_name(members),
                files=sorted(members),
                referenced=referenced,
                last_active_days=staleness,
                status=status,
                confidence=confidence,
            )
        )

    found.sort(key=lambda cluster: (cluster.last_active_days, cluster.referenced), reverse=True)
    return found
896
+
897
+
898
def _dependency_hubs(
    view: RepoView,
    dependencies: dict[Path, set[Path]],
    reverse_dependencies: dict[Path, set[Path]],
    external_packages: dict[Path, set[str]],
) -> list[DependencyHub]:
    """Collect the files that act as dependency hubs.

    A non-binary file is reported when it has at least five dependents, at
    least eight dependencies, or any recorded external package. Impact, risk
    and confidence are derived from the two edge counts.

    Returns:
        Hubs ordered by dependents and then dependencies, both descending.
    """
    found: list[DependencyHub] = []
    for candidate in view.files:
        if path_kind(candidate) == "binary":
            continue

        fan_in = len(reverse_dependencies.get(candidate, set()))
        fan_out = len(dependencies.get(candidate, set()))
        packages = sorted(external_packages.get(candidate, set()))
        if fan_in < 5 and fan_out < 8 and not packages:
            continue

        if fan_in >= 20 or fan_out >= 15:
            impact = "Critical"
        elif fan_in >= 8 or fan_out >= 10:
            impact = "High"
        else:
            impact = "Moderate"

        if fan_in >= 10 or fan_out >= 10:
            risk = "High"
        else:
            risk = "Medium"

        found.append(
            DependencyHub(
                path=candidate,
                referenced_by=fan_in,
                depends_on=fan_out,
                # Keep the report compact: at most eight package names.
                external_packages=packages[:8],
                dependency_risk=risk,
                failure_impact=impact,
                confidence=min(0.99, 0.6 + fan_in * 0.015 + fan_out * 0.01),
            )
        )

    found.sort(key=lambda hub: (hub.referenced_by, hub.depends_on), reverse=True)
    return found
926
+
927
+
928
def _dependency_chains(dependencies: dict[Path, set[Path]], hubs: list[DependencyHub]) -> list[list[Path]]:
    """Return the longest dependency chain for each of the first ten hubs.

    Chains come from ``_longest_chain``; those with a single element are dropped.
    """
    return [
        chain
        for hub in hubs[:10]
        if len(chain := _longest_chain(hub.path, dependencies)) > 1
    ]
935
+
936
+
937
def _structural_weaknesses(view: RepoView, dependencies: dict[Path, set[Path]], reverse_dependencies: dict[Path, set[Path]], dependency_hubs: list[DependencyHub]) -> list[StructuralWeakness]:
    """Identify files whose failure would hurt many dependents.

    Two passes feed the result:

    1. Among the first twelve hubs, those with at least eight dependents and
       at most two dependencies of their own.
    2. Any file with at least 25 dependents that the first pass did not
       already report, visited from most to fewest dependents.

    Args:
        view: Unused; kept so existing callers keep working.
        dependencies: Outgoing dependency edges per file.
        reverse_dependencies: Incoming dependency edges per file.
        dependency_hubs: Hubs in the order they should be considered.

    Returns:
        At most ten weaknesses, hub-derived entries first.
    """
    limit = 10
    weaknesses: list[StructuralWeakness] = []
    # Paths already reported, maintained incrementally rather than rebuilt
    # from ``weaknesses`` on every iteration of the second pass.
    flagged: set[Path] = set()

    for hub in dependency_hubs[:12]:
        if hub.referenced_by >= 8 and (hub.depends_on <= 2 or not dependencies.get(hub.path)):
            weaknesses.append(
                StructuralWeakness(
                    path=hub.path,
                    referenced_by=hub.referenced_by,
                    failure_impact="Severe" if hub.referenced_by >= 20 else "High",
                    recovery_difficulty="High" if hub.depends_on > 0 else "Medium",
                    confidence=min(0.99, hub.confidence + 0.1),
                )
            )
            flagged.add(hub.path)

    by_fan_in = sorted(reverse_dependencies.items(), key=lambda item: len(item[1]), reverse=True)
    for path, dependents in by_fan_in:
        # Entries are in descending order, so nothing further can qualify once
        # the threshold is missed. Anything past ``limit`` would be cut anyway.
        if len(dependents) < 25 or len(weaknesses) >= limit:
            break
        if path in flagged:
            continue
        weaknesses.append(
            StructuralWeakness(
                path=path,
                referenced_by=len(dependents),
                failure_impact="Severe" if len(dependents) >= 50 else "High",
                recovery_difficulty="High",
                confidence=0.9,
            )
        )
        flagged.add(path)

    return weaknesses[:limit]
965
+
966
+
967
def _quake_simulation(
    view: RepoView,
    dependencies: dict[Path, set[Path]],
    reverse_dependencies: dict[Path, set[Path]],
    target: str | None = None,
) -> EarthquakeSimulation | None:
    """Estimate how many files are transitively hit when one file fails.

    Args:
        view: Repository view supplying ``files`` and ``root``.
        dependencies: Outgoing dependency edges per file.
        reverse_dependencies: Incoming dependency edges per file.
        target: Optional file to simulate. It is looked up in the module
            index as given and with ``/`` replaced by ``.``, then matched
            against file names and path suffixes. When it is omitted or
            cannot be resolved, the file with the most dependents is used,
            or failing that the file with the most dependencies.

    Returns:
        The simulation result, or ``None`` when no target could be chosen
        because both dependency maps are empty.
    """
    selected: Path | None = None
    if target:
        # The module index is only needed to resolve an explicit target.
        module_index = _build_module_index(view)
        selected = module_index.get(target) or module_index.get(target.replace("/", "."))
        if not selected:
            selected = next(
                (path for path in view.files if path.name == target or str(path).endswith(target)),
                None,
            )

    if selected is None:
        if reverse_dependencies:
            selected = max(reverse_dependencies, key=lambda path: len(reverse_dependencies[path]))
        elif dependencies:
            selected = max(dependencies, key=lambda path: len(dependencies[path]))
        else:
            return None

    # Depth-first walk over dependents. Seeding ``visited`` with the target
    # keeps a dependency cycle from counting the target as its own damage.
    visited: set[Path] = {selected}
    stack = [selected]
    while stack:
        node = stack.pop()
        for dependent in reverse_dependencies.get(node, set()):
            if dependent not in visited:
                visited.add(dependent)
                stack.append(dependent)
    affected = visited - {selected}

    # A subsystem is the first path component below the repository root.
    subsystems: set[str] = set()
    for path in affected:
        parts = path.relative_to(view.root).parts
        if parts:
            subsystems.add(parts[0])

    damage = len(affected)
    if damage >= 25:
        severity = "Catastrophic"
    elif damage >= 10:
        severity = "Severe"
    elif damage >= 4:
        severity = "High"
    else:
        severity = "Moderate"

    return EarthquakeSimulation(
        target=selected,
        projected_damage=damage,
        subsystems_lost=len(subsystems),
        severity=severity,
        affected_files=sorted(affected)[:30],
    )
1008
+
1009
+
1010
def _investigation_incidents(
    view: RepoView,
    text_cache: dict[Path, str],
    dependencies: dict[Path, set[Path]],
    reverse_dependencies: dict[Path, set[Path]],
    git_summary: GitSummary,
) -> list[InvestigationIncident]:
    """Summarize suspicious signals in the repository as a single incident.

    Evidence comes from three sources: substrings of risky calls found in
    file contents, the largest top-level group holding eight or more files,
    and file names containing migration/refactor-style tokens.

    Args:
        view: Repository view supplying ``files`` and ``root``.
        text_cache: File contents keyed by path.
        dependencies: Unused; kept so existing callers keep working.
        reverse_dependencies: Unused; kept so existing callers keep working.
        git_summary: Supplies the last commit date used to date the incident.

    Returns:
        A one-element list: a low-risk "Repository Baseline" entry when no
        evidence was found, otherwise the classified incident with at most
        eight evidence lines.
    """
    # Plain substring matches; a hit does not prove the call is actually unsafe.
    dangerous_patterns = {
        "eval(": "unsafe dynamic evaluation",
        "exec(": "runtime code execution",
        "pickle.load": "unsafe deserialization",
        "yaml.load": "unsafe YAML loading",
        "shell=True": "shell injection surface",
        "os.system(": "shell execution",
        "subprocess.Popen": "process spawning",
    }
    evidence: list[str] = []
    for path, content in text_cache.items():
        for needle, description in dangerous_patterns.items():
            if needle in content:
                evidence.append(f"{path.name}: {description}")

    # Only the most crowded group is reported. ``max`` keeps the first of any
    # tied groups, which is what a stable descending sort would put first.
    crowded = [
        (group, len(paths))
        for group, paths in _group_by_top_level(view.files, view.root).items()
        if len(paths) >= 8
    ]
    file_explosion = max(crowded, key=lambda item: item[1], default=None)
    if file_explosion:
        evidence.append(f"File explosion in {file_explosion[0]} ({file_explosion[1]} files)")

    migration_tokens = ("migration", "refactor", "legacy", "v2", "v3")
    migration_names = [
        path for path in view.files if any(token in path.name.lower() for token in migration_tokens)
    ]
    if migration_names:
        evidence.append(f"{len(migration_names)} migration/refactor-era files detected")

    if not evidence:
        return [
            InvestigationIncident(
                incident="Repository Baseline",
                date=git_summary.last_commit.date().isoformat() if git_summary.last_commit else datetime.now(timezone.utc).date().isoformat(),
                impact=f"{len(view.files)} files scanned",
                outcome="No obvious incident cluster detected",
                risk="Low",
                evidence=["No dangerous patterns observed in source files"],
            )
        ]

    if git_summary.available and git_summary.last_commit:
        incident_date = git_summary.last_commit.date().isoformat()
    else:
        incident_date = datetime.now(timezone.utc).date().isoformat()

    # Evidence is non-empty past this point, so only two outcomes remain.
    if migration_names and file_explosion:
        outcome = "Legacy system partially abandoned"
        risk = "High"
        incident = "Authentication Refactor" if any("auth" in path.name.lower() for path in migration_names) else "Structural Migration"
    else:
        outcome = "Suspicious architectural change patterns detected"
        risk = "Medium"
        incident = "Incident Cluster"

    return [
        InvestigationIncident(
            incident=incident,
            date=incident_date,
            impact=f"{len(evidence)} indicators found",
            outcome=outcome,
            risk=risk,
            evidence=evidence[:8],
        )
    ]
1086
+
1087
+
1088
def build_repository_intelligence(
    root: Path,
    *,
    view: RepoView | None = None,
    text_cache: dict[Path, str] | None = None,
    artifacts: list[Artifact] | None = None,
    health_score: int | None = None,
    git_summary: GitSummary | None = None,
) -> RepositoryIntelligence:
    """Assemble the full ``RepositoryIntelligence`` report for a repository.

    The function builds the internal dependency graph from Python and
    JavaScript/TypeScript imports (plus a regex fallback for other text
    files), records per-file ownership and staleness, and then runs every
    derived analysis on top of that graph.

    Args:
        root: Repository root.
        view: Pre-collected repository view; collected from ``root`` if omitted.
        text_cache: Pre-built text index; built from ``view`` if omitted or empty.
        artifacts: Findings from an earlier scan. ``None`` means "not scanned
            yet" and triggers the analyzers here; a list, even an empty one,
            is used as given.
        health_score: Pre-computed health score; treated as 0 when omitted.
        git_summary: Pre-collected git summary; collected from ``root`` if omitted.

    Returns:
        The populated ``RepositoryIntelligence``.
    """
    view = view or collect_repository(root)
    text_cache = text_cache or build_text_index(view)
    references = build_reference_map(view, text_cache)
    git_summary = git_summary or collect_git_summary(root)

    module_index = _build_module_index(view)
    dependencies: dict[Path, set[Path]] = defaultdict(set)
    reverse_dependencies: dict[Path, set[Path]] = defaultdict(set)
    external_packages: dict[Path, set[str]] = defaultdict(set)
    ownership: dict[Path, str] = {}
    file_last_active_days: dict[Path, int] = {}
    class_bases_by_path: dict[Path, dict[str, list[str]]] = {}

    def link(source: Path, targets: Iterable[str]) -> None:
        """Record an edge for every target that resolves to another repository file."""
        for target in targets:
            resolved = _resolve_target(source, target, root, module_index)
            if resolved and resolved != source:
                dependencies[source].add(resolved)
                reverse_dependencies[resolved].add(source)

    js_suffixes = {".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"}
    for path in view.files:
        if path_kind(path) != "text":
            continue
        content = text_cache.get(path, "")
        suffix = path.suffix.lower()
        if suffix == ".py":
            internal, external, class_bases = _extract_python_dependencies(path, content, root)
            class_bases_by_path[path] = class_bases
            link(path, internal)
            external_packages[path].update(external)
        elif suffix in js_suffixes:
            internal, external = _extract_js_dependencies(content)
            link(path, internal)
            external_packages[path].update(external)
        else:
            # Other text files: pick up Python-style import lines by regex.
            matches = (match.group(1) or match.group(2) for match in PY_IMPORT_RE.finditer(content))
            link(path, (target for target in matches if target))

        if git_summary.available:
            ownership[path] = _git_last_author(root, path)
            file_last_active_days[path] = _git_last_active_days(root, path, use_git=True)
        else:
            ownership[path] = "unknown"
            file_last_active_days[path] = _git_last_active_days(root, path, use_git=False)

    if artifacts is None:
        # No earlier scan was supplied, so run the analyzers here. The
        # reference map built above is reused rather than rebuilt.
        all_artifacts = [
            *todos_to_artifacts(find_todos(view.files)),
            *find_ancient_files(view.files, references),
            *find_dead_code(view.root, view.files, text_cache),
            *find_duplicates(view.files, text_cache),
            *find_monsters(view.files),
            *find_empty_directories(view.directories, view.files),
            *find_unused_assets(view.files, text_cache),
            *find_suspicious(view.files),
        ]
    else:
        all_artifacts = artifacts

    # Count each artifact kind once; a missing kind reads as 0.
    kind_counts = Counter(item.kind for item in all_artifacts)
    monster_count = kind_counts["monster_file"]
    duplicate_count = kind_counts["duplicate_block"]
    ancient_count = kind_counts["ancient_file"]

    def mean_confidence(*kinds: str) -> float:
        """Average confidence of the artifacts of the given kinds (0.0 if none)."""
        scores = [item.confidence or 0 for item in all_artifacts if item.kind in kinds]
        return round(sum(scores) / max(1, len(scores)), 2)

    artifact_confidence = {
        "dead_code": mean_confidence("dead_code_candidate", "unreachable_code"),
        "ancient_file": mean_confidence("ancient_file"),
        "duplicate_block": mean_confidence("duplicate_block"),
    }

    health_score = health_score if health_score is not None else 0
    dependency_hubs = _dependency_hubs(view, dependencies, reverse_dependencies, external_packages)
    dependency_cycles = _dependency_cycles(dependencies)
    dependency_chains = _dependency_chains(dependencies, dependency_hubs)
    genealogy = _family_trees(view, root, reverse_dependencies, dependencies, class_bases_by_path)
    civilizations = _civilizations(view, root, reverse_dependencies, file_last_active_days)
    debt_heatmap = _dependency_heatmap(view, text_cache, dependencies, reverse_dependencies, all_artifacts)
    weaknesses = _structural_weaknesses(view, dependencies, reverse_dependencies, dependency_hubs)
    architecture = _architecture_classification(view)
    personality = _classify_personality(
        health_score=health_score,
        commit_count=git_summary.commit_count,
        file_count=len(view.files),
        monster_count=monster_count,
        duplicate_count=duplicate_count,
        ancient_count=ancient_count,
        dependency_cycles=len(dependency_cycles),
        external_packages=sum(len(values) for values in external_packages.values()),
    )
    forecast = _forecast(
        health_score=health_score,
        dependency_cycles=len(dependency_cycles),
        monster_count=monster_count,
        duplicate_count=duplicate_count,
        ancient_count=ancient_count,
    )
    dna = _repository_dna(view, text_cache, health_score)
    timeline_eras = _timeline_eras(root)
    contributors = _contributors(root, ownership, dependencies)
    quake_simulation = _quake_simulation(view, dependencies, reverse_dependencies)
    mutations = _mutations(root, view, git_summary)

    # In how many files each external package is imported.
    external_package_counts: Counter = Counter()
    for packages in external_packages.values():
        external_package_counts.update(packages)

    # Fields shared by the two interim snapshots and the final result. The
    # snapshots carry the real package counts rather than an empty placeholder.
    shared_fields = dict(
        root=root,
        view=view,
        text_cache=text_cache,
        references=references,
        dependencies=dependencies,
        reverse_dependencies=reverse_dependencies,
        external_packages=external_package_counts,
        dependency_hubs=dependency_hubs,
        dependency_cycles=dependency_cycles,
        dependency_chains=dependency_chains,
        genealogy=genealogy,
        civilizations=civilizations,
        debt_heatmap=debt_heatmap,
        personality=personality,
        forecast=forecast,
        dna=dna,
        timeline_eras=timeline_eras,
        ownership=ownership,
        file_last_active_days=file_last_active_days,
        artifact_confidence=artifact_confidence,
        graph_node_count=len(view.files),
        graph_edge_count=sum(len(values) for values in dependencies.values()),
        weaknesses=weaknesses,
        architecture=architecture,
    )

    survival = _survival_score(
        ScanSummary(
            root=root,
            scanned_at=datetime.now(timezone.utc),
            total_files=len(view.files),
            artifact_count=len(all_artifacts),
            ancient_count=ancient_count,
            todo_count=kind_counts["todo"],
            duplicate_count=duplicate_count,
            dead_code_count=kind_counts["dead_code_candidate"] + kind_counts["unreachable_code"],
            monster_count=monster_count,
            ruin_count=kind_counts["empty_directory"] + kind_counts["unused_asset"],
            suspicious_count=kind_counts["suspicious"],
            technical_debt_estimate=0.0,
            health_score=health_score,
            health_status="",
        ),
        intelligence=RepositoryIntelligence(**shared_fields),
        contributors=contributors,
    )
    knowledge_map = _knowledge_map(view, dependency_hubs, architecture)
    investigations = _investigation_incidents(view, text_cache, dependencies, reverse_dependencies, git_summary)
    observations = _observations(
        RepositoryIntelligence(**shared_fields, contributors=contributors, survival=survival),
        architecture,
        investigations,
        weaknesses,
        civilizations,
    )

    return RepositoryIntelligence(
        **shared_fields,
        incidents=investigations,
        quake_simulation=quake_simulation,
        contributors=contributors,
        mutations=mutations,
        knowledge_map=knowledge_map,
        containment_zones=_containment_zones(view, text_cache, dependencies, reverse_dependencies, all_artifacts),
        survival=survival,
        observations=observations,
    )
1319
+
1320
+
1321
def _rows_from_attrs(items, *attrs: str) -> list[dict]:
    """Serialize each item into a dict of the named attributes, in the given order."""
    return [{attr: getattr(item, attr) for attr in attrs} for item in items]


def analyze_repository(root: Path) -> RepositoryAnalysis:
    """Run the full scan of ``root`` and package it as a ``RepositoryAnalysis``.

    The function collects the repository, runs every artifact analyzer,
    scores health, builds the repository intelligence, derives remediation
    findings, and flattens everything into a ``ScanSummary`` whose
    ``timeline`` and ``extra`` mappings hold plain serializable values.

    Args:
        root: Repository root to analyze.

    Returns:
        The analysis, made of the scan summary and the intelligence object.
    """
    view = collect_repository(root)
    text_cache = build_text_index(view)
    references = build_reference_map(view, text_cache)

    todo_artifacts = todos_to_artifacts(find_todos(view.files))
    ancient = find_ancient_files(view.files, references)
    dead_code = find_dead_code(view.root, view.files, text_cache)
    duplicates = find_duplicates(view.files, text_cache)
    monsters = find_monsters(view.files)
    ruins = find_empty_directories(view.directories, view.files) + find_unused_assets(view.files, text_cache)
    suspicious = find_suspicious(view.files)

    artifacts: list[Artifact] = [
        *todo_artifacts,
        *ancient,
        *dead_code,
        *duplicates,
        *monsters,
        *ruins,
        *suspicious,
    ]

    health = calculate_health(
        total_files=len(view.files),
        dead_code_count=len(dead_code),
        duplicate_count=len(duplicates),
        ancient_count=len(ancient),
        todo_count=len(todo_artifacts),
        monster_count=len(monsters),
        ruin_count=len(ruins),
        suspicious_count=len(suspicious),
    )

    warnings = list(health.warnings)
    if not artifacts:
        warnings.append("No major artifacts detected")
    if not view.files:
        warnings.append("Repository appears empty")

    git_summary = collect_git_summary(root)
    intelligence = build_repository_intelligence(root, view=view, text_cache=text_cache, artifacts=artifacts, health_score=health.score, git_summary=git_summary)

    # Core fields shared by the provisional summary handed to the remediation
    # analyzer and the final summary, so both carry the same counts and timestamp.
    summary_fields = dict(
        root=view.root,
        scanned_at=datetime.now(timezone.utc),
        total_files=len(view.files),
        artifact_count=len(artifacts),
        ancient_count=len(ancient),
        todo_count=len(todo_artifacts),
        duplicate_count=len(duplicates),
        dead_code_count=len(dead_code),
        monster_count=len(monsters),
        ruin_count=len(ruins),
        suspicious_count=len(suspicious),
        technical_debt_estimate=health.debt_estimate,
        health_score=health.score,
        health_status=health.status,
    )

    from ..analyzers import maintenance

    remediation_findings = maintenance.remediation_findings(
        RepositoryAnalysis(summary=ScanSummary(**summary_fields), intelligence=intelligence)
    )

    architecture = intelligence.architecture
    survival = intelligence.survival
    quake = intelligence.quake_simulation
    knowledge_map = intelligence.knowledge_map

    summary = ScanSummary(
        **summary_fields,
        warnings=warnings,
        artifacts=artifacts,
        timeline={
            "available": bool(git_summary.available),
            "commit_count": git_summary.commit_count,
            "repository_age_days": git_summary.repository_age_days,
            "repository_age_years": round(git_summary.repository_age_days / 365, 1) if git_summary.repository_age_days else 0,
            "first_commit": git_summary.first_commit.isoformat() if git_summary.first_commit else None,
            "last_commit": git_summary.last_commit.isoformat() if git_summary.last_commit else None,
            "most_modified_files": git_summary.most_modified_files,
            "eras": _rows_from_attrs(intelligence.timeline_eras, "year", "title", "activity"),
        },
        extra={
            "similarity_pairs": similarity_report(view.files, text_cache),
            "dna": {"signature": intelligence.dna.signature, "confidence": intelligence.dna.confidence},
            "personality": {"type": intelligence.personality.type, "traits": intelligence.personality.traits, "risk": intelligence.personality.risk},
            "artifact_confidence": intelligence.artifact_confidence,
            # Fallback values apply when no architecture classification exists.
            "architecture": {
                "primary": architecture.primary if architecture else "Prototype",
                "secondary": architecture.secondary if architecture else "Layered",
                "confidence": architecture.confidence if architecture else 0.68,
            },
            "forecast": {
                "current_health": intelligence.forecast.current_health,
                "projected_6_months": intelligence.forecast.projected_6_months,
                "projected_12_months": intelligence.forecast.projected_12_months,
                "reason": intelligence.forecast.reason,
            },
            "investigation": _rows_from_attrs(
                intelligence.incidents, "incident", "date", "impact", "outcome", "risk", "evidence"
            ),
            "weaknesses": [
                {
                    "path": str(item.path),
                    "referenced_by": item.referenced_by,
                    "failure_impact": item.failure_impact,
                    "recovery_difficulty": item.recovery_difficulty,
                    "confidence": item.confidence,
                }
                for item in intelligence.weaknesses
            ],
            "quake": None
            if quake is None
            else {
                "target": str(quake.target),
                "projected_damage": quake.projected_damage,
                "subsystems_lost": quake.subsystems_lost,
                "severity": quake.severity,
                "affected_files": [str(path) for path in quake.affected_files],
            },
            "dependency_hubs": [
                {
                    "path": str(hub.path),
                    "referenced_by": hub.referenced_by,
                    "depends_on": hub.depends_on,
                    "external_packages": hub.external_packages,
                    "dependency_risk": hub.dependency_risk,
                    "failure_impact": hub.failure_impact,
                    "confidence": hub.confidence,
                }
                for hub in intelligence.dependency_hubs
            ],
            "civilizations": [
                {
                    "name": civ.name,
                    "files": [str(path) for path in civ.files],
                    "referenced": civ.referenced,
                    "last_active_days": civ.last_active_days,
                    "status": civ.status,
                    "confidence": civ.confidence,
                }
                for civ in intelligence.civilizations
            ],
            "contributors": _rows_from_attrs(
                intelligence.contributors, "area", "owner", "maintenance_owner", "abandoned_owner"
            ),
            "mutations": _rows_from_attrs(
                intelligence.mutations, "project_type", "became", "date", "impact"
            ),
            "knowledge_map": {
                "core": knowledge_map.core,
                "dependency_graph": knowledge_map.dependency_graph,
                "route_graph": knowledge_map.route_graph,
                "service_graph": knowledge_map.service_graph,
                "architecture_graph": knowledge_map.architecture_graph,
            },
            "containment_zones": _rows_from_attrs(
                intelligence.containment_zones, "location", "complexity", "spread_rate", "recommendation"
            ),
            # Fallback values apply when no survival assessment exists.
            "survival": {
                "score": survival.score if survival else 0,
                "risk": survival.risk if survival else "Unknown",
                "single_point_failure": survival.single_point_failure if survival else "Unknown",
                "maintainability": survival.maintainability if survival else 0,
                "recoverability": survival.recoverability if survival else 0,
                "onboarding_difficulty": survival.onboarding_difficulty if survival else 0,
                "bus_factor": survival.bus_factor if survival else 0,
            },
            "observations": _rows_from_attrs(intelligence.observations, "observation", "evidence"),
            "debt_heatmap": _rows_from_attrs(
                intelligence.debt_heatmap, "bucket", "score", "label", "files"
            ),
            "graph": {
                "nodes": intelligence.graph_node_count,
                "edges": intelligence.graph_edge_count,
            },
            # The report keeps at most the first 100 remediation findings.
            "remediation": _rows_from_attrs(
                remediation_findings[:100],
                "problem",
                "evidence",
                "impact",
                "confidence",
                "recommended_fix",
                "estimated_effort",
                "risk_level",
                "root_cause",
                "likely_consequences",
                "alternative_solution",
                "implementation_difficulty",
                "location",
            ),
        },
    )
    return RepositoryAnalysis(summary=summary, intelligence=intelligence)