devarch 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
devarch/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """Dev Archaeologist package."""
2
+
3
+ from .version import __version__
4
+
devarch/__main__.py ADDED
@@ -0,0 +1,4 @@
1
# Module executed when the package is run as a script (``python -m devarch``):
# import the ``app`` callable exposed by the CLI module and invoke it.
from .cli.main import app

app()
4
+
@@ -0,0 +1,2 @@
1
+ """Static analyzers used by Dev Archaeologist."""
2
+
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+
7
+ from ..models import Artifact
8
+ from ..utils.fs import path_kind, safe_stat
9
+
10
+
11
+ @dataclass(slots=True)
12
+ class AncientStats:
13
+ total: int
14
+ unreferenced: int
15
+
16
+
17
def file_age_days(path: Path) -> int:
    """Return how many whole days ago *path* was last modified.

    The modification time comes from ``path.stat()`` and is compared with the
    current time, both expressed as timezone-aware UTC datetimes.  A
    modification time that lies in the future yields ``0`` instead of a
    negative age.

    Raises:
        OSError: propagated from ``path.stat()`` when the file cannot be
            inspected.
    """
    mtime = path.stat().st_mtime
    last_modified = datetime.fromtimestamp(mtime, tz=timezone.utc)
    elapsed = datetime.now(timezone.utc) - last_modified
    # Clamp negative ages (clock skew, future timestamps) to zero.
    return elapsed.days if elapsed.days > 0 else 0
20
+
21
+
22
def find_ancient_files(
    files: list[Path],
    references: dict[Path, set[Path]],
    threshold_days: int = 365,
    unreferenced_threshold_days: int = 180,
    high_risk_days: int = 730,
) -> list[Artifact]:
    """Report non-binary files that have not been modified for a long time.

    A file is reported when its age reaches ``threshold_days``, or when it
    reaches ``unreferenced_threshold_days`` and has no recorded references.

    Args:
        files: Candidate paths; those for which ``path_kind`` returns
            ``"binary"`` are ignored.
        references: Mapping consulted per path; a path counts as referenced
            only when it maps to a non-empty set.
        threshold_days: Age (in days) at which any file is reported.
        unreferenced_threshold_days: Lower age at which an unreferenced file
            is already reported.
        high_risk_days: Age at which a referenced file is rated ``"High"``
            instead of ``"Medium"``.  Unreferenced files are always ``"High"``.

    Returns:
        One ``ancient_file`` artifact per reported path, in input order.
    """
    artifacts: list[Artifact] = []
    for path in files:
        if path_kind(path) == "binary":
            continue
        try:
            age = file_age_days(path)
        except OSError:
            # A broken symlink or a file removed mid-scan cannot be dated;
            # skip it instead of aborting the whole scan.
            continue

        referenced = bool(references.get(path))
        is_stale = age >= threshold_days or (
            age >= unreferenced_threshold_days and not referenced
        )
        if not is_stale:
            continue

        artifacts.append(
            Artifact(
                path=path,
                kind="ancient_file",
                risk="High" if age >= high_risk_days or not referenced else "Medium",
                age_days=age,
                # Passed through unchanged from the project's safe_stat helper.
                size_bytes=safe_stat(path),
                detail="Referenced" if referenced else "Unreferenced",
                confidence=0.7 if referenced else 0.84,
            )
        )
    return artifacts
48
+
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+ from pathlib import Path
6
+
7
+ from ..models import Artifact
8
+ from ..utils.fs import path_kind, read_text
9
+
10
+
11
# Matches an import statement at the start of a line (leading whitespace
# allowed).  Group 1 captures the target of ``from X import``; group 2 the
# first target of ``import X``.  Targets may contain dots, including leading
# dots of relative imports.
PY_IMPORT_RE = re.compile(r"^\s*(?:from\s+([\w.]+)\s+import|import\s+([\w.]+))", re.MULTILINE)
# Suffixes that find_dead_code accepts when checking files other than ``.py``
# for missing references; anything else is skipped there.
CODE_EXTENSIONS = {".py", ".pyi", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"}
13
+ def _module_name(path: Path, root: Path) -> str:
14
+ rel = path.relative_to(root).with_suffix("")
15
+ return ".".join(rel.parts)
16
+
17
+
18
+ def _has_unreachable_code(content: str) -> bool:
19
+ try:
20
+ tree = ast.parse(content)
21
+ except SyntaxError:
22
+ return False
23
+
24
+ terminal_types = (ast.Return, ast.Raise, ast.Break, ast.Continue)
25
+ for node in ast.walk(tree):
26
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
27
+ terminated = False
28
+ for stmt in node.body:
29
+ if terminated:
30
+ return True
31
+ terminated = isinstance(stmt, terminal_types)
32
+ return False
33
+
34
+
35
+ def find_dead_code(root: Path, files: list[Path], text_cache: dict[Path, str]) -> list[Artifact]:
36
+ artifacts: list[Artifact] = []
37
+ text_files = [path for path in files if path_kind(path) == "text"]
38
+
39
+ imported_modules: set[str] = set()
40
+ for path in text_files:
41
+ content = text_cache.get(path, "")
42
+ for match in PY_IMPORT_RE.finditer(content):
43
+ target = match.group(1) or match.group(2)
44
+ if target:
45
+ imported_modules.add(target.lstrip(".").replace(".", "/"))
46
+
47
+ for path in text_files:
48
+ content = text_cache.get(path, "")
49
+ module = _module_name(path, root)
50
+ module_path = module.replace(".", "/")
51
+ if path.suffix.lower() == ".py":
52
+ if "tests" in path.parts or path.name.startswith("test_") or path.name == "conftest.py":
53
+ continue
54
+ if path.name in {"__init__.py", "__main__.py"}:
55
+ continue
56
+ if not any(module_path.endswith(name) or name.endswith(module_path) for name in imported_modules):
57
+ if "if __name__ == \"__main__\"" not in content and "if __name__ == '__main__'" not in content:
58
+ artifacts.append(
59
+ Artifact(
60
+ path=path,
61
+ kind="dead_code_candidate",
62
+ risk="Medium",
63
+ detail="Module is not referenced by obvious imports",
64
+ confidence=0.62,
65
+ )
66
+ )
67
+
68
+ if _has_unreachable_code(content):
69
+ artifacts.append(
70
+ Artifact(
71
+ path=path,
72
+ kind="unreachable_code",
73
+ risk="Low",
74
+ detail="Function body contains statements after a terminal statement",
75
+ confidence=0.72,
76
+ )
77
+ )
78
+ else:
79
+ if path.suffix.lower() not in CODE_EXTENSIONS:
80
+ continue
81
+ if not any(module_path.endswith(name) or name.endswith(module_path) for name in imported_modules):
82
+ artifacts.append(
83
+ Artifact(
84
+ path=path,
85
+ kind="dead_code_candidate",
86
+ risk="Low",
87
+ detail="File is not referenced by obvious imports",
88
+ confidence=0.5,
89
+ )
90
+ )
91
+
92
+ return artifacts
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ import re
7
+
8
+ from ..models import Artifact
9
+ from ..utils.fs import path_kind, read_text
10
+
11
+
12
def normalize_text(text: str) -> str:
    """Reduce source text to a canonical, comment-free, lower-case form.

    The substitutions run in a fixed order: triple-quoted strings, ``#`` line
    comments, ``//`` line comments and ``/* ... */`` block comments are
    removed, every whitespace run (newlines included) becomes a single space,
    and each standalone integer literal is replaced by ``0``.  The result is
    stripped and lower-cased.

    The removal is purely textual, so ``#`` or ``//`` inside a string or URL
    is treated as the start of a comment too.
    """
    # (pattern, replacement, flags) -- applied strictly in this order.
    substitutions = (
        (r"\"\"\".*?\"\"\"|'''.*?'''", "", re.S),
        (r"#.*$", "", re.M),
        (r"//.*$", "", re.M),
        (r"/\*.*?\*/", "", re.S),
        (r"\s+", " ", 0),
        (r"\b\d+\b", "0", 0),
    )
    result = text
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip().lower()
20
+
21
+
22
def tokenize_blocks(text: str, block_size: int = 12) -> list[str]:
    """Split *text* into overlapping windows of ``block_size`` non-blank lines.

    Lines are stripped and blank ones discarded.  When more than
    ``block_size`` lines remain, one newline-joined block is produced per
    sliding-window position (step 1).  Otherwise the remaining lines form a
    single block, or no block at all when nothing remains.
    """
    stripped = [candidate for candidate in (raw.strip() for raw in text.splitlines()) if candidate]

    if len(stripped) > block_size:
        window_count = len(stripped) - block_size + 1
        return [
            "\n".join(stripped[start : start + block_size])
            for start in range(window_count)
        ]

    if stripped:
        return ["\n".join(stripped)]
    return []
30
+
31
+
32
def find_duplicates(files: list[Path], text_cache: dict[Path, str]) -> list[Artifact]:
    """Report pairs of text files that share an identical normalised block.

    Each file for which ``path_kind`` returns ``"text"`` is normalised with
    ``normalize_text`` and cut into blocks with ``tokenize_blocks``; blocks of
    fewer than 10 whitespace-separated words are ignored.  Every pair of
    distinct files sharing a block yields one ``duplicate_block`` artifact.

    NOTE(review): ``normalize_text`` collapses all whitespace, newlines
    included, into single spaces, so ``tokenize_blocks`` receives a single
    line and returns the whole file as one block.  In effect only files that
    are identical after normalisation are matched -- confirm this is intended.

    Args:
        files: Paths to compare.
        text_cache: File contents keyed by path; missing entries are treated
            as empty text.

    Returns:
        The artifacts found; ``path`` is the first file of the pair and
        ``metadata["match_path"]`` the second.
    """
    signatures: dict[str, list[Path]] = defaultdict(list)
    for path in files:
        if path_kind(path) != "text":
            continue
        content = normalize_text(text_cache.get(path, ""))
        if not content:
            continue
        for block in tokenize_blocks(content):
            if len(block.split()) < 10:
                continue
            signatures[block].append(path)

    artifacts: list[Artifact] = []
    for paths in signatures.values():
        # The same path can be recorded twice for one block (repeated entry in
        # ``files`` or a repeated window).  De-duplicate, keeping first-seen
        # order, so a file is never reported as a duplicate of itself.  Blocks
        # are unique dict keys, so every (left, right, block) is emitted once.
        unique_paths = list(dict.fromkeys(paths))
        if len(unique_paths) < 2:
            continue
        for idx, left in enumerate(unique_paths):
            for right in unique_paths[idx + 1 :]:
                artifacts.append(
                    Artifact(
                        path=left,
                        kind="duplicate_block",
                        risk="Medium",
                        detail=f"Similar to {right}",
                        score=0.85,
                        confidence=0.85,
                        metadata={"match_path": str(right)},
                    )
                )
    return artifacts
68
+
69
+
70
def similarity_report(files: list[Path], text_cache: dict[Path, str]) -> list[dict[str, object]]:
    """List pairs of text files whose identifier vocabularies largely overlap.

    For each file that ``path_kind`` classifies as ``"text"``, the set of
    identifier-like tokens (two or more characters) is extracted from its
    normalised content.  Every pair of files is scored with the Jaccard index
    of their token sets, expressed as a percentage rounded to one decimal;
    pairs scoring at least 65 are reported.

    Returns:
        Dictionaries with ``"left"``, ``"right"`` (paths as strings) and
        ``"similarity"``, ordered from most to least similar.
    """
    token_sets: dict[Path, set[str]] = {}
    for candidate in files:
        if path_kind(candidate) != "text":
            continue
        normalized_text = normalize_text(text_cache.get(candidate, ""))
        words = set(re.findall(r"[a-zA-Z_][a-zA-Z0-9_]+", normalized_text))
        if words:
            token_sets[candidate] = words

    ordered = list(token_sets.items())
    rows: list[dict[str, object]] = []
    for position, (left, left_tokens) in enumerate(ordered):
        for right, right_tokens in ordered[position + 1 :]:
            # Only non-empty sets are stored, so the union is never empty.
            shared = len(left_tokens & right_tokens)
            combined = len(left_tokens | right_tokens)
            similarity = round((shared / combined) * 100, 1)
            if similarity < 65:
                continue
            rows.append(
                {
                    "left": str(left),
                    "right": str(right),
                    "similarity": similarity,
                }
            )

    rows.sort(key=lambda row: row["similarity"], reverse=True)
    return rows
101
+
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
+ @dataclass(slots=True)
7
+ class HealthMetrics:
8
+ score: int
9
+ status: str
10
+ warnings: list[str]
11
+ debt_estimate: float
12
+
13
+
14
def calculate_health(
    *,
    total_files: int,
    dead_code_count: int,
    duplicate_count: int,
    ancient_count: int,
    todo_count: int,
    monster_count: int,
    ruin_count: int,
    suspicious_count: int,
) -> HealthMetrics:
    """Turn finding counts into a 0-100 health score, a status and warnings.

    Each count is multiplied by a fixed weight and the products are summed
    into a debt figure; a non-zero ``total_files`` adds ``total_files / 250``
    capped at 10.  The score is ``100 - debt`` rounded and clamped to 0..100.

    Status labels: >= 85 "Healthy", >= 65 "Moderate debt",
    >= 45 "Debt detected", otherwise "Critical".

    A warning is emitted for every non-zero count except ``todo_count``,
    which only contributes to the debt.
    """
    counts = (
        dead_code_count,
        duplicate_count,
        ancient_count,
        todo_count,
        monster_count,
        ruin_count,
        suspicious_count,
    )
    weights = (2.0, 1.5, 1.2, 0.35, 2.5, 0.8, 0.6)

    # Accumulate left to right so the floating-point result matches a plain
    # chained addition of the weighted terms.
    terms = [count * weight for count, weight in zip(counts, weights)]
    debt = terms[0]
    for term in terms[1:]:
        debt += term

    if total_files:
        debt += min(total_files / 250.0, 10.0)

    raw_score = int(round(100 - debt))
    score = max(0, min(100, raw_score))

    for floor, label in ((85, "Healthy"), (65, "Moderate debt"), (45, "Debt detected")):
        if score >= floor:
            status = label
            break
    else:
        status = "Critical"

    warning_sources = (
        (dead_code_count, "Dead code candidates detected"),
        (duplicate_count, "Duplicate implementations found"),
        (ancient_count, "Ancient files appear abandoned"),
        (monster_count, "Monster files need review"),
        (ruin_count, "Empty structures or unused assets found"),
        (suspicious_count, "Suspicious filenames found"),
    )
    warnings: list[str] = [message for count, message in warning_sources if count]

    return HealthMetrics(score=score, status=status, warnings=warnings, debt_estimate=debt)
60
+