patchwork-conventions 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patchwork/__init__.py +10 -0
- patchwork/cli.py +336 -0
- patchwork/mcp/__init__.py +1 -0
- patchwork/mcp/server.py +442 -0
- patchwork/miners/__init__.py +1 -0
- patchwork/miners/api_patterns.py +204 -0
- patchwork/miners/ast_base.py +113 -0
- patchwork/miners/config_detector.py +273 -0
- patchwork/miners/error_handling.py +207 -0
- patchwork/miners/git_patterns.py +169 -0
- patchwork/miners/imports.py +158 -0
- patchwork/miners/naming.py +277 -0
- patchwork/miners/structure.py +204 -0
- patchwork/miners/testing.py +204 -0
- patchwork/output/__init__.py +1 -0
- patchwork/output/report.py +417 -0
- patchwork/scanner.py +162 -0
- patchwork_conventions-0.1.0.dist-info/METADATA +393 -0
- patchwork_conventions-0.1.0.dist-info/RECORD +23 -0
- patchwork_conventions-0.1.0.dist-info/WHEEL +5 -0
- patchwork_conventions-0.1.0.dist-info/entry_points.txt +2 -0
- patchwork_conventions-0.1.0.dist-info/licenses/LICENSE +21 -0
- patchwork_conventions-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GitPatternMiner — Mines git history for workflow conventions:
|
|
3
|
+
- Commit message style (conventional commits / semantic / free-form)
|
|
4
|
+
- Branch naming convention
|
|
5
|
+
- PR/merge frequency
|
|
6
|
+
- Average commit size (files changed)
|
|
7
|
+
- Co-change pairs (files that always change together)
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import subprocess
|
|
13
|
+
from collections import Counter, defaultdict
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class GitResult:
    """Workflow conventions mined from a repository's git history.

    Instances are built by ``GitPatternMiner.mine``. Field order is part of the
    generated positional constructor.
    """

    # Commit-message style label; ``mine`` emits 'conventional commits',
    # 'semantic prefixes' or 'free-form'.
    commit_style: str | None
    # A few sampled commit subjects (``mine`` keeps the first five).
    commit_examples: list[str]
    # Branch-naming label; ``_detect_branch_style`` emits
    # 'feature/name + fix/name', 'JIRA-123 ticket keys' or 'free-form',
    # and None when there are no branches to inspect.
    branch_style: str | None
    # Mean number of files changed per sampled commit, rounded to one decimal.
    avg_files_per_commit: float
    # Number of commit subjects the style detection was based on.
    total_commits_sampled: int
    # (fileA, fileB, count) — files that changed in the same commit `count` times.
    cochange_pairs: list[tuple[str, str, int]]
    # Free-text guidance derived from the findings.
    notes: list[str] = field(default_factory=list)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_CONVENTIONAL_RE = re.compile(
|
|
30
|
+
r'^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\(.+\))?!?:\s+.+'
|
|
31
|
+
)
|
|
32
|
+
_SEMANTIC_RE = re.compile(
|
|
33
|
+
r'^(add|update|remove|fix|change|bump|improve|rename|move|delete|merge)\s+',
|
|
34
|
+
re.IGNORECASE
|
|
35
|
+
)
|
|
36
|
+
_BRANCH_FEATURE = re.compile(r'^(feature|feat)/[\w/-]+$')
|
|
37
|
+
_BRANCH_JIRA = re.compile(r'^[A-Z]+-\d+')
|
|
38
|
+
_BRANCH_FIX = re.compile(r'^(fix|hotfix|bugfix)/[\w/-]+$')
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _run_git(args: list[str], cwd: Path, max_bytes: int = 500_000) -> str:
|
|
42
|
+
try:
|
|
43
|
+
result = subprocess.run(
|
|
44
|
+
["git"] + args,
|
|
45
|
+
cwd=str(cwd),
|
|
46
|
+
capture_output=True,
|
|
47
|
+
timeout=10,
|
|
48
|
+
)
|
|
49
|
+
return result.stdout[:max_bytes].decode("utf-8", errors="replace")
|
|
50
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
51
|
+
return ""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _is_git_repo(root: Path) -> bool:
|
|
55
|
+
return (root / ".git").exists()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class GitPatternMiner:
    """Mines a repository's git history for commit, branch and co-change conventions."""

    def __init__(self, root: Path):
        # Directory that is expected to contain the repository's ``.git`` entry.
        self.root = root

    @staticmethod
    def _branch_name(ref: str) -> str | None:
        """Reduce a ref printed by ``git branch`` to a bare branch name.

        Returns None for entries that are not real branches (a remote's symbolic
        ``HEAD`` and the "(HEAD detached at ...)" pseudo-entry).
        """
        local_prefix = "refs/heads/"
        remote_prefix = "refs/remotes/"
        if ref.startswith(local_prefix):
            return ref[len(local_prefix):]
        if ref.startswith(remote_prefix):
            # refs/remotes/<remote>/<branch> -> <branch>
            _remote, _sep, branch = ref[len(remote_prefix):].partition("/")
            if not branch or branch == "HEAD":
                return None
            return branch
        if ref.startswith("("):
            return None
        return ref

    def mine(self) -> GitResult | None:
        """Inspect recent history and summarise the conventions in use.

        Returns:
            A ``GitResult``, or None when ``self.root`` has no ``.git`` entry or
            no commit subjects could be read.
        """
        if not _is_git_repo(self.root):
            return None

        # Sample last 200 commit messages (subject lines only).
        log_out = _run_git(
            ["log", "--oneline", "-200", "--pretty=format:%s"],
            self.root,
        )
        messages = [m.strip() for m in log_out.splitlines() if m.strip()]
        if not messages:
            return None

        # Commit style detection: share of subjects matching each pattern.
        total = len(messages)
        conventional = sum(1 for m in messages if _CONVENTIONAL_RE.match(m))
        semantic = sum(1 for m in messages if _SEMANTIC_RE.match(m))

        if conventional / total > 0.5:
            style = "conventional commits"
        elif semantic / total > 0.4:
            style = "semantic prefixes"
        else:
            style = "free-form"

        examples = messages[:5]

        # Branch names. Full ref names are requested so that the remote part of
        # remote-tracking branches ("origin/feature/x") can be removed reliably;
        # otherwise the anchored branch patterns never match them. A branch that
        # exists both locally and on a remote is counted once.
        branches_out = _run_git(
            ["branch", "-a", "--format=%(refname)"],
            self.root,
        )
        seen: dict[str, None] = {}
        for raw in branches_out.splitlines():
            ref = raw.strip()
            if not ref:
                continue
            name = self._branch_name(ref)
            if name:
                seen.setdefault(name, None)
        branches = list(seen)
        branch_style = _detect_branch_style(branches)

        # Average files per commit, read from the "--stat" summary lines.
        diff_stat = _run_git(
            ["log", "--oneline", "-100", "--stat"],
            self.root,
        )
        file_counts = [int(n) for n in re.findall(r'(\d+) files? changed', diff_stat)]
        avg_files = round(sum(file_counts) / len(file_counts), 1) if file_counts else 0.0

        # Co-change analysis (files that often change together).
        co_change = _detect_cochange(self.root)

        notes: list[str] = []
        if style == "conventional commits":
            notes.append(
                "Uses Conventional Commits — always prefix messages with type(scope): description"
            )

        return GitResult(
            commit_style=style,
            commit_examples=examples,
            branch_style=branch_style,
            avg_files_per_commit=avg_files,
            total_commits_sampled=total,
            cochange_pairs=co_change[:5],
            notes=notes,
        )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _detect_branch_style(branches: list[str]) -> str | None:
    """Label the dominant branch-naming scheme among *branches*.

    Returns:
        None for an empty list; otherwise 'feature/name + fix/name' when more
        than 40% of the names match the feature/fix prefix patterns,
        'JIRA-123 ticket keys' when more than 40% start with a ticket key,
        and 'free-form' in every other case.
    """
    if not branches:
        return None

    prefixed = 0  # names matching the feature/feat or fix/hotfix/bugfix patterns
    ticketed = 0  # names starting with a ticket key
    for name in branches:
        if _BRANCH_FEATURE.match(name):
            prefixed += 1
        if _BRANCH_FIX.match(name):
            prefixed += 1
        if _BRANCH_JIRA.match(name):
            ticketed += 1

    count = len(branches)
    # The prefix scheme is checked first, so it wins if both exceed the threshold.
    if prefixed / count > 0.4:
        return "feature/name + fix/name"
    if ticketed / count > 0.4:
        return "JIRA-123 ticket keys"
    return "free-form"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _detect_cochange(root: Path) -> list[tuple[str, str, int]]:
    """Find file pairs that change together frequently.

    Reads the file lists of the last 100 commits (output capped at 200,000
    bytes) and counts, for every unordered pair of files, how many commits
    touched both.

    Args:
        root: Working directory in which ``git log`` is executed.

    Returns:
        Up to ten ``(fileA, fileB, count)`` tuples taken from the most common
        pairs, keeping only pairs seen in at least two commits. Within a pair
        the two paths are in sorted order.
    """
    log_out = _run_git(
        ["log", "--name-only", "--pretty=format:--COMMIT--", "-100"],
        root,
        max_bytes=200_000,
    )

    # Split the stream into one file list per commit, using the marker line
    # that the custom pretty-format prints at the start of each commit.
    commits: list[list[str]] = []
    current: list[str] = []
    for line in log_out.splitlines():
        if line == "--COMMIT--":
            if current:
                commits.append(current)
            current = []
            continue
        # With --name-only every other non-blank line is a path. No diff headers
        # are printed, so paths must not be filtered by a "diff" prefix (that
        # would silently drop files such as "diff_utils.py").
        changed = line.strip()
        if changed:
            current.append(changed)
    if current:
        commits.append(current)

    pair_counts: Counter[tuple[str, str]] = Counter()
    for commit_files in commits:
        # Sorting makes (a, b) and (b, a) land on the same key.
        files = sorted(set(commit_files))
        for i, f1 in enumerate(files):
            for f2 in files[i + 1:]:
                pair_counts[(f1, f2)] += 1

    return [(a, b, count) for (a, b), count in pair_counts.most_common(10) if count >= 2]
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ImportMiner — Detects import style conventions:
|
|
3
|
+
- Absolute vs relative imports
|
|
4
|
+
- Path alias usage (e.g. @/, ~/, src/)
|
|
5
|
+
- Import grouping (stdlib / third-party / local)
|
|
6
|
+
- Barrel file patterns (index.ts re-exports)
|
|
7
|
+
- Destructuring vs namespace imports
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ImportResult:
    """Import-style findings for one language.

    Field order is part of the generated positional constructor.
    """

    # Overall label: 'absolute' | 'relative' | 'mixed'.
    style: str
    # Fraction of the counted imports that were relative (0.0 - 1.0).
    relative_confidence: float
    # Path-alias prefixes seen in imports, e.g. ['@/', 'src/'].
    aliases_used: list[str]
    # 'grouped' | 'ungrouped'; the detectors in this module always pass None.
    grouping: str | None
    # 'destructuring' | 'namespace' | 'mixed'; None when nothing was counted.
    destructuring: str | None
    # Index files recognised as barrel (re-export) modules.
    barrel_files: list[str]
    # Most frequently imported standard-library modules.
    common_stdlib: list[str]
    # Most frequently imported packages not classified as standard library.
    common_third_party: list[str]
    # Free-text remarks.
    notes: list[str] = field(default_factory=list)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
_PY_RELATIVE = re.compile(r'^\s*from\s+\.', re.MULTILINE)
|
|
31
|
+
_PY_ABSOLUTE = re.compile(r'^\s*(?:import|from)\s+(?!\.)', re.MULTILINE)
|
|
32
|
+
_PY_IMPORT = re.compile(r'^\s*(?:from\s+([\w.]+)\s+import|import\s+([\w.,\s]+))', re.MULTILINE)
|
|
33
|
+
|
|
34
|
+
_JS_RELATIVE = re.compile(r"""(?:import|require)\s*\(?['"](\./|\.\./)""", re.MULTILINE)
|
|
35
|
+
_JS_ABSOLUTE_ALIAS = re.compile(r"""(?:import|require)\s*\(?['"](@\w+/|~/)""", re.MULTILINE)
|
|
36
|
+
_JS_IMPORT_FROM = re.compile(r"""import\s+(?:\{[^}]+\}|\*\s+as\s+\w+|\w+)\s+from\s+['"]([^'"]+)['"]""")
|
|
37
|
+
_JS_DESTRUCTURE = re.compile(r"""import\s+\{[^}]+\}\s+from""")
|
|
38
|
+
_JS_NAMESPACE = re.compile(r"""import\s+\*\s+as\s+\w+\s+from""")
|
|
39
|
+
_JS_SIDE_EFFECT = re.compile(r"""import\s+['"][^'"]+['"]""")
|
|
40
|
+
|
|
41
|
+
_STDLIB_PY = {
|
|
42
|
+
"os", "sys", "re", "json", "pathlib", "typing", "dataclasses",
|
|
43
|
+
"collections", "itertools", "functools", "io", "time", "datetime",
|
|
44
|
+
"logging", "unittest", "asyncio", "threading", "subprocess",
|
|
45
|
+
"hashlib", "base64", "copy", "math", "random",
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _detect_py_imports(paths: list[Path]) -> ImportResult:
    """Summarise the import conventions of Python source files.

    Only the first 200 paths are read; unreadable files are skipped.

    Args:
        paths: Python source files to inspect.

    Returns:
        An ``ImportResult`` whose ``style`` is 'relative' when more than 60% of
        the counted imports are relative, 'mixed' above 20%, else 'absolute'.
        ``grouping``, ``destructuring`` and ``barrel_files`` are not populated
        for Python.
    """
    import sys  # local import: only used for the standard-library inventory below

    # sys.stdlib_module_names exists on Python 3.10+; on older interpreters only
    # the curated set is used.
    stdlib_names = _STDLIB_PY | set(getattr(sys, "stdlib_module_names", ()))

    relative_count = 0
    absolute_count = 0
    all_modules: list[str] = []
    aliases: set[str] = set()

    for path in paths[:200]:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        relative_count += len(_PY_RELATIVE.findall(text))
        absolute_count += len(_PY_ABSOLUTE.findall(text))

        # Match line by line so a single match can never extend over several
        # statements.
        for line in text.splitlines():
            m = _PY_IMPORT.match(line)
            if m is None:
                continue
            if m.group(1):
                targets = [m.group(1)]  # from <module> import ...
            else:
                targets = (m.group(2) or "").split(",")  # import a, b as c
            for target in targets:
                tokens = target.split()
                if not tokens:
                    continue
                # First token drops any "as alias"; first dotted part is the
                # top-level package. Relative targets (".x") yield "" and are skipped.
                mod = tokens[0].split(".")[0]
                if not mod:
                    continue
                all_modules.append(mod)
                # Importing through a top-level "src" package is reported as a
                # "src/" path alias.
                if mod == "src":
                    aliases.add("src/")

    total = relative_count + absolute_count
    rel_conf = relative_count / total if total else 0.0
    style = "relative" if rel_conf > 0.6 else ("mixed" if rel_conf > 0.2 else "absolute")

    ranked = [name for name, _ in Counter(all_modules).most_common(30)]
    stdlib = [name for name in ranked if name in stdlib_names][:5]
    third_party = [name for name in ranked if name not in stdlib_names][:8]

    return ImportResult(
        style=style,
        relative_confidence=round(rel_conf, 2),
        aliases_used=list(aliases),
        grouping=None,
        destructuring=None,
        barrel_files=[],
        common_stdlib=stdlib,
        common_third_party=third_party,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _detect_js_imports(paths: list[Path], lang: str) -> ImportResult:
    """Summarise the import conventions of JavaScript/TypeScript source files.

    Only the first 200 paths are read; unreadable files are skipped.

    Args:
        paths: Source files to inspect.
        lang: Language label of *paths*; barrel files are only collected when
            it is 'typescript'.

    Returns:
        An ``ImportResult``. ``style`` compares relative specifiers with alias
        specifiers: 'relative' above 70% relative, 'mixed' above 30%, else
        'absolute' (the ratio defaults to 0.5 when neither kind was seen).
        ``grouping`` and ``common_stdlib`` are not populated.
    """
    relative_count = 0
    alias_count = 0
    alias_prefixes: Counter[str] = Counter()
    destructure_count = 0
    namespace_count = 0
    all_packages: list[str] = []
    barrel_files: list[str] = []

    for path in paths[:200]:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        relative_count += len(_JS_RELATIVE.findall(text))
        found_aliases = _JS_ABSOLUTE_ALIAS.findall(text)
        alias_count += len(found_aliases)
        alias_prefixes.update(found_aliases)

        destructure_count += len(_JS_DESTRUCTURE.findall(text))
        namespace_count += len(_JS_NAMESPACE.findall(text))

        for m in _JS_IMPORT_FROM.finditer(text):
            pkg = m.group(1)
            # Relative paths and alias/scoped specifiers are not package names.
            if not pkg.startswith((".", "@", "~/")):
                all_packages.append(pkg.split("/")[0])

        # Barrel file detection: an index module that re-exports from elsewhere.
        if path.stem == "index" and lang == "typescript":
            if "export" in text and "from" in text:
                # Record the path, not just the file name, so that several
                # index files stay distinguishable.
                barrel_files.append(path.as_posix())

    total = relative_count + alias_count
    rel_conf = relative_count / total if total else 0.5
    style = "relative" if rel_conf > 0.7 else ("mixed" if rel_conf > 0.3 else "absolute")

    aliases = [a for a, _ in alias_prefixes.most_common(5)]

    destr = None
    bindings_total = destructure_count + namespace_count
    if bindings_total > 0:
        ratio = destructure_count / bindings_total
        destr = "destructuring" if ratio > 0.7 else ("namespace" if ratio < 0.3 else "mixed")

    third_party = [m for m, _ in Counter(all_packages).most_common(10)]

    return ImportResult(
        style=style,
        relative_confidence=round(rel_conf, 2),
        aliases_used=aliases,
        grouping=None,
        destructuring=destr,
        barrel_files=barrel_files[:5],
        common_stdlib=[],
        common_third_party=third_party,
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class ImportMiner:
    """Runs the matching import detector for each supported language."""

    def mine(self, by_lang: dict[str, list[Path]]) -> dict[str, ImportResult]:
        """Return an ``ImportResult`` per supported language in *by_lang*.

        'python' is handled by ``_detect_py_imports``; 'javascript' and
        'typescript' by ``_detect_js_imports``. Other languages are left out of
        the returned mapping.
        """
        detected: dict[str, ImportResult] = {}
        for language, files in by_lang.items():
            if language == "python":
                detected[language] = _detect_py_imports(files)
                continue
            if language in ("javascript", "typescript"):
                detected[language] = _detect_js_imports(files, language)
        return detected
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NamingMiner — Detects naming conventions across functions, classes, variables,
|
|
3
|
+
constants, files. Uses tree-sitter AST when available; falls back to regex.
|
|
4
|
+
|
|
5
|
+
Detects:
|
|
6
|
+
- camelCase / PascalCase / snake_case / SCREAMING_SNAKE / kebab-case
|
|
7
|
+
- Consistency score per category
|
|
8
|
+
- Language-specific override rules (e.g. Go unexported = lowercase)
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from collections import Counter, defaultdict
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from patchwork.miners.ast_base import parse_file, walk, node_text
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class NamingConvention:
    """Dominant naming style found for one category of identifiers."""

    # 'snake_case' | 'camelCase' | 'PascalCase' | 'SCREAMING_SNAKE' | 'kebab-case' | 'mixed'
    style: str
    # Share of the inspected names that follow `style`, 0.0 - 1.0.
    confidence: float
    # Sample names that follow `style`.
    examples: list[str] = field(default_factory=list)
    # Sample names that do not follow `style`.
    counter_examples: list[str] = field(default_factory=list)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class NamingResult:
    """Naming conventions detected for one language; every field is optional."""

    # Convention per identifier category; None when nothing was found for it.
    functions: NamingConvention | None = None
    classes: NamingConvention | None = None
    variables: NamingConvention | None = None
    constants: NamingConvention | None = None
    files: NamingConvention | None = None
    # Prefix used for private names, e.g. '_' or '__'.
    private_prefix: str | None = None
    # Prefix used for test functions, e.g. 'test_' or 'Test'.
    test_prefix: str | None = None
    # Free-text remarks.
    notes: list[str] = field(default_factory=list)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Naming style detection ────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
_RE_SNAKE = re.compile(r'^[a-z][a-z0-9]*(_[a-z0-9]+)*$')
|
|
43
|
+
_RE_CAMEL = re.compile(r'^[a-z][a-zA-Z0-9]*$')
|
|
44
|
+
_RE_PASCAL = re.compile(r'^[A-Z][a-zA-Z0-9]*$')
|
|
45
|
+
_RE_SCREAMING = re.compile(r'^[A-Z][A-Z0-9]*(_[A-Z0-9]+)*$')
|
|
46
|
+
_RE_KEBAB = re.compile(r'^[a-z][a-z0-9]*(-[a-z0-9]+)*$')
|
|
47
|
+
_RE_PRIVATE_SINGLE = re.compile(r'^_[a-z]')
|
|
48
|
+
_RE_PRIVATE_DOUBLE = re.compile(r'^__[a-z]')
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _classify(name: str) -> str:
    """Return the naming-style label of *name*.

    The patterns are tried in a fixed order and the first match wins, so a name
    that fits several styles (e.g. a single lower-case word) gets the earliest
    label. Names that fit none of them are labelled 'mixed'.
    """
    ordered_checks = (
        (_RE_SCREAMING, "SCREAMING_SNAKE"),
        (_RE_SNAKE, "snake_case"),
        (_RE_CAMEL, "camelCase"),
        (_RE_PASCAL, "PascalCase"),
        (_RE_KEBAB, "kebab-case"),
    )
    for pattern, label in ordered_checks:
        if pattern.match(name):
            return label
    return "mixed"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _majority_convention(names: list[str]) -> NamingConvention:
    """Determine the most common naming style among *names*.

    Returns:
        A ``NamingConvention`` holding the winning style, its share of *names*
        rounded to two decimals, up to five names that follow it and up to three
        that do not. An empty input yields style 'mixed' with confidence 0.0.
    """
    if not names:
        return NamingConvention(style="mixed", confidence=0.0)

    # Classify every name once and reuse the label for counting and sampling.
    labelled = [(name, _classify(name)) for name in names]
    tally: Counter[str] = Counter(label for _, label in labelled)
    top_style, top_count = tally.most_common(1)[0]

    followers = [name for name, label in labelled if label == top_style]
    outliers = [name for name, label in labelled if label != top_style]

    return NamingConvention(
        style=top_style,
        confidence=round(top_count / len(names), 2),
        examples=followers[:5],
        counter_examples=outliers[:3],
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Language-specific AST extraction ─────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
_FUNCTION_TYPES = {
|
|
84
|
+
"python": ("function_definition", "async_function_definition"),
|
|
85
|
+
"javascript": ("function_declaration", "arrow_function", "method_definition"),
|
|
86
|
+
"typescript": ("function_declaration", "arrow_function", "method_definition"),
|
|
87
|
+
"go": ("function_declaration", "method_declaration"),
|
|
88
|
+
"rust": ("function_item",),
|
|
89
|
+
"java": ("method_declaration", "constructor_declaration"),
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
_CLASS_TYPES = {
|
|
93
|
+
"python": ("class_definition",),
|
|
94
|
+
"javascript": ("class_declaration",),
|
|
95
|
+
"typescript": ("class_declaration",),
|
|
96
|
+
"go": ("type_declaration",),
|
|
97
|
+
"rust": ("struct_item", "enum_item", "trait_item"),
|
|
98
|
+
"java": ("class_declaration", "interface_declaration", "enum_declaration"),
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
_VAR_TYPES = {
|
|
102
|
+
"python": ("assignment",),
|
|
103
|
+
"javascript": ("variable_declarator",),
|
|
104
|
+
"typescript": ("variable_declarator",),
|
|
105
|
+
"go": ("short_var_decl", "var_spec"),
|
|
106
|
+
"rust": ("let_declaration",),
|
|
107
|
+
"java": ("variable_declarator",),
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _extract_function_names(root, source: bytes, lang: str) -> list[str]:
    """Collect the names of function-like nodes found under *root*.

    A node counts when its type is listed for *lang* in ``_FUNCTION_TYPES`` and
    it has a ``name`` field; languages without an entry yield an empty list.
    """
    wanted = _FUNCTION_TYPES.get(lang, ())
    found = []
    for candidate in walk(root):
        if candidate.type not in wanted:
            continue
        identifier = candidate.child_by_field_name("name")
        if not identifier:
            continue  # no name field on this node
        found.append(node_text(identifier, source))
    return found
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _extract_class_names(root, source: bytes, lang: str) -> list[str]:
    """Collect the names of class-like nodes found under *root*.

    A node counts when its type is listed for *lang* in ``_CLASS_TYPES`` and it
    has a ``name`` field; languages without an entry yield an empty list.
    """
    wanted = _CLASS_TYPES.get(lang, ())
    found = []
    for candidate in walk(root):
        if candidate.type not in wanted:
            continue
        identifier = candidate.child_by_field_name("name")
        if not identifier:
            continue  # no name field on this node
        found.append(node_text(identifier, source))
    return found
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _extract_var_names(root, source: bytes, lang: str) -> list[str]:
    """Collect variable names declared under *root*.

    A node counts when its type is listed for *lang* in ``_VAR_TYPES``. The
    declared name is read from the node's ``name`` field; when that field is
    absent, the ``left`` and ``pattern`` fields are tried, but only a plain
    identifier is accepted there.

    Single-character names and names starting with an underscore are skipped.

    Returns:
        At most 200 names, in traversal order.
    """
    types = _VAR_TYPES.get(lang, ())
    names = []
    for node in walk(root):
        if node.type not in types:
            continue
        name_node = node.child_by_field_name("name")
        if not name_node:
            # NOTE(review): per the tree-sitter grammars, Python `assignment`
            # exposes its target as `left` and Rust `let_declaration` as
            # `pattern` — confirm against the grammar versions in use.
            # Targets that are not a bare identifier (attributes, subscripts,
            # tuple patterns) are not variable names and are ignored.
            for fallback_field in ("left", "pattern"):
                candidate = node.child_by_field_name(fallback_field)
                if candidate and candidate.type == "identifier":
                    name_node = candidate
                    break
        if not name_node:
            continue
        t = node_text(name_node, source)
        if len(t) > 1 and not t.startswith("_"):
            names.append(t)
    return names[:200]  # cap to avoid noise
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# ── Regex fallback ────────────────────────────────────────────────────────────
|
|
147
|
+
|
|
148
|
+
_FALLBACK_PATTERNS = {
|
|
149
|
+
"python": {
|
|
150
|
+
"functions": re.compile(r'^def\s+([A-Za-z_][A-Za-z0-9_]*)', re.MULTILINE),
|
|
151
|
+
"classes": re.compile(r'^class\s+([A-Za-z_][A-Za-z0-9_]*)', re.MULTILINE),
|
|
152
|
+
"constants": re.compile(r'^([A-Z][A-Z0-9_]{2,})\s*=', re.MULTILINE),
|
|
153
|
+
},
|
|
154
|
+
"javascript": {
|
|
155
|
+
"functions": re.compile(r'(?:function\s+|const\s+|let\s+|var\s+)([A-Za-z_$][A-Za-z0-9_$]*)\s*(?:=\s*(?:async\s+)?(?:function|\()|{|\()'),
|
|
156
|
+
"classes": re.compile(r'class\s+([A-Za-z_$][A-Za-z0-9_$]*)'),
|
|
157
|
+
"constants": re.compile(r'const\s+([A-Z_]{2,}[A-Z0-9_]*)\s*='),
|
|
158
|
+
},
|
|
159
|
+
"typescript": {
|
|
160
|
+
"functions": re.compile(r'(?:function\s+|const\s+|let\s+|async\s+function\s+)([A-Za-z_$][A-Za-z0-9_$]*)'),
|
|
161
|
+
"classes": re.compile(r'class\s+([A-Za-z_$][A-Za-z0-9_$]*)'),
|
|
162
|
+
"constants": re.compile(r'const\s+([A-Z_]{2,}[A-Z0-9_]*)\s*='),
|
|
163
|
+
},
|
|
164
|
+
"go": {
|
|
165
|
+
"functions": re.compile(r'^func\s+(?:\([^)]+\)\s+)?([A-Za-z][A-Za-z0-9]*)', re.MULTILINE),
|
|
166
|
+
"classes": re.compile(r'^type\s+([A-Za-z][A-Za-z0-9]*)\s+struct', re.MULTILINE),
|
|
167
|
+
"constants": re.compile(r'const\s+([A-Z][A-Za-z0-9]*)\s', re.MULTILINE),
|
|
168
|
+
},
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _regex_mine(paths: list[Path], lang: str) -> dict[str, list[str]]:
    """Extract names from source files using the regex fallback patterns.

    Only the first 100 paths are read; unreadable files are skipped.

    Returns:
        A mapping from category (the keys of the *lang* entry in
        ``_FALLBACK_PATTERNS``) to every name the pattern captured. A language
        without fallback patterns yields an empty mapping.
    """
    lang_patterns = _FALLBACK_PATTERNS.get(lang, {})
    collected: dict[str, list[str]] = defaultdict(list)
    for source_file in paths[:100]:
        try:
            contents = source_file.read_text(errors="replace")
        except OSError:
            continue
        for kind in lang_patterns:
            hits = lang_patterns[kind].findall(contents)
            collected[kind].extend(hits)
    return collected
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ── Public miner ─────────────────────────────────────────────────────────────
|
|
186
|
+
|
|
187
|
+
class NamingMiner:
    """Mines identifier and file naming conventions, one result per language."""

    def mine(self, by_lang: dict[str, list[Path]]) -> dict[str, NamingResult]:
        """Return a NamingResult per language.

        For each language the first 150 files are read. A file is analysed via
        its syntax tree when ``parse_bytes`` returns one, otherwise via the
        regex fallback (which yields no variable names). Unreadable files are
        skipped.

        Args:
            by_lang: Source files grouped by language label.

        Returns:
            A mapping from language label to its ``NamingResult``. A category is
            None when no names remain for it after filtering.
        """
        # parse_bytes is not among the names this module imports from ast_base at
        # the top, so it is imported here — once per call, not once per file.
        from patchwork.miners.ast_base import parse_bytes

        results: dict[str, NamingResult] = {}

        for lang, paths in by_lang.items():
            all_funcs: list[str] = []
            all_classes: list[str] = []
            all_vars: list[str] = []
            all_consts: list[str] = []

            for path in paths[:150]:  # cap for performance
                try:
                    source = path.read_bytes()
                except OSError:
                    continue

                root = parse_bytes(source, lang)

                if root is not None:
                    all_funcs.extend(_extract_function_names(root, source, lang))
                    all_classes.extend(_extract_class_names(root, source, lang))
                    file_vars = _extract_var_names(root, source, lang)
                    all_vars.extend(file_vars)
                    # Constants are the SCREAMING_SNAKE variable names. Only this
                    # file's names are scanned; earlier files were handled already.
                    all_consts.extend(n for n in file_vars if _RE_SCREAMING.match(n))
                else:
                    # Regex fallback
                    mined = _regex_mine([path], lang)
                    all_funcs.extend(mined.get("functions", []))
                    all_classes.extend(mined.get("classes", []))
                    all_consts.extend(mined.get("constants", []))

            # Deduplicate while keeping first-seen order.
            all_funcs = list(dict.fromkeys(all_funcs))
            all_classes = list(dict.fromkeys(all_classes))
            all_vars = list(dict.fromkeys(all_vars))
            all_consts = list(dict.fromkeys(all_consts))

            # Detect private naming convention (Python only): a double-underscore
            # prefix (excluding dunder methods) takes precedence over a single one.
            private_prefix = None
            if lang == "python":
                if any(f.startswith("__") and not f.endswith("__") for f in all_funcs):
                    private_prefix = "__"
                elif any(f.startswith("_") and not f.startswith("__") for f in all_funcs):
                    private_prefix = "_"

            # Detect test prefix; 'test_' takes precedence over 'Test'.
            test_prefix = None
            if any(f.startswith("test_") for f in all_funcs):
                test_prefix = "test_"
            elif any(f.startswith("Test") for f in all_funcs):
                test_prefix = "Test"

            notes = []
            if lang == "go":
                has_exported = any(f[:1].isupper() for f in all_funcs)
                has_unexported = any(f[:1].islower() for f in all_funcs)
                if has_exported and has_unexported:
                    notes.append(
                        "Go convention: PascalCase for exported identifiers, camelCase for unexported"
                    )

            # Filter before deciding whether a category has data, so an empty
            # filtered list is reported as None instead of a 0.0-confidence
            # "mixed" convention.
            public_funcs = [f for f in all_funcs if not f.startswith("_")]
            plain_vars = [v for v in all_vars if not _RE_SCREAMING.match(v)]

            results[lang] = NamingResult(
                functions=_majority_convention(public_funcs) if public_funcs else None,
                classes=_majority_convention(all_classes) if all_classes else None,
                variables=_majority_convention(plain_vars) if plain_vars else None,
                constants=_majority_convention(all_consts) if all_consts else None,
                files=_file_naming(paths),
                private_prefix=private_prefix,
                test_prefix=test_prefix,
                notes=notes,
            )

        return results
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _file_naming(paths: list[Path]) -> NamingConvention:
    """Determine the naming style of the file stems in *paths*.

    Empty stems and the stems ``__init__``, ``index`` and ``main`` are left out
    before the majority style is computed.
    """
    ignored = ("__init__", "index", "main")
    stems = []
    for candidate in paths:
        stem = candidate.stem
        if stem and stem not in ignored:
            stems.append(stem)
    return _majority_convention(stems)
|