faultlines 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faultline/__init__.py +1 -0
- faultline/analyzer/__init__.py +0 -0
- faultline/analyzer/ast_extractor.py +354 -0
- faultline/analyzer/cochange_detector.py +198 -0
- faultline/analyzer/coverage.py +66 -0
- faultline/analyzer/evolve.py +664 -0
- faultline/analyzer/features.py +1330 -0
- faultline/analyzer/git.py +251 -0
- faultline/analyzer/import_graph.py +711 -0
- faultline/analyzer/incremental.py +201 -0
- faultline/analyzer/repo_classifier.py +355 -0
- faultline/analyzer/shared_files.py +121 -0
- faultline/analyzer/validation.py +232 -0
- faultline/analyzer/workspace.py +372 -0
- faultline/cli.py +1518 -0
- faultline/digest/__init__.py +2 -0
- faultline/digest/__main__.py +4 -0
- faultline/digest/cli.py +89 -0
- faultline/digest/git_reader.py +160 -0
- faultline/digest/summarizer.py +250 -0
- faultline/integrations/__init__.py +13 -0
- faultline/integrations/base.py +213 -0
- faultline/integrations/posthog_provider.py +191 -0
- faultline/integrations/sentry_provider.py +171 -0
- faultline/llm/__init__.py +0 -0
- faultline/llm/cost.py +280 -0
- faultline/llm/deepseek_client.py +148 -0
- faultline/llm/detector.py +4222 -0
- faultline/llm/flow_detector.py +933 -0
- faultline/llm/pipeline.py +201 -0
- faultline/llm/sonnet_scanner.py +1557 -0
- faultline/models/__init__.py +0 -0
- faultline/models/types.py +98 -0
- faultline/output/__init__.py +0 -0
- faultline/output/reporter.py +245 -0
- faultline/output/writer.py +38 -0
- faultlines-0.1.0.dist-info/METADATA +287 -0
- faultlines-0.1.0.dist-info/RECORD +41 -0
- faultlines-0.1.0.dist-info/WHEEL +4 -0
- faultlines-0.1.0.dist-info/entry_points.txt +2 -0
- faultlines-0.1.0.dist-info/licenses/LICENSE +183 -0
faultline/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
File without changes
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Regex-based signature extractor for TypeScript and JavaScript files.
|
|
3
|
+
|
|
4
|
+
Extracts exports, route definitions, and imports from each file
|
|
5
|
+
without any external AST dependencies. This "skeleton" is then
|
|
6
|
+
fed to an LLM to identify user-facing flows within each feature.
|
|
7
|
+
|
|
8
|
+
Supported patterns:
|
|
9
|
+
- Named exports: export function Foo / export const Foo / export class Foo
|
|
10
|
+
- Default exports: export default function Foo / export default class Foo
|
|
11
|
+
- Re-exports: export { Foo, Bar }
|
|
12
|
+
- Next.js routes: export async function GET/POST/PUT/DELETE/PATCH (App Router)
|
|
13
|
+
- Next.js pages: getServerSideProps, getStaticProps (Pages Router)
|
|
14
|
+
- Express routes: router.get('/path', ...) / app.post('/path', ...)
|
|
15
|
+
- ES imports: import X from 'Y'
|
|
16
|
+
"""
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from faultline.models.types import SymbolRange
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# File extensions handled by the TS/JS parser vs. the Python parser.
_TS_JS_EXTENSIONS = {".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs"}
_PYTHON_EXTENSIONS = {".py"}

# Named function/class/const exports
# (also matches generator functions: "export function* foo").
_RE_NAMED_EXPORT = re.compile(
    r"export\s+(?:async\s+)?(?:function\s*\*?\s*|class\s+|const\s+|let\s+|var\s+)(\w+)"
)
# Default function/class exports with a name
# (anonymous default exports intentionally don't match — there is no symbol to record).
_RE_DEFAULT_EXPORT = re.compile(
    r"export\s+default\s+(?:async\s+)?(?:function|class)\s+(\w+)"
)
# Re-export block: export { Foo, Bar as Baz }
_RE_REEXPORT = re.compile(r"export\s*\{([^}]+)\}")

# Next.js App Router HTTP method handlers
_RE_NEXTJS_ROUTE = re.compile(
    r"export\s+(?:async\s+)?function\s+(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\b"
)
# Next.js Pages Router data fetchers
_RE_NEXTJS_PAGE = re.compile(
    r"export\s+(?:async\s+)?function\s+(getServerSideProps|getStaticProps|getStaticPaths)\b"
)
# Express/Fastify route definitions: router.get('/path', ...) or app.post('/path')
_RE_EXPRESS_ROUTE = re.compile(
    r"\b(?:router|app|server)\s*\.\s*(get|post|put|delete|patch|head)\s*\(\s*['\"]([^'\"]+)['\"]"
)
# ES6 import paths (non-greedy ".*?" so one match per import statement).
_RE_IMPORT = re.compile(r"import\s+.*?from\s+['\"]([^'\"]+)['\"]")

# Python patterns
# MULTILINE "^" anchors mean only column-0 (top-level) defs/classes match;
# _RE_PYTHON_FUNC's leading [a-zA-Z] also excludes _private functions by design.
_RE_PYTHON_CLASS = re.compile(r"^class\s+(\w+)", re.MULTILINE)
_RE_PYTHON_FUNC = re.compile(r"^(?:async\s+)?def\s+([a-zA-Z]\w*)", re.MULTILINE)
# Flask/FastAPI-style decorator routes: @app.get("/path"), @router.post(...), etc.
_RE_PYTHON_ROUTE = re.compile(
    r"@\w*(?:router|app|blueprint|bp|api)\s*\.\s*(get|post|put|delete|patch)\s*\(\s*['\"]([^'\"]+)['\"]",
    re.IGNORECASE,
)


# Named import destructuring: import { FOO, BAR as Baz } from './path'
_RE_NAMED_IMPORT = re.compile(
    r"import\s*\{([^}]+)\}\s*from\s*['\"]([^'\"]+)['\"]"
)
# Namespace import: import * as X from './path'
_RE_NAMESPACE_IMPORT = re.compile(
    r"import\s*\*\s*as\s+\w+\s+from\s*['\"]([^'\"]+)['\"]"
)

# TS type/interface/enum exports
_RE_TYPE_EXPORT = re.compile(
    r"export\s+(?:declare\s+)?(?:type|interface|enum)\s+(\w+)"
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class FileSignature:
    """Regex-extracted "skeleton" of a single source file.

    Collects the exported symbols, detected route definitions, internal
    import paths, per-symbol line ranges, and the raw source text.
    """

    path: str
    exports: list[str] = field(default_factory=list)
    routes: list[str] = field(default_factory=list)
    imports: list[str] = field(default_factory=list)
    symbol_ranges: list[SymbolRange] = field(default_factory=list)
    source: str = field(default="", repr=False)

    def is_empty(self) -> bool:
        """True when nothing at all was extracted from the file."""
        return not (self.exports or self.routes or self.imports)

    def to_prompt_line(self) -> str:
        """Formats the signature as a single line for LLM prompts."""
        segments: list[str] = []
        if self.exports:
            segments.append(f"exports: {', '.join(self.exports[:8])}")
        if self.routes:
            segments.append(f"routes: {', '.join(self.routes[:5])}")
        if not segments:
            return ""
        return f" {self.path} → {' | '.join(segments)}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def extract_signatures(
    files: list[str],
    repo_path: str,
) -> dict[str, FileSignature]:
    """
    Extracts function/route/import signatures from TypeScript, JavaScript,
    and Python files.

    Args:
        files: List of relative file paths (relative to repo_path).
        repo_path: Absolute path to the repository root.

    Returns:
        Dict mapping relative file path → FileSignature.
        Files with unsupported extensions, unreadable files, and files whose
        signature came back empty are skipped and not included in the result.
    """
    result: dict[str, FileSignature] = {}
    root = Path(repo_path)

    for rel_path in files:
        suffix = Path(rel_path).suffix.lower()
        if suffix not in _TS_JS_EXTENSIONS and suffix not in _PYTHON_EXTENSIONS:
            continue
        abs_path = root / rel_path
        try:
            # errors="ignore": tolerate stray non-UTF-8 bytes rather than skip the file.
            source = abs_path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue  # unreadable (deleted, permission, symlink loop) — skip

        if suffix in _PYTHON_EXTENSIONS:
            sig = _parse_python_file(rel_path, source)
        else:
            sig = _parse_file(rel_path, source)
            # Symbol line ranges are only computed for TS/JS: extract_symbol_ranges
            # keys off "export ..." syntax, which has no Python equivalent here.
            sig.symbol_ranges = extract_symbol_ranges(source)
        # NOTE(review): source is retained for both languages here — original
        # nesting was lost in extraction; confirm this line sits outside the else.
        sig.source = source

        if not sig.is_empty():
            result[rel_path] = sig

    return result
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _parse_file(rel_path: str, source: str) -> FileSignature:
    """Parses a TS/JS file's exports, routes, and internal imports via regex.

    Args:
        rel_path: File path relative to the repo root (used for route inference).
        source: Full file contents.

    Returns:
        A FileSignature with exports (deduplicated, first-seen order),
        detected route definitions, and internal import paths.
    """
    sig = FileSignature(path=rel_path)

    seen_exports: set[str] = set()

    def _add_export(name: str) -> None:
        # Deduplicate while preserving first-seen order.
        if name not in seen_exports:
            seen_exports.add(name)
            sig.exports.append(name)

    # Named exports: export function Foo / export const Foo / export class Foo
    for match in _RE_NAMED_EXPORT.finditer(source):
        _add_export(match.group(1))

    # Default exports with a name: export default function Foo
    for match in _RE_DEFAULT_EXPORT.finditer(source):
        _add_export(match.group(1))

    # Re-export blocks: export { Foo, Bar as Baz } → take the exported name.
    for match in _RE_REEXPORT.finditer(source):
        for token in match.group(1).split(","):
            # Fix: split on \s+as\s+ instead of the literal " as " so tokens
            # like "Foo  as  Bar" or "Foo as\n  Bar" (multiline export blocks)
            # still resolve to the exported alias.
            name = re.split(r"\s+as\s+", token.strip())[-1].strip()
            if name:
                _add_export(name)

    # Next.js App Router handlers: the route path is implied by the file path.
    for match in _RE_NEXTJS_ROUTE.finditer(source):
        method = match.group(1)
        route_path = _infer_nextjs_route_path(rel_path)
        sig.routes.append(f"{method} {route_path}")

    # Next.js Pages Router data fetchers (recorded by name, no path).
    for match in _RE_NEXTJS_PAGE.finditer(source):
        sig.routes.append(match.group(1))

    # Express/Fastify routes carry the path literal in the call.
    for match in _RE_EXPRESS_ROUTE.finditer(source):
        method = match.group(1).upper()
        path = match.group(2)
        sig.routes.append(f"{method} {path}")

    # Imports: keep only internal/relative paths ("./", "../", "@/", "~/"),
    # skipping node_modules packages.
    for match in _RE_IMPORT.finditer(source):
        src = match.group(1)
        if src.startswith((".", "@/", "~/")):
            sig.imports.append(src)

    return sig
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _parse_python_file(rel_path: str, source: str) -> FileSignature:
    """Parses a Python file's top-level classes, functions, and decorator routes.

    Args:
        rel_path: File path relative to the repo root.
        source: Full file contents.

    Returns:
        A FileSignature whose exports hold top-level class names followed by
        top-level function names (deduplicated, first occurrence wins), and
        whose routes hold "METHOD /path" entries from decorator-style routing.
    """
    signature = FileSignature(path=rel_path)
    recorded: set[str] = set()

    # Classes first, then functions — mirrors the regex scan order.
    for pattern in (_RE_PYTHON_CLASS, _RE_PYTHON_FUNC):
        for m in pattern.finditer(source):
            symbol = m.group(1)
            if symbol in recorded:
                continue
            recorded.add(symbol)
            signature.exports.append(symbol)

    # Flask/FastAPI-style decorator routes: @app.get("/path") etc.
    for m in _RE_PYTHON_ROUTE.finditer(source):
        signature.routes.append(f"{m.group(1).upper()} {m.group(2)}")

    return signature
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _infer_nextjs_route_path(rel_path: str) -> str:
    """
    Infers the Next.js API route path from the file's relative path.

    Examples:
        app/api/auth/login/route.ts → /api/auth/login
        pages/api/auth.ts → /api/auth
        src/app/api/users/route.ts → /api/users
    """
    p = Path(rel_path)
    parts = p.parts

    # Drop leading src/, app/, pages/ wrappers.
    # Fix: "pages" was missing from the skip set, so the documented
    # "pages/api/auth.ts → /api/auth" case actually produced "/pages/api/auth".
    skip = {"src", "app", "pages"}
    start = 0
    for i, part in enumerate(parts):
        if part not in skip:
            start = i
            break

    trimmed = parts[start:]

    # App Router: drop the trailing "route.ts"/"route.js" filename entirely;
    # Pages Router: keep the filename but strip its extension.
    if trimmed and Path(trimmed[-1]).stem == "route":
        trimmed = trimmed[:-1]
    else:
        trimmed = trimmed[:-1] + (Path(trimmed[-1]).stem,) if trimmed else trimmed

    return "/" + "/".join(trimmed) if trimmed else "/"
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def extract_symbol_ranges(source: str) -> list[SymbolRange]:
    """Extracts line ranges for each exported symbol in TS/JS source.

    MVP heuristic: each export's end_line = next export's start_line - 1,
    or EOF for the last export. This avoids complex brace-balancing but
    gives reasonable line attribution for most files.

    Returns:
        SymbolRanges sorted by start_line, deduplicated by symbol name
        (first occurrence wins).
    """
    total_lines = source.count("\n") + 1
    # Collect all export positions with their symbol names and kinds
    exports: list[tuple[int, str, str]] = []  # (start_line, name, kind)

    def _line_of(pos: int) -> int:
        # 1-based line number of a character offset (no slice copy).
        return source.count("\n", 0, pos) + 1

    def _has_word(word: str, text: str) -> bool:
        # Fix: the previous plain substring checks ("function" in text,
        # "class" in text) misclassified declarations whose *name* contains
        # the keyword, e.g. "export const functions" → kind "function" or
        # "export const classNames" → kind "class". Word boundaries only
        # match the actual keyword token.
        return re.search(rf"\b{word}\b", text) is not None

    for match in _RE_NAMED_EXPORT.finditer(source):
        text = match.group(0)
        if _has_word("function", text):
            kind = "function"
        elif _has_word("class", text):
            kind = "class"
        else:
            kind = "const"
        exports.append((_line_of(match.start()), match.group(1), kind))

    for match in _RE_DEFAULT_EXPORT.finditer(source):
        text = match.group(0)
        kind = "class" if _has_word("class", text) else "function"
        exports.append((_line_of(match.start()), match.group(1), kind))

    for match in _RE_TYPE_EXPORT.finditer(source):
        # Both "interface" and "type" declarations map to kind "type".
        kind = "enum" if _has_word("enum", match.group(0)) else "type"
        exports.append((_line_of(match.start()), match.group(1), kind))

    for match in _RE_REEXPORT.finditer(source):
        line = _line_of(match.start())
        for token in match.group(1).split(","):
            # "Foo as Bar" → exported name "Bar"; tolerate any whitespace
            # (including newlines) around "as".
            name = re.split(r"\s+as\s+", token.strip())[-1].strip()
            if name:
                exports.append((line, name, "reexport"))

    if not exports:
        return []

    # Sort by start_line, deduplicate by name (keep first occurrence)
    exports.sort(key=lambda e: e[0])
    seen: set[str] = set()
    unique: list[tuple[int, str, str]] = []
    for start, name, kind in exports:
        if name not in seen:
            seen.add(name)
            unique.append((start, name, kind))

    # Assign end_line: next export's start_line - 1, or EOF for last
    ranges: list[SymbolRange] = []
    for i, (start, name, kind) in enumerate(unique):
        end = unique[i + 1][0] - 1 if i + 1 < len(unique) else total_lines
        ranges.append(SymbolRange(
            name=name, start_line=start, end_line=max(start, end), kind=kind,
        ))

    return ranges
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def extract_named_imports(source: str) -> dict[str, set[str]]:
    """Extracts named imports from TS/JS source.

    Returns:
        Dict mapping module path → set of imported symbol names.
        For namespace imports (import * as X), returns {"*"} as the symbol set.
        Only repo-internal modules are included (relative paths and the
        "@/"/"~/" alias roots); node_modules packages are skipped.
    """

    def _is_internal(module: str) -> bool:
        # Shared predicate for both import forms (was duplicated inline).
        return module.startswith((".", "@/", "~/"))

    result: dict[str, set[str]] = {}

    for match in _RE_NAMED_IMPORT.finditer(source):
        module = match.group(2)
        if not _is_internal(module):
            continue
        names: set[str] = set()
        for token in match.group(1).split(","):
            # "Foo as Bar" imports the original symbol "Foo". Fix: split on
            # \s+as\s+ rather than the literal " as " so multiline import
            # blocks ("Foo as\n  Bar") and extra spaces are parsed correctly.
            original = re.split(r"\s+as\s+", token.strip())[0].strip()
            if original:
                names.add(original)
        if names:
            result.setdefault(module, set()).update(names)

    for match in _RE_NAMESPACE_IMPORT.finditer(source):
        module = match.group(1)
        if _is_internal(module):
            result.setdefault(module, set()).add("*")

    return result
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""Deterministic feature detection via co-change community detection.
|
|
2
|
+
|
|
3
|
+
Files that frequently change together in git history are grouped into the same
|
|
4
|
+
feature using Union-Find. This is the primary detection algorithm.
|
|
5
|
+
|
|
6
|
+
Same git history → same groups every time (100% deterministic).
|
|
7
|
+
|
|
8
|
+
When --llm is enabled, the LLM *names* the groups (but does not determine them).
|
|
9
|
+
Results are cached, so repeated runs return identical names.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from itertools import combinations
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from faultline.models.types import Commit
|
|
17
|
+
|
|
18
|
+
# Minimum commits in history to trust co-change signal.
# Below this threshold the caller falls back to directory heuristics.
_MIN_COMMITS_FOR_COCHANGE = 50

# Jaccard coupling threshold for merging two files into the same feature.
# Jaccard = commits_touching_both / commits_touching_either.
# 0.20 means "these files change together in ≥20% of commits that touch either one".
_COCHANGE_THRESHOLD = 0.20

# Commits that touch more files than this are excluded (bulk ops / large refactors).
_MAX_FILES_PER_COMMIT = 30

# A file must appear in at least this many commits to participate in coupling.
# Files edited only once produce noisy pairs.
_MIN_FILE_COMMITS = 2

# Directory names that are generic structural wrappers, not business feature names.
# Entries are lowercase; comparisons lowercase the path component first
# (see _feature_name_from_path).
_SKIP_DIRS = {
    "src", "app", "lib", "pkg", "internal", "core",
    "views", "pages", "screens", "routes", "containers",
    "components", "layouts", "features",
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _UnionFind:
    """Disjoint-set forest over string nodes.

    Uses path compression in find() and union-by-rank in union(), giving
    effectively constant amortized time per operation.
    """

    def __init__(self, nodes: list[str]) -> None:
        self._parent: dict[str, str] = {node: node for node in nodes}
        self._rank: dict[str, int] = defaultdict(int)

    def find(self, x: str) -> str:
        """Returns the canonical root of x's set, compressing the path."""
        root = self._parent[x]
        if root != x:
            root = self.find(root)
            self._parent[x] = root  # path compression
        return root

    def union(self, x: str, y: str) -> None:
        """Merges the sets containing x and y (no-op if already together)."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        # Attach the shallower tree beneath the deeper one.
        if self._rank[root_x] < self._rank[root_y]:
            root_x, root_y = root_y, root_x
        self._parent[root_y] = root_x
        if self._rank[root_x] == self._rank[root_y]:
            self._rank[root_x] += 1

    def groups(self) -> dict[str, list[str]]:
        """Returns a mapping of set root → all members of that set."""
        by_root: dict[str, list[str]] = defaultdict(list)
        for node in self._parent:
            by_root[self.find(node)].append(node)
        return dict(by_root)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def detect_features_from_cochange(
    files: list[str],
    commits: list[Commit],
) -> dict[str, list[str]] | None:
    """Groups files into features based on co-change patterns.

    Returns None when there are fewer than _MIN_COMMITS_FOR_COCHANGE commits —
    the caller should fall back to directory-based heuristics in that case.
    Also returns None if the resulting mapping is empty (no co-change signal).

    The returned dict maps feature_name → list of file paths.
    Names are directory-derived; pass the result to name_clusters_llm() or
    name_clusters_ollama() to replace them with semantic business domain names.

    Args:
        files: Tracked file paths (relative, with path prefix already stripped).
        commits: Commit history for the analysis window.
    """
    if len(commits) < _MIN_COMMITS_FOR_COCHANGE:
        return None

    tracked = set(files)

    # file → SHAs of the non-bulk commits that touched it
    shas_by_file: dict[str, set[str]] = defaultdict(set)
    for commit in commits:
        if len(commit.files_changed) > _MAX_FILES_PER_COMMIT:
            continue  # bulk op / large refactor — not a feature signal
        for path in commit.files_changed:
            if path in tracked:
                shas_by_file[path].add(commit.sha)

    # Inverted index (SHA → files), restricted to files with enough history,
    # so pair counting is O(k²) per commit rather than O(n²) overall.
    files_by_sha: dict[str, list[str]] = defaultdict(list)
    for path, shas in shas_by_file.items():
        if len(shas) >= _MIN_FILE_COMMITS:
            for sha in shas:
                files_by_sha[sha].append(path)

    # pair (sorted) → number of commits touching both files
    together: dict[tuple[str, str], int] = defaultdict(int)
    for touched in files_by_sha.values():
        if len(touched) >= 2:
            for pair in combinations(sorted(touched), 2):
                together[pair] += 1

    # Merge any pair whose Jaccard coupling meets the threshold.
    uf = _UnionFind(files)
    for (left, right), both in together.items():
        either = (
            len(shas_by_file.get(left, set()))
            + len(shas_by_file.get(right, set()))
            - both
        )
        if either > 0 and both / either >= _COCHANGE_THRESHOLD:
            uf.union(left, right)

    clusters = _finalize_clusters(uf.groups())
    return clusters or None
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _finalize_clusters(
    raw_groups: dict[str, list[str]],
) -> dict[str, list[str]]:
    """Converts raw Union-Find groups to named feature clusters.

    Multi-file clusters receive a directory-derived name.
    Singleton clusters are merged into a same-directory cluster if one exists,
    or grouped together under a shared directory name.

    Every member list in the returned dict is sorted.
    """
    multi: dict[str, list[str]] = {}
    singletons: list[str] = []

    for members in raw_groups.values():
        members_sorted = sorted(members)
        if len(members_sorted) >= 2:
            name = _cluster_name(members_sorted)
            name = _unique_name(name, multi)
            multi[name] = members_sorted
        else:
            singletons.extend(members_sorted)

    # Build dir → cluster index so singletons can be absorbed
    dir_to_cluster: dict[str, str] = {}
    for cluster_name, members in multi.items():
        for f in members:
            dir_to_cluster[str(Path(f).parent)] = cluster_name

    # Assign singletons: merge into same-dir cluster or bucket by dir name
    dir_orphans: dict[str, list[str]] = defaultdict(list)
    for f in singletons:
        d = str(Path(f).parent)
        if d in dir_to_cluster:
            multi[dir_to_cluster[d]].append(f)
        else:
            dir_orphans[_feature_name_from_path(f)].append(f)

    for name, fs in dir_orphans.items():
        name = _unique_name(name, multi)
        multi[name] = fs

    # Fix: absorbed singletons were appended *after* the initial sort, leaving
    # those clusters partially unsorted while every other list was sorted.
    # Re-sort all member lists so the output upholds one consistent invariant.
    return {name: sorted(members) for name, members in multi.items()}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _cluster_name(files: list[str]) -> str:
    """Derives a cluster name from the most common meaningful directory component."""
    tally: dict[str, int] = {}
    for path in files:
        candidate = _feature_name_from_path(path)
        tally[candidate] = tally.get(candidate, 0) + 1
    # Ties resolve to the first-inserted name, matching dict iteration order.
    return max(tally, key=tally.get)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _feature_name_from_path(path: str) -> str:
    """Extracts the first non-generic directory component as a feature name."""
    directories = Path(path).parts[:-1]  # drop the filename itself
    return next(
        (d.lower() for d in directories if d.lower() not in _SKIP_DIRS),
        "root",  # every component was a generic wrapper (or path had no dirs)
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _unique_name(name: str, existing: dict) -> str:
    """Returns a unique name by appending a numeric suffix if needed."""
    if name not in existing:
        return name
    # First collision gets "-2"; keep counting until the name is free.
    counter = 2
    candidate = f"{name}-{counter}"
    while candidate in existing:
        counter += 1
        candidate = f"{name}-{counter}"
    return candidate
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Reads test coverage data from standard coverage file formats."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_coverage(repo_path: str, coverage_path: str | None = None) -> dict[str, float]:
    """
    Returns file_path → line coverage % (0–100).

    If coverage_path is provided, reads that file directly (lcov or jest format).
    Otherwise tries coverage/coverage-summary.json (Jest/NYC) then coverage/lcov.info.
    Returns empty dict if no coverage data found.
    """

    def _dispatch(p: Path) -> dict[str, float]:
        # ".json" → Jest/NYC summary format; everything else → lcov tracefile.
        return _read_jest(p) if p.name.endswith(".json") else _read_lcov(p)

    if coverage_path:
        explicit = Path(coverage_path)
        return _dispatch(explicit) if explicit.exists() else {}

    # Auto-detect: check common locations in priority order.
    root = Path(repo_path)
    candidates = (
        root / "coverage" / "coverage-summary.json",
        root / "coverage" / "lcov.info",
        root / "lcov.info",
        root / "coverage.lcov",
    )
    for candidate in candidates:
        if candidate.exists():
            return _dispatch(candidate)
    return {}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _read_jest(path: Path) -> dict[str, float]:
    """Parses a Jest/NYC coverage-summary.json into file → line coverage %.

    Entries without a lines.pct value are omitted; the aggregate "total"
    entry is skipped.
    """
    # Fix: coverage JSON is UTF-8 — read it explicitly instead of relying on
    # the platform default encoding (e.g. cp1252 on Windows).
    data = json.loads(path.read_text(encoding="utf-8"))
    result: dict[str, float] = {}
    for file_path, stats in data.items():
        if file_path == "total":
            continue  # aggregate row, not a file
        # "or {}" guards files whose "lines" entry is missing or null.
        pct = (stats.get("lines") or {}).get("pct")
        if pct is not None:
            result[str(file_path)] = float(pct)
    return result
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _read_lcov(path: Path) -> dict[str, float]:
    """Parses an lcov tracefile into file → line coverage %.

    Only SF (source file), LF (lines found), and LH (lines hit) records are
    consulted; a file's percentage is recorded when its end_of_record marker
    is reached. Files with LF of 0 report 0.0.
    """
    result: dict[str, float] = {}
    current: str | None = None
    lf = lh = 0
    # Fix: read with an explicit encoding instead of the platform default;
    # errors="ignore" matches the tolerant file-reading style used elsewhere
    # in this package for repo files.
    for line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        if line.startswith("SF:"):
            current = line[3:]
            lf = lh = 0  # reset per-file counters
        elif line.startswith("LF:"):
            lf = int(line[3:])
        elif line.startswith("LH:"):
            lh = int(line[3:])
        elif line == "end_of_record" and current:
            result[current] = round(lh / lf * 100, 1) if lf > 0 else 0.0
            current = None
    return result
|