chisel-test-impact 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chisel/__init__.py +1 -0
- chisel/ast_utils.py +578 -0
- chisel/cli.py +408 -0
- chisel/engine.py +428 -0
- chisel/git_analyzer.py +326 -0
- chisel/impact.py +397 -0
- chisel/mcp_server.py +226 -0
- chisel/mcp_stdio.py +142 -0
- chisel/metrics.py +203 -0
- chisel/project.py +196 -0
- chisel/rwlock.py +56 -0
- chisel/schemas.py +284 -0
- chisel/storage.py +539 -0
- chisel/test_mapper.py +657 -0
- chisel_test_impact-0.5.0.dist-info/METADATA +178 -0
- chisel_test_impact-0.5.0.dist-info/RECORD +20 -0
- chisel_test_impact-0.5.0.dist-info/WHEEL +5 -0
- chisel_test_impact-0.5.0.dist-info/entry_points.txt +3 -0
- chisel_test_impact-0.5.0.dist-info/licenses/LICENSE +21 -0
- chisel_test_impact-0.5.0.dist-info/top_level.txt +1 -0
chisel/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0"
|
chisel/ast_utils.py
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
"""Multi-language AST extraction for Chisel.
|
|
2
|
+
|
|
3
|
+
Extracts code units (functions, classes, structs, etc.) from source files
|
|
4
|
+
across Python, JavaScript/TypeScript, Go, Rust, C#, Java, C/C++, Kotlin,
|
|
5
|
+
Swift, PHP, Ruby, and Dart. Fully self-contained with zero external
|
|
6
|
+
dependencies beyond the Python standard library.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
import hashlib
|
|
13
|
+
import re
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from functools import partial
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
# Directory names that are always skipped when walking the project tree.
_SKIP_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".tox",
    ".venv",
    "venv",
    "env",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    "dist",
    "build",
    ".eggs",
    "target",
    "vendor",
    "Pods",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class CodeUnit:
    """A named span of source code (function, class, struct, ...) found in a file."""

    # Path of the file the unit was extracted from, exactly as passed in.
    file_path: str
    # Identifier of the unit; Python methods are qualified as "Class.method".
    name: str
    # Kind label: "function", "async_function", "class", "struct", "enum", "impl", etc.
    unit_type: str
    # 1-based line on which the unit's declaration was matched.
    line_start: int
    # 1-based line on which the unit is considered to end.
    line_end: int
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Language detection
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
# Each supported language together with every file suffix that selects it.
_LANGUAGE_EXTENSIONS = (
    ("python", (".py", ".pyw")),
    ("javascript", (".js", ".jsx", ".mjs", ".cjs")),
    ("typescript", (".ts", ".tsx")),
    ("go", (".go",)),
    ("rust", (".rs",)),
    ("csharp", (".cs",)),
    ("java", (".java",)),
    ("c", (".c", ".h")),
    ("cpp", (".cc", ".cpp", ".cxx", ".hpp", ".hxx")),
    ("kotlin", (".kt", ".kts")),
    ("swift", (".swift",)),
    ("php", (".php",)),
    ("ruby", (".rb",)),
    ("dart", (".dart",)),
)

# Flat suffix -> language lookup derived from the table above.
_EXTENSION_MAP = {
    suffix: language
    for language, suffixes in _LANGUAGE_EXTENSIONS
    for suffix in suffixes
}


def detect_language(file_path: str) -> str | None:
    """Map *file_path* to a language name using its lower-cased suffix.

    Returns ``None`` when the suffix is not present in ``_EXTENSION_MAP``.
    """
    suffix = Path(file_path).suffix
    return _EXTENSION_MAP.get(suffix.lower())
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# File hashing
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def compute_file_hash(file_path: str) -> str:
    """Hash the file at *file_path* with SHA-256 and return the hex digest.

    The file is opened in binary mode and consumed in 8192-byte reads, so
    its contents are never held in memory all at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
# Brace-matching helper (shared by all brace-delimited languages)
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _find_block_end(lines: list[str], start_idx: int) -> int:
    """Find the line number (1-based) of the closing brace for a block.

    Scans forward from *start_idx* (0-based index into *lines*) for the
    ``{`` that opens the body, then tracks brace depth and returns the
    1-based line on which the depth returns to zero.

    While looking for the opening brace:

    * braces nested inside parentheses are ignored, so a destructured
      parameter such as ``({ a, b }) => {`` or an inline object type in a
      signature is not mistaken for the body;
    * a ``;`` outside parentheses ends the declaration (prototype, unit
      struct, expression-bodied arrow function, ...) and that line is
      returned instead of borrowing the braces of a later definition;
    * a stray ``}`` is ignored.

    Returns ``start_idx + 1`` when no opening brace is found at all, and
    ``len(lines)`` when the body is opened but never closed.

    String literals and comments are stripped per line before counting, so
    ``"{"`` or ``// }`` do not cause false matches.  Block comments that
    span several lines are not tracked.
    """
    depth = 0
    paren_depth = 0
    found_open = False

    for i in range(start_idx, len(lines)):
        for ch in _strip_strings_and_comments(lines[i]):
            if not found_open:
                if ch == "(":
                    paren_depth += 1
                elif ch == ")":
                    # Clamp so an unmatched ")" cannot hide the body brace.
                    paren_depth = max(paren_depth - 1, 0)
                elif paren_depth:
                    continue  # still inside the parameter list
                elif ch == ";":
                    return i + 1  # declaration without a body
                elif ch == "{":
                    found_open = True
                    depth = 1
                continue

            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return i + 1  # 1-based

    if found_open:
        return len(lines)
    return start_idx + 1


def _strip_strings_and_comments(line: str) -> str:
    """Return *line* with string literals and comments removed.

    Recognises ``"``, ``'`` and backtick-quoted literals (honouring
    backslash escapes), ``//`` comments and ``/* */`` comments.  An
    unterminated literal or block comment swallows the rest of the line;
    no state is carried from one line to the next.
    """
    kept: list[str] = []
    pos = 0
    size = len(line)
    while pos < size:
        ch = line[pos]
        nxt = line[pos + 1] if pos + 1 < size else ""

        if ch == "/" and nxt == "/":
            break  # line comment: nothing after it counts

        if ch == "/" and nxt == "*":
            close = line.find("*/", pos + 2)
            if close == -1:
                break  # comment does not close on this line
            pos = close + 2
            continue

        if ch in ('"', "'", "`"):
            pos += 1
            while pos < size and line[pos] != ch:
                # A backslash escapes the following character.
                pos += 2 if line[pos] == "\\" and pos + 1 < size else 1
            pos += 1  # step over the closing quote
            continue

        kept.append(ch)
        pos += 1
    return "".join(kept)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _extract_brace_lang(
    file_path: str, content: str, patterns: list,
) -> list[CodeUnit]:
    """Collect code units from source written in a brace-delimited language.

    Every line of *content* is tried against *patterns* in order; the first
    pattern that matches produces one ``CodeUnit`` whose end line comes
    from ``_find_block_end``.

    Args:
        file_path: stored on each resulting unit.
        content: full text of the file.
        patterns: list of (compiled_regex, unit_type) tuples.
            unit_type is a string, OR a callable(match) -> (name, type).
            With a string, the unit name is the regex's ``name`` group.
    """
    source_lines = content.splitlines()
    found: list[CodeUnit] = []

    for index, text in enumerate(source_lines):
        for pattern, kind in patterns:
            match = pattern.match(text)
            if match is None:
                continue
            last_line = _find_block_end(source_lines, index)
            if callable(kind):
                unit_name, unit_kind = kind(match)
            else:
                unit_name, unit_kind = match.group("name"), kind
            found.append(
                CodeUnit(file_path, unit_name, unit_kind, index + 1, last_line)
            )
            break  # a line yields at most one unit

    return found
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
# Python extraction
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
|
|
192
|
+
# ``def`` header, optionally ``async``; captures leading whitespace and the name.
_PY_FUNC_RE = re.compile(r"^(?P<indent>\s*)(?:async\s+)?def\s+(?P<name>[A-Za-z_]\w*)\s*\(")
# ``class`` header whose name is followed by ``(`` or ``:``.
_PY_CLASS_RE = re.compile(r"^(?P<indent>\s*)class\s+(?P<name>[A-Za-z_]\w*)\s*[\(:]")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _extract_python_ast(file_path: str, content: str) -> list[CodeUnit]:
    """Extract code units from Python source using the ``ast`` module.

    Emits one unit per function, async function and class found anywhere
    in the tree.  Functions that are direct children of a class body are
    named ``Class.method``; everything else keeps its bare name.  Units
    are produced in ``ast.walk`` order, not sorted by line.

    If the source cannot be parsed, extraction falls back to
    ``_extract_python_regex``.
    """
    try:
        tree = ast.parse(content, filename=file_path)
    except (SyntaxError, ValueError):
        # ValueError covers source containing NUL bytes, which Python
        # versions before 3.12 report that way instead of as SyntaxError.
        return _extract_python_regex(file_path, content)

    # id(node) -> owning class name, for the direct children of every class.
    owner_of = {
        id(child): cls_node.name
        for cls_node in ast.walk(tree)
        if isinstance(cls_node, ast.ClassDef)
        for child in ast.iter_child_nodes(cls_node)
    }

    units: list[CodeUnit] = []
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            owner = owner_of.get(id(node))
            name = f"{owner}.{node.name}" if owner else node.name
            unit_type = (
                "async_function"
                if isinstance(node, ast.AsyncFunctionDef)
                else "function"
            )
        elif isinstance(node, ast.ClassDef):
            name = node.name
            unit_type = "class"
        else:
            continue

        # Fall back to the start line when end_lineno is absent or None.
        end = getattr(node, "end_lineno", None) or node.lineno
        units.append(CodeUnit(file_path, name, unit_type, node.lineno, end))

    return units
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _extract_python_regex(file_path: str, content: str) -> list[CodeUnit]:
    """Line-oriented extraction for Python source that ``ast.parse`` rejects.

    Each line is tested against ``_PY_CLASS_RE`` first and ``_PY_FUNC_RE``
    second.  A function indented deeper than the most recent class header
    is named ``Class.function``; a function at or left of that indentation
    clears the remembered class.  End lines come from ``_py_block_end``.
    """
    source_lines = content.splitlines()
    found: list[CodeUnit] = []
    enclosing_class: str | None = None
    enclosing_indent = -1

    for index, text in enumerate(source_lines):
        class_match = _PY_CLASS_RE.match(text)
        if class_match:
            enclosing_class = class_match.group("name")
            enclosing_indent = len(class_match.group("indent"))
            last_line = _py_block_end(source_lines, index, enclosing_indent)
            found.append(
                CodeUnit(file_path, enclosing_class, "class", index + 1, last_line)
            )
            continue

        func_match = _PY_FUNC_RE.match(text)
        if not func_match:
            continue

        depth = len(func_match.group("indent"))
        unit_name = func_match.group("name")
        if enclosing_class and depth > enclosing_indent:
            unit_name = f"{enclosing_class}.{unit_name}"
        else:
            # Back at (or above) class level: no longer inside that class.
            enclosing_class, enclosing_indent = None, -1

        if text.lstrip().startswith("async "):
            kind = "async_function"
        else:
            kind = "function"
        last_line = _py_block_end(source_lines, index, depth)
        found.append(CodeUnit(file_path, unit_name, kind, index + 1, last_line))

    return found
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _py_block_end(lines: list[str], start_idx: int, indent: int) -> int:
    """Estimate the last line (1-based) of the Python block at *start_idx*.

    Walks the lines after the header and stops at the first non-blank,
    non-comment line indented no deeper than *indent*.  The result is the
    last line seen before that point that carries real content, so blank
    lines and comments trailing the body (often belonging to whatever
    follows) are not counted as part of the block.  If the block has no
    such body line, the header's own line number is returned.

    Args:
        lines: the file split into lines.
        start_idx: 0-based index of the ``def``/``class`` header line.
        indent: width of the header's leading whitespace.
    """
    last_body_line = start_idx + 1  # header line, 1-based
    for i in range(start_idx + 1, len(lines)):
        text = lines[i]
        stripped = text.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if len(text) - len(text.lstrip()) <= indent:
            break  # dedent: the block is over
        last_body_line = i + 1
    return last_body_line
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
# JavaScript / TypeScript
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
|
|
289
|
+
# JavaScript identifiers may contain "$" at any position, not only first.
_JS_IDENT = r"[A-Za-z_$][\w$]*"
# Optional "export" / "export default" prefix.
_JS_EXPORT = r"^\s*(?:export\s+(?:default\s+)?)?"

# Named function declaration: optional export/async, generator "*" allowed,
# name followed by "(" or by "<" (TypeScript type parameters).
_JS_NAMED_FUNC_RE = re.compile(
    _JS_EXPORT
    + r"(?:async\s+)?function(?:\s*\*\s*|\s+)(?P<name>" + _JS_IDENT + r")\s*[<(]",
)
# Class declaration, optionally "abstract" (TypeScript).  The lookahead
# keeps an anonymous "class extends Base" from being named "extends".
_JS_CLASS_RE = re.compile(
    _JS_EXPORT
    + r"(?:abstract\s+)?class\s+(?!extends\b)(?P<name>" + _JS_IDENT + r")",
)
# const/let/var binding whose initializer is an arrow function.
_JS_ARROW_RE = re.compile(
    r"^\s*(?:export\s+)?(?:const|let|var)\s+(?P<name>" + _JS_IDENT + r")"
    r"\s*=\s*(?:async\s+)?(?:\([^)]*\)|" + _JS_IDENT + r")\s*=>",
)
|
|
299
|
+
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
# Go
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
|
|
304
|
+
# ``func`` declaration with an optional ``(recv Type)`` / ``(recv *Type)`` receiver.
_GO_FUNC_RE = re.compile(
    r"^\s*func\s+"
    r"(?:\(\s*\w+\s+\*?\w+\s*\)\s+)?"
    r"(?P<name>[A-Za-z_]\w*)\s*\("
)
# ``type Name struct`` / ``type Name interface``; the keyword lands in ``kind``.
_GO_TYPE_RE = re.compile(
    r"^\s*type\s+(?P<name>[A-Za-z_]\w*)"
    r"\s+(?P<kind>struct|interface)\b"
)
|
|
310
|
+
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
# Rust
|
|
313
|
+
# ---------------------------------------------------------------------------
|
|
314
|
+
|
|
315
|
+
# Optional visibility: ``pub`` or ``pub(<word>)`` followed by whitespace.
_RS_VIS = r"^\s*(?:pub(?:\s*\(\s*\w+\s*\))?\s+)?"

# ``fn`` item, optionally ``async`` and/or ``unsafe``; name followed by ``<`` or ``(``.
_RS_FN_RE = re.compile(
    _RS_VIS + r"(?:async\s+)?(?:unsafe\s+)?fn\s+(?P<name>[A-Za-z_]\w*)\s*[<(]"
)
_RS_STRUCT_RE = re.compile(_RS_VIS + r"struct\s+(?P<name>[A-Za-z_]\w*)")
_RS_ENUM_RE = re.compile(_RS_VIS + r"enum\s+(?P<name>[A-Za-z_]\w*)")
# ``impl`` block: optional generics, optional ``Trait for``; ``name`` is the
# implementing type including any ``<...>`` arguments.
_RS_IMPL_RE = re.compile(
    r"^\s*impl(?:\s*<[^>]*>)?\s+"
    r"(?:[A-Za-z_]\w*(?:\s*<[^>]*>)?\s+for\s+)?"
    r"(?P<name>[A-Za-z_]\w*(?:\s*<[^>]*>)?)"
)
|
|
329
|
+
|
|
330
|
+
# ---------------------------------------------------------------------------
|
|
331
|
+
# C#
|
|
332
|
+
# ---------------------------------------------------------------------------
|
|
333
|
+
|
|
334
|
+
# Any number of leading ``[Attribute]`` groups, then an optional access modifier.
_CS_HEAD = (
    r"^(?:\s*\[[^\]]*\]\s*)*"
    r"\s*(?:(?:public|private|protected|internal)\s+)?"
)

# Type declaration; the declaring keyword is captured as ``kind``.
_CS_CLASS_RE = re.compile(
    _CS_HEAD
    + r"(?:(?:static|abstract|sealed|partial)\s+)*"
    + r"(?P<kind>class|struct|interface|enum|record)\s+(?P<name>[A-Za-z_]\w*)"
)
# Method: modifiers, a return type (dotted name, optional generics, ``[]``,
# ``?``), then the method name followed by ``<`` or ``(``.
_CS_METHOD_RE = re.compile(
    _CS_HEAD
    + r"(?:(?:static|virtual|override|abstract|async|new|partial|extern|sealed|unsafe)\s+)*"
    + r"(?:[A-Za-z_]\w*(?:\.[A-Za-z_]\w*)*(?:<(?:[^<>]|<[^>]*>)*>)?(?:\[\])*\??\s+)"
    + r"(?P<name>[A-Za-z_]\w*)\s*[<(]"
)
|
|
347
|
+
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
# Java
|
|
350
|
+
# ---------------------------------------------------------------------------
|
|
351
|
+
|
|
352
|
+
# Any number of leading ``@Annotation`` / ``@Annotation(...)`` groups, then an
# optional access modifier.
_JAVA_HEAD = (
    r"^(?:\s*@\w+(?:\s*\([^)]*\))?\s*)*"
    r"\s*(?:(?:public|private|protected)\s+)?"
)

# Type declaration; the declaring keyword is captured as ``kind``.
_JAVA_CLASS_RE = re.compile(
    _JAVA_HEAD
    + r"(?:(?:static|final|abstract|sealed)\s+)*"
    + r"(?P<kind>class|interface|enum|record)\s+(?P<name>[A-Za-z_]\w*)"
)
# Method: modifiers, a return type (optional generics and ``[]``), then the
# method name followed by ``(``.
_JAVA_METHOD_RE = re.compile(
    _JAVA_HEAD
    + r"(?:(?:static|final|abstract|synchronized|native|default)\s+)*"
    + r"(?:[A-Za-z_]\w*(?:<(?:[^<>]|<[^>]*>)*>)?(?:\[\])*\s+)"
    + r"(?P<name>[A-Za-z_]\w*)\s*\("
)
|
|
365
|
+
|
|
366
|
+
# ---------------------------------------------------------------------------
|
|
367
|
+
# C / C++
|
|
368
|
+
# ---------------------------------------------------------------------------
|
|
369
|
+
|
|
370
|
+
# class / struct / namespace, optionally preceded by ``template <...>``.
_CPP_CLASS_RE = re.compile(
    r"^\s*(?:template\s*<[^>]*>\s*)?"
    r"(?P<kind>class|struct|namespace)\s+(?P<name>[A-Za-z_]\w*)"
)
# ``enum Name`` or ``enum class Name``.
_CPP_ENUM_RE = re.compile(r"^\s*enum\s+(?:class\s+)?(?P<name>[A-Za-z_]\w*)")
# Function: optional template header and specifiers, a return type (may be
# ``::``-qualified, templated, and end in ``*`` or ``&``), then the name.
# The name may start with ``~`` and may carry one ``Scope::`` qualifier.
_CPP_FUNC_RE = re.compile(
    r"^\s*(?:template\s*<[^>]*>\s+)?"
    r"(?:(?:static|inline|virtual|explicit|constexpr|extern|friend)\s+)*"
    r"(?:[A-Za-z_]\w*(?:::\w+)*(?:\s*<(?:[^<>]|<[^>]*>)*>)?\s*[*&]?\s+)"
    r"(?P<name>~?[A-Za-z_]\w*(?:::[A-Za-z_]\w*)?)\s*\("
)
|
|
383
|
+
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
# Kotlin
|
|
386
|
+
# ---------------------------------------------------------------------------
|
|
387
|
+
|
|
388
|
+
# class / object / interface preceded by any run of the listed modifiers.
_KT_CLASS_RE = re.compile(
    r"^\s*(?:(?:private|public|internal|protected|open|abstract|sealed|data|enum|inner|value|inline)\s+)*"
    r"(?P<kind>class|object|interface)\s+(?P<name>[A-Za-z_]\w*)"
)
# ``fun`` declaration; an extension receiver such as ``String.`` or
# ``List<T>.`` before the name is skipped.
_KT_FUN_RE = re.compile(
    r"^\s*(?:(?:private|public|internal|protected|open|override|suspend|inline|tailrec)\s+)*"
    r"fun\s+(?:[A-Za-z_]\w*(?:<[^>]*>)?\.)?(?P<name>[A-Za-z_]\w*)\s*[<(]"
)
|
|
396
|
+
|
|
397
|
+
# ---------------------------------------------------------------------------
|
|
398
|
+
# Swift
|
|
399
|
+
# ---------------------------------------------------------------------------
|
|
400
|
+
|
|
401
|
+
# Any number of leading ``@attribute`` / ``@attribute(...)`` groups.
_SWIFT_ATTRS = r"^(?:\s*@\w+(?:\s*\([^)]*\))?\s*)*"

# Nominal type declaration; the declaring keyword is captured as ``kind``.
_SWIFT_TYPE_RE = re.compile(
    _SWIFT_ATTRS
    + r"\s*(?:(?:private|public|internal|fileprivate|open|final)\s+)*"
    + r"(?P<kind>class|struct|enum|protocol|actor)\s+(?P<name>[A-Za-z_]\w*)"
)
# ``func`` declaration; name followed by ``<`` or ``(``.
_SWIFT_FUNC_RE = re.compile(
    _SWIFT_ATTRS
    + r"\s*(?:(?:private|public|internal|fileprivate|open|static|class|override|mutating|final)\s+)*"
    + r"func\s+(?P<name>[A-Za-z_]\w*)\s*[<(]"
)
|
|
411
|
+
|
|
412
|
+
# ---------------------------------------------------------------------------
|
|
413
|
+
# PHP
|
|
414
|
+
# ---------------------------------------------------------------------------
|
|
415
|
+
|
|
416
|
+
# class / interface / trait / enum, optionally ``abstract`` or ``final``.
_PHP_CLASS_RE = re.compile(
    r"^\s*(?:(?:abstract|final)\s+)?"
    r"(?P<kind>class|interface|trait|enum)\s+(?P<name>[A-Za-z_]\w*)"
)
# ``function`` with optional visibility and ``static``.
_PHP_FUNC_RE = re.compile(
    r"^\s*(?:(?:public|private|protected)\s+)?(?:static\s+)?"
    r"function\s+(?P<name>[A-Za-z_]\w*)\s*\("
)
|
|
422
|
+
|
|
423
|
+
# ---------------------------------------------------------------------------
|
|
424
|
+
# Dart
|
|
425
|
+
# ---------------------------------------------------------------------------
|
|
426
|
+
|
|
427
|
+
# class / mixin / extension, optionally ``abstract``.
_DART_CLASS_RE = re.compile(
    r"^\s*(?:abstract\s+)?"
    r"(?P<kind>class|mixin|extension)\s+(?P<name>[A-Za-z_]\w*)"
)
# Function-like member: optional ``static``/``external``; then either
# ``factory`` or an optional return type plus optional ``get``/``set``;
# then a name followed by one of ``<``, ``(``, ``=``, ``{``.
_DART_FUNC_RE = re.compile(
    r"^\s*(?:(?:static|external)\s+)?"
    r"(?:factory\s+|(?:[A-Za-z_]\w*(?:<[^>]*>)?\??\s+)?(?:(?:get|set)\s+)?)"
    r"(?P<name>[A-Za-z_]\w*)\s*[<(={]"
)
|
|
435
|
+
|
|
436
|
+
# ---------------------------------------------------------------------------
|
|
437
|
+
# Ruby (end-delimited, not brace-delimited)
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
# ``class`` / ``module`` header; the name may be ``::``-qualified.
_RB_CLASS_RE = re.compile(
    r"^(?P<indent>\s*)(?P<kind>class|module)\s+"
    r"(?P<name>[A-Za-z_]\w*(?:::[A-Za-z_]\w*)*)"
)
# ``def`` header; a ``self.`` prefix is skipped and a trailing ``?``, ``!``
# or ``=`` is kept as part of the name.
_RB_DEF_RE = re.compile(
    r"^(?P<indent>\s*)def\s+(?:self\.)?"
    r"(?P<name>[A-Za-z_]\w*[?!=]?)\s*[\(;\n]?"
)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _ruby_block_end(lines: list[str], start_idx: int, indent: int) -> int:
    """Find the closing ``end`` for a Ruby block at the given indent level.

    A header that closes itself on the same line (``class E < Base; end``,
    ``def x; 1; end``) ends on its own line.  Otherwise the lines after the
    header are scanned for the first ``end`` keyword indented no deeper
    than *indent*; blank lines and ``#`` comment lines are skipped.

    Args:
        lines: the file split into lines.
        start_idx: 0-based index of the ``class``/``module``/``def`` line.
        indent: width of the header's leading whitespace.

    Returns:
        The 1-based line of the terminating ``end``, or ``len(lines)`` when
        none is found.
    """
    # One-liner check: drop a trailing comment, then look for "...; end".
    header = lines[start_idx].split("#", 1)[0].rstrip()
    if header.endswith("end") and header[:-3].rstrip().endswith(";"):
        return start_idx + 1

    for i in range(start_idx + 1, len(lines)):
        text = lines[i]
        stripped = text.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if len(text) - len(text.lstrip()) > indent:
            continue
        if not stripped.startswith("end"):
            continue
        # Accept "end" on its own or followed by a non-identifier character
        # ("end # x", "end.freeze", "end)"), but not names like "end_time".
        tail = stripped[3:4]
        if not tail or not (tail.isalnum() or tail == "_"):
            return i + 1  # 1-based
    return len(lines)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _extract_ruby(file_path: str, content: str) -> list[CodeUnit]:
    """Collect classes, modules and methods from Ruby source.

    Each line is tested against ``_RB_CLASS_RE`` and, failing that,
    ``_RB_DEF_RE``.  Class and module units use the matched keyword as
    their type; methods are typed ``"function"``.  End lines come from
    ``_ruby_block_end`` using the header's indentation.
    """
    source_lines = content.splitlines()
    found: list[CodeUnit] = []

    for index, text in enumerate(source_lines):
        header = _RB_CLASS_RE.match(text)
        if header:
            kind = header.group("kind")
        else:
            header = _RB_DEF_RE.match(text)
            if not header:
                continue
            kind = "function"

        last_line = _ruby_block_end(
            source_lines, index, len(header.group("indent")),
        )
        found.append(
            CodeUnit(file_path, header.group("name"), kind, index + 1, last_line)
        )

    return found
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# ---------------------------------------------------------------------------
|
|
483
|
+
# Per-language pattern tables
|
|
484
|
+
# ---------------------------------------------------------------------------
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _name_kind(m):
    """Return the ``name`` and ``kind`` groups of match *m* as a 2-tuple.

    Used as the callable unit type in pattern tables whose regex captures
    the declaring keyword in a ``kind`` group.
    """
    return m.group("name", "kind")
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _impl_unit(m):
    """Return ``(name, "impl")`` for a Rust ``impl`` header match."""
    return m.group("name"), "impl"


# Each table is a list of (regex, unit_type) pairs consumed by
# ``_extract_brace_lang``.  unit_type is either a fixed label or a callable
# that derives (name, type) from the match.

_JS_TS_PATTERNS = [
    (_JS_NAMED_FUNC_RE, "function"),
    (_JS_CLASS_RE, "class"),
    (_JS_ARROW_RE, "function"),
]

_GO_PATTERNS = [
    (_GO_FUNC_RE, "function"),
    (_GO_TYPE_RE, _name_kind),
]

_RS_PATTERNS = [
    (_RS_FN_RE, "function"),
    (_RS_STRUCT_RE, "struct"),
    (_RS_ENUM_RE, "enum"),
    (_RS_IMPL_RE, _impl_unit),
]

_CS_PATTERNS = [
    (_CS_CLASS_RE, _name_kind),
    (_CS_METHOD_RE, "function"),
]

_JAVA_PATTERNS = [
    (_JAVA_CLASS_RE, _name_kind),
    (_JAVA_METHOD_RE, "function"),
]

_CPP_PATTERNS = [
    (_CPP_CLASS_RE, _name_kind),
    (_CPP_ENUM_RE, "enum"),
    (_CPP_FUNC_RE, "function"),
]

_KT_PATTERNS = [
    (_KT_CLASS_RE, _name_kind),
    (_KT_FUN_RE, "function"),
]

_SWIFT_PATTERNS = [
    (_SWIFT_TYPE_RE, _name_kind),
    (_SWIFT_FUNC_RE, "function"),
]

_PHP_PATTERNS = [
    (_PHP_CLASS_RE, _name_kind),
    (_PHP_FUNC_RE, "function"),
]

_DART_PATTERNS = [
    (_DART_CLASS_RE, _name_kind),
    (_DART_FUNC_RE, "function"),
]
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# ---------------------------------------------------------------------------
|
|
548
|
+
# Dispatcher
|
|
549
|
+
# ---------------------------------------------------------------------------
|
|
550
|
+
|
|
551
|
+
def _brace_extractor(patterns: list):
    """Bind *patterns* to ``_extract_brace_lang``, leaving (file_path, content)."""
    return partial(_extract_brace_lang, patterns=patterns)


# Language name (as returned by ``detect_language``) -> extractor callable
# taking (file_path, content).
_EXTRACTORS = {
    "python": _extract_python_ast,
    "javascript": _brace_extractor(_JS_TS_PATTERNS),
    "typescript": _brace_extractor(_JS_TS_PATTERNS),
    "go": _brace_extractor(_GO_PATTERNS),
    "rust": _brace_extractor(_RS_PATTERNS),
    "csharp": _brace_extractor(_CS_PATTERNS),
    "java": _brace_extractor(_JAVA_PATTERNS),
    "c": _brace_extractor(_CPP_PATTERNS),
    "cpp": _brace_extractor(_CPP_PATTERNS),
    "kotlin": _brace_extractor(_KT_PATTERNS),
    "swift": _brace_extractor(_SWIFT_PATTERNS),
    "php": _brace_extractor(_PHP_PATTERNS),
    "ruby": _extract_ruby,
    "dart": _brace_extractor(_DART_PATTERNS),
}


def extract_code_units(file_path: str, content: str) -> list[CodeUnit]:
    """Run the extractor registered for *file_path*'s language over *content*.

    The language is chosen from the file extension via ``detect_language``.
    Returns an empty list when the extension is unknown or no extractor is
    registered for the detected language.
    """
    extractor = _EXTRACTORS.get(detect_language(file_path))
    if extractor is None:
        return []
    return extractor(file_path, content)
|