synergyspec-selfevolving 1.1.10 → 1.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -3
- package/dist/commands/learn.js +78 -11
- package/dist/commands/self-evolution.d.ts +13 -0
- package/dist/commands/self-evolution.js +156 -20
- package/dist/commands/workflow/status.js +13 -0
- package/dist/core/change-readiness.d.ts +24 -0
- package/dist/core/change-readiness.js +47 -0
- package/dist/core/config-prompts.js +10 -0
- package/dist/core/fitness/health/local-source.d.ts +9 -6
- package/dist/core/fitness/health/local-source.js +9 -6
- package/dist/core/fitness/health/resolve-source.d.ts +4 -3
- package/dist/core/fitness/health/resolve-source.js +5 -4
- package/dist/core/fitness/sample.d.ts +17 -0
- package/dist/core/learn.d.ts +7 -0
- package/dist/core/learn.js +57 -5
- package/dist/core/project-config.d.ts +1 -0
- package/dist/core/project-config.js +11 -8
- package/dist/core/self-evolution/health-baseline.d.ts +24 -0
- package/dist/core/self-evolution/health-baseline.js +78 -0
- package/dist/core/self-evolution/index.d.ts +1 -0
- package/dist/core/self-evolution/index.js +1 -0
- package/dist/core/self-evolution/learn-observation-adapter.d.ts +16 -1
- package/dist/core/self-evolution/learn-observation-adapter.js +101 -15
- package/dist/core/self-evolution/promote.d.ts +25 -0
- package/dist/core/self-evolution/promote.js +21 -0
- package/dist/core/self-evolution/target-evolution.d.ts +7 -0
- package/dist/core/self-evolution/target-evolution.js +9 -0
- package/dist/core/templates/workflows/learn.js +10 -5
- package/package.json +2 -1
- package/scripts/code-health.py +1154 -0
|
@@ -0,0 +1,1154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Self-contained local code-health analyzer for multi-language source trees.
|
|
3
|
+
|
|
4
|
+
This is the fallback MetricSource for the self-evolution fitness loop: it
|
|
5
|
+
needs no SonarQube server, no network, and no third-party packages -- only the
|
|
6
|
+
Python 3 standard library (``ast`` + ``tokenize`` for Python, plus a
|
|
7
|
+
dependency-free brace/keyword heuristic for the C-family and Rust).
|
|
8
|
+
|
|
9
|
+
Usage::
|
|
10
|
+
|
|
11
|
+
python code-health.py <dir>
|
|
12
|
+
|
|
13
|
+
It recursively analyzes every source file under ``<dir>`` -- Python (``.py``)
|
|
14
|
+
via ``ast``, and C (``.c``/``.h``), C++ (``.cc``/``.cpp``/``.cxx``/``.c++``/
|
|
15
|
+
``.hpp``/``.hh``/``.hxx``/``.h++``) and Rust (``.rs``) via a heuristic -- and
|
|
16
|
+
prints a single JSON object to stdout with EXACTLY these keys::
|
|
17
|
+
|
|
18
|
+
cyclomatic_p95
|
|
19
|
+
max_nesting_depth
|
|
20
|
+
cognitive_complexity
|
|
21
|
+
duplicated_lines_density
|
|
22
|
+
import_count
|
|
23
|
+
attr_method_usage_ratio
|
|
24
|
+
bare_except_count
|
|
25
|
+
|
|
26
|
+
The metrics are aggregated across ALL analyzed files of ALL languages
|
|
27
|
+
combined, with the same aggregation rules used for the Python-only path
|
|
28
|
+
(p95 over the combined cyclomatic list, mean over the combined cognitive
|
|
29
|
+
list, max depth across files, summed counts, dup density over the combined
|
|
30
|
+
corpus, mean cohesion ratio over all class/impl methods).
|
|
31
|
+
|
|
32
|
+
The precise backend for the heuristic languages is SonarQube; this local
|
|
33
|
+
source is a deliberate good-enough proxy -- no real C/C++/Rust parser exists
|
|
34
|
+
in the standard library, so a brace/keyword heuristic is expected.
|
|
35
|
+
|
|
36
|
+
Robustness contract: files that fail to parse are skipped (their text may
|
|
37
|
+
still feed duplication density if readable); the program always emits valid
|
|
38
|
+
JSON (zeros / neutral values when the tree is empty), and never raises out to
|
|
39
|
+
the shell on bad input.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import ast
|
|
45
|
+
import io
|
|
46
|
+
import json
|
|
47
|
+
import math
|
|
48
|
+
import os
|
|
49
|
+
import re
|
|
50
|
+
import sys
|
|
51
|
+
import tokenize
|
|
52
|
+
from typing import Iterable, List, Tuple
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# --------------------------------------------------------------------------- #
|
|
56
|
+
# Per-function cyclomatic complexity (McCabe) #
|
|
57
|
+
# --------------------------------------------------------------------------- #
|
|
58
|
+
|
|
59
|
+
# Boolean operators (and / or) add a decision point per *extra* operand: a
|
|
60
|
+
# chain `a and b and c` has 2 ``and`` edges, i.e. (len(values) - 1).
|
|
61
|
+
_BOOLOP_NODE = ast.BoolOp


def _is_function(node: ast.AST) -> bool:
    """Return True when ``node`` is a ``def`` or ``async def`` definition."""
    if isinstance(node, ast.FunctionDef):
        return True
    return isinstance(node, ast.AsyncFunctionDef)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _cyclomatic_complexity(func: ast.AST) -> int:
    """McCabe complexity of a single function: 1 plus its decision points.

    Nested function and class definitions are not entered, so each one is
    scored as its own unit by whoever calls this on it.

    Decision points counted:
      * ``if`` / ``for`` / ``async for`` / ``while`` / ``except`` / ternary: +1
      * boolean chains: +1 per extra operand (``a and b and c`` adds 2)
      * comprehensions: +1 per ``if`` clause
    """
    single_branch = (
        ast.If,
        ast.For,
        ast.AsyncFor,
        ast.While,
        ast.ExceptHandler,
        ast.IfExp,
    )
    total = 1
    pending: List[ast.AST] = list(ast.iter_child_nodes(func))
    while pending:
        current = pending.pop()
        # A nested def/class is a separate complexity unit.
        if isinstance(current, ast.ClassDef) or _is_function(current):
            continue

        if isinstance(current, single_branch):
            total += 1
        elif isinstance(current, _BOOLOP_NODE):
            total += max(0, len(current.values) - 1)
        elif isinstance(current, ast.comprehension):
            total += len(current.ifs)

        pending.extend(ast.iter_child_nodes(current))
    return total
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _percentile(values: List[float], pct: float) -> float:
    """Linear-interpolation percentile (same method as numpy's default).

    Args:
        values: Samples in any order; the list is not modified.
        pct: Requested percentile. Values outside ``[0, 100]`` are clamped to
            that range instead of indexing past the end (IndexError) or
            wrapping around through a negative index.

    Returns:
        The interpolated percentile as a float, or 0.0 for an empty input.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    pct = min(100.0, max(0.0, pct))
    # Fractional position inside the sorted samples; 0 for a single sample.
    rank = (pct / 100.0) * (len(ordered) - 1)
    low = math.floor(rank)
    high = math.ceil(rank)
    if low == high:
        return float(ordered[low])
    frac = rank - low
    return float(ordered[low] * (1 - frac) + ordered[high] * frac)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# --------------------------------------------------------------------------- #
|
|
115
|
+
# Max nesting depth of control structures #
|
|
116
|
+
# --------------------------------------------------------------------------- #
|
|
117
|
+
|
|
118
|
+
_NESTING_NODES = (
    ast.If,
    ast.For,
    ast.AsyncFor,
    ast.While,
    ast.With,
    ast.AsyncWith,
    ast.Try,
    ast.FunctionDef,
    ast.AsyncFunctionDef,
)


def _max_nesting_depth(tree: ast.AST) -> int:
    """Deepest nesting of control structures (and function defs) in ``tree``.

    Every node type in ``_NESTING_NODES`` adds one level for whatever it
    contains. An ``elif`` branch stays at the level of the ``if`` it belongs
    to: the parser represents ``elif`` as an ``If`` nested in the parent's
    ``orelse``, which would otherwise make a flat ``if/elif/elif`` chain look
    three levels deep.

    Returns 0 for a tree with no nesting constructs.
    """

    def is_elif(parent: ast.AST, child: ast.AST) -> bool:
        # An ``elif`` is the only statement of the parent's ``orelse`` and
        # starts in the same column as the parent ``if``; an ``if`` written
        # under an explicit ``else:`` is indented deeper and still nests.
        return (
            isinstance(parent, ast.If)
            and isinstance(child, ast.If)
            and len(parent.orelse) == 1
            and parent.orelse[0] is child
            and getattr(child, "col_offset", -1) == getattr(parent, "col_offset", -2)
        )

    def walk(node: ast.AST, depth: int) -> int:
        deepest = depth
        for child in ast.iter_child_nodes(node):
            if isinstance(child, _NESTING_NODES) and not is_elif(node, child):
                deepest = max(deepest, walk(child, depth + 1))
            else:
                deepest = max(deepest, walk(child, depth))
        return deepest

    return walk(tree, 0)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# --------------------------------------------------------------------------- #
|
|
147
|
+
# Cognitive complexity (Sonar-style approximation) #
|
|
148
|
+
# --------------------------------------------------------------------------- #
|
|
149
|
+
|
|
150
|
+
# Structures that both (a) increment by 1 and (b) raise the nesting level for
|
|
151
|
+
# anything nested beneath them, adding the current nesting level on top of the
|
|
152
|
+
# base increment.
|
|
153
|
+
_COGNITIVE_NESTING_NODES = (
    ast.If,
    ast.For,
    ast.AsyncFor,
    ast.While,
    ast.ExceptHandler,
)


def _cognitive_complexity(func: ast.AST) -> int:
    """Sonar-style cognitive complexity for a single function.

    Rules approximated:
      * +1 plus the current nesting level for each structure that breaks
        linear flow: if / for / while / except.
      * code inside those structures is one nesting level deeper.
      * each boolean-operator node and each ternary adds a flat +1.
      * each ``elif`` and each ``else`` of an ``if`` adds a flat +1 with no
        nesting bonus; their bodies nest one level, like the ``if`` body.

    Nested function and class definitions are skipped; they are separate
    units.

    The ``elif`` rule is what the parser hides: ``elif`` is an ``If`` stored
    in the parent's ``orelse``. Scoring it as an ordinary nested ``if`` would
    charge it twice (once as the parent's else-branch, once as a nested
    structure with a nesting bonus), so the chain is followed explicitly.
    """
    score = 0

    def visit(node: ast.AST, nesting: int) -> None:
        nonlocal score
        if _is_function(node) or isinstance(node, ast.ClassDef):
            return
        if isinstance(node, ast.If):
            visit_if(node, nesting, chained=False)
            return
        if isinstance(node, _COGNITIVE_NESTING_NODES):
            score += 1 + nesting
            walk(node, nesting + 1)
            return
        if isinstance(node, (_BOOLOP_NODE, ast.IfExp)):
            score += 1  # flat penalty, independent of nesting
        walk(node, nesting)

    def walk(node: ast.AST, nesting: int) -> None:
        for child in ast.iter_child_nodes(node):
            visit(child, nesting)

    def visit_if(node: ast.If, nesting: int, chained: bool) -> None:
        nonlocal score
        score += 1 if chained else 1 + nesting
        visit(node.test, nesting)
        for stmt in node.body:
            visit(stmt, nesting + 1)

        branch = node.orelse
        if not branch:
            return
        first = branch[0]
        # ``elif``: sole statement of orelse, an If in the parent's column.
        if (
            len(branch) == 1
            and isinstance(first, ast.If)
            and getattr(first, "col_offset", -1) == getattr(node, "col_offset", -2)
        ):
            visit_if(first, nesting, chained=True)
            return
        # Plain ``else``: flat +1, body nests one level.
        score += 1
        for stmt in branch:
            visit(stmt, nesting + 1)

    walk(func, 0)
    return score
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# --------------------------------------------------------------------------- #
|
|
205
|
+
# Bare / overly-broad except handlers #
|
|
206
|
+
# --------------------------------------------------------------------------- #
|
|
207
|
+
|
|
208
|
+
_BROAD_EXC_NAMES = {"Exception", "BaseException"}


def _is_broad_handler(handler: ast.ExceptHandler) -> bool:
    """Tell whether an ``except`` clause catches (nearly) everything.

    True for a bare ``except:`` and for any clause that names ``Exception``
    or ``BaseException``, alone or inside a tuple, written either as a plain
    name or as the last part of a dotted name (``builtins.Exception``).
    """
    caught = handler.type
    if caught is None:
        return True  # bare except:
    candidates: Iterable[ast.AST] = (
        caught.elts if isinstance(caught, ast.Tuple) else [caught]
    )
    for expr in candidates:
        if isinstance(expr, ast.Name):
            label = expr.id
        elif isinstance(expr, ast.Attribute):
            label = expr.attr
        else:
            continue
        if label in _BROAD_EXC_NAMES:
            return True
    return False
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# --------------------------------------------------------------------------- #
|
|
230
|
+
# Per-class attr / method usage ratio #
|
|
231
|
+
# --------------------------------------------------------------------------- #
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _self_attr_names(method: ast.AST) -> "tuple[set, set]":
    """Collect the ``self.<attr>`` names used inside one method body.

    Nested function and class definitions are not entered.

    Returns:
        ``(assigned, referenced)`` where ``referenced`` holds every
        ``self.<attr>`` mentioned anywhere in the method (reads and writes
        alike) and ``assigned`` holds the subset that appears as a store
        target. Store targets are recognised from the node's ``ctx``, so
        plain, annotated and augmented assignments are covered as well as
        tuple unpacking (``self.a, self.b = ...``), ``for self.x in ...`` and
        ``with ... as self.x``.
    """
    assigned: set = set()
    referenced: set = set()

    # Iterative walk: one pass, no recursion-depth limit on deep bodies.
    pending: List[ast.AST] = list(ast.iter_child_nodes(method))
    while pending:
        node = pending.pop()
        if _is_function(node) or isinstance(node, ast.ClassDef):
            continue
        if (
            isinstance(node, ast.Attribute)
            and isinstance(node.value, ast.Name)
            and node.value.id == "self"
        ):
            referenced.add(node.attr)
            if isinstance(getattr(node, "ctx", None), ast.Store):
                assigned.add(node.attr)
        pending.extend(ast.iter_child_nodes(node))

    return assigned, referenced
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _attr_method_usage_ratio(classes: List[ast.ClassDef]) -> float:
    """Average share of a class's ``self`` attributes that each method uses.

    For every class, the attribute universe is the union of all
    ``self.<attr>`` names assigned or referenced by its directly defined
    methods. Each method then contributes the fraction of that universe it
    touches, and the result is the mean over all (class, method) pairs.

    Classes with no methods or no ``self`` attributes contribute nothing.
    Returns 1.0 (neutral "no signal") when no pair contributes.
    """
    fractions: List[float] = []
    for klass in classes:
        touched_by_method = []
        for member in klass.body:
            if not _is_function(member):
                continue
            stored, read = _self_attr_names(member)
            touched_by_method.append(stored | read)

        universe: set = set().union(*touched_by_method)
        if not universe:
            continue

        size = len(universe)
        for touched in touched_by_method:
            fractions.append(len(touched & universe) / size)

    if fractions:
        return sum(fractions) / len(fractions)
    return 1.0
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# --------------------------------------------------------------------------- #
|
|
313
|
+
# Duplicated lines density #
|
|
314
|
+
# --------------------------------------------------------------------------- #
|
|
315
|
+
|
|
316
|
+
_DUP_WINDOW = 4  # >= 4 consecutive identical stripped lines


def _non_trivial_lines(source: str) -> List[str]:
    """Stripped lines worth considering for duplication.

    Blank lines and lines whose first non-space character is ``#`` are
    dropped, since they inflate duplication noise.
    """
    stripped = (raw.strip() for raw in source.splitlines())
    return [line for line in stripped if line and not line.startswith("#")]


def _duplicated_lines_density(all_files_lines: List[List[str]]) -> float:
    """Fraction of non-trivial lines that fall inside a duplicated block.

    A window of ``_DUP_WINDOW`` consecutive lines that occurs more than once
    anywhere in the corpus (same file or another one) marks all of its lines
    as duplicated. Windows never straddle two files.

    Args:
        all_files_lines: One list of already-stripped lines per file.

    Returns:
        A value in ``[0, 1]``; 0.0 when the corpus has no lines.
    """
    total_lines = sum(len(lines) for lines in all_files_lines)
    if total_lines == 0:
        return 0.0

    def windows(lines: List[str]):
        for start in range(len(lines) - _DUP_WINDOW + 1):
            yield start, tuple(lines[start:start + _DUP_WINDOW])

    # Key on the window tuple itself rather than on ``hash(window)``: two
    # different windows can share a hash, which would falsely flag unrelated
    # code as duplicated.
    occurrences: dict = {}
    for lines in all_files_lines:
        for _, window in windows(lines):
            occurrences[window] = occurrences.get(window, 0) + 1

    duplicated = set()  # (file_idx, line_idx)
    for file_idx, lines in enumerate(all_files_lines):
        for start, window in windows(lines):
            if occurrences[window] > 1:
                duplicated.update(
                    (file_idx, k) for k in range(start, start + _DUP_WINDOW)
                )

    return min(1.0, max(0.0, len(duplicated) / total_lines))
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# --------------------------------------------------------------------------- #
|
|
375
|
+
# Heuristic analyzer for C-family + Rust (brace/keyword proxy) #
|
|
376
|
+
# --------------------------------------------------------------------------- #
|
|
377
|
+
#
|
|
378
|
+
# No real parser for C/C++/Rust exists in the standard library, so these
|
|
379
|
+
# languages are handled with a brace-depth + keyword heuristic. The precise
|
|
380
|
+
# backend is SonarQube; this is a deliberate good-enough proxy. Everything
|
|
381
|
+
# below runs on a *stripped* copy of the source (comments and string/char
|
|
382
|
+
# literals replaced with spaces) so keywords inside them are never counted.
|
|
383
|
+
|
|
384
|
+
# Language tags for the heuristic dispatch.
|
|
385
|
+
_LANG_C = "c"
_LANG_CPP = "cpp"
_LANG_RUST = "rust"

# File suffix -> heuristic language family. Python is dispatched separately.
_C_EXTS = {".c", ".h"}
_CPP_EXTS = {
    ".cc", ".cpp", ".cxx", ".c++",
    ".hpp", ".hh", ".hxx", ".h++",
}
_RUST_EXTS = {".rs"}
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _strip_c_family(source: str) -> str:
    """Blank out comments and string/char literals in C/C++/Rust source.

    Removed spans are ``//`` line comments, ``/* ... */`` block comments,
    double-quoted strings, char literals, and (best effort) Rust raw strings
    ``r"..."`` / ``r#"..."#`` / ``br"..."`` with any number of ``#``. Every
    removed character becomes a space, except newlines, which are kept, so
    the output has the same length and line layout as the input.

    Rust lifetimes and labels (``'a``) start with a quote but never close
    with one; they are left untouched.

    NOTE(review): C++ ``R"delim(...)delim"`` raw strings are only handled as
    far as the first ``"`` after the opening quote -- confirm that is
    acceptable for the C++ sources being measured.
    """
    out: List[str] = []
    n = len(source)

    def blank(start: int, stop: int) -> None:
        # Replace every non-newline char with a space; keep newlines.
        out.append("".join("\n" if ch == "\n" else " " for ch in source[start:stop]))

    def at_token_start(idx: int) -> bool:
        # True unless the previous character continues an identifier.
        if idx == 0:
            return True
        prev = source[idx - 1]
        return not (prev.isalnum() or prev == "_")

    def raw_string_end(idx: int) -> int:
        # ``idx`` is the first position after the ``r`` / ``br`` prefix.
        # Returns the end of the raw string, or -1 if none starts here.
        quote = idx
        while quote < n and source[quote] == "#":
            quote += 1
        if quote >= n or source[quote] != '"':
            return -1
        closer = '"' + "#" * (quote - idx)
        hit = source.find(closer, quote + 1)
        return n if hit == -1 else hit + len(closer)

    def string_end(idx: int) -> int:
        # ``idx`` is the opening quote; an unterminated string runs to EOF.
        j = idx + 1
        while j < n:
            if source[j] == "\\":
                j += 2  # skip the escaped character
                continue
            if source[j] == '"':
                return j + 1
            j += 1
        return j

    def char_literal_end(idx: int) -> int:
        # ``idx`` is the opening quote. Returns -1 for lifetimes / labels.
        j = idx + 1
        if j < n and source[j] == "\\":
            if j + 1 >= n or source[j + 1] == "\n":
                return -1
            # Start looking for the closing quote *after* the escaped
            # character, so that '\'' closes on its final quote instead of
            # on the escaped one.
            k = j + 2
            while k < n and source[k] != "'" and source[k] != "\n":
                k += 1
            return k + 1 if k < n and source[k] == "'" else -1
        if j + 1 < n and source[j + 1] == "'":
            return j + 2  # single char then a quote: 'a'
        return -1

    i = 0
    while i < n:
        ch = source[i]
        pair = source[i:i + 2]
        stop = -1

        if pair == "//":
            stop = source.find("\n", i)
            if stop == -1:
                stop = n
        elif pair == "/*":
            close = source.find("*/", i + 2)
            stop = n if close == -1 else close + 2
        elif ch == '"':
            stop = string_end(i)
        elif ch == "'":
            stop = char_literal_end(i)
        else:
            # Raw-string prefixes only count at a token boundary, never in
            # the middle of a longer identifier.
            prefix_len = 0
            if ch in ("r", "R"):
                prefix_len = 1
            elif ch == "b" and source[i + 1:i + 2] in ("r", "R"):
                prefix_len = 2
            if prefix_len and at_token_start(i):
                stop = raw_string_end(i + prefix_len)

        if stop == -1:
            out.append(ch)
            i += 1
        else:
            blank(i, stop)
            i = stop

    return "".join(out)
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
# Identifier-boundary keyword matchers (operate on stripped text).
|
|
525
|
+
_RE_IF = re.compile(r"\bif\b")
|
|
526
|
+
_RE_FOR = re.compile(r"\bfor\b")
|
|
527
|
+
_RE_WHILE = re.compile(r"\bwhile\b")
|
|
528
|
+
_RE_CASE = re.compile(r"\bcase\b")
|
|
529
|
+
_RE_CATCH = re.compile(r"\bcatch\b")
|
|
530
|
+
_RE_SWITCH = re.compile(r"\bswitch\b")
|
|
531
|
+
_RE_ELSE = re.compile(r"\belse\b")
|
|
532
|
+
_RE_AND = re.compile(r"&&")
|
|
533
|
+
_RE_OR = re.compile(r"\|\|")
|
|
534
|
+
_RE_TERNARY_Q = re.compile(r"\?")
|
|
535
|
+
_RE_MATCH = re.compile(r"\bmatch\b")
|
|
536
|
+
_RE_LOOP = re.compile(r"\bloop\b")
|
|
537
|
+
_RE_IF_LET = re.compile(r"\bif\s+let\b")
|
|
538
|
+
_RE_WHILE_LET = re.compile(r"\bwhile\s+let\b")
|
|
539
|
+
_RE_FAT_ARROW = re.compile(r"=>")
|
|
540
|
+
|
|
541
|
+
# Function-header detection. Operate on the stripped text.
|
|
542
|
+
# Rust: `fn name(...)` (possibly `pub`, `async`, `const`, `unsafe`, generics).
|
|
543
|
+
_RE_RUST_FN = re.compile(r"\bfn\s+[A-Za-z_]\w*\s*(?:<[^{};]*?>)?\s*\(")
|
|
544
|
+
# C-family: an identifier (the function name) immediately followed by `(`.
|
|
545
|
+
# We additionally require the matched `(` to begin a parameter list that, when
|
|
546
|
+
# the matching brace block opens, denotes a function. We locate candidate
|
|
547
|
+
# headers as `<ident>(` not preceded by a keyword that would make it a call or
|
|
548
|
+
# control structure.
|
|
549
|
+
_RE_C_HEADER_IDENT = re.compile(r"(?<![\w])([A-Za-z_]\w*)\s*\(")
|
|
550
|
+
|
|
551
|
+
# Keywords that, when they are the identifier before `(`, mean it is NOT a
|
|
552
|
+
# function definition (control flow / common macros / operators).
|
|
553
|
+
_C_NON_FUNC_IDENTS = {
|
|
554
|
+
"if", "for", "while", "switch", "catch", "return", "sizeof", "do",
|
|
555
|
+
"case", "else", "defined", "static_assert", "assert", "decltype",
|
|
556
|
+
"alignof", "alignas", "noexcept", "throw", "new", "delete", "and", "or",
|
|
557
|
+
"not", "typeid", "static_cast", "dynamic_cast", "reinterpret_cast",
|
|
558
|
+
"const_cast",
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def _ternary_count(text: str) -> int:
    """Count ``?`` characters that look like a ternary ``cond ? a : b``.

    A ``?`` is ignored when the next non-whitespace character is one of
    ``;`` ``)`` ``,`` ``.`` ``}`` ``?`` or the end of the text. Those are
    the shapes of Rust's postfix error-propagation ``?``, including a method
    chain that continues on the next line::

        let v = load()?
            .parse()?;

    Whitespace here includes newlines; skipping only spaces and tabs would
    count every ``?`` at the end of a line as a ternary. The result is
    approximate; monotonic sanity is what matters.
    """
    non_ternary_followers = (";", ")", ",", ".", "}", "?", "")
    size = len(text)
    count = 0
    pos = text.find("?")
    while pos != -1:
        k = pos + 1
        while k < size and text[k].isspace():
            k += 1
        follower = text[k] if k < size else ""
        if follower not in non_ternary_followers:
            count += 1
        pos = text.find("?", pos + 1)
    return count
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def _find_blocks(stripped: str) -> List[Tuple[int, int, int]]:
    """Locate every matched ``{ ... }`` pair, nested ones included.

    Each entry is ``(open_index, close_index, header_start)``:

    * ``open_index`` is the position of the ``{``;
    * ``close_index`` is the position of its matching ``}`` (it points AT
      the brace);
    * ``header_start`` is the position right after the closest preceding
      ``;``, ``{`` or ``}``, so ``stripped[header_start:open_index]`` is the
      block's header text.

    Blocks are listed in the order their closing brace is seen, so inner
    blocks come before the block that contains them. A ``}`` with no open
    partner and a ``{`` that is never closed produce no entry.
    """
    found: List[Tuple[int, int, int]] = []
    open_stack: List[Tuple[int, int]] = []  # (open_index, header_start)
    header_from = 0
    for pos, ch in enumerate(stripped):
        if ch not in ("{", "}", ";"):
            continue
        if ch == "{":
            open_stack.append((pos, header_from))
        elif ch == "}" and open_stack:
            opened_at, hdr = open_stack.pop()
            found.append((opened_at, pos, hdr))
        # All three characters end whatever header text came before them.
        header_from = pos + 1
    return found
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def _is_rust_fn_header(header: str) -> bool:
    """True when ``header`` contains a Rust ``fn name(`` signature start."""
    return bool(_RE_RUST_FN.search(header))
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _is_c_fn_header(header: str) -> bool:
    """Guess whether a block header is a C/C++ function signature.

    ``header`` is the text between the previous boundary and the block's
    ``{``. It qualifies when it contains no aggregate keyword, its last
    ``<ident>(`` has an identifier that is not a control keyword or similar,
    and a ``)`` follows that opening paren.

    NOTE(review): any header mentioning ``struct`` / ``enum`` / ``union`` /
    ``namespace`` / ``class`` is rejected outright, which also drops
    functions such as ``struct node *find(void)`` or
    ``template <class T> T max(T a, T b)`` -- confirm that trade-off is
    intended.
    """
    # Aggregate / namespace bodies are not functions; the methods defined
    # inside them are separate blocks and are judged on their own headers.
    if re.search(r"\b(struct|enum|union|namespace|class)\b", header):
        return False

    # Only the LAST ``<ident>(`` decides: a trailing control keyword cancels
    # any earlier candidate.
    candidate = None
    for found in _RE_C_HEADER_IDENT.finditer(header):
        candidate = None if found.group(1) in _C_NON_FUNC_IDENTS else found
    if candidate is None:
        return False

    # ``end() - 1`` is the candidate's ``(``; a ``)`` must follow it.
    return header.find(")", candidate.end() - 1) != -1
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def _depth_at(stripped: str, idx: int) -> int:
    """Brace depth just before character index ``idx``.

    This is the number of ``{`` minus the number of ``}`` in
    ``stripped[:idx]``; it goes negative when closing braces outnumber
    opening ones.
    """
    prefix = stripped[:idx]
    return prefix.count("{") - prefix.count("}")
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def _analyze_heuristic(stripped: str, lang: str) -> dict:
    """Compute the heuristic metrics for one stripped C/C++/Rust file.

    Args:
        stripped: Source text with comments and literals already blanked.
        lang: One of the ``_LANG_*`` tags; ``_LANG_RUST`` selects the Rust
            function-header test, anything else the C-family one.

    Returns:
        A dict with these keys:

        * ``cyclomatic``   -- list[int], one entry per detected function
        * ``cognitive``    -- list[int], one entry per detected function
        * ``max_depth``    -- int, deepest brace nesting over all function
          bodies (0 when no function was detected)
        * ``import_count`` -- int
        * ``bare_except``  -- int, broad/abrupt error-handling analogue
        * ``cohesion``     -- list[float], per (class/impl, method) ratio
    """
    blocks = _find_blocks(stripped)

    # A block is a function when the text in front of its ``{`` looks like a
    # signature for the language at hand.
    looks_like_fn = _is_rust_fn_header if lang == _LANG_RUST else _is_c_fn_header
    fn_spans: List[Tuple[int, int]] = [
        (open_i, close_i)
        for open_i, close_i, hdr_start in blocks
        if looks_like_fn(stripped[hdr_start:open_i])
    ]

    result: dict = {"cyclomatic": [], "cognitive": [], "max_depth": 0}
    for open_i, close_i in fn_spans:
        body = stripped[open_i + 1:close_i]
        result["cyclomatic"].append(_heuristic_cyclomatic(body, lang))
        result["cognitive"].append(_heuristic_cognitive(body, lang))
        result["max_depth"] = max(result["max_depth"], _max_brace_depth(body))

    # File-level metrics look at the whole text, not just function bodies.
    result["import_count"] = _heuristic_imports(stripped, lang)
    result["bare_except"] = _heuristic_broad_handling(stripped, lang)
    result["cohesion"] = _heuristic_cohesion(stripped, blocks, lang)
    return result
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _max_brace_depth(body: str) -> int:
|
|
703
|
+
"""Deepest brace nesting reached inside ``body`` (control structures in
|
|
704
|
+
these languages are brace-delimited). The function body itself is depth 1
|
|
705
|
+
relative to the function header; we report depth counting the outermost
|
|
706
|
+
brace as 1."""
|
|
707
|
+
depth = 0
|
|
708
|
+
deepest = 0
|
|
709
|
+
for ch in body:
|
|
710
|
+
if ch == "{":
|
|
711
|
+
depth += 1
|
|
712
|
+
if depth > deepest:
|
|
713
|
+
deepest = depth
|
|
714
|
+
elif ch == "}":
|
|
715
|
+
if depth > 0:
|
|
716
|
+
depth -= 1
|
|
717
|
+
# The body text excludes the function's own outer braces, so a top-level
|
|
718
|
+
# statement (no nested block) yields deepest == 0. Add 1 so a flat function
|
|
719
|
+
# counts as depth 1 (its own body) and one nested control block counts as 2,
|
|
720
|
+
# matching the "deepest brace nesting reached inside function bodies"
|
|
721
|
+
# framing where the function body is the first level.
|
|
722
|
+
return deepest + 1
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
def _heuristic_cyclomatic(body: str, lang: str) -> int:
    """Return 1 plus the number of decision points found in a function body.

    ``body`` is stripped text. Every hit of the if / for / while / case /
    catch / and / or patterns counts once, as does every ternary reported by
    ``_ternary_count``. Rust additionally counts fat-arrow and ``loop`` hits.
    """
    decision_res = [
        _RE_IF,
        _RE_FOR,
        _RE_WHILE,
        _RE_CASE,
        _RE_CATCH,
        _RE_AND,
        _RE_OR,
    ]
    total = 1 + sum(len(rx.findall(body)) for rx in decision_res)
    total += _ternary_count(body)

    if lang == _LANG_RUST:
        # Fat-arrow hits stand in for match arms: every hit is counted,
        # whether or not it sits inside a ``match`` block (approximation).
        # ``if let`` / ``while let`` get no extra increment here; they are
        # expected to be covered by the plain if/while counts above.
        total += len(_RE_FAT_ARROW.findall(body))
        total += len(_RE_LOOP.findall(body))

    return total
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def _heuristic_cognitive(body: str, lang: str) -> int:
    """Approximate a nesting-weighted cognitive complexity for one body.

    Each control-structure hit contributes ``1 + nesting`` where ``nesting``
    is ``_depth_at(body, <hit start>)``. Hits of the and / or / else
    patterns and every ternary contribute a flat +1 with no nesting bonus.

    The nesting-weighted pattern set is if / for / while / match / loop for
    Rust and if / for / while / switch / catch for every other language tag.
    """
    if lang == _LANG_RUST:
        nested_res = (_RE_IF, _RE_FOR, _RE_WHILE, _RE_MATCH, _RE_LOOP)
    else:
        nested_res = (_RE_IF, _RE_FOR, _RE_WHILE, _RE_SWITCH, _RE_CATCH)

    # Structures that are penalised more the deeper they sit.
    total = sum(
        1 + _depth_at(body, hit.start())
        for rx in nested_res
        for hit in rx.finditer(body)
    )

    # Flat penalties.
    for flat_rx in (_RE_AND, _RE_OR, _RE_ELSE):
        total += len(flat_rx.findall(body))
    total += _ternary_count(body)

    return total
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def _heuristic_imports(stripped: str, lang: str) -> int:
    """Count import-like declarations in comment/string-stripped source.

    Rust (``lang == _LANG_RUST``): the number of ``use ...;`` declarations
    plus the number of ``extern crate ...;`` items. The count is per
    declaration, not per imported name, so ``use std::{fmt, io};`` adds 1.

    Any other language tag (C / C++): the number of ``#include`` directives
    that start a line (leading blanks and blanks after ``#`` are allowed).
    """
    if lang == _LANG_RUST:
        # The path part may not contain braces, but a declaration may end in
        # one brace group (``use a::{b, c::{d, e}};``). The group itself must
        # not contain ``;``, which keeps the match from running into a block
        # body. Without the optional group, grouped imports were never
        # counted at all.
        uses = len(
            re.findall(r"\buse\b[^;{}]*(?:\{[^;]*\})?\s*;", stripped)
        )
        crates = len(re.findall(r"\bextern\s+crate\b[^;{}]*;", stripped))
        return uses + crates
    # C / C++: `#include` directives.
    return len(re.findall(r"(?m)^[ \t]*#[ \t]*include\b", stripped))
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def _heuristic_broad_handling(stripped: str, lang: str) -> int:
    """Count abrupt / broad error handling, the ``bare_except`` analogue.

    Lower is better.

    Rust: occurrences of ``.unwrap(``, ``.expect(`` and ``panic!(``.
    C++:  catch-all handlers, i.e. ``catch (...)``.
    Any other tag (plain C): always 0.
    """
    if lang == _LANG_RUST:
        abrupt_patterns = (
            r"\.unwrap\s*\(",
            r"\.expect\s*\(",
            r"\bpanic!\s*\(",
        )
        return sum(len(re.findall(pat, stripped)) for pat in abrupt_patterns)

    if lang == _LANG_CPP:
        return len(re.findall(r"\bcatch\s*\(\s*\.\.\.\s*\)", stripped))

    return 0
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def _heuristic_cohesion(
    stripped: str, blocks: List[Tuple[int, int, int]], lang: str
) -> List[float]:
    """Return one cohesion ratio per (container, method) pair.

    A container is a block whose header mentions ``impl`` (Rust) or
    ``class`` / ``struct`` (C++). Its methods are whatever
    ``_direct_method_bodies`` returns for that span. For each container the
    member universe is the union of the names its methods reach through
    ``self.<name>`` (Rust) or ``this-><name>`` (C++); a method's ratio is
    ``|names it references| / |universe|``.

    Containers with no methods, or whose methods reference no members at
    all, contribute nothing. Plain C always yields an empty list.
    """
    if lang == _LANG_C:
        return []

    # Only the patterns and the method-detection language differ between the
    # two supported container kinds; everything that is not Rust is handled
    # as C++.
    if lang == _LANG_RUST:
        container_pat = r"\bimpl\b"
        member_pat = r"\bself\s*\.\s*([A-Za-z_]\w*)"
        method_lang = _LANG_RUST
    else:
        container_pat = r"\b(class|struct)\b"
        member_pat = r"\bthis\s*->\s*([A-Za-z_]\w*)"
        method_lang = _LANG_CPP

    # Ordered by opening brace so containers and their contents are visited
    # in source order.
    ordered = sorted(blocks, key=lambda blk: blk[0])

    ratios: List[float] = []
    for brace_open, brace_close, header_from in ordered:
        if re.search(container_pat, stripped[header_from:brace_open]) is None:
            continue

        member_sets = [
            set(re.findall(member_pat, method_text))
            for method_text in _direct_method_bodies(
                stripped, brace_open, brace_close, ordered, method_lang
            )
        ]
        universe = set().union(*member_sets)
        if not universe:
            # No methods, or no member access anywhere: no signal.
            continue

        # Each method's set is a subset of the universe by construction, so
        # its size is already the size of the intersection.
        ratios.extend(len(members) / len(universe) for members in member_sets)

    return ratios
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def _direct_method_bodies(
|
|
873
|
+
stripped: str,
|
|
874
|
+
container_open: int,
|
|
875
|
+
container_close: int,
|
|
876
|
+
blocks_sorted: List[Tuple[int, int, int]],
|
|
877
|
+
lang: str,
|
|
878
|
+
) -> List[str]:
|
|
879
|
+
"""Return the body texts of method blocks that are nested (at any depth)
|
|
880
|
+
inside the container span and whose header looks like a method.
|
|
881
|
+
|
|
882
|
+
For Rust we require ``fn`` taking ``self``; for C++ we require a function
|
|
883
|
+
header (``<ident>(`` not a control keyword). We collect any matching fn
|
|
884
|
+
block strictly inside the container; nested blocks inside those (control
|
|
885
|
+
structures) are naturally excluded because their headers don't match.
|
|
886
|
+
"""
|
|
887
|
+
bodies: List[str] = []
|
|
888
|
+
for open_i, close_i, hdr_start in blocks_sorted:
|
|
889
|
+
if not (container_open < open_i and close_i < container_close):
|
|
890
|
+
continue
|
|
891
|
+
header = stripped[hdr_start:open_i]
|
|
892
|
+
if lang == _LANG_RUST:
|
|
893
|
+
if _RE_RUST_FN.search(header) and re.search(r"\bself\b", header):
|
|
894
|
+
bodies.append(stripped[open_i + 1:close_i])
|
|
895
|
+
else: # C++
|
|
896
|
+
if _is_c_fn_header(header):
|
|
897
|
+
bodies.append(stripped[open_i + 1:close_i])
|
|
898
|
+
return bodies
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
# --------------------------------------------------------------------------- #
# File walking + aggregation                                                  #
# --------------------------------------------------------------------------- #

# Directory names pruned from every tree walk. Entries are compared against
# the bare directory name, not the full path.
_SKIP_DIRS = {
    ".git",
    ".mypy_cache",
    ".pytest_cache",
    "__pycache__",
    ".venv",
    "venv",
    "node_modules",
    "vendor",
    "third_party",
    "build",
    "dist",
    "target",
    "synergyspec-selfevolving",
    ".synergyspec-selfevolving",
}

# Extension handled by the AST-based Python path.
_PY_EXT = ".py"
# Extensions handled by the heuristic (C / C++ / Rust) path.
_HEURISTIC_EXTS = _C_EXTS | _CPP_EXTS | _RUST_EXTS
# Every extension the analyzer looks at.
_ALL_EXTS = {_PY_EXT} | _HEURISTIC_EXTS
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def _ext_lang(path: str) -> "str | None":
    """Map a path to its heuristic language tag.

    The comparison is case-insensitive and works on the path suffix. Returns
    ``_LANG_C``, ``_LANG_CPP`` or ``_LANG_RUST``, or ``None`` when the path
    ends in none of the heuristic extensions.
    """
    lowered = path.lower()
    # Checked in this order, so an extension listed in more than one set
    # resolves to the earliest tag.
    tagged_sets = (
        (_C_EXTS, _LANG_C),
        (_CPP_EXTS, _LANG_CPP),
        (_RUST_EXTS, _LANG_RUST),
    )
    # Longer extensions are tried first so that a short extension can never
    # shadow a longer one that also matches.
    for suffix in sorted(_HEURISTIC_EXTS, key=len, reverse=True):
        if not lowered.endswith(suffix):
            continue
        for members, tag in tagged_sets:
            if suffix in members:
                return tag
    return None
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def _iter_source_files(root: str) -> "Iterable[tuple[str, str]]":
|
|
943
|
+
"""Yield ``(path, kind)`` where kind is 'py' or a heuristic language tag."""
|
|
944
|
+
def classify(p: str) -> "str | None":
|
|
945
|
+
if p.lower().endswith(_PY_EXT):
|
|
946
|
+
return "py"
|
|
947
|
+
return _ext_lang(p)
|
|
948
|
+
|
|
949
|
+
if os.path.isfile(root):
|
|
950
|
+
kind = classify(root)
|
|
951
|
+
if kind is not None:
|
|
952
|
+
yield root, kind
|
|
953
|
+
return
|
|
954
|
+
for dirpath, dirnames, filenames in os.walk(root):
|
|
955
|
+
dirnames[:] = [d for d in dirnames if d not in _SKIP_DIRS]
|
|
956
|
+
for fn in filenames:
|
|
957
|
+
full = os.path.join(dirpath, fn)
|
|
958
|
+
kind = classify(full)
|
|
959
|
+
if kind is not None:
|
|
960
|
+
yield full, kind
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def _iter_python_files(root: str) -> Iterable[str]:
|
|
964
|
+
"""Backwards-compatible Python-only iterator (retained for any callers /
|
|
965
|
+
tests that import it). Honors the extended skip-dir set."""
|
|
966
|
+
if os.path.isfile(root) and root.endswith(".py"):
|
|
967
|
+
yield root
|
|
968
|
+
return
|
|
969
|
+
for dirpath, dirnames, filenames in os.walk(root):
|
|
970
|
+
dirnames[:] = [d for d in dirnames if d not in _SKIP_DIRS]
|
|
971
|
+
for fn in filenames:
|
|
972
|
+
if fn.endswith(".py"):
|
|
973
|
+
yield os.path.join(dirpath, fn)
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
def _read_source(path: str) -> "str | None":
|
|
977
|
+
try:
|
|
978
|
+
with tokenize.open(path) as f: # respects PEP 263 encoding cookies
|
|
979
|
+
return f.read()
|
|
980
|
+
except (OSError, SyntaxError, ValueError, UnicodeDecodeError):
|
|
981
|
+
# tokenize.open can raise SyntaxError on a bad encoding declaration.
|
|
982
|
+
try:
|
|
983
|
+
with open(path, "r", encoding="utf-8", errors="replace") as f:
|
|
984
|
+
return f.read()
|
|
985
|
+
except OSError:
|
|
986
|
+
return None
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
def _non_trivial_lines_heuristic(stripped: str) -> List[str]:
|
|
990
|
+
"""Stripped lines worth considering for duplication in C/C++/Rust.
|
|
991
|
+
|
|
992
|
+
``stripped`` has already had comments and string/char literals replaced
|
|
993
|
+
with spaces (preserving newlines), so we only need to drop blank lines.
|
|
994
|
+
Returns the trimmed text of each remaining line.
|
|
995
|
+
"""
|
|
996
|
+
out: List[str] = []
|
|
997
|
+
for raw in stripped.splitlines():
|
|
998
|
+
s = raw.strip()
|
|
999
|
+
if not s:
|
|
1000
|
+
continue
|
|
1001
|
+
out.append(s)
|
|
1002
|
+
return out
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
def analyze(root: str) -> dict:
    """Aggregate code-health metrics over every source file under ``root``.

    Python files go through the ``ast``-based path; C / C++ / Rust files go
    through the stripped-text heuristics. A file that cannot be read is
    skipped. A file that cannot be parsed or analyzed still contributes its
    lines to the duplication corpus but to no other metric.

    Returns a dict with the keys ``cyclomatic_p95``, ``max_nesting_depth``,
    ``cognitive_complexity`` (mean over functions, 0.0 when there are none),
    ``duplicated_lines_density``, ``import_count``,
    ``attr_method_usage_ratio`` and ``bare_except_count``; float values are
    rounded to 4 decimals.
    """
    cyclomatic_values: List[float] = []
    cognitive_values: List[float] = []
    cohesion_ratios: List[float] = []  # heuristic class/impl cohesion ratios
    all_classes: List[ast.ClassDef] = []
    corpus_lines: List[List[str]] = []
    max_depth = 0
    import_count = 0
    bare_except_count = 0

    for path, kind in _iter_source_files(root):
        text = _read_source(path)
        if text is None:
            continue

        if kind != "py":
            # ---- C / C++ / Rust heuristic path. ----
            try:
                stripped = _strip_c_family(text)
            except Exception:  # noqa: BLE001 -- never let one file kill the run
                # Stripping failed: the raw text still feeds dup density.
                corpus_lines.append(_non_trivial_lines_heuristic(text))
                continue

            # Dup density uses the comment/string-stripped, blank-dropped lines.
            corpus_lines.append(_non_trivial_lines_heuristic(stripped))

            try:
                metrics = _analyze_heuristic(stripped, kind)
            except Exception:  # noqa: BLE001 -- robustness: skip metrics for this file
                continue

            cyclomatic_values.extend(metrics["cyclomatic"])
            cognitive_values.extend(metrics["cognitive"])
            cohesion_ratios.extend(metrics["cohesion"])
            max_depth = max(max_depth, metrics["max_depth"])
            import_count += metrics["import_count"]
            bare_except_count += metrics["bare_except"]
            continue

        # ---- Python path. ----
        # Lines are recorded before parsing so that duplication still sees
        # files that turn out to be un-parseable.
        corpus_lines.append(_non_trivial_lines(text))

        try:
            tree = ast.parse(text, filename=path)
        except (SyntaxError, ValueError):
            continue

        max_depth = max(max_depth, _max_nesting_depth(tree))

        for node in ast.walk(tree):
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                # One count per imported name, not per statement.
                import_count += len(node.names)
            elif isinstance(node, ast.ExceptHandler):
                if _is_broad_handler(node):
                    bare_except_count += 1
            elif _is_function(node):
                cyclomatic_values.append(_cyclomatic_complexity(node))
                cognitive_values.append(_cognitive_complexity(node))
            elif isinstance(node, ast.ClassDef):
                all_classes.append(node)

    if cognitive_values:
        cognitive_mean = sum(cognitive_values) / len(cognitive_values)
    else:
        cognitive_mean = 0.0

    return {
        "cyclomatic_p95": round(_percentile(cyclomatic_values, 95.0), 4),
        "max_nesting_depth": int(max_depth),
        "cognitive_complexity": round(cognitive_mean, 4),
        "duplicated_lines_density": round(
            _duplicated_lines_density(corpus_lines), 4
        ),
        "import_count": int(import_count),
        "attr_method_usage_ratio": round(
            _attr_method_usage_ratio_combined(all_classes, cohesion_ratios), 4
        ),
        "bare_except_count": int(bare_except_count),
    }
|
|
1087
|
+
|
|
1088
|
+
|
|
1089
|
+
def _attr_method_usage_ratio_combined(
|
|
1090
|
+
classes: List[ast.ClassDef], heuristic_ratios: List[float]
|
|
1091
|
+
) -> float:
|
|
1092
|
+
"""Mean over the combined set of per-(class/impl, method) cohesion ratios
|
|
1093
|
+
from all languages: Python AST classes plus the precomputed heuristic
|
|
1094
|
+
(Rust impl / C++ class/struct) ratios. Returns 1.0 (neutral) when there is
|
|
1095
|
+
no signal at all, matching the Python-only behavior.
|
|
1096
|
+
|
|
1097
|
+
When there are NO heuristic ratios, this reduces exactly to the original
|
|
1098
|
+
``_attr_method_usage_ratio`` (so Python-only results are unchanged).
|
|
1099
|
+
"""
|
|
1100
|
+
py_ratios: List[float] = []
|
|
1101
|
+
for cls in classes:
|
|
1102
|
+
methods = [m for m in cls.body if _is_function(m)]
|
|
1103
|
+
if not methods:
|
|
1104
|
+
continue
|
|
1105
|
+
class_attrs: set = set()
|
|
1106
|
+
per_method_refs = []
|
|
1107
|
+
for m in methods:
|
|
1108
|
+
assigned, referenced = _self_attr_names(m)
|
|
1109
|
+
class_attrs |= assigned | referenced
|
|
1110
|
+
per_method_refs.append(referenced | assigned)
|
|
1111
|
+
if not class_attrs:
|
|
1112
|
+
continue
|
|
1113
|
+
denom = len(class_attrs)
|
|
1114
|
+
for refs in per_method_refs:
|
|
1115
|
+
used = len(refs & class_attrs)
|
|
1116
|
+
py_ratios.append(used / denom)
|
|
1117
|
+
|
|
1118
|
+
combined = py_ratios + list(heuristic_ratios)
|
|
1119
|
+
if not combined:
|
|
1120
|
+
return 1.0
|
|
1121
|
+
return sum(combined) / len(combined)
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
def _empty_result() -> dict:
|
|
1125
|
+
return {
|
|
1126
|
+
"cyclomatic_p95": 0.0,
|
|
1127
|
+
"max_nesting_depth": 0,
|
|
1128
|
+
"cognitive_complexity": 0.0,
|
|
1129
|
+
"duplicated_lines_density": 0.0,
|
|
1130
|
+
"import_count": 0,
|
|
1131
|
+
"attr_method_usage_ratio": 1.0,
|
|
1132
|
+
"bare_except_count": 0,
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
def main(argv: List[str]) -> int:
    """Command-line entry point: print one JSON metrics object and return 0.

    ``argv[1]`` is the file or directory to analyze. The empty result is
    printed instead when no path is given, when the path does not exist, or
    when analysis raises; the process never reports failure through its
    return value.
    """
    if len(argv) >= 2:
        target = argv[1]
        try:
            if os.path.exists(target):
                payload = analyze(target)
            else:
                payload = _empty_result()
        except Exception:  # noqa: BLE001 -- last-resort guard; always emit JSON
            payload = _empty_result()
    else:
        # No directory given: emit a valid empty result rather than crash.
        payload = _empty_result()

    print(json.dumps(payload))
    return 0
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
|