shannon_codebase_insight-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shannon_codebase_insight-0.4.0.dist-info/METADATA +209 -0
- shannon_codebase_insight-0.4.0.dist-info/RECORD +37 -0
- shannon_codebase_insight-0.4.0.dist-info/WHEEL +5 -0
- shannon_codebase_insight-0.4.0.dist-info/entry_points.txt +7 -0
- shannon_codebase_insight-0.4.0.dist-info/licenses/LICENSE +21 -0
- shannon_codebase_insight-0.4.0.dist-info/top_level.txt +1 -0
- shannon_insight/__init__.py +25 -0
- shannon_insight/analyzers/__init__.py +8 -0
- shannon_insight/analyzers/base.py +215 -0
- shannon_insight/analyzers/go_analyzer.py +150 -0
- shannon_insight/analyzers/python_analyzer.py +169 -0
- shannon_insight/analyzers/typescript_analyzer.py +162 -0
- shannon_insight/cache.py +214 -0
- shannon_insight/cli.py +333 -0
- shannon_insight/config.py +235 -0
- shannon_insight/core.py +546 -0
- shannon_insight/exceptions/__init__.py +31 -0
- shannon_insight/exceptions/analysis.py +78 -0
- shannon_insight/exceptions/base.py +18 -0
- shannon_insight/exceptions/config.py +48 -0
- shannon_insight/file_ops.py +218 -0
- shannon_insight/logging_config.py +98 -0
- shannon_insight/math/__init__.py +15 -0
- shannon_insight/math/entropy.py +133 -0
- shannon_insight/math/fusion.py +109 -0
- shannon_insight/math/graph.py +209 -0
- shannon_insight/math/robust.py +106 -0
- shannon_insight/math/statistics.py +159 -0
- shannon_insight/models.py +48 -0
- shannon_insight/primitives/__init__.py +13 -0
- shannon_insight/primitives/detector.py +318 -0
- shannon_insight/primitives/extractor.py +278 -0
- shannon_insight/primitives/fusion.py +373 -0
- shannon_insight/primitives/recommendations.py +158 -0
- shannon_insight/py.typed +2 -0
- shannon_insight/security.py +284 -0
- shannon_insight/utils/__init__.py +1 -0

shannon_insight/primitives/detector.py
@@ -0,0 +1,318 @@
"""
Anomaly detection using rigorous statistical methods.

Implements:
- Mahalanobis distance for multivariate outliers
- Robust Z-scores (MAD-based)
- Grubbs' test for single outliers
- Statistical significance testing
"""

from typing import Dict, List, Tuple
import numpy as np

from ..models import Primitives
from ..logging_config import get_logger
from ..exceptions import InvalidConfigError, InsufficientDataError
from ..math import Statistics, RobustStatistics

logger = get_logger(__name__)


class AnomalyDetector:
    """
    Detect anomalies using statistically rigorous methods.
    """

    MIN_FILES_FOR_MAHALANOBIS = 10
    MIN_FILES_FOR_Z_SCORE = 5

    def __init__(
        self,
        primitives: Dict[str, Primitives],
        threshold: float = 1.5,
        use_multivariate: bool = True,
    ):
        """
        Initialize anomaly detector.

        Args:
            primitives: Dictionary mapping file paths to primitives
            threshold: Detection threshold (default 1.5 for z-scores)
            use_multivariate: Use Mahalanobis distance for multivariate detection

        Raises:
            InvalidConfigError: If threshold is invalid
            InsufficientDataError: If too few files for analysis
        """
        if threshold <= 0 or threshold >= 10:
            raise InvalidConfigError(
                "threshold", threshold, "Threshold must be between 0.0 and 10.0"
            )

        if len(primitives) < 3:
            raise InsufficientDataError(
                "Too few files for reliable analysis", minimum_required=3
            )

        self.primitives = primitives
        self.threshold = threshold
        self.use_multivariate = use_multivariate and len(primitives) >= self.MIN_FILES_FOR_MAHALANOBIS

        logger.debug(
            f"Initialized AnomalyDetector with threshold={threshold}, "
            f"multivariate={self.use_multivariate}, files={len(primitives)}"
        )

    def normalize(self) -> Dict[str, Primitives]:
        """
        Normalize all primitives using robust statistical methods.

        For small samples (fewer than MIN_FILES_FOR_Z_SCORE files): Uses modified Z-scores (MAD-based)
        For larger samples: Uses standard Z-scores

        Returns:
            Dictionary mapping file paths to normalized primitives
        """
        normalized = {}

        if self.use_multivariate:
            return self._normalize_multivariate(normalized)

        # Extract each primitive into separate lists
        entropy_vals = [p.structural_entropy for p in self.primitives.values()]
        centrality_vals = [p.network_centrality for p in self.primitives.values()]
        volatility_vals = [p.churn_volatility for p in self.primitives.values()]
        coherence_vals = [p.semantic_coherence for p in self.primitives.values()]
        load_vals = [p.cognitive_load for p in self.primitives.values()]

        # Use robust z-scores for small samples
        if len(self.primitives) < self.MIN_FILES_FOR_Z_SCORE:
            entropy_z = RobustStatistics.modified_z_score(entropy_vals)
            centrality_z = RobustStatistics.modified_z_score(centrality_vals)
            volatility_z = RobustStatistics.modified_z_score(volatility_vals)
            coherence_z = RobustStatistics.modified_z_score(coherence_vals)
            load_z = RobustStatistics.modified_z_score(load_vals)
        else:
            # Use standard z-scores for larger samples
            entropy_z = Statistics.z_scores(entropy_vals)
            centrality_z = Statistics.z_scores(centrality_vals)
            volatility_z = Statistics.z_scores(volatility_vals)
            coherence_z = Statistics.z_scores(coherence_vals)
            load_z = Statistics.z_scores(load_vals)

        # Build normalized primitives
        paths = list(self.primitives.keys())
        for i, path in enumerate(paths):
            normalized[path] = Primitives(
                structural_entropy=entropy_z[i],
                network_centrality=centrality_z[i],
                churn_volatility=volatility_z[i],
                semantic_coherence=coherence_z[i],
                cognitive_load=load_z[i],
            )

        return normalized

    def _normalize_multivariate(
        self, normalized: Dict[str, Primitives]
    ) -> Dict[str, Primitives]:
        """
        Normalize using Mahalanobis distance for multivariate analysis.

        Considers correlations between primitives, not just individual values.
        """
        # Build feature matrix
        paths = list(self.primitives.keys())
        n = len(paths)

        features = np.array(
            [
                [
                    self.primitives[path].structural_entropy,
                    self.primitives[path].network_centrality,
                    self.primitives[path].churn_volatility,
                    self.primitives[path].semantic_coherence,
                    self.primitives[path].cognitive_load,
                ]
                for path in paths
            ]
        )

        # Compute mean and covariance
        mean_vec = np.mean(features, axis=0)
        cov_matrix = np.cov(features, rowvar=False)

        # Compute Mahalanobis distance for each file
        mahalanobis_distances = []
        for i in range(n):
            dist = Statistics.mahalanobis_distance(features[i], mean_vec, cov_matrix)
            mahalanobis_distances.append(dist)

        # Convert distances to z-like scores
        # MD² follows chi-squared distribution with k degrees of freedom
        k = 5  # number of dimensions
        from scipy import stats

        # Convert MD² to p-values
        p_values = [
            1 - stats.chi2.cdf(dist, k) for dist in mahalanobis_distances
        ]

        # Convert to z-scores via inverse CDF.
        # p ≈ 0 means maximally significant (large z), p ≈ 1 means no anomaly.
        z_scores = []
        for p in p_values:
            if p <= 0:
                z_scores.append(10.0)  # practical cap for extreme significance
            elif p >= 1:
                z_scores.append(0.0)
            else:
                z_scores.append(stats.norm.ppf(1 - p))

        # Normalize each primitive separately for reporting
        entropy_vals = features[:, 0]
        centrality_vals = features[:, 1]
        volatility_vals = features[:, 2]
        coherence_vals = features[:, 3]
        load_vals = features[:, 4]

        entropy_z = Statistics.z_scores(entropy_vals.tolist())
        centrality_z = Statistics.z_scores(centrality_vals.tolist())
        volatility_z = Statistics.z_scores(volatility_vals.tolist())
        coherence_z = Statistics.z_scores(coherence_vals.tolist())
        load_z = Statistics.z_scores(load_vals.tolist())

        # Build normalized primitives
        for i, path in enumerate(paths):
            normalized[path] = Primitives(
                structural_entropy=entropy_z[i],
                network_centrality=centrality_z[i],
                churn_volatility=volatility_z[i],
                semantic_coherence=coherence_z[i],
                cognitive_load=load_z[i],
            )

        # Store Mahalanobis scores for later use
        self.mahalanobis_scores = {paths[i]: z_scores[i] for i in range(n)}

        return normalized

    def detect_anomalies(
        self, normalized: Dict[str, Primitives]
    ) -> Dict[str, List[str]]:
        """
        Detect which primitives are anomalous.

        For multivariate mode: Uses Mahalanobis distance
        For univariate mode: Uses individual z-scores

        Returns:
            Dictionary mapping file paths to list of anomaly flags
        """
        anomalies = {}

        if self.use_multivariate and hasattr(self, "mahalanobis_scores"):
            return self._detect_multivariate_anomalies(normalized)

        # Univariate detection
        for path, prims in normalized.items():
            flags = []

            if abs(prims.structural_entropy) > self.threshold:
                direction = "high" if prims.structural_entropy > 0 else "low"
                flags.append(f"structural_entropy_{direction}")

            if prims.network_centrality > self.threshold:
                flags.append("high_centrality")

            if abs(prims.churn_volatility) > self.threshold:
                flags.append("high_volatility")

            if abs(prims.semantic_coherence) > self.threshold:
                direction = "low" if prims.semantic_coherence < 0 else "high"
                flags.append(f"semantic_coherence_{direction}")

            if prims.cognitive_load > self.threshold:
                flags.append("high_cognitive_load")

            if flags:
                anomalies[path] = flags

        return anomalies

    def _detect_multivariate_anomalies(
        self, normalized: Dict[str, Primitives]
    ) -> Dict[str, List[str]]:
        """
        Detect anomalies using Mahalanobis distance.

        Also identifies which specific primitives contribute to the anomaly.
        """
        anomalies = {}

        # Use chi-squared critical value for significance
        k = 5  # number of dimensions
        from scipy import stats

        critical_value = stats.chi2.ppf(0.95, k)  # 95% confidence

        for path, mahalanobis_z in self.mahalanobis_scores.items():
            if abs(mahalanobis_z) > self.threshold:
                prims = normalized[path]
                flags = []

                # Identify which primitives are most anomalous.
                # TODO: The 0.5 multiplier on threshold is a heuristic that
                # relaxes per-primitive thresholds in multivariate mode.
                # A rigorous alternative: decompose Mahalanobis D² into
                # per-variable contributions via (x_i - mu_i)^2 * [Sigma^-1]_ii
                # and compare each to its marginal chi-squared critical value.
                if abs(prims.structural_entropy) > self.threshold * 0.5:
                    direction = "high" if prims.structural_entropy > 0 else "low"
                    flags.append(f"structural_entropy_{direction}")

                if prims.network_centrality > self.threshold * 0.5:
                    flags.append("high_centrality")

                if abs(prims.churn_volatility) > self.threshold * 0.5:
                    flags.append("high_volatility")

                if abs(prims.semantic_coherence) > self.threshold * 0.5:
                    direction = "low" if prims.semantic_coherence < 0 else "high"
                    flags.append(f"semantic_coherence_{direction}")

                if prims.cognitive_load > self.threshold * 0.5:
                    flags.append("high_cognitive_load")

                if flags:
                    anomalies[path] = flags

        return anomalies

    def detect_outliers(
        self, values: List[float], method: str = "grubbs"
    ) -> List[int]:
        """
        Detect outliers in a univariate dataset.

        Args:
            values: List of values
            method: Detection method ('grubbs', 'iqr', 'mad')

        Returns:
            List of outlier indices
        """
        if method == "grubbs":
            result = Statistics.grubbs_test(values)
            if result:
                return [result[0]]
            return []
        elif method == "iqr":
            outliers = RobustStatistics.iqr_outliers(values)
            return [i for i, is_outlier in enumerate(outliers) if is_outlier]
        elif method == "mad":
            modified_z = RobustStatistics.modified_z_score(values)
            return [i for i, z in enumerate(modified_z) if abs(z) > 3.5]
        else:
            raise ValueError(f"Unknown method: {method}")
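
The TODO inside _detect_multivariate_anomalies proposes replacing the threshold * 0.5 heuristic with per-variable contributions to the squared Mahalanobis distance. A minimal sketch of that decomposition, assuming the same features, mean_vec, and cov_matrix shapes built in _normalize_multivariate (the helper name mahalanobis_contributions is illustrative and not part of the package):

import numpy as np
from scipy import stats

def mahalanobis_contributions(x, mean_vec, cov_matrix, alpha=0.05):
    """Per-variable contributions to the squared Mahalanobis distance.

    Flags dimension i when (x_i - mu_i)^2 * [Sigma^-1]_ii exceeds the
    marginal chi-squared critical value with 1 degree of freedom.
    """
    inv_cov = np.linalg.pinv(cov_matrix)          # pseudo-inverse tolerates a singular covariance
    diff = np.asarray(x, dtype=float) - np.asarray(mean_vec, dtype=float)
    contributions = diff ** 2 * np.diag(inv_cov)  # one term per primitive
    critical = stats.chi2.ppf(1 - alpha, df=1)    # per-variable cutoff
    return contributions, contributions > critical

Each flagged dimension would then map directly to its anomaly label (e.g., high_centrality) instead of passing through the relaxed univariate check.
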
shannon_insight/primitives/extractor.py
@@ -0,0 +1,278 @@
"""Extract the 5 orthogonal quality primitives"""

from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Set, Optional
from datetime import datetime
import numpy as np

from ..models import FileMetrics, Primitives
from ..cache import AnalysisCache
from ..logging_config import get_logger
from ..math import Entropy, GraphMetrics, RobustStatistics as RobustStats

logger = get_logger(__name__)


class PrimitiveExtractor:
    """Extract the 5 orthogonal quality primitives"""

    def __init__(
        self,
        files: List[FileMetrics],
        cache: Optional[AnalysisCache] = None,
        config_hash: str = ""
    ):
        self.files = files
        self.file_map = {f.path: f for f in files}
        self.cache = cache
        self.config_hash = config_hash
        logger.debug(f"Initialized PrimitiveExtractor for {len(files)} files")

    def extract_all(self) -> Dict[str, Primitives]:
        """Extract all 5 primitives for each file"""
        results = {}

        # Build dependency graph (needed for centrality)
        dep_graph = self._build_dependency_graph()

        # Compute each primitive
        entropies = self._compute_structural_entropy()
        centralities = self._compute_network_centrality(dep_graph)
        volatilities = self._compute_churn_volatility()
        coherences = self._compute_semantic_coherence()
        loads = self._compute_cognitive_load()

        for file in self.files:
            results[file.path] = Primitives(
                structural_entropy=entropies.get(file.path, 0),
                network_centrality=centralities.get(file.path, 0),
                churn_volatility=volatilities.get(file.path, 0),
                semantic_coherence=coherences.get(file.path, 0),
                cognitive_load=loads.get(file.path, 0),
            )

        return results

    # ---- Primitive 1: Structural Entropy ----

    def _compute_structural_entropy(self) -> Dict[str, float]:
        """Compute normalized entropy of AST node type distribution."""
        entropies = {}

        for file in self.files:
            if not file.ast_node_types or sum(file.ast_node_types.values()) == 0:
                entropies[file.path] = 0.0
                continue

            entropies[file.path] = Entropy.normalized(file.ast_node_types)

        return entropies

    # ---- Primitive 2: Network Centrality ----

    # Standard library modules to ignore when building dependency graph
    _STDLIB_NAMES = frozenset({
        "abc", "ast", "asyncio", "base64", "bisect", "builtins", "calendar",
        "cmath", "codecs", "collections", "concurrent", "contextlib", "copy",
        "csv", "ctypes", "dataclasses", "datetime", "decimal", "difflib",
        "email", "enum", "errno", "fcntl", "fileinput", "fnmatch", "fractions",
        "ftplib", "functools", "gc", "getpass", "glob", "gzip", "hashlib",
        "heapq", "hmac", "html", "http", "importlib", "inspect", "io",
        "itertools", "json", "logging", "lzma", "math", "mimetypes",
        "multiprocessing", "operator", "os", "pathlib", "pickle", "platform",
        "pprint", "queue", "random", "re", "secrets", "select", "shelve",
        "shlex", "shutil", "signal", "socket", "sqlite3", "ssl",
        "statistics", "string", "struct", "subprocess", "sys", "tempfile",
        "textwrap", "threading", "time", "timeit", "tkinter", "token",
        "tomllib", "traceback", "types", "typing", "unicodedata", "unittest",
        "urllib", "uuid", "venv", "warnings", "weakref", "xml", "zipfile",
        "zlib",
    })

    # Common third-party packages to ignore
    _THIRDPARTY_NAMES = frozenset({
        "numpy", "np", "pandas", "pd", "scipy", "sklearn", "matplotlib",
        "plt", "seaborn", "requests", "flask", "django", "fastapi",
        "pydantic", "typer", "click", "rich", "diskcache", "pytest",
        "setuptools", "wheel", "pip", "pkg_resources",
    })

    def _build_dependency_graph(self) -> Dict[str, Set[str]]:
        """Build file dependency graph from internal project imports only."""
        graph = defaultdict(set)

        # Map import paths to actual files
        file_by_name = {}
        for file in self.files:
            name = Path(file.path).stem
            file_by_name[name] = file.path

        skip_names = self._STDLIB_NAMES | self._THIRDPARTY_NAMES

        for file in self.files:
            for imp in file.imports:
                # Extract the leaf module name
                pkg_name = imp.split("/")[-1].split(".")[-1]

                # Skip stdlib and third-party imports
                if pkg_name in skip_names:
                    continue

                # Skip relative import markers
                if pkg_name.startswith(".") or pkg_name == "":
                    continue

                # Only add edge if it points to a different file
                if pkg_name in file_by_name and file_by_name[pkg_name] != file.path:
                    graph[file.path].add(file_by_name[pkg_name])

        return dict(graph)

    def _compute_network_centrality(
        self, graph: Dict[str, Set[str]]
    ) -> Dict[str, float]:
        """Compute PageRank centrality.

        TODO: Delegate to GraphMetrics.pagerank() for consistency and
        correctness. The inline implementation differs from the canonical
        version: it uses (1-d) instead of (1-d)/N for base probability,
        has no dangling-node redistribution, no convergence check, and
        misses nodes that appear only as targets. The min-max normalization
        at the end masks the base-probability difference but the other
        issues remain.
        """
        # Initialize PageRank scores
        scores = {f.path: 1.0 for f in self.files}
        damping = 0.85
        iterations = 20

        # Build reverse graph (incoming edges)
        incoming = defaultdict(set)
        for src, targets in graph.items():
            for tgt in targets:
                incoming[tgt].add(src)

        # PageRank iteration
        for _ in range(iterations):
            new_scores = {}

            for file in self.files:
                # Base probability
                rank = 1 - damping

                # Add contribution from incoming edges
                for src in incoming.get(file.path, []):
                    out_degree = len(graph.get(src, []))
                    if out_degree > 0:
                        rank += damping * (scores[src] / out_degree)

                new_scores[file.path] = rank

            scores = new_scores

        # Normalize to [0, 1]
        if scores:
            max_score = max(scores.values())
            if max_score > 0:
                scores = {k: v / max_score for k, v in scores.items()}

        return scores

    # ---- Primitive 3: Churn Volatility ----

    def _compute_churn_volatility(self) -> Dict[str, float]:
        """Compute volatility of file modifications (filesystem-based)"""
        volatilities = {}

        # Since no git history, use file modification time as proxy
        now = datetime.now().timestamp()
        ages = [now - f.last_modified for f in self.files]

        if not ages:
            return {}

        # Normalize age to volatility score
        # Recent changes = high volatility
        max_age = max(ages)

        for file in self.files:
            age = now - file.last_modified
            # Invert: older = more stable = lower volatility
            volatility = 1 - (age / max_age) if max_age > 0 else 0
            volatilities[file.path] = volatility

        return volatilities

    # ---- Primitive 4: Semantic Coherence ----

    def _compute_semantic_coherence(self) -> Dict[str, float]:
        """Compute semantic coherence via TF-IDF clustering"""
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # Build document corpus (use imports + exports)
        documents = []
        paths = []

        for file in self.files:
            tokens = file.imports + file.exports
            doc = " ".join(tokens) if tokens else "empty"
            documents.append(doc)
            paths.append(file.path)

        if len(documents) < 2:
            return {f.path: 1.0 for f in self.files}

        # Compute TF-IDF vectors
        vectorizer = TfidfVectorizer(min_df=1, max_df=0.8)
        try:
            tfidf_matrix = vectorizer.fit_transform(documents)
        except Exception as e:
            logger.warning(f"TF-IDF vectorization failed: {e}")
            return {f.path: 1.0 for f in self.files}

        # Compute pairwise similarities
        similarities = cosine_similarity(tfidf_matrix)

        coherences = {}
        n = len(paths)
        for i, path in enumerate(paths):
            # Coherence = average similarity to OTHER files (exclude self)
            if n > 1:
                other_sims = [similarities[i][j] for j in range(n) if j != i]
                coherences[path] = float(np.mean(other_sims))
            else:
                coherences[path] = 1.0

        return coherences

    # ---- Primitive 5: Cognitive Load ----

    def _compute_cognitive_load(self) -> Dict[str, float]:
        """Compute cognitive load = concepts × complexity.

        TODO: The formula CL = concepts * complexity * (1 + depth/10) is a
        hand-tuned heuristic with no academic citation. The /10 divisor
        maps typical nesting depths (0-5) to a 0-50% multiplier. Consider
        citing Cant et al. (1995) "A Conceptual Complexity Metric" or
        replacing with a validated cognitive complexity model such as
        SonarSource's Cognitive Complexity (2017).
        """
        loads = {}

        for file in self.files:
            # Concepts = functions + structs + interfaces
            concepts = file.functions + file.structs + file.interfaces

            # Cognitive load = concepts × complexity × nesting
            load = concepts * file.complexity_score * (1 + file.nesting_depth / 10)
            loads[file.path] = load

        # Normalize to [0, 1]
        if loads:
            max_load = max(loads.values())
            if max_load > 0:
                loads = {k: v / max_load for k, v in loads.items()}

        return loads