shannon-codebase-insight 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. shannon_codebase_insight-0.4.0.dist-info/METADATA +209 -0
  2. shannon_codebase_insight-0.4.0.dist-info/RECORD +37 -0
  3. shannon_codebase_insight-0.4.0.dist-info/WHEEL +5 -0
  4. shannon_codebase_insight-0.4.0.dist-info/entry_points.txt +7 -0
  5. shannon_codebase_insight-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. shannon_codebase_insight-0.4.0.dist-info/top_level.txt +1 -0
  7. shannon_insight/__init__.py +25 -0
  8. shannon_insight/analyzers/__init__.py +8 -0
  9. shannon_insight/analyzers/base.py +215 -0
  10. shannon_insight/analyzers/go_analyzer.py +150 -0
  11. shannon_insight/analyzers/python_analyzer.py +169 -0
  12. shannon_insight/analyzers/typescript_analyzer.py +162 -0
  13. shannon_insight/cache.py +214 -0
  14. shannon_insight/cli.py +333 -0
  15. shannon_insight/config.py +235 -0
  16. shannon_insight/core.py +546 -0
  17. shannon_insight/exceptions/__init__.py +31 -0
  18. shannon_insight/exceptions/analysis.py +78 -0
  19. shannon_insight/exceptions/base.py +18 -0
  20. shannon_insight/exceptions/config.py +48 -0
  21. shannon_insight/file_ops.py +218 -0
  22. shannon_insight/logging_config.py +98 -0
  23. shannon_insight/math/__init__.py +15 -0
  24. shannon_insight/math/entropy.py +133 -0
  25. shannon_insight/math/fusion.py +109 -0
  26. shannon_insight/math/graph.py +209 -0
  27. shannon_insight/math/robust.py +106 -0
  28. shannon_insight/math/statistics.py +159 -0
  29. shannon_insight/models.py +48 -0
  30. shannon_insight/primitives/__init__.py +13 -0
  31. shannon_insight/primitives/detector.py +318 -0
  32. shannon_insight/primitives/extractor.py +278 -0
  33. shannon_insight/primitives/fusion.py +373 -0
  34. shannon_insight/primitives/recommendations.py +158 -0
  35. shannon_insight/py.typed +2 -0
  36. shannon_insight/security.py +284 -0
  37. shannon_insight/utils/__init__.py +1 -0
shannon_insight/primitives/detector.py
@@ -0,0 +1,318 @@
+ """
+ Anomaly detection using rigorous statistical methods.
+
+ Implements:
+ - Mahalanobis distance for multivariate outliers
+ - Robust Z-scores (MAD-based)
+ - Grubbs' test for single outliers
+ - Statistical significance testing
+ """
+
+ from typing import Dict, List, Tuple
+ import numpy as np
+
+ from ..models import Primitives
+ from ..logging_config import get_logger
+ from ..exceptions import InvalidConfigError, InsufficientDataError
+ from ..math import Statistics, RobustStatistics
+
+ logger = get_logger(__name__)
+
+
+ class AnomalyDetector:
+     """
+     Detect anomalies using statistically rigorous methods.
+     """
+
+     MIN_FILES_FOR_MAHALANOBIS = 10
+     MIN_FILES_FOR_Z_SCORE = 5
+
+     def __init__(
+         self,
+         primitives: Dict[str, Primitives],
+         threshold: float = 1.5,
+         use_multivariate: bool = True,
+     ):
+         """
+         Initialize anomaly detector.
+
+         Args:
+             primitives: Dictionary mapping file paths to primitives
+             threshold: Detection threshold (default 1.5 for z-scores)
+             use_multivariate: Use Mahalanobis distance for multivariate detection
+
+         Raises:
+             InvalidConfigError: If threshold is invalid
+             InsufficientDataError: If too few files for analysis
+         """
+         if threshold <= 0 or threshold >= 10:
+             raise InvalidConfigError(
+                 "threshold", threshold, "Threshold must be greater than 0 and less than 10"
+             )
+
+         if len(primitives) < 3:
+             raise InsufficientDataError(
+                 "Too few files for reliable analysis", minimum_required=3
+             )
+
+         self.primitives = primitives
+         self.threshold = threshold
+         self.use_multivariate = use_multivariate and len(primitives) >= self.MIN_FILES_FOR_MAHALANOBIS
+
+         logger.debug(
+             f"Initialized AnomalyDetector with threshold={threshold}, "
+             f"multivariate={self.use_multivariate}, files={len(primitives)}"
+         )
+
+     def normalize(self) -> Dict[str, Primitives]:
+         """
+         Normalize all primitives using robust statistical methods.
+
+         For small samples (< MIN_FILES_FOR_Z_SCORE): uses modified Z-scores (MAD-based)
+         For larger samples: uses standard Z-scores
+
+         Returns:
+             Dictionary mapping file paths to normalized primitives
+         """
+         normalized = {}
+
+         if self.use_multivariate:
+             return self._normalize_multivariate(normalized)
+
+         # Extract each primitive into separate lists
+         entropy_vals = [p.structural_entropy for p in self.primitives.values()]
+         centrality_vals = [p.network_centrality for p in self.primitives.values()]
+         volatility_vals = [p.churn_volatility for p in self.primitives.values()]
+         coherence_vals = [p.semantic_coherence for p in self.primitives.values()]
+         load_vals = [p.cognitive_load for p in self.primitives.values()]
+
+         # Use robust z-scores for small samples
+         if len(self.primitives) < self.MIN_FILES_FOR_Z_SCORE:
+             entropy_z = RobustStatistics.modified_z_score(entropy_vals)
+             centrality_z = RobustStatistics.modified_z_score(centrality_vals)
+             volatility_z = RobustStatistics.modified_z_score(volatility_vals)
+             coherence_z = RobustStatistics.modified_z_score(coherence_vals)
+             load_z = RobustStatistics.modified_z_score(load_vals)
+         else:
+             # Use standard z-scores for larger samples
+             entropy_z = Statistics.z_scores(entropy_vals)
+             centrality_z = Statistics.z_scores(centrality_vals)
+             volatility_z = Statistics.z_scores(volatility_vals)
+             coherence_z = Statistics.z_scores(coherence_vals)
+             load_z = Statistics.z_scores(load_vals)
+
+         # Build normalized primitives
+         paths = list(self.primitives.keys())
+         for i, path in enumerate(paths):
+             normalized[path] = Primitives(
+                 structural_entropy=entropy_z[i],
+                 network_centrality=centrality_z[i],
+                 churn_volatility=volatility_z[i],
+                 semantic_coherence=coherence_z[i],
+                 cognitive_load=load_z[i],
+             )
+
+         return normalized
+
+     def _normalize_multivariate(
+         self, normalized: Dict[str, Primitives]
+     ) -> Dict[str, Primitives]:
+         """
+         Normalize using Mahalanobis distance for multivariate analysis.
+
+         Considers correlations between primitives, not just individual values.
+         """
+         # Build feature matrix
+         paths = list(self.primitives.keys())
+         n = len(paths)
+
+         features = np.array(
+             [
+                 [
+                     self.primitives[path].structural_entropy,
+                     self.primitives[path].network_centrality,
+                     self.primitives[path].churn_volatility,
+                     self.primitives[path].semantic_coherence,
+                     self.primitives[path].cognitive_load,
+                 ]
+                 for path in paths
+             ]
+         )
+
+         # Compute mean and covariance
+         mean_vec = np.mean(features, axis=0)
+         cov_matrix = np.cov(features, rowvar=False)
+
+         # Compute Mahalanobis distance for each file
+         mahalanobis_distances = []
+         for i in range(n):
+             dist = Statistics.mahalanobis_distance(features[i], mean_vec, cov_matrix)
+             mahalanobis_distances.append(dist)
+
+         # Convert distances to z-like scores.
+         # MD² follows a chi-squared distribution with k degrees of freedom.
+         # NOTE: this assumes mahalanobis_distance() returns the squared
+         # distance D²; if it returns D, square it before the chi² CDF.
+         k = 5  # number of dimensions
+         from scipy import stats
+
+         # Convert MD² to p-values
+         p_values = [
+             1 - stats.chi2.cdf(dist, k) for dist in mahalanobis_distances
+         ]
+
+         # Convert to z-scores via inverse CDF.
+         # p ≈ 0 means maximally significant (large z), p ≈ 1 means no anomaly.
+         z_scores = []
+         for p in p_values:
+             if p <= 0:
+                 z_scores.append(10.0)  # practical cap for extreme significance
+             elif p >= 1:
+                 z_scores.append(0.0)
+             else:
+                 # Clamp at 0: p near 1 means "not anomalous", not anomalous
+                 # in the opposite direction.
+                 z_scores.append(max(0.0, stats.norm.ppf(1 - p)))
+
+         # Normalize each primitive separately for reporting
+         entropy_vals = features[:, 0]
+         centrality_vals = features[:, 1]
+         volatility_vals = features[:, 2]
+         coherence_vals = features[:, 3]
+         load_vals = features[:, 4]
+
+         entropy_z = Statistics.z_scores(entropy_vals.tolist())
+         centrality_z = Statistics.z_scores(centrality_vals.tolist())
+         volatility_z = Statistics.z_scores(volatility_vals.tolist())
+         coherence_z = Statistics.z_scores(coherence_vals.tolist())
+         load_z = Statistics.z_scores(load_vals.tolist())
+
+         # Build normalized primitives
+         for i, path in enumerate(paths):
+             normalized[path] = Primitives(
+                 structural_entropy=entropy_z[i],
+                 network_centrality=centrality_z[i],
+                 churn_volatility=volatility_z[i],
+                 semantic_coherence=coherence_z[i],
+                 cognitive_load=load_z[i],
+             )
+
+         # Store Mahalanobis scores for later use
+         self.mahalanobis_scores = {paths[i]: z_scores[i] for i in range(n)}
+
+         return normalized
+
+     def detect_anomalies(
+         self, normalized: Dict[str, Primitives]
+     ) -> Dict[str, List[str]]:
+         """
+         Detect which primitives are anomalous.
+
+         For multivariate mode: Uses Mahalanobis distance
+         For univariate mode: Uses individual z-scores
+
+         Returns:
+             Dictionary mapping file paths to list of anomaly flags
+         """
+         anomalies = {}
+
+         if self.use_multivariate and hasattr(self, "mahalanobis_scores"):
+             return self._detect_multivariate_anomalies(normalized)
+
+         # Univariate detection
+         for path, prims in normalized.items():
+             flags = []
+
+             if abs(prims.structural_entropy) > self.threshold:
+                 direction = "high" if prims.structural_entropy > 0 else "low"
+                 flags.append(f"structural_entropy_{direction}")
+
+             if prims.network_centrality > self.threshold:
+                 flags.append("high_centrality")
+
+             if abs(prims.churn_volatility) > self.threshold:
+                 flags.append("high_volatility")
+
+             if abs(prims.semantic_coherence) > self.threshold:
+                 direction = "low" if prims.semantic_coherence < 0 else "high"
+                 flags.append(f"semantic_coherence_{direction}")
+
+             if prims.cognitive_load > self.threshold:
+                 flags.append("high_cognitive_load")
+
+             if flags:
+                 anomalies[path] = flags
+
+         return anomalies
+
+     def _detect_multivariate_anomalies(
+         self, normalized: Dict[str, Primitives]
+     ) -> Dict[str, List[str]]:
+         """
+         Detect anomalies using Mahalanobis distance.
+
+         Also identifies which specific primitives contribute to the anomaly.
+         """
+         anomalies = {}
+
+         # Use chi-squared critical value for significance.
+         # NOTE: critical_value is currently unused; detection below relies on
+         # self.threshold applied to the z-like Mahalanobis scores.
+         k = 5  # number of dimensions
+         from scipy import stats
+
+         critical_value = stats.chi2.ppf(0.95, k)  # 95% confidence
+
+         for path, mahalanobis_z in self.mahalanobis_scores.items():
+             if abs(mahalanobis_z) > self.threshold:
+                 prims = normalized[path]
+                 flags = []
+
+                 # Identify which primitives are most anomalous.
+                 # TODO: The 0.5 multiplier on threshold is a heuristic that
+                 # relaxes per-primitive thresholds in multivariate mode.
+                 # A rigorous alternative: decompose Mahalanobis D² into
+                 # per-variable contributions via (x_i - mu_i)^2 * [Sigma^-1]_ii
+                 # and compare each to its marginal chi-squared critical value.
+                 if abs(prims.structural_entropy) > self.threshold * 0.5:
+                     direction = "high" if prims.structural_entropy > 0 else "low"
+                     flags.append(f"structural_entropy_{direction}")
+
+                 if prims.network_centrality > self.threshold * 0.5:
+                     flags.append("high_centrality")
+
+                 if abs(prims.churn_volatility) > self.threshold * 0.5:
+                     flags.append("high_volatility")
+
+                 if abs(prims.semantic_coherence) > self.threshold * 0.5:
+                     direction = "low" if prims.semantic_coherence < 0 else "high"
+                     flags.append(f"semantic_coherence_{direction}")
+
+                 if prims.cognitive_load > self.threshold * 0.5:
+                     flags.append("high_cognitive_load")
+
+                 if flags:
+                     anomalies[path] = flags
+
+         return anomalies
+
+     def detect_outliers(
+         self, values: List[float], method: str = "grubbs"
+     ) -> List[int]:
+         """
+         Detect outliers in a univariate dataset.
+
+         Args:
+             values: List of values
+             method: Detection method ('grubbs', 'iqr', 'mad')
+
+         Returns:
+             List of outlier indices
+         """
+         if method == "grubbs":
+             result = Statistics.grubbs_test(values)
+             if result:
+                 return [result[0]]
+             return []
+         elif method == "iqr":
+             outliers = RobustStatistics.iqr_outliers(values)
+             return [i for i, is_outlier in enumerate(outliers) if is_outlier]
+         elif method == "mad":
+             modified_z = RobustStatistics.modified_z_score(values)
+             return [i for i, z in enumerate(modified_z) if abs(z) > 3.5]
+         else:
+             raise ValueError(f"Unknown method: {method}")
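
The TODO in _detect_multivariate_anomalies above sketches a more rigorous alternative to the 0.5-threshold heuristic. A minimal, hypothetical illustration of that decomposition follows; it is not part of the package, the helper name and the 1-degree-of-freedom cutoff are assumptions, and it uses the same diagonal approximation (x_i - mu_i)^2 * [Sigma^-1]_ii named in the TODO, so cross-correlation terms are ignored.

# Hypothetical helper, not shipped in shannon-codebase-insight: per-variable
# contributions to the squared Mahalanobis distance, per the TODO above.
import numpy as np
from scipy import stats

def mahalanobis_contributions(x, mean_vec, cov_matrix, alpha=0.05):
    """Return each variable's (diagonal-approximated) contribution to D² and
    the indices whose contribution exceeds the marginal chi-squared critical
    value with 1 degree of freedom."""
    inv_cov = np.linalg.pinv(cov_matrix)           # pinv tolerates a near-singular covariance
    diffs = np.asarray(x, float) - np.asarray(mean_vec, float)
    contributions = diffs ** 2 * np.diag(inv_cov)  # ignores cross-terms by construction
    critical = stats.chi2.ppf(1 - alpha, df=1)     # about 3.84 for alpha = 0.05
    flagged = [i for i, c in enumerate(contributions) if c > critical]
    return contributions, flagged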
shannon_insight/primitives/extractor.py
@@ -0,0 +1,278 @@
+ """Extract the 5 orthogonal quality primitives"""
+
+ from pathlib import Path
+ from collections import defaultdict
+ from typing import Dict, List, Set, Optional
+ from datetime import datetime
+ import numpy as np
+
+ from ..models import FileMetrics, Primitives
+ from ..cache import AnalysisCache
+ from ..logging_config import get_logger
+ from ..math import Entropy, GraphMetrics, RobustStatistics as RobustStats
+
+ logger = get_logger(__name__)
+
+
+ class PrimitiveExtractor:
+     """Extract the 5 orthogonal quality primitives"""
+
+     def __init__(
+         self,
+         files: List[FileMetrics],
+         cache: Optional[AnalysisCache] = None,
+         config_hash: str = ""
+     ):
+         self.files = files
+         self.file_map = {f.path: f for f in files}
+         self.cache = cache
+         self.config_hash = config_hash
+         logger.debug(f"Initialized PrimitiveExtractor for {len(files)} files")
+
+
32
+ def extract_all(self) -> Dict[str, Primitives]:
33
+ """Extract all 5 primitives for each file"""
34
+ results = {}
35
+
36
+ # Build dependency graph (needed for centrality)
37
+ dep_graph = self._build_dependency_graph()
38
+
39
+ # Compute each primitive
40
+ entropies = self._compute_structural_entropy()
41
+ centralities = self._compute_network_centrality(dep_graph)
42
+ volatilities = self._compute_churn_volatility()
43
+ coherences = self._compute_semantic_coherence()
44
+ loads = self._compute_cognitive_load()
45
+
46
+ for file in self.files:
47
+ results[file.path] = Primitives(
48
+ structural_entropy=entropies.get(file.path, 0),
49
+ network_centrality=centralities.get(file.path, 0),
50
+ churn_volatility=volatilities.get(file.path, 0),
51
+ semantic_coherence=coherences.get(file.path, 0),
52
+ cognitive_load=loads.get(file.path, 0),
53
+ )
54
+
55
+ return results
56
+
57
+ # ---- Primitive 1: Structural Entropy ----
58
+
59
+ def _compute_structural_entropy(self) -> Dict[str, float]:
60
+ """Compute normalized entropy of AST node type distribution."""
61
+ entropies = {}
62
+
63
+ for file in self.files:
64
+ if not file.ast_node_types or sum(file.ast_node_types.values()) == 0:
65
+ entropies[file.path] = 0.0
66
+ continue
67
+
68
+ entropies[file.path] = Entropy.normalized(file.ast_node_types)
69
+
70
+ return entropies
71
+
72
+ # ---- Primitive 2: Network Centrality ----
73
+
74
+ # Standard library modules to ignore when building dependency graph
75
+ _STDLIB_NAMES = frozenset({
76
+ "abc", "ast", "asyncio", "base64", "bisect", "builtins", "calendar",
77
+ "cmath", "codecs", "collections", "concurrent", "contextlib", "copy",
78
+ "csv", "ctypes", "dataclasses", "datetime", "decimal", "difflib",
79
+ "email", "enum", "errno", "fcntl", "fileinput", "fnmatch", "fractions",
80
+ "ftplib", "functools", "gc", "getpass", "glob", "gzip", "hashlib",
81
+ "heapq", "hmac", "html", "http", "importlib", "inspect", "io",
82
+ "itertools", "json", "logging", "lzma", "math", "mimetypes",
83
+ "multiprocessing", "operator", "os", "pathlib", "pickle", "platform",
84
+ "pprint", "queue", "random", "re", "secrets", "select", "shelve",
85
+ "shlex", "shutil", "signal", "socket", "sqlite3", "ssl",
86
+ "statistics", "string", "struct", "subprocess", "sys", "tempfile",
87
+ "textwrap", "threading", "time", "timeit", "tkinter", "token",
88
+ "tomllib", "traceback", "types", "typing", "unicodedata", "unittest",
89
+ "urllib", "uuid", "venv", "warnings", "weakref", "xml", "zipfile",
90
+ "zlib",
91
+ })
92
+
93
+ # Common third-party packages to ignore
94
+ _THIRDPARTY_NAMES = frozenset({
95
+ "numpy", "np", "pandas", "pd", "scipy", "sklearn", "matplotlib",
96
+ "plt", "seaborn", "requests", "flask", "django", "fastapi",
97
+ "pydantic", "typer", "click", "rich", "diskcache", "pytest",
98
+ "setuptools", "wheel", "pip", "pkg_resources",
99
+ })
100
+
101
+ def _build_dependency_graph(self) -> Dict[str, Set[str]]:
102
+ """Build file dependency graph from internal project imports only."""
103
+ graph = defaultdict(set)
104
+
105
+ # Map import paths to actual files
106
+ file_by_name = {}
107
+ for file in self.files:
108
+ name = Path(file.path).stem
109
+ file_by_name[name] = file.path
110
+
111
+ skip_names = self._STDLIB_NAMES | self._THIRDPARTY_NAMES
112
+
113
+ for file in self.files:
114
+ for imp in file.imports:
115
+ # Extract the leaf module name
116
+ pkg_name = imp.split("/")[-1].split(".")[-1]
117
+
118
+ # Skip stdlib and third-party imports
119
+ if pkg_name in skip_names:
120
+ continue
121
+
122
+ # Skip relative import markers
123
+ if pkg_name.startswith(".") or pkg_name == "":
124
+ continue
125
+
126
+ # Only add edge if it points to a different file
127
+ if pkg_name in file_by_name and file_by_name[pkg_name] != file.path:
128
+ graph[file.path].add(file_by_name[pkg_name])
129
+
130
+ return dict(graph)
131
+
132
+ def _compute_network_centrality(
133
+ self, graph: Dict[str, Set[str]]
134
+ ) -> Dict[str, float]:
135
+ """Compute PageRank centrality.
136
+
137
+ TODO: Delegate to GraphMetrics.pagerank() for consistency and
138
+ correctness. The inline implementation differs from the canonical
139
+ version: it uses (1-d) instead of (1-d)/N for base probability,
140
+ has no dangling-node redistribution, no convergence check, and
141
+ misses nodes that appear only as targets. The min-max normalization
142
+ at the end masks the base-probability difference but the other
143
+ issues remain.
144
+ """
145
+ # Initialize PageRank scores
146
+ scores = {f.path: 1.0 for f in self.files}
147
+ damping = 0.85
148
+ iterations = 20
149
+
150
+ # Build reverse graph (incoming edges)
151
+ incoming = defaultdict(set)
152
+ for src, targets in graph.items():
153
+ for tgt in targets:
154
+ incoming[tgt].add(src)
155
+
156
+ # PageRank iteration
157
+ for _ in range(iterations):
158
+ new_scores = {}
159
+
160
+ for file in self.files:
161
+ # Base probability
162
+ rank = 1 - damping
163
+
164
+ # Add contribution from incoming edges
165
+ for src in incoming.get(file.path, []):
166
+ out_degree = len(graph.get(src, []))
167
+ if out_degree > 0:
168
+ rank += damping * (scores[src] / out_degree)
169
+
170
+ new_scores[file.path] = rank
171
+
172
+ scores = new_scores
173
+
174
+ # Normalize to [0, 1]
175
+ if scores:
176
+ max_score = max(scores.values())
177
+ if max_score > 0:
178
+ scores = {k: v / max_score for k, v in scores.items()}
179
+
180
+ return scores
181
+
182
+ # ---- Primitive 3: Churn Volatility ----
183
+
184
+ def _compute_churn_volatility(self) -> Dict[str, float]:
185
+ """Compute volatility of file modifications (filesystem-based)"""
186
+ volatilities = {}
187
+
188
+ # Since no git history, use file modification time as proxy
189
+ now = datetime.now().timestamp()
190
+ ages = [now - f.last_modified for f in self.files]
191
+
192
+ if not ages:
193
+ return {}
194
+
195
+ # Normalize age to volatility score
196
+ # Recent changes = high volatility
197
+ max_age = max(ages)
198
+
199
+ for file in self.files:
200
+ age = now - file.last_modified
201
+ # Invert: older = more stable = lower volatility
202
+ volatility = 1 - (age / max_age) if max_age > 0 else 0
203
+ volatilities[file.path] = volatility
204
+
205
+ return volatilities
206
+
207
+ # ---- Primitive 4: Semantic Coherence ----
208
+
209
+ def _compute_semantic_coherence(self) -> Dict[str, float]:
210
+ """Compute semantic coherence via TF-IDF clustering"""
211
+ from sklearn.feature_extraction.text import TfidfVectorizer
212
+ from sklearn.metrics.pairwise import cosine_similarity
213
+
214
+ # Build document corpus (use imports + exports)
215
+ documents = []
216
+ paths = []
217
+
218
+ for file in self.files:
219
+ tokens = file.imports + file.exports
220
+ doc = " ".join(tokens) if tokens else "empty"
221
+ documents.append(doc)
222
+ paths.append(file.path)
223
+
224
+ if len(documents) < 2:
225
+ return {f.path: 1.0 for f in self.files}
226
+
227
+ # Compute TF-IDF vectors
228
+ vectorizer = TfidfVectorizer(min_df=1, max_df=0.8)
229
+ try:
230
+ tfidf_matrix = vectorizer.fit_transform(documents)
231
+ except Exception as e:
232
+ logger.warning(f"TF-IDF vectorization failed: {e}")
233
+ return {f.path: 1.0 for f in self.files}
234
+
235
+ # Compute pairwise similarities
236
+ similarities = cosine_similarity(tfidf_matrix)
237
+
238
+ coherences = {}
239
+ n = len(paths)
240
+ for i, path in enumerate(paths):
241
+ # Coherence = average similarity to OTHER files (exclude self)
242
+ if n > 1:
243
+ other_sims = [similarities[i][j] for j in range(n) if j != i]
244
+ coherences[path] = float(np.mean(other_sims))
245
+ else:
246
+ coherences[path] = 1.0
247
+
248
+ return coherences
249
+
250
+ # ---- Primitive 5: Cognitive Load ----
251
+
252
+ def _compute_cognitive_load(self) -> Dict[str, float]:
253
+ """Compute cognitive load = concepts × complexity.
254
+
255
+ TODO: The formula CL = concepts * complexity * (1 + depth/10) is a
256
+ hand-tuned heuristic with no academic citation. The /10 divisor
257
+ maps typical nesting depths (0-5) to a 0-50% multiplier. Consider
258
+ citing Cant et al. (1995) "A Conceptual Complexity Metric" or
259
+ replacing with a validated cognitive complexity model such as
260
+ SonarSource's Cognitive Complexity (2017).
261
+ """
262
+ loads = {}
263
+
264
+ for file in self.files:
265
+ # Concepts = functions + structs + interfaces
266
+ concepts = file.functions + file.structs + file.interfaces
267
+
268
+ # Cognitive load = concepts × complexity × nesting
269
+ load = concepts * file.complexity_score * (1 + file.nesting_depth / 10)
270
+ loads[file.path] = load
271
+
272
+ # Normalize to [0, 1]
273
+ if loads:
274
+ max_load = max(loads.values())
275
+ if max_load > 0:
276
+ loads = {k: v / max_load for k, v in loads.items()}
277
+
278
+ return loads
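
For reference, the canonical PageRank variant described in the TODO on _compute_network_centrality (base probability (1-d)/N, dangling-node redistribution, a convergence check, and inclusion of nodes that appear only as targets) looks roughly like the sketch below. It is illustrative only and not part of the package; the function name and signature are assumptions, and GraphMetrics.pagerank() in shannon_insight.math may differ in detail.

# Hypothetical standalone sketch, not shipped in shannon-codebase-insight.
from collections import defaultdict
from typing import Dict, Set

def pagerank(graph: Dict[str, Set[str]], damping: float = 0.85,
             max_iter: int = 100, tol: float = 1.0e-6) -> Dict[str, float]:
    # Include nodes that only appear as edge targets.
    nodes = set(graph) | {t for targets in graph.values() for t in targets}
    n = len(nodes)
    if n == 0:
        return {}
    # Reverse graph: who links to each node.
    incoming: Dict[str, Set[str]] = defaultdict(set)
    for src, targets in graph.items():
        for tgt in targets:
            incoming[tgt].add(src)
    scores = {node: 1.0 / n for node in nodes}
    for _ in range(max_iter):
        # Mass held by dangling nodes (no outgoing edges) is spread uniformly.
        dangling = sum(scores[node] for node in nodes if not graph.get(node))
        new_scores = {}
        for node in nodes:
            rank = (1 - damping) / n + damping * dangling / n
            for src in incoming.get(node, ()):
                rank += damping * scores[src] / len(graph[src])
            new_scores[node] = rank
        # Stop once the total (L1) change falls below the tolerance.
        if sum(abs(new_scores[node] - scores[node]) for node in nodes) < tol:
            return new_scores
        scores = new_scores
    return scores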