shannon-codebase-insight 0.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. shannon_codebase_insight-0.4.0.dist-info/METADATA +209 -0
  2. shannon_codebase_insight-0.4.0.dist-info/RECORD +37 -0
  3. shannon_codebase_insight-0.4.0.dist-info/WHEEL +5 -0
  4. shannon_codebase_insight-0.4.0.dist-info/entry_points.txt +7 -0
  5. shannon_codebase_insight-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. shannon_codebase_insight-0.4.0.dist-info/top_level.txt +1 -0
  7. shannon_insight/__init__.py +25 -0
  8. shannon_insight/analyzers/__init__.py +8 -0
  9. shannon_insight/analyzers/base.py +215 -0
  10. shannon_insight/analyzers/go_analyzer.py +150 -0
  11. shannon_insight/analyzers/python_analyzer.py +169 -0
  12. shannon_insight/analyzers/typescript_analyzer.py +162 -0
  13. shannon_insight/cache.py +214 -0
  14. shannon_insight/cli.py +333 -0
  15. shannon_insight/config.py +235 -0
  16. shannon_insight/core.py +546 -0
  17. shannon_insight/exceptions/__init__.py +31 -0
  18. shannon_insight/exceptions/analysis.py +78 -0
  19. shannon_insight/exceptions/base.py +18 -0
  20. shannon_insight/exceptions/config.py +48 -0
  21. shannon_insight/file_ops.py +218 -0
  22. shannon_insight/logging_config.py +98 -0
  23. shannon_insight/math/__init__.py +15 -0
  24. shannon_insight/math/entropy.py +133 -0
  25. shannon_insight/math/fusion.py +109 -0
  26. shannon_insight/math/graph.py +209 -0
  27. shannon_insight/math/robust.py +106 -0
  28. shannon_insight/math/statistics.py +159 -0
  29. shannon_insight/models.py +48 -0
  30. shannon_insight/primitives/__init__.py +13 -0
  31. shannon_insight/primitives/detector.py +318 -0
  32. shannon_insight/primitives/extractor.py +278 -0
  33. shannon_insight/primitives/fusion.py +373 -0
  34. shannon_insight/primitives/recommendations.py +158 -0
  35. shannon_insight/py.typed +2 -0
  36. shannon_insight/security.py +284 -0
  37. shannon_insight/utils/__init__.py +1 -0
shannon_insight/math/graph.py
@@ -0,0 +1,209 @@
+ """Graph theory: PageRank, betweenness centrality, eigenvector centrality."""
+
+ import math
+ from typing import Dict, List
+
+
+ class GraphMetrics:
+     """Graph theory calculations for dependency graphs."""
+
+     @staticmethod
+     def pagerank(
+         adjacency: Dict[str, List[str]],
+         damping: float = 0.85,
+         iterations: int = 100,
+         tolerance: float = 1e-6,
+     ) -> Dict[str, float]:
+         """
+         Compute PageRank using power iteration.
+
+         PR(A) = (1 - d) / N + d * Σ (PR(Ti) / C(Ti))
+
+         Args:
+             adjacency: Node -> list of neighbors
+             damping: Damping factor (0.85 is standard)
+             iterations: Maximum iterations
+             tolerance: Convergence tolerance
+
+         Returns:
+             Dictionary mapping nodes to PageRank scores
+         """
+         # Work on a copy to avoid mutating the caller's data structure.
+         adj: Dict[str, List[str]] = {k: list(v) for k, v in adjacency.items()}
+
+         nodes = set(adj.keys())
+         for neighbors in adj.values():
+             nodes.update(neighbors)
+
+         if not nodes:
+             return {}
+
+         N = len(nodes)
+         rank = {node: 1.0 / N for node in nodes}
+
+         # Identify dangling nodes (no outgoing edges).
+         # Standard treatment: redistribute their mass uniformly to all nodes.
+         # Reference: Langville & Meyer, "Google's PageRank and Beyond" (2006), Ch. 3.
+         dangling = [node for node in nodes if node not in adj or len(adj[node]) == 0]
+
+         # Ensure every node has an adjacency entry (possibly empty).
+         for node in nodes:
+             if node not in adj:
+                 adj[node] = []
+
+         out_degree = {node: len(neighbors) for node, neighbors in adj.items()}
+
+         reverse: Dict[str, List[str]] = {node: [] for node in nodes}
+         for src, neighbors in adj.items():
+             for tgt in neighbors:
+                 if tgt in reverse:
+                     reverse[tgt].append(src)
+
+         for _ in range(iterations):
+             new_rank = {}
+             max_diff = 0.0
+
+             # Sum of rank mass sitting on dangling nodes.
+             dangling_sum = sum(rank[node] for node in dangling)
+
+             for node in nodes:
+                 # Teleportation + dangling-node redistribution.
+                 new_rank[node] = (1 - damping) / N + damping * dangling_sum / N
+
+                 for src in reverse[node]:
+                     if out_degree[src] > 0:
+                         new_rank[node] += damping * (rank[src] / out_degree[src])
+
+                 diff = abs(new_rank[node] - rank[node])
+                 max_diff = max(max_diff, diff)
+
+             rank = new_rank
+
+             if max_diff < tolerance:
+                 break
+
+         return rank
+
+     @staticmethod
+     def betweenness_centrality(
+         adjacency: Dict[str, List[str]], normalize: bool = True
+     ) -> Dict[str, float]:
+         """
+         Compute betweenness centrality using Brandes' algorithm.
+
+         C_B(v) = Σ (σ_st(v) / σ_st) where s != v != t
+
+         Args:
+             adjacency: Node -> list of neighbors
+             normalize: Normalize by (n-1)(n-2) (directed graphs)
+
+         Returns:
+             Dictionary mapping nodes to betweenness centrality
+         """
+         nodes = set(adjacency.keys())
+         for neighbors in adjacency.values():
+             nodes.update(neighbors)
+
+         betweenness = {node: 0.0 for node in nodes}
+
+         for s in nodes:
+             stack: List[str] = []
+             predecessors: Dict[str, List[str]] = {v: [] for v in nodes}
+             sigma = {v: 0 for v in nodes}
+             sigma[s] = 1
+
+             dist = {v: -1 for v in nodes}
+             dist[s] = 0
+
+             queue = [s]
+
+             while queue:
+                 v = queue.pop(0)
+                 stack.append(v)
+
+                 for w in adjacency.get(v, []):
+                     if dist[w] < 0:
+                         dist[w] = dist[v] + 1
+                         queue.append(w)
+
+                     if dist[w] == dist[v] + 1:
+                         sigma[w] += sigma[v]
+                         predecessors[w].append(v)
+
+             delta = {v: 0.0 for v in nodes}
+
+             while stack:
+                 w = stack.pop()
+                 for v in predecessors[w]:
+                     delta[v] += (sigma[v] / sigma[w]) * (1 + delta[w])
+                 if w != s:
+                     betweenness[w] += delta[w]
+
+         if normalize:
+             n = len(nodes)
+             if n > 2:
+                 # Directed graph: normalize by (n-1)(n-2).
+                 # The BFS follows directed edges, so the factor-of-2 used
+                 # for undirected graphs does not apply here.
+                 # Reference: Brandes (2001), Section 4.
+                 scale = 1.0 / ((n - 1) * (n - 2))
+                 betweenness = {k: v * scale for k, v in betweenness.items()}
+
+         return betweenness
+
+     @staticmethod
+     def eigenvector_centrality(
+         adjacency: Dict[str, List[str]], iterations: int = 100, tolerance: float = 1e-6
+     ) -> Dict[str, float]:
+         """
+         Compute eigenvector centrality using power iteration.
+
+         x_i = (1/lambda) Σ A_ij x_j
+
+         Args:
+             adjacency: Node -> list of neighbors
+             iterations: Maximum iterations
+             tolerance: Convergence tolerance
+
+         Returns:
+             Dictionary mapping nodes to eigenvector centrality
+         """
+         # Collect ALL nodes, including those that appear only as targets.
+         nodes_set = set(adjacency.keys())
+         for neighbors in adjacency.values():
+             nodes_set.update(neighbors)
+         nodes = list(nodes_set)
+
+         if not nodes:
+             return {}
+
+         # TODO: Eigenvector centrality is ill-defined for disconnected graphs.
+         # The Perron-Frobenius theorem guarantees a unique positive leading
+         # eigenvector only for strongly connected (or irreducible) graphs.
+         # For disconnected graphs, smaller components may converge to zero.
+         # Consider falling back to PageRank or warning the caller.
+         # Reference: Newman, "Networks: An Introduction" (2010), Section 7.2.
+
+         x = {node: 1.0 for node in nodes}
+
+         for _ in range(iterations):
+             new_x = {}
+             max_diff = 0.0
+
+             for node in nodes:
+                 sum_neighbors = sum(x.get(nbr, 0.0) for nbr in adjacency.get(node, []))
+                 new_x[node] = sum_neighbors
+
+             norm = math.sqrt(sum(v * v for v in new_x.values()))
+             if norm > 0:
+                 new_x = {k: v / norm for k, v in new_x.items()}
+
+             for node in nodes:
+                 diff = abs(new_x[node] - x[node])
+                 max_diff = max(max_diff, diff)
+
+             x = new_x
+             if max_diff < tolerance:
+                 break
+
+         return x
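All three GraphMetrics methods take the same node -> neighbors mapping. A minimal usage sketch, not part of the wheel (the dependency graph below is invented; the import path follows the file layout above):

    from shannon_insight.math.graph import GraphMetrics

    # Hypothetical dependency graph: each key lists the modules it imports.
    deps = {"a": ["b"], "d": ["b"], "b": ["c"], "c": []}

    ranks = GraphMetrics.pagerank(deps)                  # importance via incoming links
    bridges = GraphMetrics.betweenness_centrality(deps)  # "b" mediates the a->c and d->c paths
    hubs = GraphMetrics.eigenvector_centrality(deps)     # may degenerate on DAGs, per the TODO

    for node in sorted(ranks):
        print(node, round(ranks[node], 3), round(bridges[node], 3), round(hubs[node], 3))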
shannon_insight/math/robust.py
@@ -0,0 +1,106 @@
+ """Robust statistics: MAD, modified z-scores, IQR, isolation forest."""
+
+ from typing import List, Optional, Union
+
+ import numpy as np
+
+
+ class RobustStatistics:
+     """Robust statistical methods resistant to outliers."""
+
+     @staticmethod
+     def median_absolute_deviation(values: Union[List[float], np.ndarray]) -> float:
+         """
+         Median Absolute Deviation: MAD = median(|x_i - median(x)|).
+
+         Args:
+             values: List or array of values
+
+         Returns:
+             MAD value
+         """
+         median_val = np.median(values)
+
+         if isinstance(values, np.ndarray):
+             deviations = np.abs(values - median_val)
+         else:
+             deviations = [abs(x - median_val) for x in values]
+
+         return float(np.median(deviations))
+
+     @staticmethod
+     def modified_z_score(
+         values: Union[List[float], np.ndarray], threshold: float = 3.5
+     ) -> List[float]:
+         """
+         Modified z-scores using MAD (robust to outliers).
+
+         M_i = 0.6745 * (x_i - median) / MAD
+
+         Args:
+             values: List of values
+             threshold: Outlier threshold (default 3.5)
+
+         Returns:
+             List of modified z-scores
+         """
+         median_val = float(np.median(values))
+         mad = RobustStatistics.median_absolute_deviation(values)
+
+         if mad == 0:
+             return [0.0] * len(values)
+
+         constant = 0.6745  # Normal distribution consistency constant
+         return [constant * (x - median_val) / mad for x in values]
+
+     @staticmethod
+     def iqr_outliers(values: List[float], multiplier: float = 1.5) -> List[bool]:
+         """
+         Detect outliers using the interquartile range.
+
+         Outlier if x < Q1 - k*IQR or x > Q3 + k*IQR
+
+         Args:
+             values: List of values
+             multiplier: IQR multiplier (default 1.5)
+
+         Returns:
+             List of booleans indicating outliers
+         """
+         q1 = float(np.percentile(values, 25))
+         q3 = float(np.percentile(values, 75))
+         iqr = q3 - q1
+
+         lower_bound = q1 - multiplier * iqr
+         upper_bound = q3 + multiplier * iqr
+
+         return [(x < lower_bound or x > upper_bound) for x in values]
+
+     @staticmethod
+     def isolation_forest_outliers(
+         values: np.ndarray, contamination: Optional[float] = 0.1
+     ) -> np.ndarray:
+         """
+         Detect outliers using an isolation forest.
+
+         Args:
+             values: Array of values
+             contamination: Expected proportion of outliers
+
+         Returns:
+             Boolean array indicating outliers
+         """
+         try:
+             from sklearn.ensemble import IsolationForest
+
+             contamination_val = "auto" if contamination is None else contamination
+             clf = IsolationForest(contamination=contamination_val, random_state=42)
+             outliers = clf.fit_predict(values.reshape(-1, 1))
+             return np.array([o == -1 for o in outliers])
+
+         except Exception:  # sklearn unavailable or fit failed; fall back to IQR
+             return np.array(
+                 RobustStatistics.iqr_outliers(
+                     values.tolist() if hasattr(values, "tolist") else list(values)
+                 )
+             )
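The MAD- and IQR-based detectors need no third-party dependency, while isolation_forest_outliers uses scikit-learn when available and otherwise falls back to the IQR rule. A small sketch with invented scores (not taken from the package):

    import numpy as np

    from shannon_insight.math.robust import RobustStatistics

    # Hypothetical complexity scores; 12.5 is the obvious outlier.
    scores = [3.0, 3.2, 2.9, 3.1, 3.0, 12.5]

    mad = RobustStatistics.median_absolute_deviation(scores)
    z = RobustStatistics.modified_z_score(scores)      # |z| > 3.5 is the conventional cut-off
    iqr_flags = RobustStatistics.iqr_outliers(scores)  # only the 12.5 entry is flagged
    forest_flags = RobustStatistics.isolation_forest_outliers(np.array(scores))

    print(mad, [round(v, 1) for v in z], iqr_flags, forest_flags)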
shannon_insight/math/statistics.py
@@ -0,0 +1,159 @@
+ """Descriptive and inferential statistics: z-scores, Mahalanobis, Grubbs' test."""
+
+ import math
+ import statistics as stdlib_stats
+ from typing import List, Optional, Tuple
+
+ import numpy as np
+
+
+ class Statistics:
+     """Statistical analysis methods."""
+
+     @staticmethod
+     def mean(values: List[float]) -> float:
+         """Compute arithmetic mean."""
+         if not values:
+             return 0.0
+         return stdlib_stats.mean(values)
+
+     @staticmethod
+     def stdev(values: List[float]) -> float:
+         """Compute sample standard deviation."""
+         if len(values) < 2:
+             return 0.0
+         return stdlib_stats.stdev(values)
+
+     @staticmethod
+     def z_scores(values: List[float]) -> List[float]:
+         """
+         Compute z-scores: z = (x - mu) / sigma.
+
+         Args:
+             values: List of values
+
+         Returns:
+             List of z-scores
+         """
+         if not values or len(values) < 2:
+             return [0.0] * len(values)
+
+         mean_val = Statistics.mean(values)
+         stdev_val = Statistics.stdev(values)
+
+         if stdev_val == 0:
+             return [0.0] * len(values)
+
+         return [(x - mean_val) / stdev_val for x in values]
+
+     @staticmethod
+     def z_score(x: float, mean: float, std: float) -> float:
+         """Compute single z-score: z = (x - mu) / sigma."""
+         if std == 0:
+             return 0.0
+         return (x - mean) / std
+
+     @staticmethod
+     def mahalanobis_distance(
+         point: np.ndarray, mean: np.ndarray, cov_matrix: np.ndarray
+     ) -> float:
+         """
+         Compute Mahalanobis distance: D^2 = (x - mu)^T Sigma^-1 (x - mu).
+
+         Args:
+             point: Observation vector
+             mean: Mean vector
+             cov_matrix: Covariance matrix
+
+         Returns:
+             Mahalanobis distance (squared)
+         """
+         diff = point - mean
+
+         try:
+             inv_cov = np.linalg.inv(cov_matrix)
+         except np.linalg.LinAlgError:
+             inv_cov = np.linalg.pinv(cov_matrix)
+
+         distance = diff.T @ inv_cov @ diff
+         return float(distance)
+
+     @staticmethod
+     def grubbs_test(
+         values: List[float], alpha: float = 0.05
+     ) -> Optional[Tuple[int, float]]:
+         """
+         Grubbs' test for detecting a single outlier.
+
+         G = (max|x_i - x_bar|) / s
+
+         Args:
+             values: List of values
+             alpha: Significance level (default 0.05)
+
+         Returns:
+             Tuple of (outlier_index, G_statistic) if outlier found, None otherwise
+         """
+         n = len(values)
+         if n < 3:
+             return None
+
+         mean_val = float(np.mean(values))
+         std_val = float(np.std(values, ddof=1))
+
+         if std_val == 0:
+             return None
+
+         deviations = [abs(x - mean_val) for x in values]
+         max_deviation = max(deviations)
+         outlier_index = deviations.index(max_deviation)
+
+         G = max_deviation / std_val
+
+         t_critical = Statistics._t_critical_value(alpha / (2 * n), n - 2)
+         G_critical = ((n - 1) / math.sqrt(n)) * math.sqrt(
+             t_critical**2 / (n - 2 + t_critical**2)
+         )
+
+         if G > G_critical:
+             return outlier_index, float(G)
+
+         return None
+
+     @staticmethod
+     def _t_critical_value(alpha: float, df: int) -> float:
+         """Inverse t-distribution critical value."""
+         from scipy import stats as sp_stats
+
+         return float(sp_stats.t.ppf(1 - alpha, df))
+
+     @staticmethod
+     def confidence_interval(
+         values: List[float], confidence: float = 0.95
+     ) -> Tuple[float, float]:
+         """
+         Confidence interval for the mean.
+
+         CI = x_bar +/- t_(alpha/2, n-1) * s / sqrt(n)
+
+         Args:
+             values: Sample values
+             confidence: Confidence level (default 0.95)
+
+         Returns:
+             Tuple of (lower_bound, upper_bound)
+         """
+         n = len(values)
+         if n < 2:
+             return (values[0], values[0]) if values else (0.0, 0.0)
+
+         mean_val = float(np.mean(values))
+         std_val = float(np.std(values, ddof=1))
+         alpha = 1 - confidence
+
+         from scipy import stats as sp_stats
+
+         t_critical = sp_stats.t.ppf(1 - alpha / 2, n - 1)
+         margin = t_critical * std_val / math.sqrt(n)
+
+         return (mean_val - margin, mean_val + margin)
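grubbs_test and confidence_interval lean on scipy for the t-distribution (imported lazily inside the functions), while the z-score helpers are pure stdlib/NumPy. A sketch with invented token counts, assuming scipy is installed:

    import numpy as np

    from shannon_insight.math.statistics import Statistics

    # Hypothetical per-file token counts; 540 is far from the rest.
    tokens = [120.0, 135.0, 118.0, 122.0, 131.0, 540.0]

    z = Statistics.z_scores(tokens)
    lo, hi = Statistics.confidence_interval(tokens, confidence=0.95)
    outlier = Statistics.grubbs_test(tokens, alpha=0.05)  # likely (5, G) for the 540.0 entry

    # Squared Mahalanobis distance for a 2-dimensional point (made-up covariance).
    d2 = Statistics.mahalanobis_distance(
        np.array([120.0, 10.0]),
        np.array([125.0, 12.0]),
        np.array([[44.0, 16.0], [16.0, 6.5]]),
    )

    print([round(v, 2) for v in z], (round(lo, 1), round(hi, 1)), outlier, round(d2, 2))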
shannon_insight/models.py
@@ -0,0 +1,48 @@
+ """Data models for Shannon Insight"""
+
+ from dataclasses import dataclass
+ from collections import Counter
+ from typing import List
+
+
+ @dataclass
+ class FileMetrics:
+     """Raw observations for a single file"""
+
+     path: str
+     lines: int
+     tokens: int
+     imports: List[str]
+     exports: List[str]
+     functions: int
+     interfaces: int
+     structs: int
+     complexity_score: float
+     nesting_depth: int
+     ast_node_types: Counter
+     last_modified: float
+
+
+ @dataclass
+ class Primitives:
+     """Five orthogonal quality primitives"""
+
+     structural_entropy: float
+     network_centrality: float
+     churn_volatility: float
+     semantic_coherence: float
+     cognitive_load: float
+
+
+ @dataclass
+ class AnomalyReport:
+     """Final analysis output"""
+
+     file: str
+     overall_score: float
+     confidence: float
+     primitives: Primitives
+     normalized_primitives: Primitives
+     anomaly_flags: List[str]
+     root_causes: List[str]
+     recommendations: List[str]
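The three dataclasses read as a pipeline: FileMetrics holds raw per-file observations, Primitives the five derived scores, and AnomalyReport the final output. A construction sketch with invented values (only the field names come from the definitions above):

    from collections import Counter

    from shannon_insight.models import AnomalyReport, FileMetrics, Primitives

    metrics = FileMetrics(
        path="src/example_module.py",  # hypothetical file
        lines=180,
        tokens=1400,
        imports=["os", "pathlib"],
        exports=["run"],
        functions=9,
        interfaces=0,
        structs=2,
        complexity_score=6.5,
        nesting_depth=4,
        ast_node_types=Counter({"FunctionDef": 9, "ClassDef": 2}),
        last_modified=1700000000.0,
    )

    scores = Primitives(
        structural_entropy=0.62,
        network_centrality=0.18,
        churn_volatility=0.40,
        semantic_coherence=0.75,
        cognitive_load=0.55,
    )

    report = AnomalyReport(
        file=metrics.path,
        overall_score=0.71,
        confidence=0.80,
        primitives=scores,
        normalized_primitives=scores,
        anomaly_flags=["high_entropy"],
        root_causes=["deep nesting"],
        recommendations=["split the module"],
    )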
shannon_insight/primitives/__init__.py
@@ -0,0 +1,13 @@
+ """Primitive extractors for the five quality dimensions"""
+
+ from .extractor import PrimitiveExtractor
+ from .fusion import SignalFusion
+ from .detector import AnomalyDetector
+ from .recommendations import RecommendationEngine
+
+ __all__ = [
+     "PrimitiveExtractor",
+     "SignalFusion",
+     "AnomalyDetector",
+     "RecommendationEngine",
+ ]
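These re-exports define the subpackage's public surface, so callers import the four classes from shannon_insight.primitives rather than from the individual modules; their constructor signatures live in the other files of this release and are not shown in this hunk:

    from shannon_insight.primitives import (
        AnomalyDetector,
        PrimitiveExtractor,
        RecommendationEngine,
        SignalFusion,
    )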