codebase-digest-ai 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
+ """
+ Codebase Digest - AI-native code intelligence engine.
+
+ Transform any codebase into semantic architectural understanding,
+ execution flows, and human-readable engineering reports.
+ """
+
+ __version__ = "0.1.0"
@@ -0,0 +1,7 @@
+ """Analysis modules for codebase intelligence."""
+
+ from .codebase_analyzer import CodebaseAnalyzer
+ from .flow_analyzer import FlowAnalyzer
+ from .metrics_analyzer import MetricsAnalyzer
+
+ __all__ = ["CodebaseAnalyzer", "FlowAnalyzer", "MetricsAnalyzer"]
@@ -0,0 +1,183 @@
+ """Main codebase analyzer that orchestrates parsing and analysis."""
+
+ import os
+ from pathlib import Path
+ from typing import Dict, List, Type
+
+ from ..models import CodebaseAnalysis
+ from ..parser import PythonParser, JavaScriptParser, BaseParser
+ from .flow_analyzer import FlowAnalyzer
+ from .metrics_analyzer import MetricsAnalyzer
+
+
+ class CodebaseAnalyzer:
+     """Main analyzer that coordinates parsing and analysis of a codebase."""
+
+     def __init__(self, root_path: Path):
+         self.root_path = Path(root_path)
+         self.parsers: Dict[str, Type[BaseParser]] = {}
+         self._register_parsers()
+
+         # Ignore patterns
+         self.ignore_patterns = {
+             '__pycache__', '.git', '.svn', '.hg', 'node_modules',
+             '.pytest_cache', '.mypy_cache', '.tox', 'venv', 'env',
+             '.venv', 'dist', 'build', '*.egg-info', '.DS_Store'
+         }
+
+     def _register_parsers(self):
+         """Register available parser classes by file extension."""
+         # Register parsers by extension without instantiating
+         python_extensions = ['.py']
+         js_extensions = ['.js', '.jsx', '.ts', '.tsx']
+
+         for ext in python_extensions:
+             self.parsers[ext] = PythonParser
+
+         for ext in js_extensions:
+             self.parsers[ext] = JavaScriptParser
+
+     def analyze(self) -> CodebaseAnalysis:
+         """Perform complete codebase analysis."""
+         analysis = CodebaseAnalysis(root_path=self.root_path)
+
+         # Find all relevant files
+         files = self._find_source_files()
+         analysis.total_files = len(files)
+
+         # Parse each file
+         for file_path in files:
+             self._parse_file(file_path, analysis)
+
+         # Detect entry points
+         analysis.entry_points = self._detect_entry_points(files)
+
+         # Analyze execution flows
+         flow_analyzer = FlowAnalyzer(analysis)
+         analysis.execution_flows = flow_analyzer.analyze_flows()
+
+         # Calculate metrics
+         metrics_analyzer = MetricsAnalyzer(analysis)
+         analysis.total_lines = metrics_analyzer.count_total_lines(files)
+         analysis.languages = metrics_analyzer.detect_languages(files)
+         analysis.complexity_score = metrics_analyzer.calculate_complexity()
+
+         # Build directory tree
+         analysis.directory_tree = self._build_directory_tree()
+
+         return analysis
+
+     def _find_source_files(self) -> List[Path]:
+         """Find all source files in the codebase."""
+         files = []
+
+         for root, dirs, filenames in os.walk(self.root_path):
+             # Filter out ignored directories (in place, so os.walk skips them)
+             dirs[:] = [d for d in dirs if not self._should_ignore(d)]
+
+             for filename in filenames:
+                 file_path = Path(root) / filename
+                 if self._is_source_file(file_path) and not self._should_ignore(filename):
+                     files.append(file_path)
+
+         return files
+
+     def _is_source_file(self, file_path: Path) -> bool:
+         """Check if file is a source code file."""
+         return file_path.suffix in self.parsers
+
+     def _should_ignore(self, name: str) -> bool:
+         """Check if a file/directory should be ignored."""
+         for pattern in self.ignore_patterns:
+             if pattern.startswith('*'):
+                 if name.endswith(pattern[1:]):
+                     return True
+             elif name == pattern or name.startswith(pattern):
+                 return True
+         return False
+
+     def _parse_file(self, file_path: Path, analysis: CodebaseAnalysis):
+         """Parse a single file and add results to analysis."""
+         parser_class = self.parsers.get(file_path.suffix)
+         if not parser_class:
+             return
+
+         try:
+             parser = parser_class(file_path)
+
+             # Parse symbols
+             symbols = parser.parse_symbols()
+             analysis.symbols.extend(symbols)
+
+             # Parse imports
+             imports = parser.parse_imports()
+             analysis.imports.extend(imports)
+
+             # Parse calls
+             calls = parser.parse_calls()
+             analysis.call_relations.extend(calls)
+
+             # Parse domain entities
+             entities = parser.parse_domain_entities()
+             analysis.domain_entities.extend(entities)
+
+         except Exception as e:
+             # Log error but continue processing
+             print(f"Error parsing {file_path}: {e}")
+
+     def _detect_entry_points(self, files: List[Path]) -> List[Path]:
+         """Detect likely entry points in the codebase."""
+         entry_points = []
+
+         # Common entry point patterns
+         entry_patterns = [
+             'main.py', 'app.py', 'server.py', 'run.py', 'start.py',
+             'manage.py', '__main__.py', 'wsgi.py', 'asgi.py',
+             'index.js', 'server.js', 'app.js', 'main.js'
+         ]
+
+         for file_path in files:
+             if file_path.name in entry_patterns:
+                 entry_points.append(file_path)
+             elif file_path.name == '__init__.py':
+                 # Check if it's a package entry point
+                 if self._is_package_entry_point(file_path):
+                     entry_points.append(file_path)
+
+         return entry_points
+
+     def _is_package_entry_point(self, init_file: Path) -> bool:
+         """Check if an __init__.py file is a package entry point."""
+         try:
+             content = init_file.read_text(encoding='utf-8')
+             # Simple heuristic: contains main execution logic
+             return 'if __name__' in content or 'main(' in content
+         except (OSError, UnicodeDecodeError):
+             return False
+
+     def _build_directory_tree(self) -> Dict:
+         """Build a directory tree structure."""
+         tree = {}
+
+         for root, dirs, files in os.walk(self.root_path):
+             # Filter ignored directories
+             dirs[:] = [d for d in dirs if not self._should_ignore(d)]
+
+             rel_path = Path(root).relative_to(self.root_path)
+
+             # Build nested structure
+             current = tree
+             for part in rel_path.parts:
+                 if part not in current:
+                     current[part] = {}
+                 current = current[part]
+
+             # Add files
+             source_files = [f for f in files
+                             if self._is_source_file(Path(root) / f)
+                             and not self._should_ignore(f)]
+
+             if source_files:
+                 current['_files'] = source_files
+
+         return tree
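
As a quick orientation for the file above, here is a minimal usage sketch. It is hypothetical: the import path `codebase_digest.analysis` is an assumption inferred from the wheel name and the package's relative imports, and `./my_project` is an invented target. The attributes printed are exactly the ones `analyze()` populates above.

from pathlib import Path

from codebase_digest.analysis import CodebaseAnalyzer  # assumed import path

analyzer = CodebaseAnalyzer(Path("./my_project"))      # hypothetical project dir
analysis = analyzer.analyze()

# Every attribute below is set by analyze() in the diff above.
print(f"Files analyzed:   {analysis.total_files}")
print(f"Non-blank lines:  {analysis.total_lines}")
print(f"Languages:        {', '.join(sorted(analysis.languages))}")
print(f"Complexity score: {analysis.complexity_score:.1f}/100")
for entry in analysis.entry_points:
    print(f"Entry point: {entry}")
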
@@ -0,0 +1,163 @@
+ """Execution flow analysis."""
+
+ import networkx as nx
+ from typing import List, Optional
+
+ from ..models import CodebaseAnalysis, ExecutionFlow
+
+
+ class FlowAnalyzer:
+     """Analyzes execution flows through the codebase."""
+
+     def __init__(self, analysis: CodebaseAnalysis):
+         self.analysis = analysis
+         self.call_graph = self._build_call_graph()
+
+     def _build_call_graph(self) -> nx.DiGraph:
+         """Build a directed graph of function calls."""
+         graph = nx.DiGraph()
+
+         for call in self.analysis.call_relations:
+             caller_name = call.caller_symbol.name
+             graph.add_edge(caller_name, call.callee_name,
+                            caller_file=call.caller_symbol.file_path,
+                            callee_file=call.callee_file,
+                            line_number=call.line_number)
+
+         return graph
+
+     def analyze_flows(self) -> List[ExecutionFlow]:
+         """Analyze execution flows starting from entry points."""
+         flows = []
+
+         # Find entry point functions
+         entry_functions = self._find_entry_functions()
+
+         for entry_func in entry_functions:
+             flow = self._trace_execution_flow(entry_func)
+             if flow:
+                 flows.append(flow)
+
+         # Detect common patterns
+         flows.extend(self._detect_common_patterns())
+
+         return flows
+
+     def _find_entry_functions(self) -> List[str]:
+         """Find functions that are likely entry points."""
+         entry_functions = []
+
+         # Look for main functions
+         for symbol in self.analysis.symbols:
+             if symbol.name in ['main', '__main__', 'app', 'run', 'start']:
+                 entry_functions.append(symbol.name)
+
+         # Look for HTTP route handlers
+         for symbol in self.analysis.symbols:
+             if any(decorator in ['@app.route', '@router.get', '@router.post',
+                                  '@api.route', '@bp.route']
+                    for decorator in symbol.decorators):
+                 entry_functions.append(symbol.name)
+
+         return list(dict.fromkeys(entry_functions))  # de-duplicate, keep order
+
+     def _trace_execution_flow(self, entry_function: str) -> Optional[ExecutionFlow]:
+         """Trace execution flow from an entry function."""
+         if entry_function not in self.call_graph:
+             return None
+
+         # Use DFS to trace the flow
+         visited = set()
+         flow_steps = []
+         files_involved = set()
+
+         def dfs(func_name: str, depth: int = 0):
+             if depth > 10 or func_name in visited:  # Prevent infinite recursion
+                 return
+
+             visited.add(func_name)
+             flow_steps.append(func_name)
+
+             # Add file information
+             for call in self.analysis.call_relations:
+                 if call.caller_symbol.name == func_name:
+                     files_involved.add(call.caller_symbol.file_path)
+                     if call.callee_file:
+                         files_involved.add(call.callee_file)
+
+             # Continue tracing
+             if func_name in self.call_graph:
+                 for successor in self.call_graph.successors(func_name):
+                     dfs(successor, depth + 1)
+
+         dfs(entry_function)
+
+         if len(flow_steps) > 1:
+             return ExecutionFlow(
+                 name=f"{entry_function}_flow",
+                 entry_point=entry_function,
+                 steps=flow_steps,
+                 files_involved=files_involved,
+                 description=f"Execution flow starting from {entry_function}"
+             )
+
+         return None
+
+     def _detect_common_patterns(self) -> List[ExecutionFlow]:
+         """Detect common execution patterns."""
+         patterns = []
+
+         # Detect CRUD patterns
+         crud_flow = self._detect_crud_pattern()
+         if crud_flow:
+             patterns.append(crud_flow)
+
+         # Detect authentication patterns
+         auth_flow = self._detect_auth_pattern()
+         if auth_flow:
+             patterns.append(auth_flow)
+
+         return patterns
+
+     def _detect_crud_pattern(self) -> Optional[ExecutionFlow]:
+         """Detect CRUD (Create, Read, Update, Delete) patterns."""
+         crud_functions = []
+
+         for symbol in self.analysis.symbols:
+             name_lower = symbol.name.lower()
+             if any(crud_word in name_lower for crud_word in
+                    ['create', 'read', 'get', 'update', 'delete', 'save', 'find']):
+                 crud_functions.append(symbol.name)
+
+         if len(crud_functions) >= 3:  # At least 3 CRUD operations
+             return ExecutionFlow(
+                 name="crud_operations",
+                 entry_point="CRUD Operations",
+                 steps=crud_functions,
+                 files_involved=set(),
+                 description="CRUD operations detected in the codebase"
+             )
+
+         return None
+
+     def _detect_auth_pattern(self) -> Optional[ExecutionFlow]:
+         """Detect authentication/authorization patterns."""
+         auth_functions = []
+
+         for symbol in self.analysis.symbols:
+             name_lower = symbol.name.lower()
+             if any(auth_word in name_lower for auth_word in
+                    ['login', 'logout', 'authenticate', 'authorize', 'verify',
+                     'validate', 'token', 'session', 'permission']):
+                 auth_functions.append(symbol.name)
+
+         if len(auth_functions) >= 2:
+             return ExecutionFlow(
+                 name="authentication_flow",
+                 entry_point="Authentication System",
+                 steps=auth_functions,
+                 files_involved=set(),
+                 description="Authentication and authorization flow"
+             )
+
+         return None
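
The heart of `_trace_execution_flow` above is a depth-capped DFS over the call graph. This standalone sketch reproduces the same traversal against a toy `networkx` graph (function names invented for illustration) to show how the shared `visited` set breaks cycles and the depth cap bounds the walk:

import networkx as nx

graph = nx.DiGraph()
graph.add_edge("main", "load_config")
graph.add_edge("main", "run_server")
graph.add_edge("run_server", "handle_request")
graph.add_edge("handle_request", "run_server")  # cycle: guarded by `visited`

def trace(entry, max_depth=10):
    visited, steps = set(), []
    def dfs(func, depth=0):
        if depth > max_depth or func in visited:
            return
        visited.add(func)
        steps.append(func)
        for successor in graph.successors(func):
            dfs(successor, depth + 1)
    dfs(entry)
    return steps

print(trace("main"))  # ['main', 'load_config', 'run_server', 'handle_request']

As in the analyzer, the `visited` set is shared across branches, so a function reached on one branch is not re-expanded on another.
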
@@ -0,0 +1,130 @@
+ """Metrics and statistics analyzer."""
+
+ from pathlib import Path
+ from typing import List, Set, Dict
+
+ from ..models import CodebaseAnalysis
+
+
+ class MetricsAnalyzer:
+     """Analyzes codebase metrics and statistics."""
+
+     def __init__(self, analysis: CodebaseAnalysis):
+         self.analysis = analysis
+
+     def count_total_lines(self, files: List[Path]) -> int:
+         """Count total non-blank lines of code."""
+         total_lines = 0
+
+         for file_path in files:
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     total_lines += sum(1 for line in f if line.strip())
+             except (OSError, UnicodeDecodeError):
+                 continue
+
+         return total_lines
+
+     def detect_languages(self, files: List[Path]) -> Set[str]:
+         """Detect programming languages in the codebase."""
+         languages = set()
+
+         language_map = {
+             '.py': 'Python',
+             '.js': 'JavaScript',
+             '.jsx': 'JavaScript',
+             '.ts': 'TypeScript',
+             '.tsx': 'TypeScript',
+             '.java': 'Java',
+             '.cpp': 'C++',
+             '.c': 'C',
+             '.cs': 'C#',
+             '.go': 'Go',
+             '.rs': 'Rust',
+             '.php': 'PHP',
+             '.rb': 'Ruby',
+             '.swift': 'Swift',
+             '.kt': 'Kotlin',
+             '.scala': 'Scala'
+         }
+
+         for file_path in files:
+             lang = language_map.get(file_path.suffix)
+             if lang:
+                 languages.add(lang)
+
+         return languages
+
+     def calculate_complexity(self) -> float:
+         """Calculate a simple complexity score."""
+         if not self.analysis.symbols:
+             return 0.0
+
+         # Simple complexity based on:
+         # - Number of functions/classes
+         # - Number of call relationships
+         # - Depth of call chains
+
+         symbol_count = len(self.analysis.symbols)
+         call_count = len(self.analysis.call_relations)
+
+         # Normalize to 0-100 scale
+         base_complexity = min(100, (symbol_count + call_count) / 10)
+
+         # Adjust for call chain depth
+         max_chain_depth = self._calculate_max_call_depth()
+         depth_factor = min(2.0, max_chain_depth / 5)
+
+         return min(100.0, base_complexity * depth_factor)
+
+     def _calculate_max_call_depth(self) -> int:
+         """Calculate the maximum depth of call chains."""
+         # Build a simple call graph
+         call_graph = {}
+         for call in self.analysis.call_relations:
+             caller_name = call.caller_symbol.name
+             if caller_name not in call_graph:
+                 call_graph[caller_name] = []
+             call_graph[caller_name].append(call.callee_name)
+
+         # Find maximum depth using DFS
+         max_depth = 0
+
+         def dfs(func: str, visited: Set[str], depth: int) -> int:
+             if func in visited or func not in call_graph:
+                 return depth
+
+             visited.add(func)
+             local_max = depth
+
+             for callee in call_graph[func]:
+                 local_max = max(local_max, dfs(callee, visited.copy(), depth + 1))
+
+             return local_max
+
+         for func in call_graph:
+             max_depth = max(max_depth, dfs(func, set(), 0))
+
+         return max_depth
+
+     def get_file_statistics(self, files: List[Path]) -> Dict[str, int]:
+         """Get detailed file statistics."""
+         stats = {
+             'total_files': len(files),
+             'python_files': 0,
+             'javascript_files': 0,
+             'typescript_files': 0,
+             'other_files': 0
+         }
+
+         for file_path in files:
+             if file_path.suffix == '.py':
+                 stats['python_files'] += 1
+             elif file_path.suffix in ['.js', '.jsx']:
+                 stats['javascript_files'] += 1
+             elif file_path.suffix in ['.ts', '.tsx']:
+                 stats['typescript_files'] += 1
+             else:
+                 stats['other_files'] += 1
+
+         return stats
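
The scoring in `calculate_complexity` above is plain arithmetic, so a worked example with invented counts makes the scale concrete: 40 symbols and 110 call relations give a base of min(100, 150 / 10) = 15.0, and a maximum call depth of 4 gives a factor of min(2.0, 4 / 5) = 0.8, for a final score of 12.0.

# Worked example of the arithmetic in calculate_complexity; counts are invented.
symbol_count, call_count, max_chain_depth = 40, 110, 4

base_complexity = min(100, (symbol_count + call_count) / 10)  # 15.0
depth_factor = min(2.0, max_chain_depth / 5)                  # 0.8
score = min(100.0, base_complexity * depth_factor)            # 12.0
print(score)

Note that with no call relations at all, depth_factor is 0 and the score collapses to 0 regardless of symbol count; that behavior follows directly from the formula above.
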
@@ -0,0 +1 @@
+ """CLI module for codebase-digest."""