code2llm 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code2flow/__init__.py +47 -0
- code2flow/__main__.py +6 -0
- code2flow/analysis/__init__.py +23 -0
- code2flow/analysis/call_graph.py +210 -0
- code2flow/analysis/cfg.py +293 -0
- code2flow/analysis/coupling.py +77 -0
- code2flow/analysis/data_analysis.py +249 -0
- code2flow/analysis/dfg.py +224 -0
- code2flow/analysis/pipeline_detector.py +445 -0
- code2flow/analysis/side_effects.py +313 -0
- code2flow/analysis/smells.py +192 -0
- code2flow/analysis/type_inference.py +306 -0
- code2flow/cli.py +493 -0
- code2flow/core/__init__.py +36 -0
- code2flow/core/analyzer.py +765 -0
- code2flow/core/config.py +177 -0
- code2flow/core/models.py +194 -0
- code2flow/core/streaming_analyzer.py +666 -0
- code2flow/exporters/__init__.py +35 -0
- code2flow/exporters/base.py +13 -0
- code2flow/exporters/context_exporter.py +207 -0
- code2flow/exporters/flow_exporter.py +570 -0
- code2flow/exporters/json_exporter.py +17 -0
- code2flow/exporters/llm_exporter.py +12 -0
- code2flow/exporters/map_exporter.py +218 -0
- code2flow/exporters/mermaid_exporter.py +67 -0
- code2flow/exporters/toon.py +982 -0
- code2flow/exporters/yaml_exporter.py +108 -0
- code2flow/llm_flow_generator.py +451 -0
- code2flow/llm_task_generator.py +263 -0
- code2flow/mermaid_generator.py +481 -0
- code2flow/nlp/__init__.py +23 -0
- code2flow/nlp/config.py +174 -0
- code2flow/nlp/entity_resolution.py +326 -0
- code2flow/nlp/intent_matching.py +297 -0
- code2flow/nlp/normalization.py +122 -0
- code2flow/nlp/pipeline.py +388 -0
- code2flow/patterns/__init__.py +0 -0
- code2flow/patterns/detector.py +168 -0
- code2flow/refactor/__init__.py +0 -0
- code2flow/refactor/prompt_engine.py +150 -0
- code2flow/visualizers/__init__.py +0 -0
- code2flow/visualizers/graph.py +196 -0
- code2llm-0.3.7.dist-info/METADATA +604 -0
- code2llm-0.3.7.dist-info/RECORD +49 -0
- code2llm-0.3.7.dist-info/WHEEL +5 -0
- code2llm-0.3.7.dist-info/entry_points.txt +2 -0
- code2llm-0.3.7.dist-info/licenses/LICENSE +201 -0
- code2llm-0.3.7.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Data Analysis logic for code2flow - extracted from YAMLExporter."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
from ..core.models import AnalysisResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DataAnalyzer:
    """Analyze data flows, structures, and optimization opportunities.

    Every analysis here is a *naming heuristic*: functions are classified by
    substrings found in their lowercased names and docstrings (e.g. ``load_*``
    counts as an input stage), not by real type information.  Results are
    therefore advisory summaries intended for export, not ground truth.
    """

    @staticmethod
    def _module_of(qualified_name: str) -> str:
        """Return the module part of a dotted function name ('root' when undotted)."""
        return qualified_name.rsplit('.', 1)[0] if '.' in qualified_name else 'root'

    def analyze_data_flow(self, result: AnalysisResult) -> Dict[str, Any]:
        """Perform detailed data flow analysis.

        Returns a dict with pipeline, state-pattern, cross-module dependency
        and event-flow summaries derived from ``result.functions``.
        """
        return {
            'data_pipelines': self._find_data_pipelines(result),
            'state_patterns': self._find_state_patterns(result),
            'data_dependencies': self._find_data_dependencies(result),
            'event_flows': self._find_event_flows(result),
        }

    def analyze_data_structures(self, result: AnalysisResult) -> Dict[str, Any]:
        """Analyze data structures and optimization opportunities."""
        data_types = self._analyze_data_types(result)
        data_flow_graph = self._build_data_flow_graph(result)
        process_patterns = self._identify_process_patterns(result)
        # Pass the already-computed pattern list so the optimization step does
        # not have to run _identify_process_patterns a second time.
        optimization_analysis = self._analyze_optimization_opportunities(
            result, data_types, data_flow_graph, process_patterns)

        return {
            'data_types': data_types,
            'data_flow_graph': data_flow_graph,
            'process_patterns': process_patterns,
            'optimization_analysis': optimization_analysis,
        }

    def _find_data_pipelines(self, result: AnalysisResult) -> list:
        """Find input -> transform -> output call chains (capped at 15)."""
        pipelines = []
        input_indicators = ['parse', 'load', 'read', 'fetch', 'get', 'input', 'receive', 'extract']
        transform_indicators = ['transform', 'convert', 'process', 'validate', 'filter', 'map', 'reduce', 'compute']
        output_indicators = ['serialize', 'format', 'write', 'save', 'send', 'output', 'render', 'encode']

        input_funcs = []
        transform_funcs = []
        output_funcs = []

        # Bucket every function into at most one stage (first match wins).
        for func_name, func in result.functions.items():
            name_lower = func.name.lower()
            if any(ind in name_lower for ind in input_indicators):
                input_funcs.append((func_name, func))
            elif any(ind in name_lower for ind in transform_indicators):
                transform_funcs.append((func_name, func))
            elif any(ind in name_lower for ind in output_indicators):
                output_funcs.append((func_name, func))

        # Chain stages through the call graph; candidate lists are sliced to
        # bound the otherwise-cubic search.
        for in_name, in_func in input_funcs[:20]:
            for t_name, t_func in transform_funcs[:30]:
                if t_name in in_func.calls:
                    for out_name, out_func in output_funcs[:20]:
                        if out_name in t_func.calls:
                            pipelines.append({
                                'pipeline_id': f"pipeline_{len(pipelines)+1}",
                                'stages': [
                                    {'stage': 'input', 'function': in_name, 'description': in_func.docstring[:100] if in_func.docstring else 'N/A'},
                                    {'stage': 'transform', 'function': t_name, 'description': t_func.docstring[:100] if t_func.docstring else 'N/A'},
                                    {'stage': 'output', 'function': out_name, 'description': out_func.docstring[:100] if out_func.docstring else 'N/A'},
                                ],
                                'data_flow': f"{in_name} → {t_name} → {out_name}",
                            })
                            if len(pipelines) >= 15:
                                return pipelines
        return pipelines

    def _find_state_patterns(self, result: AnalysisResult) -> list:
        """Find state management patterns (capped at 20 entries)."""
        patterns = []
        state_indicators = ['state', 'status', 'mode', 'phase', 'lifecycle', 'session', 'context']
        transition_indicators = ['transition', 'change', 'update', 'set_state', 'switch']

        for func_name, func in result.functions.items():
            name_lower = func.name.lower()
            if any(ind in name_lower for ind in state_indicators + transition_indicators):
                affected_states = []
                # Only the first 10 callees are inspected to bound the work.
                for call in list(func.calls)[:10]:
                    call_func = result.functions.get(call)
                    if call_func and any(ind in call_func.name.lower() for ind in state_indicators):
                        affected_states.append(call)
                patterns.append({
                    'function': func_name,
                    'type': 'state_manager' if 'set' in name_lower or 'update' in name_lower else 'state_reader',
                    'affects_states': affected_states[:5],
                    'description': func.docstring[:150] if func.docstring else 'N/A',
                })
            if len(patterns) >= 20:
                break
        return patterns

    def _find_data_dependencies(self, result: AnalysisResult) -> list:
        """Find cross-module data dependencies, top 15 by call count."""
        module_data_flow = {}
        for func_name, func in result.functions.items():
            func_module = self._module_of(func_name)
            # Only the first 15 callees per function are considered.
            for called in list(func.calls)[:15]:
                called_module = self._module_of(called)
                if func_module != called_module and called in result.functions:
                    key = (func_module, called_module)
                    entry = module_data_flow.setdefault(key, {
                        'from_module': func_module,
                        'to_module': called_module,
                        'data_functions': [],
                        'call_count': 0,
                    })
                    entry['data_functions'].append({'caller': func_name, 'callee': called})
                    entry['call_count'] += 1
        deps = sorted(module_data_flow.values(), key=lambda x: x['call_count'], reverse=True)
        # Trim the per-edge function list after sorting so the count stays exact.
        for dep in deps:
            dep['data_functions'] = dep['data_functions'][:10]
        return deps[:15]

    def _find_event_flows(self, result: AnalysisResult) -> list:
        """Find event-driven patterns (capped at 20 entries)."""
        flows = []
        event_indicators = ['event', 'emit', 'trigger', 'notify', 'callback', 'handler', 'listen', 'subscribe']
        hook_indicators = ['hook', 'on_', 'before_', 'after_', 'pre_', 'post_']
        for func_name, func in result.functions.items():
            name_lower = func.name.lower()
            # Event indicators match anywhere in the name; hook indicators
            # must be prefixes.
            if any(ind in name_lower for ind in event_indicators) or any(name_lower.startswith(ind) for ind in hook_indicators):
                handlers = []
                for called in list(func.calls)[:10]:
                    called_func = result.functions.get(called)
                    if called_func and any(ind in called_func.name.lower() for ind in event_indicators + ['handle', 'process']):
                        handlers.append(called)
                flows.append({
                    'event_source': func_name,
                    'type': 'emitter' if 'emit' in name_lower or 'trigger' in name_lower else 'handler',
                    'handlers': handlers[:5],
                    'description': func.docstring[:150] if func.docstring else 'N/A',
                })
            if len(flows) >= 20:
                break
        return flows

    def _analyze_data_types(self, result: AnalysisResult) -> list:
        """Group functions by the data types their names/docstrings suggest."""
        data_types = {}
        type_indicators = {
            'list': ['list', 'array', 'items', 'elements', 'collection', 'sequence'],
            'dict': ['dict', 'map', 'mapping', 'key_value', 'record', 'object'],
            'str': ['string', 'text', 'content', 'message'],
            'int': ['int', 'count', 'index', 'number', 'id'],
            'float': ['float', 'decimal', 'score', 'probability'],
            'bool': ['bool', 'flag', 'is_', 'has_'],
            'tuple': ['tuple', 'pair'],
            'set': ['set', 'unique'],
        }
        for func_name, func in result.functions.items():
            name_lower = func.name.lower()
            doc = func.docstring.lower() if func.docstring else ''
            detected = [t for t, inds in type_indicators.items() if any(ind in name_lower or ind in doc for ind in inds)]
            params = self._infer_parameter_types(func)
            returns = self._infer_return_types(func)
            if detected or params or returns:
                # Functions sharing an identical combined type signature are
                # grouped under one entry; the first function's detected lists
                # seed the entry.
                type_key = ",".join(sorted(set(detected + params + returns)))
                if type_key not in data_types:
                    data_types[type_key] = {'type_name': type_key, 'detected_types': list(set(detected)), 'parameter_types': list(set(params)), 'return_types': list(set(returns)), 'functions': [], 'usage_count': 0, 'cross_module_usage': 0}
                data_types[type_key]['functions'].append(func_name)
                data_types[type_key]['usage_count'] += 1
                mod = self._module_of(func_name)
                # Count at most one cross-module call per function.
                for called in list(func.calls)[:10]:
                    if self._module_of(called) != mod:
                        data_types[type_key]['cross_module_usage'] += 1
                        break
        return sorted(data_types.values(), key=lambda x: x['usage_count'], reverse=True)

    def _infer_parameter_types(self, func) -> list:
        """Guess parameter types from substrings of the function name."""
        params = []
        name = func.name.lower()
        if 'list' in name or 'items' in name:
            params.append('list')
        if 'dict' in name or 'map' in name:
            params.append('dict')
        if 'text' in name or 'string' in name:
            params.append('str')
        if 'count' in name or 'index' in name:
            params.append('int')
        return params

    def _infer_return_types(self, func) -> list:
        """Guess return types from conventional function-name prefixes."""
        returns = []
        name = func.name.lower()
        if name.startswith(('get_', 'find_')):
            returns.append('dict')
        if name.startswith(('is_', 'has_')):
            returns.append('bool')
        if name.startswith(('count_', 'len_')):
            returns.append('int')
        if name.startswith(('list_', 'get_all_')):
            returns.append('list')
        return returns

    def _build_data_flow_graph(self, result: AnalysisResult) -> dict:
        """Build a node/edge graph of data flow between functions."""
        nodes = {}
        edges = []
        for func_name, func in result.functions.items():
            nodes[func_name] = {
                'id': func_name, 'name': func.name.split('.')[-1],
                'module': self._module_of(func_name),
                'data_types': self._get_function_data_types(func),
                'in_degree': len(func.called_by), 'out_degree': len(func.calls),
                # A "hub" is any function with more than 5 callers or callees.
                'is_hub': len(func.calls) > 5 or len(func.called_by) > 5,
            }
        for func_name, func in result.functions.items():
            # Edges only for resolved callees; first 15 calls per function.
            for called in list(func.calls)[:15]:
                if called in result.functions:
                    edges.append({'from': func_name, 'to': called, 'data_flow': True, 'weight': 1})
        return {'nodes': nodes, 'edges': edges, 'stats': {'total_nodes': len(nodes), 'total_edges': len(edges), 'hub_nodes': sum(1 for n in nodes.values() if n['is_hub'])}}

    def _get_function_data_types(self, func) -> list:
        """Guess the data types a function handles from its name and docstring."""
        types = []
        name = func.name.lower()
        if 'list' in name or 'items' in name:
            types.append('list')
        if 'dict' in name or 'map' in name:
            types.append('dict')
        if 'text' in name or 'string' in name:
            types.append('str')
        if 'count' in name or 'index' in name:
            types.append('int')
        if func.docstring:
            doc = func.docstring.lower()
            if 'list' in doc:
                types.append('list')
            if 'dict' in doc:
                types.append('dict')
            if 'string' in doc or 'text' in doc:
                types.append('str')
        return list(set(types))

    def _identify_process_patterns(self, result: AnalysisResult) -> list:
        """Classify functions into filter/map/reduce-style processing patterns."""
        patterns = {'filter': [], 'map': [], 'reduce': [], 'aggregate': [], 'transform': [], 'validate': []}
        indicators = {
            'filter': ['filter', 'select', 'where', 'find'], 'map': ['map', 'transform', 'process'],
            'reduce': ['reduce', 'sum', 'count', 'aggregate'], 'aggregate': ['group', 'bucket', 'cluster'],
            'transform': ['transform', 'convert', 'normalize'], 'validate': ['validate', 'check', 'verify'],
        }
        for func_name, func in result.functions.items():
            name = func.name.lower()
            doc = func.docstring.lower() if func.docstring else ''
            # First matching pattern type wins (dict insertion order).
            for p_type, inds in indicators.items():
                if any(ind in name or ind in doc for ind in inds):
                    patterns[p_type].append({'function': func_name, 'description': func.docstring[:100] if func.docstring else 'N/A', 'data_flow': f"{len(func.called_by)} → {func_name} → {len(func.calls)}"})
                    break
        res = []
        for p_type, funcs in patterns.items():
            res.append({'pattern_type': p_type, 'functions': funcs[:10], 'count': len(funcs)})
        return sorted(res, key=lambda x: x['count'], reverse=True)

    def _analyze_optimization_opportunities(self, result: AnalysisResult, data_types: list, dfg: dict, process_patterns: list = None) -> dict:
        """Score type/process/hub consolidation opportunities.

        ``process_patterns`` may be supplied to reuse an already-computed
        result of :meth:`_identify_process_patterns`; when ``None`` it is
        computed here (backward-compatible with the old 3-argument call).
        """
        opt = {'potential_score': 0.0, 'type_consolidation': [], 'process_consolidation': [], 'hub_optimization': [], 'recommendations': []}
        # Group data-type entries sharing an identical detected-type signature.
        similar = {}
        for dt in data_types:
            sig = ",".join(sorted(dt['detected_types']))
            similar.setdefault(sig, []).append(dt)
        for sig, sims in similar.items():
            if len(sims) > 1:
                usage = sum(s['usage_count'] for s in sims)
                # Only heavily-used duplicate signatures are worth reporting.
                if usage > 10:
                    opt['type_consolidation'].append({'type_signature': sig, 'similar_types': [s['type_name'] for s in sims], 'total_usage': usage, 'potential_reduction': len(sims) - 1})
        if process_patterns is None:
            process_patterns = self._identify_process_patterns(result)
        for p in process_patterns:
            if p['count'] > 5:
                opt['process_consolidation'].append({'pattern_type': p['pattern_type'], 'function_count': p['count'], 'potential_reduction': p['count'] // 3})
        for hub in [n for n in dfg['nodes'].values() if n['is_hub']][:10]:
            opt['hub_optimization'].append({'function': hub['id'], 'connections': hub['in_degree'] + hub['out_degree'], 'optimization_type': 'split' if hub['out_degree'] > 10 else 'cache'})
        # Weighted heuristic score normalised by 100.
        opt['potential_score'] = (len(opt['type_consolidation']) * 10 + len(opt['process_consolidation']) * 15 + len(opt['hub_optimization']) * 5) / 100.0
        return opt
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Data Flow Graph (DFG) extractor using AST."""
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Set, Dict, List
|
|
6
|
+
|
|
7
|
+
from ..core.config import Config
|
|
8
|
+
from ..core.models import AnalysisResult, FlowEdge, DataFlow, Mutation
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DFGExtractor(ast.NodeVisitor):
    """Extract Data Flow Graph from AST.

    Walks a module AST, recording per-variable dependency sets in
    ``result.data_flows`` and write events in ``result.mutations``.
    Variables are keyed by ``<scope>.<name>`` where scope is the module
    or enclosing function's qualified name.
    """

    def __init__(self, config: Config):
        self.config = config
        self.result = AnalysisResult()
        self.module_name = ""
        self.file_path = ""

        # Data flow tracking
        self.variable_defs: Dict[str, int] = {}  # scoped variable -> line where (last) defined
        self.variable_uses: Dict[str, List[int]] = defaultdict(list)  # scoped variable -> lines where used
        self.current_scope = ""
        self.scope_stack = []

    def extract(self, tree: ast.AST, module_name: str, file_path: str) -> AnalysisResult:
        """Extract DFG from AST.

        Resets all per-run state, so one extractor instance can safely be
        reused across files.
        """
        self.result = AnalysisResult()
        self.module_name = module_name
        self.file_path = file_path
        self.variable_defs = {}
        self.variable_uses = defaultdict(list)
        self.current_scope = module_name
        self.scope_stack = [module_name]

        self.visit(tree)
        self._build_data_flow_edges()

        return self.result

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Visit function definition, scoping its body under the function name."""
        func_name = f"{self.module_name}.{node.name}"
        self.scope_stack.append(func_name)
        self.current_scope = func_name

        # Only the body is visited; decorators/defaults stay unvisited.
        for stmt in node.body:
            self.visit(stmt)

        self.scope_stack.pop()
        self.current_scope = self.scope_stack[-1] if self.scope_stack else self.module_name

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        """Scope async function bodies exactly like sync ones.

        Without this handler, NodeVisitor falls back to generic_visit and
        assignments inside ``async def`` were wrongly attributed to the
        module scope.
        """
        self.visit_FunctionDef(node)

    def visit_Assign(self, node: ast.Assign):
        """Track variable assignments (definitions, dependencies, mutations)."""
        # Variables being written.
        targets = self._extract_targets(node.targets)

        # Names read on the right-hand side become dependencies.
        dependencies = self._extract_names(node.value)

        for target in targets:
            scoped_name = f"{self.current_scope}.{target}"

            # Create (or extend) the data-flow record for this variable.
            if scoped_name not in self.result.data_flows:
                self.result.data_flows[scoped_name] = DataFlow(
                    variable=target,
                    dependencies=set()
                )

            self.result.data_flows[scoped_name].dependencies.update(dependencies)

            # Record the definition site (later assignments overwrite it).
            self.variable_defs[scoped_name] = node.lineno

            # Record each RHS name as used at this line.  Dependencies are
            # unscoped names, so in practice the scoped key is used.
            for dep in dependencies:
                scoped_dep = f"{self.current_scope}.{dep}" if dep not in self.variable_defs else dep
                # variable_uses is a defaultdict(list): no membership check needed.
                self.variable_uses[scoped_dep].append(node.lineno)

        # Every assignment also counts as a mutation of each target.
        for target in targets:
            self.result.mutations.append(Mutation(
                variable=target,
                file=self.file_path,
                line=node.lineno,
                type="assign",
                scope=self.current_scope,
                context=self._expr_to_str(node.value)
            ))

        self.generic_visit(node)

    def visit_AugAssign(self, node: ast.AugAssign):
        """Track augmented assignments (+=, *=, etc.)."""
        target = self._expr_to_str(node.target)
        dependencies = self._extract_names(node.value)

        scoped_name = f"{self.current_scope}.{target}"

        if scoped_name not in self.result.data_flows:
            self.result.data_flows[scoped_name] = DataFlow(
                variable=target,
                dependencies=set()
            )

        # ``x += y`` both reads and writes x: x depends on itself and on y.
        self.result.data_flows[scoped_name].dependencies.add(target)
        self.result.data_flows[scoped_name].dependencies.update(dependencies)

        # Record as mutation
        self.result.mutations.append(Mutation(
            variable=target,
            file=self.file_path,
            line=node.lineno,
            type="aug_assign",
            scope=self.current_scope,
            context=self._expr_to_str(node)
        ))

        self.generic_visit(node)

    def visit_For(self, node: ast.For):
        """Track loop variable: it depends on every name read in the iterable."""
        if isinstance(node.target, ast.Name):
            loop_var = node.target.id
            scoped_name = f"{self.current_scope}.{loop_var}"

            iter_deps = self._extract_names(node.iter)

            if scoped_name not in self.result.data_flows:
                self.result.data_flows[scoped_name] = DataFlow(
                    variable=loop_var,
                    dependencies=set(iter_deps)
                )
            else:
                self.result.data_flows[scoped_name].dependencies.update(iter_deps)

        self.generic_visit(node)

    def visit_Call(self, node: ast.Call):
        """Track data flow through function calls."""
        # Each positional argument becomes an implicit flow "<callee>.argN".
        for i, arg in enumerate(node.args):
            deps = self._extract_names(arg)
            if deps:
                call_str = self._expr_to_str(node.func)
                flow_key = f"{call_str}.arg{i}"

                if flow_key not in self.result.data_flows:
                    self.result.data_flows[flow_key] = DataFlow(
                        variable=flow_key,
                        dependencies=deps
                    )
                else:
                    self.result.data_flows[flow_key].dependencies.update(deps)

        # Heuristic: a method whose name contains a mutating verb is assumed
        # to mutate its receiver object.
        if isinstance(node.func, ast.Attribute):
            method_name = node.func.attr
            if any(s in method_name.lower() for s in ['update', 'set', 'add', 'remove', 'append', 'extend', 'pop', 'clear']):
                obj_name = self._expr_to_str(node.func.value)
                self.result.mutations.append(Mutation(
                    variable=obj_name,
                    file=self.file_path,
                    line=node.lineno,
                    type="method_call",
                    scope=self.current_scope,
                    context=f"call to {method_name}"
                ))

        self.generic_visit(node)

    def _extract_targets(self, targets: List[ast.AST]) -> List[str]:
        """Extract variable names from assignment targets."""
        names = []
        for target in targets:
            names.extend(self._get_names(target))
        return names

    def _get_names(self, node: ast.AST) -> List[str]:
        """Get all variable names from an assignment-target AST node.

        Fixed: the previous implementation combined ``ast.walk`` (which
        already yields every nested Name) with explicit recursion into
        tuple/list elements, so ``a, b = ...`` yielded ``[a, b, a, b]`` and
        produced duplicate DataFlow/Mutation records.
        """
        if isinstance(node, ast.Name):
            return [node.id]
        if isinstance(node, (ast.Tuple, ast.List)):
            names: List[str] = []
            for elt in node.elts:
                names.extend(self._get_names(elt))
            return names
        # Fallback for attribute/subscript/starred targets: collect every
        # Name in the subtree exactly once.
        return [child.id for child in ast.walk(node) if isinstance(child, ast.Name)]

    def _extract_names(self, node: ast.AST) -> Set[str]:
        """Extract all variable names *read* (Load context) in an expression."""
        names = set()
        if node is None:
            return names

        for child in ast.walk(node):
            if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
                names.add(child.id)

        return names

    def _expr_to_str(self, node: ast.AST) -> str:
        """Convert AST expression to source-like string (best effort)."""
        if node is None:
            return "None"
        try:
            # ast.unparse exists on Python >= 3.9; fall back to repr otherwise.
            return ast.unparse(node) if hasattr(ast, 'unparse') else str(node)
        except Exception:
            # Narrowed from a bare except: never swallow KeyboardInterrupt/SystemExit.
            return str(node)

    def _build_data_flow_edges(self):
        """Build DFG edges from data flow records.

        Intentionally a no-op placeholder: edges are materialized during CFG
        extraction; a full implementation would map variables to node IDs here.
        """
        for var_name, data_flow in self.result.data_flows.items():
            pass  # Edges built during CFG extraction
|