code2llm 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. code2flow/__init__.py +47 -0
  2. code2flow/__main__.py +6 -0
  3. code2flow/analysis/__init__.py +23 -0
  4. code2flow/analysis/call_graph.py +210 -0
  5. code2flow/analysis/cfg.py +293 -0
  6. code2flow/analysis/coupling.py +77 -0
  7. code2flow/analysis/data_analysis.py +249 -0
  8. code2flow/analysis/dfg.py +224 -0
  9. code2flow/analysis/pipeline_detector.py +445 -0
  10. code2flow/analysis/side_effects.py +313 -0
  11. code2flow/analysis/smells.py +192 -0
  12. code2flow/analysis/type_inference.py +306 -0
  13. code2flow/cli.py +493 -0
  14. code2flow/core/__init__.py +36 -0
  15. code2flow/core/analyzer.py +765 -0
  16. code2flow/core/config.py +177 -0
  17. code2flow/core/models.py +194 -0
  18. code2flow/core/streaming_analyzer.py +666 -0
  19. code2flow/exporters/__init__.py +35 -0
  20. code2flow/exporters/base.py +13 -0
  21. code2flow/exporters/context_exporter.py +207 -0
  22. code2flow/exporters/flow_exporter.py +570 -0
  23. code2flow/exporters/json_exporter.py +17 -0
  24. code2flow/exporters/llm_exporter.py +12 -0
  25. code2flow/exporters/map_exporter.py +218 -0
  26. code2flow/exporters/mermaid_exporter.py +67 -0
  27. code2flow/exporters/toon.py +982 -0
  28. code2flow/exporters/yaml_exporter.py +108 -0
  29. code2flow/llm_flow_generator.py +451 -0
  30. code2flow/llm_task_generator.py +263 -0
  31. code2flow/mermaid_generator.py +481 -0
  32. code2flow/nlp/__init__.py +23 -0
  33. code2flow/nlp/config.py +174 -0
  34. code2flow/nlp/entity_resolution.py +326 -0
  35. code2flow/nlp/intent_matching.py +297 -0
  36. code2flow/nlp/normalization.py +122 -0
  37. code2flow/nlp/pipeline.py +388 -0
  38. code2flow/patterns/__init__.py +0 -0
  39. code2flow/patterns/detector.py +168 -0
  40. code2flow/refactor/__init__.py +0 -0
  41. code2flow/refactor/prompt_engine.py +150 -0
  42. code2flow/visualizers/__init__.py +0 -0
  43. code2flow/visualizers/graph.py +196 -0
  44. code2llm-0.3.7.dist-info/METADATA +604 -0
  45. code2llm-0.3.7.dist-info/RECORD +49 -0
  46. code2llm-0.3.7.dist-info/WHEEL +5 -0
  47. code2llm-0.3.7.dist-info/entry_points.txt +2 -0
  48. code2llm-0.3.7.dist-info/licenses/LICENSE +201 -0
  49. code2llm-0.3.7.dist-info/top_level.txt +1 -0
@@ -0,0 +1,445 @@
1
+ """Pipeline Detector — networkx-based pipeline auto-detection.
2
+
3
+ Uses call graph analysis with networkx to:
4
+ - Build a directed graph from function calls
5
+ - Find longest paths (pipeline candidates)
6
+ - Group pipelines by module domain (NLP, Analysis, Export, Refactor, etc.)
7
+ - Label entry/exit points
8
+ - Aggregate purity per pipeline using SideEffectDetector
9
+
10
+ Sprint 3 (v0.3.2): Replaces the custom DFS chain-tracing in FlowExporter.
11
+ """
12
+
13
+ import logging
14
+ from collections import defaultdict
15
+ from dataclasses import dataclass, field
16
+ from typing import Any, Dict, List, Optional, Set, Tuple
17
+
18
+ import networkx as nx
19
+
20
+ from ..core.models import AnalysisResult, FunctionInfo
21
+ from .side_effects import SideEffectDetector, SideEffectInfo
22
+ from .type_inference import TypeInferenceEngine
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Thresholds
27
+ MIN_PIPELINE_LENGTH = 3
28
+ MAX_PIPELINES = 12
29
+ CC_HIGH = 15
30
+
31
+ # Patterns to exclude from analysis
32
+ EXCLUDE_PATTERNS = frozenset({
33
+ 'venv', '.venv', 'env', '.env', 'publish-env', 'test-env',
34
+ 'site-packages', 'node_modules', '__pycache__', '.git',
35
+ 'dist', 'build', 'egg-info', '.tox', '.mypy_cache',
36
+ })
37
+
38
+ # Module-to-domain mapping heuristics
39
+ DOMAIN_KEYWORDS: Dict[str, List[str]] = {
40
+ "NLP": ["nlp", "natural", "language", "intent", "entity",
41
+ "query", "normalize", "tokenize", "match"],
42
+ "Analysis": ["analysis", "analyzer", "analyse", "analyze",
43
+ "metric", "complexity", "cfg", "dfg", "call_graph"],
44
+ "Export": ["export", "exporter", "render", "format", "output",
45
+ "toon", "mermaid", "json_export", "yaml_export"],
46
+ "Refactor": ["refactor", "smell", "suggest", "fix", "patch",
47
+ "template", "prompt", "engine"],
48
+ "Core": ["core", "config", "model", "base", "util", "helper"],
49
+ "IO": ["io", "file", "path", "read", "write", "load", "save",
50
+ "cache", "storage"],
51
+ }
52
+
53
+
54
+ @dataclass
55
+ class PipelineStage:
56
+ """A single stage in a detected pipeline."""
57
+ name: str
58
+ qualified_name: str
59
+ signature: str
60
+ cc: float
61
+ purity: str # pure | IO | cache | mutation
62
+ side_effect_summary: str
63
+ is_entry: bool = False
64
+ is_exit: bool = False
65
+
66
+
67
+ @dataclass
68
+ class Pipeline:
69
+ """A detected pipeline with stages, purity info, and domain."""
70
+ name: str
71
+ domain: str
72
+ stages: List[PipelineStage] = field(default_factory=list)
73
+ entry_point: str = ""
74
+ exit_point: str = ""
75
+ entry_type: str = "?"
76
+ exit_type: str = "?"
77
+ pure_count: int = 0
78
+ total_stages: int = 0
79
+ bottleneck: Optional[PipelineStage] = None
80
+ path_length: int = 0
81
+
82
+ @property
83
+ def purity_ratio(self) -> float:
84
+ return self.pure_count / self.total_stages if self.total_stages else 0.0
85
+
86
+ def to_dict(self) -> Dict[str, Any]:
87
+ return {
88
+ "name": self.name,
89
+ "domain": self.domain,
90
+ "stages": [
91
+ {
92
+ "name": s.name,
93
+ "qualified": s.qualified_name,
94
+ "signature": s.signature,
95
+ "cc": s.cc,
96
+ "purity": s.purity,
97
+ "is_entry": s.is_entry,
98
+ "is_exit": s.is_exit,
99
+ }
100
+ for s in self.stages
101
+ ],
102
+ "entry_point": self.entry_point,
103
+ "exit_point": self.exit_point,
104
+ "entry_type": self.entry_type,
105
+ "exit_type": self.exit_type,
106
+ "pure_count": self.pure_count,
107
+ "total_stages": self.total_stages,
108
+ "bottleneck": {
109
+ "name": self.bottleneck.name,
110
+ "cc": self.bottleneck.cc,
111
+ } if self.bottleneck else None,
112
+ }
113
+
114
+
115
+ class PipelineDetector:
116
+ """Detect pipelines in a codebase using networkx graph analysis.
117
+
118
+ Builds a call graph as a DiGraph, finds longest paths as pipeline
119
+ candidates, groups by module domain, and labels entry/exit points.
120
+ """
121
+
122
+ def __init__(
123
+ self,
124
+ type_engine: Optional[TypeInferenceEngine] = None,
125
+ side_effect_detector: Optional[SideEffectDetector] = None,
126
+ ):
127
+ self._type_engine = type_engine or TypeInferenceEngine()
128
+ self._se_detector = side_effect_detector or SideEffectDetector()
129
+
130
+ def detect(
131
+ self,
132
+ funcs: Dict[str, FunctionInfo],
133
+ se_info: Optional[Dict[str, SideEffectInfo]] = None,
134
+ ) -> List[Pipeline]:
135
+ """Detect pipelines from function call graph.
136
+
137
+ Args:
138
+ funcs: qualified_name -> FunctionInfo mapping (pre-filtered)
139
+ se_info: optional pre-computed side-effect info
140
+
141
+ Returns:
142
+ List of Pipeline objects sorted by path length desc.
143
+ """
144
+ if se_info is None:
145
+ se_info = self._se_detector.analyze_all(funcs)
146
+
147
+ # Build networkx DiGraph
148
+ graph = self._build_graph(funcs)
149
+ if graph.number_of_nodes() == 0:
150
+ return []
151
+
152
+ # Find pipeline candidates (longest paths in DAG)
153
+ paths = self._find_pipeline_paths(graph)
154
+
155
+ # Build Pipeline objects with stages, purity, domain
156
+ pipelines = self._build_pipelines(paths, funcs, se_info)
157
+
158
+ # Sort by path length desc
159
+ pipelines.sort(key=lambda p: p.path_length, reverse=True)
160
+
161
+ return pipelines[:MAX_PIPELINES]
162
+
163
+ # ------------------------------------------------------------------
164
+ # graph construction
165
+ # ------------------------------------------------------------------
166
+ def _build_graph(self, funcs: Dict[str, FunctionInfo]) -> nx.DiGraph:
167
+ """Build a directed graph from function call relationships."""
168
+ G = nx.DiGraph()
169
+
170
+ for qname, fi in funcs.items():
171
+ G.add_node(qname, module=fi.module, name=fi.name,
172
+ class_name=fi.class_name)
173
+
174
+ for qname, fi in funcs.items():
175
+ for callee in fi.calls:
176
+ resolved = self._resolve_callee(callee, funcs)
177
+ if resolved and resolved != qname: # no self-loops
178
+ G.add_edge(qname, resolved)
179
+
180
+ return G
181
+
182
+ # ------------------------------------------------------------------
183
+ # path finding
184
+ # ------------------------------------------------------------------
185
+ def _find_pipeline_paths(self, graph: nx.DiGraph) -> List[List[str]]:
186
+ """Find longest paths in the call graph as pipeline candidates.
187
+
188
+ Strategy:
189
+ 1. Find all source nodes (in-degree 0) as potential entry points
190
+ 2. Find all sink nodes (out-degree 0) as potential exit points
191
+ 3. For each source, find longest simple path to any sink
192
+ 4. Also consider longest paths in each weakly connected component
193
+ """
194
+ paths: List[List[str]] = []
195
+
196
+ # Get source nodes (in-degree 0) as potential pipeline entry points
197
+ sources = [n for n in graph.nodes() if graph.in_degree(n) == 0]
198
+
199
+ # If no natural sources, use nodes with low in-degree
200
+ if not sources:
201
+ sources = sorted(graph.nodes(),
202
+ key=lambda n: graph.in_degree(n))[:5]
203
+
204
+ # Try to find longest paths from each source
205
+ used_nodes: Set[str] = set()
206
+ for source in sources:
207
+ best_path = self._longest_path_from(graph, source, used_nodes)
208
+ if len(best_path) >= MIN_PIPELINE_LENGTH:
209
+ paths.append(best_path)
210
+ used_nodes.update(best_path)
211
+
212
+ # Also try: for each weakly connected component, find the longest path
213
+ for component in nx.weakly_connected_components(graph):
214
+ if len(component) < MIN_PIPELINE_LENGTH:
215
+ continue
216
+ # Skip if heavily overlapping with existing paths
217
+ overlap = len(component & used_nodes)
218
+ if overlap > len(component) * 0.5:
219
+ continue
220
+
221
+ subgraph = graph.subgraph(component)
222
+ path = self._longest_path_in_dag(subgraph)
223
+ if len(path) >= MIN_PIPELINE_LENGTH:
224
+ # Check overlap with existing paths
225
+ new_overlap = sum(1 for n in path if n in used_nodes)
226
+ if new_overlap <= len(path) * 0.5:
227
+ paths.append(path)
228
+ used_nodes.update(path)
229
+
230
+ return paths
231
+
232
+ def _longest_path_from(
233
+ self, graph: nx.DiGraph, source: str, used: Set[str]
234
+ ) -> List[str]:
235
+ """Find the longest simple path from a source node."""
236
+ best: List[str] = [source]
237
+
238
+ # BFS/DFS with depth limit for performance
239
+ stack: List[Tuple[str, List[str], Set[str]]] = [
240
+ (source, [source], {source})
241
+ ]
242
+ max_depth = 10
243
+
244
+ while stack:
245
+ current, path, visited = stack.pop()
246
+ if len(path) > len(best):
247
+ best = path
248
+
249
+ if len(path) >= max_depth:
250
+ continue
251
+
252
+ for successor in graph.successors(current):
253
+ if successor not in visited:
254
+ # Prefer nodes not yet used in other pipelines
255
+ stack.append((
256
+ successor,
257
+ path + [successor],
258
+ visited | {successor}
259
+ ))
260
+
261
+ return best
262
+
263
+ def _longest_path_in_dag(self, subgraph: nx.DiGraph) -> List[str]:
264
+ """Find the longest path in a DAG subgraph using networkx.
265
+
266
+ Falls back to DFS if the subgraph has cycles.
267
+ """
268
+ try:
269
+ # networkx dag_longest_path works on DAGs
270
+ return nx.dag_longest_path(subgraph)
271
+ except nx.NetworkXUnfeasible:
272
+ # Has cycles — fall back to finding longest simple path via DFS
273
+ sources = [n for n in subgraph.nodes()
274
+ if subgraph.in_degree(n) == 0]
275
+ if not sources:
276
+ sources = list(subgraph.nodes())[:3]
277
+
278
+ best: List[str] = []
279
+ for source in sources:
280
+ path = self._longest_path_from(subgraph, source, set())
281
+ if len(path) > len(best):
282
+ best = path
283
+ return best
284
+
285
+ # ------------------------------------------------------------------
286
+ # pipeline construction
287
+ # ------------------------------------------------------------------
288
+ def _build_pipelines(
289
+ self,
290
+ paths: List[List[str]],
291
+ funcs: Dict[str, FunctionInfo],
292
+ se_info: Dict[str, SideEffectInfo],
293
+ ) -> List[Pipeline]:
294
+ """Convert raw paths into Pipeline objects with full metadata."""
295
+ pipelines: List[Pipeline] = []
296
+
297
+ for path in paths:
298
+ stages = self._build_stages(path, funcs, se_info)
299
+ if not stages:
300
+ continue
301
+
302
+ domain = self._classify_domain(path, funcs)
303
+ name = self._derive_pipeline_name(path, funcs, domain)
304
+
305
+ # Entry/exit labeling
306
+ stages[0].is_entry = True
307
+ stages[-1].is_exit = True
308
+
309
+ # Purity aggregation
310
+ pure_count = sum(1 for s in stages if s.purity == "pure")
311
+ bottleneck = max(stages, key=lambda s: s.cc) if stages else None
312
+
313
+ # Entry/exit types
314
+ entry_type = self._get_entry_type(funcs.get(path[0]))
315
+ exit_type = self._get_exit_type(funcs.get(path[-1]))
316
+
317
+ pipeline = Pipeline(
318
+ name=name,
319
+ domain=domain,
320
+ stages=stages,
321
+ entry_point=path[0],
322
+ exit_point=path[-1],
323
+ entry_type=entry_type,
324
+ exit_type=exit_type,
325
+ pure_count=pure_count,
326
+ total_stages=len(stages),
327
+ bottleneck=bottleneck,
328
+ path_length=len(path),
329
+ )
330
+ pipelines.append(pipeline)
331
+
332
+ return pipelines
333
+
334
+ def _build_stages(
335
+ self, path: List[str],
336
+ funcs: Dict[str, FunctionInfo],
337
+ se_info: Dict[str, SideEffectInfo],
338
+ ) -> List[PipelineStage]:
339
+ """Build PipelineStage objects for each node in a path."""
340
+ stages: List[PipelineStage] = []
341
+ for qname in path:
342
+ fi = funcs.get(qname)
343
+ if not fi:
344
+ continue
345
+ cc = fi.complexity.get("cyclomatic_complexity", 0)
346
+ se = se_info.get(qname)
347
+ purity = se.classification if se else "pure"
348
+ se_summary = se.side_effect_summary if se else "pure"
349
+ sig = self._type_engine.get_typed_signature(fi)
350
+
351
+ stages.append(PipelineStage(
352
+ name=fi.name,
353
+ qualified_name=qname,
354
+ signature=sig,
355
+ cc=cc,
356
+ purity=purity,
357
+ side_effect_summary=se_summary,
358
+ ))
359
+ return stages
360
+
361
+ # ------------------------------------------------------------------
362
+ # domain classification
363
+ # ------------------------------------------------------------------
364
+ def _classify_domain(
365
+ self, path: List[str], funcs: Dict[str, FunctionInfo]
366
+ ) -> str:
367
+ """Classify pipeline domain by analyzing module names and function names."""
368
+ scores: Dict[str, int] = defaultdict(int)
369
+
370
+ for qname in path:
371
+ fi = funcs.get(qname)
372
+ if not fi:
373
+ continue
374
+ text = f"{fi.module} {fi.name}".lower()
375
+ for domain, keywords in DOMAIN_KEYWORDS.items():
376
+ for kw in keywords:
377
+ if kw in text:
378
+ scores[domain] += 1
379
+
380
+ if scores:
381
+ return max(scores, key=scores.get)
382
+ return "Unknown"
383
+
384
+ def _derive_pipeline_name(
385
+ self, path: List[str],
386
+ funcs: Dict[str, FunctionInfo],
387
+ domain: str,
388
+ ) -> str:
389
+ """Derive a human-readable pipeline name."""
390
+ # Use the dominant sub-module name
391
+ module_counts: Dict[str, int] = defaultdict(int)
392
+ for qname in path:
393
+ fi = funcs.get(qname)
394
+ if fi:
395
+ parts = fi.module.split(".")
396
+ # Use most specific module component
397
+ for part in parts:
398
+ if part and part not in ("code2flow", "__init__"):
399
+ module_counts[part] += 1
400
+
401
+ if module_counts:
402
+ dominant = max(module_counts, key=module_counts.get)
403
+ # Capitalize and use domain if module name is generic
404
+ if dominant in ("core", "base", "utils", "helpers"):
405
+ return domain
406
+ return dominant.capitalize()
407
+
408
+ return domain
409
+
410
+ # ------------------------------------------------------------------
411
+ # type helpers
412
+ # ------------------------------------------------------------------
413
+ def _get_entry_type(self, fi: Optional[FunctionInfo]) -> str:
414
+ """Get the input type of a pipeline's entry point."""
415
+ if not fi:
416
+ return "?"
417
+ args = self._type_engine.get_arg_types(fi)
418
+ for arg in args:
419
+ if arg["name"] == "self":
420
+ continue
421
+ if arg.get("type"):
422
+ return arg["type"]
423
+ return arg["name"]
424
+ return "?"
425
+
426
+ def _get_exit_type(self, fi: Optional[FunctionInfo]) -> str:
427
+ """Get the output type of a pipeline's exit point."""
428
+ if not fi:
429
+ return "?"
430
+ ret = self._type_engine.get_return_type(fi)
431
+ return ret if ret else "?"
432
+
433
+ # ------------------------------------------------------------------
434
+ # callee resolution
435
+ # ------------------------------------------------------------------
436
+ def _resolve_callee(
437
+ self, callee: str, funcs: Dict[str, FunctionInfo]
438
+ ) -> Optional[str]:
439
+ """Resolve callee name to qualified name."""
440
+ if callee in funcs:
441
+ return callee
442
+ for qname in funcs:
443
+ if qname.endswith(f".{callee}"):
444
+ return qname
445
+ return None