codegraph-gen 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_gen/__init__.py +0 -0
- codegraph_gen/__main__.py +311 -0
- codegraph_gen/ai.py +77 -0
- codegraph_gen/analyzer.py +100 -0
- codegraph_gen/builder.py +747 -0
- codegraph_gen/cluster.py +116 -0
- codegraph_gen/config.py +76 -0
- codegraph_gen/detect.py +59 -0
- codegraph_gen/engine.py +367 -0
- codegraph_gen/parser/__init__.py +27 -0
- codegraph_gen/parser/base.py +38 -0
- codegraph_gen/parser/cpp.py +349 -0
- codegraph_gen/parser/go.py +268 -0
- codegraph_gen/parser/javascript.py +370 -0
- codegraph_gen/parser/kotlin.py +387 -0
- codegraph_gen/parser/python.py +415 -0
- codegraph_gen/parser/rust.py +497 -0
- codegraph_gen/parser/swift.py +327 -0
- codegraph_gen/py.typed +0 -0
- codegraph_gen/renderer.py +498 -0
- codegraph_gen/writer.py +97 -0
- codegraph_gen-0.2.0.dist-info/METADATA +169 -0
- codegraph_gen-0.2.0.dist-info/RECORD +25 -0
- codegraph_gen-0.2.0.dist-info/WHEEL +4 -0
- codegraph_gen-0.2.0.dist-info/entry_points.txt +4 -0
codegraph_gen/cluster.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import networkx as nx
|
|
3
|
+
from networkx.algorithms.community import louvain_communities
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect_components(
    G: nx.DiGraph,
) -> tuple[dict[int, list[str]], dict[int, float], dict[int, str]]:
    """
    Detects logical components in the graph using modularity clustering.

    The directed graph is collapsed into an undirected weighted graph
    (parallel/opposite edges have their weights summed) and partitioned with
    Louvain community detection using a fixed seed.

    Args:
        G: Directed graph whose edges may carry a "relation" attribute and
            whose nodes may carry "label" and "source_file" attributes.

    Returns:
        components: dict mapping component_id -> sorted list of node_ids
        cohesion_scores: dict mapping component_id -> cohesion density float
        component_names: dict mapping component_id -> human friendly name,
            guaranteed to be distinct across components

    Component ids start at 1 and are assigned by descending community size,
    ties broken by the sorted member ids.
    """
    import os
    from collections import Counter

    if G.number_of_nodes() == 0:
        return {}, {}, {}

    # Edge weight per relation; anything unknown counts like a plain call.
    relation_weights = {"contains": 10.0, "imports": 2.0, "calls": 1.0}

    # Convert to undirected weighted graph for Louvain community detection
    U = nx.Graph()
    U.add_nodes_from(G.nodes)
    for u, v, d in G.edges(data=True):
        weight = relation_weights.get(d.get("relation"), 1.0)
        if U.has_edge(u, v):
            U[u][v]["weight"] += weight
        else:
            U.add_edge(u, v, weight=weight)

    # Run Louvain community clustering with fixed seed for reproducibility
    communities = list(louvain_communities(U, weight="weight", seed=42))

    # Sort communities by size descending, breaking ties stably by sorted member IDs
    communities.sort(key=lambda s: (-len(s), sorted(s)))

    components = {}
    cohesion_scores = {}
    component_names = {}
    raw_components = []

    for idx, member_set in enumerate(communities, start=1):
        # Sets of strings iterate in a hash-dependent order; sort so the
        # member list is identical from run to run.
        members = sorted(member_set)
        components[idx] = members

        # Calculate cohesion: density of the induced subgraph
        cohesion_scores[idx] = round(nx.density(G.subgraph(members)), 2)

        # Name the component by its most central (highest degree) node,
        # breaking ties alphabetically by node ID.
        degrees = dict(G.degree(members))
        clean_name = ""
        if degrees:
            most_central_node = min(degrees, key=lambda n: (-degrees[n], n))
            node_label = G.nodes[most_central_node].get("label", most_central_node)
            # Remove trailing parens/extensions to make clean component name
            clean_name = str(node_label).replace("()", "").split(".")[0]
        if not clean_name:
            # No nodes, or a label such as ".env" whose stem is empty.
            clean_name = f"Component {idx}"

        # Find the longest common directory path of the members' source files
        dir_paths = []
        for m in members:
            sf = G.nodes[m].get("source_file")
            if sf:
                dir_path = os.path.dirname(sf)
                if dir_path:
                    dir_paths.append(dir_path)

        common_dir = ""
        if dir_paths:
            try:
                common_dir = os.path.commonpath(dir_paths)
            except ValueError:
                # Raised e.g. when absolute and relative paths are mixed.
                common_dir = ""
            if common_dir in (".", "", "/"):
                common_dir = ""

        raw_components.append((idx, clean_name, common_dir))

    # Count frequencies of candidate names to detect collisions
    name_counts = Counter(
        common_dir if common_dir else clean_name
        for _, clean_name, common_dir in raw_components
    )

    # Assign component names, qualifying the ones whose candidate collides
    for idx, clean_name, common_dir in raw_components:
        cand = common_dir if common_dir else clean_name
        if name_counts[cand] == 1:
            component_names[idx] = cand
        elif common_dir:
            component_names[idx] = f"{common_dir} ({clean_name})"
        else:
            component_names[idx] = f"{clean_name} (Component {idx})"

    # A qualified name can still clash (same directory AND same central
    # label). Disambiguate those with the component id so every component
    # ends up with a distinct name.
    final_counts = Counter(component_names.values())
    for idx, name in list(component_names.items()):
        if final_counts[name] > 1:
            component_names[idx] = f"{name} (Component {idx})"

    return components, cohesion_scores, component_names
|
codegraph_gen/config.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
from codegraph_gen.parser.base import ExtractionResult
|
|
5
|
+
|
|
6
|
+
# Default exclusions for files and directories we want to ignore
|
|
7
|
+
DEFAULT_EXCLUSIONS = {
    # Version control, editor and tool metadata
    ".git",
    ".idea",
    ".vscode",
    ".codegraph",
    ".cache",
    ".pytest_cache",
    "__pycache__",
    # Virtual environments and vendored dependencies
    ".venv",
    "venv",
    "node_modules",
    "third_party",
    "vendor",
    "Pods",
    "Carthage",
    # Build outputs
    "dist",
    "build",
    ".build",
    "build_output",
    "build_mac",
    "build_ios",
    "build_ios_sim",
    "DerivedData",
    "target",
    "out",
    "bin",
    "obj",
    ".next",
    ".nuxt",
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Mapping of supported languages to file extensions
|
|
40
|
+
LANGUAGE_EXTENSIONS = dict(
    python={".py"},
    javascript={".js", ".mjs", ".cjs"},
    typescript={".ts", ".tsx"},
    kotlin={".kt", ".kts"},
    go={".go"},
    rust={".rs"},
    swift={".swift"},
    c={".c", ".h"},
    cpp={".cpp", ".cc", ".cxx", ".hpp", ".hxx"},
)

# Every extension recognised for any supported language.
ALL_EXTENSIONS = set().union(*LANGUAGE_EXTENSIONS.values())
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class CacheEntry(BaseModel):
    """One cached parse result together with the file fingerprint it belongs to."""

    # Fingerprint of the file at the time it was parsed.
    mtime: float
    size: int
    hash: str

    # Extraction produced by the parser for that file.
    result: ExtractionResult
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CodegraphConfig(BaseModel):
    """Configuration class for codegraph parsing and exporting."""

    # Root directory that is scanned for source files.
    workspace_dir: Path
    # Where generated output goes; relative values are resolved against
    # workspace_dir by absolute_output_dir.
    output_dir: Path = Field(default_factory=lambda: Path(".codegraph"))
    # File/directory names to skip. The factory copies the module-level set so
    # that mutating one config's exclusions never leaks into
    # DEFAULT_EXCLUSIONS or into other config instances.
    exclusions: set[str] = Field(default_factory=lambda: set(DEFAULT_EXCLUSIONS))
    # Language names to process; defaults to every key of LANGUAGE_EXTENSIONS.
    languages: set[str] = Field(default_factory=lambda: set(LANGUAGE_EXTENSIONS))
    # Parser worker count; falls back to 4 when the CPU count is unknown.
    max_workers: int = Field(default_factory=lambda: os.cpu_count() or 4)
    # Whether cached parse results may be read and written.
    use_cache: bool = Field(default=True)

    @property
    def absolute_output_dir(self) -> Path:
        """Return output_dir as-is when absolute, otherwise joined onto workspace_dir."""
        if self.output_dir.is_absolute():
            return self.output_dir
        return self.workspace_dir / self.output_dir
|
codegraph_gen/detect.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from codegraph_gen.config import CodegraphConfig, LANGUAGE_EXTENSIONS
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def discover_files(config: CodegraphConfig) -> list[tuple[Path, str]]:
    """
    Recursively discovers source files in the workspace directory.
    Filters by allowed languages and ignores files/directories in exclusions.

    Directory entries are visited in sorted order so the result is
    deterministic across filesystems. Each real directory is scanned at most
    once, which prevents endless recursion through symlink cycles and
    duplicate results through aliased directories. Files whose resolved
    location lies outside the workspace (reached through a symlink) are
    skipped.

    Args:
        config: Provides workspace_dir, languages and exclusions.

    Returns:
        List of tuples: (absolute_file_path, language_name)
    """
    found_files: list[tuple[Path, str]] = []
    workspace = config.workspace_dir.resolve()

    # Map extension -> language (only for known, enabled languages)
    ext_to_lang = {
        ext: lang
        for lang in sorted(config.languages)
        if lang in LANGUAGE_EXTENSIONS
        for ext in LANGUAGE_EXTENSIONS[lang]
    }

    # Normalize exclusions to lowercase for case-insensitive matching
    exclusions_lower = {exc.lower() for exc in config.exclusions}

    # Resolved directories already scanned (guards against symlink loops).
    visited_dirs: set[Path] = set()

    def is_ignored(path: Path) -> bool:
        # A path is ignored when any component below the workspace root
        # matches an exclusion, or when it is not under the workspace at all.
        try:
            rel_parts = path.relative_to(workspace).parts
        except ValueError:
            return True
        return any(part.lower() in exclusions_lower for part in rel_parts)

    def scan_dir(directory: Path) -> None:
        try:
            real_dir = directory.resolve()
            if real_dir in visited_dirs:
                return
            visited_dirs.add(real_dir)

            for item in sorted(directory.iterdir()):
                if is_ignored(item):
                    continue
                if item.is_dir():
                    scan_dir(item)
                elif item.is_file():
                    ext = item.suffix.lower()
                    if ext not in ext_to_lang:
                        continue
                    resolved = item.resolve()
                    if not resolved.is_relative_to(workspace):
                        # Symlink target outside the workspace: it has no
                        # workspace-relative path, so it cannot be indexed.
                        logger.debug("Skipping file outside workspace: %s", item)
                        continue
                    found_files.append((resolved, ext_to_lang[ext]))
        except PermissionError:
            logger.warning("Permission denied: %s", directory)
        except Exception as e:
            # Best effort: a failure in one directory must not abort the scan.
            logger.error("Error scanning %s: %s", directory, e)

    scan_dir(workspace)
    return found_files
|
codegraph_gen/engine.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import json
|
|
3
|
+
import hashlib
|
|
4
|
+
import concurrent.futures
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
8
|
+
import networkx as nx
|
|
9
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
|
+
|
|
11
|
+
from codegraph_gen.config import CodegraphConfig, CacheEntry
|
|
12
|
+
from codegraph_gen.parser.base import ExtractionResult
|
|
13
|
+
from codegraph_gen.detect import discover_files
|
|
14
|
+
from codegraph_gen.parser import get_parser
|
|
15
|
+
from codegraph_gen.builder import build_graph
|
|
16
|
+
from codegraph_gen.cluster import detect_components
|
|
17
|
+
from codegraph_gen.analyzer import analyze_graph, AnalysisResult
|
|
18
|
+
from codegraph_gen.renderer import (
|
|
19
|
+
MarkdownRenderer,
|
|
20
|
+
get_node_filename,
|
|
21
|
+
get_component_filename,
|
|
22
|
+
)
|
|
23
|
+
from codegraph_gen.writer import VaultWriter
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_file_hash(path: Path) -> str:
    """Computes MD5 hash of a file.

    Args:
        path: File to fingerprint.

    Returns:
        The hexadecimal MD5 digest of the file's bytes, or "" when the file
        cannot be opened or read.
    """
    # The digest is only a change-detection fingerprint, so mark it as
    # non-security use; this keeps MD5 available on FIPS-restricted builds.
    hasher = hashlib.md5(usedforsecurity=False)
    try:
        with open(path, "rb") as f:
            # 64 KiB reads keep the number of read calls low on large files.
            while chunk := f.read(65536):
                hasher.update(chunk)
    except (OSError, ValueError):
        # OSError: missing/unreadable file or a directory;
        # ValueError: path contains an embedded null byte.
        return ""
    return hasher.hexdigest()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _parse_file_worker(
    file_path: Path, lang: str, workspace_dir: Path
) -> tuple[Path, Optional[ExtractionResult], Optional[str]]:
    """Parse one file and report the outcome without ever raising.

    Returns:
        (file_path, result, None) on success, or (file_path, None, message)
        on failure, where message holds the exception text followed by the
        formatted traceback.
    """
    try:
        # Imported inside the function, as in the original worker.
        from codegraph_gen.parser import get_parser

        extraction = get_parser(lang).parse_file(file_path, workspace_dir)
    except Exception as exc:
        import traceback

        return file_path, None, f"{exc}\n{traceback.format_exc()}"
    return file_path, extraction, None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PipelineStage(str, Enum):
    """Stage identifiers passed to the pipeline's progress callback."""

    # Listed in the order the pipeline reports them.
    DISCOVERING = "discovering"
    PARSING = "parsing"
    BUILDING = "building"
    CLUSTERING = "clustering"
    ANALYZING = "analyzing"
    RENDERING = "rendering"
    WRITING = "writing"
    COMPLETED = "completed"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class PipelineResult(BaseModel):
    """Everything a pipeline run produced, returned by CodegraphEngine.run_pipeline."""

    # nx.DiGraph is not a pydantic-native type, so arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The graph built from all extractions.
    graph: nx.DiGraph
    # Discovered files as (path, language) pairs.
    files: List[Tuple[Path, str]]
    # component_id -> member node ids
    components: Dict[int, List[str]]
    # component_id -> cohesion score
    cohesion_scores: Dict[int, float]
    # component_id -> display name
    component_names: Dict[int, str]
    # Result of the graph analysis stage.
    analysis: AnalysisResult
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class CodegraphEngine:
    """Runs the end-to-end codegraph pipeline for one configuration.

    The pipeline discovers source files, parses them (reusing cached
    extractions when the cache is enabled), builds the graph, clusters it into
    components, analyzes it, renders Markdown pages and writes them to disk.
    """

    # Headings that introduce a previously written AI insights section in
    # README.md; checked in this order, first match wins.
    _AI_INSIGHTS_MARKERS = (
        "## AI Architectural Insights",
        "## AI 架构深度洞察 (AI Architectural Insights)",
        "## AI 架构深度洞察",
    )

    def __init__(self, config: CodegraphConfig):
        self.config = config
        self.renderer = MarkdownRenderer(config.workspace_dir)
        self.writer = VaultWriter()

    def run_pipeline(
        self,
        progress_callback: Optional[
            Callable[[PipelineStage, Any, int, int], None]
        ] = None,
    ) -> PipelineResult:
        """
        Runs the full codegraph generation pipeline.

        Args:
            progress_callback: A function taking (stage, current_item, index, total)

        Returns:
            A PipelineResult with the graph, discovered files, components,
            cohesion scores, component names and analysis. When no supported
            files are found, an empty result is returned and nothing is
            written to disk.
        """
        logger.info("Starting codegraph engine pipeline...")

        def report(
            stage: PipelineStage, item: Any = None, index: int = 0, total: int = 0
        ) -> None:
            if progress_callback:
                progress_callback(stage, item, index, total)

        # 1. Discover files
        report(PipelineStage.DISCOVERING)
        files = discover_files(self.config)
        if not files:
            logger.warning("No supported files found.")
            report(PipelineStage.COMPLETED)
            return PipelineResult(
                graph=nx.DiGraph(),
                files=[],
                components={},
                cohesion_scores={},
                component_names={},
                analysis=AnalysisResult(god_nodes=[], cycles=[], inter_comp_deps={}),
            )

        # 2. Parse files (with caching and optional parallel processing)
        total_files = len(files)
        # discover_files returns resolved paths, so relative paths must be
        # computed against the resolved workspace; the raw configured value
        # may be relative or contain symlinks and would raise ValueError.
        workspace_root = self.config.workspace_dir.resolve()

        cache_path = self.config.absolute_output_dir / "cache.json"
        cache_entries = self._load_cache(cache_path)

        extractions = []
        files_to_parse = []
        new_cache_entries = {}

        for file_path, lang in files:
            try:
                rel_path = str(file_path.relative_to(workspace_root))
            except ValueError:
                # Not under the workspace: key the file by its full path
                # instead of aborting the whole run.
                rel_path = str(file_path)

            try:
                stat = file_path.stat()
                mtime = stat.st_mtime
                size = stat.st_size
                # The hash is only used for cache bookkeeping; skip reading
                # the file when the cache is disabled.
                file_hash = get_file_hash(file_path) if self.config.use_cache else ""

                # Check cache hit
                entry = cache_entries.get(rel_path)
                if (
                    entry is not None
                    and entry.mtime == mtime
                    and entry.size == size
                    and entry.hash == file_hash
                ):
                    extractions.append(entry.result)
                    new_cache_entries[rel_path] = entry
                    continue

                # Cache miss
                files_to_parse.append(
                    (file_path, lang, rel_path, mtime, size, file_hash)
                )
            except Exception as e:
                logger.error("Error accessing file metadata for %s: %s", file_path, e)
                # Fallback to parsing without cache metadata
                files_to_parse.append((file_path, lang, rel_path, 0.0, 0, ""))

        num_hits = total_files - len(files_to_parse)
        if num_hits > 0:
            logger.info(
                "Cache hit: %s / %s files loaded from cache.", num_hits, total_files
            )

        if not files_to_parse:
            report(PipelineStage.PARSING, None, total_files, total_files)
        else:
            parsed = self._parse_files(
                files_to_parse, workspace_root, num_hits, total_files, report
            )
            for rel_path, mtime, size, file_hash, result in parsed:
                extractions.append(result)
                # An empty hash means the fingerprint is unknown (or caching
                # is off), so the result must not be cached.
                if file_hash:
                    new_cache_entries[rel_path] = CacheEntry(
                        mtime=mtime, size=size, hash=file_hash, result=result
                    )

        # 3. Build graph
        report(PipelineStage.BUILDING)
        G = build_graph(extractions, self.config.workspace_dir)

        # 4. Component clustering
        report(PipelineStage.CLUSTERING)
        components, cohesion_scores, component_names = detect_components(G)

        # 5. Graph analysis
        report(PipelineStage.ANALYZING)
        analysis = analyze_graph(G, components)

        # 6. Render pages in memory
        report(PipelineStage.RENDERING)
        node_component_map = {}
        for cid, members in components.items():
            comp_name = component_names.get(cid, f"Component {cid}")
            for member in members:
                node_component_map[member] = comp_name

        rendered_nodes = {}
        for nid, ndata in G.nodes(data=True):
            rendered_nodes[get_node_filename(nid)] = self.renderer.render_node_page(
                nid, ndata, G, node_component_map
            )

        rendered_components = {}
        for cid, members in components.items():
            comp_name = component_names[cid]
            fname = get_component_filename(comp_name)
            rendered_components[fname] = self.renderer.render_component_page(
                cid,
                members,
                G,
                cohesion_scores[cid],
                comp_name,
                analysis.inter_comp_deps,
                component_names,
            )

        # Check if README already has AI Insights and preserve it
        ai_insights = self._read_existing_ai_insights(
            self.config.absolute_output_dir / "README.md"
        )

        readme_content = self.renderer.render_readme(
            G,
            components,
            cohesion_scores,
            component_names,
            analysis,
            ai_insights=ai_insights,
        )

        prompt_content = self.renderer.render_agent_prompt(
            G, components, cohesion_scores, component_names, analysis
        )

        # 7. Write vault to disk
        report(PipelineStage.WRITING)
        self.writer.write_vault(
            self.config.absolute_output_dir,
            rendered_nodes,
            rendered_components,
            readme_content,
            prompt_content,
        )

        # Write updated cache back to disk
        if self.config.use_cache:
            self._save_cache(cache_path, new_cache_entries)

        report(PipelineStage.COMPLETED)

        logger.info("Pipeline executed successfully.")
        return PipelineResult(
            graph=G,
            files=files,
            components=components,
            cohesion_scores=cohesion_scores,
            component_names=component_names,
            analysis=analysis,
        )

    def _load_cache(self, cache_path: Path) -> Dict[str, CacheEntry]:
        """Load cache entries from cache_path; returns {} when caching is off or no file exists."""
        entries: Dict[str, CacheEntry] = {}
        if not (self.config.use_cache and cache_path.exists()):
            return entries
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache_data = json.load(f)
            for key, raw_entry in cache_data.items():
                entries[key] = CacheEntry(**raw_entry)
            logger.info("Loaded %s cache entries.", len(entries))
        except Exception as e:
            # A broken cache only costs a re-parse; entries loaded before the
            # failure are still used.
            logger.warning("Could not load cache: %s", e)
        return entries

    def _save_cache(self, cache_path: Path, entries: Dict[str, CacheEntry]) -> None:
        """Write entries to cache_path as JSON; failures are logged, not raised."""
        try:
            self.config.absolute_output_dir.mkdir(parents=True, exist_ok=True)
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(
                    {key: entry.model_dump() for key, entry in entries.items()},
                    f,
                    indent=2,
                )
            logger.info("Saved %s cache entries.", len(entries))
        except Exception as e:
            logger.warning("Could not save cache: %s", e)

    def _parse_files(
        self,
        pending: List[Tuple[Path, str, str, float, int, str]],
        workspace_root: Path,
        num_hits: int,
        total_files: int,
        report: Callable[..., None],
    ) -> List[Tuple[str, float, int, str, ExtractionResult]]:
        """Parse the cache-miss files, in a process pool when worthwhile.

        Returns one (rel_path, mtime, size, file_hash, result) tuple per file
        that parsed successfully; failures are logged and skipped.
        """
        parsed: List[Tuple[str, float, int, str, ExtractionResult]] = []
        max_workers = self.config.max_workers

        if max_workers > 1 and len(pending) > 1:
            logger.info(
                "Parsing %s files in parallel with %s workers...",
                len(pending),
                max_workers,
            )
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=max_workers
            ) as executor:
                futures = {
                    executor.submit(
                        _parse_file_worker, file_path, lang, workspace_root
                    ): (file_path, rel_path, mtime, size, file_hash)
                    for file_path, lang, rel_path, mtime, size, file_hash in pending
                }

                for idx, future in enumerate(
                    concurrent.futures.as_completed(futures), start=1
                ):
                    file_path, rel_path, mtime, size, file_hash = futures[future]
                    report(
                        PipelineStage.PARSING, file_path, num_hits + idx, total_files
                    )
                    try:
                        _, result, err_msg = future.result()
                    except Exception as e:
                        logger.error("Failed to parse file %s: %s", file_path, e)
                        continue
                    if err_msg:
                        logger.error(
                            "Error parsing file %s in worker: %s", file_path, err_msg
                        )
                    elif result:
                        parsed.append((rel_path, mtime, size, file_hash, result))
        else:
            logger.info("Parsing %s files sequentially...", len(pending))
            for idx, (file_path, lang, rel_path, mtime, size, file_hash) in enumerate(
                pending, start=1
            ):
                report(PipelineStage.PARSING, file_path, num_hits + idx, total_files)
                try:
                    result = get_parser(lang).parse_file(file_path, workspace_root)
                except Exception as e:
                    logger.error("Error parsing file %s: %s", file_path, e)
                    continue
                parsed.append((rel_path, mtime, size, file_hash, result))

        return parsed

    def _read_existing_ai_insights(self, readme_path: Path) -> Optional[str]:
        """Return the text after the AI insights heading of an existing README, if any."""
        if not readme_path.exists():
            return None
        try:
            old_readme = readme_path.read_text(encoding="utf-8")
        except Exception as e:
            logger.warning(
                "Could not read existing README.md to preserve AI insights: %s", e
            )
            return None

        marker = next(
            (m for m in self._AI_INSIGHTS_MARKERS if m in old_readme), None
        )
        if marker is None:
            return None
        # Everything after the first occurrence of the heading is preserved.
        insights_text = old_readme.split(marker, 1)[1].strip()
        return insights_text or None
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from codegraph_gen.parser.base import BaseParser
|
|
2
|
+
from codegraph_gen.parser.python import PythonParser
|
|
3
|
+
from codegraph_gen.parser.javascript import JavaScriptParser
|
|
4
|
+
from codegraph_gen.parser.go import GoParser
|
|
5
|
+
from codegraph_gen.parser.rust import RustParser
|
|
6
|
+
from codegraph_gen.parser.swift import SwiftParser
|
|
7
|
+
from codegraph_gen.parser.cpp import CParser, CppParser
|
|
8
|
+
from codegraph_gen.parser.kotlin import KotlinParser
|
|
9
|
+
|
|
10
|
+
# Registry of language name -> parser class.
PARSERS: dict[str, type[BaseParser]] = dict(
    python=PythonParser,
    javascript=JavaScriptParser,
    typescript=JavaScriptParser,  # uses same tree-sitter parser
    go=GoParser,
    rust=RustParser,
    swift=SwiftParser,
    c=CParser,
    cpp=CppParser,
    kotlin=KotlinParser,
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_parser(language: str) -> BaseParser:
    """Create a fresh parser for language.

    Raises:
        ValueError: If language has no entry in PARSERS.
    """
    parser_cls = PARSERS.get(language)
    if parser_cls is None:
        raise ValueError(f"Unsupported language: {language}")
    return parser_cls()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NodeSchema(BaseModel):
    """A symbol extracted from a source file (one node of the code graph)."""

    # Unique identifier, e.g. "relative_path::symbol_name"
    id: str
    # Human readable name, e.g. "my_function"
    label: str
    # One of: 'file', 'class', 'function', 'method', 'struct', 'interface',
    # 'trait', 'protocol'
    type: str
    # Path relative to workspace
    source_file: str
    # First and last line of the symbol, both 1-indexed
    line_start: int
    line_end: int
    # Signature snippet
    signature: str
    # Docstring or comments
    docstring: str = ""
    # Maps local variable/parameter name to its type name
    local_bindings: dict[str, str] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EdgeSchema(BaseModel):
    """A directed relation between two nodes of the code graph."""

    # Source node ID
    source: str
    # Target node ID
    target: str
    # One of: 'contains', 'imports', 'calls', 'inherits', 'implements'
    relation: str
    # Maps local name to original symbol name
    import_map: dict[str, str] = {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExtractionResult(BaseModel):
    """Nodes and edges extracted from a single parsed file."""

    # Symbols found in the file.
    nodes: list[NodeSchema] = []
    # Relations found in the file.
    edges: list[EdgeSchema] = []
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class BaseParser(ABC):
    """Common interface every language-specific AST parser implements."""

    @abstractmethod
    def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
        """Parse file_path and return its symbols (nodes) and relations (edges).

        Args:
            file_path: The source file to parse.
            workspace_dir: The workspace root the file belongs to.
        """
|