codegraph-cli-ai 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/PKG-INFO +4 -1
- codegraph_cli_ai-0.1.9/codegraph/cli.py +279 -0
- codegraph_cli_ai-0.1.9/codegraph/graph/builder.py +190 -0
- codegraph_cli_ai-0.1.9/codegraph/parsers/database_parser.py +25 -0
- codegraph_cli_ai-0.1.9/codegraph/parsers/image_parser.py +39 -0
- codegraph_cli_ai-0.1.9/codegraph/parsers/multimodal_parser.py +86 -0
- codegraph_cli_ai-0.1.9/codegraph/parsers/pdf_parser.py +43 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph/parsers/python_parser.py +13 -1
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/PKG-INFO +4 -1
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/SOURCES.txt +4 -0
- codegraph_cli_ai-0.1.9/codegraph_cli_ai.egg-info/requires.txt +6 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/pyproject.toml +5 -2
- codegraph_cli_ai-0.1.7/codegraph/cli.py +0 -454
- codegraph_cli_ai-0.1.7/codegraph/graph/builder.py +0 -110
- codegraph_cli_ai-0.1.7/codegraph_cli_ai.egg-info/requires.txt +0 -3
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/MANIFEST.in +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/README.md +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/dependency_links.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/entry_points.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/codegraph_cli_ai.egg-info/top_level.txt +0 -0
- {codegraph_cli_ai-0.1.7 → codegraph_cli_ai-0.1.9}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codegraph-cli-ai
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: CLI tool to analyze codebases and visualize knowledge graphs using AST
|
|
5
5
|
Author: Aditya Jogdand
|
|
6
6
|
License: MIT
|
|
@@ -15,6 +15,9 @@ Description-Content-Type: text/markdown
|
|
|
15
15
|
Requires-Dist: typer>=0.9.0
|
|
16
16
|
Requires-Dist: networkx>=3.0
|
|
17
17
|
Requires-Dist: pyvis>=0.3.2
|
|
18
|
+
Requires-Dist: pypdf>=3.0.0
|
|
19
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
20
|
+
Requires-Dist: Pillow>=9.0.0
|
|
18
21
|
|
|
19
22
|
# CodeGraph AI
|
|
20
23
|
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CodeGraph AI - CLI Entry Point
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import typer
|
|
7
|
+
import webbrowser
|
|
8
|
+
import networkx as nx
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional, List, Set, Dict
|
|
12
|
+
from collections import deque
|
|
13
|
+
from codegraph.parsers.python_parser import PythonParser
|
|
14
|
+
from codegraph.parsers.multimodal_parser import MultiModalParser
|
|
15
|
+
from codegraph.graph.builder import GraphBuilder
|
|
16
|
+
|
|
17
|
+
# Root Typer application; subcommands attach via the @app.command() decorators below.
app = typer.Typer()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@app.callback()
def main():
    """CodeGraph AI - Understand your codebase using graphs and AI."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Dynamic Ignore Defaults
# Exact filenames skipped during indexing (our own generated output and OS noise).
IGNORE_FILES = {"graph.json", ".DS_Store"}
# File suffixes skipped during indexing (compiled bytecode and log files).
IGNORE_EXTENSIONS = {".pyc", ".log", ".pyo", ".pyd"}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def is_virtualenv(path: Path) -> bool:
    """Return True if *path* looks like a Python virtual environment.

    Detection is purely structural: the directory is a venv if it carries a
    ``pyvenv.cfg`` or a platform-specific interpreter/pip layout.
    """
    if not path.is_dir():
        return False
    markers = (
        path / "pyvenv.cfg",
        path / "bin" / "python",
        path / "Scripts" / "python.exe",
        path / "bin" / "pip",
    )
    return any(marker.exists() for marker in markers)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def should_ignore_dir(name: str, path: Path, root: Path) -> bool:
    """Heuristic-based dynamic directory ignoring.

    Skips hidden directories (except the scan root itself), dunder
    directories such as ``__pycache__``, well-known build/vendor folders,
    and anything that structurally looks like a virtualenv.
    """
    hidden = name.startswith(".") and path != root
    dunder = name.startswith("__")
    well_known = name in {"node_modules", "bin", "Scripts", "lib", "obj", "target", "build", "dist"}
    return hidden or dunder or well_known or is_virtualenv(path)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@app.command()
def index(
    path: str = typer.Argument(".", help="Path to the repo or folder to index")
):
    """Scan a directory, parse all Python files, and save the knowledge graph.

    Walks *path* recursively (pruning virtualenvs, hidden/build directories,
    and known noise files), parses Python sources and supported assets
    (CSV/JSON/SQLite/PDF/images), builds the knowledge graph, and writes it
    to ``<path>/.codegraph/graph.json``. Exits with code 1 if *path* does
    not exist.
    """
    root = Path(path).resolve()

    if not root.exists():
        typer.echo(f"[error] Path does not exist: {root}", err=True)
        raise typer.Exit(code=1)

    typer.echo(f"Indexing: {root}\n")

    all_files = []

    for dirpath, dirnames, filenames in os.walk(root):
        dpath = Path(dirpath)
        # Prune ignored directories in place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if not should_ignore_dir(d, dpath / d, root)]

        for f in filenames:
            if f in IGNORE_FILES:
                continue
            p = dpath / f
            if p.suffix.lower() in IGNORE_EXTENSIONS:
                continue
            all_files.append(p)

    py_files = [f for f in all_files if f.suffix == ".py"]
    asset_exts = {".csv", ".json", ".db", ".sqlite", ".pdf", ".png", ".jpg", ".jpeg"}
    asset_files = [f for f in all_files if f.suffix.lower() in asset_exts]

    if not py_files and not asset_files:
        typer.echo("No supported files found (everything might be ignored).")
        raise typer.Exit()

    typer.echo(f"Found {len(py_files)} Python file(s) and {len(asset_files)} asset(s)\n")

    py_parser = PythonParser()
    mm_parser = MultiModalParser()
    parsed_files = []
    parsed_assets = []
    failed_files = []  # (filepath, [error, ...]) pairs for reporting below

    for filepath in py_files:
        result = py_parser.parse_file(str(filepath))
        if result.errors:
            failed_files.append((str(filepath), result.errors))
        else:
            typer.echo(f" ✔ [code] {filepath.relative_to(root)}")
            parsed_files.append(result)

    for filepath in asset_files:
        try:
            asset = mm_parser.parse(str(filepath))
            typer.echo(f" ✔ [asset] {filepath.relative_to(root)}")
            parsed_assets.append(asset)
        except Exception as e:
            failed_files.append((str(filepath), [str(e)]))

    # FIX: failures were collected but never shown, silently dropping files
    # from the graph. Report them so the user knows what was skipped.
    for fname, errors in failed_files:
        typer.echo(f" ✘ [failed] {fname}: {'; '.join(errors)}", err=True)

    typer.echo("\nBuilding graph...")
    builder = GraphBuilder()
    builder.build(parsed_files, parsed_assets)
    summary = builder.summary()

    output_dir = root / ".codegraph"
    output_dir.mkdir(exist_ok=True)
    output_file = output_dir / "graph.json"
    with output_file.open("w", encoding="utf-8") as fp:
        json.dump(builder.to_dict(), fp, indent=2)

    typer.echo("\n" + "=" * 50)
    typer.echo("Index complete")
    typer.echo(f" Graph saved : {output_file}")
    typer.echo(f" Graph nodes : {summary['total_nodes']}")
    typer.echo(f" Graph edges : {summary['total_edges']}")
    if failed_files:
        typer.echo(f" Failed files: {len(failed_files)}")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@app.command()
def plot(
    hide_external: bool = typer.Option(False, "--hide-external", help="Hide external/stdlib nodes"),
    level: Optional[str] = typer.Option(None, "--level", help="Show only: file, function, class, method"),
    focus: Optional[str] = typer.Option(None, "--focus", help="Focus on a specific file"),
    edge_type: Optional[str] = typer.Option(None, "--edge-type", help="Filter edges"),
):
    """Visualize the knowledge graph as a premium interactive HTML file."""
    # The graph is always loaded from the current working directory, not
    # from the path given to `index` — the two commands must be run from
    # the same directory.
    root = Path(".").resolve()
    graph_file = root / ".codegraph" / "graph.json"

    if not graph_file.exists():
        typer.echo("[error] No graph found. Run 'codegraph index' first.", err=True)
        raise typer.Exit(code=1)

    typer.echo(f"Loading graph from {graph_file}...")
    with graph_file.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Rebuild the DiGraph from the serialized node/edge dicts produced by
    # GraphBuilder.to_dict().
    G_orig = nx.DiGraph()
    for node in data.get("nodes", []):
        G_orig.add_node(node["id"], **{k: v for k, v in node.items() if k != "id"})
    for edge in data.get("edges", []):
        G_orig.add_edge(edge["source"], edge["target"], **{k: v for k, v in edge.items() if k not in ["source", "target"]})

    # Filters below mutate G; keep the loaded graph intact in G_orig.
    G = G_orig.copy()

    if hide_external:
        remove = [n for n, d in G.nodes(data=True) if d.get("external", False)]
        G.remove_nodes_from(remove)

    if level:
        # Each user-facing level name expands to one or more node "kind" values.
        level_map = {
            "file": {"file", "dataset", "database", "document", "image"},
            "function": {"function", "method"},
            "class": {"class"}
        }
        # --level accepts a comma-separated list, e.g. "file,class".
        requested_levels = level.split(",")
        allowed_kinds = set()
        for r in requested_levels:
            if r in level_map:
                allowed_kinds.update(level_map[r])
            else:
                # Unknown names are treated as raw node kinds.
                allowed_kinds.add(r)

        # NOTE(review): collapsing (edge contraction through hidden nodes) is
        # only applied for file/document views with fewer than three requested
        # levels; other selections just drop non-matching nodes. Confirm this
        # heuristic is intended.
        if ("file" in requested_levels or "document" in allowed_kinds) and len(requested_levels) < 3:
            G = _collapse_to_level(G, allowed_kinds)
        else:
            remove = [n for n, d in G.nodes(data=True) if d.get("kind") not in allowed_kinds]
            G.remove_nodes_from(remove)

    if focus:
        # Keep only the focused file node plus its direct neighbors.
        # Node ids for files are "file:<basename>" (see GraphBuilder).
        focus_id = f"file:{focus}"
        if focus_id in G:
            keep = {focus_id} | set(G.successors(focus_id)) | set(G.predecessors(focus_id))
            remove = [n for n in G.nodes if n not in keep]
            G.remove_nodes_from(remove)

    if edge_type and edge_type != "all":
        # --edge-type also accepts a comma-separated list of relations.
        allowed_edges = set(edge_type.split(","))
        remove_edges = [(s, t) for s, t, d in G.edges(data=True) if d.get("relation") not in allowed_edges]
        G.remove_edges_from(remove_edges)
        # Drop nodes left with no edges after the relation filter.
        G.remove_nodes_from(list(nx.isolates(G)))

    typer.echo(f"Rendering {G.number_of_nodes()} nodes, {G.number_of_edges()} edges...")

    html = _build_premium_html(G)
    output_path = root / "graph.html"
    output_path.write_text(html, encoding="utf-8")
    typer.echo(f"Saved to: {output_path}")
    webbrowser.open(f"file://{output_path.resolve()}")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _collapse_to_level(G: nx.DiGraph, allowed_kinds: set) -> nx.DiGraph:
    """Contract the graph using depth-limited BFS for high-performance connectivity.

    Returns a new graph containing only nodes whose "kind" is in
    *allowed_kinds*; two kept nodes are connected when the second is
    reachable from the first through a chain of at most 5 hidden
    (non-kept) nodes in the original graph.
    """
    new_G = nx.DiGraph()
    keep_nodes = [n for n, d in G.nodes(data=True) if d.get("kind") in allowed_kinds]

    # Pre-populate all nodes
    for n in keep_nodes:
        new_G.add_node(n, **G.nodes[n])

    # For each source node, find reachable target nodes within 5 steps
    for start_node in keep_nodes:
        # Standard BFS with depth tracking
        queue = deque([(start_node, 0)])
        visited = {start_node}

        while queue:
            current, depth = queue.popleft()
            # Depth cap keeps the contraction O(keep_nodes * local neighborhood).
            if depth >= 5: continue

            for neighbor in G.successors(current):
                if neighbor in visited: continue
                visited.add(neighbor)

                if neighbor in new_G:
                    # Found a connection to another keep_node!
                    # Extract relation: if it's a multi-hop, use a representative one
                    # NOTE(review): for multi-hop paths this records only the
                    # relation of the final hop into `neighbor`, not the chain.
                    edge_data = G.get_edge_data(current, neighbor)
                    # For simplicity, we just use the first edge's relation or a default
                    rel = edge_data.get("relation", "calls")
                    new_G.add_edge(start_node, neighbor, relation=rel)
                    # Stop searching down this branch once we hit a keep_node
                    continue
                else:
                    # Keep searching through non-keep nodes
                    queue.append((neighbor, depth + 1))

    return new_G
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _build_premium_html(G: nx.DiGraph) -> str:
    """Generate a self-contained premium HTML visualization.

    Serializes the graph's nodes and edges into vis-network DataSet
    literals (JSON-escaped via json.dumps) and embeds them in a dark-themed
    single-page template. The vis-network library is loaded from a CDN.
    """
    node_entries = [
        f"{{id: {json.dumps(node_id)}, label: {json.dumps(attrs.get('label', node_id))}}}"
        for node_id, attrs in G.nodes(data=True)
    ]
    edge_entries = [
        f"{{from: {json.dumps(src)}, to: {json.dumps(dst)}, label: {json.dumps(attrs.get('relation', ''))}, arrows: 'to'}}"
        for src, dst, attrs in G.edges(data=True)
    ]

    nodes_str = ','.join(node_entries)
    edges_str = ','.join(edge_entries)

    return f"""<!DOCTYPE html>
<html>
<head>
<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
<style>body{{background:#0F1117;color:#fff;margin:0;overflow:hidden;}}#graph{{height:100vh;}}</style>
</head>
<body>
<div id="graph"></div>
<script>
const nodes = new vis.DataSet([{nodes_str}]);
const edges = new vis.DataSet([{edges_str}]);
new vis.Network(document.getElementById('graph'), {{nodes, edges}}, {{physics:{{solver:'forceAtlas2Based'}} }});
</script>
</body>
</html>"""
|
|
272
|
+
|
|
273
|
+
@app.command()
def ask(query: str = typer.Argument(..., help="Question about your codebase")):
    # Placeholder: natural-language Q&A over the graph is not implemented yet.
    # (Kept as a plain comment — a docstring here would change `--help` output.)
    typer.echo("ask: coming soon")
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# Allow running the module directly (e.g. `python -m codegraph.cli`) in
# addition to the installed console-script entry point.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graph Builder for CodeGraph AI
|
|
3
|
+
|
|
4
|
+
Node types:
|
|
5
|
+
- file : a .py file
|
|
6
|
+
- function : top-level function
|
|
7
|
+
- class : a class
|
|
8
|
+
- method : a method belonging to a class
|
|
9
|
+
- module : an imported module/package
|
|
10
|
+
|
|
11
|
+
Edge types:
|
|
12
|
+
- contains : file → function, file → class, class → method
|
|
13
|
+
- calls : function/method → function/method
|
|
14
|
+
- imports : file → module
|
|
15
|
+
- defined_in : function/method → file
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import builtins
|
|
19
|
+
import networkx as nx
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from codegraph.parsers.python_parser import ParsedFile
|
|
22
|
+
from codegraph.parsers.multimodal_parser import ParsedAsset
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Names of all Python builtins; calls to these are excluded from the call graph.
BUILTIN_FUNCTIONS = set(dir(builtins))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class GraphBuilder:
    """Builds a directed knowledge graph from parsed Python files and assets.

    Node ids follow fixed formats: ``file:<basename>``,
    ``class:<file_id>:<name>``, ``func:<file_id>:<name>``,
    ``module:<name>``, and ``<asset.kind>:<basename>`` for assets.
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        # Maps a function/method name to the id of the file node defining it,
        # used to resolve cross-file call targets.
        # NOTE(review): same-named functions in different files overwrite each
        # other here; the last file parsed wins.
        self._function_to_file: dict[str, str] = {}
        self._assets: list[ParsedAsset] = []
        self._parsed_files: list[ParsedFile] = []

    def add_file(self, parsed: ParsedFile) -> None:
        """Add one parsed Python file: its classes, functions, methods, imports, and calls."""
        self._parsed_files.append(parsed)
        file_id = self._file_node_id(parsed.filepath)
        filename = Path(parsed.filepath).name

        self._add_node(file_id, kind="file", label=filename, external=False)

        for cls in parsed.classes:
            cls_id = f"class:{file_id}:{cls}"
            self._add_node(cls_id, kind="class", label=cls, external=False, file=file_id)
            self._add_edge(file_id, cls_id, relation="contains")

        for func in parsed.functions:
            func_id = f"func:{file_id}:{func}"
            self._add_node(func_id, kind="function", label=func, external=False, file=file_id)
            self._add_edge(file_id, func_id, relation="contains")
            self._add_edge(func_id, file_id, relation="defined_in")
            self._function_to_file[func] = file_id

        for cls_name, method_name in parsed.methods:
            cls_id = f"class:{file_id}:{cls_name}"
            method_id = f"func:{file_id}:{method_name}"
            self._add_node(method_id, kind="method", label=method_name, external=False, file=file_id, cls=cls_name)
            self._add_edge(cls_id, method_id, relation="contains")
            self._add_edge(method_id, file_id, relation="defined_in")
            self._function_to_file[method_name] = file_id

        for imp in parsed.imports:
            mod_id = f"module:{imp}"
            self._add_node(mod_id, kind="module", label=imp, external=True)
            self._add_edge(file_id, mod_id, relation="imports")

        for caller, callee in parsed.calls:
            # Builtins (print, len, ...) would dominate the graph; skip them.
            if callee in BUILTIN_FUNCTIONS:
                continue

            # Use file context for caller
            caller_id = f"func:{file_id}:{caller}"

            # Resolve callee ID using the global map, or mark it external.
            target_file_id = self._function_to_file.get(callee)
            if target_file_id:
                callee_id = f"func:{target_file_id}:{callee}"
            else:
                callee_id = f"func:external:{callee}"
                if not self.graph.has_node(callee_id):
                    self._add_node(callee_id, kind="function", label=callee, external=True)

            self._add_edge(caller_id, callee_id, relation="calls")

    def add_asset(self, asset: ParsedAsset) -> None:
        """Add a non-code asset node (dataset, database, document, or image)."""
        self._assets.append(asset)
        filename = Path(asset.filepath).name
        # BUG FIX: the node id must embed the asset's filename so that
        # link_code_to_assets() — which builds f"{kind}:{filename}" — targets
        # this node instead of creating a dangling one; the previous literal
        # id also collapsed every asset of a kind onto a single node.
        node_id = f"{asset.kind}:{filename}"

        self._add_node(
            node_id,
            kind=asset.kind,
            label=filename,
            filename=filename,  # keep raw filename for linking
            external=False,
            metadata=asset.metadata
        )

    def link_code_to_assets(self) -> None:
        """
        Connect code nodes to assets if the filename appears in:
        - function name
        - call list
        - string usage
        """
        for node_id, data in list(self.graph.nodes(data=True)):
            if data.get("kind") not in ("function", "method"):
                continue

            # Get function metadata from parsed files
            func_name = data.get("label")
            file_id = data.get("file")

            # Find the parsed file this function belongs to
            parsed = next((p for p in self._parsed_files if self._file_node_id(p.filepath) == file_id), None)
            if not parsed:
                continue

            # Check each asset
            for asset in self._assets:
                asset_filename = Path(asset.filepath).name
                asset_id = f"{asset.kind}:{asset_filename}"
                # Data-bearing assets are "used"; documents/images are "referenced".
                relation = "uses" if asset.kind in ("dataset", "database") else "references"

                # Check 1: function name mentions the asset filename
                if asset_filename in func_name:
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 2: any call made by this function mentions the filename
                calls = [c[1] for c in parsed.calls if c[0] == func_name]
                if any(asset_filename in callee for callee in calls):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

                # Check 3: any string literal in this function mentions the filename
                strings = parsed.strings.get(func_name, [])
                if any(asset_filename in s for s in strings):
                    self._add_edge(node_id, asset_id, relation=relation)
                    continue

    def build(self, parsed_files: list[ParsedFile], assets: list[ParsedAsset] = None) -> nx.DiGraph:
        """Build and return the full graph (assets may be None or empty)."""
        # First pass: add all files to populate the function-to-file map.
        for parsed in parsed_files:
            if not parsed.errors:
                self.add_file(parsed)

        # Second pass: add assets
        if assets:
            for asset in assets:
                self.add_asset(asset)

        # Third pass: link code to assets
        self.link_code_to_assets()

        return self.graph

    def summary(self) -> dict:
        """Return node/edge totals plus per-kind and per-relation counts."""
        nodes_by_kind = {}
        for _, data in self.graph.nodes(data=True):
            kind = data.get("kind", "unknown")
            nodes_by_kind[kind] = nodes_by_kind.get(kind, 0) + 1

        edges_by_relation = {}
        for _, _, data in self.graph.edges(data=True):
            rel = data.get("relation", "unknown")
            edges_by_relation[rel] = edges_by_relation.get(rel, 0) + 1

        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges(),
            "nodes_by_kind": nodes_by_kind,
            "edges_by_relation": edges_by_relation,
        }

    def to_dict(self) -> dict:
        """Serialize the graph to the JSON-friendly shape consumed by `plot`."""
        return {
            "nodes": [{"id": node, **data} for node, data in self.graph.nodes(data=True)],
            "edges": [{"source": src, "target": dst, **data} for src, dst, data in self.graph.edges(data=True)],
        }

    def _file_node_id(self, filepath: str) -> str:
        # Basename only — two files with the same name in different folders
        # would collide. NOTE(review): acceptable for small repos; confirm.
        return f"file:{Path(filepath).name}"

    def _add_node(self, node_id: str, **attrs) -> None:
        # First writer wins: attributes are never overwritten on re-add.
        if not self.graph.has_node(node_id):
            self.graph.add_node(node_id, **attrs)

    def _add_edge(self, src: str, dst: str, **attrs) -> None:
        self.graph.add_edge(src, dst, **attrs)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database Parser for CodeGraph AI
|
|
3
|
+
Extracts metadata from SQLite databases.
|
|
4
|
+
"""
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
class DatabaseParser:
    """
    Parses SQLite files to extract table names.
    """

    def parse(self, filepath: str) -> dict:
        """Return ``{"tables": [...]}`` for the SQLite database at *filepath*.

        Never raises: on failure an ``"error"`` key is added and ``tables``
        stays empty.
        """
        metadata = {"tables": []}
        conn = None
        try:
            conn = sqlite3.connect(filepath)
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            metadata["tables"] = [row[0] for row in cursor.fetchall()]
        except Exception as e:
            metadata["error"] = str(e)
        finally:
            # FIX: the connection was previously leaked when execute() raised
            # (close() was only reached on the success path).
            if conn is not None:
                conn.close()

        return metadata
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Image Parser for CodeGraph AI
|
|
3
|
+
Extracts text from images using pytesseract (OCR).
|
|
4
|
+
"""
|
|
5
|
+
import pytesseract
|
|
6
|
+
from PIL import Image
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
class ImageParser:
    """
    Parses Image files to extract text via OCR.
    """

    def parse(self, filepath: str) -> dict:
        """
        Run OCR over the image at *filepath* and return ``{"text": ...}``.

        Degrades gracefully: when tesseract is missing or OCR fails, the
        result keeps an empty ``text`` and gains an ``error`` entry.
        """
        result = {"text": ""}

        try:
            # (In a real system we'd handle TesseractNotFoundError specifically.)
            image = Image.open(filepath)
            extracted = pytesseract.image_to_string(image)

            # Cap the stored text at 500 characters plus an ellipsis marker.
            truncated = extracted if len(extracted) <= 500 else extracted[:500] + "..."
            result["text"] = truncated.strip()

        except Exception as exc:
            # OCR unavailable or failed: still return a usable result dict.
            result["error"] = f"OCR failed or not available: {str(exc)}"

        return result
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MultiModal Parser for CodeGraph AI
|
|
3
|
+
Extracts metadata from non-code assets like CSV, JSON, SQLite, PDF, and Images.
|
|
4
|
+
"""
|
|
5
|
+
import json
|
|
6
|
+
import csv
|
|
7
|
+
import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
from codegraph.parsers.pdf_parser import PDFParser
|
|
12
|
+
from codegraph.parsers.image_parser import ImageParser
|
|
13
|
+
from codegraph.parsers.database_parser import DatabaseParser
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ParsedAsset:
    # Path of the parsed asset on disk, as passed to the parser.
    filepath: str
    kind: str  # "dataset" | "database" | "document" | "image"
    # Extractor-specific details, e.g. CSV columns, DB tables, OCR/PDF text.
    metadata: dict = field(default_factory=dict)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MultiModalParser:
    """
    Parses non-Python files (CSV, JSON, SQLite, PDF, Image) to extract metadata.
    """

    def __init__(self):
        self.pdf_parser = PDFParser()
        self.image_parser = ImageParser()
        self.database_parser = DatabaseParser()

    def parse(self, filepath: str) -> ParsedAsset:
        """Dispatch on the file extension and return a ParsedAsset.

        Unrecognized extensions yield a ParsedAsset of kind "unknown".
        """
        path = Path(filepath)
        dispatch = {
            ".csv": self._parse_csv,
            ".json": self._parse_json,
            ".db": self._parse_sqlite,
            ".sqlite": self._parse_sqlite,
            ".pdf": self._parse_pdf,
            ".png": self._parse_image,
            ".jpg": self._parse_image,
            ".jpeg": self._parse_image,
        }
        handler = dispatch.get(path.suffix.lower())
        if handler is None:
            return ParsedAsset(filepath=str(path), kind="unknown")
        return handler(path)

    def _parse_csv(self, path: Path) -> ParsedAsset:
        # Only the header row is read; column names are the useful metadata.
        info = {"columns": []}
        try:
            with path.open("r", encoding="utf-8") as handle:
                info["columns"] = next(csv.reader(handle), [])
        except Exception as exc:
            info["error"] = str(exc)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=info)

    def _parse_json(self, path: Path) -> ParsedAsset:
        # Record top-level keys (or the first element's keys for a list of dicts).
        info = {"keys": []}
        try:
            with path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
                if isinstance(payload, dict):
                    info["keys"] = list(payload.keys())
                elif isinstance(payload, list) and len(payload) > 0 and isinstance(payload[0], dict):
                    info["keys"] = list(payload[0].keys())
        except Exception as exc:
            info["error"] = str(exc)

        return ParsedAsset(filepath=str(path), kind="dataset", metadata=info)

    def _parse_sqlite(self, path: Path) -> ParsedAsset:
        # Table extraction is delegated to the dedicated database parser.
        return ParsedAsset(filepath=str(path), kind="database", metadata=self.database_parser.parse(str(path)))

    def _parse_pdf(self, path: Path) -> ParsedAsset:
        return ParsedAsset(filepath=str(path), kind="document", metadata=self.pdf_parser.parse(str(path)))

    def _parse_image(self, path: Path) -> ParsedAsset:
        return ParsedAsset(filepath=str(path), kind="image", metadata=self.image_parser.parse(str(path)))
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Parser for CodeGraph AI
|
|
3
|
+
Extracts text and metadata from PDF files using pypdf.
|
|
4
|
+
"""
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from pypdf import PdfReader
|
|
7
|
+
|
|
8
|
+
class PDFParser:
    """
    Parses PDF files to extract basic text preview and page count.
    """

    def parse(self, filepath: str) -> dict:
        """
        Return ``{"num_pages": int, "text_preview": str}`` for *filepath*.

        The preview covers at most the first two pages, capped at 500
        characters. Never raises: failures add an ``"error"`` key instead.
        """
        info = {"num_pages": 0, "text_preview": ""}

        try:
            reader = PdfReader(filepath)
            pages = reader.pages
            info["num_pages"] = len(pages)

            # Collect text from at most the first two pages.
            snippets = []
            for index in range(min(2, len(pages))):
                text = pages[index].extract_text()
                if text:
                    snippets.append(text.strip())

            # Cap the preview length to avoid huge tooltips downstream.
            preview = "\n---\n".join(snippets)
            if len(preview) > 500:
                preview = preview[:500] + "..."
            info["text_preview"] = preview

        except Exception as exc:
            info["error"] = str(exc)

        return info
|